Source code for macro_eeg_model.data_prep.data_preparator

# standard imports
import os

# external imports
import numpy as np

# local imports
from macro_eeg_model.utils.paths import paths


class DataPreparator:
    """
    A class to prepare and process data from directories containing CSV files
    with connectivity data across subjects.

    The processed data is saved as a NumPy array after averaging across
    multiple subjects.
    """

    def prep_and_save(self, directory_name, included_word, delimiter, name):
        """
        Handles the prerequisites for preparing and saving the data from a
        specified directory within the Julich data path
        (see :py:class:`src.utils.paths.Paths`) and then does the actual data
        preparation and saving using :py:meth:`_prep_and_save_data`.

        The entries of the directory are reduced to the subject folders,
        sorted by name, and handed over for processing.

        Parameters
        ----------
        directory_name : str
            The name of the directory containing the subject folders.
        included_word : str
            The word that should be included in the CSV filenames to be
            processed.
        delimiter : str
            The delimiter used in the CSV files.
        name : str
            The name to use when saving the final averaged array.
        """
        directory = paths.julich_data_path / directory_name

        # Housekeeping entries that must never be treated as subjects.
        ignored_entries = {".DS_Store", ".gitkeep"}

        # Only real directories can be subject folders; any other stray file
        # would make the per-subject ``os.listdir`` call fail later on.
        subjects = sorted(
            entry
            for entry in os.listdir(directory)
            if entry not in ignored_entries
            and os.path.isdir(os.path.join(directory, entry))
        )

        self._prep_and_save_data(directory, subjects, included_word, delimiter, name)

    def _prep_and_save_data(self, directory, subjects, included_word, delimiter, name):
        """
        Extracts relevant CSV files based on the included word using
        :py:meth:`_extract_csv_files`, converts them to NumPy arrays using
        :py:meth:`_get_arrays_from_files`, computes an average array using
        :py:meth:`_calculate_avg_array`, and saves it as a .npy file named
        ``avg_{name}.npy`` in the connectivity data path.

        Parameters
        ----------
        directory : str or pathlib.Path
            The path to the directory containing the subject folders.
        subjects : list
            The list of subject folder names.
        included_word : str
            The word that should be included in the CSV filenames to be
            processed.
        delimiter : str
            The delimiter used in the CSV files.
        name : str
            The name to use when saving the final averaged array.
        """
        # De-duplicate the filenames collected across subjects; sorting makes
        # the processing order reproducible between runs.
        csv_files = sorted(set(self._extract_csv_files(directory, subjects, included_word)))
        numpy_arrays = self._get_arrays_from_files(directory, subjects, csv_files, delimiter)
        avg_array = self._calculate_avg_array(numpy_arrays)

        path = paths.connectivity_data_path / f"avg_{name}.npy"
        np.save(path, avg_array)

    @staticmethod
    def _extract_csv_files(directory, subjects, included_word):
        """
        Extracts the names of CSV files that include a specific word in their
        filenames.

        Searches through the directory of each subject for CSV files that
        contain the specified word in their name.

        Parameters
        ----------
        directory : str or pathlib.Path
            The path to the directory containing the subject folders.
        subjects : list
            The list of subject folder names.
        included_word : str
            The word that must be included in the filenames.

        Returns
        -------
        list
            A list of filenames that match the criteria. A filename appears
            once for every subject folder that contains it.
        """
        return [
            filename
            for subject in subjects
            for filename in os.listdir(os.path.join(directory, subject))
            if filename.endswith(".csv") and included_word in filename
        ]

    def _get_arrays_from_files(self, directory, subjects, files, delimiter=","):
        """
        Retrieves and converts the relevant CSV files into NumPy arrays using
        :py:meth:`_convert_csv_file_to_numpy_array`.

        Every combination of subject and filename is looked up. Combinations
        that do not exist on disk are skipped, because ``files`` is collected
        across all subjects and a given subject is not guaranteed to own every
        one of those files.

        Parameters
        ----------
        directory : str or pathlib.Path
            The path to the directory containing the subject folders.
        subjects : list
            The list of subject folder names.
        files : list
            The list of filenames to be processed.
        delimiter : str, optional
            The delimiter used in the CSV files (default is ',').

        Returns
        -------
        list
            A list of NumPy arrays corresponding to the processed CSV files.
        """
        arrays = []
        for subject in subjects:
            for file in files:
                file_path = os.path.join(directory, subject, file)
                if not os.path.isfile(file_path):
                    # This subject does not provide this particular file.
                    continue
                arrays.append(self._convert_csv_file_to_numpy_array(file_path, delimiter))
        return arrays

    @staticmethod
    def _convert_csv_file_to_numpy_array(file_path, delimiter):
        """
        Converts a CSV file into a NumPy array.

        Parameters
        ----------
        file_path : str or pathlib.Path
            The full path to the CSV file.
        delimiter : str
            The delimiter used in the CSV file.

        Returns
        -------
        numpy.ndarray
            A NumPy array representing the data from the CSV file.
        """
        return np.loadtxt(file_path, delimiter=delimiter)

    @staticmethod
    def _calculate_avg_array(numpy_arrays):
        """
        Computes the average of each element across multiple NumPy arrays,
        excluding the highest and lowest 20% of values (to reduce the impact
        of outliers), and returns the resulting array.

        The same number of values, ``int(0.2 * len(numpy_arrays))``, is
        removed from both ends of the sorted values of every element, so at
        least one value always remains (a single input array is returned
        unchanged, as floats).

        Parameters
        ----------
        numpy_arrays : list
            A list of equally shaped NumPy arrays to average.

        Returns
        -------
        numpy.ndarray
            A NumPy array containing the average values; it has the shape of
            the input arrays.

        Raises
        ------
        ValueError
            If ``numpy_arrays`` is empty or the arrays differ in shape.
        """
        if len(numpy_arrays) == 0:
            raise ValueError("Cannot compute an average of an empty list of arrays.")

        # Axis 0 enumerates the input arrays; sorting along it orders the
        # values of every element independently of all other elements.
        sorted_values = np.sort(np.stack(numpy_arrays), axis=0)

        # Fraction of values removed from each end before averaging.
        trim_fraction = 0.2
        count = sorted_values.shape[0]
        cut = int(trim_fraction * count)

        # fill diagonal with nans
        # np.fill_diagonal(avg_array, np.nan)
        return sorted_values[cut:count - cut].mean(axis=0, dtype=np.float64)