Source code for sovabids.datasets

"""Module with dataset utilities.
"""
import os
import shutil

import mne
import numpy as np
from pandas import read_csv
from mne_bids.write import _write_raw_brainvision

from sovabids.files import download, _get_files
from sovabids.misc import get_num_digits
from sovabids.parsers import parse_from_regex

def lemon_prepare():
    """Download and prepare a few files of the LEMON dataset.

    Notes
    -----
    See the `LEMON dataset <http://fcon_1000.projects.nitrc.org/indi/retro/MPI_LEMON.html>`_ .
    """

    # Path configuration
    this_dir = os.path.dirname(__file__)
    data_dir = os.path.join(this_dir, '..', '_data')
    root_path = os.path.abspath(os.path.join(data_dir, 'lemon'))
    os.makedirs(data_dir, exist_ok=True)

    # Download the LEMON database
    urls = ['https://fcp-indi.s3.amazonaws.com/data/Projects/INDI/MPI-LEMON/Compressed_tar/EEG_MPILMBB_LEMON/EEG_Raw_BIDS_ID/sub-032301.tar.gz',
            'https://fcp-indi.s3.amazonaws.com/data/Projects/INDI/MPI-LEMON/Compressed_tar/EEG_MPILMBB_LEMON/EEG_Raw_BIDS_ID/sub-032302.tar.gz',
            'https://fcp-indi.s3.amazonaws.com/data/Projects/INDI/MPI-LEMON/Compressed_tar/EEG_MPILMBB_LEMON/EEG_Raw_BIDS_ID/sub-032303.tar.gz',
            'https://fcp-indi.s3.amazonaws.com/data/Projects/INDI/MPI-LEMON/name_match.csv']

    for url in urls:
        download(url, os.path.join(data_dir, 'lemon'))

    # Generate all filepaths
    filepaths = _get_files(root_path)

    # Label correction
    name_match = read_csv(os.path.join(root_path, 'name_match.csv'))

    # Unpack files

    # TAR files
    tars = [x for x in filepaths if 'tar.gz' in x]

    # Subjects
    old_ids = [parse_from_regex(x, '(sub-.*?).tar.gz', ['id']) for x in tars]
    old_ids = [x['id'] for x in old_ids]
    new_ids = [name_match.loc[(name_match.INDI_ID == x), 'Initial_ID']._values[0] for x in old_ids]

    # EEG files
    not_tars = [x for x in filepaths if '.vhdr' in x]
    not_tars_ids = [parse_from_regex(x, 'RSEEG\\/(sub-.*?).vhdr', ['id']) for x in not_tars]
    not_tars_ids = [x['id'] for x in not_tars_ids]

    assert len(tars) == len(old_ids) == len(new_ids)

    if set(new_ids) == set(not_tars_ids):  # all done
        return
    else:
        for file, old, new in zip(tars, old_ids, new_ids):
            if new not in not_tars_ids:  # skip already prepared files
                shutil.unpack_archive(file, root_path)
                olddir = os.path.join(root_path, old)
                subject_files = _get_files(olddir)
                for subfile in subject_files:  # fix sub-id
                    new_path = subfile.replace(old, new)
                    dirname, _ = os.path.split(new_path)
                    os.makedirs(dirname, exist_ok=True)
                    shutil.move(subfile, new_path)
                shutil.rmtree(olddir)
    print('LEMON PREPARE DONE!')
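# Usage sketch (not part of the module): a single call downloads the three
# sample subjects plus name_match.csv into sovabids/_data/lemon and renames
# the unpacked folders from their INDI_ID to their Initial_ID.
#
#     from sovabids.datasets import lemon_prepare
#     lemon_prepare()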
def lemon_bidscoin_prepare(src_path):
    """Download and prepare a few files of the LEMON dataset to be used with BIDSCOIN.

    Parameters
    ----------
    src_path : str
        The path where the BIDSCOIN-ready LEMON files will be placed.

    See Also
    --------
    datasets.lemon_prepare
    """
    lemon_prepare()
    this_dir = os.path.dirname(__file__)
    data_dir = os.path.join(this_dir, '..', '_data')
    root_path = os.path.abspath(os.path.join(data_dir, 'lemon'))
    bidscoin_input_path = src_path

    os.makedirs(bidscoin_input_path, exist_ok=True)

    files = _get_files(root_path)
    files = [x for x in files if x.split('.')[-1] in ['eeg', 'vmrk', 'vhdr']]

    files_out = []
    for f in files:
        session = 'ses-001'
        task = 'resting'
        head, tail = os.path.split(f)
        sub = tail.split('.')[0]
        new_path = os.path.join(bidscoin_input_path, sub, session, task, tail)
        files_out.append(new_path)

    for old, new in zip(files, files_out):
        print(old, ' to ', new)
        os.makedirs(os.path.split(new)[0], exist_ok=True)
        if not os.path.isfile(new):
            shutil.copy2(old, new)
        else:
            print('already done, skipping...')
    print('finish')
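# Usage sketch (the destination path below is hypothetical): stages the
# prepared LEMON files into a BIDSCOIN-style source tree of the form
# <src_path>/<sub>/ses-001/resting/<file>.
#
#     from sovabids.datasets import lemon_bidscoin_prepare
#     lemon_bidscoin_prepare('_data/lemon_bidscoin_input')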
def make_dummy_dataset(PATTERN='T%task%/S%session%/sub%subject%_%acquisition%_%run%',
                       DATASET='DUMMY',
                       NSUBS=2,
                       NSESSIONS=2,
                       NTASKS=2,
                       NACQS=2,
                       NRUNS=2,
                       NCHANNELS=2,
                       SFREQ=200,
                       STOP=10,
                       NUMEVENTS=10,
                       PREFIXES={'subject': 'SU', 'session': 'SE', 'task': 'TA', 'acquisition': 'AC', 'run': 'RU'},
                       ROOT=None):
    """Create a dummy dataset given some parameters.

    Parameters
    ----------
    PATTERN : str, optional
        The pattern in placeholder notation using the following fields:
        %dataset%, %task%, %session%, %subject%, %run%, %acquisition%
    DATASET : str, optional
        Name of the dataset.
    NSUBS : int, optional
        Number of subjects.
    NSESSIONS : int, optional
        Number of sessions.
    NTASKS : int, optional
        Number of tasks.
    NACQS : int, optional
        Number of acquisitions.
    NRUNS : int, optional
        Number of runs.
    NCHANNELS : int, optional
        Number of channels.
    SFREQ : float, optional
        Sampling frequency of the data.
    STOP : float, optional
        Time duration of the data in seconds.
    NUMEVENTS : int, optional
        Number of events along the duration.
    PREFIXES : dict, optional
        Dictionary with the following keys: 'subject', 'session', 'task' and 'acquisition'.
        The values are the corresponding prefixes. 'run' is not present because it has to be a number.
    ROOT : str, optional
        Path where the files will be generated. If None, the _data subdir will be used.
    """

    if ROOT is None:
        this_dir = os.path.dirname(__file__)
        data_dir = os.path.abspath(os.path.join(this_dir, '..', '_data'))
    else:
        data_dir = ROOT
    os.makedirs(data_dir, exist_ok=True)

    sub_zeros = get_num_digits(NSUBS)
    subs = [PREFIXES['subject'] + str(x).zfill(sub_zeros) for x in range(NSUBS)]

    task_zeros = get_num_digits(NTASKS)
    tasks = [PREFIXES['task'] + str(x).zfill(task_zeros) for x in range(NTASKS)]

    run_zeros = get_num_digits(NRUNS)
    runs = [str(x).zfill(run_zeros) for x in range(NRUNS)]

    ses_zeros = get_num_digits(NSESSIONS)
    sessions = [PREFIXES['session'] + str(x).zfill(ses_zeros) for x in range(NSESSIONS)]

    acq_zeros = get_num_digits(NACQS)
    acquisitions = [PREFIXES['acquisition'] + str(x).zfill(acq_zeros) for x in range(NACQS)]

    # Create some dummy metadata
    n_channels = NCHANNELS
    sampling_freq = SFREQ  # in Hertz
    info = mne.create_info(n_channels, sfreq=sampling_freq)

    times = np.linspace(0, STOP, STOP * sampling_freq, endpoint=False)
    data = np.zeros((NCHANNELS, times.shape[0]))

    raw = mne.io.RawArray(data, info)
    raw.set_channel_types({x: 'eeg' for x in raw.ch_names})
    new_events = mne.make_fixed_length_events(raw, duration=STOP // NUMEVENTS)

    for task in tasks:
        for session in sessions:
            for run in runs:
                for sub in subs:
                    for acq in acquisitions:
                        dummy = PATTERN.replace('%dataset%', DATASET)
                        dummy = dummy.replace('%task%', task)
                        dummy = dummy.replace('%session%', session)
                        dummy = dummy.replace('%subject%', sub)
                        dummy = dummy.replace('%run%', run)
                        dummy = dummy.replace('%acquisition%', acq)

                        path = [data_dir] + dummy.split('/')
                        fpath = os.path.join(*path)
                        _write_raw_brainvision(raw, fpath, new_events)
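# Usage sketch (illustrative values): writes one BrainVision file set
# (.vhdr/.vmrk/.eeg) of zero-valued EEG per subject/session/task/acquisition/run
# combination, laid out under ROOT according to PATTERN.
#
#     from sovabids.datasets import make_dummy_dataset
#     make_dummy_dataset(NSUBS=3, NTASKS=1, ROOT='_data/my_dummy')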