from pathlib import Path
import re
import numpy as np
import pandas as pd
import flexiznam as flz
from flexiznam import utils
from flexiznam.errors import FlexilimsError, DatasetError
from flexiznam.config import PARAMETERS
from datetime import datetime
class Dataset(object):
"""Master class to handle dataset identification and validation.
Should be inherited by all dataset types. SUBCLASSES are held in different
files and added to the Dataset class by schema.__init__.py
"""
SUBCLASSES = dict()
@staticmethod
def parse_dataset_name(name):
"""Parse a name into mouse, session, recording, dataset_name
Args:
name (str): name of the Dataset
Returns:
dict or None: None if parsing fails, a dictionary otherwise
"""
pattern = (r'(?P<mouse>.*?)_(?P<session>S\d{8})_?(?P<session_num>\d+)?'
r'_?(?P<recording>R\d{6})?_?(?P<recording_num>\d+)?'
r'_(?P<dataset>.*)')
match = re.match(pattern, name)
if not match:
raise DatasetError('No match in: `%s`. Must be '
'`<MOUSE>_SXXXXXXXX[...]_<DATASET>`.' % name)
# group session num and recording num together
output = match.groupdict()
sess_num = output.pop('session_num')
if sess_num is not None:
if output['session'] is None:
raise DatasetError('Found session number but not session name in `%s`'
% name)
output['session'] += '_%s' % sess_num
rec_num = output.pop('recording_num')
if rec_num is not None:
if output['recording'] is None:
raise DatasetError('Found recording number but not recording name in `%s`'
% name)
output['recording'] += '_%s' % rec_num
return output
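# A minimal usage sketch (the mouse and dataset names below are hypothetical;
# shown as comments to keep the module importable):
#
#     Dataset.parse_dataset_name('mymouse_S20210513_R123456_suite2p')
#     # -> {'mouse': 'mymouse', 'session': 'S20210513',
#     #     'recording': 'R123456', 'dataset': 'suite2p'}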
@classmethod
def from_folder(cls, folder, verbose=True, flm_session=None):
"""Try to load all datasets found in the folder.
Will try all defined subclasses of Dataset and keep everything that does
not crash. If you know which dataset type to expect, use that subclass
directly
"""
data = dict()
if not cls.SUBCLASSES:
raise IOError('Dataset subclasses not assigned')
for ds_type, ds_class in cls.SUBCLASSES.items():
if verbose:
print('Looking for %s' % ds_type)
try:
res = ds_class.from_folder(folder, verbose=verbose,
flm_session=flm_session)
except OSError:
continue
if any(k in data for k in res):
raise DatasetError('Found two datasets with the same name')
data.update(res)
return data
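# Usage sketch (the folder path is hypothetical):
#
#     datasets = Dataset.from_folder('/camp/raw/mymouse/S20210513')
#     # `datasets` maps dataset names to instances of the Dataset subclasses
#     # that could parse the folder without raising an OSError.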
@staticmethod
def from_flexilims(project=None, name=None, data_series=None, flm_session=None):
"""Loads a dataset from flexilims.
If the dataset_type attribute of the flexilims entry is defined in
Dataset.SUBCLASSES, this subclass will be used. Otherwise a generic
Dataset is returned
Args:
project: Name of the project or hexadecimal project_id
name: Unique name of the dataset on flexilims
data_series: defaults to None. pd.Series as returned by flz.get_entities.
If provided, supersedes project and name
flm_session: authentication session to access flexilims
"""
if data_series is not None:
if (project is not None) or (name is not None):
raise AttributeError('Specify either data_series OR project + name')
else:
data_series = flz.get_entity(project_id=project, datatype='dataset',
name=name, flexilims_session=flm_session)
if data_series is None:
raise FlexilimsError('No dataset named {} in project {}'.format(name,
project))
dataset_type = data_series.dataset_type
if dataset_type in Dataset.SUBCLASSES:
ds_cls = Dataset.SUBCLASSES[dataset_type]
return ds_cls.from_flexilims(data_series=data_series, flm_session=flm_session)
# no registered subclass: fall back to a generic Dataset
kwargs = Dataset._format_series_to_kwargs(data_series)
name = kwargs.pop('name')
kwargs['flm_session'] = flm_session
ds = Dataset(**kwargs)
try:
ds.name = name
except DatasetError:
print('\n!!! Cannot parse the name !!!\nWill not set mouse, session '
'or recording')
ds.dataset_name = name
return ds
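# Usage sketch (project and dataset names are hypothetical):
#
#     ds = Dataset.from_flexilims(project='my_project',
#                                 name='mymouse_S20210513_suite2p')
#     # Returns an instance of the subclass registered for the entry's
#     # dataset_type, or a generic Dataset if none is registered.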
@staticmethod
def from_origin(project=None, origin_type=None, origin_id=None, origin_name=None,
dataset_type=None, conflicts=None, flm_session=None):
"""Creates a dataset of a given type as a child of a parent entity
Args:
project (str): Name of the project or hexadecimal project_id
origin_type (str): sample type of the origin
origin_id (str): hexadecimal ID of the origin. This or origin_name must be provided
origin_name (str): name of the origin. This or origin_id must be provided
dataset_type (str): type of dataset to create. Must be defined in the config file
conflicts (str): What to do if a dataset of this type already exists
as a child of the parent entity?
`append`
Create a new dataset with a new name and path
`abort` or None
Raise a :py:class:`flexiznam.errors.NameNotUniqueError` and
exit
`skip` or `overwrite`
Return a Dataset corresponding to the existing entry if there
is exactly one existing entry, otherwise raise a
:py:class:`flexiznam.errors.NameNotUniqueError`
flm_session (:py:class:`flexilims.Flexilims`): authentication session to connect to flexilims
Returns:
:py:class:`flexiznam.schema.datasets.Dataset`: a dataset object (WITHOUT updating flexilims)
"""
assert (origin_id is not None) or (origin_name is not None)
origin = flz.get_entity(
datatype=origin_type,
id=origin_id,
name=origin_name,
project_id=project,
flexilims_session=flm_session,
)
if origin is None:
raise FlexilimsError('Origin not found')
processed = flz.get_entities(
project_id=project,
datatype='dataset',
origin_id=origin['id'],
query_key='dataset_type',
query_value=dataset_type,
flexilims_session=flm_session,
)
already_processed = len(processed) > 0
if (not already_processed) or (conflicts == 'append'):
dataset_root = '%s_%s' % (origin['name'], dataset_type)
dataset_name = flz.generate_name(
'dataset',
dataset_root,
project_id=project,
flexilims_session=flm_session
)
dataset_path = str(
Path(origin['path'])
/ Dataset.parse_dataset_name(dataset_name)['dataset'])
return Dataset(
path=dataset_path,
is_raw='no',
dataset_type=dataset_type,
name=dataset_name,
created=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
project=project,
origin_id=origin['id'],
flm_session=flm_session
)
else:
if (conflicts is None) or (conflicts == 'abort'):
raise flz.errors.NameNotUniqueError(
'Dataset(s) {} already processed'.format(processed['name'].tolist()))
elif conflicts == 'skip' or conflicts == 'overwrite':
if len(processed) == 1:
return Dataset.from_flexilims(data_series=processed.iloc[0],
flm_session=flm_session)
else:
raise flz.errors.NameNotUniqueError(
'{} {} datasets exist for {}, which one to return?'.format(
len(processed),
dataset_type,
origin['name']
))
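# Usage sketch (names are hypothetical). Note that from_origin does not write
# to flexilims; call update_flexilims on the result to register it:
#
#     ds = Dataset.from_origin(project='my_project', origin_type='recording',
#                              origin_name='mymouse_S20210513_R123456',
#                              dataset_type='suite2p', conflicts='append')
#     ds.update_flexilims()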
@staticmethod
def _format_series_to_kwargs(flm_series):
"""Format a flm get reply into kwargs valid for Dataset constructor"""
flm_attributes = {'id', 'type', 'name', 'incrementalId', 'createdBy',
'dateCreated', 'origin_id', 'objects',
'customEntities', 'project'}
attr = {k: v for k, v in flm_series.items() if k not in flm_attributes}
kwargs = dict(path=attr.pop('path'),
is_raw=attr.pop('is_raw', None),
dataset_type=attr.pop('dataset_type'),
created=attr.pop('created', None),
origin_id=flm_series.get('origin_id', None),
extra_attributes=attr,
project_id=flm_series.project,
name=flm_series.name)
return kwargs
def __init__(self, path, is_raw, dataset_type, name=None, extra_attributes=None,
created=None, project=None, project_id=None, origin_id=None,
flm_session=None):
"""Construct a dataset manually. Is usually called through static methods
'from_folder', 'from_flexilims', or 'from_origin'
Args:
path: folder containing the dataset or path to file (valid only for single
file datasets)
is_raw: bool or 'yes'/'no', used to sort into raw and processed subfolders
dataset_type: type of the dataset, must be in PARAMETERS['dataset_types']
name: name of the dataset as on flexilims. Is expected to include mouse,
session etc...
extra_attributes: dict, optional attributes.
created: Creation date, in "YYYY-MM-DD HH:MM:SS" format
project: name of the project. Must be in config, can be guessed from
project_id
project_id: hexadecimal code for the project. Must be in config, can be
guessed from project
flm_session: authentication session to connect to flexilims
"""
self.mouse = None
self.session = None
self.sample = None
self.recording = None
self.dataset_name = None
self.name = name
self.path = Path(path)
self.is_raw = is_raw
self.dataset_type = str(dataset_type)
self.extra_attributes = extra_attributes if extra_attributes is not None else {}
self.created = created
self.origin_id = origin_id
if project is not None:
self.project = project
if project_id is not None:
assert self.project_id == project_id
elif project_id is not None:
self.project_id = project_id
else:
self._project = None
self._project_id = None
self.flm_session = flm_session
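# Usage sketch for manual construction (all values are hypothetical; the
# static constructors above are usually preferable):
#
#     ds = Dataset(path='mymouse/S20210513/suite2p',
#                  is_raw='no',
#                  dataset_type='suite2p',
#                  name='mymouse_S20210513_suite2p',
#                  project='my_project')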
def is_valid(self):
"""
Dummy method definition. Should be reimplemented in subclasses
Should return True if the dataset is valid, False otherwise
"""
raise NotImplementedError('`is_valid` is not defined for generic datasets')
def associated_files(self, folder=None):
"""Give a list of all files associated with this dataset
Args:
folder: where to look for files; defaults to self.path
Returns:
list: paths of files associated with this dataset
"""
raise NotImplementedError('`associated_files` is not defined for generic '
'datasets')
def get_flexilims_entry(self):
"""Get the flexilims entry for this dataset
Returns:
pandas.Series or None: the entry, or None if it is not found
"""
if self.project_id is None:
raise IOError('You must specify the project to get flexilims status')
if self.name is None:
raise IOError('You must specify the dataset name to get flexilims status')
series = flz.get_entity(datatype='dataset',
project_id=self.project_id,
name=self.name,
flexilims_session=self.flm_session)
return series
def update_flexilims(self, mode='safe'):
"""Create or update flexilims entry for this dataset
Args:
mode (str): One of: 'update', 'overwrite', 'safe' (default).
If 'safe', will only create entry if it does not exist online.
If 'update' will update existing entry but keep any existing
attributes that are not specified. If 'overwrite' will update
existing entry and clear any attributes that are not specified.
Returns:
Flexilims reply
"""
status = self.flexilims_status()
attributes = self.extra_attributes.copy()
# the following lines are necessary because pandas converts python types to numpy
# types, which JSON does not understand
for attribute in attributes:
if isinstance(attributes[attribute], np.integer):
attributes[attribute] = int(attributes[attribute])
if isinstance(attributes[attribute], np.bool_):
attributes[attribute] = bool(attributes[attribute])
if status == 'different':
if mode == 'safe':
raise FlexilimsError('Cannot change existing flexilims entry with '
'mode=`safe`')
if (mode == 'overwrite') or (mode == 'update'):
# pack the dataset fields into the attributes dict
fmt = self.format()
for field in ['path', 'created', 'is_raw', 'dataset_type']:
attributes[field] = fmt[field]
# resetting origin_id to null is not implemented. Specifically check
# that it is not attempted and crash if it is
if self.origin_id is None:
if self.get_flexilims_entry().get('origin_id', None) is not None:
raise FlexilimsError('Cannot set origin_id to null')
resp = flz.update_entity(
datatype='dataset',
name=self.name,
origin_id=self.origin_id,
mode=mode,
attributes=attributes,
project_id=self.project_id,
flexilims_session=self.flm_session
)
else:
raise IOError('`mode` must be `safe`, `overwrite` or `update`')
return resp
if status == 'up-to-date':
print('Already up to date, nothing to do')
return
# we are in 'not online' case
utils.clean_dictionary_recursively(attributes)
resp = flz.add_dataset(
parent_id=self.origin_id,
dataset_type=self.dataset_type,
created=self.created,
path=str(self.path),
is_raw='yes' if self.is_raw else 'no',
project_id=self.project_id,
dataset_name=self.name,
attributes=attributes,
flexilims_session=self.flm_session,
conflicts='abort',
)
# update the dataset name to reflect the potential new index due to append
online_name = resp['name']
root_name = '_'.join([e for e in [self.mouse, self.session, self.recording] if e
is not None])
assert online_name.startswith(root_name)
self.dataset_name = online_name[len(root_name) + 1:]
return resp
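# Usage sketch of the three modes:
#
#     ds.update_flexilims()                  # 'safe': create only, never modify
#     ds.update_flexilims(mode='update')     # modify, keep unlisted attributes
#     ds.update_flexilims(mode='overwrite')  # modify, clear unlisted attributes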
def flexilims_status(self):
"""Status of the dataset on flexilims
Status can be 'up-to-date', 'different' or 'not online'
This function does not compare the following flexilims-only fields:
'createdBy', 'objects', 'dateCreated', 'customEntities',
'incrementalId', 'id', 'origin_id'
"""
series = self.get_flexilims_entry()
if series is None:
return 'not online'
differences = self.flexilims_report(flm_data=series)
if len(differences):
return 'different'
return 'up-to-date'
def flexilims_report(self, flm_data=None):
"""Describe the difference between the dataset and what is on flexilims
Differences are returned in a dictionary:
property: (value in dataset, value in flexilims)
Attributes not present in either dataset or on flexilims are labelled as 'N/A'
"""
if flm_data is None:
flm_data = self.get_flexilims_entry()
if flm_data is None:
raise IOError('No flexilims entry for dataset %s' % self.name)
# remove the flexilims keywords that are not used by Dataset if they are present
flm_data = flm_data.drop(['createdBy', 'objects', 'dateCreated', 'customEntities',
'incrementalId', 'id'], errors='ignore')
# add the fields that are always present in Dataset but returned by flexilims
# only when they are non null
for na_field in ['origin_id', 'is_raw', 'dataset_type', 'path', 'created']:
if na_field not in flm_data:
flm_data[na_field] = None
fmt = self.format()
differences = utils.compare_series(fmt, flm_data, series_name=('offline', 'flexilims'))
return differences
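# Usage sketch:
#
#     if ds.flexilims_status() == 'different':
#         print(ds.flexilims_report())
#     # the report maps each differing property to
#     # (offline value, flexilims value)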
@property
def project_id(self):
"""Hexadecimal ID of the parent project. Must be defined in config project list"""
return self._project_id
@project_id.setter
def project_id(self, value):
project = flz.main._lookup_project(value, flz.PARAMETERS)
if project is None:
raise IOError('Unknown project ID. Please update config file')
self._project = project
self._project_id = value
@property
def project(self):
"""Parent project. Must be defined in config project list"""
return self._project
@project.setter
def project(self, value):
if value not in flz.PARAMETERS['project_ids']:
raise IOError('Unknown project name. Please update config file')
proj_id = flz.PARAMETERS['project_ids'][value]
self._project_id = proj_id
self._project = value
@property
def name(self):
"""Full name of the dataset as it would appear on Flexilims.
Including mouse, sample, session and recording, whichever apply.
"""
if self.dataset_name is None:
return
elements = [getattr(self, w) for w in ('mouse', 'sample', 'session', 'recording',
'dataset_name')]
name = '_'.join([e for e in elements if e is not None])
return name
@name.setter
def name(self, value):
"""Set the name if it is correctly formatted"""
if value is None:
for w in ('mouse', 'sample', 'session', 'recording', 'dataset_name'):
setattr(self, w, None)
return
try:
match = Dataset.parse_dataset_name(value)
except DatasetError as err:
raise DatasetError('Cannot parse dataset name. ' + err.args[0] +
'\nSet self.mouse, self.session, self.recording, and '
'self.dataset_name individually')
self.mouse = match['mouse']
self.dataset_name = match['dataset']
self.session = match['session']
self.recording = match['recording']
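# Usage sketch (hypothetical name):
#
#     ds.name = 'mymouse_S20210513_R123456_suite2p'
#     # sets ds.mouse='mymouse', ds.session='S20210513',
#     # ds.recording='R123456', ds.dataset_name='suite2p'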
@property
def dataset_type(self):
"""Type of the dataset. Must be in PARAMETERS['dataset_types']"""
return self._dataset_type
@dataset_type.setter
def dataset_type(self, value):
if value.lower() not in PARAMETERS['dataset_types']:
raise IOError('dataset_type "%s" not valid. Valid types are: '
'%s' % (value, PARAMETERS['dataset_types']))
self._dataset_type = value.lower()
@property
def is_raw(self):
"""Is that dataset containing raw or processed data?"""
return self._is_raw
@is_raw.setter
def is_raw(self, value):
if isinstance(value, str):
if value.lower() == 'yes':
value = True
elif value.lower() == 'no':
value = False
else:
raise IOError('is_raw must be `yes` or `no`')
else:
value = bool(value)
self._is_raw = value
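# The setter accepts the 'yes'/'no' strings stored on flexilims as well as
# anything that can be cast to bool:
#
#     ds.is_raw = 'yes'  # -> True
#     ds.is_raw = 0      # -> False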
@property
def path_root(self):
"""Get CAMP root path that should apply to this dataset"""
if self.is_raw:
return Path(flz.config.PARAMETERS['data_root']['raw'])
else:
return Path(flz.config.PARAMETERS['data_root']['processed'])
@property
def path_full(self):
"""Get full path including the CAMP root"""
return self.path_root / self.path
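# Usage sketch (root paths depend on the local config; values hypothetical):
#
#     # with PARAMETERS['data_root']['processed'] == '/camp/processed',
#     # ds.path == 'mymouse/S20210513/suite2p' and ds.is_raw False:
#     ds.path_full  # -> Path('/camp/processed/mymouse/S20210513/suite2p')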