"""
utilities and wrappers for working with HDF archives.

* an archive contains one or more datasets

* a dataset contains

  - an array X: the data
  - an optional array Y: the raw response
  - an optional array T: labels (targets) associated with the response

a dataset specification (dset_path) is the concatenation
(`:` is used as a separator)
of the path to the HDF file and the dataset name. All datasets hang on the root.
"""


import logging
import os

import pandas as pd

import filelock

log = logging.getLogger(__name__)

import re
# File extension (without the leading dot) expected on an HDF archive path.
__HDF_SUFFIX__ = "h5"
# Character placed between the archive path and the dataset name.
__SPEC_SEP__ = ":"

# Text describing the expected form of a dataset specification.
DSPEC_MSG = (
    "dataset specification; of the form <hdf_archive_path>%s<dataset_name>"
    % __SPEC_SEP__
)


def __parse(path, PATT=re.compile("(.+[.]%s)[%s](.+)" % (__HDF_SUFFIX__, __SPEC_SEP__))):
    """break a dataset specification into its components

    :param str path: dataset specification (path_to_file:dsname)
    :param PATT: compiled pattern matched against the start of ``path``;
        the default is built once, when the function is defined
    :rtype: tuple of the groups captured by ``PATT``; with the default
        pattern, (archive path, dataset name)
    :raises AttributeError: if ``path`` does not match ``PATT``
    """
    try:
        parts = PATT.match(path).groups()
    except AttributeError:
        # no match: `match` returned None, which has no `groups`
        reason = "can't parse (pattern %s) path (%s)" % (PATT.pattern, path)
        raise AttributeError(reason)
    return parts


def basename(path):
    """extract the dataset name from a dataset path

    :param str path: dataset path of the form <archname>:<basename>
    :rtype: the <basename> (dataset name) part of the path
    """
    _, dataset_name = __parse(path)
    return dataset_name


def archname(path):
    """extract the archive file path from a dataset path

    :param str path: dataset path of the form <archname>:<basename>
    :rtype: the <archname> part of the path
    """
    archive, _ = __parse(path)
    return archive


def split(path):
    """separate a dataset path into archive path and dataset name

    :param str path: dataset path of the form <archname>:<basename>
    :rtype: (archname, basename)
    """
    archive, dataset_name = __parse(path)
    return archive, dataset_name


def join(path, dsname):
    """glue a path and a dataset name into a dataset path

    :param str path: part placed before the separator
    :param str dsname: dataset name, placed after the separator
    :rtype: str of the type <path><__SPEC_SEP__><dataset_name>"""

    pieces = (path, dsname)
    return __SPEC_SEP__.join(pieces)


def save_object( file_path, key, obj ):
    """store an object in an HDF file while holding a file lock

    The object is written under ``key``; the file is then re-opened to check
    that the key is present. A missing key is only reported through the
    logger (error level), no exception is raised for it.

    :param str file_path: path to the HDF file
    :param str key: key under which the object is stored
    :param obj: object to store
    """
    log.debug("writing to %s:%s", file_path, key)
    # file_path is deliberately not required to exist beforehand
    # NOTE(review): the lock is taken on the data file's own path; confirm
    # that this is what the `filelock` implementation in use expects
    with filelock.FileLock(file_path):
        store = pd.HDFStore(file_path)
        try:
            store[key] = obj
        finally:
            # close even when the write fails, so the handle is not leaked
            store.close()

        ## check that it is there
        store = pd.HDFStore(file_path)
        try:
            if key in store:
                log.debug("object in %s:%s", file_path, key)
            else:
                log.error("DATA LOSS: couldn't save in %s:%s", file_path, key)
        finally:
            store.close()


def load_object(file_path, key):
    """read an object from an HDF file while holding a file lock

    :param str file_path: path to an existing HDF file
    :param str key: key of the object inside the file
    :rtype: the object stored under ``key``
    :raises IOError: if ``file_path`` is not an existing file
    """
    log.debug("reading from %s:%s", file_path, key)
    if not os.path.isfile(file_path):
        raise IOError("file not found %s" % file_path)
    with filelock.FileLock(file_path):
        store = pd.HDFStore(file_path)
        try:
            return store[key]
        finally:
            # close even when the lookup fails, so the handle is not leaked
            store.close()


def dset_path(path):
    """validate a dataset path by parsing it and putting it back together

    :param str path: path to the dataset (path_to_file:dsname)
    :rtype: the re-assembled path if it can be parsed, raises error otherwise"""

    archive, dataset_name = __parse(path)
    return join(archive, dataset_name)


##TODO: highly inefficient using pandas
## should re-write with tables
def _list_filter(archfn, sel_f, frm_f):
    """apply filter and formatting functions to the list items in an archive

    :param str archfn: path to the archive
    :param function sel_f: function that returns a boolean on an item of the list
    :param function frm_f: function that returns a string on an item of the list
    :rtype: list of items passing sel_f and formatted by frm_f"""

    store = pd.HDFStore(archfn)
    try:
        store_keys = store.keys()
    finally:
        # the keys are all that is needed; do not leave the archive open
        store.close()
    # materialise the result: on Python 3 `map` alone would return a lazy
    # iterator rather than the list promised above
    return list(map(frm_f, filter(sel_f, store_keys)))


def list_datasets(archfn):
    """enumerate the datasets stored in an archive

    A key counts as a dataset when it ends in ``/X`` and contains exactly
    two slashes.

    :param str archfn: path to the archive
    :rtype: list of datasets"""

    def is_data_key(key):
        # an X entry exactly one level below the root: /<dataset>/X
        return key.endswith("/X") and key.count("/") == 2

    def dataset_name(key):
        # strip the trailing /X component, then the leading slash
        return os.path.dirname(key)[1:]

    return _list_filter(archfn, is_data_key, dataset_name)


def list_experiments(archfn, dsn):
    """list training experiments in a dataset

    An experiment is recognised by a key that starts with ``/<dsn>/`` and
    ends with ``model``; its name is what lies between the dataset prefix
    and the last path component of the key.

    :param str archfn: path to the archive
    :param str dsn: name of the dataset
    :rtype: list of training experiments"""

    # the trailing slash keeps datasets whose name merely starts with `dsn`
    # (e.g. "train2" when dsn is "train") from being picked up
    prefix = "/%s/" % dsn

    def is_model_key(key):
        return key.endswith("model") and key.startswith(prefix)

    def experiment_name(key):
        # drop the dataset prefix, then the last component of the key
        return os.path.dirname(key[len(prefix):])

    return _list_filter(archfn, is_model_key, experiment_name)
