"modules for dealing with (mainly importing to datasets) genomic data"

from operator import itemgetter
from itertools import groupby
import logging
# Module-level handle on the root logger (getLogger() is called without a name).
lg = logging.getLogger()

def parseBED(s, use_score=True, bg_val=1.0):
    """Parse one feature line in BED4 or BED5+ format.

    The line is stripped of trailing whitespace and split on tabs. The
    first four columns become ``(chrom, start, end, name)`` with ``start``
    and ``end`` converted to ``int``. The fifth element of the result is
    the score: ``float`` of the fifth column when ``use_score`` is true,
    otherwise ``bg_val`` (whatever the fifth column holds, if present).
    Any columns after the fifth are appended unchanged as strings.

    :param str s: tab-separated line from the BED file
    :param bool use_score: use the score of the feature or set it to bg_val
    :param bg_val: score assigned to every feature when ``use_score`` is
        false
    :rtype: tuple, BED5+ feature
    :raises ValueError: if the line has fewer than four columns, if it has
        exactly four columns while ``use_score`` is true, or if a numeric
        column cannot be converted
    """

    cols = s.rstrip().split("\t")
    ncols = len(cols)
    if ncols < 4:
        raise ValueError("feature (%s) must be at least BED4" % s)
    if use_score and ncols == 4:
        raise ValueError("feature (%s) must be at least BED5 with use_score=True" % s)

    start = int(cols[1])
    end = int(cols[2])
    score = float(cols[4]) if use_score else bg_val

    # Always build a real tuple: a bare map() is a one-shot iterator on
    # Python 3 and cannot be indexed or compared by callers.
    return (cols[0], start, end, cols[3], score) + tuple(cols[5:])

def _consecutive_iter(fiter, fill):
    """join consecutive intervals

    :param iter fiter: iterable of features
    :param str fill: name to use for gaps. if None, gap enocunter
        results in error
    :rtype: iterable of features"""

    for i, ft in enumerate(fiter):
        if i == 0:
            prev_ft = ft
            chrom, start, end, name = ft[:4]
            continue
        if prev_ft[2] == ft[1]:
            assert end == ft[1]
            end = ft[2]
        else:
            yield (chrom, start, end, name)
            if fill:
                yield (chrom, end, ft[1], fill)
            else:
                raise ValueError("found gap betwen:\n%s\n%s" % (str(prev_ft),
                                                                str(ft)))
            chrom, start, end, name = ft[:4]
        prev_ft = ft
    yield (chrom, start, end, name)


def join_by_name(data, join_key=itemgetter(0, 3), fill=None):
    """Join intervals that share certain attributes and report the start of
    the first interval and the end of the last. For example, for features::

       chr1    1   10  a
       chr1    10  20  a
       chr2    20  30  a

    joining by (chromosome, name) you get::

      chr1    1   20  a
      chr2    20  30  a

    joining by (name,) you get::

      chr1    1   30  a

    Features are grouped with :func:`itertools.groupby`, so only neighbouring
    features with an equal key end up in the same group.

    :param iter data: iterable of **SORTED** bed features,
        (chrom, start, end, name, ...)
    :param callable join_key: get the fields by which to join; it may return
        any value that can be compared for equality
    :param str/None fill: fill uncovered regions. If fill is None, raise
        an error on non-consecutive intervals (i.e., uncovered regions)
    :rtype: iterator of tuples (chrom, start, end, name)
    :raises ValueError: on non-consecutive intervals within a group when no
        fill name is given
    """

    for _, group in groupby(data, key=join_key):
        members = list(group)
        if len(members) == 1:
            # Read the fields from the feature itself, not from the group
            # key: the key only has the (chrom, name) layout for the default
            # join_key.
            yield tuple(members[0][:4])
        else:
            for joined in _consecutive_iter(members, fill):
                yield joined
