Source code for vcf.utils

"""
Utilities for VCF files.
"""

[docs]def walk_together(*readers, **kwargs):
    """
    Simultaneously iteratate over two or more VCF readers. For each 
    genomic position with a variant, return a list of size equal to the number 
    of VCF readers. This list contains the VCF record from readers that have
    this variant, and None for readers that don't have it. 
    The caller must make sure that inputs are sorted in the same way and use the 
    same reference otherwise behaviour is undefined.

    Args:
        vcf_record_sort_key: function that takes a VCF record and returns a 
            tuple that can be used as a key for comparing and sorting VCF 
            records across all readers. This tuple defines what it means for two 
            variants to be equal (eg. whether it's only their position or also 
            their allele values), and implicitly determines the chromosome 
            ordering since the tuple's 1st element is typically the chromosome 
            name (or calculated from it).
    """
    if 'vcf_record_sort_key' in kwargs:
        get_key = kwargs['vcf_record_sort_key']
    else:
        get_key = lambda r: (r.CHROM, r.POS) #, r.REF, r.ALT)

    nexts = []
    for reader in readers:
        try:
            nexts.append(next(reader))
        except StopIteration:
            nexts.append(None)

    min_k = (None,)   # keep track of the previous min key's contig
    while any([r is not None for r in nexts]):
        next_idx_to_k = dict(
            (i, get_key(r)) for i, r in enumerate(nexts) if r is not None)
        keys_with_prev_contig = [
            k for k in next_idx_to_k.values() if k[0] == min_k[0]]

        if any(keys_with_prev_contig):
            min_k = min(keys_with_prev_contig)   # finish previous contig
        else:
            min_k = min(next_idx_to_k.values())   # move on to next contig

        min_k_idxs = set([i for i, k in next_idx_to_k.items() if k == min_k])
        yield [nexts[i] if i in min_k_idxs else None for i in range(len(nexts))]

        for i in min_k_idxs:
            try:
                nexts[i] = readers[i].next()
            except StopIteration:
                nexts[i] = None


[docs]def trim_common_suffix(*sequences):
    """
    Trim a list of sequences by removing the longest common suffix while
    leaving all of them at least one character in length.

    Standard convention with VCF is to place an indel at the left-most
    position, but some tools add additional context to the right of the
    sequences (e.g. samtools). These common suffixes are undesirable when
    comparing variants, for example in variant databases.

        >>> trim_common_suffix('TATATATA', 'TATATA')
        ['TAT', 'T']

        >>> trim_common_suffix('ACCCCC', 'ACCCCCCCC', 'ACCCCCCC', 'ACCCCCCCCC')
        ['A', 'ACCC', 'ACC', 'ACCCC']

    """
    if not sequences:
        return []
    reverses = [seq[::-1] for seq in sequences]
    rev_min = min(reverses)
    rev_max = max(reverses)
    if len(rev_min) < 2:
        return sequences
    for i, c in enumerate(rev_min[:-1]):
        if c != rev_max[i]:
            if i == 0:
                return sequences
            return [seq[:-i] for seq in sequences]
    return [seq[:-(i + 1)] for seq in sequences]