Source code for vcf.utils

"""
Utilities for VCF files.
"""


def walk_together(*readers):
    """
    Simultaneously iterate two or more VCF readers and yield lists of
    concurrent records from each reader, with None if no record is present.

    Each yielded list has one slot per reader, in the order the readers were
    passed. A slot holds that reader's current record when it compares equal
    to the smallest pending record, and None otherwise (including when the
    reader is exhausted).

    Caller must check the inputs are sorted in the same way and use the same
    reference, otherwise behaviour is undefined.

    Readers must be iterators whose items support ordering (``min``) and
    equality. A reader that is empty from the start simply contributes None
    to every yielded list; with no readers, or only empty readers, nothing
    is yielded.
    """
    # The built-in next() works with both Python 2 (.next) and Python 3
    # (.__next__) iterators; the None default marks an exhausted reader
    # instead of letting StopIteration escape from this generator.
    nexts = [next(reader, None) for reader in readers]

    while any(x is not None for x in nexts):
        min_next = min(x for x in nexts if x is not None)

        # this line uses equality on Records, which checks the ALTs
        # not sure what to do with records that have overlapping but different
        # variation
        yield [x if x is None or x == min_next else None for x in nexts]

        # advance only the readers whose record was just yielded
        for i, n in enumerate(nexts):
            if n is not None and n == min_next:
                nexts[i] = next(readers[i], None)
def trim_common_suffix(*sequences):
    """
    Trim a list of sequences by removing the longest common suffix while
    leaving all of them at least one character in length.

    Standard convention with VCF is to place an indel at the left-most
    position, but some tools add additional context to the right of the
    sequences (e.g. samtools). These common suffixes are undesirable when
    comparing variants, for example in variant databases.

    Always returns a new list (empty when called without arguments), also
    when there is nothing to trim.

    >>> trim_common_suffix('TATATATA', 'TATATA')
    ['TAT', 'T']

    >>> trim_common_suffix('ACCCCC', 'ACCCCCCCC', 'ACCCCCCC', 'ACCCCCCCCC')
    ['A', 'ACCC', 'ACC', 'ACCCC']
    """
    if not sequences:
        return []

    # Never trim the shortest sequence below one character. This is -1 when
    # an empty sequence is present, which disables trimming altogether.
    max_trim = min(len(seq) for seq in sequences) - 1

    # Walk from the right end, counting positions where all sequences agree.
    first = sequences[0]
    trim = 0
    while trim < max_trim:
        tail = first[-1 - trim]
        if any(seq[-1 - trim] != tail for seq in sequences):
            break
        trim += 1

    if trim == 0:
        # seq[:-0] would be empty, so copy the inputs unchanged instead.
        return list(sequences)
    return [seq[:-trim] for seq in sequences]