#!/usr/bin/env python from collections import namedtuple from operator import itemgetter Annotation = namedtuple('Annotation', ['name', 'start', 'end', 'strand']) Event = namedtuple('Event', ['location', 'annotation', 'prediction']) def parse_readlengths(filename): with open(filename) as f: for line in f: name, length = line.strip().split(',') yield name, int(length) def parse_csv(filename): with open(filename) as f: for line in f: name, start, end, strand = line.strip().split(',') yield Annotation(name, int(start), int(end), 1 if strand == '+' else -1) def events(read_lengths, annotations, predictions): names = dict() total = 0 for name, length in parse_readlengths(read_lengths): names[name] = total total += length for name, start, end, strand in parse_csv(annotations): yield Event(names[name] + start, strand, None) yield Event(names[name] + end, 0, None) for name, start, end, strand in parse_csv(predictions): yield Event(names[name] + start, None, strand) yield Event(names[name] + end, None, 0) def rates(read_lengths, annotations, predictions): rates = dict(tp=0, fp=0, tn=0, fn=0) cl = 0 # current location ca = 0 # current annotation cp = 0 # current prediction for l, a, p in sorted(events(read_lengths, annotations, predictions), key=itemgetter(0)): if ca == 0 and cp == 0: rates['tn'] += l - cl elif ca == cp: rates['tp'] += l - cl elif ca == 0 and cp != 0: rates['fp'] += l - cl elif ca != 0 and cp == 0: rates['fn'] += l - cl else: # different strands rates['fp'] += l - cl if a is not None: ca = a if p is not None: cp = p cl = l return rates body = '{:<10}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}' head = '{:<10}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}' print(head.format('tool', 'TP', 'FP', 'TN', 'FN', 'precision', 'sensitivity', 'specificity', 'NPV', 'MCC')) for tool in ['FGS', 'FGS+', 'prodigal', 'FGSrs']: r = rates('readlengths.csv', 'annotations.csv', f'{tool}.csv') tp, fp, tn, fn = r['tp'], r['fp'], r['tn'], r['fn'] t = tp + fp + tn + fn print(body.format( tool, tp / t, fp / t, tn / t, fn / t, tp / (tp + fp), tp / (tp + fn), tn / (tn + fp), tn / (tn + fn), (tp * tn - fp * fn) / ((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))**0.5 ))