From 94d04a1e503121a98b403f882c18a4f0799267d7 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Wed, 29 Jan 2014 11:53:30 -0500 Subject: add filtering based on consecutive mer lengths --- src/select_mers.py | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100755 src/select_mers.py (limited to 'src/select_mers.py') diff --git a/src/select_mers.py b/src/select_mers.py new file mode 100755 index 0000000..21306cc --- /dev/null +++ b/src/select_mers.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +import sys +import os + +fg_mers = {} +bg_mers = {} + +min_mer_count = int(os.environ.get("min_mer_count", 0)); + +if(len(sys.argv) == 5): + fg_count_fn = sys.argv[1] + fg_fasta_fn = sys.argv[2] + bg_count_fn = sys.argv[3] + bg_fasta_fn = sys.argv[4] + + fg_genome_length = os.path.getsize(fg_fasta_fn) + bg_genome_length = os.path.getsize(bg_fasta_fn) +else: + print len(sys.argv) + print "please specify your inputs" + print "ex: select_mers.py fg_counts fg_fasta bg_counts bg_fasta" + exit() + + +# select mers based on our 'selectivity' measure. (count in fg) / (count in bg) +def select_mers(fg_mers, bg_mers): + import numpy as np + + mers = [] # contains mer strings + fg_arr = [] # contains fg counts + bg_arr = [] # contains bg counts + + # populate our bg_arr and fg_arr as well as our mer arr. + for mer in fg_mers.keys(): + mers.append(mer); + bg_arr.append(bg_mers.get(mer, 1)); + fg_arr.append(fg_mers[mer]); + + fg_arr = np.array(fg_arr, dtype='f'); + bg_arr = np.array(bg_arr, dtype='f'); + + selectivity = (fg_arr / bg_arr) + + arr = [(mers[i], fg_arr[i], bg_arr[i], selectivity[i]) for i in range(len(mers))] + + # filter results less than 1 ( indicates that the bg is more present than the fg) + # arr = filter(lambda i: i[3] > 1, arr) + + # sort by the selectivity + arr = sorted(arr, key = lambda row: row[3]) + + # return only our mers, without our selectivity scores + return arr + + +def main(): + + fg_count_fh = open(fg_count_fn, "r") + bg_count_fh = open(bg_count_fn, "r") + + # copy in our fg_mers and counts + for mers,fh in [(fg_mers, fg_count_fh), (bg_mers, bg_count_fh)]: + for line in fh: + (mer, count) = line.split() + mers[mer] = int(count) + + if min_mer_count >= 1: + for mer in fg_mers.keys(): + if(fg_mers[mer] < min_mer_count): + del fg_mers[mer] + if mer in bg_mers: + del bg_mers[mer] + + for mer in bg_mers.keys(): + if mer not in fg_mers: + del bg_mers[mer] + + selected = select_mers(fg_mers, bg_mers) + for row in selected: + print row[0] +"\t"+str("%d" % row[1]) + "\t" + str("%d" % row[2]) + "\t" + str("%.5f" % row[3]) + +if __name__ == "__main__": + sys.exit(main()) -- cgit v1.2.3