From 94d04a1e503121a98b403f882c18a4f0799267d7 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Wed, 29 Jan 2014 11:53:30 -0500 Subject: add filtering based on consecutive mer lengths --- select_mers.py | 83 ---------------------------------------------------------- 1 file changed, 83 deletions(-) delete mode 100755 select_mers.py (limited to 'select_mers.py') diff --git a/select_mers.py b/select_mers.py deleted file mode 100755 index 21306cc..0000000 --- a/select_mers.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python -import sys -import os - -fg_mers = {} -bg_mers = {} - -min_mer_count = int(os.environ.get("min_mer_count", 0)); - -if(len(sys.argv) == 5): - fg_count_fn = sys.argv[1] - fg_fasta_fn = sys.argv[2] - bg_count_fn = sys.argv[3] - bg_fasta_fn = sys.argv[4] - - fg_genome_length = os.path.getsize(fg_fasta_fn) - bg_genome_length = os.path.getsize(bg_fasta_fn) -else: - print len(sys.argv) - print "please specify your inputs" - print "ex: select_mers.py fg_counts fg_fasta bg_counts bg_fasta" - exit() - - -# select mers based on our 'selectivity' measure. (count in fg) / (count in bg) -def select_mers(fg_mers, bg_mers): - import numpy as np - - mers = [] # contains mer strings - fg_arr = [] # contains fg counts - bg_arr = [] # contains bg counts - - # populate our bg_arr and fg_arr as well as our mer arr. - for mer in fg_mers.keys(): - mers.append(mer); - bg_arr.append(bg_mers.get(mer, 1)); - fg_arr.append(fg_mers[mer]); - - fg_arr = np.array(fg_arr, dtype='f'); - bg_arr = np.array(bg_arr, dtype='f'); - - selectivity = (fg_arr / bg_arr) - - arr = [(mers[i], fg_arr[i], bg_arr[i], selectivity[i]) for i in range(len(mers))] - - # filter results less than 1 ( indicates that the bg is more present than the fg) - # arr = filter(lambda i: i[3] > 1, arr) - - # sort by the selectivity - arr = sorted(arr, key = lambda row: row[3]) - - # return only our mers, without our selectivity scores - return arr - - -def main(): - - fg_count_fh = open(fg_count_fn, "r") - bg_count_fh = open(bg_count_fn, "r") - - # copy in our fg_mers and counts - for mers,fh in [(fg_mers, fg_count_fh), (bg_mers, bg_count_fh)]: - for line in fh: - (mer, count) = line.split() - mers[mer] = int(count) - - if min_mer_count >= 1: - for mer in fg_mers.keys(): - if(fg_mers[mer] < min_mer_count): - del fg_mers[mer] - if mer in bg_mers: - del bg_mers[mer] - - for mer in bg_mers.keys(): - if mer not in fg_mers: - del bg_mers[mer] - - selected = select_mers(fg_mers, bg_mers) - for row in selected: - print row[0] +"\t"+str("%d" % row[1]) + "\t" + str("%d" % row[2]) + "\t" + str("%.5f" % row[3]) - -if __name__ == "__main__": - sys.exit(main()) -- cgit v1.2.3