aboutsummaryrefslogtreecommitdiff
path: root/src/select_mers.py
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-01-29 11:53:30 -0500
committerCalvin Morrison <mutantturkey@gmail.com>2014-01-29 11:53:30 -0500
commit94d04a1e503121a98b403f882c18a4f0799267d7 (patch)
tree0d2cf5586b31bddc9bca99b4b07ebb4b993f1130 /src/select_mers.py
parent73531da5cdf33f9bde7d4db0e4ce96f1e41f581b (diff)
add filtering based on consecutive mer lengths
Diffstat (limited to 'src/select_mers.py')
-rwxr-xr-xsrc/select_mers.py83
1 files changed, 83 insertions, 0 deletions
diff --git a/src/select_mers.py b/src/select_mers.py
new file mode 100755
index 0000000..21306cc
--- /dev/null
+++ b/src/select_mers.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+import sys
+import os
+
+fg_mers = {}
+bg_mers = {}
+
+min_mer_count = int(os.environ.get("min_mer_count", 0));
+
+if(len(sys.argv) == 5):
+ fg_count_fn = sys.argv[1]
+ fg_fasta_fn = sys.argv[2]
+ bg_count_fn = sys.argv[3]
+ bg_fasta_fn = sys.argv[4]
+
+ fg_genome_length = os.path.getsize(fg_fasta_fn)
+ bg_genome_length = os.path.getsize(bg_fasta_fn)
+else:
+ print len(sys.argv)
+ print "please specify your inputs"
+ print "ex: select_mers.py fg_counts fg_fasta bg_counts bg_fasta"
+ exit()
+
+
+# select mers based on our 'selectivity' measure. (count in fg) / (count in bg)
+def select_mers(fg_mers, bg_mers):
+ import numpy as np
+
+ mers = [] # contains mer strings
+ fg_arr = [] # contains fg counts
+ bg_arr = [] # contains bg counts
+
+ # populate our bg_arr and fg_arr as well as our mer arr.
+ for mer in fg_mers.keys():
+ mers.append(mer);
+ bg_arr.append(bg_mers.get(mer, 1));
+ fg_arr.append(fg_mers[mer]);
+
+ fg_arr = np.array(fg_arr, dtype='f');
+ bg_arr = np.array(bg_arr, dtype='f');
+
+ selectivity = (fg_arr / bg_arr)
+
+ arr = [(mers[i], fg_arr[i], bg_arr[i], selectivity[i]) for i in range(len(mers))]
+
+ # filter results less than 1 ( indicates that the bg is more present than the fg)
+ # arr = filter(lambda i: i[3] > 1, arr)
+
+ # sort by the selectivity
+ arr = sorted(arr, key = lambda row: row[3])
+
+ # return only our mers, without our selectivity scores
+ return arr
+
+
+def main():
+
+ fg_count_fh = open(fg_count_fn, "r")
+ bg_count_fh = open(bg_count_fn, "r")
+
+ # copy in our fg_mers and counts
+ for mers,fh in [(fg_mers, fg_count_fh), (bg_mers, bg_count_fh)]:
+ for line in fh:
+ (mer, count) = line.split()
+ mers[mer] = int(count)
+
+ if min_mer_count >= 1:
+ for mer in fg_mers.keys():
+ if(fg_mers[mer] < min_mer_count):
+ del fg_mers[mer]
+ if mer in bg_mers:
+ del bg_mers[mer]
+
+ for mer in bg_mers.keys():
+ if mer not in fg_mers:
+ del bg_mers[mer]
+
+ selected = select_mers(fg_mers, bg_mers)
+ for row in selected:
+ print row[0] +"\t"+str("%d" % row[1]) + "\t" + str("%d" % row[2]) + "\t" + str("%.5f" % row[3])
+
+if __name__ == "__main__":
+ sys.exit(main())