about | summary | refs | log | tree | commit | diff
path: root/src/select_mers.py
diff options
context:
space:
mode:
author: Calvin Morrison <mutantturkey@gmail.com>, 2014-06-25 13:39:03 -0400
committer: Calvin Morrison <mutantturkey@gmail.com>, 2014-06-25 13:39:03 -0400
commit: 2fca8ab9cf3e399e1f30c2f180bc8bd8f36c1f14 (patch)
tree: 247974ba0b6767757598f23786e746f9cbb1690e /src/select_mers.py
parent: 4b392a3fd57fb8702756c56a790e8b75c7d58a4c (diff)
really reduce memory usage of select_mers.py
Diffstat (limited to 'src/select_mers.py')
-rwxr-xr-x src/select_mers.py 70
1 files changed, 31 insertions, 39 deletions
diff --git a/src/select_mers.py b/src/select_mers.py
index 5f42717..ceec4a0 100755
--- a/src/select_mers.py
+++ b/src/select_mers.py
@@ -2,58 +2,50 @@
import sys
import os
-fg_mers = {}
-bg_mers = {}
-
fg_weight = float(os.environ.get("fg_weight", 0))
max_check = int(os.environ.get("max_check", 0))
-if(len(sys.argv) == 3):
- fg_count_fn = sys.argv[1]
- bg_count_fn = sys.argv[2]
-else:
- print len(sys.argv)
- sys.stderr.write("please specify your inputs\n")
- sys.stderr.write("ex: select_mers.py fg_counts bg_count\n")
- exit(1)
-
-
-# select mers based on our 'selectivity' measure. (count in fg) / (count in bg)
-def select_mers(fg_mers, bg_mers):
-
- # populate our bg_arr and fg_arr as well as our mer arr.
-
- score = {}
-
- for mer in fg_mers.keys():
- score[mer] = (fg_mers[mer] / bg_mers[mer]) * (fg_mers[mer]**fg_weight)
- sorted_scored_mers = sorted(score, key=score.get)
-
- for mer in sorted_scored_mers:
- print mer, int(fg_mers[mer]), int(bg_mers[mer]), (fg_mers[mer] / bg_mers[mer]) * (fg_mers[mer]**fg_weight)
+def main():
+ if(len(sys.argv) == 3):
+ fg_count_fn = sys.argv[1]
+ bg_count_fn = sys.argv[2]
+ else:
+ sys.stderr.write("please specify your inputs\n")
+ sys.stderr.write("ex: select_mers.py fg_counts bg_count\n")
+ exit(1)
-def main():
+ # mers dictionary:
+ #
+ # Key: mer name, eg AAAACT
+ # Value: fg_mer_count, bg_mer_count
+ mers = {}
fg_count_fh = open(fg_count_fn, "r")
bg_count_fh = open(bg_count_fn, "r")
- # copy in our fg_mers and counts
- for mers,fh in [(fg_mers, fg_count_fh), (bg_mers, bg_count_fh)]:
- for line in fh:
- (mer, count) = line.split()
- mers[mer] = float(count)
+ # copy in our foreground mers and counts into mers dictionary
+ for line in fg_count_fh:
+ (mer, count) = line.split()
+ mers[mer] = [float(count), 1]
- for mer in fg_mers.keys():
- if mer not in bg_mers:
- bg_mers[mer] = 1
+
+ for line in bg_count_fh:
+ (mer, count) = line.split()
+ if mer in mers:
+ mers[mer][1] = float(count)
+
+ score = []
+
+ for mer in mers:
+ score.append([mer, (mers[mer][0] / mers[mer][1]) * (mers[mer][0]**fg_weight)])
- for mer in bg_mers.keys():
- if mer not in fg_mers:
- del bg_mers[mer]
+ sorted_scored_mers = sorted(score, key=lambda x: x[1])
- selected = select_mers(fg_mers, bg_mers)
+ sys.stdout.write('#MERS\tFG_COUNT\tBG_COUNT\tSCORE\n')
+ for scores in sorted_scored_mers:
+ sys.stdout.write(scores[0] + '\t' + str(int(mers[scores[0]][0])) + '\t' + str(int(mers[scores[0]][1])) + '\t' + str(scores[1])+ '\n')
if __name__ == "__main__":
sys.exit(main())