about summary refs log tree commit diff
diff options
context:
space:
mode:
author Calvin Morrison <mutantturkey@gmail.com> 2014-06-25 13:39:03 -0400
committer Calvin Morrison <mutantturkey@gmail.com> 2014-06-25 13:39:03 -0400
commit 2fca8ab9cf3e399e1f30c2f180bc8bd8f36c1f14 (patch)
tree 247974ba0b6767757598f23786e746f9cbb1690e
parent 4b392a3fd57fb8702756c56a790e8b75c7d58a4c (diff)
really reduce memory usage of select_mers.py
-rwxr-xr-x src/score_mers.py 8
-rwxr-xr-x src/select_mers.py 70
2 files changed, 38 insertions, 40 deletions
diff --git a/src/score_mers.py b/src/score_mers.py
index 9d90ae2..0ddeb91 100755
--- a/src/score_mers.py
+++ b/src/score_mers.py
@@ -487,6 +487,8 @@ def main():
# load it into our fg and bg counts into their dictionaries
for mer in selected_mers:
+ if mer.startswith("#"):
+ continue
split_mer = mer.split()
fg_mers[split_mer[0]] = []
bg_mers[split_mer[0]] = int(split_mer[2])
@@ -494,7 +496,7 @@ def main():
selected_mers = [x.split()[0] for x in selected_mers]
if len(selected_mers) is 0:
- print "no merss found."
+ print "no mers found."
exit(1)
# we already have our background counts
@@ -511,6 +513,8 @@ def main():
combination_fh = open(args.combination_file, "r")
for line in combination_fh:
+ if line.startswith("#"):
+ continue
mers = line.split()
combinations.append(mers)
for mer in mers:
@@ -529,6 +533,8 @@ def main():
mer_fh = open(args.mer_file, "r")
for mer in mer_fh:
+ if mer.startswith("#"):
+ continue
mer = mer.strip()
if(len(mer.split()) > 1):
print "skipping line:", mer, "each line should contain only one mer"
diff --git a/src/select_mers.py b/src/select_mers.py
index 5f42717..ceec4a0 100755
--- a/src/select_mers.py
+++ b/src/select_mers.py
@@ -2,58 +2,50 @@
import sys
import os
-fg_mers = {}
-bg_mers = {}
-
fg_weight = float(os.environ.get("fg_weight", 0))
max_check = int(os.environ.get("max_check", 0))
-if(len(sys.argv) == 3):
- fg_count_fn = sys.argv[1]
- bg_count_fn = sys.argv[2]
-else:
- print len(sys.argv)
- sys.stderr.write("please specify your inputs\n")
- sys.stderr.write("ex: select_mers.py fg_counts bg_count\n")
- exit(1)
-
-
-# select mers based on our 'selectivity' measure. (count in fg) / (count in bg)
-def select_mers(fg_mers, bg_mers):
-
- # populate our bg_arr and fg_arr as well as our mer arr.
-
- score = {}
-
- for mer in fg_mers.keys():
- score[mer] = (fg_mers[mer] / bg_mers[mer]) * (fg_mers[mer]**fg_weight)
- sorted_scored_mers = sorted(score, key=score.get)
-
- for mer in sorted_scored_mers:
- print mer, int(fg_mers[mer]), int(bg_mers[mer]), (fg_mers[mer] / bg_mers[mer]) * (fg_mers[mer]**fg_weight)
+def main():
+ if(len(sys.argv) == 3):
+ fg_count_fn = sys.argv[1]
+ bg_count_fn = sys.argv[2]
+ else:
+ sys.stderr.write("please specify your inputs\n")
+ sys.stderr.write("ex: select_mers.py fg_counts bg_count\n")
+ exit(1)
-def main():
+ # mers dictionary:
+ #
+ # Key: mer name, eg AAAACT
+ # Value: fg_mer_count, bg_mer_count
+ mers = {}
fg_count_fh = open(fg_count_fn, "r")
bg_count_fh = open(bg_count_fn, "r")
- # copy in our fg_mers and counts
- for mers,fh in [(fg_mers, fg_count_fh), (bg_mers, bg_count_fh)]:
- for line in fh:
- (mer, count) = line.split()
- mers[mer] = float(count)
+ # copy in our foreground mers and counts into mers dictionary
+ for line in fg_count_fh:
+ (mer, count) = line.split()
+ mers[mer] = [float(count), 1]
- for mer in fg_mers.keys():
- if mer not in bg_mers:
- bg_mers[mer] = 1
+
+ for line in bg_count_fh:
+ (mer, count) = line.split()
+ if mer in mers:
+ mers[mer][1] = float(count)
+
+ score = []
+
+ for mer in mers:
+ score.append([mer, (mers[mer][0] / mers[mer][1]) * (mers[mer][0]**fg_weight)])
- for mer in bg_mers.keys():
- if mer not in fg_mers:
- del bg_mers[mer]
+ sorted_scored_mers = sorted(score, key=lambda x: x[1])
- selected = select_mers(fg_mers, bg_mers)
+ sys.stdout.write('#MERS\tFG_COUNT\tBG_COUNT\tSCORE\n')
+ for scores in sorted_scored_mers:
+ sys.stdout.write(scores[0] + '\t' + str(int(mers[scores[0]][0])) + '\t' + str(int(mers[scores[0]][1])) + '\t' + str(scores[1])+ '\n')
if __name__ == "__main__":
sys.exit(main())