# Reconstructed post-commit contents of src/select_mers.py, from commit
# 2fca8ab9cf3e399e1f30c2f180bc8bd8f36c1f14 ("really reduce memory usage of
# select_mers.py", Calvin Morrison, Wed, 25 Jun 2014).
# The diff hunk starts at line 2 of the file, so the file's first line is not
# visible in the patch and is not reproduced here.
"""Rank mers by selectivity: (count in fg) / (count in bg), weighted by fg count.

Usage: select_mers.py fg_counts bg_count

Each input file holds one ``<mer> <count>`` pair per line, separated by
whitespace. A tab-separated table, sorted by ascending score, is written to
standard output.
"""
import sys
import os

# Exponent applied to the foreground count when scoring; taken from the
# environment, 0 when unset (so the score is the plain fg/bg ratio).
fg_weight = float(os.environ.get("fg_weight", 0))
# Read from the environment but not referenced anywhere in the visible code.
max_check = int(os.environ.get("max_check", 0))


def _read_counts(path):
    """Yield ``(mer, count)`` pairs from the count file at *path*.

    Blank lines are skipped. The file is closed once it is exhausted.
    Raises ValueError if a non-blank line does not hold exactly two fields
    or its count is not a number.
    """
    with open(path, "r") as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                continue
            mer, count = fields
            yield mer, float(count)


def _selectivity(fg_count, bg_count, weight):
    """Return (fg_count / bg_count) * fg_count ** weight."""
    return (fg_count / bg_count) * (fg_count ** weight)


def main():
    """Score every foreground mer against the background and print a table.

    Reads the two count file names from ``sys.argv``; exits with status 1
    and a usage message on stderr when they are not both given.
    """
    if len(sys.argv) != 3:
        sys.stderr.write("please specify your inputs\n")
        sys.stderr.write("ex: select_mers.py fg_counts bg_count\n")
        sys.exit(1)

    fg_count_fn, bg_count_fn = sys.argv[1:3]

    # mers dictionary:
    #
    # Key: mer name, eg AAAACT
    # Value: [fg_mer_count, bg_mer_count]
    #
    # Every foreground mer starts with a background pseudo-count of 1 so a
    # mer that never shows up in the background can still be scored.
    mers = {}
    for mer, count in _read_counts(fg_count_fn):
        mers[mer] = [count, 1]

    # Only mers already seen in the foreground are kept, so background-only
    # mers never take up memory. A non-positive background count is treated
    # like an absent mer (pseudo-count stays 1) to avoid dividing by zero.
    for mer, count in _read_counts(bg_count_fn):
        if mer in mers and count > 0:
            mers[mer][1] = count

    scored = [
        (mer, _selectivity(counts[0], counts[1], fg_weight))
        for mer, counts in mers.items()
    ]
    # Sort on the score alone; ties keep their dictionary iteration order.
    scored.sort(key=lambda pair: pair[1])

    write = sys.stdout.write
    write('#MERS\tFG_COUNT\tBG_COUNT\tSCORE\n')
    for mer, score in scored:
        fg_count, bg_count = mers[mer]
        row = (mer, str(int(fg_count)), str(int(bg_count)), str(score))
        write('\t'.join(row) + '\n')


if __name__ == "__main__":
    sys.exit(main())