aboutsummaryrefslogtreecommitdiff
path: root/src/score_mers.py
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-03-27 15:10:04 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2014-03-27 15:10:04 -0400
commitf5ac0df9d42de657a64e15d3f0cfa2198a1921c6 (patch)
tree68609ff01dfac4a0f402d7b5e259b3a29bf23596 /src/score_mers.py
parent495228f7167a6df24a139022e7a0560a4dd07b56 (diff)
Add scoring for specific combinations.
- add scoring for specific combos - update readme - add CLI args for score_mers.py - update score_wrapper - use more generic print functions in score_mers
Diffstat (limited to 'src/score_mers.py')
-rwxr-xr-xsrc/score_mers.py195
1 files changed, 123 insertions, 72 deletions
diff --git a/src/score_mers.py b/src/score_mers.py
index aaf8e13..5c5416a 100755
--- a/src/score_mers.py
+++ b/src/score_mers.py
@@ -2,6 +2,8 @@
import sys
import os
+import argparse
+
from multiprocessing import Pool
from multiprocessing import cpu_count
@@ -26,9 +28,6 @@ output_file = ""
# import our variables
cpus = int(os.environ.get("cpus", cpu_count()))
debug = os.environ.get("debug", False)
-min_mer_range = int(os.environ.get("min_mer_range", 6))
-max_mer_range = int(os.environ.get("max_mer_range", 12))
-min_mer_count = int(os.environ.get("min_mer_count", 0))
max_select = int(os.environ.get("max_select", 15))
max_check = int(os.environ.get("max_check", 35))
max_mer_distance = int(os.environ.get("max_mer_distance", 5000))
@@ -115,26 +114,7 @@ def populate_locations(selected_mers, mer_dic, input_fn):
mer_dic[selected_mers[int(mer)]].append(int(pos))
merlist_fh.close()
-
-
-
-def filter_mers(combination):
- '''
- filter out mers that are either inside other mers,
- or don't fit the heterodimer requirement.
- '''
- for combo in combinations(combination, 2):
- if heterodimer_dic[combo]:
- return True
-
- for mer in combination:
- for other_mer in combination:
- if not mer == other_mer:
- if mer in other_mer:
- return True
-
- return False
def check_feasible(selected):
total = 0
@@ -159,9 +139,53 @@ def percentage(part, whole, precision=2):
return str(percent) + "%"
-def score_mers(selected):
- import time
+def write_header(fh):
+ fh.write("Combination\tScore\tFG_mean_dist\tFG_stdev_dist\tBG_ratio\n")
+def write_result(fh, score_res):
+ combination, score_val, fg_mean_dist, fg_stddev_dist, bg_ratio = score_res
+ fh.write(str(combination) + "\t")
+ fh.write(str(score_val) + "\t")
+ fh.write(str(fg_mean_dist) + "\t")
+ fh.write(str(fg_stddev_dist) + "\t")
+ fh.write(str(bg_ratio) + "\n")
+
+def print_rejected(total_reject, total_checked, total_scored, excluded):
+ print ""
+ print "Reasons mers were excluded:\n"
+ print " max distance: " + percentage(excluded[0], total_reject) + " (" + str(excluded[0]) + ")"
+ print " mers overlap: " + percentage(excluded[1], total_reject) + " (" + str(excluded[1]) + ")"
+ print " heterodimers: " + percentage(excluded[2], total_reject) + " (" + str(excluded[2]) + ")"
+ print ""
+ print " total combinations checked: ", total_checked
+ print " total combinations scored: ", total_scored
+ print " percent rejected: " + percentage(total_reject, total_checked)
+ print ""
+
+def score_specific_combinations(mers):
+
+ total_scored = 0
+ total_checked = 0
+ excluded = [0, 0, 0]
+
+ p = Pool(cpus)
+
+ fh = open(output_file, 'wb')
+ write_header(fh)
+
+ score_it = p.map(score, mers)
+ for score_res in score_it:
+ if type(score_res) is list:
+ total_scored += 1
+ write_result(fh, score_res)
+ else:
+ excluded[score_res] += 1;
+
+ total_reject = len(mers) - total_scored
+ print_rejected(total_reject, len(mers), total_scored, excluded)
+
+def score_all_combinations(selected):
+ import time
total_scored = 0
total_checked = 0
@@ -172,7 +196,7 @@ def score_mers(selected):
p = Pool(cpus)
fh = open(output_file, 'wb')
- fh.write("Combination\tScore\tFG_mean_dist\tFG_stdev_dist\tBG_ratio\n")
+ write_header(fh)
for select_n in range(1, max_select+1):
print "scoring size ", select_n,
@@ -182,28 +206,15 @@ def score_mers(selected):
total_checked += 1
if type(score_res) is list:
total_scored += 1
- combination, score_val, fg_mean_dist, fg_stddev_dist, bg_ratio = score_res
- fh.write(str(combination) + "\t")
- fh.write(str(score_val) + "\t")
- fh.write(str(fg_mean_dist) + "\t")
- fh.write(str(fg_stddev_dist) + "\t")
- fh.write(str(bg_ratio) + "\n")
+ write_result(fh, score_res)
else:
excluded[score_res] += 1;
print "size ", select_n, "took:", time.time() - t
total_reject = total_checked - total_scored
- print ""
- print "Reasons mers were excluded:\n"
- print " max distance: " + percentage(excluded[0], total_reject) + " (" + str(excluded[0]) + ")"
- print " mers overlap: " + percentage(excluded[1], total_reject) + " (" + str(excluded[1]) + ")"
- print " heterodimers: " + percentage(excluded[2], total_reject) + " (" + str(excluded[2]) + ")"
- print ""
- print " total combinations checked: ", total_checked
- print " total combinations scored: ", total_scored
- print " percent rejected: " + percentage(total_reject, total_checked)
- print ""
+
+ print_rejected(total_reject, total_checked, total_scored, excluded)
if(total_scored == 0):
print "NO RESULTS FOUND"
@@ -329,52 +340,92 @@ def main():
'''
global fg_genome_length
global bg_genome_length
+ global seq_ends
global output_file
- if len(sys.argv) == 5:
- selectivity_fn = sys.argv[1]
- fg_fasta_fn = sys.argv[2]
- bg_fasta_fn = sys.argv[3]
- output_file = sys.argv[4]
- else:
- print "please specify your inputs"
- print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file"
+ parser = argparse.ArgumentParser(description="score mers")
+ parser.add_argument("-f", "--foreground", help="foreground fasta file", required=True)
+ parser.add_argument("-b", "--background", help="background fasta file", required=True)
+ parser.add_argument("-o", "--output", help="output fasta with UIDs in the file", required=True)
+ parser.add_argument("-s", "--selectivity-file", help="mer selectivity file generated by select_mers.py", required=False)
+ parser.add_argument("-c", "--combination-file", help="a set of combinations you want to score", required=False)
+
+ args = parser.parse_args()
+
+ if args.selectivity_file is None and args.combination_file is None:
+ print "you must either have a selectivity file or a combination file to score from"
exit()
+ if args.selectivity_file is not None and args.combination_file is not None:
+ print "you can only select either a selectivity file or a combination file to score from"
+ exit()
+
+ output_file = args.output
+
+ print "Getting genome length"
+ fg_genome_length = get_length(args.foreground)
+ bg_genome_length = get_length(args.background)
+
+ print "Populating sequence end points"
+ seq_ends = load_end_points(args.foreground)
- fg_genome_length = get_length(fg_fasta_fn)
- bg_genome_length = get_length(bg_fasta_fn)
- selectivity_fh = open(selectivity_fn, "r")
+ if(args.selectivity_file is not None):
+
+ print "Scoring all mer combinations"
+
+ selectivity_fh = open(args.selectivity_file, "r")
- # load our mer list into python
- mer_selectivity = selectivity_fh.readlines()
+ # load our mer list into python
+ mer_selectivity = selectivity_fh.readlines()
- # get the last max_check (it's sorted)
- selected_mers = mer_selectivity[-max_check:]
+ # get the last max_check (it's sorted)
+ selected_mers = mer_selectivity[-max_check:]
- # load it into our fg and bg counts into their dictionaries
- for mer in selected_mers:
- split_mer = mer.split()
- fg_mers[split_mer[0]] = []
- bg_mers[split_mer[0]] = int(split_mer[2])
+ # load it into our fg and bg counts into their dictionaries
+ for mer in selected_mers:
+ split_mer = mer.split()
+ fg_mers[split_mer[0]] = []
+ bg_mers[split_mer[0]] = int(split_mer[2])
- selected_mers = [x.split()[0] for x in selected_mers]
+ selected_mers = [x.split()[0] for x in selected_mers]
- print "Populating sequence end points"
- global seq_ends
- seq_ends = load_end_points(fg_fasta_fn)
+ print "Populating foreground locations"
+ populate_locations(selected_mers, fg_mers, args.foreground)
- print "Populating foreground locations"
- populate_locations(selected_mers, fg_mers, fg_fasta_fn)
+ print "calculating heterodimer distances"
+ load_heterodimer_dic(selected_mers)
- print "calculating heterodimer distances"
- load_heterodimer_dic(selected_mers)
+ print "scoring mer combinations"
+ score_all_combinations(selected_mers)
+
+ else:
+ print "Scoring specific mer combinations"
+
+ combinations = []
+
+ combination_fh = open(args.combination_file, "r")
+ for line in combination_fh:
+ mers = line.split()
+ combinations.append(mers)
+ for mer in mers:
+ fg_mers[mer] = []
+ bg_mers[mer] = []
+
+ print "calculating heterodimer distances"
+ load_heterodimer_dic(fg_mers.keys())
- print "scoring mer combinations"
- score_mers(selected_mers)
+ print "Populating foreground locations"
+ populate_locations(fg_mers.keys(), fg_mers, args.foreground)
+
+ print "Populating background locations"
+ populate_locations(fg_mers.keys(), bg_mers, args.background)
+
+ for mer in bg_mers:
+ bg_mers[mer] = len(bg_mers[mer])
+
+ score_specific_combinations(combinations)
print "output_file:", output_file
-
if __name__ == "__main__":
sys.exit(main())