From 59d6b220c2a5dd7bae81404f1cf4a5e6aa0089c3 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Fri, 17 Jan 2014 15:03:07 -0500 Subject: major select_mers.py changes --- select_mers.py | 204 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 114 insertions(+), 90 deletions(-) diff --git a/select_mers.py b/select_mers.py index 9dea7fe..380e4fd 100644 --- a/select_mers.py +++ b/select_mers.py @@ -11,6 +11,13 @@ import pdb fg_mers = {} bg_mers = {} + +fg_count_fn = sys.argv[1] +fg_fasta_fn = sys.argv[2] + +bg_count_fn = sys.argv[3] +bg_fasta_fn = sys.argv[4] + # empty class to fill up mer information with class Mer: pass @@ -23,108 +30,132 @@ min_mer_range = int(os.getenv("min_mer_range")); max_mer_range = int(os.getenv("max_mer_range")); min_mer_count = int(os.getenv("min_mer_count")); max_select = int(os.getenv("max_select")); +max_mer_distance = int(os.getenv("max_mer_distance")); + +def populate_locations(input_fn, mers, mer): -def populate_locations(input_fn, mers, selected): - mer_str = ' '.join(selected) - cmd = './strstream ' + mer_str + " < " + input_fn + + cmd = 'strstreamone ' + mer + " < " + input_fn strstream = Popen(cmd, stdout=PIPE, shell=True) for line in strstream.stdout: - (index, pos) = line.split(' ') - mers[selected[int(index)]].pts.append(int(pos)) + mers[mer].pts.append(int(line)) + return None -def select_mers(selected): +def score_mers(selected): from itertools import combinations + import time scores = [] p = Pool() - for select_n in range(1, max_select): + for select_n in range(1, max_select+1): + t = time.time() print "scoring size ", select_n scores_it = [] - scores_it = p.map(score, combinations(selected, select_n)) + scores_it = p.imap_unordered(score, combinations(selected, select_n)) + scores_it = filter(lambda x: x is not None, scores_it) scores = scores + scores_it + print time.time() - t + - p.close() - scores = sorted(scores, key=lambda score: score.score) return scores def score(combination): - - # print "scoring", combination +# input is a string of mers like +# ['ACCAA', 'ACCCGA', 'ACGTATA'] - fg_pts_arr = [] - fg_dist_arr = [] - bg_pts_arr = [] - bg_dist_arr = [] + for mer in combination: + for other_mer in combination: + if not mer == other_mer: + if mer in other_mer: + return None + + + fg_pts = [] + fg_dist = [] + + bg_pts = [] + bg_dist = [] for mer in combination: - fg_pts_arr = fg_pts_arr + fg_mers[mer].pts - bg_pts_arr = bg_pts_arr + bg_mers[mer].pts + fg_pts = fg_pts + fg_mers[mer].pts + bg_pts = bg_pts + bg_mers[mer].pts - fg_pts_arr.sort() - bg_pts_arr.sort() + fg_pts.sort() + bg_pts.sort() # remove any exact duplicates - fg_pts_arr = list(set(fg_pts_arr)) - bg_pts_arr = list(set(bg_pts_arr)) + # fg_pts = list(set(fg_pts)) + # bg_pts = list(set(bg_pts)) # distances - for i in range(1, len(fg_pts_arr)): - fg_dist_arr.append(abs(fg_pts_arr[i-1] - fg_pts_arr[i])); - - for i in range(1, len(bg_pts_arr)): - bg_dist_arr.append(abs(bg_pts_arr[i-1] - bg_pts_arr[i])); + fg_dist = np.array([abs(fg_pts[i] - fg_pts[i-1]) for i in range(1, len(fg_pts))]) + bg_dist = np.array([abs(bg_pts[i] - bg_pts[i-1]) for i in range(1, len(bg_pts))]) - fg_dist_arr = np.array(fg_dist_arr) - bg_dist_arr = np.array(bg_dist_arr) + if any(dist > max_mer_distance for dist in fg_dist): + return None nb_primers = len(combination) - fg_mean_dist = np.mean(fg_dist_arr) - fg_variance_dist = np.var(fg_dist_arr) - bg_mean_dist = np.mean(bg_dist_arr) - bg_variance_dist = np.var(bg_dist_arr) + fg_mean_dist = np.mean(fg_dist) + fg_variance_dist = np.var(fg_dist) + bg_mean_dist = np.mean(bg_dist) + bg_variance_dist = np.var(bg_dist) # this is our equation - - score = Score() - score.mers = combination - score.score = (nb_primers * (fg_mean_dist * fg_variance_dist)) / (bg_mean_dist * bg_variance_dist) - # score.bg_dist_arr = bg_dist_arr - # score.fg_dist_arr = fg_dist_arr - return score + score = (nb_primers * fg_mean_dist * fg_variance_dist) / ((bg_mean_dist * bg_variance_dist) + .000001) + + return (combination, score) -def most_frequent(mers, select_nb): - print "selecting the top ", select_nb, " mers" - selected = [] - for mer in mers.keys(): - selected.append([mer, mers[mer].count]) - - selected = sorted(selected, key=lambda mer: mer[1]) - selected = selected[-select_nb:] - selected = list(row[0] for row in selected) +# select mers based on our 'selectivity' measure. (count in fg) / (count in bg) +def select_mers(fg_mers, bg_mers, select_nb): - return selected + mers = [] # contains mer strings + fg_arr = [] # contains fg counts + bg_arr = [] # contains bg counts + # populate our bg_arr and fg_arr as well as our mer arr. + for mer in fg_mers.keys(): + mers.append(mer); + bg_arr.append(bg_mers[mer].count); + fg_arr.append(fg_mers[mer].count); -def main(): - fg_count_fn = sys.argv[1] - fg_fasta_fn = sys.argv[2] + fg_arr = np.array(fg_arr, dtype='f'); + bg_arr = np.array(bg_arr, dtype='f'); - bg_count_fn = sys.argv[3] - bg_fasta_fn = sys.argv[4] + selectivity = fg_arr - bg_arr + arr = [(mers[i], fg_arr[i], bg_arr[i], selectivity[i]) for i in range(len(mers))] + + # filter results less than 1 ( indicates that the bg is more present than the fg) + # arr = filter(lambda i: i[3] > 1, arr) + + # sort by the selectivity + arr = sorted(arr, key = lambda row: row[3]) + + # return only our mers, without our selectivity scores + arr = list(row[0] for row in arr) + return arr + +def pop_fg(mer): + populate_locations(fg_fasta_fn, fg_mers, mer) + +def pop_bg(mer): + populate_locations(bg_fasta_fn, bg_mers, mer) + +def main(): + import time fg_count_fh = open(fg_count_fn, "r") bg_count_fh = open(bg_count_fn, "r") @@ -135,28 +166,16 @@ def main(): # copy in our fg_mers and counts for mers,fh in [(fg_mers, fg_count_fh), (bg_mers, bg_count_fh)]: + t = time.time() for line in fh: (mer, count) = line.split() - count = int(count) mers[mer] = Mer() - mers[mer].count = count + mers[mer].count = int(count) mers[mer].pts = [] + print time.time() - t - # if min_mer_count is less than one, then we are looking at a bottom cutoff threshold - if(min_mer_count < 1 and min_mer_count > 0): - pass - # counts = [] - # for mer in fg_mers.keys(): - # counts.append(fg_mers[mer]['count']) - # - # counts = counts.sort() - # - # count_set = set(counts): - # len(count_set) - # count_set( - # if it is 1 or great then use it as a numerical cutoff - print "removing mers that don't meet our minimum count" if min_mer_count >= 1: + print "removing that are less frequent than: ", min_mer_count for mer in fg_mers.keys(): if(fg_mers[mer].count < min_mer_count): del fg_mers[mer] @@ -176,38 +195,43 @@ def main(): bg_mers[mer].pts = [0, bg_genome_length] - exhaustive = True + exhaustive = False if exhaustive: selected = fg_mers.keys() - if not exhaustive: - selected = most_frequent(fg_mers, max_select) + else: + selected = select_mers(fg_mers, bg_mers, max_select) + selected = selected[-50:] + print selected - print "selected the top ", max_select, " mers" - print "selected:", ", ".join(selected) + pdb.set_trace() + + # print "selected the top ", max_select, " mers" + # print "selected:", ", ".join(selected) print "Populating foreground locations" - populate_locations(fg_fasta_fn, fg_mers, selected) - print "Populating background locations" - populate_locations(bg_fasta_fn, bg_mers, selected) - scores = select_mers(selected) + # p = Pool() + + map(pop_fg, selected) + map(pop_bg, selected) + pdb.set_trace() + + scores = score_mers(selected) + pdb.set_trace() print "fg_genome_length", fg_genome_length print "bg_genome_length", bg_genome_length + sys.stdout.write("scores:\n"); for score in scores: - sys.stdout.write(str(score.score) + "\t") - sys.stdout.write(", ".join(list(score.mers)) + "\t") - # sys.stdout.write("\t".join(score.dist_arr)); - sys.stdout.write("\n"); - - #matplotlib.pyplot.hist(score.fg_dist_arr) - #matplotlib.pyplot.savefig('graph/' + str(x) + ".png") - #matplotlib.pyplot.clf() - import pdb - pdb.set_trace() + if score is not "Distance": + sys.stdout.write(", ".join(list(score[0])) + "\t") + sys.stdout.write(str(score[1]) + "\t") + # sys.stdsys.stdout.write("\t".join(score.dist_arr)); + sys.stdout.write("\n"); + if __name__ == "__main__": sys.exit(main()) -- cgit v1.2.3