From 6a7295a258932a78c616ea4f8f591eb5f0191da4 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Wed, 8 Jan 2014 11:20:39 -0500 Subject: initial commit --- select_mers.py | 213 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 select_mers.py (limited to 'select_mers.py') diff --git a/select_mers.py b/select_mers.py new file mode 100644 index 0000000..9dea7fe --- /dev/null +++ b/select_mers.py @@ -0,0 +1,213 @@ +import sys +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot +import os + +from multiprocessing import Pool +from subprocess import * +import numpy as np +import pdb + +fg_mers = {} +bg_mers = {} +# empty class to fill up mer information with +class Mer: + pass + +class Score: + pass + +# import our variables +min_mer_range = int(os.getenv("min_mer_range")); +max_mer_range = int(os.getenv("max_mer_range")); +min_mer_count = int(os.getenv("min_mer_count")); +max_select = int(os.getenv("max_select")); + +def populate_locations(input_fn, mers, selected): + + mer_str = ' '.join(selected) + cmd = './strstream ' + mer_str + " < " + input_fn + + strstream = Popen(cmd, stdout=PIPE, shell=True) + for line in strstream.stdout: + (index, pos) = line.split(' ') + mers[selected[int(index)]].pts.append(int(pos)) + + + return None + +def select_mers(selected): + from itertools import combinations + + scores = [] + + p = Pool() + + for select_n in range(1, max_select): + print "scoring size ", select_n + scores_it = [] + scores_it = p.map(score, combinations(selected, select_n)) + scores = scores + scores_it + + + p.close() + + + scores = sorted(scores, key=lambda score: score.score) + return scores + +def score(combination): + + # print "scoring", combination + + fg_pts_arr = [] + fg_dist_arr = [] + + bg_pts_arr = [] + bg_dist_arr = [] + + for mer in combination: + fg_pts_arr = fg_pts_arr + fg_mers[mer].pts + bg_pts_arr = bg_pts_arr + bg_mers[mer].pts + + fg_pts_arr.sort() + bg_pts_arr.sort() + + # remove any exact duplicates + fg_pts_arr = list(set(fg_pts_arr)) + bg_pts_arr = list(set(bg_pts_arr)) + + # distances + for i in range(1, len(fg_pts_arr)): + fg_dist_arr.append(abs(fg_pts_arr[i-1] - fg_pts_arr[i])); + + for i in range(1, len(bg_pts_arr)): + bg_dist_arr.append(abs(bg_pts_arr[i-1] - bg_pts_arr[i])); + + fg_dist_arr = np.array(fg_dist_arr) + bg_dist_arr = np.array(bg_dist_arr) + + nb_primers = len(combination) + fg_mean_dist = np.mean(fg_dist_arr) + fg_variance_dist = np.var(fg_dist_arr) + bg_mean_dist = np.mean(bg_dist_arr) + bg_variance_dist = np.var(bg_dist_arr) + + # this is our equation + + score = Score() + score.mers = combination + score.score = (nb_primers * (fg_mean_dist * fg_variance_dist)) / (bg_mean_dist * bg_variance_dist) + # score.bg_dist_arr = bg_dist_arr + # score.fg_dist_arr = fg_dist_arr + return score + + + +def most_frequent(mers, select_nb): + print "selecting the top ", select_nb, " mers" + selected = [] + for mer in mers.keys(): + selected.append([mer, mers[mer].count]) + + selected = sorted(selected, key=lambda mer: mer[1]) + selected = selected[-select_nb:] + selected = list(row[0] for row in selected) + + return selected + + +def main(): + fg_count_fn = sys.argv[1] + fg_fasta_fn = sys.argv[2] + + bg_count_fn = sys.argv[3] + bg_fasta_fn = sys.argv[4] + + fg_count_fh = open(fg_count_fn, "r") + bg_count_fh = open(bg_count_fn, "r") + + # get our genome length + fg_genome_length = os.path.getsize(fg_fasta_fn) + bg_genome_length = os.path.getsize(bg_fasta_fn) + + # copy in our fg_mers and counts + for mers,fh in [(fg_mers, fg_count_fh), (bg_mers, bg_count_fh)]: + + for line in fh: + (mer, count) = line.split() + count = int(count) + mers[mer] = Mer() + mers[mer].count = count + mers[mer].pts = [] + + # if min_mer_count is less than one, then we are looking at a bottom cutoff threshold + if(min_mer_count < 1 and min_mer_count > 0): + pass + # counts = [] + # for mer in fg_mers.keys(): + # counts.append(fg_mers[mer]['count']) + # + # counts = counts.sort() + # + # count_set = set(counts): + # len(count_set) + # count_set( + # if it is 1 or great then use it as a numerical cutoff + print "removing mers that don't meet our minimum count" + if min_mer_count >= 1: + for mer in fg_mers.keys(): + if(fg_mers[mer].count < min_mer_count): + del fg_mers[mer] + if mer in bg_mers: + del bg_mers[mer] + + print "removing useless mers from the background" + for mer in bg_mers.keys(): + if mer not in fg_mers: + del bg_mers[mer] + + print "adding empty mers to the background" + for mer in fg_mers: + if mer not in bg_mers: + bg_mers[mer] = Mer() + bg_mers[mer].count = 2 + bg_mers[mer].pts = [0, bg_genome_length] + + + exhaustive = True + + if exhaustive: + selected = fg_mers.keys() + if not exhaustive: + selected = most_frequent(fg_mers, max_select) + + print "selected the top ", max_select, " mers" + print "selected:", ", ".join(selected) + + print "Populating foreground locations" + populate_locations(fg_fasta_fn, fg_mers, selected) + print "Populating background locations" + populate_locations(bg_fasta_fn, bg_mers, selected) + + scores = select_mers(selected) + + print "fg_genome_length", fg_genome_length + print "bg_genome_length", bg_genome_length + + + for score in scores: + sys.stdout.write(str(score.score) + "\t") + sys.stdout.write(", ".join(list(score.mers)) + "\t") + # sys.stdout.write("\t".join(score.dist_arr)); + sys.stdout.write("\n"); + + #matplotlib.pyplot.hist(score.fg_dist_arr) + #matplotlib.pyplot.savefig('graph/' + str(x) + ".png") + #matplotlib.pyplot.clf() + import pdb + pdb.set_trace() + +if __name__ == "__main__": + sys.exit(main()) -- cgit v1.2.3