From 42f2b64fffdbc16912c17c599e0f4fb417e4dcbe Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Tue, 21 Jan 2014 15:42:53 -0500 Subject: split out score and selection --- Makefile | 2 +- SelectiveGenomeAmplification | 22 +++-- score_mers.py | 181 +++++++++++++++++++++++++++++++++++++++ select_mers.py | 200 ++++++------------------------------------- 4 files changed, 220 insertions(+), 185 deletions(-) create mode 100755 score_mers.py diff --git a/Makefile b/Makefile index 09a157c..8837865 100644 --- a/Makefile +++ b/Makefile @@ -20,5 +20,5 @@ clean: rm -vf $(BIN)/strstream $(BIN)/melting_range $(BIN)/strstreamone install: all - install -c $(BIN)/strstream $(BIN)/melting_range $(BIN)/strstreamone SelectiveGenomeAmplification select_mers.py $(DEST) + install -c $(BIN)/strstream $(BIN)/melting_range $(BIN)/strstreamone SelectiveGenomeAmplification select_mers.py score_mers.py $(DEST) diff --git a/SelectiveGenomeAmplification b/SelectiveGenomeAmplification index 1f495eb..4e14ecb 100755 --- a/SelectiveGenomeAmplification +++ b/SelectiveGenomeAmplification @@ -107,6 +107,8 @@ bg_counts=$counts_directory/$(basename $background)-counts fg_tmp=$tmp_directory/$(basename $foreground) bg_tmp=$tmp_directory/$(basename $background) +selected=$tmp_directory/$(basename $foreground)$(basename $background)-selected + # remove ignored mers if [ "$ignore_mers" ]; then echo "removing ignored mers: " + $ignore_mers @@ -118,16 +120,18 @@ fi echo "checking if mers are below melting temperature in the foreground" -rm $fg_counts-fg-non-melting -melting_range $min_melting_temp $max_melting_temp < $fg_counts > $fg_counts-fg-non-melting & +rm $fg_counts-non-melting +melting_range $min_melting_temp $max_melting_temp < $fg_counts > $fg_counts-non-melting echo "checking if mers are below melting temperature in the background" -rm $bg_counts-bg-non-melting -melting_range $min_melting_temp $max_melting_temp < $bg_counts > $bg_counts-bg-non-melting +rm $bg_counts-non-melting +melting_range $min_melting_temp $max_melting_temp < $bg_counts > $bg_counts-non-melting -# echo "scoring mer selectivity" -# python ./mer_selectivity.py $fg_counts-fg-non-melting $bg_counts-bg-non-melting +#echo "scoring mer selectivity" +# python ./mer_selectivity.py $fg_counts-non-melting $bg_counts-non-melting +mer_selectivity.py $fg_counts-non-melting $fg_tmp $bg_counts-non-melting $bg_tmp > $selected + -echo "" -echo "scoring mers" -select_mers.py $fg_counts-fg-non-melting $fg_tmp $bg_counts-bg-non-melting $bg_tmp $output_directory/output_`date +%s` +# echo "scoring top 100 mers based on selectivity" +# select_mers.py $selected $output_directory/output_`date +%s` +select_mers.py $selected $fg_tmp $bg_tmp $output_directory/output_`date +%s` diff --git a/score_mers.py b/score_mers.py new file mode 100755 index 0000000..3409fc0 --- /dev/null +++ b/score_mers.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python +import sys +import os + +from multiprocessing import Pool +from subprocess import * +import numpy as np +import pdb + +fg_mers = {} +bg_mers = {} + +if(len(sys.argv) == 4): + selectivity_fn = sys.argv[1] + fg_fasta_fn = sys.argv[2] + bg_fasta_fn = sys.argv[3] + output_file = sys.argv[4] +else: + print "please specify your inputs" + print "ex: select_mers.py fg_counts_file fg_fasta_file bg_counts_file bg_fasta_file output_file" + exit() + +# empty class to fill up mer information with +class Mer: + pass + +# import our variables +min_mer_range = int(os.environ.get("min_mer_range", 6)); +max_mer_range = int(os.environ.get("max_mer_range", 10)); +min_mer_count = int(os.environ.get("min_mer_count", 0)); +max_select = int(os.environ.get("max_select", 15)); +max_mer_distance = int(os.environ.get("max_mer_distance", 5000)); + + +def populate_locations(input_fn, mers, mer): + ''' Run the strstreamone command, and parse in the integers that are output + by the command, and add it to mers[mer].pts + ''' + + cmd = 'strstreamone ' + mer + " < " + input_fn + + strstream = Popen(cmd, stdout=PIPE, shell=True) + for line in strstream.stdout: + mers[mer].pts.append(int(line)) + + +def score_mers(selected): + from itertools import combinations + import time + + scores = [] + + p = Pool() + + fh = open(output_file, 'w'); + fh.write("scores:\n"); + for select_n in range(1, max_select+1): + print "scoring size ", select_n, + t = time.time() + scores_it = p.imap_unordered(score, combinations(selected, select_n)) + for score_res in scores_it: + fh.write(str(score_res) + "\n"); + print "size ", select_n, "took:", t - time.time() + return scores + + +def score(combination): +# input is a string of mers like +# ['ACCAA', 'ACCCGA', 'ACGTATA'] + + ret = [combination] + + for mer in combination: + for other_mer in combination: + if not mer == other_mer: + if mer in other_mer: + ret.append("duplicates") + return ret + + fg_pts = [] + fg_dist = [] + + bg_pts = [] + bg_dist = [] + + for mer in combination: + fg_pts = fg_pts + fg_mers[mer].pts + bg_pts = bg_pts + bg_mers[mer].pts + + fg_pts.sort() + bg_pts.sort() + + # remove any exact duplicates + # fg_pts = list(set(fg_pts)) + # bg_pts = list(set(bg_pts)) + + # distances + min_mer_distance = max(len(i) for i in combination) + fg_dist = np.array([abs(fg_pts[i] - fg_pts[i-1]) for i in range(1, len(fg_pts))]) + bg_dist = np.array([abs(bg_pts[i] - bg_pts[i-1]) for i in range(1, len(bg_pts))]) + + # return without calculating scores if any objects are higher than our max distance + if any(dist > max_mer_distance for dist in fg_dist): + ret.append("max") + ret.append(max(fg_dist)) + return ret + + # return without calculating scores if any mers are closer than the length of our longest mer in the combination + if any(dist < min_mer_distance for dist in fg_dist): + ret.append("min") + ret.append(min(fg_dist)) + return ret + + nb_primers = len(combination) + fg_mean_dist = np.mean(fg_dist) + fg_variance_dist = np.var(fg_dist) + bg_mean_dist = np.mean(bg_dist) + bg_variance_dist = np.var(bg_dist) + + # this is our equation + score = (nb_primers * fg_mean_dist * fg_variance_dist) / ((bg_mean_dist * bg_variance_dist) + .000001) + + ret.append(score) + ret.append(fg_mean_dist) + ret.append(fg_variance_dist) + ret.append(bg_mean_dist) + ret.append(bg_variance_dist) + + return ret + +def pop_fg(mer): + ''' helper for map function ''' + populate_locations(fg_fasta_fn, fg_mers, mer) + +def pop_bg(mer): + ''' helper for map function ''' + populate_locations(bg_fasta_fn, bg_mers, mer) + +def main(): + import time + selected = [] + selectivty_fh = open(selectivity_fn, "r") + + # get our genome length + fg_genome_length = os.path.getsize(fg_fasta_fn) + bg_genome_length = os.path.getsize(bg_fasta_fn) + + for row in selectivity_fn: + (mer, fg_count, bg_count, selectivity) = row.split() + fg_mers[mer] = Mer() + fg_mers[mer].count = fg_count + bg_mers[mer] = Mer() + bg_mers[mer].count = bg_count + selected.append([mer, selectivity]) + + # exhaustive = False + # + # if exhaustive: + # selected = fg_mers.keys() + # else: + # selected = select_mers(fg_mers, bg_mers, max_select) + selected = selected[-100:] + pdb.set_trace() + # print "searching through combinations of" + # print selected + + print "Populating foreground locations" + + + map(pop_fg, selected) + map(pop_bg, selected) + + scores = score_mers(selected) + + print "fg_genome_length", fg_genome_length + print "bg_genome_length", bg_genome_length + print "output_file:", output_file + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/select_mers.py b/select_mers.py index a09bd6c..a4657d4 100755 --- a/select_mers.py +++ b/select_mers.py @@ -2,151 +2,44 @@ import sys import os -from multiprocessing import Pool -from subprocess import * -import numpy as np -import pdb - fg_mers = {} bg_mers = {} -if(len(sys.argv) != 5): +min_mer_count = int(os.environ.get("min_mer_count", 0)); + +if(len(sys.argv) == 5): fg_count_fn = sys.argv[1] fg_fasta_fn = sys.argv[2] bg_count_fn = sys.argv[3] bg_fasta_fn = sys.argv[4] - output_file = sys.argv[5] + + fg_genome_length = os.path.getsize(fg_fasta_fn) + bg_genome_length = os.path.getsize(bg_fasta_fn) else: + print len(sys.argv) print "please specify your inputs" - print "ex: select_mers.py fg_counts_file fg_fasta_file bg_counts_file bg_fasta_file output_file" + print "ex: select_mers.py fg_counts fg_fasta bg_counts bg_fasta" exit() -# empty class to fill up mer information with -class Mer: - pass - -# import our variables -min_mer_range = int(os.environ.get("min_mer_range", 6)); -max_mer_range = int(os.environ.get("max_mer_range", 10)); -min_mer_count = int(os.environ.get("min_mer_count", 0)); -max_select = int(os.environ.get("max_select", 15)); -max_mer_distance = int(os.environ.get("max_mer_distance", 5000)); - - -def populate_locations(input_fn, mers, mer): - ''' Run the strstreamone command, and parse in the integers that are output - by the command, and add it to mers[mer].pts - ''' - - cmd = 'strstreamone ' + mer + " < " + input_fn - - strstream = Popen(cmd, stdout=PIPE, shell=True) - for line in strstream.stdout: - mers[mer].pts.append(int(line)) - - -def score_mers(selected): - from itertools import combinations - import time - - scores = [] - - p = Pool() - - fh = open(output_file, 'w'); - fh.write("scores:\n"); - for select_n in range(1, max_select+1): - print "scoring size ", select_n, - t = time.time() - scores_it = p.imap_unordered(score, combinations(selected, select_n)) - for score_res in scores_it: - fh.write(str(score_res) + "\n"); - print "size ", select_n, "took:", t - time.time() - return scores - - -def score(combination): -# input is a string of mers like -# ['ACCAA', 'ACCCGA', 'ACGTATA'] - - ret = [combination] - - for mer in combination: - for other_mer in combination: - if not mer == other_mer: - if mer in other_mer: - ret.append("duplicates") - return ret - - fg_pts = [] - fg_dist = [] - - bg_pts = [] - bg_dist = [] - - for mer in combination: - fg_pts = fg_pts + fg_mers[mer].pts - bg_pts = bg_pts + bg_mers[mer].pts - - fg_pts.sort() - bg_pts.sort() - - # remove any exact duplicates - # fg_pts = list(set(fg_pts)) - # bg_pts = list(set(bg_pts)) - - # distances - min_mer_distance = max(len(i) for i in combination) - fg_dist = np.array([abs(fg_pts[i] - fg_pts[i-1]) for i in range(1, len(fg_pts))]) - bg_dist = np.array([abs(bg_pts[i] - bg_pts[i-1]) for i in range(1, len(bg_pts))]) - - # return without calculating scores if any objects are higher than our max distance - if any(dist > max_mer_distance for dist in fg_dist): - ret.append("max") - ret.append(max(fg_dist)) - return ret - - # return without calculating scores if any mers are closer than the length of our longest mer in the combination - if any(dist < min_mer_distance for dist in fg_dist): - ret.append("min") - ret.append(min(fg_dist)) - return ret - - nb_primers = len(combination) - fg_mean_dist = np.mean(fg_dist) - fg_variance_dist = np.var(fg_dist) - bg_mean_dist = np.mean(bg_dist) - bg_variance_dist = np.var(bg_dist) - - # this is our equation - score = (nb_primers * fg_mean_dist * fg_variance_dist) / ((bg_mean_dist * bg_variance_dist) + .000001) - - ret.append(score) - ret.append(fg_mean_dist) - ret.append(fg_variance_dist) - ret.append(bg_mean_dist) - ret.append(bg_variance_dist) - - return ret - # select mers based on our 'selectivity' measure. (count in fg) / (count in bg) -def select_mers(fg_mers, bg_mers, select_nb): +def select_mers(fg_mers, bg_mers): + import numpy as np mers = [] # contains mer strings fg_arr = [] # contains fg counts bg_arr = [] # contains bg counts - + # populate our bg_arr and fg_arr as well as our mer arr. for mer in fg_mers.keys(): mers.append(mer); - bg_arr.append(bg_mers[mer].count); - fg_arr.append(fg_mers[mer].count); + bg_arr.append(bg_mers[mer] + 1); + fg_arr.append(fg_mers[mer]); fg_arr = np.array(fg_arr, dtype='f'); bg_arr = np.array(bg_arr, dtype='f'); - selectivity = fg_arr / bg_arr + selectivity = (fg_arr/fg_genome_length) / (bg_arr/bg_genome_length) arr = [(mers[i], fg_arr[i], bg_arr[i], selectivity[i]) for i in range(len(mers))] @@ -157,41 +50,26 @@ def select_mers(fg_mers, bg_mers, select_nb): arr = sorted(arr, key = lambda row: row[3]) # return only our mers, without our selectivity scores - arr = list(row[0] for row in arr) return arr -def pop_fg(mer): - ''' helper for map function ''' - populate_locations(fg_fasta_fn, fg_mers, mer) - -def pop_bg(mer): - ''' helper for map function ''' - populate_locations(bg_fasta_fn, bg_mers, mer) def main(): - import time + + fg_count_fh = open(fg_count_fn, "r") bg_count_fh = open(bg_count_fn, "r") - # get our genome length - fg_genome_length = os.path.getsize(fg_fasta_fn) - bg_genome_length = os.path.getsize(bg_fasta_fn) - # copy in our fg_mers and counts + print "populating our mers dictionary" for mers,fh in [(fg_mers, fg_count_fh), (bg_mers, bg_count_fh)]: - - t = time.time() for line in fh: (mer, count) = line.split() - mers[mer] = Mer() - mers[mer].count = int(count) - mers[mer].pts = [] - print time.time() - t + mers[mer] = int(count) if min_mer_count >= 1: print "removing that are less frequent than: ", min_mer_count for mer in fg_mers.keys(): - if(fg_mers[mer].count < min_mer_count): + if(fg_mers[mer] < min_mer_count): del fg_mers[mer] if mer in bg_mers: del bg_mers[mer] @@ -204,41 +82,13 @@ def main(): print "adding empty mers to the background" for mer in fg_mers: if mer not in bg_mers: - bg_mers[mer] = Mer() - bg_mers[mer].count = 2 - bg_mers[mer].pts = [0, bg_genome_length] - - - exhaustive = False - - if exhaustive: - selected = fg_mers.keys() - else: - selected = select_mers(fg_mers, bg_mers, max_select) - selected = selected[-100:] - print "searching through combinations of" - print selected - -# pdb.set_trace() - - # print "selected the top ", max_select, " mers" - # print "selected:", ", ".join(selected) - - print "Populating foreground locations" - - # p = Pool() - - map(pop_fg, selected) - map(pop_bg, selected) - - scores = score_mers(selected) - - print "fg_genome_length", fg_genome_length - print "bg_genome_length", bg_genome_length - print "output_file:", output_file - - + bg_mers[mer] = 0 + print fg_genome_length + print bg_genome_length + selected = select_mers(fg_mers, bg_mers) + for row in selected: + print row[0] +"\t"+str(row[1]) + "\t" + str(row[2]) + str(row[3]) if __name__ == "__main__": sys.exit(main()) -- cgit v1.2.3