aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-01-21 15:42:53 -0500
committerCalvin Morrison <mutantturkey@gmail.com>2014-01-21 15:42:53 -0500
commit42f2b64fffdbc16912c17c599e0f4fb417e4dcbe (patch)
tree74e3a1abec5718912986ffb05054220508673b16
parent24d17ba72d27162653f4be6ddc47e64f99e35067 (diff)
split out score and selection
-rw-r--r--Makefile2
-rwxr-xr-xSelectiveGenomeAmplification22
-rwxr-xr-xscore_mers.py181
-rwxr-xr-xselect_mers.py200
4 files changed, 220 insertions, 185 deletions
diff --git a/Makefile b/Makefile
index 09a157c..8837865 100644
--- a/Makefile
+++ b/Makefile
@@ -20,5 +20,5 @@ clean:
rm -vf $(BIN)/strstream $(BIN)/melting_range $(BIN)/strstreamone
install: all
- install -c $(BIN)/strstream $(BIN)/melting_range $(BIN)/strstreamone SelectiveGenomeAmplification select_mers.py $(DEST)
+ install -c $(BIN)/strstream $(BIN)/melting_range $(BIN)/strstreamone SelectiveGenomeAmplification select_mers.py score_mers.py $(DEST)
diff --git a/SelectiveGenomeAmplification b/SelectiveGenomeAmplification
index 1f495eb..4e14ecb 100755
--- a/SelectiveGenomeAmplification
+++ b/SelectiveGenomeAmplification
@@ -107,6 +107,8 @@ bg_counts=$counts_directory/$(basename $background)-counts
fg_tmp=$tmp_directory/$(basename $foreground)
bg_tmp=$tmp_directory/$(basename $background)
+selected=$tmp_directory/$(basename $foreground)$(basename $background)-selected
+
# remove ignored mers
if [ "$ignore_mers" ]; then
echo "removing ignored mers: " + $ignore_mers
@@ -118,16 +120,18 @@ fi
echo "checking if mers are below melting temperature in the foreground"
-rm $fg_counts-fg-non-melting
-melting_range $min_melting_temp $max_melting_temp < $fg_counts > $fg_counts-fg-non-melting &
+rm $fg_counts-non-melting
+melting_range $min_melting_temp $max_melting_temp < $fg_counts > $fg_counts-non-melting
echo "checking if mers are below melting temperature in the background"
-rm $bg_counts-bg-non-melting
-melting_range $min_melting_temp $max_melting_temp < $bg_counts > $bg_counts-bg-non-melting
+rm $bg_counts-non-melting
+melting_range $min_melting_temp $max_melting_temp < $bg_counts > $bg_counts-non-melting
-# echo "scoring mer selectivity"
-# python ./mer_selectivity.py $fg_counts-fg-non-melting $bg_counts-bg-non-melting
+#echo "scoring mer selectivity"
+# python ./mer_selectivity.py $fg_counts-non-melting $bg_counts-non-melting
+mer_selectivity.py $fg_counts-non-melting $fg_tmp $bg_counts-non-melting $bg_tmp > $selected
+
-echo ""
-echo "scoring mers"
-select_mers.py $fg_counts-fg-non-melting $fg_tmp $bg_counts-bg-non-melting $bg_tmp $output_directory/output_`date +%s`
+# echo "scoring top 100 mers based on selectivity"
+# select_mers.py $selected $output_directory/output_`date +%s`
+select_mers.py $selected $fg_tmp $bg_tmp $output_directory/output_`date +%s`
diff --git a/score_mers.py b/score_mers.py
new file mode 100755
index 0000000..3409fc0
--- /dev/null
+++ b/score_mers.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python
+import sys
+import os
+
+from multiprocessing import Pool
+from subprocess import *
+import numpy as np
+import pdb
+
+fg_mers = {}
+bg_mers = {}
+
+if(len(sys.argv) == 4):
+ selectivity_fn = sys.argv[1]
+ fg_fasta_fn = sys.argv[2]
+ bg_fasta_fn = sys.argv[3]
+ output_file = sys.argv[4]
+else:
+ print "please specify your inputs"
+ print "ex: select_mers.py fg_counts_file fg_fasta_file bg_counts_file bg_fasta_file output_file"
+ exit()
+
+# empty class to fill up mer information with
+class Mer:
+ pass
+
+# import our variables
+min_mer_range = int(os.environ.get("min_mer_range", 6));
+max_mer_range = int(os.environ.get("max_mer_range", 10));
+min_mer_count = int(os.environ.get("min_mer_count", 0));
+max_select = int(os.environ.get("max_select", 15));
+max_mer_distance = int(os.environ.get("max_mer_distance", 5000));
+
+
+def populate_locations(input_fn, mers, mer):
+ ''' Run the strstreamone command, and parse in the integers that are output
+ by the command, and add it to mers[mer].pts
+ '''
+
+ cmd = 'strstreamone ' + mer + " < " + input_fn
+
+ strstream = Popen(cmd, stdout=PIPE, shell=True)
+ for line in strstream.stdout:
+ mers[mer].pts.append(int(line))
+
+
+def score_mers(selected):
+ from itertools import combinations
+ import time
+
+ scores = []
+
+ p = Pool()
+
+ fh = open(output_file, 'w');
+ fh.write("scores:\n");
+ for select_n in range(1, max_select+1):
+ print "scoring size ", select_n,
+ t = time.time()
+ scores_it = p.imap_unordered(score, combinations(selected, select_n))
+ for score_res in scores_it:
+ fh.write(str(score_res) + "\n");
+ print "size ", select_n, "took:", t - time.time()
+ return scores
+
+
+def score(combination):
+# input is a string of mers like
+# ['ACCAA', 'ACCCGA', 'ACGTATA']
+
+ ret = [combination]
+
+ for mer in combination:
+ for other_mer in combination:
+ if not mer == other_mer:
+ if mer in other_mer:
+ ret.append("duplicates")
+ return ret
+
+ fg_pts = []
+ fg_dist = []
+
+ bg_pts = []
+ bg_dist = []
+
+ for mer in combination:
+ fg_pts = fg_pts + fg_mers[mer].pts
+ bg_pts = bg_pts + bg_mers[mer].pts
+
+ fg_pts.sort()
+ bg_pts.sort()
+
+ # remove any exact duplicates
+ # fg_pts = list(set(fg_pts))
+ # bg_pts = list(set(bg_pts))
+
+ # distances
+ min_mer_distance = max(len(i) for i in combination)
+ fg_dist = np.array([abs(fg_pts[i] - fg_pts[i-1]) for i in range(1, len(fg_pts))])
+ bg_dist = np.array([abs(bg_pts[i] - bg_pts[i-1]) for i in range(1, len(bg_pts))])
+
+ # return without calculating scores if any objects are higher than our max distance
+ if any(dist > max_mer_distance for dist in fg_dist):
+ ret.append("max")
+ ret.append(max(fg_dist))
+ return ret
+
+ # return without calculating scores if any mers are closer than the length of our longest mer in the combination
+ if any(dist < min_mer_distance for dist in fg_dist):
+ ret.append("min")
+ ret.append(min(fg_dist))
+ return ret
+
+ nb_primers = len(combination)
+ fg_mean_dist = np.mean(fg_dist)
+ fg_variance_dist = np.var(fg_dist)
+ bg_mean_dist = np.mean(bg_dist)
+ bg_variance_dist = np.var(bg_dist)
+
+ # this is our equation
+ score = (nb_primers * fg_mean_dist * fg_variance_dist) / ((bg_mean_dist * bg_variance_dist) + .000001)
+
+ ret.append(score)
+ ret.append(fg_mean_dist)
+ ret.append(fg_variance_dist)
+ ret.append(bg_mean_dist)
+ ret.append(bg_variance_dist)
+
+ return ret
+
+def pop_fg(mer):
+ ''' helper for map function '''
+ populate_locations(fg_fasta_fn, fg_mers, mer)
+
+def pop_bg(mer):
+ ''' helper for map function '''
+ populate_locations(bg_fasta_fn, bg_mers, mer)
+
+def main():
+ import time
+ selected = []
+ selectivty_fh = open(selectivity_fn, "r")
+
+ # get our genome length
+ fg_genome_length = os.path.getsize(fg_fasta_fn)
+ bg_genome_length = os.path.getsize(bg_fasta_fn)
+
+ for row in selectivity_fn:
+ (mer, fg_count, bg_count, selectivity) = row.split()
+ fg_mers[mer] = Mer()
+ fg_mers[mer].count = fg_count
+ bg_mers[mer] = Mer()
+ bg_mers[mer].count = bg_count
+ selected.append([mer, selectivity])
+
+ # exhaustive = False
+ #
+ # if exhaustive:
+ # selected = fg_mers.keys()
+ # else:
+ # selected = select_mers(fg_mers, bg_mers, max_select)
+ selected = selected[-100:]
+ pdb.set_trace()
+ # print "searching through combinations of"
+ # print selected
+
+ print "Populating foreground locations"
+
+
+ map(pop_fg, selected)
+ map(pop_bg, selected)
+
+ scores = score_mers(selected)
+
+ print "fg_genome_length", fg_genome_length
+ print "bg_genome_length", bg_genome_length
+ print "output_file:", output_file
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/select_mers.py b/select_mers.py
index a09bd6c..a4657d4 100755
--- a/select_mers.py
+++ b/select_mers.py
@@ -2,151 +2,44 @@
import sys
import os
-from multiprocessing import Pool
-from subprocess import *
-import numpy as np
-import pdb
-
fg_mers = {}
bg_mers = {}
-if(len(sys.argv) != 5):
+min_mer_count = int(os.environ.get("min_mer_count", 0));
+
+if(len(sys.argv) == 5):
fg_count_fn = sys.argv[1]
fg_fasta_fn = sys.argv[2]
bg_count_fn = sys.argv[3]
bg_fasta_fn = sys.argv[4]
- output_file = sys.argv[5]
+
+ fg_genome_length = os.path.getsize(fg_fasta_fn)
+ bg_genome_length = os.path.getsize(bg_fasta_fn)
else:
+ print len(sys.argv)
print "please specify your inputs"
- print "ex: select_mers.py fg_counts_file fg_fasta_file bg_counts_file bg_fasta_file output_file"
+ print "ex: select_mers.py fg_counts fg_fasta bg_counts bg_fasta"
exit()
-# empty class to fill up mer information with
-class Mer:
- pass
-
-# import our variables
-min_mer_range = int(os.environ.get("min_mer_range", 6));
-max_mer_range = int(os.environ.get("max_mer_range", 10));
-min_mer_count = int(os.environ.get("min_mer_count", 0));
-max_select = int(os.environ.get("max_select", 15));
-max_mer_distance = int(os.environ.get("max_mer_distance", 5000));
-
-
-def populate_locations(input_fn, mers, mer):
- ''' Run the strstreamone command, and parse in the integers that are output
- by the command, and add it to mers[mer].pts
- '''
-
- cmd = 'strstreamone ' + mer + " < " + input_fn
-
- strstream = Popen(cmd, stdout=PIPE, shell=True)
- for line in strstream.stdout:
- mers[mer].pts.append(int(line))
-
-
-def score_mers(selected):
- from itertools import combinations
- import time
-
- scores = []
-
- p = Pool()
-
- fh = open(output_file, 'w');
- fh.write("scores:\n");
- for select_n in range(1, max_select+1):
- print "scoring size ", select_n,
- t = time.time()
- scores_it = p.imap_unordered(score, combinations(selected, select_n))
- for score_res in scores_it:
- fh.write(str(score_res) + "\n");
- print "size ", select_n, "took:", t - time.time()
- return scores
-
-
-def score(combination):
-# input is a string of mers like
-# ['ACCAA', 'ACCCGA', 'ACGTATA']
-
- ret = [combination]
-
- for mer in combination:
- for other_mer in combination:
- if not mer == other_mer:
- if mer in other_mer:
- ret.append("duplicates")
- return ret
-
- fg_pts = []
- fg_dist = []
-
- bg_pts = []
- bg_dist = []
-
- for mer in combination:
- fg_pts = fg_pts + fg_mers[mer].pts
- bg_pts = bg_pts + bg_mers[mer].pts
-
- fg_pts.sort()
- bg_pts.sort()
-
- # remove any exact duplicates
- # fg_pts = list(set(fg_pts))
- # bg_pts = list(set(bg_pts))
-
- # distances
- min_mer_distance = max(len(i) for i in combination)
- fg_dist = np.array([abs(fg_pts[i] - fg_pts[i-1]) for i in range(1, len(fg_pts))])
- bg_dist = np.array([abs(bg_pts[i] - bg_pts[i-1]) for i in range(1, len(bg_pts))])
-
- # return without calculating scores if any objects are higher than our max distance
- if any(dist > max_mer_distance for dist in fg_dist):
- ret.append("max")
- ret.append(max(fg_dist))
- return ret
-
- # return without calculating scores if any mers are closer than the length of our longest mer in the combination
- if any(dist < min_mer_distance for dist in fg_dist):
- ret.append("min")
- ret.append(min(fg_dist))
- return ret
-
- nb_primers = len(combination)
- fg_mean_dist = np.mean(fg_dist)
- fg_variance_dist = np.var(fg_dist)
- bg_mean_dist = np.mean(bg_dist)
- bg_variance_dist = np.var(bg_dist)
-
- # this is our equation
- score = (nb_primers * fg_mean_dist * fg_variance_dist) / ((bg_mean_dist * bg_variance_dist) + .000001)
-
- ret.append(score)
- ret.append(fg_mean_dist)
- ret.append(fg_variance_dist)
- ret.append(bg_mean_dist)
- ret.append(bg_variance_dist)
-
- return ret
-
# select mers based on our 'selectivity' measure. (count in fg) / (count in bg)
-def select_mers(fg_mers, bg_mers, select_nb):
+def select_mers(fg_mers, bg_mers):
+ import numpy as np
mers = [] # contains mer strings
fg_arr = [] # contains fg counts
bg_arr = [] # contains bg counts
-
+
# populate our bg_arr and fg_arr as well as our mer arr.
for mer in fg_mers.keys():
mers.append(mer);
- bg_arr.append(bg_mers[mer].count);
- fg_arr.append(fg_mers[mer].count);
+ bg_arr.append(bg_mers[mer] + 1);
+ fg_arr.append(fg_mers[mer]);
fg_arr = np.array(fg_arr, dtype='f');
bg_arr = np.array(bg_arr, dtype='f');
- selectivity = fg_arr / bg_arr
+ selectivity = (fg_arr/fg_genome_length) / (bg_arr/bg_genome_length)
arr = [(mers[i], fg_arr[i], bg_arr[i], selectivity[i]) for i in range(len(mers))]
@@ -157,41 +50,26 @@ def select_mers(fg_mers, bg_mers, select_nb):
arr = sorted(arr, key = lambda row: row[3])
# return only our mers, without our selectivity scores
- arr = list(row[0] for row in arr)
return arr
-def pop_fg(mer):
- ''' helper for map function '''
- populate_locations(fg_fasta_fn, fg_mers, mer)
-
-def pop_bg(mer):
- ''' helper for map function '''
- populate_locations(bg_fasta_fn, bg_mers, mer)
def main():
- import time
+
+
fg_count_fh = open(fg_count_fn, "r")
bg_count_fh = open(bg_count_fn, "r")
- # get our genome length
- fg_genome_length = os.path.getsize(fg_fasta_fn)
- bg_genome_length = os.path.getsize(bg_fasta_fn)
-
# copy in our fg_mers and counts
+ print "populating our mers dictionary"
for mers,fh in [(fg_mers, fg_count_fh), (bg_mers, bg_count_fh)]:
-
- t = time.time()
for line in fh:
(mer, count) = line.split()
- mers[mer] = Mer()
- mers[mer].count = int(count)
- mers[mer].pts = []
- print time.time() - t
+ mers[mer] = int(count)
if min_mer_count >= 1:
print "removing that are less frequent than: ", min_mer_count
for mer in fg_mers.keys():
- if(fg_mers[mer].count < min_mer_count):
+ if(fg_mers[mer] < min_mer_count):
del fg_mers[mer]
if mer in bg_mers:
del bg_mers[mer]
@@ -204,41 +82,13 @@ def main():
print "adding empty mers to the background"
for mer in fg_mers:
if mer not in bg_mers:
- bg_mers[mer] = Mer()
- bg_mers[mer].count = 2
- bg_mers[mer].pts = [0, bg_genome_length]
-
-
- exhaustive = False
-
- if exhaustive:
- selected = fg_mers.keys()
- else:
- selected = select_mers(fg_mers, bg_mers, max_select)
- selected = selected[-100:]
- print "searching through combinations of"
- print selected
-
-# pdb.set_trace()
-
- # print "selected the top ", max_select, " mers"
- # print "selected:", ", ".join(selected)
-
- print "Populating foreground locations"
-
- # p = Pool()
-
- map(pop_fg, selected)
- map(pop_bg, selected)
-
- scores = score_mers(selected)
-
- print "fg_genome_length", fg_genome_length
- print "bg_genome_length", bg_genome_length
- print "output_file:", output_file
-
-
+ bg_mers[mer] = 0
+ print fg_genome_length
+ print bg_genome_length
+ selected = select_mers(fg_mers, bg_mers)
+ for row in selected:
+ print row[0] +"\t"+str(row[1]) + "\t" + str(row[2]) + str(row[3])
if __name__ == "__main__":
sys.exit(main())