aboutsummaryrefslogtreecommitdiff
path: root/select_mers.py
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-01-17 15:03:07 -0500
committerCalvin Morrison <mutantturkey@gmail.com>2014-01-17 15:03:07 -0500
commit59d6b220c2a5dd7bae81404f1cf4a5e6aa0089c3 (patch)
treef05e5c0fb35799e00114f8ef5f940bd0f59d2e5d /select_mers.py
parent1db2c90027d7273fbfb19d640f22ef01bdee37c3 (diff)
major select_mers.py changes
Diffstat (limited to 'select_mers.py')
-rw-r--r--select_mers.py204
1 files changed, 114 insertions, 90 deletions
diff --git a/select_mers.py b/select_mers.py
index 9dea7fe..380e4fd 100644
--- a/select_mers.py
+++ b/select_mers.py
@@ -11,6 +11,13 @@ import pdb
fg_mers = {}
bg_mers = {}
+
+fg_count_fn = sys.argv[1]
+fg_fasta_fn = sys.argv[2]
+
+bg_count_fn = sys.argv[3]
+bg_fasta_fn = sys.argv[4]
+
# empty class to fill up mer information with
class Mer:
pass
@@ -23,108 +30,132 @@ min_mer_range = int(os.getenv("min_mer_range"));
max_mer_range = int(os.getenv("max_mer_range"));
min_mer_count = int(os.getenv("min_mer_count"));
max_select = int(os.getenv("max_select"));
+max_mer_distance = int(os.getenv("max_mer_distance"));
+
+def populate_locations(input_fn, mers, mer):
-def populate_locations(input_fn, mers, selected):
- mer_str = ' '.join(selected)
- cmd = './strstream ' + mer_str + " < " + input_fn
+
+ cmd = 'strstreamone ' + mer + " < " + input_fn
strstream = Popen(cmd, stdout=PIPE, shell=True)
for line in strstream.stdout:
- (index, pos) = line.split(' ')
- mers[selected[int(index)]].pts.append(int(pos))
+ mers[mer].pts.append(int(line))
+
return None
-def select_mers(selected):
+def score_mers(selected):
from itertools import combinations
+ import time
scores = []
p = Pool()
- for select_n in range(1, max_select):
+ for select_n in range(1, max_select+1):
+ t = time.time()
print "scoring size ", select_n
scores_it = []
- scores_it = p.map(score, combinations(selected, select_n))
+ scores_it = p.imap_unordered(score, combinations(selected, select_n))
+ scores_it = filter(lambda x: x is not None, scores_it)
scores = scores + scores_it
+ print time.time() - t
+
- p.close()
- scores = sorted(scores, key=lambda score: score.score)
return scores
def score(combination):
-
- # print "scoring", combination
+# input is a string of mers like
+# ['ACCAA', 'ACCCGA', 'ACGTATA']
- fg_pts_arr = []
- fg_dist_arr = []
- bg_pts_arr = []
- bg_dist_arr = []
+ for mer in combination:
+ for other_mer in combination:
+ if not mer == other_mer:
+ if mer in other_mer:
+ return None
+
+
+ fg_pts = []
+ fg_dist = []
+
+ bg_pts = []
+ bg_dist = []
for mer in combination:
- fg_pts_arr = fg_pts_arr + fg_mers[mer].pts
- bg_pts_arr = bg_pts_arr + bg_mers[mer].pts
+ fg_pts = fg_pts + fg_mers[mer].pts
+ bg_pts = bg_pts + bg_mers[mer].pts
- fg_pts_arr.sort()
- bg_pts_arr.sort()
+ fg_pts.sort()
+ bg_pts.sort()
# remove any exact duplicates
- fg_pts_arr = list(set(fg_pts_arr))
- bg_pts_arr = list(set(bg_pts_arr))
+ # fg_pts = list(set(fg_pts))
+ # bg_pts = list(set(bg_pts))
# distances
- for i in range(1, len(fg_pts_arr)):
- fg_dist_arr.append(abs(fg_pts_arr[i-1] - fg_pts_arr[i]));
-
- for i in range(1, len(bg_pts_arr)):
- bg_dist_arr.append(abs(bg_pts_arr[i-1] - bg_pts_arr[i]));
+ fg_dist = np.array([abs(fg_pts[i] - fg_pts[i-1]) for i in range(1, len(fg_pts))])
+ bg_dist = np.array([abs(bg_pts[i] - bg_pts[i-1]) for i in range(1, len(bg_pts))])
- fg_dist_arr = np.array(fg_dist_arr)
- bg_dist_arr = np.array(bg_dist_arr)
+ if any(dist > max_mer_distance for dist in fg_dist):
+ return None
nb_primers = len(combination)
- fg_mean_dist = np.mean(fg_dist_arr)
- fg_variance_dist = np.var(fg_dist_arr)
- bg_mean_dist = np.mean(bg_dist_arr)
- bg_variance_dist = np.var(bg_dist_arr)
+ fg_mean_dist = np.mean(fg_dist)
+ fg_variance_dist = np.var(fg_dist)
+ bg_mean_dist = np.mean(bg_dist)
+ bg_variance_dist = np.var(bg_dist)
# this is our equation
-
- score = Score()
- score.mers = combination
- score.score = (nb_primers * (fg_mean_dist * fg_variance_dist)) / (bg_mean_dist * bg_variance_dist)
- # score.bg_dist_arr = bg_dist_arr
- # score.fg_dist_arr = fg_dist_arr
- return score
+ score = (nb_primers * fg_mean_dist * fg_variance_dist) / ((bg_mean_dist * bg_variance_dist) + .000001)
+
+ return (combination, score)
-def most_frequent(mers, select_nb):
- print "selecting the top ", select_nb, " mers"
- selected = []
- for mer in mers.keys():
- selected.append([mer, mers[mer].count])
-
- selected = sorted(selected, key=lambda mer: mer[1])
- selected = selected[-select_nb:]
- selected = list(row[0] for row in selected)
+# select mers based on our 'selectivity' measure. (count in fg) / (count in bg)
+def select_mers(fg_mers, bg_mers, select_nb):
- return selected
+ mers = [] # contains mer strings
+ fg_arr = [] # contains fg counts
+ bg_arr = [] # contains bg counts
+ # populate our bg_arr and fg_arr as well as our mer arr.
+ for mer in fg_mers.keys():
+ mers.append(mer);
+ bg_arr.append(bg_mers[mer].count);
+ fg_arr.append(fg_mers[mer].count);
-def main():
- fg_count_fn = sys.argv[1]
- fg_fasta_fn = sys.argv[2]
+ fg_arr = np.array(fg_arr, dtype='f');
+ bg_arr = np.array(bg_arr, dtype='f');
- bg_count_fn = sys.argv[3]
- bg_fasta_fn = sys.argv[4]
+ selectivity = fg_arr - bg_arr
+ arr = [(mers[i], fg_arr[i], bg_arr[i], selectivity[i]) for i in range(len(mers))]
+
+ # filter results less than 1 ( indicates that the bg is more present than the fg)
+ # arr = filter(lambda i: i[3] > 1, arr)
+
+ # sort by the selectivity
+ arr = sorted(arr, key = lambda row: row[3])
+
+ # return only our mers, without our selectivity scores
+ arr = list(row[0] for row in arr)
+ return arr
+
+def pop_fg(mer):
+ populate_locations(fg_fasta_fn, fg_mers, mer)
+
+def pop_bg(mer):
+ populate_locations(bg_fasta_fn, bg_mers, mer)
+
+def main():
+ import time
fg_count_fh = open(fg_count_fn, "r")
bg_count_fh = open(bg_count_fn, "r")
@@ -135,28 +166,16 @@ def main():
# copy in our fg_mers and counts
for mers,fh in [(fg_mers, fg_count_fh), (bg_mers, bg_count_fh)]:
+ t = time.time()
for line in fh:
(mer, count) = line.split()
- count = int(count)
mers[mer] = Mer()
- mers[mer].count = count
+ mers[mer].count = int(count)
mers[mer].pts = []
+ print time.time() - t
- # if min_mer_count is less than one, then we are looking at a bottom cutoff threshold
- if(min_mer_count < 1 and min_mer_count > 0):
- pass
- # counts = []
- # for mer in fg_mers.keys():
- # counts.append(fg_mers[mer]['count'])
- #
- # counts = counts.sort()
- #
- # count_set = set(counts):
- # len(count_set)
- # count_set(
- # if it is 1 or great then use it as a numerical cutoff
- print "removing mers that don't meet our minimum count"
if min_mer_count >= 1:
+ print "removing that are less frequent than: ", min_mer_count
for mer in fg_mers.keys():
if(fg_mers[mer].count < min_mer_count):
del fg_mers[mer]
@@ -176,38 +195,43 @@ def main():
bg_mers[mer].pts = [0, bg_genome_length]
- exhaustive = True
+ exhaustive = False
if exhaustive:
selected = fg_mers.keys()
- if not exhaustive:
- selected = most_frequent(fg_mers, max_select)
+ else:
+ selected = select_mers(fg_mers, bg_mers, max_select)
+ selected = selected[-50:]
+ print selected
- print "selected the top ", max_select, " mers"
- print "selected:", ", ".join(selected)
+ pdb.set_trace()
+
+ # print "selected the top ", max_select, " mers"
+ # print "selected:", ", ".join(selected)
print "Populating foreground locations"
- populate_locations(fg_fasta_fn, fg_mers, selected)
- print "Populating background locations"
- populate_locations(bg_fasta_fn, bg_mers, selected)
- scores = select_mers(selected)
+ # p = Pool()
+
+ map(pop_fg, selected)
+ map(pop_bg, selected)
+ pdb.set_trace()
+
+ scores = score_mers(selected)
+ pdb.set_trace()
print "fg_genome_length", fg_genome_length
print "bg_genome_length", bg_genome_length
+ sys.stdout.write("scores:\n");
for score in scores:
- sys.stdout.write(str(score.score) + "\t")
- sys.stdout.write(", ".join(list(score.mers)) + "\t")
- # sys.stdout.write("\t".join(score.dist_arr));
- sys.stdout.write("\n");
-
- #matplotlib.pyplot.hist(score.fg_dist_arr)
- #matplotlib.pyplot.savefig('graph/' + str(x) + ".png")
- #matplotlib.pyplot.clf()
- import pdb
- pdb.set_trace()
+ if score is not "Distance":
+ sys.stdout.write(", ".join(list(score[0])) + "\t")
+ sys.stdout.write(str(score[1]) + "\t")
+ # sys.stdsys.stdout.write("\t".join(score.dist_arr));
+ sys.stdout.write("\n");
+
if __name__ == "__main__":
sys.exit(main())