1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
|
import sys
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot
import os
from multiprocessing import Pool
from subprocess import *
import numpy as np
import pdb
fg_mers = {}
bg_mers = {}
fg_count_fn = sys.argv[1]
fg_fasta_fn = sys.argv[2]
bg_count_fn = sys.argv[3]
bg_fasta_fn = sys.argv[4]
# empty class to fill up mer information with
class Mer:
pass
class Score:
pass
# import our variables
min_mer_range = int(os.getenv("min_mer_range"));
max_mer_range = int(os.getenv("max_mer_range"));
min_mer_count = int(os.getenv("min_mer_count"));
max_select = int(os.getenv("max_select"));
max_mer_distance = int(os.getenv("max_mer_distance"));
def populate_locations(input_fn, mers, mer):
cmd = 'strstreamone ' + mer + " < " + input_fn
strstream = Popen(cmd, stdout=PIPE, shell=True)
for line in strstream.stdout:
mers[mer].pts.append(int(line))
return None
def score_mers(selected):
from itertools import combinations
import time
scores = []
p = Pool()
for select_n in range(1, max_select+1):
t = time.time()
print "scoring size ", select_n
scores_it = []
scores_it = p.imap_unordered(score, combinations(selected, select_n))
scores_it = filter(lambda x: x is not None, scores_it)
scores = scores + scores_it
print time.time() - t
return scores
def score(combination):
# input is a string of mers like
# ['ACCAA', 'ACCCGA', 'ACGTATA']
for mer in combination:
for other_mer in combination:
if not mer == other_mer:
if mer in other_mer:
return None
fg_pts = []
fg_dist = []
bg_pts = []
bg_dist = []
for mer in combination:
fg_pts = fg_pts + fg_mers[mer].pts
bg_pts = bg_pts + bg_mers[mer].pts
fg_pts.sort()
bg_pts.sort()
# remove any exact duplicates
# fg_pts = list(set(fg_pts))
# bg_pts = list(set(bg_pts))
# distances
fg_dist = np.array([abs(fg_pts[i] - fg_pts[i-1]) for i in range(1, len(fg_pts))])
bg_dist = np.array([abs(bg_pts[i] - bg_pts[i-1]) for i in range(1, len(bg_pts))])
if any(dist > max_mer_distance for dist in fg_dist):
return None
nb_primers = len(combination)
fg_mean_dist = np.mean(fg_dist)
fg_variance_dist = np.var(fg_dist)
bg_mean_dist = np.mean(bg_dist)
bg_variance_dist = np.var(bg_dist)
# this is our equation
score = (nb_primers * fg_mean_dist * fg_variance_dist) / ((bg_mean_dist * bg_variance_dist) + .000001)
return (combination, score)
# select mers based on our 'selectivity' measure. (count in fg) / (count in bg)
def select_mers(fg_mers, bg_mers, select_nb):
mers = [] # contains mer strings
fg_arr = [] # contains fg counts
bg_arr = [] # contains bg counts
# populate our bg_arr and fg_arr as well as our mer arr.
for mer in fg_mers.keys():
mers.append(mer);
bg_arr.append(bg_mers[mer].count);
fg_arr.append(fg_mers[mer].count);
fg_arr = np.array(fg_arr, dtype='f');
bg_arr = np.array(bg_arr, dtype='f');
selectivity = fg_arr - bg_arr
arr = [(mers[i], fg_arr[i], bg_arr[i], selectivity[i]) for i in range(len(mers))]
# filter results less than 1 ( indicates that the bg is more present than the fg)
# arr = filter(lambda i: i[3] > 1, arr)
# sort by the selectivity
arr = sorted(arr, key = lambda row: row[3])
# return only our mers, without our selectivity scores
arr = list(row[0] for row in arr)
return arr
def pop_fg(mer):
populate_locations(fg_fasta_fn, fg_mers, mer)
def pop_bg(mer):
populate_locations(bg_fasta_fn, bg_mers, mer)
def main():
import time
fg_count_fh = open(fg_count_fn, "r")
bg_count_fh = open(bg_count_fn, "r")
# get our genome length
fg_genome_length = os.path.getsize(fg_fasta_fn)
bg_genome_length = os.path.getsize(bg_fasta_fn)
# copy in our fg_mers and counts
for mers,fh in [(fg_mers, fg_count_fh), (bg_mers, bg_count_fh)]:
t = time.time()
for line in fh:
(mer, count) = line.split()
mers[mer] = Mer()
mers[mer].count = int(count)
mers[mer].pts = []
print time.time() - t
if min_mer_count >= 1:
print "removing that are less frequent than: ", min_mer_count
for mer in fg_mers.keys():
if(fg_mers[mer].count < min_mer_count):
del fg_mers[mer]
if mer in bg_mers:
del bg_mers[mer]
print "removing useless mers from the background"
for mer in bg_mers.keys():
if mer not in fg_mers:
del bg_mers[mer]
print "adding empty mers to the background"
for mer in fg_mers:
if mer not in bg_mers:
bg_mers[mer] = Mer()
bg_mers[mer].count = 2
bg_mers[mer].pts = [0, bg_genome_length]
exhaustive = False
if exhaustive:
selected = fg_mers.keys()
else:
selected = select_mers(fg_mers, bg_mers, max_select)
selected = selected[-50:]
print selected
pdb.set_trace()
# print "selected the top ", max_select, " mers"
# print "selected:", ", ".join(selected)
print "Populating foreground locations"
# p = Pool()
map(pop_fg, selected)
map(pop_bg, selected)
pdb.set_trace()
scores = score_mers(selected)
pdb.set_trace()
print "fg_genome_length", fg_genome_length
print "bg_genome_length", bg_genome_length
sys.stdout.write("scores:\n");
for score in scores:
if score is not "Distance":
sys.stdout.write(", ".join(list(score[0])) + "\t")
sys.stdout.write(str(score[1]) + "\t")
# sys.stdsys.stdout.write("\t".join(score.dist_arr));
sys.stdout.write("\n");
if __name__ == "__main__":
sys.exit(main())
|