1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
#!/usr/bin/env python
import sys
import os
fg_mers = {}
bg_mers = {}
min_mer_count = int(os.environ.get("min_mer_count", 0));
if(len(sys.argv) == 5):
fg_count_fn = sys.argv[1]
fg_fasta_fn = sys.argv[2]
bg_count_fn = sys.argv[3]
bg_fasta_fn = sys.argv[4]
fg_genome_length = os.path.getsize(fg_fasta_fn)
bg_genome_length = os.path.getsize(bg_fasta_fn)
else:
print len(sys.argv)
print "please specify your inputs"
print "ex: select_mers.py fg_counts fg_fasta bg_counts bg_fasta"
exit()
# select mers based on our 'selectivity' measure. (count in fg) / (count in bg)
def select_mers(fg_mers, bg_mers):
import numpy as np
mers = [] # contains mer strings
fg_arr = [] # contains fg counts
bg_arr = [] # contains bg counts
# populate our bg_arr and fg_arr as well as our mer arr.
for mer in fg_mers.keys():
mers.append(mer);
bg_arr.append(bg_mers.get(mer, 1));
fg_arr.append(fg_mers[mer]);
fg_arr = np.array(fg_arr, dtype='f');
bg_arr = np.array(bg_arr, dtype='f');
selectivity = (fg_arr / bg_arr)
arr = [(mers[i], fg_arr[i], bg_arr[i], selectivity[i]) for i in range(len(mers))]
# filter results less than 1 ( indicates that the bg is more present than the fg)
# arr = filter(lambda i: i[3] > 1, arr)
# sort by the selectivity
arr = sorted(arr, key = lambda row: row[3])
# return only our mers, without our selectivity scores
return arr
def main():
fg_count_fh = open(fg_count_fn, "r")
bg_count_fh = open(bg_count_fn, "r")
# copy in our fg_mers and counts
for mers,fh in [(fg_mers, fg_count_fh), (bg_mers, bg_count_fh)]:
for line in fh:
(mer, count) = line.split()
mers[mer] = int(count)
if min_mer_count >= 1:
for mer in fg_mers.keys():
if(fg_mers[mer] < min_mer_count):
del fg_mers[mer]
if mer in bg_mers:
del bg_mers[mer]
for mer in bg_mers.keys():
if mer not in fg_mers:
del bg_mers[mer]
selected = select_mers(fg_mers, bg_mers)
for row in selected:
print row[0] +"\t"+str("%d" % row[1]) + "\t" + str("%d" % row[2]) + "\t" + str("%.5f" % row[3])
if __name__ == "__main__":
sys.exit(main())
|