aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-03-24 18:59:10 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2014-03-24 18:59:10 -0400
commitac8f5cc40e9db6274ec09ffadfc6369e06097b69 (patch)
tree16eb2a807615d12b7168ce86a2a914a4ddaff299 /src
parent65e9bc6772f4538078d0eb1d78ff7b08df6d9c4e (diff)
Major Updates:
SelectiveGenomeAmplication: remove need (again) for any temporary fasta file, instead do all parsing in house with strstreamone and python correctly get reverse compliment. a combination of tac and rev, not just tac add non_melitng and filtered_binding variables to bash for clarity and to reduce clutter score_mers: This should speed up our loading and memory and probably each job since there's less memory overhead. revamp mer points - remove class, save as arrays instead - only allocate mers we need, - don't bother getting counts - reduce command line arguments - use original fasta files now, so it's easier for a user to use (as per issue #5 - remove weird pop_fg/bg functions - add one to total incase of zero div.
Diffstat (limited to 'src')
-rwxr-xr-xsrc/score_mers.py96
1 files changed, 50 insertions, 46 deletions
diff --git a/src/score_mers.py b/src/score_mers.py
index 5c1a194..ba63f71 100755
--- a/src/score_mers.py
+++ b/src/score_mers.py
@@ -17,7 +17,7 @@ bg_mers = {}
seq_ends = []
-if(len(sys.argv) == 5):
+if len(sys.argv) == 5:
selectivity_fn = sys.argv[1]
fg_fasta_fn = sys.argv[2]
bg_fasta_fn = sys.argv[3]
@@ -27,13 +27,9 @@ if(len(sys.argv) == 5):
bg_genome_length = os.path.getsize(bg_fasta_fn)
else:
print "please specify your inputs"
- print "ex: score_mers.py selectivity_file fg_fasta_file bg_fasta_file"
+ print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file"
exit()
-# empty class to fill up mer information with
-class Mer:
- pass
-
# import our variables
cpus = int(os.environ.get("cpus", cpu_count()));
min_mer_range = int(os.environ.get("min_mer_range", 6));
@@ -82,36 +78,46 @@ def get_max_consecutive_binding(mer1, mer2):
return max_bind
-def pop_fg(mer):
- ''' helper for map function '''
- populate_locations(fg_fasta_fn, fg_mers, mer)
-
-def pop_bg(mer):
- ''' helper for map function '''
- populate_locations(bg_fasta_fn, bg_mers, mer)
-
-def populate_locations(input_fn, mers, mer):
+def populate_locations(selected_mers, mer_dic, input_fn):
'''
Run the strstreamone command, and parse in the integers that are output
- by the command, and add it to mers[mer].pts
+ by the command, and add it to mers[mer]
strstreamone just prints the location of a string argv[1] in stdout.
We also do the reverse compliment, using tac and tr piped together.
'''
+ import tempfile
- cmd = 'strstreamone ' + mer + " < " + input_fn
+
+ cmds = []
+ # strip file of header and delete newlines
+ cmds.append("grep -v '^>' " + input_fn + " | tr -d '\\n' | strstream ")
+ # reverse file, strip and delete newlines
+ cmds.append("tac " + input_fn + " | rev | grep -v '>$' | tr -d '\\n' | tr [ACGT] [TGCA] | strstream ")
- strstream = Popen(cmd, stdout=PIPE, shell=True)
- for line in strstream.stdout:
- mers[mer].pts.append(int(line))
+ for cmd in cmds:
+ fid, merlist_fn = tempfile.mkstemp()
+
+ # write our mers out to a fifi
+ merlist_fh = open(merlist_fn, 'w')
+ for mer in selected_mers:
+ print mer
+ merlist_fh.write(mer + '\n')
- cmd = 'tac ' + input_fn + " | tr '[ACGT]' '[TGCA]' | strstreamone " + mer + " " + input_fn
- strstream = Popen(cmd, stdout=PIPE, shell=True)
- for line in strstream.stdout:
- mers[mer].pts.append(int(line))
+ merlist_fh.flush()
+ # add our merlist fn to our command
+ cmd = cmd + " " + merlist_fn
+ strstream = Popen(cmd, stdout=PIPE, shell=True)
+ for line in strstream.stdout:
+ (mer, pos) = line.strip().split(" ")
+ mer_dic[selected_mers[int(mer)]].append(int(pos))
+
+ merlist_fh.close()
+
+
def filter_mers(combination):
for combo in combinations(combination, 2):
@@ -129,8 +135,8 @@ def filter_mers(combination):
def check_feasible(selected):
total = 0;
for mer in selected:
- total += len(fg_mers[mer].pts)
- if (fg_genome_length / total) > max_mer_distance:
+ total += len(fg_mers[mer])
+ if (fg_genome_length / (total + 1 )) > max_mer_distance:
print "even if we select all top ", max_select,
print "mers disregarding any critera, and they were perfectly evenly spaced we would ",
print "still not meet the right max mer distance < ", max_mer_distance, "requirement."
@@ -181,7 +187,7 @@ def score(combination):
fg_dist = []
for mer in combination:
- fg_pts = fg_pts + fg_mers[mer].pts
+ fg_pts = fg_pts + fg_mers[mer]
fg_pts = fg_pts + seq_ends
@@ -211,7 +217,7 @@ def score(combination):
bg_dist = []
for mer in combination:
- bg_pts = bg_pts + bg_mers[mer].pts
+ bg_pts = bg_pts + bg_mers[mer]
if len(bg_pts()) <= 1:
bg_pts.append(0, 1, fg_genome_length)
@@ -230,9 +236,10 @@ def score(combination):
def load_end_points(fn):
- end_points = []
+ end_points = [0]
cmd = "sequence_end_points < " + fn
+ print cmd
points_fh = Popen(cmd, stdout=PIPE, shell=True)
for line in points_fh.stdout:
@@ -268,29 +275,26 @@ def main():
selected = []
selectivity_fh = open(selectivity_fn, "r")
- # get our genome length
-
- for row in selectivity_fh:
- (mer, fg_count, bg_count, selectivity) = row.split()
- fg_mers[mer] = Mer()
- fg_mers[mer].pts = []
- fg_mers[mer].count = fg_count
- bg_mers[mer] = Mer()
- bg_mers[mer].pts = []
- bg_mers[mer].count = bg_count
- selected.append([mer, selectivity])
-
- selected = selected[-max_check:]
- selected_mers = [row[0] for row in selected]
- # print selected_mers
+ # load our mer list into python
+ mer_selectivity = selectivity_fh.readlines()
+
+ # get the last max_check (it's sorted)
+ selected_mers = mer_selectivity[-max_check:]
+ selected_mers = [x.split()[0] for x in selected_mers]
+
+ # load it into our fg and bg dictionary points
+ for mer in selected_mers:
+ fg_mers[mer] = []
+ bg_mers[mer] = []
print "Populating sequence end points"
seq_ends = load_end_points(fg_fasta_fn)
+
print "Populating foreground locations"
- map(pop_fg, selected_mers)
+ populate_locations(selected_mers, fg_mers, fg_fasta_fn)
print "Populating background locations"
- map(pop_bg, selected_mers)
+ populate_locations(selected_mers, bg_mers, bg_fasta_fn)
print "calculating heterodimer distances"
load_heterodimer_dic(selected_mers)