From c5ddb534d0f06542c3818f9b17e2ab93ee981698 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Mon, 14 Apr 2014 15:17:12 -0400 Subject: don't use strstream, use kmer_locations --- src/output_full_genome.py | 65 ++++++++++++++++++++--------------------------- src/score_mers.py | 61 ++++++++++++++++++++------------------------ 2 files changed, 55 insertions(+), 71 deletions(-) (limited to 'src') diff --git a/src/output_full_genome.py b/src/output_full_genome.py index eb12a82..5a0e3d5 100755 --- a/src/output_full_genome.py +++ b/src/output_full_genome.py @@ -58,58 +58,49 @@ def get_length(fn): return length -def populate_locations(selected_mers, mer_dic, input_fn, length): + +def size_iterator(mer_arr): + for mer_len in set([ len(it) for it in mer_arr]): + mers = filter(lambda x: len(x) == mer_len, mer_arr) + yield mers + +def populate_locations(mers, mer_dic, input_fn, length): ''' - Run the strstreamone command, and parse in the integers that are output + Run the kmer_locations command, and parse in the integers that are output by the command, and add it to mers[mer] - strstreamone just prints the location of a string argv[1] in stdout. - - We also do the reverse compliment, using tac and tr piped together. + We also do the reverse compliment, so we subtract the length of the genome for it ''' import tempfile - - cmds = [] - # strip file of header and delete newlines - cmds.append(["grep -v '^>' " + input_fn + " | tr -d '\\n' | strstream ", False, 5]) - # reverse file, strip and delete newlines - cmds.append(["tac " + input_fn + \ - "| rev " \ - "| grep -v '>$' " \ - "| tr -d '\\n' " \ - "| tr [ACGT] [TGCA] | strstream ", True, 3]) - - for (cmd, reverse, strand) in cmds: - if(debug): - print(cmd) + # write our mers out to a fifi + for n_mers in size_iterator(mers): _, merlist_fn = tempfile.mkstemp() - - # write our mers out to a fifi merlist_fh = open(merlist_fn, 'w') - for mer in selected_mers: + for mer in n_mers: + print mer merlist_fh.write(mer + '\n') merlist_fh.flush() # add our merlist fn to our command - cmd = cmd + " " + merlist_fn - - strstream = Popen(cmd, stdout=PIPE, shell=True) - if reverse: - for line in strstream.stdout: - (mer, pos) = line.split(" ") - mer_dic[selected_mers[int(mer)]].append([pos, strand]) - else: - for line in strstream.stdout: - (mer, pos) = line.split(" ") - mer_dic[selected_mers[int(mer)]].append([int(pos), strand]) + cmd = "kmer_locations -k " + str(len(n_mers[0])) + " -c -m \"" + merlist_fn + "\" -l -i \"" + input_fn + "\"" - if strstream.wait() is not 0: - print "executing", cmd, "failed" + if(debug): + print "Executing", cmd + + cmd_pipe = Popen(cmd, stdout=PIPE, shell=True) + for line in cmd_pipe.stdout: + line_s = line[:-1].split(" ") + if len(line_s) == 2: + mer_dic[line_s[0]].append([int(line_s[1]), 3]) + elif len(line_s) == 3: + mer_dic[line_s[0]].append([length - int(line_s[1]), 5]) + + if cmd_pipe.wait() is not 0: + print "Executing", cmd, "failed" + exit() merlist_fh.close() - - def main(): parser = argparse.ArgumentParser(description="take a top-scores file, and \ diff --git a/src/score_mers.py b/src/score_mers.py index e245f39..7538ed4 100755 --- a/src/score_mers.py +++ b/src/score_mers.py @@ -80,54 +80,46 @@ def get_max_consecutive_binding(mer1, mer2): return max_bind +def size_iterator(mer_arr): + for mer_len in set([ len(it) for it in mer_arr]): + mers = filter(lambda x: len(x) == mer_len, mer_arr) + yield mers -def populate_locations(selected_mers, mer_dic, input_fn, length): +def populate_locations(mers, mer_dic, input_fn, length): ''' - Run the strstreamone command, and parse in the integers that are output + Run the kmer_locations command, and parse in the integers that are output by the command, and add it to mers[mer] - strstreamone just prints the location of a string argv[1] in stdout. - - We also do the reverse compliment, using tac and tr piped together. + We also do the reverse compliment, so we subtract the length of the genome for it ''' import tempfile - - cmds = [] - # strip file of header and delete newlines - cmds.append(["grep -v '^>' " + input_fn + " | tr -d '\\n' | strstream ", False]) - # reverse file, strip and delete newlines - cmds.append(["tac " + input_fn + \ - "| rev " \ - "| grep -v '>$' " \ - "| tr -d '\\n' " \ - "| tr [ACGT] [TGCA] | strstream ", True]) - - for (cmd, reverse) in cmds: + # write our mers out to a fifi + for n_mers in size_iterator(mers): _, merlist_fn = tempfile.mkstemp() - - # write our mers out to a fifi merlist_fh = open(merlist_fn, 'w') - for mer in selected_mers: + for mer in n_mers: + print mer merlist_fh.write(mer + '\n') merlist_fh.flush() # add our merlist fn to our command - cmd = cmd + " " + merlist_fn - - strstream = Popen(cmd, stdout=PIPE, shell=True) - if reverse: - for line in strstream.stdout: - (mer, pos) = line.split(" ") - pos = length - int(pos) - mer_dic[selected_mers[int(mer)]].append(pos) - else: - for line in strstream.stdout: - (mer, pos) = line.split(" ") - mer_dic[selected_mers[int(mer)]].append(int(pos)) + cmd = "kmer_locations -k " + str(len(n_mers[0])) + " -c -m \"" + merlist_fn + "\" -l -i \"" + input_fn + "\"" + + if(debug): + print "Executing", cmd + + cmd_pipe = Popen(cmd, stdout=PIPE, shell=True) + for line in cmd_pipe.stdout: + line_s = line[:-1].split(" ") + if len(line_s) == 2: + mer_dic[line_s[0]].append(int(line_s[1])) + elif len(line_s) == 3: + mer_dic[line_s[0]].append(length - int(line_s[1])) - if strstream.wait() is not 0: - print "executing", cmd, "failed" + if cmd_pipe.wait() is not 0: + print "Executing", cmd, "failed" + exit() merlist_fh.close() @@ -170,6 +162,7 @@ def get_length(fn): if grep.wait() is not 0: print "executing", cmd, "failed" + exit() return length -- cgit v1.2.1