aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-04-07 12:57:14 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2014-04-07 12:57:14 -0400
commit0ff852452621030cb90c36c0884b4f03a6f45319 (patch)
treef9267712003fe1f670a95da18d60fdce5d04a250
parent13af712fbd3395d07d0940edf518fbf7eece3b99 (diff)
initial commit of output_full_genome.py
-rwxr-xr-xsrc/output_full_genome.py170
1 files changed, 170 insertions, 0 deletions
diff --git a/src/output_full_genome.py b/src/output_full_genome.py
new file mode 100755
index 0000000..a4f99d5
--- /dev/null
+++ b/src/output_full_genome.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python
+import sys
+import os
+import argparse
+
+mers = {}
+debug = False
+
+from subprocess import Popen
+from subprocess import PIPE
+
+
+def get_sequence(pt):
+
+ for it, seq in enumerate(seq_ends):
+ if pt <= seq:
+ return it
+
+def load_end_points(fn):
+ ''' get all the points of the end of each sequence in a sample '''
+
+ end_points = [0]
+
+ cmd = "sequence_end_points < " + fn
+
+ if debug:
+ print "loading sequence end points"
+ print "executing: " + cmd
+
+ points_fh = Popen(cmd, stdout=PIPE, shell=True)
+
+ for line in points_fh.stdout:
+ end_points.append(int(line))
+
+
+ if points_fh.wait() is not 0:
+ print "executing", cmd, "failed"
+
+ return end_points
+
+def get_length(fn):
+ ''' get length of a genome ( number of base pairs )'''
+
+ cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c"
+
+ if debug:
+ print "loading sequence end points"
+ print "executing: " + cmd
+ grep = Popen(cmd, stdout=PIPE, shell=True)
+
+ length = grep.stdout.readline()
+
+ length = int(length)
+
+ if grep.wait() is not 0:
+ print "executing", cmd, "failed"
+
+
+ return length
+
+def populate_locations(selected_mers, mer_dic, input_fn, length):
+ '''
+ Run the strstreamone command, and parse in the integers that are output
+ by the command, and add it to mers[mer]
+
+ strstreamone just prints the location of a string argv[1] in stdout.
+
+ We also do the reverse compliment, using tac and tr piped together.
+ '''
+ import tempfile
+
+
+ cmds = []
+ # strip file of header and delete newlines
+ cmds.append(["grep -v '^>' " + input_fn + " | tr -d '\\n' | strstream ", False])
+ # reverse file, strip and delete newlines
+ cmds.append(["tac " + input_fn + \
+ "| rev " \
+ "| grep -v '>$' " \
+ "| tr -d '\\n' " \
+ "| tr [ACGT] [TGCA] | strstream ", True])
+
+ for (cmd, reverse) in cmds:
+ _, merlist_fn = tempfile.mkstemp()
+
+ # write our mers out to a fifi
+ merlist_fh = open(merlist_fn, 'w')
+ for mer in selected_mers:
+ merlist_fh.write(mer + '\n')
+
+ merlist_fh.flush()
+ # add our merlist fn to our command
+ cmd = cmd + " " + merlist_fn
+
+ strstream = Popen(cmd, stdout=PIPE, shell=True)
+ if reverse:
+ for line in strstream.stdout:
+ (mer, pos) = line.split(" ")
+ pos = length - int(pos)
+ mer_dic[selected_mers[int(mer)]].append(pos)
+ else:
+ for line in strstream.stdout:
+ (mer, pos) = line.split(" ")
+ mer_dic[selected_mers[int(mer)]].append(int(pos))
+
+ if strstream.wait() is not 0:
+ print "executing", cmd, "failed"
+
+ merlist_fh.close()
+
+
+def main():
+
+ parser = argparse.ArgumentParser(description="score mers")
+ parser.add_argument("-f", "--fasta", help="input fasta file", required=True)
+ parser.add_argument("-s", "--scores", help="scores file", required=True)
+ parser.add_argument("-o", "--output_directory", help="scores file", required=True)
+ parser.add_argument("-n", "--number", help="output top n", required=False, type=int, default=20)
+
+ args = parser.parse_args()
+
+ if os.path.isfile(args.output_directory):
+ parser.error(args.output_directory + "must point to a directory")
+ elif not os.path.isdir(args.output_directory):
+ os.mkdir(args.output_directory)
+
+ score_fh = open(args.scores, "r")
+
+ global seq_ends
+ seq_ends = load_end_points(args.fasta)
+
+ nb_done = 0;
+ for line in score_fh:
+ # skip headers
+ if line.startswith("#"):
+ continue
+
+ # get mers
+ combination = line.split('\t')[1].split()
+
+ # open output
+ fh = open(args.output_directory + "/" + '-'.join(combination), 'w')
+
+ # populate only un-populated mers
+ new_populate = []
+ for mer in combination:
+ if mer not in mers:
+ mers[mer] = []
+ new_populate.append(mer)
+
+ if len(new_populate) is not 0:
+ populate_locations(new_populate, mers, args.fasta, get_length(args.fasta))
+
+ pts = []
+ for mer in combination:
+ for pt in mers[mer]:
+ pts.append([pt, mer, get_sequence(pt)])
+
+ pts = sorted(pts, key = lambda row: row[0])
+
+ for pt in pts:
+ fh.write(str(pt))
+ fh.write("\n")
+
+ nb_done += 1
+ if nb_done == args.number:
+ exit()
+
+if __name__ == "__main__":
+ sys.exit(main())