diff options
-rw-r--r-- | README.md | 24 | ||||
-rwxr-xr-x | src/output_full_genome.py | 19 |
2 files changed, 31 insertions, 12 deletions
@@ -230,11 +230,17 @@ You need to use **valid** python code. ## Filters -There are several filters that our mers go through, to eliminate ones that won't fit our needs. They are all configurable via the tunable parameters. If you look in a output directory, you'll see a folder called "passes-filter". This contains a file for each of the different steps in the pipeline, and the contents of each file is what 'passes' that filter. +There are several filters that our mers go through, to eliminate ones that +won't fit our needs. They are all configurable via the tunable parameters. If +you look in an output directory, you'll see a folder called "passes-filter". +This contains a file for each of the different steps in the pipeline, and the +contents of each file is what 'passes' that filter. -For example, if you ignored the mer 'AAAAA', then in passes-filter/1-$foreground-ignore-mers there would be no line containing that. +For example, if you ignored the mer 'AAAAA', then in +passes-filter/1-$foreground-ignore-mers there would be no line containing that. -The filter system works like a big pipe, whatever gets filtered out won't make it to the next step. the order is like this +The filter system works like a big pipe, whatever gets filtered out won't make +it to the next step. The order is like this: All mers -> ignore_mers -> ignore_all_mers -> average_binding -> non_melting -> consecutive_binding @@ -272,3 +278,15 @@ background count, and the mer selectivity value. (higher is better) score mers outputs a tab delimited file with 6 columns: nb_primers Combination Score FG_mean_dist FG_stdev_dist BG_ratio + +## Post Processing + +To get a more detailed look at each scored combination we provide the +output\_full\_genome.py script. This script will output all of the points in a +selected set along with some metadata, including position, what sequence it is in, +what strand and what mer it is. 
+ + output_full_genome.py -f fg.fa -s fg.fa_bg.fa/run_12/top-scores -n 15 -o sets + +This will output one file for each of the top 15 sets in top-scores, in the +folder sets. diff --git a/src/output_full_genome.py b/src/output_full_genome.py index e55136a..eb12a82 100755 --- a/src/output_full_genome.py +++ b/src/output_full_genome.py @@ -72,15 +72,15 @@ def populate_locations(selected_mers, mer_dic, input_fn, length): cmds = [] # strip file of header and delete newlines - cmds.append(["grep -v '^>' " + input_fn + " | tr -d '\\n' | strstream ", False]) + cmds.append(["grep -v '^>' " + input_fn + " | tr -d '\\n' | strstream ", False, 5]) # reverse file, strip and delete newlines cmds.append(["tac " + input_fn + \ "| rev " \ "| grep -v '>$' " \ "| tr -d '\\n' " \ - "| tr [ACGT] [TGCA] | strstream ", True]) + "| tr [ACGT] [TGCA] | strstream ", True, 3]) - for (cmd, reverse) in cmds: + for (cmd, reverse, strand) in cmds: if(debug): print(cmd) _, merlist_fn = tempfile.mkstemp() @@ -98,12 +98,11 @@ def populate_locations(selected_mers, mer_dic, input_fn, length): if reverse: for line in strstream.stdout: (mer, pos) = line.split(" ") - pos = length - int(pos) - mer_dic[selected_mers[int(mer)]].append(pos) + mer_dic[selected_mers[int(mer)]].append([pos, strand]) else: for line in strstream.stdout: (mer, pos) = line.split(" ") - mer_dic[selected_mers[int(mer)]].append(int(pos)) + mer_dic[selected_mers[int(mer)]].append([int(pos), strand]) if strstream.wait() is not 0: print "executing", cmd, "failed" @@ -127,12 +126,13 @@ def main(): parser.error(args.output_directory + "must point to a directory") elif not os.path.isdir(args.output_directory): os.mkdir(args.output_directory) - score_fh = open(args.scores, "r") global seq_ends seq_ends = load_end_points(args.fasta) + length = get_length(args.fasta) + nb_done = 0; for line in score_fh: # skip headers @@ -153,15 +153,16 @@ def main(): new_populate.append(mer) if len(new_populate) is not 0: - populate_locations(new_populate, 
mers, args.fasta, get_length(args.fasta)) + populate_locations(new_populate, mers, args.fasta, length) pts = [] for mer in combination: for pt in mers[mer]: - pts.append([pt, mer, get_sequence(pt)]) + pts.append([pt[0], pt[1], mer, get_sequence(pt[0])]) pts = sorted(pts, key = lambda row: row[0]) + fh.write("pt\tstrand\tmer\tsequence\n") for pt in pts: fh.write('\t'.join(str(x) for x in pt) + '\n') |