diff options
-rw-r--r-- | Makefile | 6 | ||||
-rw-r--r-- | README.md | 2 | ||||
-rwxr-xr-x | SelectiveGenomeAmplification | 20 | ||||
-rwxr-xr-x | SelectiveGenomeAmplificationUI | 6 | ||||
-rw-r--r-- | src/filter_average_binding.py | 35 | ||||
-rw-r--r-- | src/filter_melting_temperature.py (renamed from src/below_melting_temperature.py) | 1 | ||||
-rwxr-xr-x | src/score_mers.py | 53 | ||||
-rwxr-xr-x | src/select_mers.py | 9 |
8 files changed, 95 insertions, 37 deletions
@@ -20,15 +20,19 @@ clean: rm -vf bin/* -Rv install: all + # c tools install -c bin/strstream $(DEST) install -c bin/filter_melting_range $(DEST) install -c bin/strstreamone $(DEST) install -c bin/sequence_end_points $(DEST) + # bash scripts install -c SelectiveGenomeAmplification $(DEST) install -c SelectiveGenomeAmplificationUI $(DEST) + # python scripts install -c src/select_mers.py $(DEST) install -c src/score_mers.py $(DEST) install -c src/score_wrapper.sh $(DEST) - install -c src/below_melting_temperature.py $(DEST) + install -c src/filter_melting_temperature.py $(DEST) install -c src/filter_max_consecutive_binding.py $(DEST) + install -c src/filter_average_binding.py $(DEST) @@ -41,7 +41,7 @@ Y | counts\_directory | $output\_directory/.tmp | directory for counts directory Y | tmp\_directory=$output\_directory/.tmp | temporary files directory Y | max\_melting\_temp | 30° | maximum melting temp of mers Y | min\_melting\_temp | 0° | minimum melting temp of mers -Y | min\_mer\_count | Not Enabled (0) | only select mers that occur more frequently than this number +Y | min\_foreground\_binding\_average | 50000 | elminate mers that appear less frequently than the average (length of foreground / # of occurances) Y | max\_select | 15 | maximum number of mers to pick Y | max\_check | 35 | maximum number of mers to select (check the top #) Y | ignore\_mers | Not Enabled | mers to explicitly ignore, space seperated ex. ignore\_mers="ACAGTA ACCATAA ATATATAT" diff --git a/SelectiveGenomeAmplification b/SelectiveGenomeAmplification index b3bde4d..b66d06c 100755 --- a/SelectiveGenomeAmplification +++ b/SelectiveGenomeAmplification @@ -41,8 +41,8 @@ fi : ${max_melting_temp=30} : ${min_melting_temp=0} -# minimum mer count -: ${min_mer_count=0} +# minimum average binding distance in the foreground +: ${min_foreground_binding_average=50000} # maximum mers to pick : ${max_select=15} @@ -62,7 +62,7 @@ export max_mer_range export max_select -export min_mer_count +export min_foreground_binding_average export max_mer_distance export max_melting_temp @@ -130,17 +130,21 @@ for var in ignore_mers min_mer_range max_check cpus max_consecutive_binding max_ echo $var "${!var}" >> $output_directory/$current_run/parameters done; -non_melting=$output_directory/$current_run/`basename $foreground`-counts-non-melting-$min_melting_temp-$max_melting_temp -filtered_binding=$output_directory/$current_run/`basename $foreground`-counts-filtered-binding-$min_melting_temp-$max_melting_temp +average_binding=$output_directory/$current_run/`basename $foreground`-counts-average-binding +consecutive_binding=$output_directory/$current_run/`basename $foreground`-counts-consecutive-binding +non_melting=$output_directory/$current_run/`basename $foreground`-counts-non-melting + +echo "checking if mers appear at least as often in the fg as the average binding site or more $min_foreground_binding_average" +cat $fg_counts | filter_average_binding.py $foreground $min_foreground_binding_average > $average_binding || exit 1 echo "checking if mers are within the melting range $min_melting_temp $max_melting_temp" -cat $fg_counts | below_melting_temperature.py $min_melting_temp $max_melting_temp > $non_melting || exit 1 +cat $average_binding | filter_melting_temperature.py $min_melting_temp $max_melting_temp > $non_melting || exit 1 echo "filtering out elements that have more consecutive binding mers than allowed by \$max_consecutive_binding $max_consecutive_binding" -cat $non_melting | filter_max_consecutive_binding.py $max_consecutive_binding > $filtered_binding || exit 1 +cat $non_melting | filter_max_consecutive_binding.py $max_consecutive_binding > $consecutive_binding || exit 1 echo "scoring mer selectivity" -select_mers.py $non_melting $bg_counts > $selected || exit 1 +select_mers.py $consecutive_binding $bg_counts > $selected || exit 1 echo "scoring top mers based on selectivity" score_wrapper.sh $selected $foreground $background $output_directory/$current_run/scores-output || exit 1 diff --git a/SelectiveGenomeAmplificationUI b/SelectiveGenomeAmplificationUI index 9c5e1c6..82d3f9e 100755 --- a/SelectiveGenomeAmplificationUI +++ b/SelectiveGenomeAmplificationUI @@ -27,9 +27,9 @@ questions = [ 'default_str': '6', 'variable': 'min_mer_range' }, - { 'question': 'minimum mer count you want to have for your selected mer?', - 'default_str': 'None', - 'variable': 'min_mer_count' }, + { 'question': 'eliminate mers that appear less frequently on average than this number ?', + 'default_str': '50000', + 'variable': 'min_foreground_binding_average' }, { 'question': 'maximum size of mer combinations you want to search and select?', 'default_str': '15', diff --git a/src/filter_average_binding.py b/src/filter_average_binding.py new file mode 100644 index 0000000..d250c98 --- /dev/null +++ b/src/filter_average_binding.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +import sys +import os + +debug = os.environ.get("debug", False) + +from subprocess import Popen +from subprocess import PIPE + +def get_length(fn): + + cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c" + + if debug: + print "loading sequence end points" + print "executing: " + cmd + + points_fh = Popen(cmd, stdout=PIPE, shell=True) + + return int(points_fh.stdout.readline()) + +if len(sys.argv) < 2: + print "filter_average_binding.py foreground.fa binding_distance" + exit() + +foreground = sys.argv[1] +distance = int(sys.argv[2]) + +genome_length = get_length(foreground) + +for line in sys.stdin: + (mer, count) = line.split() + if (genome_length / int(count)) < distance: + sys.stdout.write(line) + diff --git a/src/below_melting_temperature.py b/src/filter_melting_temperature.py index e91e777..9e5477b 100644 --- a/src/below_melting_temperature.py +++ b/src/filter_melting_temperature.py @@ -23,7 +23,6 @@ min_melting_temp = float(sys.argv[1]) max_melting_temp = float(sys.argv[2]) -output = [] for line in sys.stdin: if min_melting_temp < Tm_staluc(line.split("\t")[0]) < max_melting_temp: sys.stdout.write(line) diff --git a/src/score_mers.py b/src/score_mers.py index 4679c45..db7f838 100755 --- a/src/score_mers.py +++ b/src/score_mers.py @@ -18,22 +18,14 @@ bg_mers = {} seq_ends = [] -if len(sys.argv) == 5: - selectivity_fn = sys.argv[1] - fg_fasta_fn = sys.argv[2] - bg_fasta_fn = sys.argv[3] - output_file = sys.argv[4] - - fg_genome_length = os.path.getsize(fg_fasta_fn) - bg_genome_length = os.path.getsize(bg_fasta_fn) -else: - print "please specify your inputs" - print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file" - exit() +fg_genome_length = 0 +bg_genome_length = 0 + +output_file = "" # import our variables cpus = int(os.environ.get("cpus", cpu_count())) -debug = int(os.environ.get("debug", False)) +debug = os.environ.get("debug", False) min_mer_range = int(os.environ.get("min_mer_range", 6)) max_mer_range = int(os.environ.get("max_mer_range", 12)) min_mer_count = int(os.environ.get("min_mer_count", 0)) @@ -42,7 +34,6 @@ max_check = int(os.environ.get("max_check", 35)) max_mer_distance = int(os.environ.get("max_mer_distance", 5000)) max_consecutive_binding = int(os.environ.get("max_consecutive_binding", 4)) - def get_max_consecutive_binding(mer1, mer2): ''' Return the maximum number of consecutively binding mers @@ -237,6 +228,7 @@ def score(combination): return [combination, mer_score, fg_mean_dist, fg_std_dist, bg_ratio] def load_end_points(fn): + ''' get all the points of the end of each sequence in a sample ''' end_points = [0] @@ -253,6 +245,22 @@ def load_end_points(fn): return end_points +def get_length(fn): + ''' get length of a genome ( number of base pairs )''' + + cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c" + + if debug: + print "loading sequence end points" + print "executing: " + cmd + points_fh = Popen(cmd, stdout=PIPE, shell=True) + + length = points_fh.stdout.readline() + + length = int(length) + + return length + def load_heterodimer_dic(selected_mers): ''' Generate a heterodimer dict which contains every possible combination of @@ -277,6 +285,23 @@ def main(): Score Combinations For All Sizes ''' + global fg_genome_length + global bg_genome_length + global output_file + + if len(sys.argv) == 5: + selectivity_fn = sys.argv[1] + fg_fasta_fn = sys.argv[2] + bg_fasta_fn = sys.argv[3] + output_file = sys.argv[4] + else: + print "please specify your inputs" + print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file" + exit() + + fg_genome_length = get_length(fg_fasta_fn) + bg_genome_length = get_length(bg_fasta_fn) + selectivity_fh = open(selectivity_fn, "r") # load our mer list into python diff --git a/src/select_mers.py b/src/select_mers.py index ef019c0..5bd6877 100755 --- a/src/select_mers.py +++ b/src/select_mers.py @@ -5,8 +5,6 @@ import os fg_mers = {} bg_mers = {} -min_mer_count = int(os.environ.get("min_mer_count", 0)); - if(len(sys.argv) == 3): fg_count_fn = sys.argv[1] bg_count_fn = sys.argv[2] @@ -59,13 +57,6 @@ def main(): (mer, count) = line.split() mers[mer] = int(count) - if min_mer_count >= 1: - for mer in fg_mers.keys(): - if(fg_mers[mer] < min_mer_count): - del fg_mers[mer] - if mer in bg_mers: - del bg_mers[mer] - for mer in bg_mers.keys(): if mer not in fg_mers: del bg_mers[mer] |