From 06fa848b90982ddcd4308bb88d70a0d5f11f785b Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Tue, 25 Mar 2014 16:33:36 -0400 Subject: add average binding filter --- Makefile | 6 ++++- README.md | 2 +- SelectiveGenomeAmplification | 20 +++++++++------ SelectiveGenomeAmplificationUI | 6 ++--- src/below_melting_temperature.py | 29 --------------------- src/filter_average_binding.py | 35 ++++++++++++++++++++++++++ src/filter_melting_temperature.py | 28 +++++++++++++++++++++ src/score_mers.py | 53 ++++++++++++++++++++++++++++----------- src/select_mers.py | 9 ------- 9 files changed, 123 insertions(+), 65 deletions(-) delete mode 100644 src/below_melting_temperature.py create mode 100644 src/filter_average_binding.py create mode 100644 src/filter_melting_temperature.py diff --git a/Makefile b/Makefile index 7d5b400..1f72fa1 100644 --- a/Makefile +++ b/Makefile @@ -20,15 +20,19 @@ clean: rm -vf bin/* -Rv install: all + # c tools install -c bin/strstream $(DEST) install -c bin/filter_melting_range $(DEST) install -c bin/strstreamone $(DEST) install -c bin/sequence_end_points $(DEST) + # bash scripts install -c SelectiveGenomeAmplification $(DEST) install -c SelectiveGenomeAmplificationUI $(DEST) + # python scripts install -c src/select_mers.py $(DEST) install -c src/score_mers.py $(DEST) install -c src/score_wrapper.sh $(DEST) - install -c src/below_melting_temperature.py $(DEST) + install -c src/filter_melting_temperature.py $(DEST) install -c src/filter_max_consecutive_binding.py $(DEST) + install -c src/filter_average_binding.py $(DEST) diff --git a/README.md b/README.md index f7477eb..5163087 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ Y | counts\_directory | $output\_directory/.tmp | directory for counts directory Y | tmp\_directory=$output\_directory/.tmp | temporary files directory Y | max\_melting\_temp | 30° | maximum melting temp of mers Y | min\_melting\_temp | 0° | minimum melting temp of mers -Y | min\_mer\_count | Not Enabled (0) | only select mers that occur more frequently than this number +Y | min\_foreground\_binding\_average | 50000 | elminate mers that appear less frequently than the average (length of foreground / # of occurances) Y | max\_select | 15 | maximum number of mers to pick Y | max\_check | 35 | maximum number of mers to select (check the top #) Y | ignore\_mers | Not Enabled | mers to explicitly ignore, space seperated ex. ignore\_mers="ACAGTA ACCATAA ATATATAT" diff --git a/SelectiveGenomeAmplification b/SelectiveGenomeAmplification index b3bde4d..b66d06c 100755 --- a/SelectiveGenomeAmplification +++ b/SelectiveGenomeAmplification @@ -41,8 +41,8 @@ fi : ${max_melting_temp=30} : ${min_melting_temp=0} -# minimum mer count -: ${min_mer_count=0} +# minimum average binding distance in the foreground +: ${min_foreground_binding_average=50000} # maximum mers to pick : ${max_select=15} @@ -62,7 +62,7 @@ export max_mer_range export max_select -export min_mer_count +export min_foreground_binding_average export max_mer_distance export max_melting_temp @@ -130,17 +130,21 @@ for var in ignore_mers min_mer_range max_check cpus max_consecutive_binding max_ echo $var "${!var}" >> $output_directory/$current_run/parameters done; -non_melting=$output_directory/$current_run/`basename $foreground`-counts-non-melting-$min_melting_temp-$max_melting_temp -filtered_binding=$output_directory/$current_run/`basename $foreground`-counts-filtered-binding-$min_melting_temp-$max_melting_temp +average_binding=$output_directory/$current_run/`basename $foreground`-counts-average-binding +consecutive_binding=$output_directory/$current_run/`basename $foreground`-counts-consecutive-binding +non_melting=$output_directory/$current_run/`basename $foreground`-counts-non-melting + +echo "checking if mers appear at least as often in the fg as the average binding site or more $min_foreground_binding_average" +cat $fg_counts | filter_average_binding.py $foreground $min_foreground_binding_average > $average_binding || exit 1 echo "checking if mers are within the melting range $min_melting_temp $max_melting_temp" -cat $fg_counts | below_melting_temperature.py $min_melting_temp $max_melting_temp > $non_melting || exit 1 +cat $average_binding | filter_melting_temperature.py $min_melting_temp $max_melting_temp > $non_melting || exit 1 echo "filtering out elements that have more consecutive binding mers than allowed by \$max_consecutive_binding $max_consecutive_binding" -cat $non_melting | filter_max_consecutive_binding.py $max_consecutive_binding > $filtered_binding || exit 1 +cat $non_melting | filter_max_consecutive_binding.py $max_consecutive_binding > $consecutive_binding || exit 1 echo "scoring mer selectivity" -select_mers.py $non_melting $bg_counts > $selected || exit 1 +select_mers.py $consecutive_binding $bg_counts > $selected || exit 1 echo "scoring top mers based on selectivity" score_wrapper.sh $selected $foreground $background $output_directory/$current_run/scores-output || exit 1 diff --git a/SelectiveGenomeAmplificationUI b/SelectiveGenomeAmplificationUI index 9c5e1c6..82d3f9e 100755 --- a/SelectiveGenomeAmplificationUI +++ b/SelectiveGenomeAmplificationUI @@ -27,9 +27,9 @@ questions = [ 'default_str': '6', 'variable': 'min_mer_range' }, - { 'question': 'minimum mer count you want to have for your selected mer?', - 'default_str': 'None', - 'variable': 'min_mer_count' }, + { 'question': 'eliminate mers that appear less frequently on average than this number ?', + 'default_str': '50000', + 'variable': 'min_foreground_binding_average' }, { 'question': 'maximum size of mer combinations you want to search and select?', 'default_str': '15', diff --git a/src/below_melting_temperature.py b/src/below_melting_temperature.py deleted file mode 100644 index e91e777..0000000 --- a/src/below_melting_temperature.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python -import sys -from Bio.SeqUtils.MeltingTemp import Tm_staluc - -# naiive -def in_temp_range(kmer): - - A = kmer.count('A') - C = kmer.count('C') - G = kmer.count('G') - T = kmer.count('T') - - melt_temp = 0.0; - - if len(kmer) < 13: - melt_temp = ((A+T) * 2) + ((C+G) * 4) - else: - melt_temp = 64.9 + 41*(G+C-16.4)/(A+T+G+C) - - return min_melting_temp < melt_temp < max_melting_temp - -min_melting_temp = float(sys.argv[1]) -max_melting_temp = float(sys.argv[2]) - - -output = [] -for line in sys.stdin: - if min_melting_temp < Tm_staluc(line.split("\t")[0]) < max_melting_temp: - sys.stdout.write(line) diff --git a/src/filter_average_binding.py b/src/filter_average_binding.py new file mode 100644 index 0000000..d250c98 --- /dev/null +++ b/src/filter_average_binding.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +import sys +import os + +debug = os.environ.get("debug", False) + +from subprocess import Popen +from subprocess import PIPE + +def get_length(fn): + + cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c" + + if debug: + print "loading sequence end points" + print "executing: " + cmd + + points_fh = Popen(cmd, stdout=PIPE, shell=True) + + return int(points_fh.stdout.readline()) + +if len(sys.argv) < 2: + print "filter_average_binding.py foreground.fa binding_distance" + exit() + +foreground = sys.argv[1] +distance = int(sys.argv[2]) + +genome_length = get_length(foreground) + +for line in sys.stdin: + (mer, count) = line.split() + if (genome_length / int(count)) < distance: + sys.stdout.write(line) + diff --git a/src/filter_melting_temperature.py b/src/filter_melting_temperature.py new file mode 100644 index 0000000..9e5477b --- /dev/null +++ b/src/filter_melting_temperature.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python +import sys +from Bio.SeqUtils.MeltingTemp import Tm_staluc + +# naiive +def in_temp_range(kmer): + + A = kmer.count('A') + C = kmer.count('C') + G = kmer.count('G') + T = kmer.count('T') + + melt_temp = 0.0; + + if len(kmer) < 13: + melt_temp = ((A+T) * 2) + ((C+G) * 4) + else: + melt_temp = 64.9 + 41*(G+C-16.4)/(A+T+G+C) + + return min_melting_temp < melt_temp < max_melting_temp + +min_melting_temp = float(sys.argv[1]) +max_melting_temp = float(sys.argv[2]) + + +for line in sys.stdin: + if min_melting_temp < Tm_staluc(line.split("\t")[0]) < max_melting_temp: + sys.stdout.write(line) diff --git a/src/score_mers.py b/src/score_mers.py index 4679c45..db7f838 100755 --- a/src/score_mers.py +++ b/src/score_mers.py @@ -18,22 +18,14 @@ bg_mers = {} seq_ends = [] -if len(sys.argv) == 5: - selectivity_fn = sys.argv[1] - fg_fasta_fn = sys.argv[2] - bg_fasta_fn = sys.argv[3] - output_file = sys.argv[4] - - fg_genome_length = os.path.getsize(fg_fasta_fn) - bg_genome_length = os.path.getsize(bg_fasta_fn) -else: - print "please specify your inputs" - print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file" - exit() +fg_genome_length = 0 +bg_genome_length = 0 + +output_file = "" # import our variables cpus = int(os.environ.get("cpus", cpu_count())) -debug = int(os.environ.get("debug", False)) +debug = os.environ.get("debug", False) min_mer_range = int(os.environ.get("min_mer_range", 6)) max_mer_range = int(os.environ.get("max_mer_range", 12)) min_mer_count = int(os.environ.get("min_mer_count", 0)) @@ -42,7 +34,6 @@ max_check = int(os.environ.get("max_check", 35)) max_mer_distance = int(os.environ.get("max_mer_distance", 5000)) max_consecutive_binding = int(os.environ.get("max_consecutive_binding", 4)) - def get_max_consecutive_binding(mer1, mer2): ''' Return the maximum number of consecutively binding mers @@ -237,6 +228,7 @@ def score(combination): return [combination, mer_score, fg_mean_dist, fg_std_dist, bg_ratio] def load_end_points(fn): + ''' get all the points of the end of each sequence in a sample ''' end_points = [0] @@ -253,6 +245,22 @@ def load_end_points(fn): return end_points +def get_length(fn): + ''' get length of a genome ( number of base pairs )''' + + cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c" + + if debug: + print "loading sequence end points" + print "executing: " + cmd + points_fh = Popen(cmd, stdout=PIPE, shell=True) + + length = points_fh.stdout.readline() + + length = int(length) + + return length + def load_heterodimer_dic(selected_mers): ''' Generate a heterodimer dict which contains every possible combination of @@ -277,6 +285,23 @@ def main(): Score Combinations For All Sizes ''' + global fg_genome_length + global bg_genome_length + global output_file + + if len(sys.argv) == 5: + selectivity_fn = sys.argv[1] + fg_fasta_fn = sys.argv[2] + bg_fasta_fn = sys.argv[3] + output_file = sys.argv[4] + else: + print "please specify your inputs" + print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file" + exit() + + fg_genome_length = get_length(fg_fasta_fn) + bg_genome_length = get_length(bg_fasta_fn) + selectivity_fh = open(selectivity_fn, "r") # load our mer list into python diff --git a/src/select_mers.py b/src/select_mers.py index ef019c0..5bd6877 100755 --- a/src/select_mers.py +++ b/src/select_mers.py @@ -5,8 +5,6 @@ import os fg_mers = {} bg_mers = {} -min_mer_count = int(os.environ.get("min_mer_count", 0)); - if(len(sys.argv) == 3): fg_count_fn = sys.argv[1] bg_count_fn = sys.argv[2] @@ -59,13 +57,6 @@ def main(): (mer, count) = line.split() mers[mer] = int(count) - if min_mer_count >= 1: - for mer in fg_mers.keys(): - if(fg_mers[mer] < min_mer_count): - del fg_mers[mer] - if mer in bg_mers: - del bg_mers[mer] - for mer in bg_mers.keys(): if mer not in fg_mers: del bg_mers[mer] -- cgit v1.2.3