From 06fa848b90982ddcd4308bb88d70a0d5f11f785b Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Tue, 25 Mar 2014 16:33:36 -0400 Subject: add average binding filter --- src/below_melting_temperature.py | 29 --------------------- src/filter_average_binding.py | 35 ++++++++++++++++++++++++++ src/filter_melting_temperature.py | 28 +++++++++++++++++++++ src/score_mers.py | 53 ++++++++++++++++++++++++++++----------- src/select_mers.py | 9 ------- 5 files changed, 102 insertions(+), 52 deletions(-) delete mode 100644 src/below_melting_temperature.py create mode 100644 src/filter_average_binding.py create mode 100644 src/filter_melting_temperature.py (limited to 'src') diff --git a/src/below_melting_temperature.py b/src/below_melting_temperature.py deleted file mode 100644 index e91e777..0000000 --- a/src/below_melting_temperature.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python -import sys -from Bio.SeqUtils.MeltingTemp import Tm_staluc - -# naiive -def in_temp_range(kmer): - - A = kmer.count('A') - C = kmer.count('C') - G = kmer.count('G') - T = kmer.count('T') - - melt_temp = 0.0; - - if len(kmer) < 13: - melt_temp = ((A+T) * 2) + ((C+G) * 4) - else: - melt_temp = 64.9 + 41*(G+C-16.4)/(A+T+G+C) - - return min_melting_temp < melt_temp < max_melting_temp - -min_melting_temp = float(sys.argv[1]) -max_melting_temp = float(sys.argv[2]) - - -output = [] -for line in sys.stdin: - if min_melting_temp < Tm_staluc(line.split("\t")[0]) < max_melting_temp: - sys.stdout.write(line) diff --git a/src/filter_average_binding.py b/src/filter_average_binding.py new file mode 100644 index 0000000..d250c98 --- /dev/null +++ b/src/filter_average_binding.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +import sys +import os + +debug = os.environ.get("debug", False) + +from subprocess import Popen +from subprocess import PIPE + +def get_length(fn): + + cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c" + + if debug: + print "loading sequence end points" + print "executing: " + cmd + + points_fh = Popen(cmd, stdout=PIPE, shell=True) + + return int(points_fh.stdout.readline()) + +if len(sys.argv) < 2: + print "filter_average_binding.py foreground.fa binding_distance" + exit() + +foreground = sys.argv[1] +distance = int(sys.argv[2]) + +genome_length = get_length(foreground) + +for line in sys.stdin: + (mer, count) = line.split() + if (genome_length / int(count)) < distance: + sys.stdout.write(line) + diff --git a/src/filter_melting_temperature.py b/src/filter_melting_temperature.py new file mode 100644 index 0000000..9e5477b --- /dev/null +++ b/src/filter_melting_temperature.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python +import sys +from Bio.SeqUtils.MeltingTemp import Tm_staluc + +# naiive +def in_temp_range(kmer): + + A = kmer.count('A') + C = kmer.count('C') + G = kmer.count('G') + T = kmer.count('T') + + melt_temp = 0.0; + + if len(kmer) < 13: + melt_temp = ((A+T) * 2) + ((C+G) * 4) + else: + melt_temp = 64.9 + 41*(G+C-16.4)/(A+T+G+C) + + return min_melting_temp < melt_temp < max_melting_temp + +min_melting_temp = float(sys.argv[1]) +max_melting_temp = float(sys.argv[2]) + + +for line in sys.stdin: + if min_melting_temp < Tm_staluc(line.split("\t")[0]) < max_melting_temp: + sys.stdout.write(line) diff --git a/src/score_mers.py b/src/score_mers.py index 4679c45..db7f838 100755 --- a/src/score_mers.py +++ b/src/score_mers.py @@ -18,22 +18,14 @@ bg_mers = {} seq_ends = [] -if len(sys.argv) == 5: - selectivity_fn = sys.argv[1] - fg_fasta_fn = sys.argv[2] - bg_fasta_fn = sys.argv[3] - output_file = sys.argv[4] - - fg_genome_length = os.path.getsize(fg_fasta_fn) - bg_genome_length = os.path.getsize(bg_fasta_fn) -else: - print "please specify your inputs" - print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file" - exit() +fg_genome_length = 0 +bg_genome_length = 0 + +output_file = "" # import our variables cpus = int(os.environ.get("cpus", cpu_count())) -debug = int(os.environ.get("debug", False)) +debug = os.environ.get("debug", False) min_mer_range = int(os.environ.get("min_mer_range", 6)) max_mer_range = int(os.environ.get("max_mer_range", 12)) min_mer_count = int(os.environ.get("min_mer_count", 0)) @@ -42,7 +34,6 @@ max_check = int(os.environ.get("max_check", 35)) max_mer_distance = int(os.environ.get("max_mer_distance", 5000)) max_consecutive_binding = int(os.environ.get("max_consecutive_binding", 4)) - def get_max_consecutive_binding(mer1, mer2): ''' Return the maximum number of consecutively binding mers @@ -237,6 +228,7 @@ def score(combination): return [combination, mer_score, fg_mean_dist, fg_std_dist, bg_ratio] def load_end_points(fn): + ''' get all the points of the end of each sequence in a sample ''' end_points = [0] @@ -253,6 +245,22 @@ def load_end_points(fn): return end_points +def get_length(fn): + ''' get length of a genome ( number of base pairs )''' + + cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c" + + if debug: + print "loading sequence end points" + print "executing: " + cmd + points_fh = Popen(cmd, stdout=PIPE, shell=True) + + length = points_fh.stdout.readline() + + length = int(length) + + return length + def load_heterodimer_dic(selected_mers): ''' Generate a heterodimer dict which contains every possible combination of @@ -277,6 +285,23 @@ def main(): Score Combinations For All Sizes ''' + global fg_genome_length + global bg_genome_length + global output_file + + if len(sys.argv) == 5: + selectivity_fn = sys.argv[1] + fg_fasta_fn = sys.argv[2] + bg_fasta_fn = sys.argv[3] + output_file = sys.argv[4] + else: + print "please specify your inputs" + print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file" + exit() + + fg_genome_length = get_length(fg_fasta_fn) + bg_genome_length = get_length(bg_fasta_fn) + selectivity_fh = open(selectivity_fn, "r") # load our mer list into python diff --git a/src/select_mers.py b/src/select_mers.py index ef019c0..5bd6877 100755 --- a/src/select_mers.py +++ b/src/select_mers.py @@ -5,8 +5,6 @@ import os fg_mers = {} bg_mers = {} -min_mer_count = int(os.environ.get("min_mer_count", 0)); - if(len(sys.argv) == 3): fg_count_fn = sys.argv[1] bg_count_fn = sys.argv[2] @@ -59,13 +57,6 @@ def main(): (mer, count) = line.split() mers[mer] = int(count) - if min_mer_count >= 1: - for mer in fg_mers.keys(): - if(fg_mers[mer] < min_mer_count): - del fg_mers[mer] - if mer in bg_mers: - del bg_mers[mer] - for mer in bg_mers.keys(): if mer not in fg_mers: del bg_mers[mer] -- cgit v1.2.1