diff options
author | Calvin Morrison <mutantturkey@gmail.com> | 2014-03-25 16:33:36 -0400 |
---|---|---|
committer | Calvin Morrison <mutantturkey@gmail.com> | 2014-03-25 16:33:36 -0400 |
commit | 06fa848b90982ddcd4308bb88d70a0d5f11f785b (patch) | |
tree | 472f8e8675a3b01060159bcf24cd1b2c35fb5d9e /src | |
parent | 55d58f92e388bbed44963565c5073c444ffa60b2 (diff) |
add average binding filter
Diffstat (limited to 'src')
-rw-r--r-- | src/filter_average_binding.py | 35 | ||||
-rw-r--r-- | src/filter_melting_temperature.py (renamed from src/below_melting_temperature.py) | 1 | ||||
-rwxr-xr-x | src/score_mers.py | 53 | ||||
-rwxr-xr-x | src/select_mers.py | 9 |
4 files changed, 74 insertions, 24 deletions
diff --git a/src/filter_average_binding.py b/src/filter_average_binding.py new file mode 100644 index 0000000..d250c98 --- /dev/null +++ b/src/filter_average_binding.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +import sys +import os + +debug = os.environ.get("debug", False) + +from subprocess import Popen +from subprocess import PIPE + +def get_length(fn): + + cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c" + + if debug: + print "loading sequence end points" + print "executing: " + cmd + + points_fh = Popen(cmd, stdout=PIPE, shell=True) + + return int(points_fh.stdout.readline()) + +if len(sys.argv) < 2: + print "filter_average_binding.py foreground.fa binding_distance" + exit() + +foreground = sys.argv[1] +distance = int(sys.argv[2]) + +genome_length = get_length(foreground) + +for line in sys.stdin: + (mer, count) = line.split() + if (genome_length / int(count)) < distance: + sys.stdout.write(line) + diff --git a/src/below_melting_temperature.py b/src/filter_melting_temperature.py index e91e777..9e5477b 100644 --- a/src/below_melting_temperature.py +++ b/src/filter_melting_temperature.py @@ -23,7 +23,6 @@ min_melting_temp = float(sys.argv[1]) max_melting_temp = float(sys.argv[2]) -output = [] for line in sys.stdin: if min_melting_temp < Tm_staluc(line.split("\t")[0]) < max_melting_temp: sys.stdout.write(line) diff --git a/src/score_mers.py b/src/score_mers.py index 4679c45..db7f838 100755 --- a/src/score_mers.py +++ b/src/score_mers.py @@ -18,22 +18,14 @@ bg_mers = {} seq_ends = [] -if len(sys.argv) == 5: - selectivity_fn = sys.argv[1] - fg_fasta_fn = sys.argv[2] - bg_fasta_fn = sys.argv[3] - output_file = sys.argv[4] - - fg_genome_length = os.path.getsize(fg_fasta_fn) - bg_genome_length = os.path.getsize(bg_fasta_fn) -else: - print "please specify your inputs" - print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file" - exit() +fg_genome_length = 0 +bg_genome_length = 0 + +output_file = "" # import our variables cpus = int(os.environ.get("cpus", cpu_count())) -debug = int(os.environ.get("debug", False)) +debug = os.environ.get("debug", False) min_mer_range = int(os.environ.get("min_mer_range", 6)) max_mer_range = int(os.environ.get("max_mer_range", 12)) min_mer_count = int(os.environ.get("min_mer_count", 0)) @@ -42,7 +34,6 @@ max_check = int(os.environ.get("max_check", 35)) max_mer_distance = int(os.environ.get("max_mer_distance", 5000)) max_consecutive_binding = int(os.environ.get("max_consecutive_binding", 4)) - def get_max_consecutive_binding(mer1, mer2): ''' Return the maximum number of consecutively binding mers @@ -237,6 +228,7 @@ def score(combination): return [combination, mer_score, fg_mean_dist, fg_std_dist, bg_ratio] def load_end_points(fn): + ''' get all the points of the end of each sequence in a sample ''' end_points = [0] @@ -253,6 +245,22 @@ def load_end_points(fn): return end_points +def get_length(fn): + ''' get length of a genome ( number of base pairs )''' + + cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c" + + if debug: + print "loading sequence end points" + print "executing: " + cmd + points_fh = Popen(cmd, stdout=PIPE, shell=True) + + length = points_fh.stdout.readline() + + length = int(length) + + return length + def load_heterodimer_dic(selected_mers): ''' Generate a heterodimer dict which contains every possible combination of @@ -277,6 +285,23 @@ def main(): Score Combinations For All Sizes ''' + global fg_genome_length + global bg_genome_length + global output_file + + if len(sys.argv) == 5: + selectivity_fn = sys.argv[1] + fg_fasta_fn = sys.argv[2] + bg_fasta_fn = sys.argv[3] + output_file = sys.argv[4] + else: + print "please specify your inputs" + print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file" + exit() + + fg_genome_length = get_length(fg_fasta_fn) + bg_genome_length = get_length(bg_fasta_fn) + selectivity_fh = open(selectivity_fn, "r") # load our mer list into python diff --git a/src/select_mers.py b/src/select_mers.py index ef019c0..5bd6877 100755 --- a/src/select_mers.py +++ b/src/select_mers.py @@ -5,8 +5,6 @@ import os fg_mers = {} bg_mers = {} -min_mer_count = int(os.environ.get("min_mer_count", 0)); - if(len(sys.argv) == 3): fg_count_fn = sys.argv[1] bg_count_fn = sys.argv[2] @@ -59,13 +57,6 @@ def main(): (mer, count) = line.split() mers[mer] = int(count) - if min_mer_count >= 1: - for mer in fg_mers.keys(): - if(fg_mers[mer] < min_mer_count): - del fg_mers[mer] - if mer in bg_mers: - del bg_mers[mer] - for mer in bg_mers.keys(): if mer not in fg_mers: del bg_mers[mer] |