aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-03-25 16:33:36 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2014-03-25 16:33:36 -0400
commit06fa848b90982ddcd4308bb88d70a0d5f11f785b (patch)
tree472f8e8675a3b01060159bcf24cd1b2c35fb5d9e /src
parent55d58f92e388bbed44963565c5073c444ffa60b2 (diff)
add average binding filter
Diffstat (limited to 'src')
-rw-r--r--src/filter_average_binding.py35
-rw-r--r--src/filter_melting_temperature.py (renamed from src/below_melting_temperature.py)1
-rwxr-xr-xsrc/score_mers.py53
-rwxr-xr-xsrc/select_mers.py9
4 files changed, 74 insertions, 24 deletions
diff --git a/src/filter_average_binding.py b/src/filter_average_binding.py
new file mode 100644
index 0000000..d250c98
--- /dev/null
+++ b/src/filter_average_binding.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+import sys
+import os
+
+debug = os.environ.get("debug", False)
+
+from subprocess import Popen
+from subprocess import PIPE
+
+def get_length(fn):
+
+ cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c"
+
+ if debug:
+ print "loading sequence end points"
+ print "executing: " + cmd
+
+ points_fh = Popen(cmd, stdout=PIPE, shell=True)
+
+ return int(points_fh.stdout.readline())
+
+if len(sys.argv) < 2:
+ print "filter_average_binding.py foreground.fa binding_distance"
+ exit()
+
+foreground = sys.argv[1]
+distance = int(sys.argv[2])
+
+genome_length = get_length(foreground)
+
+for line in sys.stdin:
+ (mer, count) = line.split()
+ if (genome_length / int(count)) < distance:
+ sys.stdout.write(line)
+
diff --git a/src/below_melting_temperature.py b/src/filter_melting_temperature.py
index e91e777..9e5477b 100644
--- a/src/below_melting_temperature.py
+++ b/src/filter_melting_temperature.py
@@ -23,7 +23,6 @@ min_melting_temp = float(sys.argv[1])
max_melting_temp = float(sys.argv[2])
-output = []
for line in sys.stdin:
if min_melting_temp < Tm_staluc(line.split("\t")[0]) < max_melting_temp:
sys.stdout.write(line)
diff --git a/src/score_mers.py b/src/score_mers.py
index 4679c45..db7f838 100755
--- a/src/score_mers.py
+++ b/src/score_mers.py
@@ -18,22 +18,14 @@ bg_mers = {}
seq_ends = []
-if len(sys.argv) == 5:
- selectivity_fn = sys.argv[1]
- fg_fasta_fn = sys.argv[2]
- bg_fasta_fn = sys.argv[3]
- output_file = sys.argv[4]
-
- fg_genome_length = os.path.getsize(fg_fasta_fn)
- bg_genome_length = os.path.getsize(bg_fasta_fn)
-else:
- print "please specify your inputs"
- print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file"
- exit()
+fg_genome_length = 0
+bg_genome_length = 0
+
+output_file = ""
# import our variables
cpus = int(os.environ.get("cpus", cpu_count()))
-debug = int(os.environ.get("debug", False))
+debug = os.environ.get("debug", False)
min_mer_range = int(os.environ.get("min_mer_range", 6))
max_mer_range = int(os.environ.get("max_mer_range", 12))
min_mer_count = int(os.environ.get("min_mer_count", 0))
@@ -42,7 +34,6 @@ max_check = int(os.environ.get("max_check", 35))
max_mer_distance = int(os.environ.get("max_mer_distance", 5000))
max_consecutive_binding = int(os.environ.get("max_consecutive_binding", 4))
-
def get_max_consecutive_binding(mer1, mer2):
'''
Return the maximum number of consecutively binding mers
@@ -237,6 +228,7 @@ def score(combination):
return [combination, mer_score, fg_mean_dist, fg_std_dist, bg_ratio]
def load_end_points(fn):
+ ''' get all the points of the end of each sequence in a sample '''
end_points = [0]
@@ -253,6 +245,22 @@ def load_end_points(fn):
return end_points
+def get_length(fn):
+ ''' get length of a genome ( number of base pairs )'''
+
+ cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c"
+
+ if debug:
+ print "loading sequence end points"
+ print "executing: " + cmd
+ points_fh = Popen(cmd, stdout=PIPE, shell=True)
+
+ length = points_fh.stdout.readline()
+
+ length = int(length)
+
+ return length
+
def load_heterodimer_dic(selected_mers):
'''
Generate a heterodimer dict which contains every possible combination of
@@ -277,6 +285,23 @@ def main():
Score Combinations For All Sizes
'''
+ global fg_genome_length
+ global bg_genome_length
+ global output_file
+
+ if len(sys.argv) == 5:
+ selectivity_fn = sys.argv[1]
+ fg_fasta_fn = sys.argv[2]
+ bg_fasta_fn = sys.argv[3]
+ output_file = sys.argv[4]
+ else:
+ print "please specify your inputs"
+ print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file"
+ exit()
+
+ fg_genome_length = get_length(fg_fasta_fn)
+ bg_genome_length = get_length(bg_fasta_fn)
+
selectivity_fh = open(selectivity_fn, "r")
# load our mer list into python
diff --git a/src/select_mers.py b/src/select_mers.py
index ef019c0..5bd6877 100755
--- a/src/select_mers.py
+++ b/src/select_mers.py
@@ -5,8 +5,6 @@ import os
fg_mers = {}
bg_mers = {}
-min_mer_count = int(os.environ.get("min_mer_count", 0));
-
if(len(sys.argv) == 3):
fg_count_fn = sys.argv[1]
bg_count_fn = sys.argv[2]
@@ -59,13 +57,6 @@ def main():
(mer, count) = line.split()
mers[mer] = int(count)
- if min_mer_count >= 1:
- for mer in fg_mers.keys():
- if(fg_mers[mer] < min_mer_count):
- del fg_mers[mer]
- if mer in bg_mers:
- del bg_mers[mer]
-
for mer in bg_mers.keys():
if mer not in fg_mers:
del bg_mers[mer]