aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile6
-rw-r--r--README.md2
-rwxr-xr-xSelectiveGenomeAmplification20
-rwxr-xr-xSelectiveGenomeAmplificationUI6
-rw-r--r--src/filter_average_binding.py35
-rw-r--r--src/filter_melting_temperature.py (renamed from src/below_melting_temperature.py)1
-rwxr-xr-xsrc/score_mers.py53
-rwxr-xr-xsrc/select_mers.py9
8 files changed, 95 insertions, 37 deletions
diff --git a/Makefile b/Makefile
index 7d5b400..1f72fa1 100644
--- a/Makefile
+++ b/Makefile
@@ -20,15 +20,19 @@ clean:
rm -vf bin/* -Rv
install: all
+ # c tools
install -c bin/strstream $(DEST)
install -c bin/filter_melting_range $(DEST)
install -c bin/strstreamone $(DEST)
install -c bin/sequence_end_points $(DEST)
+ # bash scripts
install -c SelectiveGenomeAmplification $(DEST)
install -c SelectiveGenomeAmplificationUI $(DEST)
+ # python scripts
install -c src/select_mers.py $(DEST)
install -c src/score_mers.py $(DEST)
install -c src/score_wrapper.sh $(DEST)
- install -c src/below_melting_temperature.py $(DEST)
+ install -c src/filter_melting_temperature.py $(DEST)
install -c src/filter_max_consecutive_binding.py $(DEST)
+ install -c src/filter_average_binding.py $(DEST)
diff --git a/README.md b/README.md
index f7477eb..5163087 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ Y | counts\_directory | $output\_directory/.tmp | directory for counts directory
Y | tmp\_directory=$output\_directory/.tmp | temporary files directory
Y | max\_melting\_temp | 30° | maximum melting temp of mers
Y | min\_melting\_temp | 0° | minimum melting temp of mers
-Y | min\_mer\_count | Not Enabled (0) | only select mers that occur more frequently than this number
+Y | min\_foreground\_binding\_average | 50000 | elminate mers that appear less frequently than the average (length of foreground / # of occurances)
Y | max\_select | 15 | maximum number of mers to pick
Y | max\_check | 35 | maximum number of mers to select (check the top #)
Y | ignore\_mers | Not Enabled | mers to explicitly ignore, space seperated ex. ignore\_mers="ACAGTA ACCATAA ATATATAT"
diff --git a/SelectiveGenomeAmplification b/SelectiveGenomeAmplification
index b3bde4d..b66d06c 100755
--- a/SelectiveGenomeAmplification
+++ b/SelectiveGenomeAmplification
@@ -41,8 +41,8 @@ fi
: ${max_melting_temp=30}
: ${min_melting_temp=0}
-# minimum mer count
-: ${min_mer_count=0}
+# minimum average binding distance in the foreground
+: ${min_foreground_binding_average=50000}
# maximum mers to pick
: ${max_select=15}
@@ -62,7 +62,7 @@ export max_mer_range
export max_select
-export min_mer_count
+export min_foreground_binding_average
export max_mer_distance
export max_melting_temp
@@ -130,17 +130,21 @@ for var in ignore_mers min_mer_range max_check cpus max_consecutive_binding max_
echo $var "${!var}" >> $output_directory/$current_run/parameters
done;
-non_melting=$output_directory/$current_run/`basename $foreground`-counts-non-melting-$min_melting_temp-$max_melting_temp
-filtered_binding=$output_directory/$current_run/`basename $foreground`-counts-filtered-binding-$min_melting_temp-$max_melting_temp
+average_binding=$output_directory/$current_run/`basename $foreground`-counts-average-binding
+consecutive_binding=$output_directory/$current_run/`basename $foreground`-counts-consecutive-binding
+non_melting=$output_directory/$current_run/`basename $foreground`-counts-non-melting
+
+echo "checking if mers appear at least as often in the fg as the average binding site or more $min_foreground_binding_average"
+cat $fg_counts | filter_average_binding.py $foreground $min_foreground_binding_average > $average_binding || exit 1
echo "checking if mers are within the melting range $min_melting_temp $max_melting_temp"
-cat $fg_counts | below_melting_temperature.py $min_melting_temp $max_melting_temp > $non_melting || exit 1
+cat $average_binding | filter_melting_temperature.py $min_melting_temp $max_melting_temp > $non_melting || exit 1
echo "filtering out elements that have more consecutive binding mers than allowed by \$max_consecutive_binding $max_consecutive_binding"
-cat $non_melting | filter_max_consecutive_binding.py $max_consecutive_binding > $filtered_binding || exit 1
+cat $non_melting | filter_max_consecutive_binding.py $max_consecutive_binding > $consecutive_binding || exit 1
echo "scoring mer selectivity"
-select_mers.py $non_melting $bg_counts > $selected || exit 1
+select_mers.py $consecutive_binding $bg_counts > $selected || exit 1
echo "scoring top mers based on selectivity"
score_wrapper.sh $selected $foreground $background $output_directory/$current_run/scores-output || exit 1
diff --git a/SelectiveGenomeAmplificationUI b/SelectiveGenomeAmplificationUI
index 9c5e1c6..82d3f9e 100755
--- a/SelectiveGenomeAmplificationUI
+++ b/SelectiveGenomeAmplificationUI
@@ -27,9 +27,9 @@ questions = [
'default_str': '6',
'variable': 'min_mer_range' },
- { 'question': 'minimum mer count you want to have for your selected mer?',
- 'default_str': 'None',
- 'variable': 'min_mer_count' },
+ { 'question': 'eliminate mers that appear less frequently on average than this number ?',
+ 'default_str': '50000',
+ 'variable': 'min_foreground_binding_average' },
{ 'question': 'maximum size of mer combinations you want to search and select?',
'default_str': '15',
diff --git a/src/filter_average_binding.py b/src/filter_average_binding.py
new file mode 100644
index 0000000..d250c98
--- /dev/null
+++ b/src/filter_average_binding.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+import sys
+import os
+
+debug = os.environ.get("debug", False)
+
+from subprocess import Popen
+from subprocess import PIPE
+
+def get_length(fn):
+
+ cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c"
+
+ if debug:
+ print "loading sequence end points"
+ print "executing: " + cmd
+
+ points_fh = Popen(cmd, stdout=PIPE, shell=True)
+
+ return int(points_fh.stdout.readline())
+
+if len(sys.argv) < 2:
+ print "filter_average_binding.py foreground.fa binding_distance"
+ exit()
+
+foreground = sys.argv[1]
+distance = int(sys.argv[2])
+
+genome_length = get_length(foreground)
+
+for line in sys.stdin:
+ (mer, count) = line.split()
+ if (genome_length / int(count)) < distance:
+ sys.stdout.write(line)
+
diff --git a/src/below_melting_temperature.py b/src/filter_melting_temperature.py
index e91e777..9e5477b 100644
--- a/src/below_melting_temperature.py
+++ b/src/filter_melting_temperature.py
@@ -23,7 +23,6 @@ min_melting_temp = float(sys.argv[1])
max_melting_temp = float(sys.argv[2])
-output = []
for line in sys.stdin:
if min_melting_temp < Tm_staluc(line.split("\t")[0]) < max_melting_temp:
sys.stdout.write(line)
diff --git a/src/score_mers.py b/src/score_mers.py
index 4679c45..db7f838 100755
--- a/src/score_mers.py
+++ b/src/score_mers.py
@@ -18,22 +18,14 @@ bg_mers = {}
seq_ends = []
-if len(sys.argv) == 5:
- selectivity_fn = sys.argv[1]
- fg_fasta_fn = sys.argv[2]
- bg_fasta_fn = sys.argv[3]
- output_file = sys.argv[4]
-
- fg_genome_length = os.path.getsize(fg_fasta_fn)
- bg_genome_length = os.path.getsize(bg_fasta_fn)
-else:
- print "please specify your inputs"
- print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file"
- exit()
+fg_genome_length = 0
+bg_genome_length = 0
+
+output_file = ""
# import our variables
cpus = int(os.environ.get("cpus", cpu_count()))
-debug = int(os.environ.get("debug", False))
+debug = os.environ.get("debug", False)
min_mer_range = int(os.environ.get("min_mer_range", 6))
max_mer_range = int(os.environ.get("max_mer_range", 12))
min_mer_count = int(os.environ.get("min_mer_count", 0))
@@ -42,7 +34,6 @@ max_check = int(os.environ.get("max_check", 35))
max_mer_distance = int(os.environ.get("max_mer_distance", 5000))
max_consecutive_binding = int(os.environ.get("max_consecutive_binding", 4))
-
def get_max_consecutive_binding(mer1, mer2):
'''
Return the maximum number of consecutively binding mers
@@ -237,6 +228,7 @@ def score(combination):
return [combination, mer_score, fg_mean_dist, fg_std_dist, bg_ratio]
def load_end_points(fn):
+ ''' get all the points of the end of each sequence in a sample '''
end_points = [0]
@@ -253,6 +245,22 @@ def load_end_points(fn):
return end_points
+def get_length(fn):
+ ''' get length of a genome ( number of base pairs )'''
+
+ cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c"
+
+ if debug:
+ print "loading sequence end points"
+ print "executing: " + cmd
+ points_fh = Popen(cmd, stdout=PIPE, shell=True)
+
+ length = points_fh.stdout.readline()
+
+ length = int(length)
+
+ return length
+
def load_heterodimer_dic(selected_mers):
'''
Generate a heterodimer dict which contains every possible combination of
@@ -277,6 +285,23 @@ def main():
Score Combinations For All Sizes
'''
+ global fg_genome_length
+ global bg_genome_length
+ global output_file
+
+ if len(sys.argv) == 5:
+ selectivity_fn = sys.argv[1]
+ fg_fasta_fn = sys.argv[2]
+ bg_fasta_fn = sys.argv[3]
+ output_file = sys.argv[4]
+ else:
+ print "please specify your inputs"
+ print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file"
+ exit()
+
+ fg_genome_length = get_length(fg_fasta_fn)
+ bg_genome_length = get_length(bg_fasta_fn)
+
selectivity_fh = open(selectivity_fn, "r")
# load our mer list into python
diff --git a/src/select_mers.py b/src/select_mers.py
index ef019c0..5bd6877 100755
--- a/src/select_mers.py
+++ b/src/select_mers.py
@@ -5,8 +5,6 @@ import os
fg_mers = {}
bg_mers = {}
-min_mer_count = int(os.environ.get("min_mer_count", 0));
-
if(len(sys.argv) == 3):
fg_count_fn = sys.argv[1]
bg_count_fn = sys.argv[2]
@@ -59,13 +57,6 @@ def main():
(mer, count) = line.split()
mers[mer] = int(count)
- if min_mer_count >= 1:
- for mer in fg_mers.keys():
- if(fg_mers[mer] < min_mer_count):
- del fg_mers[mer]
- if mer in bg_mers:
- del bg_mers[mer]
-
for mer in bg_mers.keys():
if mer not in fg_mers:
del bg_mers[mer]