aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-08-22 17:37:49 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2014-08-22 17:37:49 -0400
commit428cc8fc9c27de8a24e6bcfc426a11e7839cebbd (patch)
tree16504e911597b083a30390682efe6094e3b7b235
parentd4ec5459d0fc141d20a4bbbf0a7dc40742e0372f (diff)
revamp filter_average_binding.py
-rwxr-xr-xSelectiveWholeGenomeAmplification2
-rwxr-xr-xsrc/filter_average_binding.py55
2 files changed, 42 insertions, 15 deletions
diff --git a/SelectiveWholeGenomeAmplification b/SelectiveWholeGenomeAmplification
index 8304387..b2b95ca 100755
--- a/SelectiveWholeGenomeAmplification
+++ b/SelectiveWholeGenomeAmplification
@@ -335,7 +335,7 @@ if [[ -n "$step_filters" ]] || [[ -n "$all" ]]; then
check_non_empty "$ignore_all_mers_counts" "ignore all mers from file "
echo " filtering mers that appear less frequently than the average binding site distance ($min_foreground_binding_average)"
- filter_average_binding.py "$ignore_all_mers_counts" "$min_foreground_binding_average" < "$fg_counts" > "$average_binding" || exit 1
+ filter_average_binding.py --fasta "$foreground" --minimum "$min_foreground_binding_average" --counts "$ignore_all_mers_counts" > "$average_binding" || exit 1
check_non_empty "$average_binding" "average binding"
echo " filtering mers that are not in the melting range ($min_melting_temp-$max_melting_temp)"
diff --git a/src/filter_average_binding.py b/src/filter_average_binding.py
index e526be8..97088b2 100755
--- a/src/filter_average_binding.py
+++ b/src/filter_average_binding.py
@@ -1,13 +1,21 @@
#!/usr/bin/env python2.7
import sys
+import argparse
import os
debug = os.environ.get("debug", False)
-from subprocess import Popen
-from subprocess import PIPE
-
def get_length(fn):
+ '''
+ get the length of a fasta file by piping it through several unix
+ programs.
+
+ 1) remove headers by grepping for any ">" at the start of a line
+ 2) delete all occurances of a new line, to join sequences together
+ 3) sum the number of characters.
+ '''
+ from subprocess import Popen
+ from subprocess import PIPE
cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c"
@@ -19,17 +27,36 @@ def get_length(fn):
return int(points_fh.stdout.readline())
-if len(sys.argv) < 2:
- print "filter_average_binding.py foreground.fa binding_distance"
- exit()
-
-foreground = sys.argv[1]
-distance = int(sys.argv[2])
+def main():
+ '''
+ This filter removes mers where the count / (length of the genome) is below a
+ certain threshold as specified by -m
+ '''
+
+ parser = argparse.ArgumentParser(description="Filter mers where (k-mer count / length of the genome) < minimum")
+ parser.add_argument("-f", "--fasta", help="foreground fasta file", required=True )
+ parser.add_argument("-c", "--counts", help="kmer counts of the foreground fasta file", required=True )
+ parser.add_argument("-m", "--minimum", help="the minium average foreground binding distance", required=True, type=float)
+
+ args = parser.parse_args()
+
+ if not os.path.exists(args.fasta):
+ exit("foreground fasta file " + args.fasta + " not found.")
+
+ if not os.path.exists(args.counts):
+ exit("count file " + args.counts + " not found.")
+
+
+ # get genome length
+ genome_length = float(get_length(args.fasta))
+
+ count_fh = open(args.counts, "rU")
-genome_length = get_length(foreground)
+ for line in count_fh:
+ (_, count) = line.split()
+ if (genome_length / float(count)) < args.minimum:
+ sys.stdout.write(line)
-for line in sys.stdin:
- (mer, count) = line.split()
- if (genome_length / int(count)) < distance:
- sys.stdout.write(line)
+if __name__ == "__main__":
+ sys.exit(main())