From 428cc8fc9c27de8a24e6bcfc426a11e7839cebbd Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Fri, 22 Aug 2014 17:37:49 -0400 Subject: revamp filter_average_binding.py --- SelectiveWholeGenomeAmplification | 2 +- src/filter_average_binding.py | 55 +++++++++++++++++++++++++++++---------- 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/SelectiveWholeGenomeAmplification b/SelectiveWholeGenomeAmplification index 8304387..b2b95ca 100755 --- a/SelectiveWholeGenomeAmplification +++ b/SelectiveWholeGenomeAmplification @@ -335,7 +335,7 @@ if [[ -n "$step_filters" ]] || [[ -n "$all" ]]; then check_non_empty "$ignore_all_mers_counts" "ignore all mers from file " echo " filtering mers that appear less frequently than the average binding site distance ($min_foreground_binding_average)" - filter_average_binding.py "$ignore_all_mers_counts" "$min_foreground_binding_average" < "$fg_counts" > "$average_binding" || exit 1 + filter_average_binding.py --fasta "$foreground" --minimum "$min_foreground_binding_average" --counts "$ignore_all_mers_counts" > "$average_binding" || exit 1 check_non_empty "$average_binding" "average binding" echo " filtering mers that are not in the melting range ($min_melting_temp-$max_melting_temp)" diff --git a/src/filter_average_binding.py b/src/filter_average_binding.py index e526be8..97088b2 100755 --- a/src/filter_average_binding.py +++ b/src/filter_average_binding.py @@ -1,13 +1,21 @@ #!/usr/bin/env python2.7 import sys +import argparse import os debug = os.environ.get("debug", False) -from subprocess import Popen -from subprocess import PIPE - def get_length(fn): + ''' + get the length of a fasta file by piping it through several unix + programs. + + 1) remove headers by grepping for any ">" at the start of a line + 2) delete all occurances of a new line, to join sequences together + 3) sum the number of characters. + ''' + from subprocess import Popen + from subprocess import PIPE cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c" @@ -19,17 +27,36 @@ def get_length(fn): return int(points_fh.stdout.readline()) -if len(sys.argv) < 2: - print "filter_average_binding.py foreground.fa binding_distance" - exit() - -foreground = sys.argv[1] -distance = int(sys.argv[2]) +def main(): + ''' + This filter removes mers where the count / (length of the genome) is below a + certain threshold as specified by -m + ''' + + parser = argparse.ArgumentParser(description="Filter mers where (k-mer count / length of the genome) < minimum") + parser.add_argument("-f", "--fasta", help="foreground fasta file", required=True ) + parser.add_argument("-c", "--counts", help="kmer counts of the foreground fasta file", required=True ) + parser.add_argument("-m", "--minimum", help="the minium average foreground binding distance", required=True, type=float) + + args = parser.parse_args() + + if not os.path.exists(args.fasta): + exit("foreground fasta file " + args.fasta + " not found.") + + if not os.path.exists(args.counts): + exit("count file " + args.counts + " not found.") + + + # get genome length + genome_length = float(get_length(args.fasta)) + + count_fh = open(args.counts, "rU") -genome_length = get_length(foreground) + for line in count_fh: + (_, count) = line.split() + if (genome_length / float(count)) < args.minimum: + sys.stdout.write(line) -for line in sys.stdin: - (mer, count) = line.split() - if (genome_length / int(count)) < distance: - sys.stdout.write(line) +if __name__ == "__main__": + sys.exit(main()) -- cgit v1.2.1