From 428cc8fc9c27de8a24e6bcfc426a11e7839cebbd Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Fri, 22 Aug 2014 17:37:49 -0400 Subject: revamp filter_average_binding.py --- src/filter_average_binding.py | 55 ++++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 14 deletions(-) (limited to 'src') diff --git a/src/filter_average_binding.py b/src/filter_average_binding.py index e526be8..97088b2 100755 --- a/src/filter_average_binding.py +++ b/src/filter_average_binding.py @@ -1,13 +1,21 @@ #!/usr/bin/env python2.7 import sys +import argparse import os debug = os.environ.get("debug", False) -from subprocess import Popen -from subprocess import PIPE - def get_length(fn): + ''' + get the length of a fasta file by piping it through several unix + programs. + + 1) remove headers by grepping for any ">" at the start of a line + 2) delete all occurances of a new line, to join sequences together + 3) sum the number of characters. + ''' + from subprocess import Popen + from subprocess import PIPE cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c" @@ -19,17 +27,36 @@ def get_length(fn): return int(points_fh.stdout.readline()) -if len(sys.argv) < 2: - print "filter_average_binding.py foreground.fa binding_distance" - exit() - -foreground = sys.argv[1] -distance = int(sys.argv[2]) +def main(): + ''' + This filter removes mers where the count / (length of the genome) is below a + certain threshold as specified by -m + ''' + + parser = argparse.ArgumentParser(description="Filter mers where (k-mer count / length of the genome) < minimum") + parser.add_argument("-f", "--fasta", help="foreground fasta file", required=True ) + parser.add_argument("-c", "--counts", help="kmer counts of the foreground fasta file", required=True ) + parser.add_argument("-m", "--minimum", help="the minium average foreground binding distance", required=True, type=float) + + args = parser.parse_args() + + if not os.path.exists(args.fasta): + exit("foreground fasta file " + args.fasta + " not found.") + + if not os.path.exists(args.counts): + exit("count file " + args.counts + " not found.") + + + # get genome length + genome_length = float(get_length(args.fasta)) + + count_fh = open(args.counts, "rU") -genome_length = get_length(foreground) + for line in count_fh: + (_, count) = line.split() + if (genome_length / float(count)) < args.minimum: + sys.stdout.write(line) -for line in sys.stdin: - (mer, count) = line.split() - if (genome_length / int(count)) < distance: - sys.stdout.write(line) +if __name__ == "__main__": + sys.exit(main()) -- cgit v1.2.3