aboutsummaryrefslogtreecommitdiff
path: root/src/filter_average_binding.py
blob: 97088b2f04d02e6c9f8f3ca3feeb8ff0c6d526c3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python2.7
import sys
import argparse
import os

# Diagnostics are enabled by setting the "debug" environment variable to any
# non-empty value; when the variable is unset this is False.
debug = os.environ.get("debug", False)

def get_length(fn):
	'''
		Return the total sequence length of the fasta file `fn`.

		Header lines (those starting with ">") are skipped and newline
		characters are not counted; every other byte of the file adds one
		to the total. This is the same number the shell pipeline

			grep -v "^>" fn | tr -d '\\n' | wc -c

		would report, but the file is read directly, so paths containing
		spaces or shell metacharacters are safe and no child processes or
		pipes are left open.

		fn: path to the fasta file.

		Returns the length as an int (0 for an empty file).
		Raises IOError (OSError on Python 3) if the file cannot be opened.
	'''
	if debug:
		# Diagnostics go to stderr so they never mix with data on stdout.
		sys.stderr.write("computing sequence length of " + fn + "\n")

	length = 0

	# Binary mode: count raw bytes like `wc -c`, with no newline translation.
	with open(fn, "rb") as fasta_fh:
		for line in fasta_fh:
			if line.startswith(b">"):
				continue
			# Only "\n" is dropped, as `tr -d '\n'` did; a "\r" from a
			# CRLF-terminated file is still counted.
			length += len(line) - line.count(b"\n")

	return length

def main():
	'''
	Filter a k-mer count file by average foreground binding distance.

	The average binding distance of a mer is
	(length of the genome) / (k-mer count). Lines of the count file whose
	distance is strictly below the threshold given by -m are written
	unchanged to stdout; all other lines are dropped.

	Each non-blank line of the count file must hold exactly two
	whitespace-separated fields, the second being the count; any other
	shape raises ValueError. Blank lines are ignored, and mers with a count
	of zero are dropped because they have no finite binding distance.

	Exits with an error message if either input file does not exist.
	'''

	parser = argparse.ArgumentParser(description="Keep mers where (length of the genome / k-mer count) < threshold")
	parser.add_argument("-f", "--fasta", help="foreground fasta file", required=True )
	parser.add_argument("-c", "--counts", help="kmer counts of the foreground fasta file", required=True )
	parser.add_argument("-m", "--minimum", help="average foreground binding distance threshold; mers at or above it are removed", required=True, type=float)

	args = parser.parse_args()

	# sys.exit with a string prints it to stderr and exits with status 1.
	if not os.path.exists(args.fasta):
		sys.exit("foreground fasta file " + args.fasta + " not found.")

	if not os.path.exists(args.counts):
		sys.exit("count file " + args.counts + " not found.")

	# get genome length
	genome_length = float(get_length(args.fasta))

	# "rU" keeps universal-newline handling under Python 2.7, which this
	# script targets; the with-block closes the file even on error.
	with open(args.counts, "rU") as count_fh:
		for line in count_fh:
			# Tolerate blank lines (e.g. a trailing empty line).
			if not line.strip():
				continue

			(_, count) = line.split()
			count = float(count)

			# A mer that never occurs has no finite binding distance, so
			# it can never be below the threshold.
			if count == 0:
				continue

			if (genome_length / count) < args.minimum:
				sys.stdout.write(line)

# Run the filter only when this file is executed as a script, and hand
# main()'s return value to the interpreter as the process exit status.
if __name__ == "__main__":
	exit_status = main()
	sys.exit(exit_status)