#!/usr/bin/env python2.7
import sys
import argparse
import os

debug = os.environ.get("debug", False)

def get_length(fn):
	'''
		get the length of a fasta file by piping it through several unix 
		programs.

		1) remove headers by grepping for any ">" at the start of a line
		2) delete all occurances of a new line, to join sequences together
		3) sum the number of characters.
	'''
	from subprocess import Popen
	from subprocess import PIPE

	cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c"

	if debug:
		print "loading sequence end points"
		print "executing: " + cmd

	points_fh = Popen(cmd, stdout=PIPE, shell=True)

	return int(points_fh.stdout.readline())

def main(): 
	'''
	This filter removes mers where the count / (length of the genome) is below a
	certain threshold as specified by -m
	'''

	parser = argparse.ArgumentParser(description="Filter mers where (k-mer count / length of the genome) < minimum") 
	parser.add_argument("-f", "--fasta", help="foreground fasta file", required=True )
	parser.add_argument("-c", "--counts", help="kmer counts of the foreground fasta file", required=True )
	parser.add_argument("-m", "--minimum", help="the minium average foreground binding distance", required=True, type=float)

	args = parser.parse_args()

	if not os.path.exists(args.fasta):
		exit("foreground fasta file " + args.fasta + " not found.")

	if not os.path.exists(args.counts):
		exit("count file " + args.counts + " not found.")
			

	# get genome length
	genome_length = float(get_length(args.fasta))

	count_fh = open(args.counts, "rU")

	for line in count_fh:
		(_, count) = line.split()
		if (genome_length / float(count)) < args.minimum:
			sys.stdout.write(line)

if __name__ == "__main__":
	sys.exit(main())