#!/usr/bin/env python
import sys
import os
import argparse

mers = {} 
debug = os.environ.get("debug", False)

from subprocess import Popen
from subprocess import PIPE


def get_sequence(pt):
	
	for it, seq in enumerate(seq_ends, start=1):
		if pt <= seq:
			return it

def load_end_points(fn):
	''' get all the points of the end of each sequence in a sample '''

	end_points = []

	cmd = "sequence_end_points < " + fn

	if debug:
		print "loading sequence end points"
		print "executing: " + cmd

	points_fh = Popen(cmd, stdout=PIPE, shell=True)

	for line in points_fh.stdout:
		end_points.append(int(line))
	

	if points_fh.wait() is not 0:
		print "executing", cmd, "failed"

	return end_points

def get_length(fn):
	''' get length of a genome ( number of base pairs )'''

	cmd = 'grep "^>" ' + fn + " -v | tr -d '\\n' | wc -c"

	if debug:
		print "loading sequence end points"
		print "executing: " + cmd
	grep = Popen(cmd, stdout=PIPE, shell=True)

	length = grep.stdout.readline()

	length = int(length)

	if grep.wait() is not 0:
		print "executing", cmd, "failed"


	return length

def populate_locations(selected_mers, mer_dic, input_fn, length):
	''' 
	Run the strstreamone command, and parse in the integers that are output
	by the command, and add it to mers[mer] 

	strstreamone just prints the location of a string argv[1] in stdout.

	We also do the reverse compliment, using tac and tr piped together.
	'''
	import tempfile


	cmds = []
	# strip file of header and delete newlines
	cmds.append(["grep -v '^>' " + input_fn  +  " | tr -d '\\n' | strstream ", False, 5])
	# reverse file, strip and delete newlines
	cmds.append(["tac " + input_fn + \
							"| rev " \
							"| grep -v '>$' " \
							"| tr -d '\\n' " \
							"| tr [ACGT] [TGCA] | strstream ", True, 3])
	
	for (cmd, reverse, strand) in cmds:
		if(debug):
			print(cmd)
		_, merlist_fn = tempfile.mkstemp()

		# write our mers out to a fifi
		merlist_fh = open(merlist_fn, 'w')
		for mer in selected_mers:
			merlist_fh.write(mer + '\n')

		merlist_fh.flush()
		# add our merlist fn to our command
		cmd = cmd + " " + merlist_fn

		strstream = Popen(cmd, stdout=PIPE, shell=True)
		if reverse:
			for line in strstream.stdout:
				(mer, pos) = line.split(" ")
				mer_dic[selected_mers[int(mer)]].append([pos, strand])
		else:
			for line in strstream.stdout:
				(mer, pos) = line.split(" ")
				mer_dic[selected_mers[int(mer)]].append([int(pos), strand])

		if strstream.wait() is not 0:
			print "executing", cmd, "failed"

		merlist_fh.close()


def main():

	parser = argparse.ArgumentParser(description="take a top-scores file, and \
		create a set of files that contain the positions (and other information) in \
		the foreground fasta file of each mer in that combination. ") 
	parser.add_argument("-f", "--fasta", help="input fasta file", required=True)
	parser.add_argument("-s", "--scores", help="input scores file", required=True)
	parser.add_argument("-o", "--output_directory", help="directory to output sets to.", required=True)
	parser.add_argument("-n", "--number", help="output the first n sets", required=False, type=int, default=20)

	args = parser.parse_args()

	if os.path.isfile(args.output_directory):
		parser.error(args.output_directory + "must point to a directory")
	elif not os.path.isdir(args.output_directory):
		os.mkdir(args.output_directory)
	score_fh = open(args.scores, "r")

	global seq_ends
	seq_ends = load_end_points(args.fasta)

	length = get_length(args.fasta)

	nb_done = 0;
	for line in score_fh:
		# skip headers
		if line.startswith("#"):
			continue

		# get mers
		combination = line.split('\t')[1].split()
		
		# open output
		fh = open(args.output_directory + "/" + '-'.join(combination), 'w')

		# populate only un-populated mers
		new_populate = []
		for mer in combination:
			if mer not in mers:
				mers[mer] = []
				new_populate.append(mer)
		
		if len(new_populate) is not 0:
			populate_locations(new_populate, mers, args.fasta, length)

		pts = []
		for mer in combination:
			for pt in mers[mer]:
				pts.append([pt[0], pt[1], mer, get_sequence(pt[0])])
		
		pts = sorted(pts, key = lambda row: row[0])
			
		fh.write("pt\tstrand\tmer\tsequence\n")
		for pt in pts:
			fh.write('\t'.join(str(x) for x in pt) + '\n')

		nb_done += 1 
		if nb_done == args.number:
			exit()

if __name__ == "__main__":
	sys.exit(main())