From e8dfa85cd7e0428e53aac532c31f1ac1cc3cf1c1 Mon Sep 17 00:00:00 2001 From: Calvin Date: Thu, 7 Mar 2013 17:17:20 -0500 Subject: starting to modularize --- quikr.py | 73 +++++++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 31 deletions(-) (limited to 'quikr.py') diff --git a/quikr.py b/quikr.py index f83cdf2..19ff96c 100755 --- a/quikr.py +++ b/quikr.py @@ -9,47 +9,56 @@ from subprocess import * import argparse import platform import gzip +import itertools -def main(): +def generate_kmers(kmer): + """ This will return a list of kmers seperated by newlines """ + return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer)) - parser = argparse.ArgumentParser(description= - "Quikr returns the estimated frequencies of batcteria present when given a \ - input FASTA file. \n \ - A default trained matrix will be used if none is supplied \n \ - You must supply a kmer and default lambda if using a custom trained \ - matrix.") +def isCompressed(filename): + """ This function checks to see if the file is gzipped """ - parser.add_argument("-f", "--fasta", help="fasta file", required=True) - parser.add_argument("-o", "--output", help="output path (csv output)", required=True) - parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True) - parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000") - parser.add_argument("-k", "--kmer", type=int, required=True, - help="specifies which kmer to use, must be used with a custom trained database") + f = open(filename, "rb") + # The first two bytes of a gzipped file are always '1f 8b' + if f.read(2) == '\x1f\x8b': + f.close() + return True + else: + f.close() + return False + + +def quikr_train(input_file_location, kmer): + """ + Takes a input fasta file, and kmer, returns a custom trained matrix + """ - args = parser.parse_args() - - # our default lambda is 10,000 - lamb = 10000 + kmer_file_name = str(kmer) + "mers.txt" + if not os.path.isfile(kmer_file_name): + print "could not find kmer file" + exit() + + uname = platform.uname()[0] - # Make sure our input exist - if not os.path.isfile(args.fasta): - parser.error( "Input fasta file not found") + if uname == "Linux": + input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE) + elif uname == "Darwin": + input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name]) - if not os.path.isfile(args.trained_matrix): - parser.error("Custom trained matrix not found") - - # use alternative lambda - if args.lamb is not None: - lamb = args.lamb - - xstar = quikr_load_trained_matrix_from_file(args.fasta, args.trained_matrix, args.kmer, lamb) + # load and normalize the matrix by dividing each element by the sum of it's column. + # also do some fancy rotations so that it works properly with quikr + matrix = np.loadtxt(input_file.stdout) + + matrix = np.rot90(matrix) + matrix = matrix / matrix.sum(0) + matrix = np.flipud(matrix); + return matrix - np.savetxt(args.output, xstar, delimiter=",", fmt="%f") - return 0 def quikr_load_trained_matrix_from_file(input_fasta_location, trained_matrix_location, kmer, default_lambda): + """ This is a helper function to load our trained matrix and run quikr """ if qu.isCompressed(trained_matrix_location): trained_matrix_file = gzip.open(trained_matrix_location, "rb") @@ -60,7 +69,8 @@ def quikr_load_trained_matrix_from_file(input_fasta_location, trained_matrix_loc xstar = quikr(input_fasta_location, trained_matrix, kmer, default_lambda) return xstar - + + def quikr(input_fasta_location, trained_matrix, kmer, default_lambda): """ input_fasta is the input fasta file to find the estimated frequencies of @@ -74,6 +84,7 @@ def quikr(input_fasta_location, trained_matrix, kmer, default_lambda): In practice reconstruction is accurate only down to the genus level (not species or strain). + """ uname = platform.uname()[0] -- cgit v1.2.3