diff options
author | Calvin <calvin@EESI> | 2013-03-07 17:17:20 -0500 |
---|---|---|
committer | Calvin <calvin@EESI> | 2013-03-07 17:17:20 -0500 |
commit | e8dfa85cd7e0428e53aac532c31f1ac1cc3cf1c1 (patch) | |
tree | e7663ea51680b07dd4a42f4132b746e9437aeb10 | |
parent | 7df3bd86f20197aad9e82f7ee89b7c0c8938dc15 (diff) |
starting to modularize
-rwxr-xr-x | generate_kmers.py | 4 | ||||
-rwxr-xr-x | quikr | 45 | ||||
-rwxr-xr-x | quikr.py | 73 | ||||
-rwxr-xr-x | quikr_train (renamed from quikr_train.py) | 31 | ||||
-rw-r--r-- | quikr_util.py | 13 |
5 files changed, 91 insertions, 75 deletions
diff --git a/generate_kmers.py b/generate_kmers.py index 9a338b7..38fdf0a 100755 --- a/generate_kmers.py +++ b/generate_kmers.py @@ -1,5 +1,5 @@ #!/usr/bin/python -import itertools +import quikr import sys -print '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=int(sys.argv[1]))) +print quikr.generate_kmers(int(sys.argv[1])) @@ -0,0 +1,45 @@ +#!/usr/bin/python +import sys +import argparse +import quikr + +def main(): + + parser = argparse.ArgumentParser(description= + "Quikr returns the estimated frequencies of batcteria present when given a \ + input FASTA file. \n \ + A default trained matrix will be used if none is supplied \n \ + You must supply a kmer and default lambda if using a custom trained \ + matrix.") + + parser.add_argument("-f", "--fasta", help="fasta file", required=True) + parser.add_argument("-o", "--output", help="output path (csv output)", required=True) + parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True) + parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000") + parser.add_argument("-k", "--kmer", type=int, required=True, + help="specifies which kmer to use, must be used with a custom trained database") + + + args = parser.parse_args() + + # our default lambda is 10,000 + lamb = 10000 + + # Make sure our input exist + if not os.path.isfile(args.fasta): + parser.error( "Input fasta file not found") + + if not os.path.isfile(args.trained_matrix): + parser.error("Custom trained matrix not found") + + # use alternative lambda + if args.lamb is not None: + lamb = args.lamb + + xstar = quikr_load_trained_matrix_from_file(args.fasta, args.trained_matrix, args.kmer, lamb) + + np.savetxt(args.output, xstar, delimiter=",", fmt="%f") + return 0 + +if __name__ == "__main__": + sys.exit(main()) @@ -9,47 +9,56 @@ from subprocess import * import argparse import platform import gzip +import itertools -def main(): +def generate_kmers(kmer): + """ This will return a list of kmers seperated by newlines """ + return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer)) - parser = argparse.ArgumentParser(description= - "Quikr returns the estimated frequencies of batcteria present when given a \ - input FASTA file. \n \ - A default trained matrix will be used if none is supplied \n \ - You must supply a kmer and default lambda if using a custom trained \ - matrix.") +def isCompressed(filename): + """ This function checks to see if the file is gzipped """ - parser.add_argument("-f", "--fasta", help="fasta file", required=True) - parser.add_argument("-o", "--output", help="output path (csv output)", required=True) - parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True) - parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000") - parser.add_argument("-k", "--kmer", type=int, required=True, - help="specifies which kmer to use, must be used with a custom trained database") + f = open(filename, "rb") + # The first two bytes of a gzipped file are always '1f 8b' + if f.read(2) == '\x1f\x8b': + f.close() + return True + else: + f.close() + return False + + +def quikr_train(input_file_location, kmer): + """ + Takes a input fasta file, and kmer, returns a custom trained matrix + """ - args = parser.parse_args() - - # our default lambda is 10,000 - lamb = 10000 + kmer_file_name = str(kmer) + "mers.txt" + if not os.path.isfile(kmer_file_name): + print "could not find kmer file" + exit() + + uname = platform.uname()[0] - # Make sure our input exist - if not os.path.isfile(args.fasta): - parser.error( "Input fasta file not found") + if uname == "Linux": + input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE) + elif uname == "Darwin": + input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name]) - if not os.path.isfile(args.trained_matrix): - parser.error("Custom trained matrix not found") - - # use alternative lambda - if args.lamb is not None: - lamb = args.lamb - - xstar = quikr_load_trained_matrix_from_file(args.fasta, args.trained_matrix, args.kmer, lamb) + # load and normalize the matrix by dividing each element by the sum of it's column. + # also do some fancy rotations so that it works properly with quikr + matrix = np.loadtxt(input_file.stdout) + + matrix = np.rot90(matrix) + matrix = matrix / matrix.sum(0) + matrix = np.flipud(matrix); + return matrix - np.savetxt(args.output, xstar, delimiter=",", fmt="%f") - return 0 def quikr_load_trained_matrix_from_file(input_fasta_location, trained_matrix_location, kmer, default_lambda): + """ This is a helper function to load our trained matrix and run quikr """ if qu.isCompressed(trained_matrix_location): trained_matrix_file = gzip.open(trained_matrix_location, "rb") @@ -60,7 +69,8 @@ def quikr_load_trained_matrix_from_file(input_fasta_location, trained_matrix_loc xstar = quikr(input_fasta_location, trained_matrix, kmer, default_lambda) return xstar - + + def quikr(input_fasta_location, trained_matrix, kmer, default_lambda): """ input_fasta is the input fasta file to find the estimated frequencies of @@ -74,6 +84,7 @@ def quikr(input_fasta_location, trained_matrix, kmer, default_lambda): In practice reconstruction is accurate only down to the genus level (not species or strain). + """ uname = platform.uname()[0] diff --git a/quikr_train.py b/quikr_train index a427436..9e9caa7 100755 --- a/quikr_train.py +++ b/quikr_train @@ -1,5 +1,6 @@ #!/usr/bin/python import numpy as np +import quikr import os import sys import gzip @@ -31,7 +32,7 @@ def main(): parser.error( "Input database not found") # call the quikr train function, save the output with np.save - matrix = quikr_train(args.input, args.kmer) + matrix = quikr.quikr_train(args.input, args.kmer) if args.compress: output_file = gzip.open(args.output, "wb") @@ -41,34 +42,6 @@ def main(): np.save(output_file, matrix) return 0 - -def quikr_train(input_file_location, kmer): - """ - Takes a input fasta file, and kmer, returns a custom trained matrix - """ - - kmer_file_name = str(kmer) + "mers.txt" - - if not os.path.isfile(kmer_file_name): - print "could not find kmer file" - exit() - - - uname = platform.uname()[0] - - if uname == "Linux": - input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE) - elif uname == "Darwin": - input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name]) - - # load and normalize the matrix by dividing each element by the sum of it's column. - # also do some fancy rotations so that it works properly with quikr - matrix = np.loadtxt(input_file.stdout) - - matrix = np.rot90(matrix) - matrix = matrix / matrix.sum(0) - matrix = np.flipud(matrix); - return matrix if __name__ == "__main__": sys.exit(main()) diff --git a/quikr_util.py b/quikr_util.py deleted file mode 100644 index b2fb968..0000000 --- a/quikr_util.py +++ /dev/null @@ -1,13 +0,0 @@ -# Helper Functions - -# Check if file is gzip compressed -def isCompressed(filename): - f = open(filename, "rb") - - # The first two bytes of a tar file are always '1f 8b' - if f.read(2) == '\x1f\x8b': - f.close() - return True - else: - f.close() - return False |