diff options
| -rwxr-xr-x | generate_kmers.py | 4 | ||||
| -rwxr-xr-x | quikr | 45 | ||||
| -rwxr-xr-x | quikr.py | 73 | ||||
| -rwxr-xr-x | quikr_train (renamed from quikr_train.py) | 31 | ||||
| -rw-r--r-- | quikr_util.py | 13 | 
5 files changed, 91 insertions, 75 deletions
diff --git a/generate_kmers.py b/generate_kmers.py index 9a338b7..38fdf0a 100755 --- a/generate_kmers.py +++ b/generate_kmers.py @@ -1,5 +1,5 @@  #!/usr/bin/python -import itertools +import quikr  import sys -print '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=int(sys.argv[1]))) +print quikr.generate_kmers(int(sys.argv[1])) @@ -0,0 +1,45 @@ +#!/usr/bin/python +import sys +import argparse +import quikr + +def main(): + +    parser = argparse.ArgumentParser(description= +    "Quikr returns the estimated frequencies of batcteria present when given a \ +    input FASTA file. \n \ +    A default trained matrix will be used if none is supplied \n \ +    You must supply a kmer and default lambda if using a custom trained \ +    matrix.") + +    parser.add_argument("-f", "--fasta", help="fasta file", required=True) +    parser.add_argument("-o", "--output", help="output path (csv output)", required=True) +    parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True) +    parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000") +    parser.add_argument("-k", "--kmer", type=int, required=True, +        help="specifies which kmer to use, must be used with a custom trained database") + + +    args = parser.parse_args() +     +    # our default lambda is 10,000 +    lamb = 10000 + +    # Make sure our input exist +    if not os.path.isfile(args.fasta): +        parser.error( "Input fasta file not found") + +    if not os.path.isfile(args.trained_matrix): +        parser.error("Custom trained matrix not found") +     +    # use alternative lambda +    if args.lamb is not None: +        lamb = args.lamb +     +    xstar = quikr_load_trained_matrix_from_file(args.fasta, args.trained_matrix, args.kmer, lamb) + +    np.savetxt(args.output, xstar, delimiter=",", fmt="%f") +    return 0 + +if __name__ == "__main__": +    sys.exit(main()) @@ -9,47 +9,56 @@ from subprocess import *  import argparse  import platform  import gzip +import itertools -def main(): +def generate_kmers(kmer): +  """ This will return a list of kmers seperated by newlines """ +  return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer)) -    parser = argparse.ArgumentParser(description= -    "Quikr returns the estimated frequencies of batcteria present when given a \ -    input FASTA file. \n \ -    A default trained matrix will be used if none is supplied \n \ -    You must supply a kmer and default lambda if using a custom trained \ -    matrix.") +def isCompressed(filename): +  """ This function checks to see if the file is gzipped """  -    parser.add_argument("-f", "--fasta", help="fasta file", required=True) -    parser.add_argument("-o", "--output", help="output path (csv output)", required=True) -    parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True) -    parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000") -    parser.add_argument("-k", "--kmer", type=int, required=True, -        help="specifies which kmer to use, must be used with a custom trained database") +  f = open(filename, "rb") +  # The first two bytes of a gzipped file are always '1f 8b' +  if f.read(2) == '\x1f\x8b': +    f.close() +    return True +  else: +    f.close() +    return False + +   +def quikr_train(input_file_location, kmer): +  """ +  Takes a input fasta file, and kmer, returns a custom trained matrix +  """ -    args = parser.parse_args() -     -    # our default lambda is 10,000 -    lamb = 10000 +  kmer_file_name = str(kmer) + "mers.txt" +  if not os.path.isfile(kmer_file_name): +    print "could not find kmer file" +    exit() +   +  uname = platform.uname()[0] -    # Make sure our input exist -    if not os.path.isfile(args.fasta): -        parser.error( "Input fasta file not found") +  if uname == "Linux":  +    input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE)  +  elif uname == "Darwin": +    input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name])  -    if not os.path.isfile(args.trained_matrix): -        parser.error("Custom trained matrix not found") -     -    # use alternative lambda -    if args.lamb is not None: -        lamb = args.lamb -     -    xstar = quikr_load_trained_matrix_from_file(args.fasta, args.trained_matrix, args.kmer, lamb) +  # load and  normalize the matrix by dividing each element by the sum of it's column. +  # also do some fancy rotations so that it works properly with quikr +  matrix  = np.loadtxt(input_file.stdout) +   +  matrix = np.rot90(matrix) +  matrix = matrix / matrix.sum(0) +  matrix = np.flipud(matrix); +  return matrix -    np.savetxt(args.output, xstar, delimiter=",", fmt="%f") -    return 0  def quikr_load_trained_matrix_from_file(input_fasta_location, trained_matrix_location, kmer, default_lambda): +  """ This is a helper function to load our trained matrix and run quikr """    if qu.isCompressed(trained_matrix_location):      trained_matrix_file = gzip.open(trained_matrix_location, "rb") @@ -60,7 +69,8 @@ def quikr_load_trained_matrix_from_file(input_fasta_location, trained_matrix_loc    xstar = quikr(input_fasta_location, trained_matrix, kmer, default_lambda)    return xstar -   + +  def quikr(input_fasta_location, trained_matrix, kmer, default_lambda):    """    input_fasta is the input fasta file to find the estimated frequencies of @@ -74,6 +84,7 @@ def quikr(input_fasta_location, trained_matrix, kmer, default_lambda):    In practice reconstruction is accurate only down to the genus level (not     species or strain). +    """    uname = platform.uname()[0] diff --git a/quikr_train.py b/quikr_train index a427436..9e9caa7 100755 --- a/quikr_train.py +++ b/quikr_train @@ -1,5 +1,6 @@  #!/usr/bin/python  import numpy as np +import quikr  import os  import sys  import gzip @@ -31,7 +32,7 @@ def main():      parser.error( "Input database not found")    # call the quikr train function, save the output with np.save -  matrix = quikr_train(args.input, args.kmer) +  matrix = quikr.quikr_train(args.input, args.kmer)    if args.compress:       output_file = gzip.open(args.output, "wb") @@ -41,34 +42,6 @@ def main():    np.save(output_file, matrix)    return 0 - -def quikr_train(input_file_location, kmer): -  """ -  Takes a input fasta file, and kmer, returns a custom trained matrix -  """ - -  kmer_file_name = str(kmer) + "mers.txt" - -  if not os.path.isfile(kmer_file_name): -    print "could not find kmer file" -    exit() - -   -  uname = platform.uname()[0] - -  if uname == "Linux":  -    input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE)  -  elif uname == "Darwin": -    input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name])  - -  # load and  normalize the matrix by dividing each element by the sum of it's column. -  # also do some fancy rotations so that it works properly with quikr -  matrix  = np.loadtxt(input_file.stdout) -   -  matrix = np.rot90(matrix) -  matrix = matrix / matrix.sum(0) -  matrix = np.flipud(matrix); -  return matrix  if __name__ == "__main__":      sys.exit(main()) diff --git a/quikr_util.py b/quikr_util.py deleted file mode 100644 index b2fb968..0000000 --- a/quikr_util.py +++ /dev/null @@ -1,13 +0,0 @@ -# Helper Functions - -# Check if file is gzip compressed -def isCompressed(filename): -  f = open(filename, "rb") - -  # The first two bytes of a tar file are always '1f 8b' -  if f.read(2) == '\x1f\x8b': -    f.close() -    return True -  else: -    f.close() -    return False  | 
