Diffstat (limited to 'src')
-rwxr-xr-x   src/generate_kmers         5
-rwxr-xr-x   src/multifasta_to_otu.py   154
-rwxr-xr-x   src/quikr                  48
-rwxr-xr-x   src/quikr.py               115
-rwxr-xr-x   src/quikr_train            48
5 files changed, 370 insertions(+), 0 deletions(-)
diff --git a/src/generate_kmers b/src/generate_kmers
new file mode 100755
index 0000000..38fdf0a
--- /dev/null
+++ b/src/generate_kmers
@@ -0,0 +1,5 @@
+#!/usr/bin/python
+import quikr
+import sys
+
+print quikr.generate_kmers(int(sys.argv[1]))
diff --git a/src/multifasta_to_otu.py b/src/multifasta_to_otu.py
new file mode 100755
index 0000000..c6fb562
--- /dev/null
+++ b/src/multifasta_to_otu.py
@@ -0,0 +1,154 @@
+#!/usr/bin/python
+
+from multiprocessing import Pool
+from Bio import SeqIO
+import multiprocessing
+from subprocess import *
+import os
+import gzip
+import quikr as q
+import sys
+import numpy as np
+import argparse
+import platform
+import csv
+
+# our defaults
+kmer = 6
+lamb = 10000
+output_directory = ""
+input_directory = ""
+
+
+def main():
+
+  global kmer
+  global input_directory
+  global output_directory
+  global lamb
+  global trained_matrix
+
+  parser = argparse.ArgumentParser(description="MultifastaOTU")
+
+  parser.add_argument("-i", "--input-directory", help="directory containing fasta files", required=True)
+  parser.add_argument("-o", "--otu-table", help="otu_table", required=True)
+  parser.add_argument("-t", "--trained-matrix", help="your trained matrix", required=True)
+  parser.add_argument("-f", "--trained-fasta", help="the fasta file used to train your matrix", required=True)
+  parser.add_argument("-d", "--output-directory", help="quikr output directory", required=True)
+  parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
+  parser.add_argument("-k", "--kmer", type=int, help="specifies which kmer to use, default=6")
+  parser.add_argument("-j", "--jobs", type=int, help="specifies how many jobs to run at once, default=number of CPUs")
+  args = parser.parse_args()
+
+  # our defaults
+  jobs = multiprocessing.cpu_count()
+  trained_matrix = args.trained_matrix
+  input_directory = args.input_directory
+  output_directory = args.output_directory
+
+  # Make sure our inputs exist
+  if not os.path.isdir(args.input_directory):
+    parser.error("Input directory not found")
+
+  if not os.path.isdir(args.output_directory):
+    print "Output directory not found, creating directory"
+    os.mkdir(args.output_directory)
+
+  if not os.path.isfile(args.trained_matrix):
+    parser.error("Custom trained matrix not found")
+
+  if not os.path.isfile(args.trained_fasta):
+    parser.error("Fasta file of trained matrix not found")
+
+  # use alternative lambda
+  if args.lamb is not None:
+    lamb = args.lamb
+
+  if args.jobs is not None:
+    jobs = args.jobs
+
+  if args.kmer is not None:
+    kmer = args.kmer
+
+  # Load trained matrix
+  if q.isCompressed(args.trained_matrix):
+    trained_matrix_file = gzip.open(args.trained_matrix, "rb")
+  else:
+    trained_matrix_file = open(args.trained_matrix, "rb")
+
+  trained_matrix = np.load(trained_matrix_file)
+
+  # Return a list of the input directory
+  fasta_list = os.listdir(args.input_directory)
+
+  # Queue up and run our quikr functions.
+  pool = Pool(processes=jobs)
+  results = pool.map(quikr_call, fasta_list)
+
+  # Create an array of headers
+  headers = []
+  trained_matrix_headers = open(args.trained_fasta, "rU")
+  for header in SeqIO.parse(trained_matrix_headers, "fasta"):
+    headers.append(header.id)
+  trained_matrix_headers.close()
+
+  # create our number of reads matrix
+  number_of_reads = np.zeros((len(headers), len(fasta_list)))
+
+  # load the keys with values from each fasta result
+  for fasta_it, fasta in enumerate(fasta_list):
+
+    count_sequences = Popen(["grep", "-c", "^>", os.path.join(args.input_directory, fasta)], stdout=PIPE)
+    number_of_sequences = int(count_sequences.stdout.readline())
+
+    proportions = np.loadtxt(os.path.join(output_directory, fasta))
+
+    for proportion_it, proportion in enumerate(proportions):
+      number_of_reads[proportion_it, fasta_it] = round(proportion * number_of_sequences)
+
+  # remove empty rows from our matrix
+  final_headers = list()
+  final_data = list()
+  for row, header in zip(number_of_reads, headers):
+    if row.sum() != 0:
+      final_headers.append(header)
+      final_data.append(row)
+
+  # convert from a list back into a numpy array
+  final_data = np.array(final_data, dtype=int)
+
+  # stack our final headers and our number_of_reads matrix
+  number_of_reads = np.column_stack((final_headers, final_data))
+
+  # write our OTU table
+  output_file = open(args.otu_table, "wb")
+  writer = csv.writer(output_file, delimiter="\t")
+
+  # write out the table comment row, then the sample (fasta file) names
+  writer.writerow(['# QIIME vGail OTU table'])
+  writer.writerow(['#OTU_ID ' + ' '.join(fasta_list)])
+
+  # write out our results
+  for i in range(0, np.shape(number_of_reads)[0]):
+    writer.writerow(list(number_of_reads[i]))
+
+  output_file.close()
+
+  return 0
+
+
+def quikr_call(fasta_file):
+  input_location = os.path.join(input_directory, fasta_file)
+  output_location = os.path.join(output_directory, os.path.basename(fasta_file))
+
+  xstar = q.calculate_estimated_frequencies(input_location, trained_matrix, kmer, lamb)
+  np.savetxt(output_location, xstar, delimiter=",", fmt="%f")
+  return xstar

if __name__ == "__main__":
  sys.exit(main())
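The heart of multifasta_to_otu.py is the conversion from quikr's per-sample proportion vectors into integer read counts: each proportion is multiplied by the number of sequences in that sample's fasta file and rounded, and rows that end up all zero are dropped. A minimal sketch of just that step, with made-up proportions and sequence counts rather than real quikr output:

    import numpy as np

    # hypothetical quikr output for two samples: each column sums to 1 and
    # gives the estimated frequency of each reference organism in that sample
    proportions = np.array([[0.5, 0.1],
                            [0.3, 0.0],
                            [0.2, 0.9],
                            [0.0, 0.0]])
    sequences_per_sample = np.array([1000, 200])  # from grep -c "^>" per file

    # scale each column by its sample's read count and round, as in main()
    number_of_reads = np.round(proportions * sequences_per_sample).astype(int)

    # drop organisms that received no reads in any sample
    nonempty = number_of_reads.sum(axis=1) != 0
    print(number_of_reads[nonempty])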
diff --git a/src/quikr b/src/quikr
new file mode 100755
index 0000000..bac01ca
--- /dev/null
+++ b/src/quikr
@@ -0,0 +1,48 @@
+#!/usr/bin/python
+import sys
+import os
+import argparse
+import quikr
+import numpy as np
+
+def main():
+
+    parser = argparse.ArgumentParser(description=
+    "Quikr returns the estimated frequencies of bacteria present when given an \
+    input FASTA file. \n \
+    You must supply a trained matrix and the kmer size that was used to \
+    build it. \n \
+    The default lambda value is 10,000.")
+
+    parser.add_argument("-f", "--fasta", help="fasta file", required=True)
+    parser.add_argument("-o", "--output", help="output path (csv output)", required=True)
+    parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True)
+    parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
+    parser.add_argument("-k", "--kmer", type=int, required=True,
+        help="specifies which kmer to use, must match the kmer the matrix was trained with")
+
+    args = parser.parse_args()
+
+    # our default lambda is 10,000
+    lamb = 10000
+
+    # Make sure our inputs exist
+    if not os.path.isfile(args.fasta):
+        parser.error("Input fasta file not found")
+
+    if not os.path.isfile(args.trained_matrix):
+        parser.error("Custom trained matrix not found")
+
+    # use alternative lambda
+    if args.lamb is not None:
+        lamb = args.lamb
+
+    trained_matrix = quikr.load_trained_matrix_from_file(args.trained_matrix)
+    xstar = quikr.calculate_estimated_frequencies(args.fasta, trained_matrix, args.kmer, lamb)
+
+    np.savetxt(args.output, xstar, delimiter=",", fmt="%f")
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main())
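The quikr script is a thin CLI over two library calls in quikr.py, so the same estimate can be produced from other Python code. A minimal sketch, where trained_matrix.npy and reads.fa are hypothetical stand-ins for a matrix trained with 6-mers and an input fasta:

    import numpy as np
    import quikr

    # load a (possibly gzipped) trained sensing matrix and estimate frequencies;
    # the kmer size must match the one the matrix was trained with
    trained_matrix = quikr.load_trained_matrix_from_file("trained_matrix.npy")
    xstar = quikr.calculate_estimated_frequencies("reads.fa", trained_matrix, 6, 10000)

    # xstar sums to 1: one estimated proportion per reference sequence
    np.savetxt("frequencies.csv", xstar, delimiter=",", fmt="%f")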
diff --git a/src/quikr.py b/src/quikr.py
new file mode 100755
index 0000000..3f8221e
--- /dev/null
+++ b/src/quikr.py
@@ -0,0 +1,115 @@
+#!/usr/bin/python
+import os
+import sys
+from scipy.optimize import nnls
+import numpy as np
+from subprocess import *
+import platform
+import gzip
+import itertools
+
+def generate_kmers(kmer):
+  """ This will return a string of all kmers of the given length, separated by newlines """
+  return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer))
+
+def isCompressed(filename):
+  """ This function checks to see if the file is gzipped """
+
+  f = open(filename, "rb")
+
+  # The first two bytes of a gzipped file are always '1f 8b'
+  if f.read(2) == '\x1f\x8b':
+    f.close()
+    return True
+  else:
+    f.close()
+    return False
+
+
+def train_matrix(input_file_location, kmer):
+  """
+  Takes an input fasta file and a kmer size, returns a custom trained matrix
+  """
+
+  kmer_file_name = str(kmer) + "mers.txt"
+
+  if not os.path.isfile(kmer_file_name):
+    sys.exit("could not find kmer file " + kmer_file_name)
+
+  uname = platform.uname()[0]
+
+  if uname == "Linux":
+    input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE)
+  elif uname == "Darwin":
+    input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name], stdout=PIPE)
+  else:
+    sys.exit("unsupported platform: " + uname)
+
+  # load and normalize the matrix by dividing each element by the sum of its column.
+  # also do some fancy rotations so that it works properly with quikr
+  matrix = np.loadtxt(input_file.stdout)
+
+  matrix = np.rot90(matrix)
+  matrix = matrix / matrix.sum(0)
+  matrix = np.flipud(matrix)
+  return matrix
+
+
+def load_trained_matrix_from_file(trained_matrix_location):
+  """ This is a helper function to load a trained matrix, whether gzipped or not """
+
+  if isCompressed(trained_matrix_location):
+    trained_matrix_file = gzip.open(trained_matrix_location, "rb")
+  else:
+    trained_matrix_file = open(trained_matrix_location, "rb")
+
+  trained_matrix = np.load(trained_matrix_file)
+
+  return trained_matrix
+
+
+def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, default_lambda):
+  """
+  input_fasta_location is the input fasta file to find the estimated frequencies of
+  trained_matrix is the trained matrix we are using to estimate the species
+  kmer is the desired k-mer to use
+  default_lambda is the regularization weight applied to the k-mer counts and
+  the sensing matrix
+
+  returns the estimated frequencies of bacteria present when given an input
+  FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity
+  promoting algorithm is utilized.
+
+  In practice reconstruction is accurate only down to the genus level (not
+  species or strain).
+  """
+
+  uname = platform.uname()[0]
+
+  # We use the count program to count the k-mer occurrences in the input fasta file
+  if uname == "Linux" and os.path.isfile("./count-linux"):
+    count_input = Popen(["./count-linux", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE)
+  elif uname == "Darwin" and os.path.isfile("./count-osx"):
+    count_input = Popen(["./count-osx", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE)
+  else:
+    sys.exit("could not find a count binary for this platform")
+
+  # load the output of our count program and form a probability vector from the counts
+  counts = np.loadtxt(count_input.stdout)
+  counts = counts / counts.sum(0)
+  counts = default_lambda * counts
+  counts = np.concatenate([np.zeros(1), counts])
+
+  # form the k-mer sensing matrix
+  trained_matrix = trained_matrix * default_lambda
+  trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix))
+
+  xstar, rnorm = nnls(trained_matrix, counts)
+  xstar = xstar / xstar.sum(0)
+
+  return xstar
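calculate_estimated_frequencies sets its problem up so that a plain non-negative least squares solver does the regularized work: prepending a row of ones to the matrix (with a matching 0 in the data vector) penalizes (sum x)^2, which for non-negative x is the squared L1 norm, while scaling both the sensing matrix and the counts by lambda controls how strongly the k-mer fit outweighs that sparsity penalty. A toy instance with a hand-made 3x2 matrix, showing the same construction:

    import numpy as np
    from scipy.optimize import nnls

    lamb = 10000.0

    # toy sensing matrix: one column per organism, one row per k-mer,
    # each column normalized to sum to 1 (as train_matrix produces)
    A = np.array([[0.6, 0.1],
                  [0.3, 0.2],
                  [0.1, 0.7]])
    # toy measured k-mer proportions for a sample (here an even mixture)
    counts = np.array([0.35, 0.25, 0.40])

    # same construction as calculate_estimated_frequencies: lambda-scaled
    # system plus a ones row targeting 0, i.e. an L1 penalty on x >= 0
    b = np.concatenate([np.zeros(1), lamb * counts])
    M = np.vstack((np.ones(A.shape[1]), lamb * A))

    xstar, rnorm = nnls(M, b)
    xstar = xstar / xstar.sum()  # renormalize to proportions
    print(xstar)                 # roughly [0.5, 0.5] for this toy data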
diff --git a/src/quikr_train b/src/quikr_train
new file mode 100755
index 0000000..6e599c9
--- /dev/null
+++ b/src/quikr_train
@@ -0,0 +1,48 @@
+#!/usr/bin/python
+import numpy as np
+import quikr
+import os
+import sys
+import gzip
+import argparse
+
+def main():
+  """
+  You can call this script independently; it will save the
+  trained matrix as a numpy file.
+
+  example: quikr_train -i input.fasta -k 6 -o trained_matrix.npy
+  """
+  parser = argparse.ArgumentParser(description=
+  "quikr_train returns a custom trained matrix that can be used with \
+  the quikr function. \n You must supply a kmer. \n")
+
+  parser.add_argument("-i", "--input", help="training database of sequences (fasta format)", required=True)
+  parser.add_argument("-o", "--output", help="sensing matrix (numpy file)", required=True)
+  parser.add_argument("-k", "--kmer", help="kmer size (integer)",
+                      type=int, required=True)
+  parser.add_argument("-z", "--compress", help="gzip-compress the output",
+                      action='store_true', required=False)
+
+  args = parser.parse_args()
+
+  if not os.path.isfile(args.input):
+    parser.error("Input database not found")
+
+  # call the quikr train function, save the output with np.save
+  matrix = quikr.train_matrix(args.input, args.kmer)
+
+  if args.compress:
+    output_file = gzip.open(args.output, "wb")
+  else:
+    output_file = open(args.output, "wb")
+
+  np.save(output_file, matrix)
+
+  return 0
+
+if __name__ == "__main__":
+  sys.exit(main())
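quikr_train is the command-line face of train_matrix. A sketch of the same steps done by hand, assuming the platform-specific probabilities-by-read helper binary is present in the working directory and reference.fa is a hypothetical training fasta:

    import gzip
    import numpy as np
    import quikr

    kmer = 6

    # train_matrix expects a "<k>mers.txt" file listing every kmer;
    # generate_kmers produces exactly that, one kmer per line
    with open("%dmers.txt" % kmer, "w") as f:
        f.write(quikr.generate_kmers(kmer))

    # build the column-normalized sensing matrix from the training fasta
    matrix = quikr.train_matrix("reference.fa", kmer)

    # save it the way quikr_train -z does, so isCompressed() detects it later
    with gzip.open("trained_matrix.npy.gz", "wb") as output_file:
        np.save(output_file, matrix)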
