aboutsummaryrefslogtreecommitdiff
path: root/src/python
diff options
context:
space:
mode:
Diffstat (limited to 'src/python')
-rwxr-xr-xsrc/python/generate_kmers5
-rwxr-xr-xsrc/python/multifasta_to_otu.py154
-rwxr-xr-xsrc/python/quikr48
-rwxr-xr-xsrc/python/quikr.py127
-rwxr-xr-xsrc/python/quikr_train53
5 files changed, 387 insertions, 0 deletions
diff --git a/src/python/generate_kmers b/src/python/generate_kmers
new file mode 100755
index 0000000..38fdf0a
--- /dev/null
+++ b/src/python/generate_kmers
@@ -0,0 +1,5 @@
+#!/usr/bin/python
+import quikr
+import sys
+
+print quikr.generate_kmers(int(sys.argv[1]))
diff --git a/src/python/multifasta_to_otu.py b/src/python/multifasta_to_otu.py
new file mode 100755
index 0000000..c6fb562
--- /dev/null
+++ b/src/python/multifasta_to_otu.py
@@ -0,0 +1,154 @@
+#!/usr/bin/python
+
+from multiprocessing import Pool
+from Bio import SeqIO
+import multiprocessing
+from subprocess import *
+import os
+import gzip
+import quikr as q
+import sys
+import numpy as np
+import argparse
+import platform
+import csv
+
+# our defaults
+kmer = 6
+lamb = 10000
+output_directory = ""
+input_directory = ""
+
+
+def main():
+
+ global kmer
+ global input_directory
+ global output_directory
+ global lamb
+ global trained_matrix
+
+ parser = argparse.ArgumentParser(description="MultifastaOTU")
+
+ parser.add_argument("-i", "--input-directory", help="directory containing fasta files", required=True)
+ parser.add_argument("-o", "--otu-table", help="otu_table", required=True)
+ parser.add_argument("-t", "--trained-matrix", help="your trained matrix ", required=True)
+ parser.add_argument("-f", "--trained-fasta", help="the fasta file used to train your matrix", required=True)
+ parser.add_argument("-d", "--output-directory", help="quikr output directory", required=True)
+ parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
+ parser.add_argument("-k", "--kmer", type=int, help="specifies which kmer to use, default=6")
+ parser.add_argument("-j", "--jobs", type=int, help="specifies how many jobs to run at once, default=number of CPUs")
+ args = parser.parse_args()
+
+ # our defaults
+ jobs=multiprocessing.cpu_count()
+ trained_matrix = args.trained_matrix
+ input_directory = args.input_directory
+ output_directory = args.output_directory
+
+ # Make sure our input exist
+ if not os.path.isdir(args.input_directory):
+ parser.error("Input directory not found")
+
+ if not os.path.isdir(args.output_directory):
+ print "Output directory not found, creating directory"
+ os.path.mkdir(args, output_directory)
+
+ if not os.path.isfile(args.trained_matrix):
+ parser.error("Custom trained matrix not found")
+
+ if not os.path.isfile(args.trained_fasta):
+ parser.error("Fasta file of trained matrix not found")
+
+ # use alternative lambda
+ if args.lamb is not None:
+ lamb = args.lamb
+
+ if args.jobs is not None:
+ jobs = args.jobs
+
+ if args.kmer is not None:
+ kmer = args.kmer
+
+ # Load trained matrix
+ if qu.isCompressed(args.trained_matrix):
+ trained_matrix_file = gzip.open(args.trained_matrix, "rb")
+ else:
+ trained_matrix_file = open(args.trained_matrix, "rb")
+
+ trained_matrix = np.load(trained_matrix_file)
+
+ # Return a list of the input directory
+ fasta_list = os.listdir(args.input_directory)
+
+ # Queue up and run our quikr functions.
+ pool = Pool(processes=jobs)
+ results = pool.map(quikr_call, fasta_list)
+
+ # Create an array of headers
+ headers = []
+ trained_matrix_headers = open(args.trained_fasta, "rU")
+ for header in SeqIO.parse(trained_matrix_headers, "fasta"):
+ headers.append(header.id)
+ trained_matrix_headers.close()
+
+ # create our number of reads matrix
+ number_of_reads = np.zeros((len(headers), len(fasta_list)))
+
+ # load the keys with values from each fasta result
+ for fasta, fasta_it in map(None, fasta_list, range(len(fasta_list))):
+
+ count_sequences = Popen(["grep", "-c" , "^>", args.input_directory + fasta], stdout=PIPE)
+ number_of_sequences = int(count_sequences.stdout.readline())
+
+ proportions = np.loadtxt(output_directory + fasta);
+
+ for proportion, proportion_it in map(None, proportions, range(len(proportions))):
+ number_of_reads[proportion_it, fasta_it] = round(proportion * number_of_sequences)
+
+ # remove empty rows from our matrix
+ final_headers = list()
+ final_data = list()
+ for row, header in map(None, number_of_reads, headers):
+ if row.sum() != 0:
+ final_headers.append(header)
+ final_data.append(row)
+
+ # convert from a list back into a numpy array
+ final_data = np.array(final_data, dtype=int)
+
+ # stack our final header and our number_of_reads matrix
+ number_of_reads = np.column_stack((final_headers, final_data))
+
+ # write our OTU table
+ output_file = open(args.otu_table, "wb")
+ writer = csv.writer(output_file, delimiter="\t")
+
+ #write out our fasta file row
+ writer.writerow(['# QIIME vGail OTU table'])
+
+ fasta_row = ['#OTU_ID']
+ fasta_row.append(' '.join(fasta_list))
+ fasta_row = [' '.join(fasta_row)]
+ writer.writerow(fasta_row)
+
+ # write out our results
+ for i in range(0, np.shape(number_of_reads)[0]):
+ writer.writerow(list(number_of_reads[i]))
+
+ output_file.close()
+
+ return 0
+
+
+def quikr_call(fasta_file):
+ input_location = input_directory + fasta_file
+ output_location = output_directory + os.path.basename(fasta_file)
+
+ xstar = q.quikr(input_location, trained_matrix, kmer, lamb)
+ np.savetxt(output_location, xstar, delimiter=",", fmt="%f")
+ return xstar
+
+if __name__ == "__main__":
+ sys.exit(main())
+
diff --git a/src/python/quikr b/src/python/quikr
new file mode 100755
index 0000000..bac01ca
--- /dev/null
+++ b/src/python/quikr
@@ -0,0 +1,48 @@
+#!/usr/bin/python
+import sys
+import os
+import argparse
+import quikr
+import numpy as np
+
+def main():
+
+ parser = argparse.ArgumentParser(description=
+ "Quikr returns the estimated frequencies of batcteria present when given a \
+ input FASTA file. \n \
+ A default trained matrix will be used if none is supplied \n \
+ You must supply a kmer and default lambda if using a custom trained \
+ matrix.")
+
+ parser.add_argument("-f", "--fasta", help="fasta file", required=True)
+ parser.add_argument("-o", "--output", help="output path (csv output)", required=True)
+ parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True)
+ parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
+ parser.add_argument("-k", "--kmer", type=int, required=True,
+ help="specifies which kmer to use, must be used with a custom trained database")
+
+
+ args = parser.parse_args()
+
+ # our default lambda is 10,000
+ lamb = 10000
+
+ # Make sure our input exist
+ if not os.path.isfile(args.fasta):
+ parser.error( "Input fasta file not found")
+
+ if not os.path.isfile(args.trained_matrix):
+ parser.error("Custom trained matrix not found")
+
+ # use alternative lambda
+ if args.lamb is not None:
+ lamb = args.lamb
+
+ trained_matrix = quikr.load_trained_matrix_from_file(args.trained_matrix)
+ xstar = quikr.calculate_estimated_frequencies(args.fasta, trained_matrix, args.kmer, lamb)
+
+ np.savetxt(args.output, xstar, delimiter=",", fmt="%f")
+ return 0
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/src/python/quikr.py b/src/python/quikr.py
new file mode 100755
index 0000000..368ab31
--- /dev/null
+++ b/src/python/quikr.py
@@ -0,0 +1,127 @@
+#!/usr/bin/python
+import os
+import sys
+from StringIO import StringIO
+import scipy.optimize.nnls
+import scipy.sparse
+import numpy as np
+from subprocess import *
+import argparse
+import platform
+import gzip
+import itertools
+
+def generate_kmers(kmer):
+ """ generate all possible kmers permutations seperated by newlines
+
+ >>> kmers = generate_kmers(1)
+ >>> generate_kmers(2)
+
+ param kmer: the desired Mer size
+ type kmer: int
+ return: Returns a string of kmers seperated by newlines
+ rtype: string
+ """
+
+ return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer))
+
+def isCompressed(filename):
+ """ This function checks to see if the file is gzipped
+
+ >>> boolean_value = isCompressed("/path/to/compressed/gzip/file")
+ >>> print boolean_value
+ True
+
+ param filename: the filename to check
+ type filename: string
+ return: Returns whether the file is gzipped
+ rtype: boolean
+
+ """
+ f = open(filename, "rb")
+
+ # The first two bytes of a gzipped file are always '1f 8b'
+ if f.read(2) == '\x1f\x8b':
+ f.close()
+ return True
+ else:
+ f.close()
+ return False
+
+
+def train_matrix(input_file_location, kmer):
+ """
+ Takes a input fasta file, and kmer, returns a custom trained matrix
+ """
+
+ kmer_file_name = str(kmer) + "mers.txt"
+
+ kmer_output = Popen(["generate_kmers", str(kmer)], stdout=PIPE)
+ input_file = Popen(["probabilities-by-read", str(kmer), input_file_location] , stdout=PIPE)
+
+ # load and normalize the matrix by dividing each element by the sum of it's column.
+ # also do some fancy rotations so that it works properly with quikr
+ matrix = np.loadtxt(input_file.stdout)
+
+ matrix = np.rot90(matrix)
+ matrix = matrix / matrix.sum(0)
+ matrix = np.flipud(matrix);
+ return matrix
+
+
+def load_trained_matrix_from_file(trained_matrix_location):
+ """ This is a helper function to load our trained matrix and run quikr """
+
+ if isCompressed(trained_matrix_location):
+ trained_matrix_file = gzip.open(trained_matrix_location, "rb")
+ else:
+ trained_matrix_file = open(trained_matrix_location, "rb")
+
+ trained_matrix = np.load(trained_matrix_file)
+
+ return trained_matrix
+
+
+def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, default_lambda):
+ """
+ input_fasta is the input fasta file to find the estimated frequencies of
+ trained_matrix is the trained matrix we are using to estimate the species
+ kmer is the desired k-mer to use
+ default_lambda is inp
+
+ returns the estimated requencies of bacteria present when given an input
+ FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity
+ promoting algorthim is utilized.
+
+ In practice reconstruction is accurate only down to the genus level (not
+ species or strain).
+
+ """
+
+ uname = platform.uname()[0]
+
+ # We use the count program to count ____
+ if uname == "Linux" and os.path.isfile("./count-linux"):
+ count_input = Popen(["./count-linux", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE)
+ elif uname == "Darwin" and os.path.isfile("./count-osx"):
+ count_input = Popen(["count-osx", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE)
+
+
+ # load the output of our count program and form a probability vector from the counts
+ counts = np.loadtxt(count_input.stdout)
+ counts = counts / counts.sum(0)
+ counts = default_lambda * counts
+ counts = np.concatenate([np.zeros(1), counts])
+
+ #form the k-mer sensing matrix
+ trained_matrix = trained_matrix * default_lambda;
+ trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix))
+
+ xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts)
+ xstar = xstar / xstar.sum(0)
+
+ return xstar
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/src/python/quikr_train b/src/python/quikr_train
new file mode 100755
index 0000000..bf74e12
--- /dev/null
+++ b/src/python/quikr_train
@@ -0,0 +1,53 @@
+#!/usr/bin/python
+import numpy as np
+import quikr
+import os
+import sys
+import gzip
+from subprocess import *
+import platform
+import argparse
+
+def main():
+ """
+ You can call this script independently, and will save the
+ trained matrix as a numpy file.
+
+ example: python quikr-train.py -i input.fasta -k 6 -o trained_matrix.npy
+
+ """
+ parser = argparse.ArgumentParser(description=
+ " quikr_train returns a custom trained matrix that can be used with \
+ the quikr function. \n You must supply a kmer. \n ")
+
+ parser.add_argument("-i", "--input", help="training database of sequences (fasta format)", required=True)
+ parser.add_argument("-o", "--output", help="sensing matrix (text file)", required=True)
+ parser.add_argument("-k", "--kmer", help="kmer size (integer)",
+ type=int, required=False )
+ parser.add_argument("-z", "--compress", help="compress output (integer)",
+ action='store_true', required=False)
+
+ args = parser.parse_args()
+
+ if not os.path.isfile(args.input):
+ parser.error( "Input database not found")
+
+ # call the quikr train function, save the output with np.save
+ matrix = quikr.train_matrix(args.input, args.kmer)
+
+ if args.kmer is None:
+ kmer = 6
+ else:
+ kmer = args.kmer
+
+ if args.compress:
+ output_file = gzip.open(args.output, "wb")
+ else:
+ output_file = open(args.output, "wb")
+
+ np.save(output_file, matrix)
+
+ return 0
+
+if __name__ == "__main__":
+ sys.exit(main())