diff options
Diffstat (limited to 'src/python')
-rwxr-xr-x | src/python/generate_kmers | 5 | ||||
-rwxr-xr-x | src/python/multifasta_to_otu.py | 154 | ||||
-rwxr-xr-x | src/python/quikr | 48 | ||||
-rwxr-xr-x | src/python/quikr.py | 127 | ||||
-rwxr-xr-x | src/python/quikr_train | 53 |
5 files changed, 387 insertions, 0 deletions
diff --git a/src/python/generate_kmers b/src/python/generate_kmers new file mode 100755 index 0000000..38fdf0a --- /dev/null +++ b/src/python/generate_kmers @@ -0,0 +1,5 @@ +#!/usr/bin/python +import quikr +import sys + +print quikr.generate_kmers(int(sys.argv[1])) diff --git a/src/python/multifasta_to_otu.py b/src/python/multifasta_to_otu.py new file mode 100755 index 0000000..c6fb562 --- /dev/null +++ b/src/python/multifasta_to_otu.py @@ -0,0 +1,154 @@ +#!/usr/bin/python + +from multiprocessing import Pool +from Bio import SeqIO +import multiprocessing +from subprocess import * +import os +import gzip +import quikr as q +import sys +import numpy as np +import argparse +import platform +import csv + +# our defaults +kmer = 6 +lamb = 10000 +output_directory = "" +input_directory = "" + + +def main(): + + global kmer + global input_directory + global output_directory + global lamb + global trained_matrix + + parser = argparse.ArgumentParser(description="MultifastaOTU") + + parser.add_argument("-i", "--input-directory", help="directory containing fasta files", required=True) + parser.add_argument("-o", "--otu-table", help="otu_table", required=True) + parser.add_argument("-t", "--trained-matrix", help="your trained matrix ", required=True) + parser.add_argument("-f", "--trained-fasta", help="the fasta file used to train your matrix", required=True) + parser.add_argument("-d", "--output-directory", help="quikr output directory", required=True) + parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000") + parser.add_argument("-k", "--kmer", type=int, help="specifies which kmer to use, default=6") + parser.add_argument("-j", "--jobs", type=int, help="specifies how many jobs to run at once, default=number of CPUs") + args = parser.parse_args() + + # our defaults + jobs=multiprocessing.cpu_count() + trained_matrix = args.trained_matrix + input_directory = args.input_directory + output_directory = args.output_directory + + # Make sure our input exist + if not os.path.isdir(args.input_directory): + parser.error("Input directory not found") + + if not os.path.isdir(args.output_directory): + print "Output directory not found, creating directory" + os.path.mkdir(args, output_directory) + + if not os.path.isfile(args.trained_matrix): + parser.error("Custom trained matrix not found") + + if not os.path.isfile(args.trained_fasta): + parser.error("Fasta file of trained matrix not found") + + # use alternative lambda + if args.lamb is not None: + lamb = args.lamb + + if args.jobs is not None: + jobs = args.jobs + + if args.kmer is not None: + kmer = args.kmer + + # Load trained matrix + if qu.isCompressed(args.trained_matrix): + trained_matrix_file = gzip.open(args.trained_matrix, "rb") + else: + trained_matrix_file = open(args.trained_matrix, "rb") + + trained_matrix = np.load(trained_matrix_file) + + # Return a list of the input directory + fasta_list = os.listdir(args.input_directory) + + # Queue up and run our quikr functions. + pool = Pool(processes=jobs) + results = pool.map(quikr_call, fasta_list) + + # Create an array of headers + headers = [] + trained_matrix_headers = open(args.trained_fasta, "rU") + for header in SeqIO.parse(trained_matrix_headers, "fasta"): + headers.append(header.id) + trained_matrix_headers.close() + + # create our number of reads matrix + number_of_reads = np.zeros((len(headers), len(fasta_list))) + + # load the keys with values from each fasta result + for fasta, fasta_it in map(None, fasta_list, range(len(fasta_list))): + + count_sequences = Popen(["grep", "-c" , "^>", args.input_directory + fasta], stdout=PIPE) + number_of_sequences = int(count_sequences.stdout.readline()) + + proportions = np.loadtxt(output_directory + fasta); + + for proportion, proportion_it in map(None, proportions, range(len(proportions))): + number_of_reads[proportion_it, fasta_it] = round(proportion * number_of_sequences) + + # remove empty rows from our matrix + final_headers = list() + final_data = list() + for row, header in map(None, number_of_reads, headers): + if row.sum() != 0: + final_headers.append(header) + final_data.append(row) + + # convert from a list back into a numpy array + final_data = np.array(final_data, dtype=int) + + # stack our final header and our number_of_reads matrix + number_of_reads = np.column_stack((final_headers, final_data)) + + # write our OTU table + output_file = open(args.otu_table, "wb") + writer = csv.writer(output_file, delimiter="\t") + + #write out our fasta file row + writer.writerow(['# QIIME vGail OTU table']) + + fasta_row = ['#OTU_ID'] + fasta_row.append(' '.join(fasta_list)) + fasta_row = [' '.join(fasta_row)] + writer.writerow(fasta_row) + + # write out our results + for i in range(0, np.shape(number_of_reads)[0]): + writer.writerow(list(number_of_reads[i])) + + output_file.close() + + return 0 + + +def quikr_call(fasta_file): + input_location = input_directory + fasta_file + output_location = output_directory + os.path.basename(fasta_file) + + xstar = q.quikr(input_location, trained_matrix, kmer, lamb) + np.savetxt(output_location, xstar, delimiter=",", fmt="%f") + return xstar + +if __name__ == "__main__": + sys.exit(main()) + diff --git a/src/python/quikr b/src/python/quikr new file mode 100755 index 0000000..bac01ca --- /dev/null +++ b/src/python/quikr @@ -0,0 +1,48 @@ +#!/usr/bin/python +import sys +import os +import argparse +import quikr +import numpy as np + +def main(): + + parser = argparse.ArgumentParser(description= + "Quikr returns the estimated frequencies of batcteria present when given a \ + input FASTA file. \n \ + A default trained matrix will be used if none is supplied \n \ + You must supply a kmer and default lambda if using a custom trained \ + matrix.") + + parser.add_argument("-f", "--fasta", help="fasta file", required=True) + parser.add_argument("-o", "--output", help="output path (csv output)", required=True) + parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True) + parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000") + parser.add_argument("-k", "--kmer", type=int, required=True, + help="specifies which kmer to use, must be used with a custom trained database") + + + args = parser.parse_args() + + # our default lambda is 10,000 + lamb = 10000 + + # Make sure our input exist + if not os.path.isfile(args.fasta): + parser.error( "Input fasta file not found") + + if not os.path.isfile(args.trained_matrix): + parser.error("Custom trained matrix not found") + + # use alternative lambda + if args.lamb is not None: + lamb = args.lamb + + trained_matrix = quikr.load_trained_matrix_from_file(args.trained_matrix) + xstar = quikr.calculate_estimated_frequencies(args.fasta, trained_matrix, args.kmer, lamb) + + np.savetxt(args.output, xstar, delimiter=",", fmt="%f") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/python/quikr.py b/src/python/quikr.py new file mode 100755 index 0000000..368ab31 --- /dev/null +++ b/src/python/quikr.py @@ -0,0 +1,127 @@ +#!/usr/bin/python +import os +import sys +from StringIO import StringIO +import scipy.optimize.nnls +import scipy.sparse +import numpy as np +from subprocess import * +import argparse +import platform +import gzip +import itertools + +def generate_kmers(kmer): + """ generate all possible kmers permutations seperated by newlines + + >>> kmers = generate_kmers(1) + >>> generate_kmers(2) + + param kmer: the desired Mer size + type kmer: int + return: Returns a string of kmers seperated by newlines + rtype: string + """ + + return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer)) + +def isCompressed(filename): + """ This function checks to see if the file is gzipped + + >>> boolean_value = isCompressed("/path/to/compressed/gzip/file") + >>> print boolean_value + True + + param filename: the filename to check + type filename: string + return: Returns whether the file is gzipped + rtype: boolean + + """ + f = open(filename, "rb") + + # The first two bytes of a gzipped file are always '1f 8b' + if f.read(2) == '\x1f\x8b': + f.close() + return True + else: + f.close() + return False + + +def train_matrix(input_file_location, kmer): + """ + Takes a input fasta file, and kmer, returns a custom trained matrix + """ + + kmer_file_name = str(kmer) + "mers.txt" + + kmer_output = Popen(["generate_kmers", str(kmer)], stdout=PIPE) + input_file = Popen(["probabilities-by-read", str(kmer), input_file_location] , stdout=PIPE) + + # load and normalize the matrix by dividing each element by the sum of it's column. + # also do some fancy rotations so that it works properly with quikr + matrix = np.loadtxt(input_file.stdout) + + matrix = np.rot90(matrix) + matrix = matrix / matrix.sum(0) + matrix = np.flipud(matrix); + return matrix + + +def load_trained_matrix_from_file(trained_matrix_location): + """ This is a helper function to load our trained matrix and run quikr """ + + if isCompressed(trained_matrix_location): + trained_matrix_file = gzip.open(trained_matrix_location, "rb") + else: + trained_matrix_file = open(trained_matrix_location, "rb") + + trained_matrix = np.load(trained_matrix_file) + + return trained_matrix + + +def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, default_lambda): + """ + input_fasta is the input fasta file to find the estimated frequencies of + trained_matrix is the trained matrix we are using to estimate the species + kmer is the desired k-mer to use + default_lambda is inp + + returns the estimated requencies of bacteria present when given an input + FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity + promoting algorthim is utilized. + + In practice reconstruction is accurate only down to the genus level (not + species or strain). + + """ + + uname = platform.uname()[0] + + # We use the count program to count ____ + if uname == "Linux" and os.path.isfile("./count-linux"): + count_input = Popen(["./count-linux", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) + elif uname == "Darwin" and os.path.isfile("./count-osx"): + count_input = Popen(["count-osx", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) + + + # load the output of our count program and form a probability vector from the counts + counts = np.loadtxt(count_input.stdout) + counts = counts / counts.sum(0) + counts = default_lambda * counts + counts = np.concatenate([np.zeros(1), counts]) + + #form the k-mer sensing matrix + trained_matrix = trained_matrix * default_lambda; + trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix)) + + xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts) + xstar = xstar / xstar.sum(0) + + return xstar + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/python/quikr_train b/src/python/quikr_train new file mode 100755 index 0000000..bf74e12 --- /dev/null +++ b/src/python/quikr_train @@ -0,0 +1,53 @@ +#!/usr/bin/python +import numpy as np +import quikr +import os +import sys +import gzip +from subprocess import * +import platform +import argparse + +def main(): + """ + You can call this script independently, and will save the + trained matrix as a numpy file. + + example: python quikr-train.py -i input.fasta -k 6 -o trained_matrix.npy + + """ + parser = argparse.ArgumentParser(description= + " quikr_train returns a custom trained matrix that can be used with \ + the quikr function. \n You must supply a kmer. \n ") + + parser.add_argument("-i", "--input", help="training database of sequences (fasta format)", required=True) + parser.add_argument("-o", "--output", help="sensing matrix (text file)", required=True) + parser.add_argument("-k", "--kmer", help="kmer size (integer)", + type=int, required=False ) + parser.add_argument("-z", "--compress", help="compress output (integer)", + action='store_true', required=False) + + args = parser.parse_args() + + if not os.path.isfile(args.input): + parser.error( "Input database not found") + + # call the quikr train function, save the output with np.save + matrix = quikr.train_matrix(args.input, args.kmer) + + if args.kmer is None: + kmer = 6 + else: + kmer = args.kmer + + if args.compress: + output_file = gzip.open(args.output, "wb") + else: + output_file = open(args.output, "wb") + + np.save(output_file, matrix) + + return 0 + +if __name__ == "__main__": + sys.exit(main()) |