From c18793c210ea44cef63db3568791cbad537bdb79 Mon Sep 17 00:00:00 2001 From: Calvin Date: Fri, 8 Mar 2013 15:32:50 -0500 Subject: moving some stuff --- generate_kmers | 5 -- multifasta_to_otu.py | 154 ----------------------------------------------- quikr | 48 --------------- quikr.py | 115 ----------------------------------- quikr_train | 48 --------------- src/generate_kmers | 5 ++ src/multifasta_to_otu.py | 154 +++++++++++++++++++++++++++++++++++++++++++++++ src/quikr | 48 +++++++++++++++ src/quikr.py | 115 +++++++++++++++++++++++++++++++++++ src/quikr_train | 48 +++++++++++++++ 10 files changed, 370 insertions(+), 370 deletions(-) delete mode 100755 generate_kmers delete mode 100755 multifasta_to_otu.py delete mode 100755 quikr delete mode 100755 quikr.py delete mode 100755 quikr_train create mode 100755 src/generate_kmers create mode 100755 src/multifasta_to_otu.py create mode 100755 src/quikr create mode 100755 src/quikr.py create mode 100755 src/quikr_train diff --git a/generate_kmers b/generate_kmers deleted file mode 100755 index 38fdf0a..0000000 --- a/generate_kmers +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/python -import quikr -import sys - -print quikr.generate_kmers(int(sys.argv[1])) diff --git a/multifasta_to_otu.py b/multifasta_to_otu.py deleted file mode 100755 index c6fb562..0000000 --- a/multifasta_to_otu.py +++ /dev/null @@ -1,154 +0,0 @@ -#!/usr/bin/python - -from multiprocessing import Pool -from Bio import SeqIO -import multiprocessing -from subprocess import * -import os -import gzip -import quikr as q -import sys -import numpy as np -import argparse -import platform -import csv - -# our defaults -kmer = 6 -lamb = 10000 -output_directory = "" -input_directory = "" - - -def main(): - - global kmer - global input_directory - global output_directory - global lamb - global trained_matrix - - parser = argparse.ArgumentParser(description="MultifastaOTU") - - parser.add_argument("-i", "--input-directory", help="directory containing fasta files", required=True) - parser.add_argument("-o", "--otu-table", help="otu_table", required=True) - parser.add_argument("-t", "--trained-matrix", help="your trained matrix ", required=True) - parser.add_argument("-f", "--trained-fasta", help="the fasta file used to train your matrix", required=True) - parser.add_argument("-d", "--output-directory", help="quikr output directory", required=True) - parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000") - parser.add_argument("-k", "--kmer", type=int, help="specifies which kmer to use, default=6") - parser.add_argument("-j", "--jobs", type=int, help="specifies how many jobs to run at once, default=number of CPUs") - args = parser.parse_args() - - # our defaults - jobs=multiprocessing.cpu_count() - trained_matrix = args.trained_matrix - input_directory = args.input_directory - output_directory = args.output_directory - - # Make sure our input exist - if not os.path.isdir(args.input_directory): - parser.error("Input directory not found") - - if not os.path.isdir(args.output_directory): - print "Output directory not found, creating directory" - os.path.mkdir(args, output_directory) - - if not os.path.isfile(args.trained_matrix): - parser.error("Custom trained matrix not found") - - if not os.path.isfile(args.trained_fasta): - parser.error("Fasta file of trained matrix not found") - - # use alternative lambda - if args.lamb is not None: - lamb = args.lamb - - if args.jobs is not None: - jobs = args.jobs - - if args.kmer is not None: - kmer = args.kmer - - # Load trained matrix - if qu.isCompressed(args.trained_matrix): - trained_matrix_file = gzip.open(args.trained_matrix, "rb") - else: - trained_matrix_file = open(args.trained_matrix, "rb") - - trained_matrix = np.load(trained_matrix_file) - - # Return a list of the input directory - fasta_list = os.listdir(args.input_directory) - - # Queue up and run our quikr functions. - pool = Pool(processes=jobs) - results = pool.map(quikr_call, fasta_list) - - # Create an array of headers - headers = [] - trained_matrix_headers = open(args.trained_fasta, "rU") - for header in SeqIO.parse(trained_matrix_headers, "fasta"): - headers.append(header.id) - trained_matrix_headers.close() - - # create our number of reads matrix - number_of_reads = np.zeros((len(headers), len(fasta_list))) - - # load the keys with values from each fasta result - for fasta, fasta_it in map(None, fasta_list, range(len(fasta_list))): - - count_sequences = Popen(["grep", "-c" , "^>", args.input_directory + fasta], stdout=PIPE) - number_of_sequences = int(count_sequences.stdout.readline()) - - proportions = np.loadtxt(output_directory + fasta); - - for proportion, proportion_it in map(None, proportions, range(len(proportions))): - number_of_reads[proportion_it, fasta_it] = round(proportion * number_of_sequences) - - # remove empty rows from our matrix - final_headers = list() - final_data = list() - for row, header in map(None, number_of_reads, headers): - if row.sum() != 0: - final_headers.append(header) - final_data.append(row) - - # convert from a list back into a numpy array - final_data = np.array(final_data, dtype=int) - - # stack our final header and our number_of_reads matrix - number_of_reads = np.column_stack((final_headers, final_data)) - - # write our OTU table - output_file = open(args.otu_table, "wb") - writer = csv.writer(output_file, delimiter="\t") - - #write out our fasta file row - writer.writerow(['# QIIME vGail OTU table']) - - fasta_row = ['#OTU_ID'] - fasta_row.append(' '.join(fasta_list)) - fasta_row = [' '.join(fasta_row)] - writer.writerow(fasta_row) - - # write out our results - for i in range(0, np.shape(number_of_reads)[0]): - writer.writerow(list(number_of_reads[i])) - - output_file.close() - - return 0 - - -def quikr_call(fasta_file): - input_location = input_directory + fasta_file - output_location = output_directory + os.path.basename(fasta_file) - - xstar = q.quikr(input_location, trained_matrix, kmer, lamb) - np.savetxt(output_location, xstar, delimiter=",", fmt="%f") - return xstar - -if __name__ == "__main__": - sys.exit(main()) - diff --git a/quikr b/quikr deleted file mode 100755 index bac01ca..0000000 --- a/quikr +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/python -import sys -import os -import argparse -import quikr -import numpy as np - -def main(): - - parser = argparse.ArgumentParser(description= - "Quikr returns the estimated frequencies of batcteria present when given a \ - input FASTA file. \n \ - A default trained matrix will be used if none is supplied \n \ - You must supply a kmer and default lambda if using a custom trained \ - matrix.") - - parser.add_argument("-f", "--fasta", help="fasta file", required=True) - parser.add_argument("-o", "--output", help="output path (csv output)", required=True) - parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True) - parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000") - parser.add_argument("-k", "--kmer", type=int, required=True, - help="specifies which kmer to use, must be used with a custom trained database") - - - args = parser.parse_args() - - # our default lambda is 10,000 - lamb = 10000 - - # Make sure our input exist - if not os.path.isfile(args.fasta): - parser.error( "Input fasta file not found") - - if not os.path.isfile(args.trained_matrix): - parser.error("Custom trained matrix not found") - - # use alternative lambda - if args.lamb is not None: - lamb = args.lamb - - trained_matrix = quikr.load_trained_matrix_from_file(args.trained_matrix) - xstar = quikr.calculate_estimated_frequencies(args.fasta, trained_matrix, args.kmer, lamb) - - np.savetxt(args.output, xstar, delimiter=",", fmt="%f") - return 0 - -if __name__ == "__main__": - sys.exit(main()) diff --git a/quikr.py b/quikr.py deleted file mode 100755 index 3f8221e..0000000 --- a/quikr.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/python -import os -import sys -import scipy.optimize.nnls -import scipy.sparse -import numpy as np -from subprocess import * -import argparse -import platform -import gzip -import itertools - -def generate_kmers(kmer): - """ This will return a list of kmers seperated by newlines """ - return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer)) - -def isCompressed(filename): - """ This function checks to see if the file is gzipped """ - - f = open(filename, "rb") - - # The first two bytes of a gzipped file are always '1f 8b' - if f.read(2) == '\x1f\x8b': - f.close() - return True - else: - f.close() - return False - - -def train_matrix(input_file_location, kmer): - """ - Takes a input fasta file, and kmer, returns a custom trained matrix - """ - - kmer_file_name = str(kmer) + "mers.txt" - - if not os.path.isfile(kmer_file_name): - print "could not find kmer file" - exit() - - uname = platform.uname()[0] - - if uname == "Linux": - input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE) - elif uname == "Darwin": - input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name]) - - # load and normalize the matrix by dividing each element by the sum of it's column. - # also do some fancy rotations so that it works properly with quikr - matrix = np.loadtxt(input_file.stdout) - - matrix = np.rot90(matrix) - matrix = matrix / matrix.sum(0) - matrix = np.flipud(matrix); - return matrix - - -def load_trained_matrix_from_file(trained_matrix_location): - """ This is a helper function to load our trained matrix and run quikr """ - - if isCompressed(trained_matrix_location): - trained_matrix_file = gzip.open(trained_matrix_location, "rb") - else: - trained_matrix_file = open(trained_matrix_location, "rb") - - trained_matrix = np.load(trained_matrix_file) - - return trained_matrix - - -def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, default_lambda): - """ - input_fasta is the input fasta file to find the estimated frequencies of - trained_matrix is the trained matrix we are using to estimate the species - kmer is the desired k-mer to use - default_lambda is inp - - returns the estimated requencies of bacteria present when given an input - FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity - promoting algorthim is utilized. - - In practice reconstruction is accurate only down to the genus level (not - species or strain). - - """ - - uname = platform.uname()[0] - - # We use the count program to count ____ - if uname == "Linux" and os.path.isfile("./count-linux"): - count_input = Popen(["./count-linux", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) - elif uname == "Darwin" and os.path.isfile("./count-osx"): - count_input = Popen(["count-osx", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) - - - # load the output of our count program and form a probability vector from the counts - counts = np.loadtxt(count_input.stdout) - counts = counts / counts.sum(0) - counts = default_lambda * counts - counts = np.concatenate([np.zeros(1), counts]) - - #form the k-mer sensing matrix - trained_matrix = trained_matrix * default_lambda; - trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix)) - - - xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts) - xstar = xstar / xstar.sum(0) - - return xstar - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/quikr_train b/quikr_train deleted file mode 100755 index 6e599c9..0000000 --- a/quikr_train +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/python -import numpy as np -import quikr -import os -import sys -import gzip -from subprocess import * -import platform -import argparse - -def main(): - """ - You can call this script independently, and will save the - trained matrix as a numpy file. - - example: python quikr-train.py -i input.fasta -k 6 -o trained_matrix.npy - - """ - parser = argparse.ArgumentParser(description= - " quikr_train returns a custom trained matrix that can be used with \ - the quikr function. \n You must supply a kmer. \n ") - - parser.add_argument("-i", "--input", help="training database of sequences (fasta format)", required=True) - parser.add_argument("-o", "--output", help="sensing matrix (text file)", required=True) - parser.add_argument("-k", "--kmer", help="kmer size (integer)", - type=int, required=False ) - parser.add_argument("-z", "--compress", help="compress output (integer)", - action='store_true', required=False) - - args = parser.parse_args() - - if not os.path.isfile(args.input): - parser.error( "Input database not found") - - # call the quikr train function, save the output with np.save - matrix = quikr.train_matrix(args.input, args.kmer) - - if args.compress: - output_file = gzip.open(args.output, "wb") - else: - output_file = open(args.output, "wb") - - np.save(output_file, matrix) - - return 0 - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/generate_kmers b/src/generate_kmers new file mode 100755 index 0000000..38fdf0a --- /dev/null +++ b/src/generate_kmers @@ -0,0 +1,5 @@ +#!/usr/bin/python +import quikr +import sys + +print quikr.generate_kmers(int(sys.argv[1])) diff --git a/src/multifasta_to_otu.py b/src/multifasta_to_otu.py new file mode 100755 index 0000000..c6fb562 --- /dev/null +++ b/src/multifasta_to_otu.py @@ -0,0 +1,154 @@ +#!/usr/bin/python + +from multiprocessing import Pool +from Bio import SeqIO +import multiprocessing +from subprocess import * +import os +import gzip +import quikr as q +import sys +import numpy as np +import argparse +import platform +import csv + +# our defaults +kmer = 6 +lamb = 10000 +output_directory = "" +input_directory = "" + + +def main(): + + global kmer + global input_directory + global output_directory + global lamb + global trained_matrix + + parser = argparse.ArgumentParser(description="MultifastaOTU") + + parser.add_argument("-i", "--input-directory", help="directory containing fasta files", required=True) + parser.add_argument("-o", "--otu-table", help="otu_table", required=True) + parser.add_argument("-t", "--trained-matrix", help="your trained matrix ", required=True) + parser.add_argument("-f", "--trained-fasta", help="the fasta file used to train your matrix", required=True) + parser.add_argument("-d", "--output-directory", help="quikr output directory", required=True) + parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000") + parser.add_argument("-k", "--kmer", type=int, help="specifies which kmer to use, default=6") + parser.add_argument("-j", "--jobs", type=int, help="specifies how many jobs to run at once, default=number of CPUs") + args = parser.parse_args() + + # our defaults + jobs=multiprocessing.cpu_count() + trained_matrix = args.trained_matrix + input_directory = args.input_directory + output_directory = args.output_directory + + # Make sure our input exist + if not os.path.isdir(args.input_directory): + parser.error("Input directory not found") + + if not os.path.isdir(args.output_directory): + print "Output directory not found, creating directory" + os.path.mkdir(args, output_directory) + + if not os.path.isfile(args.trained_matrix): + parser.error("Custom trained matrix not found") + + if not os.path.isfile(args.trained_fasta): + parser.error("Fasta file of trained matrix not found") + + # use alternative lambda + if args.lamb is not None: + lamb = args.lamb + + if args.jobs is not None: + jobs = args.jobs + + if args.kmer is not None: + kmer = args.kmer + + # Load trained matrix + if qu.isCompressed(args.trained_matrix): + trained_matrix_file = gzip.open(args.trained_matrix, "rb") + else: + trained_matrix_file = open(args.trained_matrix, "rb") + + trained_matrix = np.load(trained_matrix_file) + + # Return a list of the input directory + fasta_list = os.listdir(args.input_directory) + + # Queue up and run our quikr functions. + pool = Pool(processes=jobs) + results = pool.map(quikr_call, fasta_list) + + # Create an array of headers + headers = [] + trained_matrix_headers = open(args.trained_fasta, "rU") + for header in SeqIO.parse(trained_matrix_headers, "fasta"): + headers.append(header.id) + trained_matrix_headers.close() + + # create our number of reads matrix + number_of_reads = np.zeros((len(headers), len(fasta_list))) + + # load the keys with values from each fasta result + for fasta, fasta_it in map(None, fasta_list, range(len(fasta_list))): + + count_sequences = Popen(["grep", "-c" , "^>", args.input_directory + fasta], stdout=PIPE) + number_of_sequences = int(count_sequences.stdout.readline()) + + proportions = np.loadtxt(output_directory + fasta); + + for proportion, proportion_it in map(None, proportions, range(len(proportions))): + number_of_reads[proportion_it, fasta_it] = round(proportion * number_of_sequences) + + # remove empty rows from our matrix + final_headers = list() + final_data = list() + for row, header in map(None, number_of_reads, headers): + if row.sum() != 0: + final_headers.append(header) + final_data.append(row) + + # convert from a list back into a numpy array + final_data = np.array(final_data, dtype=int) + + # stack our final header and our number_of_reads matrix + number_of_reads = np.column_stack((final_headers, final_data)) + + # write our OTU table + output_file = open(args.otu_table, "wb") + writer = csv.writer(output_file, delimiter="\t") + + #write out our fasta file row + writer.writerow(['# QIIME vGail OTU table']) + + fasta_row = ['#OTU_ID'] + fasta_row.append(' '.join(fasta_list)) + fasta_row = [' '.join(fasta_row)] + writer.writerow(fasta_row) + + # write out our results + for i in range(0, np.shape(number_of_reads)[0]): + writer.writerow(list(number_of_reads[i])) + + output_file.close() + + return 0 + + +def quikr_call(fasta_file): + input_location = input_directory + fasta_file + output_location = output_directory + os.path.basename(fasta_file) + + xstar = q.quikr(input_location, trained_matrix, kmer, lamb) + np.savetxt(output_location, xstar, delimiter=",", fmt="%f") + return xstar + +if __name__ == "__main__": + sys.exit(main()) + diff --git a/src/quikr b/src/quikr new file mode 100755 index 0000000..bac01ca --- /dev/null +++ b/src/quikr @@ -0,0 +1,48 @@ +#!/usr/bin/python +import sys +import os +import argparse +import quikr +import numpy as np + +def main(): + + parser = argparse.ArgumentParser(description= + "Quikr returns the estimated frequencies of batcteria present when given a \ + input FASTA file. \n \ + A default trained matrix will be used if none is supplied \n \ + You must supply a kmer and default lambda if using a custom trained \ + matrix.") + + parser.add_argument("-f", "--fasta", help="fasta file", required=True) + parser.add_argument("-o", "--output", help="output path (csv output)", required=True) + parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True) + parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000") + parser.add_argument("-k", "--kmer", type=int, required=True, + help="specifies which kmer to use, must be used with a custom trained database") + + + args = parser.parse_args() + + # our default lambda is 10,000 + lamb = 10000 + + # Make sure our input exist + if not os.path.isfile(args.fasta): + parser.error( "Input fasta file not found") + + if not os.path.isfile(args.trained_matrix): + parser.error("Custom trained matrix not found") + + # use alternative lambda + if args.lamb is not None: + lamb = args.lamb + + trained_matrix = quikr.load_trained_matrix_from_file(args.trained_matrix) + xstar = quikr.calculate_estimated_frequencies(args.fasta, trained_matrix, args.kmer, lamb) + + np.savetxt(args.output, xstar, delimiter=",", fmt="%f") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/quikr.py b/src/quikr.py new file mode 100755 index 0000000..3f8221e --- /dev/null +++ b/src/quikr.py @@ -0,0 +1,115 @@ +#!/usr/bin/python +import os +import sys +import scipy.optimize.nnls +import scipy.sparse +import numpy as np +from subprocess import * +import argparse +import platform +import gzip +import itertools + +def generate_kmers(kmer): + """ This will return a list of kmers seperated by newlines """ + return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer)) + +def isCompressed(filename): + """ This function checks to see if the file is gzipped """ + + f = open(filename, "rb") + + # The first two bytes of a gzipped file are always '1f 8b' + if f.read(2) == '\x1f\x8b': + f.close() + return True + else: + f.close() + return False + + +def train_matrix(input_file_location, kmer): + """ + Takes a input fasta file, and kmer, returns a custom trained matrix + """ + + kmer_file_name = str(kmer) + "mers.txt" + + if not os.path.isfile(kmer_file_name): + print "could not find kmer file" + exit() + + uname = platform.uname()[0] + + if uname == "Linux": + input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE) + elif uname == "Darwin": + input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name]) + + # load and normalize the matrix by dividing each element by the sum of it's column. + # also do some fancy rotations so that it works properly with quikr + matrix = np.loadtxt(input_file.stdout) + + matrix = np.rot90(matrix) + matrix = matrix / matrix.sum(0) + matrix = np.flipud(matrix); + return matrix + + +def load_trained_matrix_from_file(trained_matrix_location): + """ This is a helper function to load our trained matrix and run quikr """ + + if isCompressed(trained_matrix_location): + trained_matrix_file = gzip.open(trained_matrix_location, "rb") + else: + trained_matrix_file = open(trained_matrix_location, "rb") + + trained_matrix = np.load(trained_matrix_file) + + return trained_matrix + + +def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, default_lambda): + """ + input_fasta is the input fasta file to find the estimated frequencies of + trained_matrix is the trained matrix we are using to estimate the species + kmer is the desired k-mer to use + default_lambda is inp + + returns the estimated requencies of bacteria present when given an input + FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity + promoting algorthim is utilized. + + In practice reconstruction is accurate only down to the genus level (not + species or strain). + + """ + + uname = platform.uname()[0] + + # We use the count program to count ____ + if uname == "Linux" and os.path.isfile("./count-linux"): + count_input = Popen(["./count-linux", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) + elif uname == "Darwin" and os.path.isfile("./count-osx"): + count_input = Popen(["count-osx", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) + + + # load the output of our count program and form a probability vector from the counts + counts = np.loadtxt(count_input.stdout) + counts = counts / counts.sum(0) + counts = default_lambda * counts + counts = np.concatenate([np.zeros(1), counts]) + + #form the k-mer sensing matrix + trained_matrix = trained_matrix * default_lambda; + trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix)) + + + xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts) + xstar = xstar / xstar.sum(0) + + return xstar + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/quikr_train b/src/quikr_train new file mode 100755 index 0000000..6e599c9 --- /dev/null +++ b/src/quikr_train @@ -0,0 +1,48 @@ +#!/usr/bin/python +import numpy as np +import quikr +import os +import sys +import gzip +from subprocess import * +import platform +import argparse + +def main(): + """ + You can call this script independently, and will save the + trained matrix as a numpy file. + + example: python quikr-train.py -i input.fasta -k 6 -o trained_matrix.npy + + """ + parser = argparse.ArgumentParser(description= + " quikr_train returns a custom trained matrix that can be used with \ + the quikr function. \n You must supply a kmer. \n ") + + parser.add_argument("-i", "--input", help="training database of sequences (fasta format)", required=True) + parser.add_argument("-o", "--output", help="sensing matrix (text file)", required=True) + parser.add_argument("-k", "--kmer", help="kmer size (integer)", + type=int, required=False ) + parser.add_argument("-z", "--compress", help="compress output (integer)", + action='store_true', required=False) + + args = parser.parse_args() + + if not os.path.isfile(args.input): + parser.error( "Input database not found") + + # call the quikr train function, save the output with np.save + matrix = quikr.train_matrix(args.input, args.kmer) + + if args.compress: + output_file = gzip.open(args.output, "wb") + else: + output_file = open(args.output, "wb") + + np.save(output_file, matrix) + + return 0 + +if __name__ == "__main__": + sys.exit(main()) -- cgit v1.2.3