diff options
Diffstat (limited to 'src/python')
-rwxr-xr-x | src/python/multifasta_to_otu | 34 | ||||
-rwxr-xr-x | src/python/quikr | 16 | ||||
-rwxr-xr-x | src/python/quikr.py | 26 |
3 files changed, 36 insertions, 40 deletions
diff --git a/src/python/multifasta_to_otu b/src/python/multifasta_to_otu index 3cc8f3e..431cc74 100755 --- a/src/python/multifasta_to_otu +++ b/src/python/multifasta_to_otu @@ -26,14 +26,14 @@ def main(): global input_directory global output_directory global lamb - global trained_matrix + global sensing_matrix parser = argparse.ArgumentParser(description="MultifastaOTU") parser.add_argument("-i", "--input-directory", help="directory containing fasta files", required=True) parser.add_argument("-o", "--otu-table", help="otu_table", required=True) - parser.add_argument("-t", "--trained-matrix", help="your trained matrix ", required=True) - parser.add_argument("-f", "--trained-fasta", help="the fasta file used to train your matrix", required=True) + parser.add_argument("-s", "--sensing-matrix", help="your sensing matrix ", required=True) + parser.add_argument("-f", "--sensing-fasta", help="the fasta file used to train your matrix", required=True) parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000") parser.add_argument("-k", "--kmer", type=int, help="specifies which kmer to use, default=6") parser.add_argument("-j", "--jobs", type=int, help="specifies how many jobs to run at once, default=number of CPUs") @@ -41,18 +41,18 @@ def main(): # our defaults jobs = multiprocessing.cpu_count() - trained_matrix = args.trained_matrix + sensing_matrix = args.sensing_matrix input_directory = args.input_directory # Make sure our input exist if not os.path.isdir(args.input_directory): parser.error("Input directory not found") - if not os.path.isfile(args.trained_matrix): - parser.error("Custom trained matrix not found") + if not os.path.isfile(args.sensing_matrix): + parser.error("Custom sensing matrix not found") - if not os.path.isfile(args.trained_fasta): - parser.error("Fasta file of trained matrix not found") + if not os.path.isfile(args.sensing_fasta): + parser.error("Fasta file of sensing matrix not found") # use alternative lambda if args.lamb is not None: @@ -64,13 +64,13 @@ def main(): if args.kmer is not None: kmer = args.kmer - # Load trained matrix - if q.is_compressed(args.trained_matrix): - trained_matrix_file = gzip.open(args.trained_matrix, "rb") + # Load sensing matrix + if q.is_compressed(args.sensing_matrix): + sensing_matrix_file = gzip.open(args.sensing_matrix, "rb") else: - trained_matrix_file = open(args.trained_matrix, "rb") + sensing_matrix_file = open(args.sensing_matrix, "rb") - trained_matrix = np.load(trained_matrix_file) + sensing_matrix = np.load(sensing_matrix_file) fasta_list = [] @@ -89,10 +89,10 @@ def main(): # Create an array of headers headers = [] - trained_matrix_headers = open(args.trained_fasta, "rU") - for header in SeqIO.parse(trained_matrix_headers, "fasta"): + sensing_matrix_headers = open(args.sensing_fasta, "rU") + for header in SeqIO.parse(sensing_matrix_headers, "fasta"): headers.append(header.id) - trained_matrix_headers.close() + sensing_matrix_headers.close() # create our number of reads matrix number_of_reads = np.zeros((len(headers), len(fasta_list))) @@ -148,7 +148,7 @@ def main(): def quikr_call(fasta_file): print os.path.basename(fasta_file) - xstar = q.calculate_estimated_frequencies(fasta_file, trained_matrix, kmer, lamb) + xstar = q.calculate_estimated_frequencies(fasta_file, sensing_matrix, kmer, lamb) return xstar if __name__ == "__main__": diff --git a/src/python/quikr b/src/python/quikr index bae2b9f..7c9ce25 100755 --- a/src/python/quikr +++ b/src/python/quikr @@ -9,15 +9,11 @@ def main(): parser = argparse.ArgumentParser(description= "Quikr returns the estimated frequencies of batcteria present when given a \ - input FASTA file. \n \ - A default trained matrix will be used if none is supplied \n \ - You must supply a kmer and default lambda if using a custom trained \ - matrix.") - + input FASTA file. \n") parser.add_argument("-f", "--fasta", help="the sample's fasta file of NGS READS", required=True) parser.add_argument("-o", "--output", help="OTU_FRACTION_PRESENT, a vector \ representing the percentage of database sequence's presence in a sequence. (csv output)", required=True) - parser.add_argument("-t", "--trained-matrix", help="the trained matrix", required=True) + parser.add_argument("-s", "--sensing-matrix", help="the sensing matrix", required=True) parser.add_argument("-l", "--lamb", type=int, help="the lambda size. (default is 10,000") parser.add_argument("-k", "--kmer", type=int, required=True, help="specifies the size of the kmer to use (the default is 6)") @@ -33,8 +29,8 @@ def main(): if not os.path.isfile(args.fasta): parser.error( "Input fasta file not found") - if not os.path.isfile(args.trained_matrix): - parser.error("Custom trained matrix not found") + if not os.path.isfile(args.sensing_matrix): + parser.error("Custom sensing matrix not found") # use alternative lambda if args.lamb is not None: @@ -44,8 +40,8 @@ def main(): if args.kmer is not None: kmer = args.kmer - trained_matrix = quikr.load_trained_matrix_from_file(args.trained_matrix) - xstar = quikr.calculate_estimated_frequencies(args.fasta, trained_matrix, kmer, lamb) + sensing_matrix = quikr.load_sensing_matrix_from_file(args.sensing_matrix) + xstar = quikr.calculate_estimated_frequencies(args.fasta, sensing_matrix, kmer, lamb) np.savetxt(args.output, xstar, delimiter=",", fmt="%f") return 0 diff --git a/src/python/quikr.py b/src/python/quikr.py index 98cc953..3c2b23a 100755 --- a/src/python/quikr.py +++ b/src/python/quikr.py @@ -39,7 +39,7 @@ def is_compressed(filename): def train_matrix(input_file_location, kmer): """ - Takes a input fasta file, and kmer, returns a custom trained matrix + Takes a input fasta file, and kmer, returns a custom sensing matrix """ input_file = Popen(["bash", "-c", "probabilities-by-read " + str(kmer) + " " + input_file_location + " <(generate_kmers 6)"], stdout=PIPE) @@ -54,23 +54,23 @@ def train_matrix(input_file_location, kmer): return matrix -def load_trained_matrix_from_file(trained_matrix_location): - """ This is a helper function to load our trained matrix and run quikr """ +def load_sensing_matrix_from_file(sensing_matrix_location): + """ This is a helper function to load our sensing matrix and run quikr """ - if is_compressed(trained_matrix_location): - trained_matrix_file = gzip.open(trained_matrix_location, "rb") + if is_compressed(sensing_matrix_location): + sensing_matrix_file = gzip.open(sensing_matrix_location, "rb") else: - trained_matrix_file = open(trained_matrix_location, "rb") + sensing_matrix_file = open(sensing_matrix_location, "rb") - trained_matrix = np.load(trained_matrix_file) + sensing_matrix = np.load(sensing_matrix_file) - return trained_matrix + return sensing_matrix -def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, default_lambda): +def calculate_estimated_frequencies(input_fasta_location, sensing_matrix, kmer, default_lambda): """ input_fasta is the input fasta file to find the estimated frequencies of - trained_matrix is the trained matrix we are using to estimate the species + sensing_matrix is the sensing matrix we are using to estimate the species kmer is the desired k-mer to use default_lambda is inp @@ -93,10 +93,10 @@ def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, counts = np.concatenate([np.zeros(1), counts]) #form the k-mer sensing matrix - trained_matrix = trained_matrix * default_lambda; - trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix)) + sensing_matrix = sensing_matrix * default_lambda; + sensing_matrix = np.vstack((np.ones(sensing_matrix.shape[1]), sensing_matrix)) - xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts) + xstar, rnorm = scipy.optimize.nnls(sensing_matrix, counts) xstar = xstar / xstar.sum(0) return xstar |