From a200b2df4212521b7e006f17979d7c4c2d4b8ca9 Mon Sep 17 00:00:00 2001 From: Calvin Date: Tue, 12 Mar 2013 14:14:59 -0400 Subject: removed multifasta's py extension, modified the --help argument output for quikr_train --- src/python/multifasta_to_otu | 154 ++++++++++++++++++++++++++++++++++++++++ src/python/multifasta_to_otu.py | 154 ---------------------------------------- src/python/quikr_train | 2 +- 3 files changed, 155 insertions(+), 155 deletions(-) create mode 100755 src/python/multifasta_to_otu delete mode 100755 src/python/multifasta_to_otu.py diff --git a/src/python/multifasta_to_otu b/src/python/multifasta_to_otu new file mode 100755 index 0000000..c6fb562 --- /dev/null +++ b/src/python/multifasta_to_otu @@ -0,0 +1,154 @@ +#!/usr/bin/python + +from multiprocessing import Pool +from Bio import SeqIO +import multiprocessing +from subprocess import * +import os +import gzip +import quikr as q +import sys +import numpy as np +import argparse +import platform +import csv + +# our defaults +kmer = 6 +lamb = 10000 +output_directory = "" +input_directory = "" + + +def main(): + + global kmer + global input_directory + global output_directory + global lamb + global trained_matrix + + parser = argparse.ArgumentParser(description="MultifastaOTU") + + parser.add_argument("-i", "--input-directory", help="directory containing fasta files", required=True) + parser.add_argument("-o", "--otu-table", help="otu_table", required=True) + parser.add_argument("-t", "--trained-matrix", help="your trained matrix ", required=True) + parser.add_argument("-f", "--trained-fasta", help="the fasta file used to train your matrix", required=True) + parser.add_argument("-d", "--output-directory", help="quikr output directory", required=True) + parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000") + parser.add_argument("-k", "--kmer", type=int, help="specifies which kmer to use, default=6") + parser.add_argument("-j", "--jobs", type=int, help="specifies how many jobs to run at once, default=number of CPUs") + args = parser.parse_args() + + # our defaults + jobs=multiprocessing.cpu_count() + trained_matrix = args.trained_matrix + input_directory = args.input_directory + output_directory = args.output_directory + + # Make sure our input exist + if not os.path.isdir(args.input_directory): + parser.error("Input directory not found") + + if not os.path.isdir(args.output_directory): + print "Output directory not found, creating directory" + os.path.mkdir(args, output_directory) + + if not os.path.isfile(args.trained_matrix): + parser.error("Custom trained matrix not found") + + if not os.path.isfile(args.trained_fasta): + parser.error("Fasta file of trained matrix not found") + + # use alternative lambda + if args.lamb is not None: + lamb = args.lamb + + if args.jobs is not None: + jobs = args.jobs + + if args.kmer is not None: + kmer = args.kmer + + # Load trained matrix + if qu.isCompressed(args.trained_matrix): + trained_matrix_file = gzip.open(args.trained_matrix, "rb") + else: + trained_matrix_file = open(args.trained_matrix, "rb") + + trained_matrix = np.load(trained_matrix_file) + + # Return a list of the input directory + fasta_list = os.listdir(args.input_directory) + + # Queue up and run our quikr functions. + pool = Pool(processes=jobs) + results = pool.map(quikr_call, fasta_list) + + # Create an array of headers + headers = [] + trained_matrix_headers = open(args.trained_fasta, "rU") + for header in SeqIO.parse(trained_matrix_headers, "fasta"): + headers.append(header.id) + trained_matrix_headers.close() + + # create our number of reads matrix + number_of_reads = np.zeros((len(headers), len(fasta_list))) + + # load the keys with values from each fasta result + for fasta, fasta_it in map(None, fasta_list, range(len(fasta_list))): + + count_sequences = Popen(["grep", "-c" , "^>", args.input_directory + fasta], stdout=PIPE) + number_of_sequences = int(count_sequences.stdout.readline()) + + proportions = np.loadtxt(output_directory + fasta); + + for proportion, proportion_it in map(None, proportions, range(len(proportions))): + number_of_reads[proportion_it, fasta_it] = round(proportion * number_of_sequences) + + # remove empty rows from our matrix + final_headers = list() + final_data = list() + for row, header in map(None, number_of_reads, headers): + if row.sum() != 0: + final_headers.append(header) + final_data.append(row) + + # convert from a list back into a numpy array + final_data = np.array(final_data, dtype=int) + + # stack our final header and our number_of_reads matrix + number_of_reads = np.column_stack((final_headers, final_data)) + + # write our OTU table + output_file = open(args.otu_table, "wb") + writer = csv.writer(output_file, delimiter="\t") + + #write out our fasta file row + writer.writerow(['# QIIME vGail OTU table']) + + fasta_row = ['#OTU_ID'] + fasta_row.append(' '.join(fasta_list)) + fasta_row = [' '.join(fasta_row)] + writer.writerow(fasta_row) + + # write out our results + for i in range(0, np.shape(number_of_reads)[0]): + writer.writerow(list(number_of_reads[i])) + + output_file.close() + + return 0 + + +def quikr_call(fasta_file): + input_location = input_directory + fasta_file + output_location = output_directory + os.path.basename(fasta_file) + + xstar = q.quikr(input_location, trained_matrix, kmer, lamb) + np.savetxt(output_location, xstar, delimiter=",", fmt="%f") + return xstar + +if __name__ == "__main__": + sys.exit(main()) + diff --git a/src/python/multifasta_to_otu.py b/src/python/multifasta_to_otu.py deleted file mode 100755 index c6fb562..0000000 --- a/src/python/multifasta_to_otu.py +++ /dev/null @@ -1,154 +0,0 @@ -#!/usr/bin/python - -from multiprocessing import Pool -from Bio import SeqIO -import multiprocessing -from subprocess import * -import os -import gzip -import quikr as q -import sys -import numpy as np -import argparse -import platform -import csv - -# our defaults -kmer = 6 -lamb = 10000 -output_directory = "" -input_directory = "" - - -def main(): - - global kmer - global input_directory - global output_directory - global lamb - global trained_matrix - - parser = argparse.ArgumentParser(description="MultifastaOTU") - - parser.add_argument("-i", "--input-directory", help="directory containing fasta files", required=True) - parser.add_argument("-o", "--otu-table", help="otu_table", required=True) - parser.add_argument("-t", "--trained-matrix", help="your trained matrix ", required=True) - parser.add_argument("-f", "--trained-fasta", help="the fasta file used to train your matrix", required=True) - parser.add_argument("-d", "--output-directory", help="quikr output directory", required=True) - parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000") - parser.add_argument("-k", "--kmer", type=int, help="specifies which kmer to use, default=6") - parser.add_argument("-j", "--jobs", type=int, help="specifies how many jobs to run at once, default=number of CPUs") - args = parser.parse_args() - - # our defaults - jobs=multiprocessing.cpu_count() - trained_matrix = args.trained_matrix - input_directory = args.input_directory - output_directory = args.output_directory - - # Make sure our input exist - if not os.path.isdir(args.input_directory): - parser.error("Input directory not found") - - if not os.path.isdir(args.output_directory): - print "Output directory not found, creating directory" - os.path.mkdir(args, output_directory) - - if not os.path.isfile(args.trained_matrix): - parser.error("Custom trained matrix not found") - - if not os.path.isfile(args.trained_fasta): - parser.error("Fasta file of trained matrix not found") - - # use alternative lambda - if args.lamb is not None: - lamb = args.lamb - - if args.jobs is not None: - jobs = args.jobs - - if args.kmer is not None: - kmer = args.kmer - - # Load trained matrix - if qu.isCompressed(args.trained_matrix): - trained_matrix_file = gzip.open(args.trained_matrix, "rb") - else: - trained_matrix_file = open(args.trained_matrix, "rb") - - trained_matrix = np.load(trained_matrix_file) - - # Return a list of the input directory - fasta_list = os.listdir(args.input_directory) - - # Queue up and run our quikr functions. - pool = Pool(processes=jobs) - results = pool.map(quikr_call, fasta_list) - - # Create an array of headers - headers = [] - trained_matrix_headers = open(args.trained_fasta, "rU") - for header in SeqIO.parse(trained_matrix_headers, "fasta"): - headers.append(header.id) - trained_matrix_headers.close() - - # create our number of reads matrix - number_of_reads = np.zeros((len(headers), len(fasta_list))) - - # load the keys with values from each fasta result - for fasta, fasta_it in map(None, fasta_list, range(len(fasta_list))): - - count_sequences = Popen(["grep", "-c" , "^>", args.input_directory + fasta], stdout=PIPE) - number_of_sequences = int(count_sequences.stdout.readline()) - - proportions = np.loadtxt(output_directory + fasta); - - for proportion, proportion_it in map(None, proportions, range(len(proportions))): - number_of_reads[proportion_it, fasta_it] = round(proportion * number_of_sequences) - - # remove empty rows from our matrix - final_headers = list() - final_data = list() - for row, header in map(None, number_of_reads, headers): - if row.sum() != 0: - final_headers.append(header) - final_data.append(row) - - # convert from a list back into a numpy array - final_data = np.array(final_data, dtype=int) - - # stack our final header and our number_of_reads matrix - number_of_reads = np.column_stack((final_headers, final_data)) - - # write our OTU table - output_file = open(args.otu_table, "wb") - writer = csv.writer(output_file, delimiter="\t") - - #write out our fasta file row - writer.writerow(['# QIIME vGail OTU table']) - - fasta_row = ['#OTU_ID'] - fasta_row.append(' '.join(fasta_list)) - fasta_row = [' '.join(fasta_row)] - writer.writerow(fasta_row) - - # write out our results - for i in range(0, np.shape(number_of_reads)[0]): - writer.writerow(list(number_of_reads[i])) - - output_file.close() - - return 0 - - -def quikr_call(fasta_file): - input_location = input_directory + fasta_file - output_location = output_directory + os.path.basename(fasta_file) - - xstar = q.quikr(input_location, trained_matrix, kmer, lamb) - np.savetxt(output_location, xstar, delimiter=",", fmt="%f") - return xstar - -if __name__ == "__main__": - sys.exit(main()) - diff --git a/src/python/quikr_train b/src/python/quikr_train index bf74e12..167a0e4 100755 --- a/src/python/quikr_train +++ b/src/python/quikr_train @@ -24,7 +24,7 @@ def main(): parser.add_argument("-o", "--output", help="sensing matrix (text file)", required=True) parser.add_argument("-k", "--kmer", help="kmer size (integer)", type=int, required=False ) - parser.add_argument("-z", "--compress", help="compress output (integer)", + parser.add_argument("-z", "--compress", help="compress output (flag)", action='store_true', required=False) args = parser.parse_args() -- cgit v1.2.3