diff options
author | Calvin <calvin@EESI> | 2013-03-05 16:47:27 -0500 |
---|---|---|
committer | Calvin <calvin@EESI> | 2013-03-05 16:47:27 -0500 |
commit | 991f0940eeb412b67cfe36a09701b09d03367c93 (patch) | |
tree | e55a1bfac15c7243e98fe3c29b863ed85a706a10 | |
parent | e36cc91bad7bfdb73c5cbca73f4f2e07d870f7cc (diff) |
Added OTU table output and grep parsing
A giant batch commit. Sorry!
* OTU table output
* results are still different from matlab
* sample names still have suffix
* otherwise should be ok
* only return rows which sum up to greater than zero (ie non empty)
* Use grep instead of BIOseq because it is a lot faster to count our
sequences this way.
* re-enable our pool process management
* refactor and rename records to headers
* refactor final_output to output
-rwxr-xr-x | multifasta_to_otu.py | 74 |
1 files changed, 50 insertions, 24 deletions
diff --git a/multifasta_to_otu.py b/multifasta_to_otu.py index ac883ca..9ce5f48 100755 --- a/multifasta_to_otu.py +++ b/multifasta_to_otu.py @@ -1,7 +1,9 @@ #!/usr/bin/python + from multiprocessing import Pool from Bio import SeqIO import multiprocessing +from subprocess import * import os import quikr_train as qt import quikr as q @@ -9,6 +11,7 @@ import sys import numpy as np import argparse import platform +import csv # our defaults kmer = 6 @@ -73,48 +76,71 @@ def main(): fasta_list = os.listdir(args.input_directory) # Queue up and run our quikr functions. -# pool = Pool(processes=jobs) -# results = pool.map(quikr_call, fasta_list) + pool = Pool(processes=jobs) + results = pool.map(quikr_call, fasta_list) # Create an array of headers - records = [] + headers = [] trained_matrix_headers = open(args.trained_fasta, "rU") - for record in SeqIO.parse(trained_matrix_headers, "fasta"): - records.append(record.id) + for header in SeqIO.parse(trained_matrix_headers, "fasta"): + headers.append(header.id) trained_matrix_headers.close() - final_output = np.zeros((len(records), len(fasta_list))) - print len(fasta_list) + + output = np.zeros((len(headers), len(fasta_list))) # load the keys with values from each fasta result for fasta, fasta_it in map(None, fasta_list, range(len(fasta_list))): - fasta_file = open(input_directory + fasta, "rU") - sequences = list(SeqIO.parse(fasta_file, "fasta")) - number_of_sequences = len(sequences) - fasta_file.close() - print number_of_sequences - + count_sequences = Popen(["grep", "-c" ,"^>", args.input_directory + fasta], stdout=PIPE) + number_of_sequences = int(count_sequences.stdout.readline()) + proportions = np.loadtxt(output_directory + fasta); + for proportion, proportion_it in map(None, proportions, range(len(proportions))): - if(round(proportion * number_of_sequences) is not 0): - print str(fasta_it) + " " + str(proportion_it) - final_output[fasta_it, proportion_it] = proportion * number_of_sequences - - np.savetxt(args.otu_table, final_output, delimiter=",", fmt="%d") - + output[proportion_it, fasta_it] = round(proportion * number_of_sequences) + + # remove empty rows from our matrix + final_headers = list() + final_data = list() + for row, header in map(None, output, headers): + if row.sum() != 0: + final_headers.append(header) + final_data.append(row) + # convert from a list back into a numpy array + final_data = np.array(final_data, dtype=int) + # stack our final header and our output matrix + output = np.column_stack((final_headers, final_data)) + + # write our OTU table + output_file = open(args.otu_table, "wb") + writer = csv.writer(output_file, delimiter="\t") + + #write out our fasta file row + writer.writerow(['# QIIME vGail OTU table']) + fasta_row = ['#OTU_ID'] + fasta_list + fasta_row.append(' '.join(fasta_list)) + fasta_row = [' '.join(fasta_row)] + writer.writerow(fasta_row) + + # write out our results + for i in range(1, np.shape(output)[0]): + writer.writerow(list(output[i])) + + output_file.close() - # Write the otu table return 0 + def quikr_call(fasta_file): - inp = input_directory + fasta_file - output = output_directory + os.path.basename(fasta_file) + input_location = input_directory + fasta_file + output_location = output_directory + os.path.basename(fasta_file) - xstar = q.quikr(inp, trained_matrix, kmer, lamb) - np.savetxt(output, xstar, delimiter=",", fmt="%f") + xstar = q.quikr(input_location, trained_matrix, kmer, lamb) + np.savetxt(output_location, xstar, delimiter=",", fmt="%f") return xstar if __name__ == "__main__": |