Added OTU table output and grep parsing

A giant batch commit. Sorry! * OTU table output * results are still different from matlab * sample names still have suffix * otherwise should be ok * only return rows which sum up to greater than zero (ie non empty) * Use grep instead of BIOseq because it is a lot faster to count our sequences this way. * re-enable our pool process management * refactor and rename records to headers * refactor final_output to output
author: Calvin <calvin@EESI> 2013-03-05 16:47:27 -0500
committer: Calvin <calvin@EESI> 2013-03-05 16:47:27 -0500
commit: 991f0940eeb412b67cfe36a09701b09d03367c93 (patch)
tree: e55a1bfac15c7243e98fe3c29b863ed85a706a10
parent: e36cc91bad7bfdb73c5cbca73f4f2e07d870f7cc (diff)
1 files changed, 50 insertions, 24 deletions
diff --git a/multifasta_to_otu.py b/multifasta_to_otu.py
index ac883ca..9ce5f48 100755
--- a/multifasta_to_otu.py
+++ b/multifasta_to_otu.py
@@ -1,7 +1,9 @@
 #!/usr/bin/python
+
 from multiprocessing import Pool
 from Bio import SeqIO
 import multiprocessing
+from subprocess import *
 import os
 import quikr_train as qt
 import quikr as q
@@ -9,6 +11,7 @@ import sys
 import numpy as np
 import argparse
 import platform
+import csv
 
 # our defaults
 kmer = 6
@@ -73,48 +76,71 @@ def main():
   fasta_list = os.listdir(args.input_directory)
 
   # Queue up and run our quikr functions.
-#  pool = Pool(processes=jobs)
-#  results = pool.map(quikr_call, fasta_list)
+  pool = Pool(processes=jobs)
+  results = pool.map(quikr_call, fasta_list)
 
   # Create an array of headers
-  records = []
+  headers = []
   trained_matrix_headers = open(args.trained_fasta, "rU")
-  for record in SeqIO.parse(trained_matrix_headers, "fasta"):
-    records.append(record.id)
+  for header in SeqIO.parse(trained_matrix_headers, "fasta"):
+    headers.append(header.id)
   trained_matrix_headers.close()
 
-  final_output = np.zeros((len(records), len(fasta_list)))
-  print len(fasta_list)
+
+  output = np.zeros((len(headers), len(fasta_list)))
 
   # load the keys with values from each fasta result
   for fasta, fasta_it in map(None, fasta_list, range(len(fasta_list))):
-    fasta_file = open(input_directory + fasta, "rU")
-    sequences = list(SeqIO.parse(fasta_file, "fasta"))
-    number_of_sequences = len(sequences)
-    fasta_file.close()
 
-    print number_of_sequences
-    
+    count_sequences = Popen(["grep", "-c" ,"^>", args.input_directory + fasta], stdout=PIPE) 
+    number_of_sequences = int(count_sequences.stdout.readline())
+
     proportions = np.loadtxt(output_directory + fasta);
+    
     for proportion, proportion_it in map(None, proportions, range(len(proportions))):
-     if(round(proportion * number_of_sequences) is not 0):
-        print str(fasta_it) + " " + str(proportion_it)
-        final_output[fasta_it, proportion_it] = proportion * number_of_sequences
-  
-  np.savetxt(args.otu_table, final_output, delimiter=",", fmt="%d")
-      
+      output[proportion_it, fasta_it] = round(proportion * number_of_sequences)
+
+  # remove empty rows from our matrix
+  final_headers = list()
+  final_data = list()
+  for row, header in map(None, output, headers):
+    if row.sum() != 0:
+      final_headers.append(header)
+      final_data.append(row)
 
+  # convert from a list back into a numpy array
+  final_data = np.array(final_data, dtype=int)
 
+  # stack our final header and our output matrix
+  output = np.column_stack((final_headers, final_data))
+
+  # write our OTU table
+  output_file = open(args.otu_table, "wb") 
+  writer = csv.writer(output_file, delimiter="\t")
+
+  #write out our fasta file row
+  writer.writerow(['# QIIME vGail OTU table'])
+  fasta_row = ['#OTU_ID']
+  fasta_list
+  fasta_row.append(' '.join(fasta_list))
+  fasta_row = [' '.join(fasta_row)]
+  writer.writerow(fasta_row)
+
+  # write out our results
+  for i in range(1, np.shape(output)[0]):
+      writer.writerow(list(output[i]))
+
+  output_file.close()
 
-  # Write the otu table
   return 0
 
+
 def quikr_call(fasta_file):
-  inp = input_directory + fasta_file
-  output = output_directory + os.path.basename(fasta_file)
+  input_location = input_directory + fasta_file
+  output_location = output_directory + os.path.basename(fasta_file)
 
-  xstar = q.quikr(inp, trained_matrix, kmer, lamb)
-  np.savetxt(output, xstar, delimiter=",", fmt="%f")
+  xstar = q.quikr(input_location, trained_matrix, kmer, lamb)
+  np.savetxt(output_location, xstar, delimiter=",", fmt="%f")
   return xstar
 
 if __name__ == "__main__":
author	Calvin <calvin@EESI>	2013-03-05 16:47:27 -0500
committer	Calvin <calvin@EESI>	2013-03-05 16:47:27 -0500
commit	991f0940eeb412b67cfe36a09701b09d03367c93 (patch)
tree	e55a1bfac15c7243e98fe3c29b863ed85a706a10
parent	e36cc91bad7bfdb73c5cbca73f4f2e07d870f7cc (diff)