diff options
| author | Calvin <calvin@EESI> | 2013-03-05 16:47:27 -0500 | 
|---|---|---|
| committer | Calvin <calvin@EESI> | 2013-03-05 16:47:27 -0500 | 
| commit | 991f0940eeb412b67cfe36a09701b09d03367c93 (patch) | |
| tree | e55a1bfac15c7243e98fe3c29b863ed85a706a10 | |
| parent | e36cc91bad7bfdb73c5cbca73f4f2e07d870f7cc (diff) | |
Added OTU table output and grep parsing
A giant batch commit. Sorry!
  * OTU table output
    * results are still different from matlab
    * sample names still have suffix
    * otherwise should be ok
    * only return rows which sum up to greater than zero (ie non empty)
  * Use grep instead of BIOseq because it is a lot faster to count our
    sequences this way.
  * re-enable our pool process management
  * refactor and rename records to headers
  * refactor final_output to output
| -rwxr-xr-x | multifasta_to_otu.py | 74 | 
1 files changed, 50 insertions, 24 deletions
| diff --git a/multifasta_to_otu.py b/multifasta_to_otu.py index ac883ca..9ce5f48 100755 --- a/multifasta_to_otu.py +++ b/multifasta_to_otu.py @@ -1,7 +1,9 @@  #!/usr/bin/python +  from multiprocessing import Pool  from Bio import SeqIO  import multiprocessing +from subprocess import *  import os  import quikr_train as qt  import quikr as q @@ -9,6 +11,7 @@ import sys  import numpy as np  import argparse  import platform +import csv  # our defaults  kmer = 6 @@ -73,48 +76,71 @@ def main():    fasta_list = os.listdir(args.input_directory)    # Queue up and run our quikr functions. -#  pool = Pool(processes=jobs) -#  results = pool.map(quikr_call, fasta_list) +  pool = Pool(processes=jobs) +  results = pool.map(quikr_call, fasta_list)    # Create an array of headers -  records = [] +  headers = []    trained_matrix_headers = open(args.trained_fasta, "rU") -  for record in SeqIO.parse(trained_matrix_headers, "fasta"): -    records.append(record.id) +  for header in SeqIO.parse(trained_matrix_headers, "fasta"): +    headers.append(header.id)    trained_matrix_headers.close() -  final_output = np.zeros((len(records), len(fasta_list))) -  print len(fasta_list) + +  output = np.zeros((len(headers), len(fasta_list)))    # load the keys with values from each fasta result    for fasta, fasta_it in map(None, fasta_list, range(len(fasta_list))): -    fasta_file = open(input_directory + fasta, "rU") -    sequences = list(SeqIO.parse(fasta_file, "fasta")) -    number_of_sequences = len(sequences) -    fasta_file.close() -    print number_of_sequences -     +    count_sequences = Popen(["grep", "-c" ,"^>", args.input_directory + fasta], stdout=PIPE)  +    number_of_sequences = int(count_sequences.stdout.readline()) +      proportions = np.loadtxt(output_directory + fasta); +          for proportion, proportion_it in map(None, proportions, range(len(proportions))): -     if(round(proportion * number_of_sequences) is not 0): -        print str(fasta_it) + " " + str(proportion_it) -        final_output[fasta_it, proportion_it] = proportion * number_of_sequences -   -  np.savetxt(args.otu_table, final_output, delimiter=",", fmt="%d") -       +      output[proportion_it, fasta_it] = round(proportion * number_of_sequences) + +  # remove empty rows from our matrix +  final_headers = list() +  final_data = list() +  for row, header in map(None, output, headers): +    if row.sum() != 0: +      final_headers.append(header) +      final_data.append(row) +  # convert from a list back into a numpy array +  final_data = np.array(final_data, dtype=int) +  # stack our final header and our output matrix +  output = np.column_stack((final_headers, final_data)) + +  # write our OTU table +  output_file = open(args.otu_table, "wb")  +  writer = csv.writer(output_file, delimiter="\t") + +  #write out our fasta file row +  writer.writerow(['# QIIME vGail OTU table']) +  fasta_row = ['#OTU_ID'] +  fasta_list +  fasta_row.append(' '.join(fasta_list)) +  fasta_row = [' '.join(fasta_row)] +  writer.writerow(fasta_row) + +  # write out our results +  for i in range(1, np.shape(output)[0]): +      writer.writerow(list(output[i])) + +  output_file.close() -  # Write the otu table    return 0 +  def quikr_call(fasta_file): -  inp = input_directory + fasta_file -  output = output_directory + os.path.basename(fasta_file) +  input_location = input_directory + fasta_file +  output_location = output_directory + os.path.basename(fasta_file) -  xstar = q.quikr(inp, trained_matrix, kmer, lamb) -  np.savetxt(output, xstar, delimiter=",", fmt="%f") +  xstar = q.quikr(input_location, trained_matrix, kmer, lamb) +  np.savetxt(output_location, xstar, delimiter=",", fmt="%f")    return xstar  if __name__ == "__main__": | 
