multifasta_to_otu.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154

#!/usr/bin/python

from multiprocessing import Pool
from Bio import SeqIO
import multiprocessing
from subprocess import *
import os
import gzip
import quikr as q
import sys
import numpy as np
import argparse
import platform
import csv

# our defaults
kmer = 6
lamb = 10000
output_directory = ""
input_directory = ""


def main():

  global kmer
  global input_directory 
  global output_directory 
  global lamb
  global trained_matrix 

  parser = argparse.ArgumentParser(description="MultifastaOTU")

  parser.add_argument("-i", "--input-directory", help="directory containing fasta files", required=True)
  parser.add_argument("-o", "--otu-table", help="otu_table", required=True)
  parser.add_argument("-t", "--trained-matrix", help="your trained matrix ", required=True)
  parser.add_argument("-f", "--trained-fasta", help="the fasta file used to train your matrix", required=True)
  parser.add_argument("-d", "--output-directory", help="quikr output directory", required=True)
  parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
  parser.add_argument("-k", "--kmer", type=int, help="specifies which kmer to use, default=6")
  parser.add_argument("-j", "--jobs", type=int, help="specifies how many jobs to run at once, default=number of CPUs")
  args = parser.parse_args()
    
  # our defaults
  jobs=multiprocessing.cpu_count()
  trained_matrix = args.trained_matrix
  input_directory = args.input_directory
  output_directory = args.output_directory

  # Make sure our input exist
  if not os.path.isdir(args.input_directory):
    parser.error("Input directory not found")

  if not os.path.isdir(args.output_directory):
    print "Output directory not found, creating directory"
    os.path.mkdir(args, output_directory)

  if not os.path.isfile(args.trained_matrix):
    parser.error("Custom trained matrix not found")
    
  if not os.path.isfile(args.trained_fasta):
    parser.error("Fasta file of trained matrix not found")

  # use alternative lambda
  if args.lamb is not None:
    lamb = args.lamb
    
  if args.jobs is not None:
    jobs = args.jobs

  if args.kmer is not None:
    kmer = args.kmer

  # Load trained matrix
  if qu.isCompressed(args.trained_matrix):
    trained_matrix_file = gzip.open(args.trained_matrix, "rb")
  else:
    trained_matrix_file = open(args.trained_matrix, "rb")
  
  trained_matrix = np.load(trained_matrix_file)

  # Return a list of the input directory
  fasta_list = os.listdir(args.input_directory)

  # Queue up and run our quikr functions.
  pool = Pool(processes=jobs)
  results = pool.map(quikr_call, fasta_list)

  # Create an array of headers
  headers = []
  trained_matrix_headers = open(args.trained_fasta, "rU")
  for header in SeqIO.parse(trained_matrix_headers, "fasta"):
    headers.append(header.id)
  trained_matrix_headers.close()

  # create our number of reads matrix
  number_of_reads = np.zeros((len(headers), len(fasta_list)))

  # load the keys with values from each fasta result
  for fasta, fasta_it in map(None, fasta_list, range(len(fasta_list))):

    count_sequences = Popen(["grep", "-c" , "^>", args.input_directory + fasta], stdout=PIPE) 
    number_of_sequences = int(count_sequences.stdout.readline())

    proportions = np.loadtxt(output_directory + fasta);
    
    for proportion, proportion_it in map(None, proportions, range(len(proportions))):
      number_of_reads[proportion_it, fasta_it] = round(proportion * number_of_sequences)

  # remove empty rows from our matrix
  final_headers = list()
  final_data = list()
  for row, header in map(None, number_of_reads, headers):
    if row.sum() != 0:
      final_headers.append(header)
      final_data.append(row)

  # convert from a list back into a numpy array
  final_data = np.array(final_data, dtype=int)

  # stack our final header and our number_of_reads matrix
  number_of_reads = np.column_stack((final_headers, final_data))

  # write our OTU table
  output_file = open(args.otu_table, "wb") 
  writer = csv.writer(output_file, delimiter="\t")

  #write out our fasta file row
  writer.writerow(['# QIIME vGail OTU table'])

  fasta_row = ['#OTU_ID']
  fasta_row.append(' '.join(fasta_list))
  fasta_row = [' '.join(fasta_row)]
  writer.writerow(fasta_row)

  # write out our results
  for i in range(0, np.shape(number_of_reads)[0]):
      writer.writerow(list(number_of_reads[i]))

  output_file.close()

  return 0


def quikr_call(fasta_file):
  input_location = input_directory + fasta_file
  output_location = output_directory + os.path.basename(fasta_file)

  xstar = q.quikr(input_location, trained_matrix, kmer, lamb)
  np.savetxt(output_location, xstar, delimiter=",", fmt="%f")
  return xstar

if __name__ == "__main__":
  sys.exit(main())