aboutsummaryrefslogtreecommitdiff
path: root/multifasta_to_otu.py
blob: ac883cad2af0ba1d74f928e5730f38604beb8d52 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/python
from multiprocessing import Pool
from Bio import SeqIO
import multiprocessing
import os
import quikr_train as qt
import quikr as q
import sys
import numpy as np
import argparse
import platform

# our defaults
kmer = 6
lamb = 10000
output_directory = ""
input_directory = ""


def main():

  global kmer
  global input_directory 
  global output_directory 
  global lamb
  global trained_matrix 
  #do: write up the description
  parser = argparse.ArgumentParser(description="MultifastaOTU")

  parser.add_argument("-i", "--input-directory", help="directory containing fasta files", required=True)
  parser.add_argument("-o", "--otu-table", help="otu_table", required=True)
  parser.add_argument("-t", "--trained-matrix", help="your trained matrix ", required=True)
  parser.add_argument("-f", "--trained-fasta", help="the fasta file used to train your matrix", required=True)
  parser.add_argument("-d", "--output-directory", help="quikr output directory", required=True)
  parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
  parser.add_argument("-k", "--kmer", type=int, help="specifies which kmer to use, default=6")
  parser.add_argument("-j", "--jobs", type=int, help="specifies how many jobs to run at once, default=number of CPUs")
  args = parser.parse_args()
    
  # our defaults
  jobs=multiprocessing.cpu_count()
  trained_matrix = args.trained_matrix
  input_directory = args.input_directory
  output_directory = args.output_directory

  # Make sure our input exist
  if not os.path.isdir(args.input_directory):
    parser.error( "Input directory not found")

  if not os.path.isdir(args.output_directory):
    parser.error( "Input directory not found")

  if not os.path.isdir(args.output_directory):
    os.path.mkdir(args,output_directory)

  if not os.path.isfile(args.trained_matrix):
    parser.error("custom trained matrix not found")
    
  # use alternative lambda
  if args.lamb is not None:
    lamb = args.lamb
    
  if args.jobs is not None:
    jobs = args.jobs

  if args.kmer is not None:
    kmer = args.kmer

  # Load trained matrix
  trained_matrix = np.load(args.trained_matrix);

  # Return a list of the input directory
  fasta_list = os.listdir(args.input_directory)

  # Queue up and run our quikr functions.
#  pool = Pool(processes=jobs)
#  results = pool.map(quikr_call, fasta_list)

  # Create an array of headers
  records = []
  trained_matrix_headers = open(args.trained_fasta, "rU")
  for record in SeqIO.parse(trained_matrix_headers, "fasta"):
    records.append(record.id)
  trained_matrix_headers.close()

  final_output = np.zeros((len(records), len(fasta_list)))
  print len(fasta_list)

  # load the keys with values from each fasta result
  for fasta, fasta_it in map(None, fasta_list, range(len(fasta_list))):
    fasta_file = open(input_directory + fasta, "rU")
    sequences = list(SeqIO.parse(fasta_file, "fasta"))
    number_of_sequences = len(sequences)
    fasta_file.close()

    print number_of_sequences
    
    proportions = np.loadtxt(output_directory + fasta);
    for proportion, proportion_it in map(None, proportions, range(len(proportions))):
     if(round(proportion * number_of_sequences) is not 0):
        print str(fasta_it) + " " + str(proportion_it)
        final_output[fasta_it, proportion_it] = proportion * number_of_sequences
  
  np.savetxt(args.otu_table, final_output, delimiter=",", fmt="%d")
      



  # Write the otu table
  return 0

def quikr_call(fasta_file):
  inp = input_directory + fasta_file
  output = output_directory + os.path.basename(fasta_file)

  xstar = q.quikr(inp, trained_matrix, kmer, lamb)
  np.savetxt(output, xstar, delimiter=",", fmt="%f")
  return xstar

if __name__ == "__main__":
  sys.exit(main())