quikr.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106

#!/usr/bin/python
import os
import sys
import scipy.optimize.nnls
import scipy.sparse
import numpy as np
import quikr_util as qu
from subprocess import *
import argparse
import platform
import gzip

def main():

    parser = argparse.ArgumentParser(description=
    "Quikr returns the estimated frequencies of batcteria present when given a \
    input FASTA file. \n \
    A default trained matrix will be used if none is supplied \n \
    You must supply a kmer and default lambda if using a custom trained \
    matrix.")

    parser.add_argument("-f", "--fasta", help="fasta file", required=True)
    parser.add_argument("-o", "--output", help="output path (csv output)", required=True)
    parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True)
    parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
    parser.add_argument("-k", "--kmer", type=int, required=True,
        help="specifies which kmer to use, must be used with a custom trained database")


    args = parser.parse_args()
    
    # our default lambda is 10,000
    lamb = 10000


    # Make sure our input exist
    if not os.path.isfile(args.fasta):
        parser.error( "Input fasta file not found")

    if not os.path.isfile(args.trained_matrix):
        parser.error("Custom trained matrix not found")
    
    # use alternative lambda
    if args.lamb is not None:
        lamb = args.lamb
    
    xstar = quikr_load_trained_matrix_from_file(args.fasta, args.trained_matrix, args.kmer, lamb)

    np.savetxt(args.output, xstar, delimiter=",", fmt="%f")
    return 0

def quikr_load_trained_matrix_from_file(input_fasta_location, trained_matrix_location, kmer, default_lambda):
  
  if qu.isCompressed(trained_matrix_location):
    trained_matrix_file = gzip.open(trained_matrix_location, "rb")
  else:
    trained_matrix_file = open(trained_matrix_location, "rb")
  
  trained_matrix = np.load(trained_matrix_file)

  xstar = quikr(input_fasta_location, trained_matrix, kmer, default_lambda)
  return xstar
  
def quikr(input_fasta_location, trained_matrix, kmer, default_lambda):
  """
  input_fasta is the input fasta file to find the estimated frequencies of
  trained_matrix is the trained matrix we are using to estimate the species
  kmer is the desired k-mer to use
  default_lambda is inp 
  
  returns the estimated requencies of bacteria present when given an input
  FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity
  promoting algorthim is utilized. 

  In practice reconstruction is accurate only down to the genus level (not 
  species or strain).
  """

  uname = platform.uname()[0]

  # We use the count program to count ____
  if uname == "Linux" and os.path.isfile("./count-linux"):
    count_input = Popen(["./count-linux", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) 
  elif uname == "Darwin" and os.path.isfile("./count-osx"):
    count_input = Popen(["count-osx", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) 


  # load the output of our count program and form a probability vector from the counts  
  counts = np.loadtxt(count_input.stdout) 
  counts = counts / counts.sum(0) 
  counts = default_lambda * counts
  counts = np.concatenate([np.zeros(1), counts])

  #form the k-mer sensing matrix
  trained_matrix = trained_matrix * default_lambda;
  trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix))


  xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts) 
  xstar = xstar / xstar.sum(0) 

  return xstar


if __name__ == "__main__":
    sys.exit(main())