1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
|
import os
import sys
import scipy.optimize.nnls
import scipy.sparse
import numpy as np
from subprocess import *
import argparse
import platform
def main():
parser = argparse.ArgumentParser(description=
"Quikr returns the estimated frequencies of batcteria present when given a \
input FASTA file. \n \
A default trained matrix will be used if none is supplied \n \
You must supply a kmer and default lambda if using a custom trained \
matrix.")
parser.add_argument("-f", "--fasta", help="path to a fasta file", required=True)
parser.add_argument("-t", "--trained-matrix", help="path to a custom trained matrix")
parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
parser.add_argument("-k", "--kmer", type=int,
help="specifies which kmer to use, must be used with a custom trained database")
args = parser.parse_args()
# Do some basic sanity checks
if not os.path.isfile(args.fasta):
parser.error( "Input fasta file not found")
# If we are using a custom trained matrix, we need to do some basic checks
if args.trained is not None:
if args.kmer is None:
parser.error("A kmer is required when using a custom matrix")
else:
kmer = args.kmer
if args.lamb is None:
# use 10,000 as default Lambda
input_lambda = 10000
# If we aren't using a custom trained matrix, load in the defaults
else:
trained_matrix_location = "trainset7_112011N6Aaux.mat"
input_lambda = 10000
kmer = 6
if not os.path.isfile(args.trained):
parser.error("custom trained matrix not be found")
xstar = quikr(args.fasta, trained_matrix_location, kmer, input_lambda)
return 0
def quikr(input_fasta_location, trained_matrix_location, kmer, default_lambda):
"""
input_fasta is the input fasta file to find the estimated frequencies of
trained_matrix is the trained matrix we are using to estimate the species
kmer is the desired k-mer to use
default_lambda is inp
returns the estimated requencies of bacteria present when given an input
FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity
promoting algorthim is utilized.
In practice reconstruction is accurate only down to the genus level (not
species or strain).
"""
uname = platform.uname()[0]
# We use the count program to count ____
if uname == "Linux" and os.path.isfile("./count-linux"):
print "Detected Linux"
count_input = Popen(["count-linux", "-r " + kmer, "-1", "-u", input_fasta_location], stdout=PIPE)
elif uname == "Darwin" and os.path.isfile("./count-osx"):
print "Detected Mac OS X"
count_input = Popen(["count-osx", "-r 6", "-1", "-u", input_fasta_location], stdout=PIPE)
# load the output of our count program and form a probability vector from the counts
counts = np.loadtxt(count_input.stdout)
counts = counts / np.sum(counts)
counts = default_lambda * counts
trained_matrix = np.loadtxt(trained_matrix_location)
# perform the non-negative least squares
xstar = scipy.optimize.nnls(trained_matrix, counts)
xstar = xstar / sum(xstar)
return xstar
if __name__ == "__main__":
sys.exit(main())
|