1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
|
#!/usr/bin/python
import os
import sys
from StringIO import StringIO
import scipy.optimize.nnls
import scipy.sparse
import numpy as np
from subprocess import *
import argparse
import platform
import gzip
import itertools
def generate_kmers(kmer):
    """ Generate every possible DNA k-mer of the given length, separated by newlines.

    >>> generate_kmers(1)
    'a\\nc\\ng\\nt'
    >>> len(generate_kmers(2).split('\\n'))
    16

    param kmer: the desired k-mer size
    type kmer: int
    return: all 4**kmer k-mers over the alphabet 'acgt', joined by newlines
    rtype: string
    """
    # itertools.product enumerates the combinations in lexicographic order,
    # matching the order the external k-mer tools expect
    return '\n'.join(''.join(bases) for bases in itertools.product('acgt', repeat=kmer))
def isCompressed(filename):
    """ Check whether a file is gzip-compressed.

    param filename: the filename to check
    type filename: string
    return: True if the file starts with the gzip magic number
    rtype: boolean
    """
    # The first two bytes of a gzipped file are always '1f 8b'.
    # A with-block guarantees the handle is closed even if read() raises
    # (the original leaked the handle on exceptions and duplicated the close).
    with open(filename, "rb") as f:
        return f.read(2) == b'\x1f\x8b'
def train_matrix(input_file_location, kmer):
    """ Build a custom trained k-mer probability matrix from a FASTA file.

    Runs the external "probabilities-by-read" program against the input FASTA
    and the pre-generated k-mer list file ("<kmer>mers.txt" in the current
    directory), then normalizes and reorients its output.

    param input_file_location: path to the training FASTA file
    type input_file_location: string
    param kmer: the k-mer size to train with
    type kmer: int
    return: the column-normalized trained matrix
    rtype: numpy.ndarray
    """
    kmer_file_name = str(kmer) + "mers.txt"
    if not os.path.isfile(kmer_file_name):
        # abort with the message on stderr and a nonzero exit status instead
        # of printing to stdout and calling the bare interactive builtin exit()
        sys.exit("could not find kmer file " + kmer_file_name)
    input_file = Popen(["probabilities-by-read", str(kmer), input_file_location, kmer_file_name], stdout=PIPE)
    # load and normalize the matrix by dividing each element by the sum of its column.
    # also do some fancy rotations so that it works properly with quikr
    matrix = np.loadtxt(input_file.stdout)
    matrix = np.rot90(matrix)
    matrix = matrix / matrix.sum(0)
    matrix = np.flipud(matrix)
    return matrix
def load_trained_matrix_from_file(trained_matrix_location):
    """ Load a numpy-saved trained matrix, transparently handling gzip.

    param trained_matrix_location: path to the (optionally gzipped) matrix file
    type trained_matrix_location: string
    return: the trained matrix
    rtype: numpy.ndarray
    """
    if isCompressed(trained_matrix_location):
        trained_matrix_file = gzip.open(trained_matrix_location, "rb")
    else:
        trained_matrix_file = open(trained_matrix_location, "rb")
    try:
        # np.load fully reads the array, so it is safe to close afterwards;
        # the original version leaked this file handle
        trained_matrix = np.load(trained_matrix_file)
    finally:
        trained_matrix_file.close()
    return trained_matrix
def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, default_lambda):
    """
    Estimate the frequencies of bacteria present in an input FASTA file of
    amplicon (454) reads. A k-mer based, L1 regularized, sparsity promoting
    algorithm is utilized.

    In practice reconstruction is accurate only down to the genus level (not
    species or strain).

    param input_fasta_location: the input fasta file to find the estimated frequencies of
    param trained_matrix: the trained matrix we are using to estimate the species
    param kmer: the desired k-mer size to use
    param default_lambda: regularization weight applied to both the counts and the matrix
    return: the estimated frequency vector (normalized to sum to 1)
    rtype: numpy.ndarray
    """
    uname = platform.uname()[0]
    # We use the platform-specific external "count" program to count k-mers
    if uname == "Linux" and os.path.isfile("./count-linux"):
        count_input = Popen(["./count-linux", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE)
    elif uname == "Darwin" and os.path.isfile("./count-osx"):
        # bug fix: execute the same ./count-osx binary we just checked for,
        # instead of relying on a "count-osx" being found on the PATH
        count_input = Popen(["./count-osx", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE)
    else:
        # bug fix: previously fell through here and raised a NameError on
        # count_input; fail with a clear message instead
        sys.exit("could not find a count binary for this platform")
    # load the output of our count program and form a probability vector from the counts
    counts = np.loadtxt(count_input.stdout)
    counts = counts / counts.sum(0)
    counts = default_lambda * counts
    counts = np.concatenate([np.zeros(1), counts])
    # form the k-mer sensing matrix: scale by lambda and prepend a row of ones
    # (the ones row encodes the sum-to-one constraint for the NNLS solve)
    trained_matrix = trained_matrix * default_lambda
    trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix))
    xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts)
    xstar = xstar / xstar.sum(0)
    return xstar
if __name__ == "__main__":
    # NOTE(review): main() is not defined anywhere in this file — presumably
    # it is provided by another chunk of this module, or this guard is dead
    # scaffolding copied from a sibling script; confirm before running this
    # file directly, as-is it would raise NameError.
    sys.exit(main())
|