1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
#!/usr/bin/python
import os
import sys
from StringIO import StringIO
import scipy.optimize.nnls
import scipy.sparse
import numpy as np
from subprocess import *
import gzip
import itertools
def generate_kmers(kmer):
""" generate all possible kmers permutations seperated by newlines
>>> kmers = generate_kmers(1)
>>> generate_kmers(2)
param kmer: the desired Mer size
type kmer: int
return: Returns a string of kmers seperated by newlines
rtype: string
"""
return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer))
def isCompressed(filename):
""" This function checks to see if the file is gzipped
>>> boolean_value = isCompressed("/path/to/compressed/gzip/file")
>>> print boolean_value
True
param filename: the filename to check
type filename: string
return: Returns whether the file is gzipped
rtype: boolean
"""
try:
f = open(filename, "rb")
except IOError:
print "Warning: isCompressed could not find " + filename
return False
# The first two bytes of a gzipped file are always '1f 8b'
if f.read(2) == '\x1f\x8b':
f.close()
return True
else:
f.close()
return False
def train_matrix(input_file_location, kmer):
"""
Takes a input fasta file, and kmer, returns a custom trained matrix
"""
input_file = Popen(["bash", "-c", "probabilities-by-read " + str(kmer) + " " + input_file_location + " <(generate_kmers 6)"], stdout=PIPE)
# load and normalize the matrix by dividing each element by the sum of it's column.
# also do some fancy rotations so that it works properly with quikr
matrix = np.loadtxt(input_file.stdout)
matrix = np.rot90(matrix)
matrix = matrix / matrix.sum(0)
matrix = np.flipud(matrix);
return matrix
def load_trained_matrix_from_file(trained_matrix_location):
""" This is a helper function to load our trained matrix and run quikr """
if isCompressed(trained_matrix_location):
trained_matrix_file = gzip.open(trained_matrix_location, "rb")
else:
trained_matrix_file = open(trained_matrix_location, "rb")
trained_matrix = np.load(trained_matrix_file)
return trained_matrix
def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, default_lambda):
"""
input_fasta is the input fasta file to find the estimated frequencies of
trained_matrix is the trained matrix we are using to estimate the species
kmer is the desired k-mer to use
default_lambda is inp
returns the estimated requencies of bacteria present when given an input
FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity
promoting algorthim is utilized.
In practice reconstruction is accurate only down to the genus level (not
species or strain).
"""
# We use the count program to count
count_input = Popen(["count-kmers", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE)
# load the output of our count program and form a probability vector from the counts
counts = np.loadtxt(count_input.stdout)
counts = counts / counts.sum(0)
counts = default_lambda * counts
counts = np.concatenate([np.zeros(1), counts])
#form the k-mer sensing matrix
trained_matrix = trained_matrix * default_lambda;
trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix))
xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts)
xstar = xstar / xstar.sum(0)
return xstar
|