summaryrefslogtreecommitdiff
path: root/src/quikr.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/quikr.py')
-rwxr-xr-xsrc/quikr.py115
1 files changed, 115 insertions, 0 deletions
diff --git a/src/quikr.py b/src/quikr.py
new file mode 100755
index 0000000..3f8221e
--- /dev/null
+++ b/src/quikr.py
@@ -0,0 +1,115 @@
+#!/usr/bin/python
+import os
+import sys
+import scipy.optimize.nnls
+import scipy.sparse
+import numpy as np
+from subprocess import *
+import argparse
+import platform
+import gzip
+import itertools
+
+def generate_kmers(kmer):
+ """ This will return a list of kmers seperated by newlines """
+ return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer))
+
+def isCompressed(filename):
+ """ This function checks to see if the file is gzipped """
+
+ f = open(filename, "rb")
+
+ # The first two bytes of a gzipped file are always '1f 8b'
+ if f.read(2) == '\x1f\x8b':
+ f.close()
+ return True
+ else:
+ f.close()
+ return False
+
+
+def train_matrix(input_file_location, kmer):
+ """
+ Takes a input fasta file, and kmer, returns a custom trained matrix
+ """
+
+ kmer_file_name = str(kmer) + "mers.txt"
+
+ if not os.path.isfile(kmer_file_name):
+ print "could not find kmer file"
+ exit()
+
+ uname = platform.uname()[0]
+
+ if uname == "Linux":
+ input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE)
+ elif uname == "Darwin":
+ input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name])
+
+ # load and normalize the matrix by dividing each element by the sum of it's column.
+ # also do some fancy rotations so that it works properly with quikr
+ matrix = np.loadtxt(input_file.stdout)
+
+ matrix = np.rot90(matrix)
+ matrix = matrix / matrix.sum(0)
+ matrix = np.flipud(matrix);
+ return matrix
+
+
+def load_trained_matrix_from_file(trained_matrix_location):
+ """ This is a helper function to load our trained matrix and run quikr """
+
+ if isCompressed(trained_matrix_location):
+ trained_matrix_file = gzip.open(trained_matrix_location, "rb")
+ else:
+ trained_matrix_file = open(trained_matrix_location, "rb")
+
+ trained_matrix = np.load(trained_matrix_file)
+
+ return trained_matrix
+
+
+def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, default_lambda):
+ """
+ input_fasta is the input fasta file to find the estimated frequencies of
+ trained_matrix is the trained matrix we are using to estimate the species
+ kmer is the desired k-mer to use
+ default_lambda is inp
+
+ returns the estimated requencies of bacteria present when given an input
+ FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity
+ promoting algorthim is utilized.
+
+ In practice reconstruction is accurate only down to the genus level (not
+ species or strain).
+
+ """
+
+ uname = platform.uname()[0]
+
+ # We use the count program to count ____
+ if uname == "Linux" and os.path.isfile("./count-linux"):
+ count_input = Popen(["./count-linux", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE)
+ elif uname == "Darwin" and os.path.isfile("./count-osx"):
+ count_input = Popen(["count-osx", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE)
+
+
+ # load the output of our count program and form a probability vector from the counts
+ counts = np.loadtxt(count_input.stdout)
+ counts = counts / counts.sum(0)
+ counts = default_lambda * counts
+ counts = np.concatenate([np.zeros(1), counts])
+
+ #form the k-mer sensing matrix
+ trained_matrix = trained_matrix * default_lambda;
+ trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix))
+
+
+ xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts)
+ xstar = xstar / xstar.sum(0)
+
+ return xstar
+
+
+if __name__ == "__main__":
+ sys.exit(main())