diff options
author | Calvin <calvin@EESI> | 2013-02-14 16:21:47 -0500 |
---|---|---|
committer | Calvin <calvin@EESI> | 2013-02-14 16:21:47 -0500 |
commit | dd6ba075ea5ec86936900adaf71d69ae374d1794 (patch) | |
tree | 0ad18c8d488be046336f5d1585e73132dd38c94d |
quikr and quikr_train
-rw-r--r-- | quikr.py | 67 | ||||
-rw-r--r-- | quikr_train.py | 50 |
2 files changed, 117 insertions, 0 deletions
diff --git a/quikr.py b/quikr.py new file mode 100644 index 0000000..1e2d671 --- /dev/null +++ b/quikr.py @@ -0,0 +1,67 @@ +import os +import sys +import scipy.optimize.nnls +import scipy.sparse +import numpy as np +import scipy as sp +from scipy.sparse import lil_matrix +from scipy.sparse.linalg import spsolve +from subprocess import * +import argparse +import platform + +def main(args): + + parser = argparse.ArgumentParser(description="An argparse example") + parser.add_argument("--trained", help="The trained matrix") + parser.add_argument("--fasta", help="input fasta file") + + args = parser.parse_args() + + if args.trained== "install": + print "You asked for installation" + else: + print "You asked for something other than installation" + + + return 0 + +def quikr(input_fasta_location, trained_matrix_location, kmer, default_lambda): + """ + input_fasta is the input fasta file to find the estimated frequencies of + trained_matrix is the trained matrix we are using to estimate the species + kmer is the desired k-mer to use + default_lambda is inp + + returns the estimated requencies of bacteria present when given an input + FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity + promoting algorthim is utilized. + + In practice reconstruction is accurate only down to the genus level (not + species or strain). + """ + + uname = platform.uname()[0] + + # We use the count program to count ____ + if uname == "Linux" and os.path.isfile("./count-linux"): + print "Detected Linux" + count_input = Popen(["count-linux", "-r " + kmer, "-1", "-u", input_fasta], stdout=PIPE) + elif uname == "Darwin" and os.path.isfile("./count-osx"): + print "Detected Mac OS X" + count_input = Popen(["count-osx", "-r 6", "-1", "-u", input_fasta], stdout=PIPE) + + + counts = np.loadtxt(count_input.stdout) # create a ndarray + counts = counts / np.sum(counts) # form a probablility vector from our counts + + counts = default_lambda * counts + + trained_matrix = np.loadtxt(trained_matrix_location) + + xstar = nnls(trained_matrix, counts) + return 1 + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/quikr_train.py b/quikr_train.py new file mode 100644 index 0000000..2076f5a --- /dev/null +++ b/quikr_train.py @@ -0,0 +1,50 @@ +#from scipy.sparse import * +import numpy as np +import sys +from subprocess import * +import platform + +# You can call this script independently, and will save the +# trained matrix as a numpy file. +# example: python quikr-train.py input.fasta 6 trained_matrix.npy + +def main(argv): + input_file_location = argv[1] + kmer = argv[2] + output_file_location = argv[3] + + # call the quikr train function, save the output with np.save + matrix = quikr_train(argv[1], argv[2]) + np.save(output_file_location, matrix) + + return 0 + +def quikr_train(input_file_location, kmer): + + + print "input fasta training file: " + input_file_location + print "kmer: " + kmer + + kmer_file_name = kmer + "mers.txt" + print kmer_file_name + + + uname = platform.uname()[0] + + if uname == "Linux": + print "Detected Linux" + input_file = Popen(["./probabilities-by-read-linux", kmer, input_file_location, kmer_file_name], stdout=PIPE) + elif uname == "Darwin": + print "Detected Mac OS X" + input_file = Popen(["./probabilities-by-read-osx", kmer, input_file_location, kmer_file_name]) + + # load and normalize the matrix by dividing each element by the sum of it's column. + matrix = np.loadtxt(input_file.stdout) + normalized = matrix / matrix.sum(0) + + return normalized + + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) |