From c18793c210ea44cef63db3568791cbad537bdb79 Mon Sep 17 00:00:00 2001 From: Calvin Date: Fri, 8 Mar 2013 15:32:50 -0500 Subject: moving some stuff --- src/quikr.py | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100755 src/quikr.py (limited to 'src/quikr.py') diff --git a/src/quikr.py b/src/quikr.py new file mode 100755 index 0000000..3f8221e --- /dev/null +++ b/src/quikr.py @@ -0,0 +1,115 @@ +#!/usr/bin/python +import os +import sys +import scipy.optimize.nnls +import scipy.sparse +import numpy as np +from subprocess import * +import argparse +import platform +import gzip +import itertools + +def generate_kmers(kmer): + """ This will return a list of kmers seperated by newlines """ + return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer)) + +def isCompressed(filename): + """ This function checks to see if the file is gzipped """ + + f = open(filename, "rb") + + # The first two bytes of a gzipped file are always '1f 8b' + if f.read(2) == '\x1f\x8b': + f.close() + return True + else: + f.close() + return False + + +def train_matrix(input_file_location, kmer): + """ + Takes a input fasta file, and kmer, returns a custom trained matrix + """ + + kmer_file_name = str(kmer) + "mers.txt" + + if not os.path.isfile(kmer_file_name): + print "could not find kmer file" + exit() + + uname = platform.uname()[0] + + if uname == "Linux": + input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE) + elif uname == "Darwin": + input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name]) + + # load and normalize the matrix by dividing each element by the sum of it's column. + # also do some fancy rotations so that it works properly with quikr + matrix = np.loadtxt(input_file.stdout) + + matrix = np.rot90(matrix) + matrix = matrix / matrix.sum(0) + matrix = np.flipud(matrix); + return matrix + + +def load_trained_matrix_from_file(trained_matrix_location): + """ This is a helper function to load our trained matrix and run quikr """ + + if isCompressed(trained_matrix_location): + trained_matrix_file = gzip.open(trained_matrix_location, "rb") + else: + trained_matrix_file = open(trained_matrix_location, "rb") + + trained_matrix = np.load(trained_matrix_file) + + return trained_matrix + + +def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, default_lambda): + """ + input_fasta is the input fasta file to find the estimated frequencies of + trained_matrix is the trained matrix we are using to estimate the species + kmer is the desired k-mer to use + default_lambda is inp + + returns the estimated requencies of bacteria present when given an input + FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity + promoting algorthim is utilized. + + In practice reconstruction is accurate only down to the genus level (not + species or strain). + + """ + + uname = platform.uname()[0] + + # We use the count program to count ____ + if uname == "Linux" and os.path.isfile("./count-linux"): + count_input = Popen(["./count-linux", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) + elif uname == "Darwin" and os.path.isfile("./count-osx"): + count_input = Popen(["count-osx", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) + + + # load the output of our count program and form a probability vector from the counts + counts = np.loadtxt(count_input.stdout) + counts = counts / counts.sum(0) + counts = default_lambda * counts + counts = np.concatenate([np.zeros(1), counts]) + + #form the k-mer sensing matrix + trained_matrix = trained_matrix * default_lambda; + trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix)) + + + xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts) + xstar = xstar / xstar.sum(0) + + return xstar + + +if __name__ == "__main__": + sys.exit(main()) -- cgit v1.2.3