From 3f0c33ff93dea10b2f79c8c2101431e251b8b928 Mon Sep 17 00:00:00 2001 From: Calvin Date: Mon, 11 Mar 2013 12:40:47 -0400 Subject: move python stuff to python directory --- src/python/quikr.py | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100755 src/python/quikr.py (limited to 'src/python/quikr.py') diff --git a/src/python/quikr.py b/src/python/quikr.py new file mode 100755 index 0000000..368ab31 --- /dev/null +++ b/src/python/quikr.py @@ -0,0 +1,127 @@ +#!/usr/bin/python +import os +import sys +from StringIO import StringIO +import scipy.optimize.nnls +import scipy.sparse +import numpy as np +from subprocess import * +import argparse +import platform +import gzip +import itertools + +def generate_kmers(kmer): + """ generate all possible kmers permutations seperated by newlines + + >>> kmers = generate_kmers(1) + >>> generate_kmers(2) + + param kmer: the desired Mer size + type kmer: int + return: Returns a string of kmers seperated by newlines + rtype: string + """ + + return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer)) + +def isCompressed(filename): + """ This function checks to see if the file is gzipped + + >>> boolean_value = isCompressed("/path/to/compressed/gzip/file") + >>> print boolean_value + True + + param filename: the filename to check + type filename: string + return: Returns whether the file is gzipped + rtype: boolean + + """ + f = open(filename, "rb") + + # The first two bytes of a gzipped file are always '1f 8b' + if f.read(2) == '\x1f\x8b': + f.close() + return True + else: + f.close() + return False + + +def train_matrix(input_file_location, kmer): + """ + Takes a input fasta file, and kmer, returns a custom trained matrix + """ + + kmer_file_name = str(kmer) + "mers.txt" + + kmer_output = Popen(["generate_kmers", str(kmer)], stdout=PIPE) + input_file = Popen(["probabilities-by-read", str(kmer), input_file_location] , stdout=PIPE) + + # load and normalize the matrix by dividing each element by the sum of it's column. + # also do some fancy rotations so that it works properly with quikr + matrix = np.loadtxt(input_file.stdout) + + matrix = np.rot90(matrix) + matrix = matrix / matrix.sum(0) + matrix = np.flipud(matrix); + return matrix + + +def load_trained_matrix_from_file(trained_matrix_location): + """ This is a helper function to load our trained matrix and run quikr """ + + if isCompressed(trained_matrix_location): + trained_matrix_file = gzip.open(trained_matrix_location, "rb") + else: + trained_matrix_file = open(trained_matrix_location, "rb") + + trained_matrix = np.load(trained_matrix_file) + + return trained_matrix + + +def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, default_lambda): + """ + input_fasta is the input fasta file to find the estimated frequencies of + trained_matrix is the trained matrix we are using to estimate the species + kmer is the desired k-mer to use + default_lambda is inp + + returns the estimated requencies of bacteria present when given an input + FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity + promoting algorthim is utilized. + + In practice reconstruction is accurate only down to the genus level (not + species or strain). + + """ + + uname = platform.uname()[0] + + # We use the count program to count ____ + if uname == "Linux" and os.path.isfile("./count-linux"): + count_input = Popen(["./count-linux", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) + elif uname == "Darwin" and os.path.isfile("./count-osx"): + count_input = Popen(["count-osx", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) + + + # load the output of our count program and form a probability vector from the counts + counts = np.loadtxt(count_input.stdout) + counts = counts / counts.sum(0) + counts = default_lambda * counts + counts = np.concatenate([np.zeros(1), counts]) + + #form the k-mer sensing matrix + trained_matrix = trained_matrix * default_lambda; + trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix)) + + xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts) + xstar = xstar / xstar.sum(0) + + return xstar + + +if __name__ == "__main__": + sys.exit(main()) -- cgit v1.2.3