#!/usr/bin/python
import os
import sys
import scipy.optimize
import scipy.sparse
import numpy as np
import quikr_util as qu
from subprocess import Popen, PIPE
import argparse
import platform
import gzip
import itertools


def generate_kmers(kmer):
    """ Return every k-mer of the given length over 'acgt', one per line, as a single string """
    return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer))


def isCompressed(filename):
    """ Check whether the file is gzipped """
    f = open(filename, "rb")
    # the first two bytes of a gzipped file are always 0x1f 0x8b
    if f.read(2) == '\x1f\x8b':
        f.close()
        return True
    else:
        f.close()
        return False


def quikr_train(input_file_location, kmer):
    """ Take an input fasta file and a k-mer size, and return a custom trained matrix """
    kmer_file_name = str(kmer) + "mers.txt"

    if not os.path.isfile(kmer_file_name):
        print >> sys.stderr, "could not find kmer file " + kmer_file_name
        sys.exit(1)

    uname = platform.uname()[0]
    if uname == "Linux":
        input_file = Popen(["./probabilities-by-read-linux", str(kmer),
                            input_file_location, kmer_file_name], stdout=PIPE)
    elif uname == "Darwin":
        input_file = Popen(["./probabilities-by-read-osx", str(kmer),
                            input_file_location, kmer_file_name], stdout=PIPE)
    else:
        print >> sys.stderr, "unsupported platform: " + uname
        sys.exit(1)

    # load and normalize the matrix by dividing each element by the sum of its column,
    # then rotate/flip it so that it is oriented the way quikr expects
    matrix = np.loadtxt(input_file.stdout)
    matrix = np.rot90(matrix)
    matrix = matrix / matrix.sum(0)
    matrix = np.flipud(matrix)
    return matrix


def quikr_load_trained_matrix_from_file(input_fasta_location, trained_matrix_location, kmer, default_lambda):
    """ Helper function that loads a trained matrix from disk and runs quikr with it """
    if qu.isCompressed(trained_matrix_location):
        trained_matrix_file = gzip.open(trained_matrix_location, "rb")
    else:
        trained_matrix_file = open(trained_matrix_location, "rb")

    trained_matrix = np.load(trained_matrix_file)
    xstar = quikr(input_fasta_location, trained_matrix, kmer, default_lambda)
    return xstar


def quikr(input_fasta_location, trained_matrix, kmer, default_lambda):
    """
    input_fasta_location is the input fasta file to estimate the frequencies of
    trained_matrix is the trained matrix we are using to estimate the species
    kmer is the k-mer size the matrix was trained with
    default_lambda is the lambda regularization parameter that weights the data fit

    Returns the estimated frequencies of bacteria present when given an input
    FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity
    promoting algorithm is utilized.

    In practice, reconstruction is accurate only down to the genus level (not
    species or strain).
    """
    uname = platform.uname()[0]

    # use the count program to count the k-mers in the input fasta file
    if uname == "Linux" and os.path.isfile("./count-linux"):
        count_input = Popen(["./count-linux", "-r", str(kmer), "-1", "-u",
                             input_fasta_location], stdout=PIPE)
    elif uname == "Darwin" and os.path.isfile("./count-osx"):
        count_input = Popen(["./count-osx", "-r", str(kmer), "-1", "-u",
                             input_fasta_location], stdout=PIPE)
    else:
        print >> sys.stderr, "could not find a count binary for this platform"
        sys.exit(1)

    # load the output of the count program and form a probability vector from the counts
    counts = np.loadtxt(count_input.stdout)
    counts = counts / counts.sum(0)
    counts = default_lambda * counts
    counts = np.concatenate([np.zeros(1), counts])

    # form the k-mer sensing matrix: the leading row of ones (paired with the
    # leading zero prepended to counts) penalizes the l1 norm of the nonnegative
    # solution, while lambda weights the fit to the observed k-mer frequencies
    trained_matrix = trained_matrix * default_lambda
    trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix))

    xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts)
    xstar = xstar / xstar.sum(0)

    return xstar


def main():
    # minimal command-line entry point; the flag names and the default lambda
    # below are illustrative assumptions rather than a documented interface
    parser = argparse.ArgumentParser(
        description="estimate bacterial frequencies from a fasta file of reads")
    parser.add_argument("-f", "--fasta", required=True,
                        help="input fasta file of reads")
    parser.add_argument("-m", "--trained-matrix", required=True,
                        help="trained matrix saved with numpy (optionally gzipped)")
    parser.add_argument("-k", "--kmer", type=int, default=6,
                        help="k-mer size the matrix was trained with")
    parser.add_argument("-l", "--lamb", type=float, default=10000,
                        help="lambda regularization parameter")
    args = parser.parse_args()

    xstar = quikr_load_trained_matrix_from_file(args.fasta, args.trained_matrix,
                                                args.kmer, args.lamb)
    np.savetxt(sys.stdout, xstar)
    return 0


if __name__ == "__main__":
    sys.exit(main())
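
# Example library usage (a sketch: "database.fasta", "reads.fasta", the saved
# matrix file name, and the module name "quikr" are hypothetical, and the
# probabilities-by-read-* / count-* binaries must be present in the working
# directory; 10000 is an illustrative lambda, not a prescribed value):
#
#   import numpy as np
#   import quikr
#
#   # train a k-mer sensing matrix from a database of known sequences and save it
#   matrix = quikr.quikr_train("database.fasta", 6)
#   np.save("trained_matrix.npy", matrix)
#
#   # estimate the bacterial frequencies in a fasta file of reads
#   xstar = quikr.quikr_load_trained_matrix_from_file("reads.fasta",
#                                                     "trained_matrix.npy", 6, 10000)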