aboutsummaryrefslogtreecommitdiff
path: root/quikr.py
diff options
context:
space:
mode:
Diffstat (limited to 'quikr.py')
-rwxr-xr-xquikr.py115
1 files changed, 0 insertions, 115 deletions
diff --git a/quikr.py b/quikr.py
deleted file mode 100755
index 3f8221e..0000000
--- a/quikr.py
+++ /dev/null
@@ -1,115 +0,0 @@
-#!/usr/bin/python
-import os
-import sys
-import scipy.optimize.nnls
-import scipy.sparse
-import numpy as np
-from subprocess import *
-import argparse
-import platform
-import gzip
-import itertools
-
-def generate_kmers(kmer):
- """ This will return a list of kmers seperated by newlines """
- return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer))
-
-def isCompressed(filename):
- """ This function checks to see if the file is gzipped """
-
- f = open(filename, "rb")
-
- # The first two bytes of a gzipped file are always '1f 8b'
- if f.read(2) == '\x1f\x8b':
- f.close()
- return True
- else:
- f.close()
- return False
-
-
-def train_matrix(input_file_location, kmer):
- """
- Takes a input fasta file, and kmer, returns a custom trained matrix
- """
-
- kmer_file_name = str(kmer) + "mers.txt"
-
- if not os.path.isfile(kmer_file_name):
- print "could not find kmer file"
- exit()
-
- uname = platform.uname()[0]
-
- if uname == "Linux":
- input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE)
- elif uname == "Darwin":
- input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name])
-
- # load and normalize the matrix by dividing each element by the sum of it's column.
- # also do some fancy rotations so that it works properly with quikr
- matrix = np.loadtxt(input_file.stdout)
-
- matrix = np.rot90(matrix)
- matrix = matrix / matrix.sum(0)
- matrix = np.flipud(matrix);
- return matrix
-
-
-def load_trained_matrix_from_file(trained_matrix_location):
- """ This is a helper function to load our trained matrix and run quikr """
-
- if isCompressed(trained_matrix_location):
- trained_matrix_file = gzip.open(trained_matrix_location, "rb")
- else:
- trained_matrix_file = open(trained_matrix_location, "rb")
-
- trained_matrix = np.load(trained_matrix_file)
-
- return trained_matrix
-
-
-def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, default_lambda):
- """
- input_fasta is the input fasta file to find the estimated frequencies of
- trained_matrix is the trained matrix we are using to estimate the species
- kmer is the desired k-mer to use
- default_lambda is inp
-
- returns the estimated requencies of bacteria present when given an input
- FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity
- promoting algorthim is utilized.
-
- In practice reconstruction is accurate only down to the genus level (not
- species or strain).
-
- """
-
- uname = platform.uname()[0]
-
- # We use the count program to count ____
- if uname == "Linux" and os.path.isfile("./count-linux"):
- count_input = Popen(["./count-linux", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE)
- elif uname == "Darwin" and os.path.isfile("./count-osx"):
- count_input = Popen(["count-osx", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE)
-
-
- # load the output of our count program and form a probability vector from the counts
- counts = np.loadtxt(count_input.stdout)
- counts = counts / counts.sum(0)
- counts = default_lambda * counts
- counts = np.concatenate([np.zeros(1), counts])
-
- #form the k-mer sensing matrix
- trained_matrix = trained_matrix * default_lambda;
- trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix))
-
-
- xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts)
- xstar = xstar / xstar.sum(0)
-
- return xstar
-
-
-if __name__ == "__main__":
- sys.exit(main())