From 3f0c33ff93dea10b2f79c8c2101431e251b8b928 Mon Sep 17 00:00:00 2001
From: Calvin <calvin@EESI>
Date: Mon, 11 Mar 2013 12:40:47 -0400
Subject: move python stuff to python directory

---
 src/python/quikr.py | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100755 src/python/quikr.py

(limited to 'src/python/quikr.py')

diff --git a/src/python/quikr.py b/src/python/quikr.py
new file mode 100755
index 0000000..368ab31
--- /dev/null
+++ b/src/python/quikr.py
@@ -0,0 +1,127 @@
+#!/usr/bin/python
+import os
+import sys
+from StringIO import StringIO
+import scipy.optimize.nnls
+import scipy.sparse
+import numpy as np
+from subprocess import *
+import argparse
+import platform
+import gzip
+import itertools
+
+def generate_kmers(kmer):
+  """ generate all possible kmers permutations seperated by newlines 
+
+ >>> kmers =  generate_kmers(1)
+ >>> generate_kmers(2)
+
+ param kmer: the desired Mer size
+ type  kmer: int
+ return: Returns a string of kmers seperated by newlines
+ rtype: string
+ """
+
+  return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer))
+
+def isCompressed(filename):
+  """ This function checks to see if the file is gzipped
+  
+  >>> boolean_value = isCompressed("/path/to/compressed/gzip/file")
+  >>> print boolean_value
+  True
+
+  param filename: the filename to check
+  type  filename: string
+  return: Returns whether the file is gzipped
+  rtype: boolean
+
+  """
+  f = open(filename, "rb")
+
+  # The first two bytes of a gzipped file are always '1f 8b'
+  if f.read(2) == '\x1f\x8b':
+    f.close()
+    return True
+  else:
+    f.close()
+    return False
+
+  
+def train_matrix(input_file_location, kmer):
+  """
+  Takes a input fasta file, and kmer, returns a custom trained matrix
+  """
+
+  kmer_file_name = str(kmer) + "mers.txt"
+
+  kmer_output = Popen(["generate_kmers", str(kmer)], stdout=PIPE)
+  input_file = Popen(["probabilities-by-read", str(kmer), input_file_location] , stdout=PIPE) 
+
+  # load and  normalize the matrix by dividing each element by the sum of it's column.
+  # also do some fancy rotations so that it works properly with quikr
+  matrix  = np.loadtxt(input_file.stdout)
+  
+  matrix = np.rot90(matrix)
+  matrix = matrix / matrix.sum(0)
+  matrix = np.flipud(matrix);
+  return matrix
+
+
+def load_trained_matrix_from_file(trained_matrix_location):
+  """ This is a helper function to load our trained matrix and run quikr """
+  
+  if isCompressed(trained_matrix_location):
+    trained_matrix_file = gzip.open(trained_matrix_location, "rb")
+  else:
+    trained_matrix_file = open(trained_matrix_location, "rb")
+  
+  trained_matrix = np.load(trained_matrix_file)
+
+  return trained_matrix
+
+
+def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, default_lambda):
+  """
+  input_fasta is the input fasta file to find the estimated frequencies of
+  trained_matrix is the trained matrix we are using to estimate the species
+  kmer is the desired k-mer to use
+  default_lambda is inp 
+  
+  returns the estimated requencies of bacteria present when given an input
+  FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity
+  promoting algorthim is utilized. 
+
+  In practice reconstruction is accurate only down to the genus level (not 
+  species or strain).
+
+  """
+
+  uname = platform.uname()[0]
+
+  # We use the count program to count ____
+  if uname == "Linux" and os.path.isfile("./count-linux"):
+    count_input = Popen(["./count-linux", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) 
+  elif uname == "Darwin" and os.path.isfile("./count-osx"):
+    count_input = Popen(["count-osx", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) 
+
+
+  # load the output of our count program and form a probability vector from the counts  
+  counts = np.loadtxt(count_input.stdout) 
+  counts = counts / counts.sum(0) 
+  counts = default_lambda * counts
+  counts = np.concatenate([np.zeros(1), counts])
+
+  #form the k-mer sensing matrix
+  trained_matrix = trained_matrix * default_lambda;
+  trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix))
+
+  xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts) 
+  xstar = xstar / xstar.sum(0) 
+
+  return xstar
+
+
+if __name__ == "__main__":
+    sys.exit(main())
-- 
cgit v1.2.3