From c18793c210ea44cef63db3568791cbad537bdb79 Mon Sep 17 00:00:00 2001
From: Calvin <calvin@EESI>
Date: Fri, 8 Mar 2013 15:32:50 -0500
Subject: moving some stuff

---
 generate_kmers           |   5 --
 multifasta_to_otu.py     | 154 -----------------------------------------------
 quikr                    |  48 ---------------
 quikr.py                 | 115 -----------------------------------
 quikr_train              |  48 ---------------
 src/generate_kmers       |   5 ++
 src/multifasta_to_otu.py | 154 +++++++++++++++++++++++++++++++++++++++++++++++
 src/quikr                |  48 +++++++++++++++
 src/quikr.py             | 115 +++++++++++++++++++++++++++++++++++
 src/quikr_train          |  48 +++++++++++++++
 10 files changed, 370 insertions(+), 370 deletions(-)
 delete mode 100755 generate_kmers
 delete mode 100755 multifasta_to_otu.py
 delete mode 100755 quikr
 delete mode 100755 quikr.py
 delete mode 100755 quikr_train
 create mode 100755 src/generate_kmers
 create mode 100755 src/multifasta_to_otu.py
 create mode 100755 src/quikr
 create mode 100755 src/quikr.py
 create mode 100755 src/quikr_train

diff --git a/generate_kmers b/generate_kmers
deleted file mode 100755
index 38fdf0a..0000000
--- a/generate_kmers
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/python
-import quikr
-import sys
-
-print quikr.generate_kmers(int(sys.argv[1]))
diff --git a/multifasta_to_otu.py b/multifasta_to_otu.py
deleted file mode 100755
index c6fb562..0000000
--- a/multifasta_to_otu.py
+++ /dev/null
@@ -1,154 +0,0 @@
-#!/usr/bin/python
-
-from multiprocessing import Pool
-from Bio import SeqIO
-import multiprocessing
-from subprocess import *
-import os
-import gzip
-import quikr as q
-import sys
-import numpy as np
-import argparse
-import platform
-import csv
-
-# our defaults
-kmer = 6
-lamb = 10000
-output_directory = ""
-input_directory = ""
-
-
-def main():
-
-  global kmer
-  global input_directory 
-  global output_directory 
-  global lamb
-  global trained_matrix 
-
-  parser = argparse.ArgumentParser(description="MultifastaOTU")
-
-  parser.add_argument("-i", "--input-directory", help="directory containing fasta files", required=True)
-  parser.add_argument("-o", "--otu-table", help="otu_table", required=True)
-  parser.add_argument("-t", "--trained-matrix", help="your trained matrix ", required=True)
-  parser.add_argument("-f", "--trained-fasta", help="the fasta file used to train your matrix", required=True)
-  parser.add_argument("-d", "--output-directory", help="quikr output directory", required=True)
-  parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
-  parser.add_argument("-k", "--kmer", type=int, help="specifies which kmer to use, default=6")
-  parser.add_argument("-j", "--jobs", type=int, help="specifies how many jobs to run at once, default=number of CPUs")
-  args = parser.parse_args()
-    
-  # our defaults
-  jobs=multiprocessing.cpu_count()
-  trained_matrix = args.trained_matrix
-  input_directory = args.input_directory
-  output_directory = args.output_directory
-
-  # Make sure our input exist
-  if not os.path.isdir(args.input_directory):
-    parser.error("Input directory not found")
-
-  if not os.path.isdir(args.output_directory):
-    print "Output directory not found, creating directory"
-    os.path.mkdir(args, output_directory)
-
-  if not os.path.isfile(args.trained_matrix):
-    parser.error("Custom trained matrix not found")
-    
-  if not os.path.isfile(args.trained_fasta):
-    parser.error("Fasta file of trained matrix not found")
-
-  # use alternative lambda
-  if args.lamb is not None:
-    lamb = args.lamb
-    
-  if args.jobs is not None:
-    jobs = args.jobs
-
-  if args.kmer is not None:
-    kmer = args.kmer
-
-  # Load trained matrix
-  if qu.isCompressed(args.trained_matrix):
-    trained_matrix_file = gzip.open(args.trained_matrix, "rb")
-  else:
-    trained_matrix_file = open(args.trained_matrix, "rb")
-  
-  trained_matrix = np.load(trained_matrix_file)
-
-  # Return a list of the input directory
-  fasta_list = os.listdir(args.input_directory)
-
-  # Queue up and run our quikr functions.
-  pool = Pool(processes=jobs)
-  results = pool.map(quikr_call, fasta_list)
-
-  # Create an array of headers
-  headers = []
-  trained_matrix_headers = open(args.trained_fasta, "rU")
-  for header in SeqIO.parse(trained_matrix_headers, "fasta"):
-    headers.append(header.id)
-  trained_matrix_headers.close()
-
-  # create our number of reads matrix
-  number_of_reads = np.zeros((len(headers), len(fasta_list)))
-
-  # load the keys with values from each fasta result
-  for fasta, fasta_it in map(None, fasta_list, range(len(fasta_list))):
-
-    count_sequences = Popen(["grep", "-c" , "^>", args.input_directory + fasta], stdout=PIPE) 
-    number_of_sequences = int(count_sequences.stdout.readline())
-
-    proportions = np.loadtxt(output_directory + fasta);
-    
-    for proportion, proportion_it in map(None, proportions, range(len(proportions))):
-      number_of_reads[proportion_it, fasta_it] = round(proportion * number_of_sequences)
-
-  # remove empty rows from our matrix
-  final_headers = list()
-  final_data = list()
-  for row, header in map(None, number_of_reads, headers):
-    if row.sum() != 0:
-      final_headers.append(header)
-      final_data.append(row)
-
-  # convert from a list back into a numpy array
-  final_data = np.array(final_data, dtype=int)
-
-  # stack our final header and our number_of_reads matrix
-  number_of_reads = np.column_stack((final_headers, final_data))
-
-  # write our OTU table
-  output_file = open(args.otu_table, "wb") 
-  writer = csv.writer(output_file, delimiter="\t")
-
-  #write out our fasta file row
-  writer.writerow(['# QIIME vGail OTU table'])
-
-  fasta_row = ['#OTU_ID']
-  fasta_row.append(' '.join(fasta_list))
-  fasta_row = [' '.join(fasta_row)]
-  writer.writerow(fasta_row)
-
-  # write out our results
-  for i in range(0, np.shape(number_of_reads)[0]):
-      writer.writerow(list(number_of_reads[i]))
-
-  output_file.close()
-
-  return 0
-
-
-def quikr_call(fasta_file):
-  input_location = input_directory + fasta_file
-  output_location = output_directory + os.path.basename(fasta_file)
-
-  xstar = q.quikr(input_location, trained_matrix, kmer, lamb)
-  np.savetxt(output_location, xstar, delimiter=",", fmt="%f")
-  return xstar
-
-if __name__ == "__main__":
-  sys.exit(main())
- 
diff --git a/quikr b/quikr
deleted file mode 100755
index bac01ca..0000000
--- a/quikr
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/python
-import sys
-import os
-import argparse
-import quikr
-import numpy as np
-
-def main():
-
-    parser = argparse.ArgumentParser(description=
-    "Quikr returns the estimated frequencies of batcteria present when given a \
-    input FASTA file. \n \
-    A default trained matrix will be used if none is supplied \n \
-    You must supply a kmer and default lambda if using a custom trained \
-    matrix.")
-
-    parser.add_argument("-f", "--fasta", help="fasta file", required=True)
-    parser.add_argument("-o", "--output", help="output path (csv output)", required=True)
-    parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True)
-    parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
-    parser.add_argument("-k", "--kmer", type=int, required=True,
-        help="specifies which kmer to use, must be used with a custom trained database")
-
-
-    args = parser.parse_args()
-    
-    # our default lambda is 10,000
-    lamb = 10000
-
-    # Make sure our input exist
-    if not os.path.isfile(args.fasta):
-        parser.error( "Input fasta file not found")
-
-    if not os.path.isfile(args.trained_matrix):
-        parser.error("Custom trained matrix not found")
-    
-    # use alternative lambda
-    if args.lamb is not None:
-        lamb = args.lamb
-    
-    trained_matrix = quikr.load_trained_matrix_from_file(args.trained_matrix)
-    xstar = quikr.calculate_estimated_frequencies(args.fasta, trained_matrix, args.kmer, lamb)
-
-    np.savetxt(args.output, xstar, delimiter=",", fmt="%f")
-    return 0
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/quikr.py b/quikr.py
deleted file mode 100755
index 3f8221e..0000000
--- a/quikr.py
+++ /dev/null
@@ -1,115 +0,0 @@
-#!/usr/bin/python
-import os
-import sys
-import scipy.optimize.nnls
-import scipy.sparse
-import numpy as np
-from subprocess import *
-import argparse
-import platform
-import gzip
-import itertools
-
-def generate_kmers(kmer):
-  """ This will return a list of kmers seperated by newlines """
-  return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer))
-
-def isCompressed(filename):
-  """ This function checks to see if the file is gzipped """ 
-
-  f = open(filename, "rb")
-
-  # The first two bytes of a gzipped file are always '1f 8b'
-  if f.read(2) == '\x1f\x8b':
-    f.close()
-    return True
-  else:
-    f.close()
-    return False
-
-  
-def train_matrix(input_file_location, kmer):
-  """
-  Takes a input fasta file, and kmer, returns a custom trained matrix
-  """
-
-  kmer_file_name = str(kmer) + "mers.txt"
-
-  if not os.path.isfile(kmer_file_name):
-    print "could not find kmer file"
-    exit()
-  
-  uname = platform.uname()[0]
-
-  if uname == "Linux": 
-    input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE) 
-  elif uname == "Darwin":
-    input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name]) 
-
-  # load and  normalize the matrix by dividing each element by the sum of it's column.
-  # also do some fancy rotations so that it works properly with quikr
-  matrix  = np.loadtxt(input_file.stdout)
-  
-  matrix = np.rot90(matrix)
-  matrix = matrix / matrix.sum(0)
-  matrix = np.flipud(matrix);
-  return matrix
-
-
-def load_trained_matrix_from_file(trained_matrix_location):
-  """ This is a helper function to load our trained matrix and run quikr """
-  
-  if isCompressed(trained_matrix_location):
-    trained_matrix_file = gzip.open(trained_matrix_location, "rb")
-  else:
-    trained_matrix_file = open(trained_matrix_location, "rb")
-  
-  trained_matrix = np.load(trained_matrix_file)
-
-  return trained_matrix
-
-
-def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, default_lambda):
-  """
-  input_fasta is the input fasta file to find the estimated frequencies of
-  trained_matrix is the trained matrix we are using to estimate the species
-  kmer is the desired k-mer to use
-  default_lambda is inp 
-  
-  returns the estimated requencies of bacteria present when given an input
-  FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity
-  promoting algorthim is utilized. 
-
-  In practice reconstruction is accurate only down to the genus level (not 
-  species or strain).
-
-  """
-
-  uname = platform.uname()[0]
-
-  # We use the count program to count ____
-  if uname == "Linux" and os.path.isfile("./count-linux"):
-    count_input = Popen(["./count-linux", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) 
-  elif uname == "Darwin" and os.path.isfile("./count-osx"):
-    count_input = Popen(["count-osx", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) 
-
-
-  # load the output of our count program and form a probability vector from the counts  
-  counts = np.loadtxt(count_input.stdout) 
-  counts = counts / counts.sum(0) 
-  counts = default_lambda * counts
-  counts = np.concatenate([np.zeros(1), counts])
-
-  #form the k-mer sensing matrix
-  trained_matrix = trained_matrix * default_lambda;
-  trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix))
-
-
-  xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts) 
-  xstar = xstar / xstar.sum(0) 
-
-  return xstar
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/quikr_train b/quikr_train
deleted file mode 100755
index 6e599c9..0000000
--- a/quikr_train
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/python
-import numpy as np
-import quikr
-import os
-import sys
-import gzip
-from subprocess import *
-import platform
-import argparse
-
-def main():
-  """
-  You can call this script independently, and will save the 
-  trained matrix as a numpy file.
-
-  example: python quikr-train.py -i input.fasta -k 6 -o trained_matrix.npy
- 
-  """
-  parser = argparse.ArgumentParser(description=
-  " quikr_train returns a custom trained matrix that can be used with \
-    the quikr function. \n You must supply a kmer. \n ")
-
-  parser.add_argument("-i", "--input", help="training database of sequences (fasta format)", required=True)
-  parser.add_argument("-o", "--output", help="sensing matrix (text file)", required=True)
-  parser.add_argument("-k", "--kmer", help="kmer size (integer)", 
-                      type=int, required=False )
-  parser.add_argument("-z", "--compress", help="compress output (integer)", 
-                      action='store_true', required=False)
-
-  args = parser.parse_args()
-
-  if not os.path.isfile(args.input):
-    parser.error( "Input database not found")
-
-  # call the quikr train function, save the output with np.save
-  matrix = quikr.train_matrix(args.input, args.kmer)
-
-  if args.compress: 
-    output_file = gzip.open(args.output, "wb")
-  else:
-    output_file = open(args.output, "wb")
-
-  np.save(output_file, matrix)
-
-  return 0
-  
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/src/generate_kmers b/src/generate_kmers
new file mode 100755
index 0000000..38fdf0a
--- /dev/null
+++ b/src/generate_kmers
@@ -0,0 +1,5 @@
+#!/usr/bin/python
+import quikr
+import sys
+
+print quikr.generate_kmers(int(sys.argv[1]))
diff --git a/src/multifasta_to_otu.py b/src/multifasta_to_otu.py
new file mode 100755
index 0000000..c6fb562
--- /dev/null
+++ b/src/multifasta_to_otu.py
@@ -0,0 +1,154 @@
+#!/usr/bin/python
+
+from multiprocessing import Pool
+from Bio import SeqIO
+import multiprocessing
+from subprocess import *
+import os
+import gzip
+import quikr as q
+import sys
+import numpy as np
+import argparse
+import platform
+import csv
+
+# our defaults
+kmer = 6
+lamb = 10000
+output_directory = ""
+input_directory = ""
+
+
+def main():
+
+  global kmer
+  global input_directory 
+  global output_directory 
+  global lamb
+  global trained_matrix 
+
+  parser = argparse.ArgumentParser(description="MultifastaOTU")
+
+  parser.add_argument("-i", "--input-directory", help="directory containing fasta files", required=True)
+  parser.add_argument("-o", "--otu-table", help="otu_table", required=True)
+  parser.add_argument("-t", "--trained-matrix", help="your trained matrix ", required=True)
+  parser.add_argument("-f", "--trained-fasta", help="the fasta file used to train your matrix", required=True)
+  parser.add_argument("-d", "--output-directory", help="quikr output directory", required=True)
+  parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
+  parser.add_argument("-k", "--kmer", type=int, help="specifies which kmer to use, default=6")
+  parser.add_argument("-j", "--jobs", type=int, help="specifies how many jobs to run at once, default=number of CPUs")
+  args = parser.parse_args()
+    
+  # our defaults
+  jobs=multiprocessing.cpu_count()
+  trained_matrix = args.trained_matrix
+  input_directory = args.input_directory
+  output_directory = args.output_directory
+
+  # Make sure our input exist
+  if not os.path.isdir(args.input_directory):
+    parser.error("Input directory not found")
+
+  if not os.path.isdir(args.output_directory):
+    print "Output directory not found, creating directory"
+    os.path.mkdir(args, output_directory)
+
+  if not os.path.isfile(args.trained_matrix):
+    parser.error("Custom trained matrix not found")
+    
+  if not os.path.isfile(args.trained_fasta):
+    parser.error("Fasta file of trained matrix not found")
+
+  # use alternative lambda
+  if args.lamb is not None:
+    lamb = args.lamb
+    
+  if args.jobs is not None:
+    jobs = args.jobs
+
+  if args.kmer is not None:
+    kmer = args.kmer
+
+  # Load trained matrix
+  if qu.isCompressed(args.trained_matrix):
+    trained_matrix_file = gzip.open(args.trained_matrix, "rb")
+  else:
+    trained_matrix_file = open(args.trained_matrix, "rb")
+  
+  trained_matrix = np.load(trained_matrix_file)
+
+  # Return a list of the input directory
+  fasta_list = os.listdir(args.input_directory)
+
+  # Queue up and run our quikr functions.
+  pool = Pool(processes=jobs)
+  results = pool.map(quikr_call, fasta_list)
+
+  # Create an array of headers
+  headers = []
+  trained_matrix_headers = open(args.trained_fasta, "rU")
+  for header in SeqIO.parse(trained_matrix_headers, "fasta"):
+    headers.append(header.id)
+  trained_matrix_headers.close()
+
+  # create our number of reads matrix
+  number_of_reads = np.zeros((len(headers), len(fasta_list)))
+
+  # load the keys with values from each fasta result
+  for fasta, fasta_it in map(None, fasta_list, range(len(fasta_list))):
+
+    count_sequences = Popen(["grep", "-c" , "^>", args.input_directory + fasta], stdout=PIPE) 
+    number_of_sequences = int(count_sequences.stdout.readline())
+
+    proportions = np.loadtxt(output_directory + fasta);
+    
+    for proportion, proportion_it in map(None, proportions, range(len(proportions))):
+      number_of_reads[proportion_it, fasta_it] = round(proportion * number_of_sequences)
+
+  # remove empty rows from our matrix
+  final_headers = list()
+  final_data = list()
+  for row, header in map(None, number_of_reads, headers):
+    if row.sum() != 0:
+      final_headers.append(header)
+      final_data.append(row)
+
+  # convert from a list back into a numpy array
+  final_data = np.array(final_data, dtype=int)
+
+  # stack our final header and our number_of_reads matrix
+  number_of_reads = np.column_stack((final_headers, final_data))
+
+  # write our OTU table
+  output_file = open(args.otu_table, "wb") 
+  writer = csv.writer(output_file, delimiter="\t")
+
+  #write out our fasta file row
+  writer.writerow(['# QIIME vGail OTU table'])
+
+  fasta_row = ['#OTU_ID']
+  fasta_row.append(' '.join(fasta_list))
+  fasta_row = [' '.join(fasta_row)]
+  writer.writerow(fasta_row)
+
+  # write out our results
+  for i in range(0, np.shape(number_of_reads)[0]):
+      writer.writerow(list(number_of_reads[i]))
+
+  output_file.close()
+
+  return 0
+
+
+def quikr_call(fasta_file):
+  input_location = input_directory + fasta_file
+  output_location = output_directory + os.path.basename(fasta_file)
+
+  xstar = q.quikr(input_location, trained_matrix, kmer, lamb)
+  np.savetxt(output_location, xstar, delimiter=",", fmt="%f")
+  return xstar
+
+if __name__ == "__main__":
+  sys.exit(main())
+ 
diff --git a/src/quikr b/src/quikr
new file mode 100755
index 0000000..bac01ca
--- /dev/null
+++ b/src/quikr
@@ -0,0 +1,48 @@
+#!/usr/bin/python
+import sys
+import os
+import argparse
+import quikr
+import numpy as np
+
+def main():
+
+    parser = argparse.ArgumentParser(description=
+    "Quikr returns the estimated frequencies of batcteria present when given a \
+    input FASTA file. \n \
+    A default trained matrix will be used if none is supplied \n \
+    You must supply a kmer and default lambda if using a custom trained \
+    matrix.")
+
+    parser.add_argument("-f", "--fasta", help="fasta file", required=True)
+    parser.add_argument("-o", "--output", help="output path (csv output)", required=True)
+    parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True)
+    parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
+    parser.add_argument("-k", "--kmer", type=int, required=True,
+        help="specifies which kmer to use, must be used with a custom trained database")
+
+
+    args = parser.parse_args()
+    
+    # our default lambda is 10,000
+    lamb = 10000
+
+    # Make sure our input exist
+    if not os.path.isfile(args.fasta):
+        parser.error( "Input fasta file not found")
+
+    if not os.path.isfile(args.trained_matrix):
+        parser.error("Custom trained matrix not found")
+    
+    # use alternative lambda
+    if args.lamb is not None:
+        lamb = args.lamb
+    
+    trained_matrix = quikr.load_trained_matrix_from_file(args.trained_matrix)
+    xstar = quikr.calculate_estimated_frequencies(args.fasta, trained_matrix, args.kmer, lamb)
+
+    np.savetxt(args.output, xstar, delimiter=",", fmt="%f")
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/quikr.py b/src/quikr.py
new file mode 100755
index 0000000..3f8221e
--- /dev/null
+++ b/src/quikr.py
@@ -0,0 +1,115 @@
+#!/usr/bin/python
+import os
+import sys
+import scipy.optimize.nnls
+import scipy.sparse
+import numpy as np
+from subprocess import *
+import argparse
+import platform
+import gzip
+import itertools
+
+def generate_kmers(kmer):
+  """ This will return a list of kmers seperated by newlines """
+  return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer))
+
+def isCompressed(filename):
+  """ This function checks to see if the file is gzipped """ 
+
+  f = open(filename, "rb")
+
+  # The first two bytes of a gzipped file are always '1f 8b'
+  if f.read(2) == '\x1f\x8b':
+    f.close()
+    return True
+  else:
+    f.close()
+    return False
+
+  
+def train_matrix(input_file_location, kmer):
+  """
+  Takes a input fasta file, and kmer, returns a custom trained matrix
+  """
+
+  kmer_file_name = str(kmer) + "mers.txt"
+
+  if not os.path.isfile(kmer_file_name):
+    print "could not find kmer file"
+    exit()
+  
+  uname = platform.uname()[0]
+
+  if uname == "Linux": 
+    input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE) 
+  elif uname == "Darwin":
+    input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name]) 
+
+  # load and  normalize the matrix by dividing each element by the sum of it's column.
+  # also do some fancy rotations so that it works properly with quikr
+  matrix  = np.loadtxt(input_file.stdout)
+  
+  matrix = np.rot90(matrix)
+  matrix = matrix / matrix.sum(0)
+  matrix = np.flipud(matrix);
+  return matrix
+
+
+def load_trained_matrix_from_file(trained_matrix_location):
+  """ This is a helper function to load our trained matrix and run quikr """
+  
+  if isCompressed(trained_matrix_location):
+    trained_matrix_file = gzip.open(trained_matrix_location, "rb")
+  else:
+    trained_matrix_file = open(trained_matrix_location, "rb")
+  
+  trained_matrix = np.load(trained_matrix_file)
+
+  return trained_matrix
+
+
+def calculate_estimated_frequencies(input_fasta_location, trained_matrix, kmer, default_lambda):
+  """
+  input_fasta is the input fasta file to find the estimated frequencies of
+  trained_matrix is the trained matrix we are using to estimate the species
+  kmer is the desired k-mer to use
+  default_lambda is inp 
+  
+  returns the estimated requencies of bacteria present when given an input
+  FASTA file of amplicon (454) reads. A k-mer based, L1 regularized, sparsity
+  promoting algorthim is utilized. 
+
+  In practice reconstruction is accurate only down to the genus level (not 
+  species or strain).
+
+  """
+
+  uname = platform.uname()[0]
+
+  # We use the count program to count ____
+  if uname == "Linux" and os.path.isfile("./count-linux"):
+    count_input = Popen(["./count-linux", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) 
+  elif uname == "Darwin" and os.path.isfile("./count-osx"):
+    count_input = Popen(["count-osx", "-r", str(kmer), "-1", "-u", input_fasta_location], stdout=PIPE) 
+
+
+  # load the output of our count program and form a probability vector from the counts  
+  counts = np.loadtxt(count_input.stdout) 
+  counts = counts / counts.sum(0) 
+  counts = default_lambda * counts
+  counts = np.concatenate([np.zeros(1), counts])
+
+  #form the k-mer sensing matrix
+  trained_matrix = trained_matrix * default_lambda;
+  trained_matrix = np.vstack((np.ones(trained_matrix.shape[1]), trained_matrix))
+
+
+  xstar, rnorm = scipy.optimize.nnls(trained_matrix, counts) 
+  xstar = xstar / xstar.sum(0) 
+
+  return xstar
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/quikr_train b/src/quikr_train
new file mode 100755
index 0000000..6e599c9
--- /dev/null
+++ b/src/quikr_train
@@ -0,0 +1,48 @@
+#!/usr/bin/python
+import numpy as np
+import quikr
+import os
+import sys
+import gzip
+from subprocess import *
+import platform
+import argparse
+
+def main():
+  """
+  You can call this script independently, and will save the 
+  trained matrix as a numpy file.
+
+  example: python quikr-train.py -i input.fasta -k 6 -o trained_matrix.npy
+ 
+  """
+  parser = argparse.ArgumentParser(description=
+  " quikr_train returns a custom trained matrix that can be used with \
+    the quikr function. \n You must supply a kmer. \n ")
+
+  parser.add_argument("-i", "--input", help="training database of sequences (fasta format)", required=True)
+  parser.add_argument("-o", "--output", help="sensing matrix (text file)", required=True)
+  parser.add_argument("-k", "--kmer", help="kmer size (integer)", 
+                      type=int, required=False )
+  parser.add_argument("-z", "--compress", help="compress output (integer)", 
+                      action='store_true', required=False)
+
+  args = parser.parse_args()
+
+  if not os.path.isfile(args.input):
+    parser.error( "Input database not found")
+
+  # call the quikr train function, save the output with np.save
+  matrix = quikr.train_matrix(args.input, args.kmer)
+
+  if args.compress: 
+    output_file = gzip.open(args.output, "wb")
+  else:
+    output_file = open(args.output, "wb")
+
+  np.save(output_file, matrix)
+
+  return 0
+  
+if __name__ == "__main__":
+    sys.exit(main())
-- 
cgit v1.2.3