starting to modularize

author: Calvin <calvin@EESI> 2013-03-07 17:17:20 -0500
committer: Calvin <calvin@EESI> 2013-03-07 17:17:20 -0500
commit: e8dfa85cd7e0428e53aac532c31f1ac1cc3cf1c1 (patch)
tree: e7663ea51680b07dd4a42f4132b746e9437aeb10
parent: 7df3bd86f20197aad9e82f7ee89b7c0c8938dc15 (diff)
5 files changed, 91 insertions, 75 deletions
diff --git a/generate_kmers.py b/generate_kmers.py
index 9a338b7..38fdf0a 100755
--- a/generate_kmers.py
+++ b/generate_kmers.py
@@ -1,5 +1,5 @@
 #!/usr/bin/python
-import itertools
+import quikr
 import sys
 
-print '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=int(sys.argv[1])))
+print quikr.generate_kmers(int(sys.argv[1]))
diff --git a/quikr b/quikr
new file mode 100755
index 0000000..26e1708
--- /dev/null
+++ b/quikr
@@ -0,0 +1,45 @@
+#!/usr/bin/python
+import sys
+import argparse
+import quikr
+
+def main():
+
+    parser = argparse.ArgumentParser(description=
+    "Quikr returns the estimated frequencies of batcteria present when given a \
+    input FASTA file. \n \
+    A default trained matrix will be used if none is supplied \n \
+    You must supply a kmer and default lambda if using a custom trained \
+    matrix.")
+
+    parser.add_argument("-f", "--fasta", help="fasta file", required=True)
+    parser.add_argument("-o", "--output", help="output path (csv output)", required=True)
+    parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True)
+    parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
+    parser.add_argument("-k", "--kmer", type=int, required=True,
+        help="specifies which kmer to use, must be used with a custom trained database")
+
+
+    args = parser.parse_args()
+    
+    # our default lambda is 10,000
+    lamb = 10000
+
+    # Make sure our input exist
+    if not os.path.isfile(args.fasta):
+        parser.error( "Input fasta file not found")
+
+    if not os.path.isfile(args.trained_matrix):
+        parser.error("Custom trained matrix not found")
+    
+    # use alternative lambda
+    if args.lamb is not None:
+        lamb = args.lamb
+    
+    xstar = quikr_load_trained_matrix_from_file(args.fasta, args.trained_matrix, args.kmer, lamb)
+
+    np.savetxt(args.output, xstar, delimiter=",", fmt="%f")
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/quikr.py b/quikr.py
index f83cdf2..19ff96c 100755
--- a/quikr.py
+++ b/quikr.py
@@ -9,47 +9,56 @@ from subprocess import *
 import argparse
 import platform
 import gzip
+import itertools
 
-def main():
+def generate_kmers(kmer):
+  """ This will return a list of kmers seperated by newlines """
+  return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer))
 
-    parser = argparse.ArgumentParser(description=
-    "Quikr returns the estimated frequencies of batcteria present when given a \
-    input FASTA file. \n \
-    A default trained matrix will be used if none is supplied \n \
-    You must supply a kmer and default lambda if using a custom trained \
-    matrix.")
+def isCompressed(filename):
+  """ This function checks to see if the file is gzipped """ 
 
-    parser.add_argument("-f", "--fasta", help="fasta file", required=True)
-    parser.add_argument("-o", "--output", help="output path (csv output)", required=True)
-    parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True)
-    parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
-    parser.add_argument("-k", "--kmer", type=int, required=True,
-        help="specifies which kmer to use, must be used with a custom trained database")
+  f = open(filename, "rb")
 
+  # The first two bytes of a gzipped file are always '1f 8b'
+  if f.read(2) == '\x1f\x8b':
+    f.close()
+    return True
+  else:
+    f.close()
+    return False
+
+  
+def quikr_train(input_file_location, kmer):
+  """
+  Takes a input fasta file, and kmer, returns a custom trained matrix
+  """
 
-    args = parser.parse_args()
-    
-    # our default lambda is 10,000
-    lamb = 10000
+  kmer_file_name = str(kmer) + "mers.txt"
 
+  if not os.path.isfile(kmer_file_name):
+    print "could not find kmer file"
+    exit()
+  
+  uname = platform.uname()[0]
 
-    # Make sure our input exist
-    if not os.path.isfile(args.fasta):
-        parser.error( "Input fasta file not found")
+  if uname == "Linux": 
+    input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE) 
+  elif uname == "Darwin":
+    input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name]) 
 
-    if not os.path.isfile(args.trained_matrix):
-        parser.error("Custom trained matrix not found")
-    
-    # use alternative lambda
-    if args.lamb is not None:
-        lamb = args.lamb
-    
-    xstar = quikr_load_trained_matrix_from_file(args.fasta, args.trained_matrix, args.kmer, lamb)
+  # load and  normalize the matrix by dividing each element by the sum of it's column.
+  # also do some fancy rotations so that it works properly with quikr
+  matrix  = np.loadtxt(input_file.stdout)
+  
+  matrix = np.rot90(matrix)
+  matrix = matrix / matrix.sum(0)
+  matrix = np.flipud(matrix);
+  return matrix
 
-    np.savetxt(args.output, xstar, delimiter=",", fmt="%f")
-    return 0
 
 def quikr_load_trained_matrix_from_file(input_fasta_location, trained_matrix_location, kmer, default_lambda):
+  """ This is a helper function to load our trained matrix and run quikr """
   
   if qu.isCompressed(trained_matrix_location):
     trained_matrix_file = gzip.open(trained_matrix_location, "rb")
@@ -60,7 +69,8 @@ def quikr_load_trained_matrix_from_file(input_fasta_location, trained_matrix_loc
 
   xstar = quikr(input_fasta_location, trained_matrix, kmer, default_lambda)
   return xstar
-  
+
+
 def quikr(input_fasta_location, trained_matrix, kmer, default_lambda):
   """
   input_fasta is the input fasta file to find the estimated frequencies of
@@ -74,6 +84,7 @@ def quikr(input_fasta_location, trained_matrix, kmer, default_lambda):
 
   In practice reconstruction is accurate only down to the genus level (not 
   species or strain).
+
   """
 
   uname = platform.uname()[0]
diff --git a/quikr_train.py b/quikr_train
index a427436..9e9caa7 100755
--- a/quikr_train.py
+++ b/quikr_train
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 import numpy as np
+import quikr
 import os
 import sys
 import gzip
@@ -31,7 +32,7 @@ def main():
     parser.error( "Input database not found")
 
   # call the quikr train function, save the output with np.save
-  matrix = quikr_train(args.input, args.kmer)
+  matrix = quikr.quikr_train(args.input, args.kmer)
 
   if args.compress: 
     output_file = gzip.open(args.output, "wb")
@@ -41,34 +42,6 @@ def main():
   np.save(output_file, matrix)
 
   return 0
-
-def quikr_train(input_file_location, kmer):
-  """
-  Takes a input fasta file, and kmer, returns a custom trained matrix
-  """
-
-  kmer_file_name = str(kmer) + "mers.txt"
-
-  if not os.path.isfile(kmer_file_name):
-    print "could not find kmer file"
-    exit()
-
-  
-  uname = platform.uname()[0]
-
-  if uname == "Linux": 
-    input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE) 
-  elif uname == "Darwin":
-    input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name]) 
-
-  # load and  normalize the matrix by dividing each element by the sum of it's column.
-  # also do some fancy rotations so that it works properly with quikr
-  matrix  = np.loadtxt(input_file.stdout)
-  
-  matrix = np.rot90(matrix)
-  matrix = matrix / matrix.sum(0)
-  matrix = np.flipud(matrix);
-  return matrix
   
 if __name__ == "__main__":
     sys.exit(main())
diff --git a/quikr_util.py b/quikr_util.py
deleted file mode 100644
index b2fb968..0000000
--- a/quikr_util.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Helper Functions
-
-# Check if file is gzip compressed
-def isCompressed(filename):
-  f = open(filename, "rb")
-
-  # The first two bytes of a tar file are always '1f 8b'
-  if f.read(2) == '\x1f\x8b':
-    f.close()
-    return True
-  else:
-    f.close()
-    return False
author	Calvin <calvin@EESI>	2013-03-07 17:17:20 -0500
committer	Calvin <calvin@EESI>	2013-03-07 17:17:20 -0500
commit	e8dfa85cd7e0428e53aac532c31f1ac1cc3cf1c1 (patch)
tree	e7663ea51680b07dd4a42f4132b746e9437aeb10
parent	7df3bd86f20197aad9e82f7ee89b7c0c8938dc15 (diff)