summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin <calvin@EESI>2013-03-07 17:17:20 -0500
committerCalvin <calvin@EESI>2013-03-07 17:17:20 -0500
commite8dfa85cd7e0428e53aac532c31f1ac1cc3cf1c1 (patch)
treee7663ea51680b07dd4a42f4132b746e9437aeb10
parent7df3bd86f20197aad9e82f7ee89b7c0c8938dc15 (diff)
starting to modularize
-rwxr-xr-xgenerate_kmers.py4
-rwxr-xr-xquikr45
-rwxr-xr-xquikr.py73
-rwxr-xr-xquikr_train (renamed from quikr_train.py)31
-rw-r--r--quikr_util.py13
5 files changed, 91 insertions, 75 deletions
diff --git a/generate_kmers.py b/generate_kmers.py
index 9a338b7..38fdf0a 100755
--- a/generate_kmers.py
+++ b/generate_kmers.py
@@ -1,5 +1,5 @@
#!/usr/bin/python
-import itertools
+import quikr
import sys
-print '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=int(sys.argv[1])))
+print quikr.generate_kmers(int(sys.argv[1]))
diff --git a/quikr b/quikr
new file mode 100755
index 0000000..26e1708
--- /dev/null
+++ b/quikr
@@ -0,0 +1,45 @@
+#!/usr/bin/python
+import sys
+import argparse
+import quikr
+
+def main():
+
+ parser = argparse.ArgumentParser(description=
+ "Quikr returns the estimated frequencies of batcteria present when given a \
+ input FASTA file. \n \
+ A default trained matrix will be used if none is supplied \n \
+ You must supply a kmer and default lambda if using a custom trained \
+ matrix.")
+
+ parser.add_argument("-f", "--fasta", help="fasta file", required=True)
+ parser.add_argument("-o", "--output", help="output path (csv output)", required=True)
+ parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True)
+ parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
+ parser.add_argument("-k", "--kmer", type=int, required=True,
+ help="specifies which kmer to use, must be used with a custom trained database")
+
+
+ args = parser.parse_args()
+
+ # our default lambda is 10,000
+ lamb = 10000
+
+ # Make sure our input exist
+ if not os.path.isfile(args.fasta):
+ parser.error( "Input fasta file not found")
+
+ if not os.path.isfile(args.trained_matrix):
+ parser.error("Custom trained matrix not found")
+
+ # use alternative lambda
+ if args.lamb is not None:
+ lamb = args.lamb
+
+ xstar = quikr_load_trained_matrix_from_file(args.fasta, args.trained_matrix, args.kmer, lamb)
+
+ np.savetxt(args.output, xstar, delimiter=",", fmt="%f")
+ return 0
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/quikr.py b/quikr.py
index f83cdf2..19ff96c 100755
--- a/quikr.py
+++ b/quikr.py
@@ -9,47 +9,56 @@ from subprocess import *
import argparse
import platform
import gzip
+import itertools
-def main():
+def generate_kmers(kmer):
+ """ This will return a list of kmers seperated by newlines """
+ return '\n'.join(''.join(x) for x in itertools.product('acgt', repeat=kmer))
- parser = argparse.ArgumentParser(description=
- "Quikr returns the estimated frequencies of batcteria present when given a \
- input FASTA file. \n \
- A default trained matrix will be used if none is supplied \n \
- You must supply a kmer and default lambda if using a custom trained \
- matrix.")
+def isCompressed(filename):
+ """ This function checks to see if the file is gzipped """
- parser.add_argument("-f", "--fasta", help="fasta file", required=True)
- parser.add_argument("-o", "--output", help="output path (csv output)", required=True)
- parser.add_argument("-t", "--trained-matrix", help="trained matrix", required=True)
- parser.add_argument("-l", "--lamb", type=int, help="the default lambda value is 10,000")
- parser.add_argument("-k", "--kmer", type=int, required=True,
- help="specifies which kmer to use, must be used with a custom trained database")
+ f = open(filename, "rb")
+ # The first two bytes of a gzipped file are always '1f 8b'
+ if f.read(2) == '\x1f\x8b':
+ f.close()
+ return True
+ else:
+ f.close()
+ return False
+
+
+def quikr_train(input_file_location, kmer):
+ """
+ Takes a input fasta file, and kmer, returns a custom trained matrix
+ """
- args = parser.parse_args()
-
- # our default lambda is 10,000
- lamb = 10000
+ kmer_file_name = str(kmer) + "mers.txt"
+ if not os.path.isfile(kmer_file_name):
+ print "could not find kmer file"
+ exit()
+
+ uname = platform.uname()[0]
- # Make sure our input exist
- if not os.path.isfile(args.fasta):
- parser.error( "Input fasta file not found")
+ if uname == "Linux":
+ input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE)
+ elif uname == "Darwin":
+ input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name])
- if not os.path.isfile(args.trained_matrix):
- parser.error("Custom trained matrix not found")
-
- # use alternative lambda
- if args.lamb is not None:
- lamb = args.lamb
-
- xstar = quikr_load_trained_matrix_from_file(args.fasta, args.trained_matrix, args.kmer, lamb)
+ # load and normalize the matrix by dividing each element by the sum of it's column.
+ # also do some fancy rotations so that it works properly with quikr
+ matrix = np.loadtxt(input_file.stdout)
+
+ matrix = np.rot90(matrix)
+ matrix = matrix / matrix.sum(0)
+ matrix = np.flipud(matrix);
+ return matrix
- np.savetxt(args.output, xstar, delimiter=",", fmt="%f")
- return 0
def quikr_load_trained_matrix_from_file(input_fasta_location, trained_matrix_location, kmer, default_lambda):
+ """ This is a helper function to load our trained matrix and run quikr """
if qu.isCompressed(trained_matrix_location):
trained_matrix_file = gzip.open(trained_matrix_location, "rb")
@@ -60,7 +69,8 @@ def quikr_load_trained_matrix_from_file(input_fasta_location, trained_matrix_loc
xstar = quikr(input_fasta_location, trained_matrix, kmer, default_lambda)
return xstar
-
+
+
def quikr(input_fasta_location, trained_matrix, kmer, default_lambda):
"""
input_fasta is the input fasta file to find the estimated frequencies of
@@ -74,6 +84,7 @@ def quikr(input_fasta_location, trained_matrix, kmer, default_lambda):
In practice reconstruction is accurate only down to the genus level (not
species or strain).
+
"""
uname = platform.uname()[0]
diff --git a/quikr_train.py b/quikr_train
index a427436..9e9caa7 100755
--- a/quikr_train.py
+++ b/quikr_train
@@ -1,5 +1,6 @@
#!/usr/bin/python
import numpy as np
+import quikr
import os
import sys
import gzip
@@ -31,7 +32,7 @@ def main():
parser.error( "Input database not found")
# call the quikr train function, save the output with np.save
- matrix = quikr_train(args.input, args.kmer)
+ matrix = quikr.quikr_train(args.input, args.kmer)
if args.compress:
output_file = gzip.open(args.output, "wb")
@@ -41,34 +42,6 @@ def main():
np.save(output_file, matrix)
return 0
-
-def quikr_train(input_file_location, kmer):
- """
- Takes a input fasta file, and kmer, returns a custom trained matrix
- """
-
- kmer_file_name = str(kmer) + "mers.txt"
-
- if not os.path.isfile(kmer_file_name):
- print "could not find kmer file"
- exit()
-
-
- uname = platform.uname()[0]
-
- if uname == "Linux":
- input_file = Popen(["./probabilities-by-read-linux", str(kmer), input_file_location, kmer_file_name], stdout=PIPE)
- elif uname == "Darwin":
- input_file = Popen(["./probabilities-by-read-osx", str(kmer), input_file_location, kmer_file_name])
-
- # load and normalize the matrix by dividing each element by the sum of it's column.
- # also do some fancy rotations so that it works properly with quikr
- matrix = np.loadtxt(input_file.stdout)
-
- matrix = np.rot90(matrix)
- matrix = matrix / matrix.sum(0)
- matrix = np.flipud(matrix);
- return matrix
if __name__ == "__main__":
sys.exit(main())
diff --git a/quikr_util.py b/quikr_util.py
deleted file mode 100644
index b2fb968..0000000
--- a/quikr_util.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Helper Functions
-
-# Check if file is gzip compressed
-def isCompressed(filename):
- f = open(filename, "rb")
-
- # The first two bytes of a tar file are always '1f 8b'
- if f.read(2) == '\x1f\x8b':
- f.close()
- return True
- else:
- f.close()
- return False