aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2013-10-01 16:19:21 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2013-10-01 16:19:21 -0400
commit3d5ab03f760814e5e6bb6ac54e29650bc1fe6153 (patch)
tree8181071f59ed7e616611a12b8863057d6ac0a64d
parent4d00f90fd8b1b9e38eea297336ee83a5e5c9e764 (diff)
update headers, use const for better performance (.500ms on ~2gb file), update comments for functions
-rw-r--r--kmer_total_count.c8
-rw-r--r--kmer_utils.c29
-rw-r--r--kmer_utils.h1
3 files changed, 18 insertions, 20 deletions
diff --git a/kmer_total_count.c b/kmer_total_count.c
index 2416bfa..8cc91a7 100644
--- a/kmer_total_count.c
+++ b/kmer_total_count.c
@@ -13,10 +13,8 @@ int main(int argc, char **argv) {
char *line = NULL;
size_t len = 0;
ssize_t read;
- long i = 0;
+ unsigned long i = 0;
- unsigned int kmer = 0;
- unsigned long width = 0;
unsigned long long *counts;
if(argc != 3) {
@@ -31,10 +29,10 @@ int main(int argc, char **argv) {
}
// second argument is the kmer
- kmer = atoi(argv[2]);
+ const unsigned int kmer = atoi(argv[2]);
// width is 4^kmer
- width = (int)pow(4, kmer);
+ const unsigned long width = (int)pow(4, kmer);
// malloc our counts matrix
counts = malloc((width+ 1) * sizeof(unsigned long long));
diff --git a/kmer_utils.c b/kmer_utils.c
index 7b483ea..89df3b2 100644
--- a/kmer_utils.c
+++ b/kmer_utils.c
@@ -4,19 +4,9 @@
#include <string.h>
#include <unistd.h>
-// This function takes a char array containing sequences and converts it
-// into a kmer index (essentially a base 4 radix converstion to base 10, with
-// some mumbo-jumbo of taking each of the characters we want (ACGT) and turning
-// them into a radix-4 number we can use.
-//
-// kind of convoluted but it works.
-//
-// Arguemnts:
-// char *str - a NULL terminated character array
-// long kmer - how long of a index value you want to return
-// long error_pos - what index to return for a non ACGT character
-//
-inline long num_to_index(const char *str, int kmer, long error_pos) {
+// convert a string of k-mer size base-4 values into a
+// base-10 index
+long num_to_index(const char *str, const int kmer, const long error_pos) {
int i = 0;
unsigned long out = 0;
@@ -24,6 +14,9 @@ inline long num_to_index(const char *str, int kmer, long error_pos) {
for(i = kmer - 1; i >= 0; i--){
+ if(str[i] >> 2)
+ return error_pos;
+
out += str[i] * multiply;
multiply = multiply << 2;
}
@@ -31,11 +24,15 @@ inline long num_to_index(const char *str, int kmer, long error_pos) {
return out;
}
-void convert_kmer_to_num(char *str, long length) {
+// replaces values in a char array of ACGT's and others with
+// values that correspond to their base 4 value to be used in
+// num_to_index.
+void convert_kmer_to_num(char *str, const long length) {
long i = 0;
for(i = 0; i < length; i++) {
+ // this is _not_ portable, only works with ASCII values.
switch(str[i] | 0x20 ) {
case 'a':
str[i] = 0;
@@ -50,7 +47,9 @@ void convert_kmer_to_num(char *str, long length) {
str[i] = 3;
break;
default:
- str[i] = 5;
+ // Error Checking: use 4 so we can shift right twice
+ // to check quickly is there is an non ACGT character
+ str[i] = 4;
}
}
diff --git a/kmer_utils.h b/kmer_utils.h
index a93216e..caad759 100644
--- a/kmer_utils.h
+++ b/kmer_utils.h
@@ -1 +1,2 @@
long convert_kmer_to_index(char *str, long kmer, long error_pos);
+long num_to_index(const char *str, int kmer, long error_pos);