From 3d5ab03f760814e5e6bb6ac54e29650bc1fe6153 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Tue, 1 Oct 2013 16:19:21 -0400 Subject: update headers, use const for better performance (.500ms on ~2gb file), update comments for functions --- kmer_total_count.c | 8 +++----- kmer_utils.c | 29 ++++++++++++++--------------- kmer_utils.h | 1 + 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/kmer_total_count.c b/kmer_total_count.c index 2416bfa..8cc91a7 100644 --- a/kmer_total_count.c +++ b/kmer_total_count.c @@ -13,10 +13,8 @@ int main(int argc, char **argv) { char *line = NULL; size_t len = 0; ssize_t read; - long i = 0; + unsigned long i = 0; - unsigned int kmer = 0; - unsigned long width = 0; unsigned long long *counts; if(argc != 3) { @@ -31,10 +29,10 @@ int main(int argc, char **argv) { } // second argument is the kmer - kmer = atoi(argv[2]); + const unsigned int kmer = atoi(argv[2]); // width is 4^kmer - width = (int)pow(4, kmer); + const unsigned long width = (int)pow(4, kmer); // malloc our counts matrix counts = malloc((width+ 1) * sizeof(unsigned long long)); diff --git a/kmer_utils.c b/kmer_utils.c index 7b483ea..89df3b2 100644 --- a/kmer_utils.c +++ b/kmer_utils.c @@ -4,19 +4,9 @@ #include #include -// This function takes a char array containing sequences and converts it -// into a kmer index (essentially a base 4 radix converstion to base 10, with -// some mumbo-jumbo of taking each of the characters we want (ACGT) and turning -// them into a radix-4 number we can use. -// -// kind of convoluted but it works. -// -// Arguemnts: -// char *str - a NULL terminated character array -// long kmer - how long of a index value you want to return -// long error_pos - what index to return for a non ACGT character -// -inline long num_to_index(const char *str, int kmer, long error_pos) { +// convert a string of k-mer size base-4 values into a +// base-10 index +long num_to_index(const char *str, const int kmer, const long error_pos) { int i = 0; unsigned long out = 0; @@ -24,6 +14,9 @@ inline long num_to_index(const char *str, int kmer, long error_pos) { for(i = kmer - 1; i >= 0; i--){ + if(str[i] >> 2) + return error_pos; + out += str[i] * multiply; multiply = multiply << 2; } @@ -31,11 +24,15 @@ inline long num_to_index(const char *str, int kmer, long error_pos) { return out; } -void convert_kmer_to_num(char *str, long length) { +// replaces values in a char array of ACGT's and others with +// values that correspond to their base 4 value to be used in +// num_to_index. +void convert_kmer_to_num(char *str, const long length) { long i = 0; for(i = 0; i < length; i++) { + // this is _not_ portable, only works with ASCII values. switch(str[i] | 0x20 ) { case 'a': str[i] = 0; @@ -50,7 +47,9 @@ void convert_kmer_to_num(char *str, long length) { str[i] = 3; break; default: - str[i] = 5; + // Error Checking: use 4 so we can shift right twice + // to check quickly is there is an non ACGT character + str[i] = 4; } } diff --git a/kmer_utils.h b/kmer_utils.h index a93216e..caad759 100644 --- a/kmer_utils.h +++ b/kmer_utils.h @@ -1 +1,2 @@ long convert_kmer_to_index(char *str, long kmer, long error_pos); +long num_to_index(const char *str, int kmer, long error_pos); -- cgit v1.2.3