From 3d5ab03f760814e5e6bb6ac54e29650bc1fe6153 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Tue, 1 Oct 2013 16:19:21 -0400 Subject: update headers, use const for better performance (.500ms on ~2gb file), update comments for functions --- kmer_utils.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'kmer_utils.c') diff --git a/kmer_utils.c b/kmer_utils.c index 7b483ea..89df3b2 100644 --- a/kmer_utils.c +++ b/kmer_utils.c @@ -4,19 +4,9 @@ #include #include -// This function takes a char array containing sequences and converts it -// into a kmer index (essentially a base 4 radix converstion to base 10, with -// some mumbo-jumbo of taking each of the characters we want (ACGT) and turning -// them into a radix-4 number we can use. -// -// kind of convoluted but it works. -// -// Arguemnts: -// char *str - a NULL terminated character array -// long kmer - how long of a index value you want to return -// long error_pos - what index to return for a non ACGT character -// -inline long num_to_index(const char *str, int kmer, long error_pos) { +// convert a string of k-mer size base-4 values into a +// base-10 index +long num_to_index(const char *str, const int kmer, const long error_pos) { int i = 0; unsigned long out = 0; @@ -24,6 +14,9 @@ inline long num_to_index(const char *str, int kmer, long error_pos) { for(i = kmer - 1; i >= 0; i--){ + if(str[i] >> 2) + return error_pos; + out += str[i] * multiply; multiply = multiply << 2; } @@ -31,11 +24,15 @@ inline long num_to_index(const char *str, int kmer, long error_pos) { return out; } -void convert_kmer_to_num(char *str, long length) { +// replaces values in a char array of ACGT's and others with +// values that correspond to their base 4 value to be used in +// num_to_index. +void convert_kmer_to_num(char *str, const long length) { long i = 0; for(i = 0; i < length; i++) { + // this is _not_ portable, only works with ASCII values. 
switch(str[i] | 0x20 ) { case 'a': str[i] = 0; @@ -50,7 +47,9 @@ void convert_kmer_to_num(char *str, long length) { str[i] = 3; break; default: - str[i] = 5; + // Error Checking: use 4 so we can shift right twice + // to check quickly if there is a non-ACGT character + str[i] = 4; } } -- cgit v1.2.1