diff options
Diffstat (limited to 'kmer_utils.c')
-rw-r--r-- | kmer_utils.c | 29 |
1 files changed, 14 insertions, 15 deletions
diff --git a/kmer_utils.c b/kmer_utils.c index 7b483ea..89df3b2 100644 --- a/kmer_utils.c +++ b/kmer_utils.c @@ -4,19 +4,9 @@ #include <string.h> #include <unistd.h> -// This function takes a char array containing sequences and converts it -// into a kmer index (essentially a base 4 radix converstion to base 10, with -// some mumbo-jumbo of taking each of the characters we want (ACGT) and turning -// them into a radix-4 number we can use. -// -// kind of convoluted but it works. -// -// Arguemnts: -// char *str - a NULL terminated character array -// long kmer - how long of a index value you want to return -// long error_pos - what index to return for a non ACGT character -// -inline long num_to_index(const char *str, int kmer, long error_pos) { +// convert a string of k-mer size base-4 values into a +// base-10 index +long num_to_index(const char *str, const int kmer, const long error_pos) { int i = 0; unsigned long out = 0; @@ -24,6 +14,9 @@ inline long num_to_index(const char *str, int kmer, long error_pos) { for(i = kmer - 1; i >= 0; i--){ + if(str[i] >> 2) + return error_pos; + out += str[i] * multiply; multiply = multiply << 2; } @@ -31,11 +24,15 @@ inline long num_to_index(const char *str, int kmer, long error_pos) { return out; } -void convert_kmer_to_num(char *str, long length) { +// replaces values in a char array of ACGT's and others with +// values that correspond to their base 4 value to be used in +// num_to_index. +void convert_kmer_to_num(char *str, const long length) { long i = 0; for(i = 0; i < length; i++) { + // this is _not_ portable, only works with ASCII values. switch(str[i] | 0x20 ) { case 'a': str[i] = 0; @@ -50,7 +47,9 @@ void convert_kmer_to_num(char *str, long length) { str[i] = 3; break; default: - str[i] = 5; + // Error Checking: use 4 so we can shift right twice + // to check quickly if there is a non-ACGT character + str[i] = 4; } } |