From 35135e92af473c3641d1be0a53c761fc084861c3 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Wed, 12 Mar 2014 16:06:46 -0400 Subject: reverse compliment --- kmer_utils.c | 151 +++++++++++++++++++++++++++++++---------------------------- kmer_utils.h | 35 +++++++++++--- 2 files changed, 108 insertions(+), 78 deletions(-) diff --git a/kmer_utils.c b/kmer_utils.c index b8d5691..7703f0a 100644 --- a/kmer_utils.c +++ b/kmer_utils.c @@ -3,9 +3,10 @@ #include #include #include +#include -const unsigned char alpha[256] = +const unsigned char alpha[256] = {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, @@ -19,6 +20,12 @@ const unsigned char alpha[256] = const char reverse_alpha[4] = { 'A', 'C', 'G', 'T' }; + // compliments + // A C G T E E + // T G C A E E +const char compliment[6] = { 3, 2, 1, 0, 5, 5}; + + unsigned long long pow_four(unsigned long long x) { return (unsigned long long)1 << (x * 2); } @@ -27,14 +34,53 @@ void check_null_ptr(void *ptr, const char *error) { if (ptr == NULL) { if(error != NULL) { fprintf(stderr, "Error: %s - %s\n", error, strerror(errno)); - } + } else { fprintf(stderr, "Error: %s\n", strerror(errno)); - } + } exit(EXIT_FAILURE); } } + +void reverse_string(char *s, size_t len) { + char t, *d = &(s[len - 1]); + while (d > s) { + t = *s; + *s++ = *d; + *d-- = t; + } +} + +void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, unsigned long long *counts) { + long long position; + long long i; + + // loop through our seq to process each k-mer + for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { + unsigned long long mer = 0; + unsigned long long multiply = 1; + + // for each char in the k-mer check if it is an error char + for(i = position + kmer - 1; i >= position; i--){ + if(seq[i] == 5) { + position = i; + goto next; + } + + // multiply this char in the mer by the multiply + // and bitshift the multiply for the next round + mer += seq[i] * multiply; + multiply = multiply << 2; + } + // bump up the mer value in the counts array + counts[mer]++; + + // skip count if error + next: ; + } +} + // convert a string of k-mer size base-4 values into a // base-10 index unsigned long num_to_index(const char *str, const int kmer, const long error_pos, long long *current_position) { @@ -44,7 +90,7 @@ unsigned long num_to_index(const char *str, const int kmer, const long error_pos unsigned long multiply = 1; for(i = kmer - 1; i >= 0; i--){ - if(str[i] == 5) { + if(str[i] == 5) { current_position += i; return error_pos; } @@ -57,7 +103,7 @@ unsigned long num_to_index(const char *str, const int kmer, const long error_pos } // return the number of loaded elements -unsigned long long load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) { +unsigned long long load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) { FILE *fh; size_t arr_pos = 0; char line[64]; @@ -130,11 +176,11 @@ char *index_to_kmer(unsigned long long index, long kmer) { size_t start = i ; // decrement i by 1 to reverse the last i++ - i--; + i--; j = 0; // reverse the array, as j increases, decrease i - for(j = 0; j < start; j++, i--) + for(j = 0; j < start; j++, i--) ret[j + offset] = reverse_alpha[(int)num_array[i]]; // set our last character to the null termination byte @@ -165,7 +211,7 @@ size_t strnstrip(char *s, int c, size_t len) { } -unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer) { +unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment) { char *line = NULL; size_t len = 0; @@ -184,9 +230,6 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer char *seq; size_t seq_length; - long long i = 0; - long long position = 0; - // find our first \n, this should be the end of the header seq = strchr(line, '\n'); if(seq == NULL) @@ -204,31 +247,18 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer for(k = 0; k < seq_length; k++) seq[k] = alpha[(int)seq[k]]; + count_sequence(seq, seq_length, kmer, counts); - // loop through our seq to process each k-mer - for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { - unsigned long long mer = 0; - unsigned long long multiply = 1; - - // for each char in the k-mer check if it is an error char - for(i = position + kmer - 1; i >= position; i--){ - if(seq[i] == 5) { - mer = width; - position = i; - goto next; - } - - // multiply this char in the mer by the multiply - // and bitshift the multiply for the next round - mer += seq[i] * multiply; - multiply = multiply << 2; + if(count_compliment) { + for(k = 0; k < seq_length; k++) { + seq[k] = compliment[(int)seq[k]]; } - // use this point to get mer of our loop - next: - // bump up the mer value in the counts array - counts[mer]++; + + reverse_string(seq, seq_length); + count_sequence(seq, seq_length, kmer, counts); + } - } + } free(line); @@ -237,15 +267,14 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer return counts; } -unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer) { +unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment) { FILE *fh = fopen(fn, "r"); check_null_ptr(fh, fn); - return get_kmer_counts_from_file(fh, kmer); + return get_kmer_counts_from_file(fh, kmer, count_compliment); } - -unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer) { +unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment) { char *line = NULL; size_t len = 0; @@ -262,13 +291,9 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign size_t cpy_size = kmer - 1; char *end_of_previous = malloc(sizeof(char) * cpy_size); + check_null_ptr(end_of_previous, NULL); memset(end_of_previous, 5, cpy_size); - if(end_of_previous == NULL) { - fprintf(stderr, "%s\n", strerror(errno)); - exit(EXIT_FAILURE); - } - while ((read = getline(&line, &len, fh)) != -1) { if(line[0] == '>') continue; @@ -276,7 +301,6 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign char *seq; size_t j; size_t k; - long long position; size_t seq_length; seq = malloc(read + cpy_size); @@ -290,40 +314,23 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign // everything else is 5 for(k = cpy_size; k < seq_length; k++) seq[k] = alpha[(int)seq[k]]; - - // loop through our seq to process each k-mer - for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { - unsigned long long mer = 0; - unsigned long long multiply = 1; - long long i; - - // for each char in the k-mer check if it is an error char - for(i = position + kmer - 1; i >= position; i--){ - if(seq[i] == 5) { - mer = width; - position = i; - goto next; - } - - // multiply this char in the mer by the multiply - // and bitshift the multiply for the next round - mer += seq[i] * multiply; - multiply = multiply << 2; - } - // use this point to get mer of our loop - next: - // bump up the mer value in the counts array - counts[mer]++; - } + count_sequence(seq, seq_length, kmer, counts); for(j = 0, k = seq_length - (cpy_size + 1); k < seq_length; k++, j++) { end_of_previous[j] = seq[k]; } - free(seq); + if(count_compliment) { + for(k = 0; k < seq_length; k++) + seq[k] = compliment[(int)seq[k]]; - } + reverse_string(seq, seq_length); + count_sequence(seq, seq_length, kmer, counts); + } + + free(seq); + } free(end_of_previous); free(line); @@ -332,9 +339,9 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign return counts; } -unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer) { +unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment) { FILE *fh = fopen(fn, "r"); check_null_ptr(fh, fn); - return get_continuous_kmer_counts_from_file(fh, kmer); + return get_continuous_kmer_counts_from_file(fh, kmer, count_compliment); } diff --git a/kmer_utils.h b/kmer_utils.h index 4bc732a..a727da8 100644 --- a/kmer_utils.h +++ b/kmer_utils.h @@ -1,22 +1,45 @@ -// Kmer functions -void convert_kmer_to_num(char *str, const unsigned long length); +// dna-util's function library. + unsigned long num_to_index(const char *str, const int kmer, const long error_pos, long long *current_position); char *index_to_kmer(unsigned long long index, long kmer); // Utility functions + + +// strip char 'c' out of char array *s of length len size_t strnstrip(char *s, int c, size_t len); + +// reverse char arry *s of length len +void reverse_string(char *s, size_t len); + +// quicky calculate 4^x unsigned long long pow_four(unsigned long long x); + +// check if pointer is null. a helper for dealing with NULL +// return values as errors. Calls strerror and quits if +// ptr is null, optionally takes *error char array as +// a error to output void check_null_ptr(void *ptr, const char *error); +void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, unsigned long long *counts); + // Variables +// const unsigned char alpha[256]; +const unsigned char reverse_alpha[4]; +const unsigned char compliment[5]; + // file loading functions + +// open file from filename in char array *fn, and try and parse in one mer per +// line, of size kmer, and store the indicies of those mers in the *arr +// pointer; unsigned long long load_specific_mers_from_file(const char *fn, unsigned int kmer, size_t width, size_t *arr); -unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer); -unsigned long long * get_kmer_counts_from_file(FILE *fh, const int kmer); +unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment); +unsigned long long * get_kmer_counts_from_file(FILE *fh, const int kmer, const bool count_compliment); -unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer); -unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer); +unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment); +unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment); -- cgit v1.2.3