From 84e6803509f91fc01678f18ec634480463099b10 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Thu, 30 Jan 2014 16:29:47 -0500 Subject: kmer_count_per_sequence: add option to load specific mers from file, add multiline ecounting --- kmer_counts_per_sequence.c | 100 +++++++++++++++++++++++++++++++----------- kmer_frequency_per_sequence.h | 1 - kmer_utils.c | 64 +++++++++++++++++++++++---- kmer_utils.h | 6 ++- 4 files changed, 134 insertions(+), 37 deletions(-) delete mode 100644 kmer_frequency_per_sequence.h diff --git a/kmer_counts_per_sequence.c b/kmer_counts_per_sequence.c index fceb17e..f725d5b 100644 --- a/kmer_counts_per_sequence.c +++ b/kmer_counts_per_sequence.c @@ -3,69 +3,119 @@ #include #include #include +#include #include "kmer_utils.h" unsigned long position = 0; int main(int argc, char **argv) { + // getdelim variables char *line = NULL; size_t len = 0; ssize_t read; - if(argc != 3) { + // specific mer variables + bool specific_mers = false; + size_t num_desired_indicies = 0; + size_t *desired_indicies = NULL; + + unsigned long kmer = atoi(argv[2]); + if(kmer == 0) { + fprintf(stderr, "Error: invalid kmer.\n"); + exit(EXIT_FAILURE); + } + + const unsigned long long width = (unsigned long long)1 << (kmer * 2); + + if(argc < 3) { printf("Please supply a filename and a kmer\n"); exit(EXIT_FAILURE); } + if(argc == 4) { + specific_mers = true; + desired_indicies = malloc((width) * sizeof(size_t)); + if(desired_indicies == NULL) + exit(EXIT_FAILURE); + num_desired_indicies = load_specific_mers_from_file(argv[3], kmer, width, desired_indicies); + + } + FILE *fh = fopen(argv[1], "r" ); if(fh == NULL) { fprintf(stderr, "Error opening %s - %s\n", argv[1], strerror(errno)); exit(EXIT_FAILURE); } + unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long)); + if(counts == NULL) + exit(EXIT_FAILURE); - unsigned long kmer = atoi(argv[2]); - if(kmer == 0) { - fprintf(stderr, "Error: invalid kmer.\n"); + char *str = malloc(BUFSIZ); + if(str == NULL) { + fprintf(stderr, strerror(errno)); exit(EXIT_FAILURE); } - const unsigned long width = (unsigned long)1 << (kmer * 2); + unsigned long long str_size = BUFSIZ; + unsigned long long sequence = 0; + while ((read = getdelim(&line, &len, '>', fh)) != -1) { + size_t i = 0; - unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long)); - if(counts == NULL) - exit(EXIT_FAILURE); + memset(counts, 0, width * sizeof(unsigned long long)); - while ((read = getline(&line, &len, fh)) != -1) { - if(line[0] != '>' && (read > kmer)) { + // find our first \n, this should be the end of the header + char *start = strchr(line, '\n'); + if(start == NULL) + continue; - unsigned int i = 0; - unsigned long total = 0; + // point to one past that. + start = start + 1; - // reset our count matrix to zero - memset(counts, 0, width * sizeof(unsigned long long)); + size_t start_len = strlen(start); - for(i = 0; i < read; i++) { - line[i] = alpha[(int)line[i]]; - } - for(i = 0; i < read - kmer; i++) { - counts[num_to_index(&line[i],kmer, width)]++; + // if our current str buffer isn't big enough, realloc + if(start_len + 1 > str_size + 1) { + str = realloc(str, start_len + 1); + if(str == NULL) { + exit(EXIT_FAILURE); + fprintf(stderr, strerror(errno)); } + } + - for(i = 0; i < width; i++) - total += counts[i]; + // strip out all other newlines to handle multiline sequences + str = strnstrip(start, str, '\n',start_len); + size_t seq_length = strlen(str); + // reset our count matrix to zero + + for(i = 0; i < seq_length; i++) { + str[i] = alpha[(int)str[i]]; + } + + for(i = 0; i < (seq_length - kmer + 1); i++) { + size_t mer = num_to_index(&str[i],kmer, width, &i); + counts[mer]++; + } + + + if(specific_mers) { + for(i = 0; i < num_desired_indicies; i++) + printf("%llu\t%zu\t%llu\n", sequence, desired_indicies[i], counts[desired_indicies[i]]); + sequence++; + } + else { for(i = 0; i < width - 1; i++) printf("%llu\t", counts[i]); printf("%llu\n", counts[width - 1]); - } } - free(counts); - free(line); +free(counts); +free(line); - return EXIT_SUCCESS; +return EXIT_SUCCESS; } diff --git a/kmer_frequency_per_sequence.h b/kmer_frequency_per_sequence.h deleted file mode 100644 index e782906..0000000 --- a/kmer_frequency_per_sequence.h +++ /dev/null @@ -1 +0,0 @@ -extern long position; diff --git a/kmer_utils.c b/kmer_utils.c index 2b3ce67..b0cab47 100644 --- a/kmer_utils.c +++ b/kmer_utils.c @@ -26,7 +26,7 @@ inline unsigned long long pow_four(unsigned long long x) { // convert a string of k-mer size base-4 values into a // base-10 index -inline unsigned long num_to_index(const char *str, const int kmer, const long error_pos) { +inline unsigned long num_to_index(const char *str, const int kmer, const long error_pos, size_t *current_position) { int i = 0; unsigned long out = 0; @@ -35,9 +35,7 @@ inline unsigned long num_to_index(const char *str, const int kmer, const long er for(i = kmer - 1; i >= 0; i--){ if(str[i] == 5) { - #ifndef SHARED - position += i; - #endif + current_position += i; return error_pos; } @@ -48,11 +46,59 @@ inline unsigned long num_to_index(const char *str, const int kmer, const long er return out; } +// return the number of loaded elements +unsigned long long load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) { + FILE *fh; + size_t arr_pos = 0; + char line[64]; + + fh = fopen(fn, "r"); + if(fh == NULL) { + fprintf(stderr, "Error opening %s - %s\n", fn, strerror(errno)); + exit(EXIT_FAILURE); + } + + while (fgets(line, sizeof(line), fh) != NULL) { + size_t i; + size_t len; + + len = strlen(line); + if(len == 0) + continue; + + len--; + line[len] = '\0'; + + + if(len != kmer) { + fprintf(stderr, "SKIPPING: '%s' is not length %u\n", line, kmer); + continue; + } + + for(i = 0; i < len; i++) { + line[i] = alpha[(int)line[i]]; + } + + size_t mer = num_to_index(line, kmer, width, NULL); + if(mer == width) { + fprintf(stderr, "SKIPPING: '%s' is a unrecognized mer\n", line); + continue; + } + else { + arr[arr_pos] = mer; + arr_pos++; + } + } + + fclose(fh); + return arr_pos; +} + // convert an index back into a kmer string char *index_to_kmer(unsigned long long index, long kmer) { - int i = 0; - int j = 0; + size_t i = 0; + size_t j = 0; char *num_array = calloc(kmer, sizeof(char)); char *ret = calloc(kmer + 1, sizeof(char)); if(ret == NULL) @@ -74,7 +120,7 @@ char *index_to_kmer(unsigned long long index, long kmer) { // our offset for how many chars we prepended int offset = j; // save i so we can print it - int start = i ; + size_t start = i ; // decrement i by 1 to reverse the last i++ i--; @@ -116,8 +162,8 @@ unsigned long long * get_kmer_counts_from_file(const char *fn, const unsigned in size_t len = 0; ssize_t read; - long long i = 0; - long long position = 0; + size_t i = 0; + size_t position = 0; FILE * const fh = fopen(fn, "r"); if(fh == NULL) { diff --git a/kmer_utils.h b/kmer_utils.h index 762c20c..cfe664b 100644 --- a/kmer_utils.h +++ b/kmer_utils.h @@ -1,12 +1,14 @@ // Kmer functions void convert_kmer_to_num(char *str, const unsigned long length); -unsigned long num_to_index(const char *str, const int kmer, const long error_pos); +unsigned long num_to_index(const char *str, const int kmer, const long error_pos, size_t *current_position); char *index_to_kmer(unsigned long long index, long kmer); unsigned long long * get_kmer_counts_from_file(const char *fn, const int kmer); // Utility functions char *strnstrip(const char *s, char *dest, int c, int len); -inline unsigned long long pow_four(unsigned long long x); +unsigned long long pow_four(unsigned long long x); // Variables const unsigned char alpha[256]; + +unsigned long long load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr); -- cgit v1.2.3