From 84e6803509f91fc01678f18ec634480463099b10 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Thu, 30 Jan 2014 16:29:47 -0500 Subject: kmer_count_per_sequence: add option to load specific mers from file, add multiline ecounting --- kmer_counts_per_sequence.c | 100 +++++++++++++++++++++++++++++++++------------ 1 file changed, 75 insertions(+), 25 deletions(-) (limited to 'kmer_counts_per_sequence.c') diff --git a/kmer_counts_per_sequence.c b/kmer_counts_per_sequence.c index fceb17e..f725d5b 100644 --- a/kmer_counts_per_sequence.c +++ b/kmer_counts_per_sequence.c @@ -3,69 +3,119 @@ #include #include #include +#include #include "kmer_utils.h" unsigned long position = 0; int main(int argc, char **argv) { + // getdelim variables char *line = NULL; size_t len = 0; ssize_t read; - if(argc != 3) { + // specific mer variables + bool specific_mers = false; + size_t num_desired_indicies = 0; + size_t *desired_indicies = NULL; + + unsigned long kmer = atoi(argv[2]); + if(kmer == 0) { + fprintf(stderr, "Error: invalid kmer.\n"); + exit(EXIT_FAILURE); + } + + const unsigned long long width = (unsigned long long)1 << (kmer * 2); + + if(argc < 3) { printf("Please supply a filename and a kmer\n"); exit(EXIT_FAILURE); } + if(argc == 4) { + specific_mers = true; + desired_indicies = malloc((width) * sizeof(size_t)); + if(desired_indicies == NULL) + exit(EXIT_FAILURE); + num_desired_indicies = load_specific_mers_from_file(argv[3], kmer, width, desired_indicies); + + } + FILE *fh = fopen(argv[1], "r" ); if(fh == NULL) { fprintf(stderr, "Error opening %s - %s\n", argv[1], strerror(errno)); exit(EXIT_FAILURE); } + unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long)); + if(counts == NULL) + exit(EXIT_FAILURE); - unsigned long kmer = atoi(argv[2]); - if(kmer == 0) { - fprintf(stderr, "Error: invalid kmer.\n"); + char *str = malloc(BUFSIZ); + if(str == NULL) { + fprintf(stderr, strerror(errno)); exit(EXIT_FAILURE); } - const unsigned long width = (unsigned long)1 << (kmer * 2); + unsigned long long str_size = BUFSIZ; + unsigned long long sequence = 0; + while ((read = getdelim(&line, &len, '>', fh)) != -1) { + size_t i = 0; - unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long)); - if(counts == NULL) - exit(EXIT_FAILURE); + memset(counts, 0, width * sizeof(unsigned long long)); - while ((read = getline(&line, &len, fh)) != -1) { - if(line[0] != '>' && (read > kmer)) { + // find our first \n, this should be the end of the header + char *start = strchr(line, '\n'); + if(start == NULL) + continue; - unsigned int i = 0; - unsigned long total = 0; + // point to one past that. + start = start + 1; - // reset our count matrix to zero - memset(counts, 0, width * sizeof(unsigned long long)); + size_t start_len = strlen(start); - for(i = 0; i < read; i++) { - line[i] = alpha[(int)line[i]]; - } - for(i = 0; i < read - kmer; i++) { - counts[num_to_index(&line[i],kmer, width)]++; + // if our current str buffer isn't big enough, realloc + if(start_len + 1 > str_size + 1) { + str = realloc(str, start_len + 1); + if(str == NULL) { + exit(EXIT_FAILURE); + fprintf(stderr, strerror(errno)); } + } + - for(i = 0; i < width; i++) - total += counts[i]; + // strip out all other newlines to handle multiline sequences + str = strnstrip(start, str, '\n',start_len); + size_t seq_length = strlen(str); + // reset our count matrix to zero + + for(i = 0; i < seq_length; i++) { + str[i] = alpha[(int)str[i]]; + } + + for(i = 0; i < (seq_length - kmer + 1); i++) { + size_t mer = num_to_index(&str[i],kmer, width, &i); + counts[mer]++; + } + + + if(specific_mers) { + for(i = 0; i < num_desired_indicies; i++) + printf("%llu\t%zu\t%llu\n", sequence, desired_indicies[i], counts[desired_indicies[i]]); + sequence++; + } + else { for(i = 0; i < width - 1; i++) printf("%llu\t", counts[i]); printf("%llu\n", counts[width - 1]); - } } - free(counts); - free(line); +free(counts); +free(line); - return EXIT_SUCCESS; +return EXIT_SUCCESS; } -- cgit v1.2.3