diff options
author | Calvin Morrison <mutantturkey@gmail.com> | 2013-10-18 10:26:12 -0400 |
---|---|---|
committer | Calvin Morrison <mutantturkey@gmail.com> | 2013-10-18 10:26:12 -0400 |
commit | c1a34163771229aa269ada443c6baa38f3073c11 (patch) | |
tree | daa87dd67e45ad910f750587e4769acda68e9eec | |
parent | 2f62ad7c4ec7ed5bae74c4444442dd871222e6c6 (diff) |
add kmercounting code from dna-utils. update our make file to support debugging, and update quikr to count-kmer code internally
-rw-r--r-- | src/c/Makefile | 32 | ||||
-rw-r--r-- | src/c/kmer_utils.c | 154 | ||||
-rw-r--r-- | src/c/kmer_utils.h | 11 | ||||
-rw-r--r-- | src/c/quikr.c | 35 |
4 files changed, 214 insertions, 18 deletions
diff --git a/src/c/Makefile b/src/c/Makefile index e927b13..1f59312 100644 --- a/src/c/Makefile +++ b/src/c/Makefile @@ -1,17 +1,27 @@ VERSION=\"v1.0.4\" UNAME := $(shell uname) CC = gcc -QUIKR_TRAIN_CFLAGS = -O3 -s -mtune=native -Wall -lm -lz -D$(UNAME) -DVERSION=$(VERSION) -MULTIFASTA_CFLAGS = -O3 -s -mtune=native -Wextra -Wall -lm -lz -pthread -L../ -I../ -std=gnu99 -fopenmp -D$(UNAME) -DVERSION=$(VERSION) -QUIKR_CFLAGS= -O3 -s -mtune=native -Wextra -Wall -lm -lz -std=gnu99 -D$(UNAME) -DVERSION=$(VERSION) +QUIKR_TRAIN_CFLAGS = -D$(UNAME) -DVERSION=$(VERSION) +MULTIFASTA_CFLAGS = -pthread -L../ -I../ -std=gnu99 -fopenmp -D$(UNAME) -DVERSION=$(VERSION) +QUIKR_CFLAGS = -std=gnu99 -D$(UNAME) -DVERSION=$(VERSION) -all: quikr_train_ quikr_ multifasta_to_otu_ +ifndef DEBUG +CFLAGS = -O3 -s -mtune=native -Wextra -Wall -lm -lz +else +CFLAGS = -ggdb3 -O0 -Wall -Wextra -lm -lz +endif -multifasta_to_otu_: - $(CC) multifasta_to_otu.c quikr_functions.c nnls.c -o multifasta_to_otu $(MULTIFASTA_CFLAGS) -quikr_train_: - $(CC) quikr_train.c quikr_functions.c -o quikr_train $(QUIKR_TRAIN_CFLAGS) -quikr_: - $(CC) quikr.c quikr_functions.c nnls.c -o quikr $(QUIKR_CFLAGS) -I../ +all: nnls.o kmer_utils.o quikr_train quikr multifasta_to_otu + +nnls.o: nnls.c + $(CC) -c nnls.c -o nnls.o $(CFLAGS) +kmer_utils.o: kmer_utils.c + $(CC) -c kmer_utils.c -o kmer_utils.o $(CFLAGS) +multifasta_to_otu: kmer_utils.o nnls.o multifasta_to_otu.c + $(CC) multifasta_to_otu.c quikr_functions.c nnls.o kmer_utils.o -o multifasta_to_otu $(CFLAGS) $(MULTIFASTA_CFLAGS) +quikr_train: kmer_utils.o quikr_train.c + $(CC) quikr_train.c quikr_functions.c -o quikr_train $(CFLAGS) $(QUIKR_TRAIN_CFLAGS) +quikr: kmer_utils.o nnls.o quikr.c + $(CC) quikr.c quikr_functions.c nnls.o kmer_utils.o -o quikr $(CFLAGS) $(QUIKR_CFLAGS) -I../ clean: - rm -vf quikr_train quikr multifasta_to_otu + rm -vf quikr_train quikr multifasta_to_otu *.o diff --git a/src/c/kmer_utils.c b/src/c/kmer_utils.c new file mode 100644 index 0000000..cf8d666 --- /dev/null +++ b/src/c/kmer_utils.c @@ -0,0 +1,154 @@ +// Copyright 2013 Calvin Morrison +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +const unsigned char alpha[256] = +{5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; + +inline unsigned long long pow_four(unsigned long long x) { + return (unsigned long long)1 << (x * 2); +} + +// convert a string of k-mer size base-4 values into a +// base-10 index +inline unsigned long num_to_index(const char *str, const int kmer, int *position, const long error_pos) { + + int i = 0; + unsigned long out = 0; + unsigned long multiply = 1; + + for(i = kmer - 1; i >= 0; i--){ + + if(str[i] >> 2) { + position += i; + return error_pos; + } + + out += str[i] * multiply; + multiply = multiply << 2; + } + + return out; +} + +// Strip out any character 'c' from char array 's' into a destination dest (you +// need to allocate that) and copy only len characters. +char *strnstrip(const char *s, char *dest, int c, unsigned long long len) { + + unsigned long long i = 0; + unsigned long long j = 0; + + for(i = 0; i < len; i++) { + if(s[i] != c) { + dest[j] = s[i]; + j++; + } + } + + dest[j] = '\0'; + + return dest; +} + +unsigned long long * get_kmer_counts_from_file(const char *fn, const unsigned int kmer) { + + char *line = NULL; + size_t len = 0; + ssize_t read; + + long long i = 0; + long long position = 0; + + FILE * const fh = fopen(fn, "r"); + if(fh == NULL) { + fprintf(stderr, "Error opening %s - %s\n", fn, strerror(errno)); + exit(EXIT_FAILURE); + } + + // width is 4^kmer + // there's a sneaky bitshift to avoid pow dependency + const unsigned long width = pow_four(kmer); + + // malloc our return array + unsigned long long * counts = malloc((width+ 1) * sizeof(unsigned long long)); + memset(counts, 0, width); + if(counts == NULL) { + fprintf(stderr, strerror(errno)); + exit(EXIT_FAILURE); + } + + char *str = malloc(4096); + if(str == NULL) { + fprintf(stderr, strerror(errno)); + exit(EXIT_FAILURE); + } + + unsigned long long str_size = 4096; + + while ((read = getdelim(&line, &len, '>', fh)) != -1) { + + // find our first \n, this should be the end of the header + char *start = strchr(line, '\n'); + if(start == NULL) + continue; + + size_t start_len = strlen(start); + + + // if our current str buffer isn't big enough, realloc + if(start_len + 1 > str_size + 1) { + str = realloc(str, start_len + 1); + if(str == NULL) { + exit(EXIT_FAILURE); + fprintf(stderr, strerror(errno)); + } + } + + // strip out all other newlines to handle multiline sequences + str = strnstrip(start, str, '\n',start_len); + size_t seq_length = strlen(str); + + // relace A, C, G and T with 0, 1, 2, 3 respectively + // everything else is 5 + for(i = 0; i < seq_length; i++) { + str[i] = alpha[(int)str[i]]; + } + + // loop through our string to process each k-mer + for(position = 0; position < (seq_length - kmer + 1); position++) { + unsigned long mer = 0; + unsigned long multiply = 1; + + // for each char in the k-mer check if it is an error char + for(i = position + kmer - 1; i >= position; i--){ + if(str[i] >> 2) { + mer = width; + position = i; + goto next; + } + + // multiply this char in the mer by the multiply + // and bitshift the multiply for the next round + mer += str[i] * multiply; + multiply = multiply << 2; + } + // use this point to get mer of our loop + next: + // bump up the mer value in the counts array + counts[mer]++; + } + } + + return counts; +} diff --git a/src/c/kmer_utils.h b/src/c/kmer_utils.h new file mode 100644 index 0000000..34cedca --- /dev/null +++ b/src/c/kmer_utils.h @@ -0,0 +1,11 @@ +// Kmer functions +void convert_kmer_to_num(char *str, const unsigned long length); +unsigned long long * get_kmer_counts_from_file(const char *fn, const int kmer); +unsigned long num_to_index(const char *str, const int kmer, int *position, const long error_pos); + +// Utility functions +char *strnstrip(const char *s, char *dest, int c, int len); +unsigned long long pow_four(unsigned long long x); + +// Variables +const unsigned char alpha[256]; diff --git a/src/c/quikr.c b/src/c/quikr.c index a954cce..8a238f6 100644 --- a/src/c/quikr.c +++ b/src/c/quikr.c @@ -3,12 +3,12 @@ #include <getopt.h> #include <math.h> #include <stdio.h> -#include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include "nnls.h" +#include "kmer_utils.h" #include "quikr_functions.h" #define sensing_matrix(i,j) (sensing_matrix[width*i + j]) @@ -136,6 +136,11 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } + if(kmer == 0) { + fprintf(stderr, "Error: zero is not a valid kmer\n"); + exit(EXIT_FAILURE); + } + // 4 "ACGT" ^ Kmer gives us the size of output rows width = pow(4, kmer); width = width + 1; @@ -149,24 +154,40 @@ int main(int argc, char **argv) { printf("width: %ld\nsequences %ld\n", width, sequences); } + + // load counts matrix + unsigned long long *integer_counts = get_kmer_counts_from_file(input_fasta_filename, kmer); + + double *count_matrix = malloc(sizeof(double) * width); + count_matrix[0] = 0; + + for(x = 1; x < width ; x++) + count_matrix[x] = (double)integer_counts[x-1]; + + free(integer_counts); + + // normalize our count_matrix + normalize_matrix(count_matrix, 1, width); + + for(x = 0; x < width; x++) + count_matrix[x] = count_matrix[x] * lambda; + + // load sensing matrix double *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, sequences, width); - double *count_matrix = load_count_matrix(input_fasta_filename, width, kmer); - // multiply our matrix by lambda + // multiply our sensing matrix by lambda for(x = 1; x < sequences; x++) { for(y= 0; y < width - 1; y++) { sensing_matrix(x, y) = sensing_matrix(x, y) * lambda; } } + // set the first column of our sensing matrix to 0 for(x = 0; x < sequences; x++) { sensing_matrix(x, 0) = 1.0; } - // normalize our count_matrix - normalize_matrix(count_matrix, 1, width); - for(x = 0; x < width; x++) - count_matrix[x] = count_matrix[x] * lambda; + // run NNLS double *solution = nnls(sensing_matrix, count_matrix, sequences, width); // normalize our solution vector |