aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2013-10-18 10:26:12 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2013-10-18 10:26:12 -0400
commitc1a34163771229aa269ada443c6baa38f3073c11 (patch)
treedaa87dd67e45ad910f750587e4769acda68e9eec
parent2f62ad7c4ec7ed5bae74c4444442dd871222e6c6 (diff)
add kmercounting code from dna-utils. update our make file to support debugging, and update quikr to count-kmer code internally
-rw-r--r--src/c/Makefile32
-rw-r--r--src/c/kmer_utils.c154
-rw-r--r--src/c/kmer_utils.h11
-rw-r--r--src/c/quikr.c35
4 files changed, 214 insertions, 18 deletions
diff --git a/src/c/Makefile b/src/c/Makefile
index e927b13..1f59312 100644
--- a/src/c/Makefile
+++ b/src/c/Makefile
@@ -1,17 +1,27 @@
VERSION=\"v1.0.4\"
UNAME := $(shell uname)
CC = gcc
-QUIKR_TRAIN_CFLAGS = -O3 -s -mtune=native -Wall -lm -lz -D$(UNAME) -DVERSION=$(VERSION)
-MULTIFASTA_CFLAGS = -O3 -s -mtune=native -Wextra -Wall -lm -lz -pthread -L../ -I../ -std=gnu99 -fopenmp -D$(UNAME) -DVERSION=$(VERSION)
-QUIKR_CFLAGS= -O3 -s -mtune=native -Wextra -Wall -lm -lz -std=gnu99 -D$(UNAME) -DVERSION=$(VERSION)
+QUIKR_TRAIN_CFLAGS = -D$(UNAME) -DVERSION=$(VERSION)
+MULTIFASTA_CFLAGS = -pthread -L../ -I../ -std=gnu99 -fopenmp -D$(UNAME) -DVERSION=$(VERSION)
+QUIKR_CFLAGS = -std=gnu99 -D$(UNAME) -DVERSION=$(VERSION)
-all: quikr_train_ quikr_ multifasta_to_otu_
+ifndef DEBUG
+CFLAGS = -O3 -s -mtune=native -Wextra -Wall -lm -lz
+else
+CFLAGS = -ggdb3 -O0 -Wall -Wextra -lm -lz
+endif
-multifasta_to_otu_:
- $(CC) multifasta_to_otu.c quikr_functions.c nnls.c -o multifasta_to_otu $(MULTIFASTA_CFLAGS)
-quikr_train_:
- $(CC) quikr_train.c quikr_functions.c -o quikr_train $(QUIKR_TRAIN_CFLAGS)
-quikr_:
- $(CC) quikr.c quikr_functions.c nnls.c -o quikr $(QUIKR_CFLAGS) -I../
+all: nnls.o kmer_utils.o quikr_train quikr multifasta_to_otu
+
+nnls.o: nnls.c
+ $(CC) -c nnls.c -o nnls.o $(CFLAGS)
+kmer_utils.o: kmer_utils.c
+ $(CC) -c kmer_utils.c -o kmer_utils.o $(CFLAGS)
+multifasta_to_otu: kmer_utils.o nnls.o multifasta_to_otu.c
+ $(CC) multifasta_to_otu.c quikr_functions.c nnls.o kmer_utils.o -o multifasta_to_otu $(CFLAGS) $(MULTIFASTA_CFLAGS)
+quikr_train: kmer_utils.o quikr_train.c
+ $(CC) quikr_train.c quikr_functions.c -o quikr_train $(CFLAGS) $(QUIKR_TRAIN_CFLAGS)
+quikr: kmer_utils.o nnls.o quikr.c
+ $(CC) quikr.c quikr_functions.c nnls.o kmer_utils.o -o quikr $(CFLAGS) $(QUIKR_CFLAGS) -I../
clean:
- rm -vf quikr_train quikr multifasta_to_otu
+ rm -vf quikr_train quikr multifasta_to_otu *.o
diff --git a/src/c/kmer_utils.c b/src/c/kmer_utils.c
new file mode 100644
index 0000000..cf8d666
--- /dev/null
+++ b/src/c/kmer_utils.c
@@ -0,0 +1,154 @@
+// Copyright 2013 Calvin Morrison
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+const unsigned char alpha[256] =
+{5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
+
+inline unsigned long long pow_four(unsigned long long x) {
+ return (unsigned long long)1 << (x * 2);
+}
+
+// convert a string of k-mer size base-4 values into a
+// base-10 index
+inline unsigned long num_to_index(const char *str, const int kmer, int *position, const long error_pos) {
+
+ int i = 0;
+ unsigned long out = 0;
+ unsigned long multiply = 1;
+
+ for(i = kmer - 1; i >= 0; i--){
+
+ if(str[i] >> 2) {
+ position += i;
+ return error_pos;
+ }
+
+ out += str[i] * multiply;
+ multiply = multiply << 2;
+ }
+
+ return out;
+}
+
+// Strip out any character 'c' from char array 's' into a destination dest (you
+// need to allocate that) and copy only len characters.
+char *strnstrip(const char *s, char *dest, int c, unsigned long long len) {
+
+ unsigned long long i = 0;
+ unsigned long long j = 0;
+
+ for(i = 0; i < len; i++) {
+ if(s[i] != c) {
+ dest[j] = s[i];
+ j++;
+ }
+ }
+
+ dest[j] = '\0';
+
+ return dest;
+}
+
+unsigned long long * get_kmer_counts_from_file(const char *fn, const unsigned int kmer) {
+
+ char *line = NULL;
+ size_t len = 0;
+ ssize_t read;
+
+ long long i = 0;
+ long long position = 0;
+
+ FILE * const fh = fopen(fn, "r");
+ if(fh == NULL) {
+ fprintf(stderr, "Error opening %s - %s\n", fn, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ // width is 4^kmer
+ // there's a sneaky bitshift to avoid pow dependency
+ const unsigned long width = pow_four(kmer);
+
+ // malloc our return array
+ unsigned long long * counts = malloc((width+ 1) * sizeof(unsigned long long));
+ memset(counts, 0, width);
+ if(counts == NULL) {
+ fprintf(stderr, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ char *str = malloc(4096);
+ if(str == NULL) {
+ fprintf(stderr, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ unsigned long long str_size = 4096;
+
+ while ((read = getdelim(&line, &len, '>', fh)) != -1) {
+
+ // find our first \n, this should be the end of the header
+ char *start = strchr(line, '\n');
+ if(start == NULL)
+ continue;
+
+ size_t start_len = strlen(start);
+
+
+ // if our current str buffer isn't big enough, realloc
+ if(start_len + 1 > str_size + 1) {
+ str = realloc(str, start_len + 1);
+ if(str == NULL) {
+ exit(EXIT_FAILURE);
+ fprintf(stderr, strerror(errno));
+ }
+ }
+
+ // strip out all other newlines to handle multiline sequences
+ str = strnstrip(start, str, '\n',start_len);
+ size_t seq_length = strlen(str);
+
+ // relace A, C, G and T with 0, 1, 2, 3 respectively
+ // everything else is 5
+ for(i = 0; i < seq_length; i++) {
+ str[i] = alpha[(int)str[i]];
+ }
+
+ // loop through our string to process each k-mer
+ for(position = 0; position < (seq_length - kmer + 1); position++) {
+ unsigned long mer = 0;
+ unsigned long multiply = 1;
+
+ // for each char in the k-mer check if it is an error char
+ for(i = position + kmer - 1; i >= position; i--){
+ if(str[i] >> 2) {
+ mer = width;
+ position = i;
+ goto next;
+ }
+
+ // multiply this char in the mer by the multiply
+ // and bitshift the multiply for the next round
+ mer += str[i] * multiply;
+ multiply = multiply << 2;
+ }
+ // use this point to get mer of our loop
+ next:
+ // bump up the mer value in the counts array
+ counts[mer]++;
+ }
+ }
+
+ return counts;
+}
diff --git a/src/c/kmer_utils.h b/src/c/kmer_utils.h
new file mode 100644
index 0000000..34cedca
--- /dev/null
+++ b/src/c/kmer_utils.h
@@ -0,0 +1,11 @@
+// Kmer functions
+void convert_kmer_to_num(char *str, const unsigned long length);
+unsigned long long * get_kmer_counts_from_file(const char *fn, const int kmer);
+unsigned long num_to_index(const char *str, const int kmer, int *position, const long error_pos);
+
+// Utility functions
+char *strnstrip(const char *s, char *dest, int c, int len);
+unsigned long long pow_four(unsigned long long x);
+
+// Variables
+const unsigned char alpha[256];
diff --git a/src/c/quikr.c b/src/c/quikr.c
index a954cce..8a238f6 100644
--- a/src/c/quikr.c
+++ b/src/c/quikr.c
@@ -3,12 +3,12 @@
#include <getopt.h>
#include <math.h>
#include <stdio.h>
-#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "nnls.h"
+#include "kmer_utils.h"
#include "quikr_functions.h"
#define sensing_matrix(i,j) (sensing_matrix[width*i + j])
@@ -136,6 +136,11 @@ int main(int argc, char **argv) {
exit(EXIT_FAILURE);
}
+ if(kmer == 0) {
+ fprintf(stderr, "Error: zero is not a valid kmer\n");
+ exit(EXIT_FAILURE);
+ }
+
// 4 "ACGT" ^ Kmer gives us the size of output rows
width = pow(4, kmer);
width = width + 1;
@@ -149,24 +154,40 @@ int main(int argc, char **argv) {
printf("width: %ld\nsequences %ld\n", width, sequences);
}
+
+ // load counts matrix
+ unsigned long long *integer_counts = get_kmer_counts_from_file(input_fasta_filename, kmer);
+
+ double *count_matrix = malloc(sizeof(double) * width);
+ count_matrix[0] = 0;
+
+ for(x = 1; x < width ; x++)
+ count_matrix[x] = (double)integer_counts[x-1];
+
+ free(integer_counts);
+
+ // normalize our count_matrix
+ normalize_matrix(count_matrix, 1, width);
+
+ for(x = 0; x < width; x++)
+ count_matrix[x] = count_matrix[x] * lambda;
+
+ // load sensing matrix
double *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, sequences, width);
- double *count_matrix = load_count_matrix(input_fasta_filename, width, kmer);
- // multiply our matrix by lambda
+ // multiply our sensing matrix by lambda
for(x = 1; x < sequences; x++) {
for(y= 0; y < width - 1; y++) {
sensing_matrix(x, y) = sensing_matrix(x, y) * lambda;
}
}
+ // set the first column of our sensing matrix to 0
for(x = 0; x < sequences; x++) {
sensing_matrix(x, 0) = 1.0;
}
- // normalize our count_matrix
- normalize_matrix(count_matrix, 1, width);
- for(x = 0; x < width; x++)
- count_matrix[x] = count_matrix[x] * lambda;
+ // run NNLS
double *solution = nnls(sensing_matrix, count_matrix, sequences, width);
// normalize our solution vector