From 719c45d9ea49f33f087db899737ad5af4fc4414f Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Tue, 18 Feb 2014 12:42:11 -0500 Subject: working sparse implementation --- Makefile | 8 +++-- kmer_total_count.c | 45 ++++++---------------------- kmer_utils.c | 32 ++++++++++++-------- kmer_utils.h | 4 +-- sparse.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ sparse.h | 17 +++++++++++ 6 files changed, 140 insertions(+), 54 deletions(-) create mode 100644 sparse.c create mode 100644 sparse.h diff --git a/Makefile b/Makefile index c929cfd..e51a508 100644 --- a/Makefile +++ b/Makefile @@ -1,17 +1,19 @@ VERSION=\"0.0.2\" CC = gcc -CFLAGS = -O3 -s -mtune=native -Wall -DVERSION=$(VERSION) -Wextra +CFLAGS = -O3 -s -mtune=native -Wall -DVERSION=$(VERSION) -Wextra -lcsparse DESTDIR = /usr/local/ all: libkmer.o libkmer.so kmer_total_count kmer_counts_per_sequence +sparse.o: sparse.c + $(CC) -c sparse.c -o sparse.o $(CFLAGS) libkmer.o: kmer_utils.c $(CC) -c kmer_utils.c -o libkmer.o $(CFLAGS) -fPIC libkmer.so: libkmer.o $(CC) kmer_utils.c -o libkmer.so $(CFLAGS) -shared -fPIC -kmer_total_count: libkmer.o kmer_total_count.c kmer_utils.h - $(CC) libkmer.o kmer_total_count.c -o kmer_total_count $(CLIBS) $(CFLAGS) +kmer_total_count: sparse.o libkmer.o kmer_total_count.c kmer_utils.h + $(CC) sparse.o libkmer.o kmer_total_count.c -o kmer_total_count $(CLIBS) $(CFLAGS) kmer_counts_per_sequence: libkmer.o kmer_counts_per_sequence.c kmer_utils.h $(CC) libkmer.o kmer_counts_per_sequence.c -o kmer_counts_per_sequence $(CLIBS) $(CFLAGS) diff --git a/kmer_total_count.c b/kmer_total_count.c index 6b627f2..87f1b69 100644 --- a/kmer_total_count.c +++ b/kmer_total_count.c @@ -6,6 +6,7 @@ #include #include +#include "sparse.h" #include "kmer_utils.h" @@ -115,43 +116,15 @@ int main(int argc, char **argv) { width = pow_four(kmer); - unsigned long long *counts = get_kmer_counts_from_file(fh, kmer); - - // If nonzero is set, only print non zeros - if(nonzero) { - // if labels is set, print out our labels - if(label) { - for(i = 0; i < width; i++) - if(counts[i] != 0) { - char *kmer_str = index_to_kmer(i, kmer); - fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]); - free(kmer_str); - } - - } - else { - for(i = 0; i < width; i++) - if(counts[i] != 0) - fprintf(stdout, "%llu\t%llu\n", i, counts[i]); - - } - } - // If we aren't printing nonzeros print everything - else { - if(label) { - for(i = 0; i < width; i++) { - char *kmer_str = index_to_kmer(i, kmer); - fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]); - free(kmer_str); - } - } - else { - for(i = 0; i < width; i=i+4) { - fprintf(stdout, "%llu\n%llu\n%llu\n%llu\n", counts[i], counts[i+1], counts[i+2], counts[i+3]); - } - } + if(kmer > 12) { + node *root = get_kmer_counts_from_file(fh, kmer); + print_sparse(root, label, nonzero, kmer); } + //else { +// unsigned long long *counts = get_kmer_counts_from_file(fh, kmer); +// print(countst, label, nonzero); + //free(counts); +// } - free(counts); return EXIT_SUCCESS; } diff --git a/kmer_utils.c b/kmer_utils.c index 060e311..f2057ee 100644 --- a/kmer_utils.c +++ b/kmer_utils.c @@ -4,6 +4,7 @@ #include #include +#include "sparse.h" const unsigned char alpha[256] = {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, @@ -156,8 +157,8 @@ size_t strnstrip(char *s, int c, size_t len) { } -unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer) { - +node * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer) { + char *line = NULL; size_t len = 0; ssize_t read; @@ -166,15 +167,11 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer long long position = 0; // width is 4^kmer - // there's a sneaky bitshift to avoid pow dependency - const unsigned long width = pow_four(kmer); + const unsigned long long width = pow_four(kmer); + + node *root = NULL; + - // malloc our return array - unsigned long long * counts = calloc((width+ 1), sizeof(unsigned long long)); - if(counts == NULL) { - fprintf(stderr, strerror(errno)); - exit(EXIT_FAILURE); - } while ((read = getdelim(&line, &len, '>', fh)) != -1) { size_t k; @@ -219,15 +216,24 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer } // use this point to get mer of our loop next: - // bump up the mer value in the counts array - counts[mer]++; + { + // bump up the mer value in the counts array + node *tmp = search(&root, mer); + if(tmp) { + tmp->value++; + } + else { + insert(&root, mer); + } + } } } free(line); fclose(fh); - return counts; + return root; + } unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer) { diff --git a/kmer_utils.h b/kmer_utils.h index ceb28eb..bf60dc4 100644 --- a/kmer_utils.h +++ b/kmer_utils.h @@ -12,5 +12,5 @@ const unsigned char alpha[256]; // file loading functions unsigned long long load_specific_mers_from_file(const char *fn, unsigned int kmer, size_t width, size_t *arr); -unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer); -unsigned long long * get_kmer_counts_from_file(FILE *fh, const int kmer); +node * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer); +node * get_kmer_counts_from_file(FILE *fh, const int kmer); diff --git a/sparse.c b/sparse.c new file mode 100644 index 0000000..9ab6902 --- /dev/null +++ b/sparse.c @@ -0,0 +1,88 @@ +#include +#include +#include +#include +#include + +struct bin_tree { + size_t index; + unsigned long long value; + struct bin_tree *right; + struct bin_tree *left; +}; + +typedef struct bin_tree node; + +void insert(node ** tree, size_t index) { + node *temp = NULL; + if(!(*tree)) { + temp = (node *)malloc(sizeof(node)); + temp->left = temp->right = NULL; + temp->index = index; + temp->value = 1; + *tree = temp; + return; + } + + if(index < (*tree)->index) + insert(&(*tree)->left, index); + else if(index > (*tree)->index) + insert(&(*tree)->right, index); + +} + +void deltree(node * tree) { + if (tree) { + deltree(tree->left); + deltree(tree->right); + free(tree); + } +} + +node* search(node ** tree, size_t index) { + if(!(*tree)) + return NULL; + + if(index < (*tree)->index) + search(&((*tree)->left), index); + + else if(index > (*tree)->index) + search(&((*tree)->right), index); + + else if(index == (*tree)->index) + return *tree; +} + +unsigned long long *lookup(node **tree, size_t index) { + node *ret = search(tree, index); + if(ret) + return ret->value; + else + return 0; +} + +void print_sparse(node *tree, bool label, bool nonzero, unsigned int kmer) { + + if (tree) { + print_sparse(tree->left, label, nonzero, kmer); + if(label && nonzero) { + char *kmer_str = index_to_kmer(tree->index, kmer); + if (tree->value != 0) + fprintf(stdout, "%s\t%llu\n", kmer_str, tree->value); + free(kmer_str); + } + else if(label) { + char *kmer_str = index_to_kmer(tree->index, kmer); + fprintf(stdout, "%s\t%llu\n", kmer_str, tree->value); + free(kmer_str); + } + else if(nonzero) { + if (tree->value != 0) + fprintf(stdout, "%zu\t%llu\n", tree->index, tree->value); + } + else + fprintf(stdout, "%llu\n", tree->value); + + print_sparse(tree->right, label, nonzero, kmer); + } +} diff --git a/sparse.h b/sparse.h new file mode 100644 index 0000000..2ffbabd --- /dev/null +++ b/sparse.h @@ -0,0 +1,17 @@ +#include + +struct b_tree { + size_t index; + unsigned long long value; + struct b_tree *right; + struct b_tree *left; +}; + +typedef struct b_tree node; + +node* search(node **tree, size_t index); +void insert(node **tree, size_t index); +void deltree(node *tree); +unsigned long long lookup(node **tree, size_t index); + +void print_sparse(node *tree, bool label, bool nonzero, unsigned int kmer); -- cgit v1.2.3