From c038740be3e0dec1798c0c660081f1b6d2445907 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Wed, 26 Feb 2014 13:38:43 -0500 Subject: update sparse --- Makefile | 2 +- kmer_total_count.c | 27 +++++------ kmer_utils.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++++- kmer_utils.h | 10 ++-- sparse.c | 26 ---------- 5 files changed, 158 insertions(+), 46 deletions(-) diff --git a/Makefile b/Makefile index e51a508..5a1f34d 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION=\"0.0.2\" CC = gcc -CFLAGS = -O3 -s -mtune=native -Wall -DVERSION=$(VERSION) -Wextra -lcsparse +CFLAGS = -O3 -s -mtune=native -Wall -DVERSION=$(VERSION) -Wextra DESTDIR = /usr/local/ diff --git a/kmer_total_count.c b/kmer_total_count.c index 87f1b69..12f9ab4 100644 --- a/kmer_total_count.c +++ b/kmer_total_count.c @@ -18,6 +18,7 @@ void help() { " --kmer -k size of mers to count\n" " --nonzero -n only print non-zero values\n" " --label -l print mer along with value\n" + " --sparse -s force sparse table for any mer\n" "\n" "Report all bugs to mutantturkey@gmail.com\n" "\n" @@ -40,16 +41,14 @@ int main(int argc, char **argv) { bool nonzero = false; bool label = false; bool kmer_set = false; - - unsigned long long width = 0; - - unsigned long long i = 0; + bool force_sparse = false; static struct option long_options[] = { {"input", required_argument, 0, 'i'}, {"kmer", required_argument, 0, 'k'}, {"nonzero", no_argument, 0, 'n'}, {"label", no_argument, 0, 'l'}, + {"sparse", no_argument, 0, 's'}, {"help", no_argument, 0, 'h'}, {0, 0, 0, 0} }; @@ -59,7 +58,7 @@ int main(int argc, char **argv) { int option_index = 0; int c = 0; - c = getopt_long (argc, argv, "i:k:nlvh", long_options, &option_index); + c = getopt_long (argc, argv, "i:k:nslvh", long_options, &option_index); if (c == -1) break; @@ -78,6 +77,9 @@ int main(int argc, char **argv) { case 'l': label = true; break; + case 's': + force_sparse = true; + break; case 'h': help(); exit(EXIT_SUCCESS); @@ -114,17 +116,14 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } - width = pow_four(kmer); - - if(kmer > 12) { - node *root = get_kmer_counts_from_file(fh, kmer); + if(kmer > 12 || force_sparse) { + node *root = get_sparse_kmer_counts_from_file(fh, kmer); print_sparse(root, label, nonzero, kmer); } - //else { -// unsigned long long *counts = get_kmer_counts_from_file(fh, kmer); -// print(countst, label, nonzero); - //free(counts); -// } + else { + unsigned long long *counts = get_dense_kmer_counts_from_file(fh, kmer); + print_dense(counts, label, nonzero, kmer); + } return EXIT_SUCCESS; } diff --git a/kmer_utils.c b/kmer_utils.c index f2057ee..b456f2f 100644 --- a/kmer_utils.c +++ b/kmer_utils.c @@ -157,7 +157,7 @@ size_t strnstrip(char *s, int c, size_t len) { } -node * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer) { +node * get_sparse_kmer_counts_from_file(FILE *fh, const unsigned int kmer) { char *line = NULL; size_t len = 0; @@ -236,6 +236,75 @@ node * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer) { } +unsigned long long * get_dense_kmer_counts_from_file(FILE *fh, const unsigned int kmer) { + + char *line = NULL; + size_t len = 0; + ssize_t read; + + long long i = 0; + long long position = 0; + + // width is 4^kmer + const unsigned long long width = pow_four(kmer); + + unsigned long long *counts = malloc(width+1 * sizeof(unsigned long long)); + while ((read = getdelim(&line, &len, '>', fh)) != -1) { + size_t k; + char *seq; + size_t seq_length; + + // find our first \n, this should be the end of the header + seq = strchr(line, '\n'); + if(seq == NULL) + continue; + + // point to one past that. + seq = seq + 1; + + // strip out all other newlines to handle multiline sequences + strnstrip(seq, '\n', strlen(seq)); + seq_length = strlen(seq); + + // relace A, C, G and T with 0, 1, 2, 3 respectively + // everything else is 5 + for(k = 0; k < seq_length; k++) + seq[k] = alpha[(int)seq[k]]; + + + // loop through our seq to process each k-mer + for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { + unsigned long long mer = 0; + unsigned long long multiply = 1; + + // for each char in the k-mer check if it is an error char + for(i = position + kmer - 1; i >= position; i--){ + if(seq[i] == 5) { + mer = width; + position = i; + goto next; + } + + // multiply this char in the mer by the multiply + // and bitshift the multiply for the next round + mer += seq[i] * multiply; + multiply = multiply << 2; + } + // use this point to get mer of our loop + next: + { + // bump up the mer value in the counts array + counts[mer]++; + } + } + } + + //free(line); + fclose(fh); + return counts; + +} + unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer) { FILE *fh = fopen(fn, "r"); if(fh == NULL) { @@ -243,6 +312,72 @@ unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigne return 0; } - return get_kmer_counts_from_file(fh, kmer); + return get_dense_kmer_counts_from_file(fh, kmer); } +void print_dense(unsigned long long *counts, bool label, bool nonzero, unsigned int kmer) { + size_t width = pow_four(kmer); + size_t i = 0; + + // If nonzero is set, only print non zeros + if(nonzero) { + // if labels is set, print out our labels + if(label) { + for(i = 0; i < width; i++) + if(counts[i] != 0) { + char *kmer_str = index_to_kmer(i, kmer); + fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]); + free(kmer_str); + } + + } + else { + for(i = 0; i < width; i++) + if(counts[i] != 0) + fprintf(stdout, "%llu\t%llu\n", i, counts[i]); + + } + } + // If we aren't printing nonzeros print everything + else { + if(label) { + for(i = 0; i < width; i++) { + char *kmer_str = index_to_kmer(i, kmer); + fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]); + free(kmer_str); + } + } + else { + for(i = 0; i < width; i=i+4) { + fprintf(stdout, "%llu\n%llu\n%llu\n%llu\n", counts[i], counts[i+1], counts[i+2], counts[i+3]); + } + } + + } +} + +void print_sparse(node *tree, bool label, bool nonzero, unsigned int kmer) { + + if (tree) { + print_sparse(tree->left, label, nonzero, kmer); + if(label && nonzero) { + char *kmer_str = index_to_kmer(tree->index, kmer); + if (tree->value != 0) + fprintf(stdout, "%s\t%llu\n", kmer_str, tree->value); + free(kmer_str); + } + else if(label) { + char *kmer_str = index_to_kmer(tree->index, kmer); + fprintf(stdout, "%s\t%llu\n", kmer_str, tree->value); + free(kmer_str); + } + else if(nonzero) { + if (tree->value != 0) + fprintf(stdout, "%zu\t%llu\n", tree->index, tree->value); + } + else + fprintf(stdout, "%llu\n", tree->value); + + print_sparse(tree->right, label, nonzero, kmer); + } +} diff --git a/kmer_utils.h b/kmer_utils.h index bf60dc4..af13227 100644 --- a/kmer_utils.h +++ b/kmer_utils.h @@ -11,6 +11,10 @@ unsigned long long pow_four(unsigned long long x); const unsigned char alpha[256]; // file loading functions -unsigned long long load_specific_mers_from_file(const char *fn, unsigned int kmer, size_t width, size_t *arr); -node * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer); -node * get_kmer_counts_from_file(FILE *fh, const int kmer); +node * get_sparse_kmer_counts_from_filename(const char *fn, const unsigned int kmer); +node * get_sparse_kmer_counts_from_file(FILE *fh, const int kmer); + +unsigned long long * get_dense_kmer_counts_from_file(FILE *fh, const unsigned int kmer); + + +void print_dense(unsigned long long *counts, bool label, bool nonzero, unsigned int kmer); diff --git a/sparse.c b/sparse.c index 9ab6902..30ce2b1 100644 --- a/sparse.c +++ b/sparse.c @@ -60,29 +60,3 @@ unsigned long long *lookup(node **tree, size_t index) { else return 0; } - -void print_sparse(node *tree, bool label, bool nonzero, unsigned int kmer) { - - if (tree) { - print_sparse(tree->left, label, nonzero, kmer); - if(label && nonzero) { - char *kmer_str = index_to_kmer(tree->index, kmer); - if (tree->value != 0) - fprintf(stdout, "%s\t%llu\n", kmer_str, tree->value); - free(kmer_str); - } - else if(label) { - char *kmer_str = index_to_kmer(tree->index, kmer); - fprintf(stdout, "%s\t%llu\n", kmer_str, tree->value); - free(kmer_str); - } - else if(nonzero) { - if (tree->value != 0) - fprintf(stdout, "%zu\t%llu\n", tree->index, tree->value); - } - else - fprintf(stdout, "%llu\n", tree->value); - - print_sparse(tree->right, label, nonzero, kmer); - } -} -- cgit v1.2.3