From d8578f338104287b4af59cbadb01f0e45843962d Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Wed, 9 Apr 2014 17:12:09 -0400 Subject: MERGE sparse trunk into master --- Makefile | 33 ++++--- kmer_counts_per_sequence.c | 44 ++++++++- kmer_total_count.c | 91 ++++++++--------- kmer_utils.c | 242 +++++++++++++++++++++++++++++++++++---------- 4 files changed, 297 insertions(+), 113 deletions(-) diff --git a/Makefile b/Makefile index 4c457fe..1710203 100644 --- a/Makefile +++ b/Makefile @@ -1,26 +1,27 @@ -VERSION=\"0.0.4\" -CC = gcc -CFLAGS = -O3 -s -mtune=native -Wall -DVERSION=$(VERSION) -Wextra +VERSION=\"0.0.5\" +CC = g++ +CFLAGS = -O3 -s -mtune=native -Wall -Wextra -DVERSION=$(VERSION) -std=c++11 DESTDIR = /usr/local/ +all: libkmer.so kmer_total_count kmer_counts_per_sequence -all: libkmer.o libkmer.so kmer_total_count kmer_counts_per_sequence kmer_continuous_count - -libkmer.o: kmer_utils.c - $(CC) -c kmer_utils.c -o libkmer.o $(CFLAGS) -fPIC -libkmer.so: libkmer.o +libkmer.so: kmer_utils.o $(CC) kmer_utils.c -o libkmer.so $(CFLAGS) -shared -fPIC -kmer_total_count: libkmer.o kmer_total_count.c kmer_utils.h - $(CC) libkmer.o kmer_total_count.c -o kmer_total_count $(CLIBS) $(CFLAGS) -kmer_counts_per_sequence: libkmer.o kmer_counts_per_sequence.c kmer_utils.h - $(CC) libkmer.o kmer_counts_per_sequence.c -o kmer_counts_per_sequence $(CLIBS) $(CFLAGS) -kmer_continuous_count: libkmer.o kmer_continuous_count.c kmer_utils.h - $(CC) libkmer.o kmer_continuous_count.c -o kmer_continuous_count $(CLIBS) $(CFLAGS) + +kmer_total_count: kmer_utils.c kmer_total_count.c kmer_utils.h + $(CC) kmer_utils.c kmer_total_count.c -o kmer_total_count $(CLIBS) $(CFLAGS) + +kmer_counts_per_sequence: kmer_utils.c kmer_counts_per_sequence.c kmer_utils.h + $(CC) kmer_utils.c kmer_counts_per_sequence.c -o kmer_counts_per_sequence $(CLIBS) $(CFLAGS) + +kmer_continuous_count: kmer_utils.c kmer_continuous_count.c kmer_utils.h + $(CC) kmer_utils.c kmer_continuous_count.c -o kmer_continuous_count $(CLIBS) $(CFLAGS) clean: - rm -vf kmer_total_count kmer_counts_per_sequence kmer_continuous_count libkmer.so libkmer.o + rm -vf kmer_total_count kmer_counts_per_sequence kmer_continuous_count kmer_utils.o libkmer.so + rm -vf kmer_total_count kmer_counts_per_sequence kmer_continuous_count libkmer.so kmer_utils.o -debug: CFLAGS = -ggdb -Wall -Wextra -DVERSION=$(VERSION)\"-debug\" +debug: CFLAGS = -ggdb -Wall -Wextra -DVERSION=$(VERSION)\"-debug\" -std=c++11 debug: all install: all diff --git a/kmer_counts_per_sequence.c b/kmer_counts_per_sequence.c index 2b2dbef..e9ca812 100644 --- a/kmer_counts_per_sequence.c +++ b/kmer_counts_per_sequence.c @@ -30,7 +30,43 @@ void help() { "dna-utils. Drexel University, Philadelphia USA, 2014;\n" "software available at www.github.com/EESI/dna-utils/\n"); } +static void inc(kmer_map *map, size_t index) { + (*map)[index]++; +} + +static void inc(unsigned long long *counts, size_t index) { + counts[index]++; +} + +template +void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, array_type *counts) { + long long position; + long long i; + + // loop through our seq to process each k-mer + for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { + unsigned long long mer = 0; + unsigned long long multiply = 1; + + // for each char in the k-mer check if it is an error char + for(i = position + kmer - 1; i >= position; i--){ + if(seq[i] == 5) { + position = i; + goto next; + } + // multiply this char in the mer by the multiply + // and bitshift the multiply for the next round + mer += seq[i] * multiply; + multiply = multiply << 2; + } + // bump up the mer value in the counts array + inc(counts, mer); + + // skip count if error + next: ; + } +} int main(int argc, char **argv) { @@ -135,8 +171,10 @@ int main(int argc, char **argv) { width = pow_four(kmer); if(specific_mers) { - desired_indicies = malloc((width) * sizeof(size_t)); - check_null_ptr(desired_indicies, NULL); + sparse = false; + desired_indicies = (size_t *) malloc((width) * sizeof(size_t)); + if(desired_indicies == NULL) + exit(EXIT_FAILURE); num_desired_indicies = load_specific_mers_from_file(mer_fn, kmer, width, desired_indicies); if(num_desired_indicies == 0) { fprintf(stderr, "Error: no mers loaded from file\n"); @@ -145,7 +183,7 @@ int main(int argc, char **argv) { } - unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long)); + unsigned long long *counts = (unsigned long long *) malloc((width+ 1) * sizeof(unsigned long long)); if(counts == NULL) { fprintf(stderr, "%s\n", strerror(errno)); exit(EXIT_FAILURE); diff --git a/kmer_total_count.c b/kmer_total_count.c index dd29a53..c3ae070 100644 --- a/kmer_total_count.c +++ b/kmer_total_count.c @@ -8,16 +8,46 @@ #include "kmer_utils.h" +template +void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, array_type *counts) { + long long position; + long long i; + + // loop through our seq to process each k-mer + for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { + unsigned long long mer = 0; + unsigned long long multiply = 1; + + // for each char in the k-mer check if it is an error char + for(i = position + kmer - 1; i >= position; i--){ + if(seq[i] == 5) { + position = i; + goto next; + } + + // multiply this char in the mer by the multiply + // and bitshift the multiply for the next round + mer += seq[i] * multiply; + multiply = multiply << 2; + } + // bump up the mer value in the counts array + inc(counts, mer); + + // skip count if error + next: ; + } +} void help() { printf("usage: kmer_total_count -i input_file -k kmer [-c] [-n] [-l] ...\n\n" "count mers in size k from a fasta file\n" "\n" - " --input -i input fasta file to count\n" - " --kmer -k size of mers to count\n" + " --input -i input fasta file to count\n" + " --kmer -k size of mers to count\n" " --compliment -c count compliment of sequences\n" - " --nonzero -n only print non-zero values\n" - " --label -l print mer along with value\n" + " --nonzero -n only print non-zero values\n" + " --label -l print mer along with value\n" + " --sparse -s force sparse table for any mer\n" "\n" "Report all bugs to mutantturkey@gmail.com\n" "\n" @@ -41,10 +71,7 @@ int main(int argc, char **argv) { bool label = false; bool kmer_set = false; bool count_compliment = false; - - unsigned long long width = 0; - - unsigned long long i = 0; + bool force_sparse = false; static struct option long_options[] = { {"input", required_argument, 0, 'i'}, @@ -52,6 +79,7 @@ int main(int argc, char **argv) { {"compliment", required_argument, 0, 'c'}, {"nonzero", no_argument, 0, 'n'}, {"label", no_argument, 0, 'l'}, + {"sparse", no_argument, 0, 's'}, {"help", no_argument, 0, 'h'}, {0, 0, 0, 0} }; @@ -61,7 +89,7 @@ int main(int argc, char **argv) { int option_index = 0; int c = 0; - c = getopt_long (argc, argv, "i:k:cnlvh", long_options, &option_index); + c = getopt_long (argc, argv, "i:k:cnslvh", long_options, &option_index); if (c == -1) break; @@ -83,6 +111,9 @@ int main(int argc, char **argv) { case 'l': label = true; break; + case 's': + force_sparse = true; + break; case 'h': help(); exit(EXIT_SUCCESS); @@ -119,45 +150,17 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } - width = pow_four(kmer); + if(kmer > 12 || force_sparse) { + kmer_map *counts = NULL; + kmer_map *res = get_kmer_counts_from_file(counts, fh, kmer, count_compliment); - unsigned long long *counts = get_kmer_counts_from_file(fh, kmer, count_compliment); - - // If nonzero is set, only print non zeros - if(nonzero) { - // if labels is set, print out our labels - if(label) { - for(i = 0; i < width; i++) - if(counts[i] != 0) { - char *kmer_str = index_to_kmer(i, kmer); - fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]); - free(kmer_str); - } - - } - else { - for(i = 0; i < width; i++) - if(counts[i] != 0) - fprintf(stdout, "%llu\t%llu\n", i, counts[i]); - - } + print_kmer(res, label, nonzero, kmer); } - // If we aren't printing nonzeros print everything else { - if(label) { - for(i = 0; i < width; i++) { - char *kmer_str = index_to_kmer(i, kmer); - fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]); - free(kmer_str); - } - } - else { - for(i = 0; i < width; i=i+4) { - fprintf(stdout, "%llu\n%llu\n%llu\n%llu\n", counts[i], counts[i+1], counts[i+2], counts[i+3]); - } - } + unsigned long long *counts = NULL; + unsigned long long *res = get_kmer_counts_from_file(counts, fh, kmer, count_compliment); + print_kmer(res, label, nonzero, kmer); } - free(counts); return EXIT_SUCCESS; } diff --git a/kmer_utils.c b/kmer_utils.c index 7703f0a..1b94b08 100644 --- a/kmer_utils.c +++ b/kmer_utils.c @@ -5,6 +5,23 @@ #include #include +#include + +using namespace std; + +typedef struct { + size_t operator() (const size_t &k) const { + return k; + } +} kmer_noHash_hash; + +typedef struct { + bool operator() (const size_t &x, const size_t &y) const { + return x == y; + } +} kmer_eq; + +typedef unordered_map kmer_map; const unsigned char alpha[256] = {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, @@ -52,35 +69,6 @@ void reverse_string(char *s, size_t len) { } } -void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, unsigned long long *counts) { - long long position; - long long i; - - // loop through our seq to process each k-mer - for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { - unsigned long long mer = 0; - unsigned long long multiply = 1; - - // for each char in the k-mer check if it is an error char - for(i = position + kmer - 1; i >= position; i--){ - if(seq[i] == 5) { - position = i; - goto next; - } - - // multiply this char in the mer by the multiply - // and bitshift the multiply for the next round - mer += seq[i] * multiply; - multiply = multiply << 2; - } - // bump up the mer value in the counts array - counts[mer]++; - - // skip count if error - next: ; - } -} - // convert a string of k-mer size base-4 values into a // base-10 index unsigned long num_to_index(const char *str, const int kmer, const long error_pos, long long *current_position) { @@ -103,7 +91,7 @@ unsigned long num_to_index(const char *str, const int kmer, const long error_pos } // return the number of loaded elements -unsigned long long load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) { +size_t load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) { FILE *fh; size_t arr_pos = 0; char line[64]; @@ -152,8 +140,8 @@ char *index_to_kmer(unsigned long long index, long kmer) { size_t i = 0; size_t j = 0; - char *num_array = calloc(kmer, sizeof(char)); - char *ret = calloc(kmer + 1, sizeof(char)); + char *num_array = (char *) calloc(kmer, sizeof(char)); + char *ret = (char *) calloc(kmer + 1, sizeof(char)); check_null_ptr(num_array, NULL); check_null_ptr(ret, NULL); @@ -211,19 +199,74 @@ size_t strnstrip(char *s, int c, size_t len) { } -unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment) { +static void inc(kmer_map *map, size_t index) { + (*map)[index]++; +} + +static void inc(unsigned long long *counts, size_t index) { + counts[index]++; +} + +template +void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, array_type *counts) { + long long position; + long long i; + + // loop through our seq to process each k-mer + for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { + unsigned long long mer = 0; + unsigned long long multiply = 1; + + // for each char in the k-mer check if it is an error char + for(i = position + kmer - 1; i >= position; i--){ + if(seq[i] == 5) { + position = i; + goto next; + } + + // multiply this char in the mer by the multiply + // and bitshift the multiply for the next round + mer += seq[i] * multiply; + multiply = multiply << 2; + } + // bump up the mer value in the counts array + inc(counts, mer); + + // skip count if error + next: ; + } +} + + +void create_array(kmer_map **counts, int kmer) { + *counts = new kmer_map(); + (*counts)->reserve(pow_four(kmer) / 2 ); +} + +void create_array(unsigned long long **counts, const unsigned int kmer) { + // width is 4^kmer + const unsigned long long width = pow_four(kmer); + *counts = (unsigned long long *) calloc(width+1, sizeof(unsigned long long)); + if(counts == NULL) { + exit(EXIT_FAILURE); + } + else { + } +} + +template +array_type * get_kmer_counts_from_file(array_type *counts, FILE *fh, const unsigned int kmer, const bool count_compliment) { + char *line = NULL; size_t len = 0; ssize_t read; - // width is 4^kmer - // there's a sneaky bitshift to avoid pow dependency - const unsigned long width = pow_four(kmer); - - // malloc our return array - unsigned long long * counts = calloc((width+ 1), sizeof(unsigned long long)); - check_null_ptr(counts, NULL); + create_array(&counts, kmer); + if(counts == NULL) { + puts("Counts is null"); + exit(EXIT_FAILURE); + } while ((read = getdelim(&line, &len, '>', fh)) != -1) { size_t k; @@ -260,18 +303,33 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer } } - - free(line); + free(line); fclose(fh); return counts; } -unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment) { +kmer_map * get_kmer_counts_from_filename(kmer_map *counts, const char *fn, const unsigned int kmer, const bool count_compliment) { FILE *fh = fopen(fn, "r"); check_null_ptr(fh, fn); - return get_kmer_counts_from_file(fh, kmer, count_compliment); + // Why does this work this way!!? + kmer_map *counts2 = get_kmer_counts_from_file(counts, fh, kmer, count_compliment); + if(counts == NULL) { + puts("NULL IN FILENAME"); + } + + return counts2; + +} + + +unsigned long long * get_kmer_counts_from_filename(unsigned long long *counts, const char *fn, const unsigned int kmer, const bool count_compliment) { + FILE *fh = fopen(fn, "r"); + check_null_ptr(fh, fn); + + unsigned long long *counts2 = get_kmer_counts_from_file(counts, fh, kmer, count_compliment); + return counts2; } unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment) { @@ -285,12 +343,12 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign const unsigned long width = pow_four(kmer); // malloc our return array - unsigned long long * counts = calloc((width+ 1), sizeof(unsigned long long)); + unsigned long long * counts = (unsigned long long *) calloc((width+ 1), sizeof(unsigned long long)); check_null_ptr(counts, NULL); size_t cpy_size = kmer - 1; - char *end_of_previous = malloc(sizeof(char) * cpy_size); + char *end_of_previous = (char*) malloc(sizeof(char) * cpy_size); check_null_ptr(end_of_previous, NULL); memset(end_of_previous, 5, cpy_size); @@ -298,12 +356,11 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign if(line[0] == '>') continue; - char *seq; size_t j; size_t k; size_t seq_length; - seq = malloc(read + cpy_size); + char *seq = (char *) malloc(read + cpy_size); memcpy(seq, end_of_previous, cpy_size); memcpy(seq + cpy_size, line, read); @@ -329,11 +386,12 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign count_sequence(seq, seq_length, kmer, counts); } - free(seq); + + free(seq); } free(end_of_previous); - free(line); + free(line); fclose(fh); return counts; @@ -345,3 +403,87 @@ unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, co return get_continuous_kmer_counts_from_file(fh, kmer, count_compliment); } + +void print_kmer(unsigned long long *counts, bool label, bool nonzero, unsigned int kmer) { + size_t width = pow_four(kmer); + size_t i = 0; + + // If nonzero is set, only print non zeros + if(nonzero) { + // if labels is set, print out our labels + if(label) { + for(i = 0; i < width; i++) + if(counts[i] != 0) { + char *kmer_str = index_to_kmer(i, kmer); + fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]); + free(kmer_str); + } + + } + else { + for(i = 0; i < width; i++) + if(counts[i] != 0) + fprintf(stdout, "%zu\t%llu\n", i, counts[i]); + + } + } + // If we aren't printing nonzeros print everything + else { + if(label) { + for(i = 0; i < width; i++) { + char *kmer_str = index_to_kmer(i, kmer); + fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]); + free(kmer_str); + } + } + else { + for(i = 0; i < width; i=i+4) { + fprintf(stdout, "%llu\n%llu\n%llu\n%llu\n", counts[i], counts[i+1], counts[i+2], counts[i+3]); + } + } + + } +} + +void print_kmer(kmer_map *counts, bool label, bool nonzero, unsigned int kmer) { + size_t width = pow_four(kmer); + size_t i = 0; + + // If nonzero is set, only print non zeros + if(nonzero) { + // if labels is set, print out our labels + if(label) { + for(auto it = counts->begin(); it != counts->end(); it++ ) { + char *kmer_str = index_to_kmer(it->first, kmer); + fprintf(stdout, "%s\t%llu\n", kmer_str, it->second); + free(kmer_str); + } + } + else { + for(auto it = counts->begin(); it != counts->end(); it++ ) { + fprintf(stdout, "%zu\t%llu\n", it->first, it->second); + } + } + } + // If we aren't printing nonzeros print everything + else { + if(label) { + for(i = 0; i < width; i++) { + char *kmer_str = index_to_kmer(i, kmer); + if(counts->count(i) != 0) + fprintf(stdout, "%s\t%llu\n", kmer_str, counts->at(i)); + else + fprintf(stdout, "%s\t0\n", kmer_str); + free(kmer_str); + } + } + else { + for(i = 0; i < width; i++) { + if(counts->count(i) != 0) + fprintf(stdout, "%llu\n", counts->at(i)); + else + fprintf(stdout, "0\n"); + } + } + } +} -- cgit v1.2.3