aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-04-09 17:12:09 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2014-04-09 17:12:09 -0400
commitd8578f338104287b4af59cbadb01f0e45843962d (patch)
treea44a9a94c53def94c36029d3f9f3c7f9d34311ff
parentb7c04f4067e3eb51d8542438c4bda6e1a663fff9 (diff)
MERGE sparse trunk into master
-rw-r--r--Makefile33
-rw-r--r--kmer_counts_per_sequence.c44
-rw-r--r--kmer_total_count.c91
-rw-r--r--kmer_utils.c242
4 files changed, 297 insertions, 113 deletions
diff --git a/Makefile b/Makefile
index 4c457fe..1710203 100644
--- a/Makefile
+++ b/Makefile
@@ -1,26 +1,27 @@
-VERSION=\"0.0.4\"
-CC = gcc
-CFLAGS = -O3 -s -mtune=native -Wall -DVERSION=$(VERSION) -Wextra
+VERSION=\"0.0.5\"
+CC = g++
+CFLAGS = -O3 -s -mtune=native -Wall -Wextra -DVERSION=$(VERSION) -std=c++11
DESTDIR = /usr/local/
+all: libkmer.so kmer_total_count kmer_counts_per_sequence
-all: libkmer.o libkmer.so kmer_total_count kmer_counts_per_sequence kmer_continuous_count
-
-libkmer.o: kmer_utils.c
- $(CC) -c kmer_utils.c -o libkmer.o $(CFLAGS) -fPIC
-libkmer.so: libkmer.o
+libkmer.so: kmer_utils.o
$(CC) kmer_utils.c -o libkmer.so $(CFLAGS) -shared -fPIC
-kmer_total_count: libkmer.o kmer_total_count.c kmer_utils.h
- $(CC) libkmer.o kmer_total_count.c -o kmer_total_count $(CLIBS) $(CFLAGS)
-kmer_counts_per_sequence: libkmer.o kmer_counts_per_sequence.c kmer_utils.h
- $(CC) libkmer.o kmer_counts_per_sequence.c -o kmer_counts_per_sequence $(CLIBS) $(CFLAGS)
-kmer_continuous_count: libkmer.o kmer_continuous_count.c kmer_utils.h
- $(CC) libkmer.o kmer_continuous_count.c -o kmer_continuous_count $(CLIBS) $(CFLAGS)
+
+kmer_total_count: kmer_utils.c kmer_total_count.c kmer_utils.h
+ $(CC) kmer_utils.c kmer_total_count.c -o kmer_total_count $(CLIBS) $(CFLAGS)
+
+kmer_counts_per_sequence: kmer_utils.c kmer_counts_per_sequence.c kmer_utils.h
+ $(CC) kmer_utils.c kmer_counts_per_sequence.c -o kmer_counts_per_sequence $(CLIBS) $(CFLAGS)
+
+kmer_continuous_count: kmer_utils.c kmer_continuous_count.c kmer_utils.h
+ $(CC) kmer_utils.c kmer_continuous_count.c -o kmer_continuous_count $(CLIBS) $(CFLAGS)
clean:
- rm -vf kmer_total_count kmer_counts_per_sequence kmer_continuous_count libkmer.so libkmer.o
+ rm -vf kmer_total_count kmer_counts_per_sequence kmer_continuous_count kmer_utils.o libkmer.so
+ rm -vf kmer_total_count kmer_counts_per_sequence kmer_continuous_count libkmer.so kmer_utils.o
-debug: CFLAGS = -ggdb -Wall -Wextra -DVERSION=$(VERSION)\"-debug\"
+debug: CFLAGS = -ggdb -Wall -Wextra -DVERSION=$(VERSION)\"-debug\" -std=c++11
debug: all
install: all
diff --git a/kmer_counts_per_sequence.c b/kmer_counts_per_sequence.c
index 2b2dbef..e9ca812 100644
--- a/kmer_counts_per_sequence.c
+++ b/kmer_counts_per_sequence.c
@@ -30,7 +30,43 @@ void help() {
"dna-utils. Drexel University, Philadelphia USA, 2014;\n"
"software available at www.github.com/EESI/dna-utils/\n");
}
+static void inc(kmer_map *map, size_t index) {
+ (*map)[index]++;
+}
+
+static void inc(unsigned long long *counts, size_t index) {
+ counts[index]++;
+}
+
+template <typename array_type>
+void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, array_type *counts) {
+ long long position;
+ long long i;
+
+ // loop through our seq to process each k-mer
+ for(position = 0; position < (signed)(seq_length - kmer + 1); position++) {
+ unsigned long long mer = 0;
+ unsigned long long multiply = 1;
+
+ // for each char in the k-mer check if it is an error char
+ for(i = position + kmer - 1; i >= position; i--){
+ if(seq[i] == 5) {
+ position = i;
+ goto next;
+ }
+ // multiply this char in the mer by the multiply
+ // and bitshift the multiply for the next round
+ mer += seq[i] * multiply;
+ multiply = multiply << 2;
+ }
+ // bump up the mer value in the counts array
+ inc(counts, mer);
+
+ // skip count if error
+ next: ;
+ }
+}
int main(int argc, char **argv) {
@@ -135,8 +171,10 @@ int main(int argc, char **argv) {
width = pow_four(kmer);
if(specific_mers) {
- desired_indicies = malloc((width) * sizeof(size_t));
- check_null_ptr(desired_indicies, NULL);
+ sparse = false;
+ desired_indicies = (size_t *) malloc((width) * sizeof(size_t));
+ if(desired_indicies == NULL)
+ exit(EXIT_FAILURE);
num_desired_indicies = load_specific_mers_from_file(mer_fn, kmer, width, desired_indicies);
if(num_desired_indicies == 0) {
fprintf(stderr, "Error: no mers loaded from file\n");
@@ -145,7 +183,7 @@ int main(int argc, char **argv) {
}
- unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long));
+ unsigned long long *counts = (unsigned long long *) malloc((width+ 1) * sizeof(unsigned long long));
if(counts == NULL) {
fprintf(stderr, "%s\n", strerror(errno));
exit(EXIT_FAILURE);
diff --git a/kmer_total_count.c b/kmer_total_count.c
index dd29a53..c3ae070 100644
--- a/kmer_total_count.c
+++ b/kmer_total_count.c
@@ -8,16 +8,46 @@
#include "kmer_utils.h"
+template <typename array_type>
+void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, array_type *counts) {
+ long long position;
+ long long i;
+
+ // loop through our seq to process each k-mer
+ for(position = 0; position < (signed)(seq_length - kmer + 1); position++) {
+ unsigned long long mer = 0;
+ unsigned long long multiply = 1;
+
+ // for each char in the k-mer check if it is an error char
+ for(i = position + kmer - 1; i >= position; i--){
+ if(seq[i] == 5) {
+ position = i;
+ goto next;
+ }
+
+ // multiply this char in the mer by the multiply
+ // and bitshift the multiply for the next round
+ mer += seq[i] * multiply;
+ multiply = multiply << 2;
+ }
+ // bump up the mer value in the counts array
+ inc(counts, mer);
+
+ // skip count if error
+ next: ;
+ }
+}
void help() {
printf("usage: kmer_total_count -i input_file -k kmer [-c] [-n] [-l] ...\n\n"
"count mers in size k from a fasta file\n"
"\n"
- " --input -i input fasta file to count\n"
- " --kmer -k size of mers to count\n"
+ " --input -i input fasta file to count\n"
+ " --kmer -k size of mers to count\n"
" --compliment -c count compliment of sequences\n"
- " --nonzero -n only print non-zero values\n"
- " --label -l print mer along with value\n"
+ " --nonzero -n only print non-zero values\n"
+ " --label -l print mer along with value\n"
+ " --sparse -s force sparse table for any mer\n"
"\n"
"Report all bugs to mutantturkey@gmail.com\n"
"\n"
@@ -41,10 +71,7 @@ int main(int argc, char **argv) {
bool label = false;
bool kmer_set = false;
bool count_compliment = false;
-
- unsigned long long width = 0;
-
- unsigned long long i = 0;
+ bool force_sparse = false;
static struct option long_options[] = {
{"input", required_argument, 0, 'i'},
@@ -52,6 +79,7 @@ int main(int argc, char **argv) {
{"compliment", required_argument, 0, 'c'},
{"nonzero", no_argument, 0, 'n'},
{"label", no_argument, 0, 'l'},
+ {"sparse", no_argument, 0, 's'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}
};
@@ -61,7 +89,7 @@ int main(int argc, char **argv) {
int option_index = 0;
int c = 0;
- c = getopt_long (argc, argv, "i:k:cnlvh", long_options, &option_index);
+ c = getopt_long (argc, argv, "i:k:cnslvh", long_options, &option_index);
if (c == -1)
break;
@@ -83,6 +111,9 @@ int main(int argc, char **argv) {
case 'l':
label = true;
break;
+ case 's':
+ force_sparse = true;
+ break;
case 'h':
help();
exit(EXIT_SUCCESS);
@@ -119,45 +150,17 @@ int main(int argc, char **argv) {
exit(EXIT_FAILURE);
}
- width = pow_four(kmer);
+ if(kmer > 12 || force_sparse) {
+ kmer_map *counts = NULL;
+ kmer_map *res = get_kmer_counts_from_file(counts, fh, kmer, count_compliment);
- unsigned long long *counts = get_kmer_counts_from_file(fh, kmer, count_compliment);
-
- // If nonzero is set, only print non zeros
- if(nonzero) {
- // if labels is set, print out our labels
- if(label) {
- for(i = 0; i < width; i++)
- if(counts[i] != 0) {
- char *kmer_str = index_to_kmer(i, kmer);
- fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]);
- free(kmer_str);
- }
-
- }
- else {
- for(i = 0; i < width; i++)
- if(counts[i] != 0)
- fprintf(stdout, "%llu\t%llu\n", i, counts[i]);
-
- }
+ print_kmer(res, label, nonzero, kmer);
}
- // If we aren't printing nonzeros print everything
else {
- if(label) {
- for(i = 0; i < width; i++) {
- char *kmer_str = index_to_kmer(i, kmer);
- fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]);
- free(kmer_str);
- }
- }
- else {
- for(i = 0; i < width; i=i+4) {
- fprintf(stdout, "%llu\n%llu\n%llu\n%llu\n", counts[i], counts[i+1], counts[i+2], counts[i+3]);
- }
- }
+ unsigned long long *counts = NULL;
+ unsigned long long *res = get_kmer_counts_from_file(counts, fh, kmer, count_compliment);
+ print_kmer(res, label, nonzero, kmer);
}
- free(counts);
return EXIT_SUCCESS;
}
diff --git a/kmer_utils.c b/kmer_utils.c
index 7703f0a..1b94b08 100644
--- a/kmer_utils.c
+++ b/kmer_utils.c
@@ -5,6 +5,23 @@
#include <string.h>
#include <stdbool.h>
+#include <unordered_map>
+
+using namespace std;
+
+typedef struct {
+ size_t operator() (const size_t &k) const {
+ return k;
+ }
+} kmer_noHash_hash;
+
+typedef struct {
+ bool operator() (const size_t &x, const size_t &y) const {
+ return x == y;
+ }
+} kmer_eq;
+
+typedef unordered_map<size_t, unsigned long long, kmer_noHash_hash, kmer_eq> kmer_map;
const unsigned char alpha[256] =
{5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
@@ -52,35 +69,6 @@ void reverse_string(char *s, size_t len) {
}
}
-void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, unsigned long long *counts) {
- long long position;
- long long i;
-
- // loop through our seq to process each k-mer
- for(position = 0; position < (signed)(seq_length - kmer + 1); position++) {
- unsigned long long mer = 0;
- unsigned long long multiply = 1;
-
- // for each char in the k-mer check if it is an error char
- for(i = position + kmer - 1; i >= position; i--){
- if(seq[i] == 5) {
- position = i;
- goto next;
- }
-
- // multiply this char in the mer by the multiply
- // and bitshift the multiply for the next round
- mer += seq[i] * multiply;
- multiply = multiply << 2;
- }
- // bump up the mer value in the counts array
- counts[mer]++;
-
- // skip count if error
- next: ;
- }
-}
-
// convert a string of k-mer size base-4 values into a
// base-10 index
unsigned long num_to_index(const char *str, const int kmer, const long error_pos, long long *current_position) {
@@ -103,7 +91,7 @@ unsigned long num_to_index(const char *str, const int kmer, const long error_pos
}
// return the number of loaded elements
-unsigned long long load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) {
+size_t load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) {
FILE *fh;
size_t arr_pos = 0;
char line[64];
@@ -152,8 +140,8 @@ char *index_to_kmer(unsigned long long index, long kmer) {
size_t i = 0;
size_t j = 0;
- char *num_array = calloc(kmer, sizeof(char));
- char *ret = calloc(kmer + 1, sizeof(char));
+ char *num_array = (char *) calloc(kmer, sizeof(char));
+ char *ret = (char *) calloc(kmer + 1, sizeof(char));
check_null_ptr(num_array, NULL);
check_null_ptr(ret, NULL);
@@ -211,19 +199,74 @@ size_t strnstrip(char *s, int c, size_t len) {
}
-unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment) {
+static void inc(kmer_map *map, size_t index) {
+ (*map)[index]++;
+}
+
+static void inc(unsigned long long *counts, size_t index) {
+ counts[index]++;
+}
+
+template <typename array_type>
+void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, array_type *counts) {
+ long long position;
+ long long i;
+
+ // loop through our seq to process each k-mer
+ for(position = 0; position < (signed)(seq_length - kmer + 1); position++) {
+ unsigned long long mer = 0;
+ unsigned long long multiply = 1;
+
+ // for each char in the k-mer check if it is an error char
+ for(i = position + kmer - 1; i >= position; i--){
+ if(seq[i] == 5) {
+ position = i;
+ goto next;
+ }
+
+ // multiply this char in the mer by the multiply
+ // and bitshift the multiply for the next round
+ mer += seq[i] * multiply;
+ multiply = multiply << 2;
+ }
+ // bump up the mer value in the counts array
+ inc(counts, mer);
+
+ // skip count if error
+ next: ;
+ }
+}
+
+
+void create_array(kmer_map **counts, int kmer) {
+ *counts = new kmer_map();
+ (*counts)->reserve(pow_four(kmer) / 2 );
+}
+
+void create_array(unsigned long long **counts, const unsigned int kmer) {
+ // width is 4^kmer
+ const unsigned long long width = pow_four(kmer);
+ *counts = (unsigned long long *) calloc(width+1, sizeof(unsigned long long));
+ if(counts == NULL) {
+ exit(EXIT_FAILURE);
+ }
+ else {
+ }
+}
+
+template <typename array_type>
+array_type * get_kmer_counts_from_file(array_type *counts, FILE *fh, const unsigned int kmer, const bool count_compliment) {
+
char *line = NULL;
size_t len = 0;
ssize_t read;
- // width is 4^kmer
- // there's a sneaky bitshift to avoid pow dependency
- const unsigned long width = pow_four(kmer);
-
- // malloc our return array
- unsigned long long * counts = calloc((width+ 1), sizeof(unsigned long long));
- check_null_ptr(counts, NULL);
+ create_array(&counts, kmer);
+ if(counts == NULL) {
+ puts("Counts is null");
+ exit(EXIT_FAILURE);
+ }
while ((read = getdelim(&line, &len, '>', fh)) != -1) {
size_t k;
@@ -260,18 +303,33 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer
}
}
-
- free(line);
+ free(line);
fclose(fh);
return counts;
}
-unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment) {
+kmer_map * get_kmer_counts_from_filename(kmer_map *counts, const char *fn, const unsigned int kmer, const bool count_compliment) {
FILE *fh = fopen(fn, "r");
check_null_ptr(fh, fn);
- return get_kmer_counts_from_file(fh, kmer, count_compliment);
+ // Why does this work this way!!?
+ kmer_map *counts2 = get_kmer_counts_from_file(counts, fh, kmer, count_compliment);
+ if(counts == NULL) {
+ puts("NULL IN FILENAME");
+ }
+
+ return counts2;
+
+}
+
+
+unsigned long long * get_kmer_counts_from_filename(unsigned long long *counts, const char *fn, const unsigned int kmer, const bool count_compliment) {
+ FILE *fh = fopen(fn, "r");
+ check_null_ptr(fh, fn);
+
+ unsigned long long *counts2 = get_kmer_counts_from_file(counts, fh, kmer, count_compliment);
+ return counts2;
}
unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment) {
@@ -285,12 +343,12 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign
const unsigned long width = pow_four(kmer);
// malloc our return array
- unsigned long long * counts = calloc((width+ 1), sizeof(unsigned long long));
+ unsigned long long * counts = (unsigned long long *) calloc((width+ 1), sizeof(unsigned long long));
check_null_ptr(counts, NULL);
size_t cpy_size = kmer - 1;
- char *end_of_previous = malloc(sizeof(char) * cpy_size);
+ char *end_of_previous = (char*) malloc(sizeof(char) * cpy_size);
check_null_ptr(end_of_previous, NULL);
memset(end_of_previous, 5, cpy_size);
@@ -298,12 +356,11 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign
if(line[0] == '>')
continue;
- char *seq;
size_t j;
size_t k;
size_t seq_length;
- seq = malloc(read + cpy_size);
+ char *seq = (char *) malloc(read + cpy_size);
memcpy(seq, end_of_previous, cpy_size);
memcpy(seq + cpy_size, line, read);
@@ -329,11 +386,12 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign
count_sequence(seq, seq_length, kmer, counts);
}
- free(seq);
+
+ free(seq);
}
free(end_of_previous);
- free(line);
+ free(line);
fclose(fh);
return counts;
@@ -345,3 +403,87 @@ unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, co
return get_continuous_kmer_counts_from_file(fh, kmer, count_compliment);
}
+
+void print_kmer(unsigned long long *counts, bool label, bool nonzero, unsigned int kmer) {
+ size_t width = pow_four(kmer);
+ size_t i = 0;
+
+ // If nonzero is set, only print non zeros
+ if(nonzero) {
+ // if labels is set, print out our labels
+ if(label) {
+ for(i = 0; i < width; i++)
+ if(counts[i] != 0) {
+ char *kmer_str = index_to_kmer(i, kmer);
+ fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]);
+ free(kmer_str);
+ }
+
+ }
+ else {
+ for(i = 0; i < width; i++)
+ if(counts[i] != 0)
+ fprintf(stdout, "%zu\t%llu\n", i, counts[i]);
+
+ }
+ }
+ // If we aren't printing nonzeros print everything
+ else {
+ if(label) {
+ for(i = 0; i < width; i++) {
+ char *kmer_str = index_to_kmer(i, kmer);
+ fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]);
+ free(kmer_str);
+ }
+ }
+ else {
+ for(i = 0; i < width; i=i+4) {
+ fprintf(stdout, "%llu\n%llu\n%llu\n%llu\n", counts[i], counts[i+1], counts[i+2], counts[i+3]);
+ }
+ }
+
+ }
+}
+
+void print_kmer(kmer_map *counts, bool label, bool nonzero, unsigned int kmer) {
+ size_t width = pow_four(kmer);
+ size_t i = 0;
+
+ // If nonzero is set, only print non zeros
+ if(nonzero) {
+ // if labels is set, print out our labels
+ if(label) {
+ for(auto it = counts->begin(); it != counts->end(); it++ ) {
+ char *kmer_str = index_to_kmer(it->first, kmer);
+ fprintf(stdout, "%s\t%llu\n", kmer_str, it->second);
+ free(kmer_str);
+ }
+ }
+ else {
+ for(auto it = counts->begin(); it != counts->end(); it++ ) {
+ fprintf(stdout, "%zu\t%llu\n", it->first, it->second);
+ }
+ }
+ }
+ // If we aren't printing nonzeros print everything
+ else {
+ if(label) {
+ for(i = 0; i < width; i++) {
+ char *kmer_str = index_to_kmer(i, kmer);
+ if(counts->count(i) != 0)
+ fprintf(stdout, "%s\t%llu\n", kmer_str, counts->at(i));
+ else
+ fprintf(stdout, "%s\t0\n", kmer_str);
+ free(kmer_str);
+ }
+ }
+ else {
+ for(i = 0; i < width; i++) {
+ if(counts->count(i) != 0)
+ fprintf(stdout, "%llu\n", counts->at(i));
+ else
+ fprintf(stdout, "0\n");
+ }
+ }
+ }
+}