aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile2
-rw-r--r--kmer_total_count.c27
-rw-r--r--kmer_utils.c139
-rw-r--r--kmer_utils.h10
-rw-r--r--sparse.c26
5 files changed, 158 insertions, 46 deletions
diff --git a/Makefile b/Makefile
index e51a508..5a1f34d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
VERSION=\"0.0.2\"
CC = gcc
-CFLAGS = -O3 -s -mtune=native -Wall -DVERSION=$(VERSION) -Wextra -lcsparse
+CFLAGS = -O3 -s -mtune=native -Wall -DVERSION=$(VERSION) -Wextra
DESTDIR = /usr/local/
diff --git a/kmer_total_count.c b/kmer_total_count.c
index 87f1b69..12f9ab4 100644
--- a/kmer_total_count.c
+++ b/kmer_total_count.c
@@ -18,6 +18,7 @@ void help() {
" --kmer -k size of mers to count\n"
" --nonzero -n only print non-zero values\n"
" --label -l print mer along with value\n"
+ " --sparse -s force sparse table for any mer\n"
"\n"
"Report all bugs to mutantturkey@gmail.com\n"
"\n"
@@ -40,16 +41,14 @@ int main(int argc, char **argv) {
bool nonzero = false;
bool label = false;
bool kmer_set = false;
-
- unsigned long long width = 0;
-
- unsigned long long i = 0;
+ bool force_sparse = false;
static struct option long_options[] = {
{"input", required_argument, 0, 'i'},
{"kmer", required_argument, 0, 'k'},
{"nonzero", no_argument, 0, 'n'},
{"label", no_argument, 0, 'l'},
+ {"sparse", no_argument, 0, 's'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}
};
@@ -59,7 +58,7 @@ int main(int argc, char **argv) {
int option_index = 0;
int c = 0;
- c = getopt_long (argc, argv, "i:k:nlvh", long_options, &option_index);
+ c = getopt_long (argc, argv, "i:k:nslvh", long_options, &option_index);
if (c == -1)
break;
@@ -78,6 +77,9 @@ int main(int argc, char **argv) {
case 'l':
label = true;
break;
+ case 's':
+ force_sparse = true;
+ break;
case 'h':
help();
exit(EXIT_SUCCESS);
@@ -114,17 +116,14 @@ int main(int argc, char **argv) {
exit(EXIT_FAILURE);
}
- width = pow_four(kmer);
-
- if(kmer > 12) {
- node *root = get_kmer_counts_from_file(fh, kmer);
+ if(kmer > 12 || force_sparse) {
+ node *root = get_sparse_kmer_counts_from_file(fh, kmer);
print_sparse(root, label, nonzero, kmer);
}
- //else {
-// unsigned long long *counts = get_kmer_counts_from_file(fh, kmer);
-// print(countst, label, nonzero);
- //free(counts);
-// }
+ else {
+ unsigned long long *counts = get_dense_kmer_counts_from_file(fh, kmer);
+ print_dense(counts, label, nonzero, kmer);
+ }
return EXIT_SUCCESS;
}
diff --git a/kmer_utils.c b/kmer_utils.c
index f2057ee..b456f2f 100644
--- a/kmer_utils.c
+++ b/kmer_utils.c
@@ -157,7 +157,7 @@ size_t strnstrip(char *s, int c, size_t len) {
}
-node * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer) {
+node * get_sparse_kmer_counts_from_file(FILE *fh, const unsigned int kmer) {
char *line = NULL;
size_t len = 0;
@@ -236,6 +236,75 @@ node * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer) {
}
+unsigned long long * get_dense_kmer_counts_from_file(FILE *fh, const unsigned int kmer) {
+
+ char *line = NULL;
+ size_t len = 0;
+ ssize_t read;
+
+ long long i = 0;
+ long long position = 0;
+
+ // width is 4^kmer
+ const unsigned long long width = pow_four(kmer);
+
+ unsigned long long *counts = malloc(width+1 * sizeof(unsigned long long));
+ while ((read = getdelim(&line, &len, '>', fh)) != -1) {
+ size_t k;
+ char *seq;
+ size_t seq_length;
+
+ // find our first \n, this should be the end of the header
+ seq = strchr(line, '\n');
+ if(seq == NULL)
+ continue;
+
+ // point to one past that.
+ seq = seq + 1;
+
+ // strip out all other newlines to handle multiline sequences
+ strnstrip(seq, '\n', strlen(seq));
+ seq_length = strlen(seq);
+
+ // relace A, C, G and T with 0, 1, 2, 3 respectively
+ // everything else is 5
+ for(k = 0; k < seq_length; k++)
+ seq[k] = alpha[(int)seq[k]];
+
+
+ // loop through our seq to process each k-mer
+ for(position = 0; position < (signed)(seq_length - kmer + 1); position++) {
+ unsigned long long mer = 0;
+ unsigned long long multiply = 1;
+
+ // for each char in the k-mer check if it is an error char
+ for(i = position + kmer - 1; i >= position; i--){
+ if(seq[i] == 5) {
+ mer = width;
+ position = i;
+ goto next;
+ }
+
+ // multiply this char in the mer by the multiply
+ // and bitshift the multiply for the next round
+ mer += seq[i] * multiply;
+ multiply = multiply << 2;
+ }
+ // use this point to get mer of our loop
+ next:
+ {
+ // bump up the mer value in the counts array
+ counts[mer]++;
+ }
+ }
+ }
+
+ //free(line);
+ fclose(fh);
+ return counts;
+
+}
+
unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer) {
FILE *fh = fopen(fn, "r");
if(fh == NULL) {
@@ -243,6 +312,72 @@ unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigne
return 0;
}
- return get_kmer_counts_from_file(fh, kmer);
+ return get_dense_kmer_counts_from_file(fh, kmer);
}
+void print_dense(unsigned long long *counts, bool label, bool nonzero, unsigned int kmer) {
+ size_t width = pow_four(kmer);
+ size_t i = 0;
+
+ // If nonzero is set, only print non zeros
+ if(nonzero) {
+ // if labels is set, print out our labels
+ if(label) {
+ for(i = 0; i < width; i++)
+ if(counts[i] != 0) {
+ char *kmer_str = index_to_kmer(i, kmer);
+ fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]);
+ free(kmer_str);
+ }
+
+ }
+ else {
+ for(i = 0; i < width; i++)
+ if(counts[i] != 0)
+ fprintf(stdout, "%llu\t%llu\n", i, counts[i]);
+
+ }
+ }
+ // If we aren't printing nonzeros print everything
+ else {
+ if(label) {
+ for(i = 0; i < width; i++) {
+ char *kmer_str = index_to_kmer(i, kmer);
+ fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]);
+ free(kmer_str);
+ }
+ }
+ else {
+ for(i = 0; i < width; i=i+4) {
+ fprintf(stdout, "%llu\n%llu\n%llu\n%llu\n", counts[i], counts[i+1], counts[i+2], counts[i+3]);
+ }
+ }
+
+ }
+}
+
+void print_sparse(node *tree, bool label, bool nonzero, unsigned int kmer) {
+
+ if (tree) {
+ print_sparse(tree->left, label, nonzero, kmer);
+ if(label && nonzero) {
+ char *kmer_str = index_to_kmer(tree->index, kmer);
+ if (tree->value != 0)
+ fprintf(stdout, "%s\t%llu\n", kmer_str, tree->value);
+ free(kmer_str);
+ }
+ else if(label) {
+ char *kmer_str = index_to_kmer(tree->index, kmer);
+ fprintf(stdout, "%s\t%llu\n", kmer_str, tree->value);
+ free(kmer_str);
+ }
+ else if(nonzero) {
+ if (tree->value != 0)
+ fprintf(stdout, "%zu\t%llu\n", tree->index, tree->value);
+ }
+ else
+ fprintf(stdout, "%llu\n", tree->value);
+
+ print_sparse(tree->right, label, nonzero, kmer);
+ }
+}
diff --git a/kmer_utils.h b/kmer_utils.h
index bf60dc4..af13227 100644
--- a/kmer_utils.h
+++ b/kmer_utils.h
@@ -11,6 +11,10 @@ unsigned long long pow_four(unsigned long long x);
const unsigned char alpha[256];
// file loading functions
-unsigned long long load_specific_mers_from_file(const char *fn, unsigned int kmer, size_t width, size_t *arr);
-node * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer);
-node * get_kmer_counts_from_file(FILE *fh, const int kmer);
+node * get_sparse_kmer_counts_from_filename(const char *fn, const unsigned int kmer);
+node * get_sparse_kmer_counts_from_file(FILE *fh, const int kmer);
+
+unsigned long long * get_dense_kmer_counts_from_file(FILE *fh, const unsigned int kmer);
+
+
+void print_dense(unsigned long long *counts, bool label, bool nonzero, unsigned int kmer);
diff --git a/sparse.c b/sparse.c
index 9ab6902..30ce2b1 100644
--- a/sparse.c
+++ b/sparse.c
@@ -60,29 +60,3 @@ unsigned long long *lookup(node **tree, size_t index) {
else
return 0;
}
-
-void print_sparse(node *tree, bool label, bool nonzero, unsigned int kmer) {
-
- if (tree) {
- print_sparse(tree->left, label, nonzero, kmer);
- if(label && nonzero) {
- char *kmer_str = index_to_kmer(tree->index, kmer);
- if (tree->value != 0)
- fprintf(stdout, "%s\t%llu\n", kmer_str, tree->value);
- free(kmer_str);
- }
- else if(label) {
- char *kmer_str = index_to_kmer(tree->index, kmer);
- fprintf(stdout, "%s\t%llu\n", kmer_str, tree->value);
- free(kmer_str);
- }
- else if(nonzero) {
- if (tree->value != 0)
- fprintf(stdout, "%zu\t%llu\n", tree->index, tree->value);
- }
- else
- fprintf(stdout, "%llu\n", tree->value);
-
- print_sparse(tree->right, label, nonzero, kmer);
- }
-}