aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-03-06 17:05:40 -0500
committerCalvin Morrison <mutantturkey@gmail.com>2014-03-06 17:05:40 -0500
commit2c038ba630c14c7030186c64e9eb92761ddcba74 (patch)
treef4705db1e2603bdc831254eee60800eb10448fcf
parent5d7e67a846ec104da2d7bdb988672fbd02ddda28 (diff)
add kmer_continuous_count
this tool will count continuously, instead of line by line. The way that this works out is something like this: test.fa > header 1 AAAAATTTTT > header 2 GGGGGAAAAA counting 6 mers, the program will count TTTGGG, TTGGGG, TGGGGG, like there was no header seperating them. This can be useful for certain tyeps of processing, like when the sequences are continuous from a genome. initial commit
-rw-r--r--Makefile6
-rw-r--r--kmer_continuous_count.c158
-rw-r--r--kmer_utils.c102
-rw-r--r--kmer_utils.h4
4 files changed, 268 insertions, 2 deletions
diff --git a/Makefile b/Makefile
index c929cfd..dc47796 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ CFLAGS = -O3 -s -mtune=native -Wall -DVERSION=$(VERSION) -Wextra
DESTDIR = /usr/local/
-all: libkmer.o libkmer.so kmer_total_count kmer_counts_per_sequence
+all: libkmer.o libkmer.so kmer_total_count kmer_counts_per_sequence kmer_continuous_count
libkmer.o: kmer_utils.c
$(CC) -c kmer_utils.c -o libkmer.o $(CFLAGS) -fPIC
@@ -14,9 +14,11 @@ kmer_total_count: libkmer.o kmer_total_count.c kmer_utils.h
$(CC) libkmer.o kmer_total_count.c -o kmer_total_count $(CLIBS) $(CFLAGS)
kmer_counts_per_sequence: libkmer.o kmer_counts_per_sequence.c kmer_utils.h
$(CC) libkmer.o kmer_counts_per_sequence.c -o kmer_counts_per_sequence $(CLIBS) $(CFLAGS)
+kmer_continuous_count: libkmer.o kmer_continuous_count.c kmer_utils.h
+ $(CC) libkmer.o kmer_continuous_count.c -o kmer_continuous_count $(CLIBS) $(CFLAGS)
clean:
- rm -vf kmer_total_count kmer_counts_per_sequence libkmer.so libkmer.o
+ rm -vf kmer_total_count kmer_counts_per_sequence kmer_continuous_count libkmer.so libkmer.o
debug: CFLAGS = -ggdb -Wall -Wextra -DVERSION=$(VERSION)\"-debug\"
debug: all
diff --git a/kmer_continuous_count.c b/kmer_continuous_count.c
new file mode 100644
index 0000000..5ace9a5
--- /dev/null
+++ b/kmer_continuous_count.c
@@ -0,0 +1,158 @@
+
+// Copyright 2013 Calvin Morrison
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+
+#include "kmer_utils.h"
+
+
+void help() {
+ printf("usage: kmer_total_count -i input_file -k kmer [-n] [-l] ...\n\n"
+ "count mers in size k from a fasta file, but do so continuously\n"
+ "\n"
+ " --input -i input fasta file to count\n"
+ " --kmer -k size of mers to count\n"
+ " --nonzero -n only print non-zero values\n"
+ " --label -l print mer along with value\n"
+ "\n"
+ "Report all bugs to mutantturkey@gmail.com\n"
+ "\n"
+ "Copyright 2014 Calvin Morrison, Drexel University.\n"
+ "\n"
+ "If you are using any dna-utils tool for a publication\n"
+ "please cite your usage:\n\n"
+ "dna-utils. Drexel University, Philadelphia USA, 2014;\n"
+ "software available at www.github.com/EESI/dna-utils/\n");
+}
+
+
+int main(int argc, char **argv) {
+
+ char *fn = NULL;
+ FILE *fh = NULL;
+
+ unsigned int kmer = 0;
+
+ bool nonzero = false;
+ bool label = false;
+ bool kmer_set = false;
+
+ unsigned long long width = 0;
+
+ unsigned long long i = 0;
+
+ static struct option long_options[] = {
+ {"input", required_argument, 0, 'i'},
+ {"kmer", required_argument, 0, 'k'},
+ {"nonzero", no_argument, 0, 'n'},
+ {"label", no_argument, 0, 'l'},
+ {"help", no_argument, 0, 'h'},
+ {0, 0, 0, 0}
+ };
+
+ while (1) {
+
+ int option_index = 0;
+ int c = 0;
+
+ c = getopt_long (argc, argv, "i:k:nlvh", long_options, &option_index);
+
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'i':
+ fn = optarg;
+ break;
+ case 'k':
+ kmer = atoi(optarg);
+ kmer_set = true;
+ break;
+ case 'n':
+ nonzero = true;
+ break;
+ case 'l':
+ label = true;
+ break;
+ case 'h':
+ help();
+ exit(EXIT_SUCCESS);
+ break;
+ case 'v':
+ printf("dna-utils version " VERSION "\n");
+ exit(EXIT_SUCCESS);
+ break;
+ default:
+ break;
+ }
+ }
+ if(argc == 1) {
+ help();
+ exit(EXIT_FAILURE);
+ }
+ if(fn == NULL) {
+ fprintf(stderr, "no input file specified with -i, reading from stdin\n");
+ fh = stdin;
+ }
+ else {
+ fh = fopen(fn, "r");
+ if(fh == NULL) {
+ fprintf(stderr, "Could not open %s - %s\n", fn, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ }
+ if(!kmer_set) {
+ fprintf(stderr, "Error: kmer (-k) must be supplied\n");
+ exit(EXIT_FAILURE);
+ }
+ if(kmer == 0) {
+ fprintf(stderr, "Error: invalid kmer - '%d'.\n", kmer);
+ exit(EXIT_FAILURE);
+ }
+
+ width = pow_four(kmer);
+
+ unsigned long long *counts = get_continuous_kmer_counts_from_file(fh, kmer);
+
+ // If nonzero is set, only print non zeros
+ if(nonzero) {
+ // if labels is set, print out our labels
+ if(label) {
+ for(i = 0; i < width; i++)
+ if(counts[i] != 0) {
+ char *kmer_str = index_to_kmer(i, kmer);
+ fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]);
+ free(kmer_str);
+ }
+
+ }
+ else {
+ for(i = 0; i < width; i++)
+ if(counts[i] != 0)
+ fprintf(stdout, "%llu\t%llu\n", i, counts[i]);
+
+ }
+ }
+ // If we aren't printing nonzeros print everything
+ else {
+ if(label) {
+ for(i = 0; i < width; i++) {
+ char *kmer_str = index_to_kmer(i, kmer);
+ fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]);
+ free(kmer_str);
+ }
+ }
+ else {
+ for(i = 0; i < width; i=i+4) {
+ fprintf(stdout, "%llu\n%llu\n%llu\n%llu\n", counts[i], counts[i+1], counts[i+2], counts[i+3]);
+ }
+ }
+ }
+
+ free(counts);
+ return EXIT_SUCCESS;
+}
diff --git a/kmer_utils.c b/kmer_utils.c
index 81b1e91..79c9180 100644
--- a/kmer_utils.c
+++ b/kmer_utils.c
@@ -226,6 +226,7 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer
}
}
+
free(line);
fclose(fh);
@@ -242,3 +243,104 @@ unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigne
return get_kmer_counts_from_file(fh, kmer);
}
+
+unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer) {
+
+ char *line = NULL;
+ size_t len = 0;
+ ssize_t read;
+
+ // width is 4^kmer
+ // there's a sneaky bitshift to avoid pow dependency
+ const unsigned long width = pow_four(kmer);
+
+ // malloc our return array
+ unsigned long long * counts = calloc((width+ 1), sizeof(unsigned long long));
+ if(counts == NULL) {
+ fprintf(stderr, "%s\n", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ size_t cpy_size = kmer - 1;
+
+ char *end_of_previous = malloc(sizeof(char) * cpy_size);
+ memset(end_of_previous, 5, cpy_size);
+
+ if(end_of_previous == NULL) {
+ fprintf(stderr, "%s\n", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ while ((read = getline(&line, &len, fh)) != -1) {
+ if(line[0] == '>')
+ continue;
+
+ char *seq;
+ size_t j;
+ size_t k;
+ long long position;
+ size_t seq_length;
+
+ seq = malloc(read + cpy_size);
+
+ memcpy(seq, end_of_previous, cpy_size);
+ memcpy(seq + cpy_size, line, read);
+
+ seq_length = read + cpy_size;
+
+ // relace A, C, G and T with 0, 1, 2, 3 respectively
+ // everything else is 5
+ for(k = cpy_size; k < seq_length; k++)
+ seq[k] = alpha[(int)seq[k]];
+
+
+ // loop through our seq to process each k-mer
+ for(position = 0; position < (signed)(seq_length - kmer + 1); position++) {
+ unsigned long long mer = 0;
+ unsigned long long multiply = 1;
+ long long i;
+
+ // for each char in the k-mer check if it is an error char
+ for(i = position + kmer - 1; i >= position; i--){
+ if(seq[i] == 5) {
+ mer = width;
+ position = i;
+ goto next;
+ }
+
+ // multiply this char in the mer by the multiply
+ // and bitshift the multiply for the next round
+ mer += seq[i] * multiply;
+ multiply = multiply << 2;
+ }
+ // use this point to get mer of our loop
+ next:
+ // bump up the mer value in the counts array
+ counts[mer]++;
+ }
+
+ for(j = 0, k = seq_length - (cpy_size + 1); k < seq_length; k++, j++) {
+ end_of_previous[j] = seq[k];
+ }
+
+ free(seq);
+
+ }
+
+
+ free(end_of_previous);
+ free(line);
+ fclose(fh);
+
+ return counts;
+}
+
+unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer) {
+ FILE *fh = fopen(fn, "r");
+ if(fh == NULL) {
+ fprintf(stderr, "Could not open %s - %s\n", fn, strerror(errno));
+ return NULL;
+ }
+
+ return get_continuous_kmer_counts_from_file(fh, kmer);
+}
diff --git a/kmer_utils.h b/kmer_utils.h
index ceb28eb..ae48136 100644
--- a/kmer_utils.h
+++ b/kmer_utils.h
@@ -12,5 +12,9 @@ const unsigned char alpha[256];
// file loading functions
unsigned long long load_specific_mers_from_file(const char *fn, unsigned int kmer, size_t width, size_t *arr);
+
unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer);
unsigned long long * get_kmer_counts_from_file(FILE *fh, const int kmer);
+
+unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer);
+unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer);