From 2c038ba630c14c7030186c64e9eb92761ddcba74 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Thu, 6 Mar 2014 17:05:40 -0500 Subject: add kmer_continuous_count this tool will count continuously, instead of line by line. The way that this works out is something like this: test.fa > header 1 AAAAATTTTT > header 2 GGGGGAAAAA counting 6 mers, the program will count TTTGGG, TTGGGG, TGGGGG, like there was no header seperating them. This can be useful for certain tyeps of processing, like when the sequences are continuous from a genome. initial commit --- Makefile | 6 +- kmer_continuous_count.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++++ kmer_utils.c | 102 +++++++++++++++++++++++++++++++ kmer_utils.h | 4 ++ 4 files changed, 268 insertions(+), 2 deletions(-) create mode 100644 kmer_continuous_count.c diff --git a/Makefile b/Makefile index c929cfd..dc47796 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CFLAGS = -O3 -s -mtune=native -Wall -DVERSION=$(VERSION) -Wextra DESTDIR = /usr/local/ -all: libkmer.o libkmer.so kmer_total_count kmer_counts_per_sequence +all: libkmer.o libkmer.so kmer_total_count kmer_counts_per_sequence kmer_continuous_count libkmer.o: kmer_utils.c $(CC) -c kmer_utils.c -o libkmer.o $(CFLAGS) -fPIC @@ -14,9 +14,11 @@ kmer_total_count: libkmer.o kmer_total_count.c kmer_utils.h $(CC) libkmer.o kmer_total_count.c -o kmer_total_count $(CLIBS) $(CFLAGS) kmer_counts_per_sequence: libkmer.o kmer_counts_per_sequence.c kmer_utils.h $(CC) libkmer.o kmer_counts_per_sequence.c -o kmer_counts_per_sequence $(CLIBS) $(CFLAGS) +kmer_continuous_count: libkmer.o kmer_continuous_count.c kmer_utils.h + $(CC) libkmer.o kmer_continuous_count.c -o kmer_continuous_count $(CLIBS) $(CFLAGS) clean: - rm -vf kmer_total_count kmer_counts_per_sequence libkmer.so libkmer.o + rm -vf kmer_total_count kmer_counts_per_sequence kmer_continuous_count libkmer.so libkmer.o debug: CFLAGS = -ggdb -Wall -Wextra -DVERSION=$(VERSION)\"-debug\" debug: all diff --git a/kmer_continuous_count.c b/kmer_continuous_count.c new file mode 100644 index 0000000..5ace9a5 --- /dev/null +++ b/kmer_continuous_count.c @@ -0,0 +1,158 @@ + +// Copyright 2013 Calvin Morrison +#include +#include +#include +#include +#include +#include + +#include "kmer_utils.h" + + +void help() { + printf("usage: kmer_total_count -i input_file -k kmer [-n] [-l] ...\n\n" + "count mers in size k from a fasta file, but do so continuously\n" + "\n" + " --input -i input fasta file to count\n" + " --kmer -k size of mers to count\n" + " --nonzero -n only print non-zero values\n" + " --label -l print mer along with value\n" + "\n" + "Report all bugs to mutantturkey@gmail.com\n" + "\n" + "Copyright 2014 Calvin Morrison, Drexel University.\n" + "\n" + "If you are using any dna-utils tool for a publication\n" + "please cite your usage:\n\n" + "dna-utils. Drexel University, Philadelphia USA, 2014;\n" + "software available at www.github.com/EESI/dna-utils/\n"); +} + + +int main(int argc, char **argv) { + + char *fn = NULL; + FILE *fh = NULL; + + unsigned int kmer = 0; + + bool nonzero = false; + bool label = false; + bool kmer_set = false; + + unsigned long long width = 0; + + unsigned long long i = 0; + + static struct option long_options[] = { + {"input", required_argument, 0, 'i'}, + {"kmer", required_argument, 0, 'k'}, + {"nonzero", no_argument, 0, 'n'}, + {"label", no_argument, 0, 'l'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + + while (1) { + + int option_index = 0; + int c = 0; + + c = getopt_long (argc, argv, "i:k:nlvh", long_options, &option_index); + + if (c == -1) + break; + + switch (c) { + case 'i': + fn = optarg; + break; + case 'k': + kmer = atoi(optarg); + kmer_set = true; + break; + case 'n': + nonzero = true; + break; + case 'l': + label = true; + break; + case 'h': + help(); + exit(EXIT_SUCCESS); + break; + case 'v': + printf("dna-utils version " VERSION "\n"); + exit(EXIT_SUCCESS); + break; + default: + break; + } + } + if(argc == 1) { + help(); + exit(EXIT_FAILURE); + } + if(fn == NULL) { + fprintf(stderr, "no input file specified with -i, reading from stdin\n"); + fh = stdin; + } + else { + fh = fopen(fn, "r"); + if(fh == NULL) { + fprintf(stderr, "Could not open %s - %s\n", fn, strerror(errno)); + exit(EXIT_FAILURE); + } + } + if(!kmer_set) { + fprintf(stderr, "Error: kmer (-k) must be supplied\n"); + exit(EXIT_FAILURE); + } + if(kmer == 0) { + fprintf(stderr, "Error: invalid kmer - '%d'.\n", kmer); + exit(EXIT_FAILURE); + } + + width = pow_four(kmer); + + unsigned long long *counts = get_continuous_kmer_counts_from_file(fh, kmer); + + // If nonzero is set, only print non zeros + if(nonzero) { + // if labels is set, print out our labels + if(label) { + for(i = 0; i < width; i++) + if(counts[i] != 0) { + char *kmer_str = index_to_kmer(i, kmer); + fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]); + free(kmer_str); + } + + } + else { + for(i = 0; i < width; i++) + if(counts[i] != 0) + fprintf(stdout, "%llu\t%llu\n", i, counts[i]); + + } + } + // If we aren't printing nonzeros print everything + else { + if(label) { + for(i = 0; i < width; i++) { + char *kmer_str = index_to_kmer(i, kmer); + fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]); + free(kmer_str); + } + } + else { + for(i = 0; i < width; i=i+4) { + fprintf(stdout, "%llu\n%llu\n%llu\n%llu\n", counts[i], counts[i+1], counts[i+2], counts[i+3]); + } + } + } + + free(counts); + return EXIT_SUCCESS; +} diff --git a/kmer_utils.c b/kmer_utils.c index 81b1e91..79c9180 100644 --- a/kmer_utils.c +++ b/kmer_utils.c @@ -226,6 +226,7 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer } } + free(line); fclose(fh); @@ -242,3 +243,104 @@ unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigne return get_kmer_counts_from_file(fh, kmer); } + +unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer) { + + char *line = NULL; + size_t len = 0; + ssize_t read; + + // width is 4^kmer + // there's a sneaky bitshift to avoid pow dependency + const unsigned long width = pow_four(kmer); + + // malloc our return array + unsigned long long * counts = calloc((width+ 1), sizeof(unsigned long long)); + if(counts == NULL) { + fprintf(stderr, "%s\n", strerror(errno)); + exit(EXIT_FAILURE); + } + + size_t cpy_size = kmer - 1; + + char *end_of_previous = malloc(sizeof(char) * cpy_size); + memset(end_of_previous, 5, cpy_size); + + if(end_of_previous == NULL) { + fprintf(stderr, "%s\n", strerror(errno)); + exit(EXIT_FAILURE); + } + + while ((read = getline(&line, &len, fh)) != -1) { + if(line[0] == '>') + continue; + + char *seq; + size_t j; + size_t k; + long long position; + size_t seq_length; + + seq = malloc(read + cpy_size); + + memcpy(seq, end_of_previous, cpy_size); + memcpy(seq + cpy_size, line, read); + + seq_length = read + cpy_size; + + // relace A, C, G and T with 0, 1, 2, 3 respectively + // everything else is 5 + for(k = cpy_size; k < seq_length; k++) + seq[k] = alpha[(int)seq[k]]; + + + // loop through our seq to process each k-mer + for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { + unsigned long long mer = 0; + unsigned long long multiply = 1; + long long i; + + // for each char in the k-mer check if it is an error char + for(i = position + kmer - 1; i >= position; i--){ + if(seq[i] == 5) { + mer = width; + position = i; + goto next; + } + + // multiply this char in the mer by the multiply + // and bitshift the multiply for the next round + mer += seq[i] * multiply; + multiply = multiply << 2; + } + // use this point to get mer of our loop + next: + // bump up the mer value in the counts array + counts[mer]++; + } + + for(j = 0, k = seq_length - (cpy_size + 1); k < seq_length; k++, j++) { + end_of_previous[j] = seq[k]; + } + + free(seq); + + } + + + free(end_of_previous); + free(line); + fclose(fh); + + return counts; +} + +unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer) { + FILE *fh = fopen(fn, "r"); + if(fh == NULL) { + fprintf(stderr, "Could not open %s - %s\n", fn, strerror(errno)); + return NULL; + } + + return get_continuous_kmer_counts_from_file(fh, kmer); +} diff --git a/kmer_utils.h b/kmer_utils.h index ceb28eb..ae48136 100644 --- a/kmer_utils.h +++ b/kmer_utils.h @@ -12,5 +12,9 @@ const unsigned char alpha[256]; // file loading functions unsigned long long load_specific_mers_from_file(const char *fn, unsigned int kmer, size_t width, size_t *arr); + unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer); unsigned long long * get_kmer_counts_from_file(FILE *fh, const int kmer); + +unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer); +unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer); -- cgit v1.2.3