diff options
-rw-r--r-- | Makefile | 21 | ||||
-rw-r--r-- | kmer_locations.c | 253 | ||||
-rw-r--r-- | kmer_utils.h | 20 |
3 files changed, 283 insertions, 11 deletions
@@ -3,20 +3,25 @@ CC = g++ CFLAGS = -O3 -s -march=native -Wall -Wextra -DVERSION=$(VERSION) -std=c++11 DESTDIR = /usr/local/ -all: libkmer.so kmer_total_count kmer_counts_per_sequence +all: libkmer.so kmer_total_count kmer_counts_per_sequence kmer_utils.o kmer_locations + +kmer_utils.o: + $(CC) -c kmer_utils.c -O $(CFLAGS) -fPIC libkmer.so: kmer_utils.o - $(CC) kmer_utils.c -o libkmer.so $(CFLAGS) -shared -fPIC + $(CC) kmer_utils.o -o libkmer.so $(CFLAGS) -shared -fPIC -kmer_total_count: kmer_utils.c kmer_total_count.c kmer_utils.h - $(CC) kmer_utils.c kmer_total_count.c -o kmer_total_count $(CLIBS) $(CFLAGS) +kmer_total_count: kmer_utils.o kmer_total_count.c kmer_utils.h + $(CC) kmer_utils.o kmer_total_count.c -o kmer_total_count $(CLIBS) $(CFLAGS) -kmer_counts_per_sequence: kmer_utils.c kmer_counts_per_sequence.c kmer_utils.h - $(CC) kmer_utils.c kmer_counts_per_sequence.c -o kmer_counts_per_sequence $(CLIBS) $(CFLAGS) +kmer_counts_per_sequence: kmer_utils.o kmer_counts_per_sequence.c kmer_utils.h + $(CC) kmer_utils.o kmer_counts_per_sequence.c -o kmer_counts_per_sequence $(CLIBS) $(CFLAGS) -kmer_continuous_count: kmer_utils.c kmer_continuous_count.c kmer_utils.h - $(CC) kmer_utils.c kmer_continuous_count.c -o kmer_continuous_count $(CLIBS) $(CFLAGS) +kmer_continuous_count: kmer_utils.o kmer_continuous_count.c kmer_utils.h + $(CC) kmer_utils.o kmer_continuous_count.c -o kmer_continuous_count $(CLIBS) $(CFLAGS) +kmer_locations: kmer_utils.o kmer_locations.c kmer_utils.h + $(CC) kmer_utils.o kmer_locations.c -o kmer_locations $(CLIBS) $(CFLAGS) clean: rm -vf kmer_total_count kmer_counts_per_sequence kmer_continuous_count kmer_utils.o libkmer.so rm -vf kmer_total_count kmer_counts_per_sequence kmer_continuous_count libkmer.so kmer_utils.o diff --git a/kmer_locations.c b/kmer_locations.c new file mode 100644 index 0000000..fbfcab4 --- /dev/null +++ b/kmer_locations.c @@ -0,0 +1,253 @@ +// Copyright 2013 Calvin Morrison +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <getopt.h> + +#include "kmer_utils.h" + +void print_mer(unsigned long long mer, long long pos, bool labels, bool reverse, unsigned int kmer) { + if(labels) { + char *kmer_str = index_to_kmer(mer, kmer); + if(reverse) + printf("%s %llu c\n", kmer_str, pos); + else + printf("%s %llu\n", kmer_str, pos); + free(kmer_str); + } + else { + if(reverse) + printf("%llu %llu c\n", mer, pos); + else + printf("%llu %llu\n", mer, pos); + } +} + +void print_sequence(const char *seq, // sequence + const size_t seq_length, // length of sequence + const size_t global_pos, // overall position in read + const unsigned int kmer, // kmer size + size_t *specific_mers, // specific mers array + size_t num_specific_mers, + bool labels, // print label instead of indicies + bool reverse) // reverse points +{ + long long position; + long long i; + + // loop through our seq to process each k-mer + for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { + unsigned long long mer = 0; + unsigned long long multiply = 1; + + // for each char in the k-mer check if it is an error char + for(i = position + kmer - 1; i >= position; i--){ + if(seq[i] == 5) { + position = i; + goto next; + } + + // multiply this char in the mer by the multiply + // and bitshift the multiply for the next round + mer += seq[i] * multiply; + multiply = multiply << 2; + } + + // bump up the mer value in the counts array + if(num_specific_mers != 0) { + size_t j; + for(j = 0; j < num_specific_mers; j++) { + if(mer == specific_mers[j]) + print_mer(mer, position + global_pos, labels, reverse, kmer); + } + } + else { + print_mer(mer, position + global_pos, labels, reverse, kmer); + } + // skip count if error + next: ; + } +} + +void help() { + printf("usage: kmer_locations -i input_file -k kmer [-c] [-n] [-l] ...\n\n" + "print locations of mers in size k from a fasta file\n" + "\n" + " --input -i input fasta file to count\n" + " --kmer -k size of mers to count\n" + " --compliment -c count compliment of sequences (position is not in the reverse. \n" + " --label -l print mer along with location\n" + " --mer-file -m a file containing a list of mers you are interested\n" + " in opening. this will enable output your results in\n" + " a sparse format \n" + "\n" + "Report all bugs to mutantturkey@gmail.com\n" + "\n" + "Copyright 2014 Calvin Morrison, Drexel University.\n" + "\n" + "If you are using any dna-utils tool for a publication\n" + "please cite your usage:\n\n" + "dna-utils. Drexel University, Philadelphia USA, 2014;\n" + "software available at www.github.com/EESI/dna-utils/\n"); +} + + +int main(int argc, char **argv) { + + char *fn = NULL; + FILE *fh = NULL; + + // for getdelim + char *line = NULL; + size_t len = 0; + ssize_t read; + + unsigned int kmer = 0; + + // for specific mers + char *mer_fn = NULL; + size_t num_desired_indicies = 0; + size_t *desired_indicies = NULL; + + bool label = false; + bool kmer_set = false; + bool specific_mers = false; + bool count_compliment = false; + + unsigned long long width = 0; + + static struct option long_options[] = { + {"input", required_argument, 0, 'i'}, + {"kmer", required_argument, 0, 'k'}, + {"compliment", required_argument, 0, 'c'}, + {"label", no_argument, 0, 'l'}, + {"mer-file", no_argument, 0, 'm'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + + while (1) { + + int option_index = 0; + int c = 0; + + c = getopt_long (argc, argv, "i:k:m:clvh", long_options, &option_index); + + if (c == -1) + break; + + switch (c) { + case 'i': + fn = optarg; + break; + case 'k': + kmer = atoi(optarg); + kmer_set = true; + break; + case 'c': + count_compliment = true; + break; + case 'm': + specific_mers = false; + mer_fn = optarg; + break; + case 'l': + label = true; + break; + case 'h': + help(); + exit(EXIT_SUCCESS); + break; + case 'v': + printf("dna-utils version " VERSION "\n"); + exit(EXIT_SUCCESS); + break; + default: + break; + } + } + if(argc == 1) { + help(); + exit(EXIT_FAILURE); + } + if(fn == NULL) { + fprintf(stderr, "no input file specified with -i, reading from stdin\n"); + fh = stdin; + } + else { + fh = fopen(fn, "r"); + if(fh == NULL) { + fprintf(stderr, "Could not open %s - %s\n", fn, strerror(errno)); + exit(EXIT_FAILURE); + } + } + if(!kmer_set) { + fprintf(stderr, "Error: kmer (-k) must be supplied\n"); + exit(EXIT_FAILURE); + } + if(kmer == 0) { + fprintf(stderr, "Error: invalid kmer - '%d'.\n", kmer); + exit(EXIT_FAILURE); + } + + if(specific_mers) { + desired_indicies = (size_t *) malloc((width) * sizeof(size_t)); + if(desired_indicies == NULL) + exit(EXIT_FAILURE); + num_desired_indicies = load_specific_mers_from_file(mer_fn, kmer, width, desired_indicies); + if(num_desired_indicies == 0) { + fprintf(stderr, "Error: no mers loaded from file\n"); + exit(EXIT_FAILURE); + } + } + + + unsigned long long sequence = 0; + unsigned long long global_position = 0; + + while ((read = getdelim(&line, &len, '>', fh)) != -1) { + size_t k = 0; + + // find our first \n, this should be the end of the header + char *seq = strchr(line, '\n'); + if(seq == NULL) + continue; + + + // point to one past that. + seq = seq + 1; + + // strip out all other newlines to handle multiline sequences + size_t seq_length = strnstrip(seq, '\n', strlen(seq)); + + for(k = 0; k < seq_length; k++) { + seq[k] = alpha[(int)seq[k]]; + } + + print_sequence(seq, seq_length, global_position, kmer, desired_indicies, num_desired_indicies, label, false); + + if(count_compliment) { + for(k = 0; k < seq_length; k++) { + seq[k] = compliment[(int)seq[k]]; + } + + reverse_string(seq, seq_length); + print_sequence(seq, seq_length, global_position, kmer, desired_indicies, num_desired_indicies, label, true); + } + + if(seq[seq_length - 1] == '>') + seq_length --; + + sequence++; + global_position += seq_length; + printf("%llu %llu\n", seq_length, global_position); + } + + free(line); + free(desired_indicies); + fclose(fh); + + return EXIT_SUCCESS; + } diff --git a/kmer_utils.h b/kmer_utils.h index 9fcc03c..f5087f2 100644 --- a/kmer_utils.h +++ b/kmer_utils.h @@ -43,9 +43,23 @@ typedef struct { typedef unordered_map<size_t,unsigned long long, kmer_noHash_hash, kmer_eq> kmer_map; -unsigned char alpha[256]; -unsigned char reverse_alpha[4]; -unsigned char compliment[5]; +const unsigned char alpha[256] = +{5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; +const char reverse_alpha[4] = { 'A', 'C', 'G', 'T' }; + + // compliments + // A C G T E E + // T G C A E E +const char compliment[6] = { 3, 2, 1, 0, 5, 5}; // open file from filename in char array *fn, and try and parse in one mer per // line, of size kmer, and store the indicies of those mers in the *arr |