aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-04-11 10:10:38 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2014-04-11 10:10:38 -0400
commit564bdd120fa4e2de4c8419d774f1fa6e1d8a0fbc (patch)
treee0d0de42b496161650d9912ee306ecc071c23e41
parent2319585caf71cded277d045c9d4001566d753778 (diff)
parent26233d023b0b1febf0edf8ca5073f36f93b1a33b (diff)
Merge branch 'master' of github.com:mutantturkey/dna-utils
-rw-r--r--Makefile21
-rw-r--r--kmer_locations.c253
-rw-r--r--kmer_utils.h20
3 files changed, 283 insertions, 11 deletions
diff --git a/Makefile b/Makefile
index e729fc9..81bb031 100644
--- a/Makefile
+++ b/Makefile
@@ -3,20 +3,25 @@ CC = g++
CFLAGS = -O3 -s -march=native -Wall -Wextra -DVERSION=$(VERSION) -std=c++11
DESTDIR = /usr/local/
-all: libkmer.so kmer_total_count kmer_counts_per_sequence
+all: libkmer.so kmer_total_count kmer_counts_per_sequence kmer_utils.o kmer_locations
+
+kmer_utils.o:
+ $(CC) -c kmer_utils.c -O $(CFLAGS) -fPIC
libkmer.so: kmer_utils.o
- $(CC) kmer_utils.c -o libkmer.so $(CFLAGS) -shared -fPIC
+ $(CC) kmer_utils.o -o libkmer.so $(CFLAGS) -shared -fPIC
-kmer_total_count: kmer_utils.c kmer_total_count.c kmer_utils.h
- $(CC) kmer_utils.c kmer_total_count.c -o kmer_total_count $(CLIBS) $(CFLAGS)
+kmer_total_count: kmer_utils.o kmer_total_count.c kmer_utils.h
+ $(CC) kmer_utils.o kmer_total_count.c -o kmer_total_count $(CLIBS) $(CFLAGS)
-kmer_counts_per_sequence: kmer_utils.c kmer_counts_per_sequence.c kmer_utils.h
- $(CC) kmer_utils.c kmer_counts_per_sequence.c -o kmer_counts_per_sequence $(CLIBS) $(CFLAGS)
+kmer_counts_per_sequence: kmer_utils.o kmer_counts_per_sequence.c kmer_utils.h
+ $(CC) kmer_utils.o kmer_counts_per_sequence.c -o kmer_counts_per_sequence $(CLIBS) $(CFLAGS)
-kmer_continuous_count: kmer_utils.c kmer_continuous_count.c kmer_utils.h
- $(CC) kmer_utils.c kmer_continuous_count.c -o kmer_continuous_count $(CLIBS) $(CFLAGS)
+kmer_continuous_count: kmer_utils.o kmer_continuous_count.c kmer_utils.h
+ $(CC) kmer_utils.o kmer_continuous_count.c -o kmer_continuous_count $(CLIBS) $(CFLAGS)
+kmer_locations: kmer_utils.o kmer_locations.c kmer_utils.h
+ $(CC) kmer_utils.o kmer_locations.c -o kmer_locations $(CLIBS) $(CFLAGS)
clean:
rm -vf kmer_total_count kmer_counts_per_sequence kmer_continuous_count kmer_utils.o libkmer.so
rm -vf kmer_total_count kmer_counts_per_sequence kmer_continuous_count libkmer.so kmer_utils.o
diff --git a/kmer_locations.c b/kmer_locations.c
new file mode 100644
index 0000000..fbfcab4
--- /dev/null
+++ b/kmer_locations.c
@@ -0,0 +1,253 @@
+// Copyright 2013 Calvin Morrison
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+
+#include "kmer_utils.h"
+
+void print_mer(unsigned long long mer, long long pos, bool labels, bool reverse, unsigned int kmer) {
+ if(labels) {
+ char *kmer_str = index_to_kmer(mer, kmer);
+ if(reverse)
+ printf("%s %llu c\n", kmer_str, pos);
+ else
+ printf("%s %llu\n", kmer_str, pos);
+ free(kmer_str);
+ }
+ else {
+ if(reverse)
+ printf("%llu %llu c\n", mer, pos);
+ else
+ printf("%llu %llu\n", mer, pos);
+ }
+}
+
+void print_sequence(const char *seq, // sequence
+ const size_t seq_length, // length of sequence
+ const size_t global_pos, // overall position in read
+ const unsigned int kmer, // kmer size
+ size_t *specific_mers, // specific mers array
+ size_t num_specific_mers,
+ bool labels, // print label instead of indicies
+ bool reverse) // reverse points
+{
+ long long position;
+ long long i;
+
+ // loop through our seq to process each k-mer
+ for(position = 0; position < (signed)(seq_length - kmer + 1); position++) {
+ unsigned long long mer = 0;
+ unsigned long long multiply = 1;
+
+ // for each char in the k-mer check if it is an error char
+ for(i = position + kmer - 1; i >= position; i--){
+ if(seq[i] == 5) {
+ position = i;
+ goto next;
+ }
+
+ // multiply this char in the mer by the multiply
+ // and bitshift the multiply for the next round
+ mer += seq[i] * multiply;
+ multiply = multiply << 2;
+ }
+
+ // bump up the mer value in the counts array
+ if(num_specific_mers != 0) {
+ size_t j;
+ for(j = 0; j < num_specific_mers; j++) {
+ if(mer == specific_mers[j])
+ print_mer(mer, position + global_pos, labels, reverse, kmer);
+ }
+ }
+ else {
+ print_mer(mer, position + global_pos, labels, reverse, kmer);
+ }
+ // skip count if error
+ next: ;
+ }
+}
+
+void help() {
+ printf("usage: kmer_locations -i input_file -k kmer [-c] [-n] [-l] ...\n\n"
+ "print locations of mers in size k from a fasta file\n"
+ "\n"
+ " --input -i input fasta file to count\n"
+ " --kmer -k size of mers to count\n"
+ " --compliment -c count compliment of sequences (position is not in the reverse. \n"
+ " --label -l print mer along with location\n"
+ " --mer-file -m a file containing a list of mers you are interested\n"
+ " in opening. this will enable output your results in\n"
+ " a sparse format \n"
+ "\n"
+ "Report all bugs to mutantturkey@gmail.com\n"
+ "\n"
+ "Copyright 2014 Calvin Morrison, Drexel University.\n"
+ "\n"
+ "If you are using any dna-utils tool for a publication\n"
+ "please cite your usage:\n\n"
+ "dna-utils. Drexel University, Philadelphia USA, 2014;\n"
+ "software available at www.github.com/EESI/dna-utils/\n");
+}
+
+
+int main(int argc, char **argv) {
+
+ char *fn = NULL;
+ FILE *fh = NULL;
+
+ // for getdelim
+ char *line = NULL;
+ size_t len = 0;
+ ssize_t read;
+
+ unsigned int kmer = 0;
+
+ // for specific mers
+ char *mer_fn = NULL;
+ size_t num_desired_indicies = 0;
+ size_t *desired_indicies = NULL;
+
+ bool label = false;
+ bool kmer_set = false;
+ bool specific_mers = false;
+ bool count_compliment = false;
+
+ unsigned long long width = 0;
+
+ static struct option long_options[] = {
+ {"input", required_argument, 0, 'i'},
+ {"kmer", required_argument, 0, 'k'},
+ {"compliment", required_argument, 0, 'c'},
+ {"label", no_argument, 0, 'l'},
+ {"mer-file", no_argument, 0, 'm'},
+ {"help", no_argument, 0, 'h'},
+ {0, 0, 0, 0}
+ };
+
+ while (1) {
+
+ int option_index = 0;
+ int c = 0;
+
+ c = getopt_long (argc, argv, "i:k:m:clvh", long_options, &option_index);
+
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'i':
+ fn = optarg;
+ break;
+ case 'k':
+ kmer = atoi(optarg);
+ kmer_set = true;
+ break;
+ case 'c':
+ count_compliment = true;
+ break;
+ case 'm':
+ specific_mers = false;
+ mer_fn = optarg;
+ break;
+ case 'l':
+ label = true;
+ break;
+ case 'h':
+ help();
+ exit(EXIT_SUCCESS);
+ break;
+ case 'v':
+ printf("dna-utils version " VERSION "\n");
+ exit(EXIT_SUCCESS);
+ break;
+ default:
+ break;
+ }
+ }
+ if(argc == 1) {
+ help();
+ exit(EXIT_FAILURE);
+ }
+ if(fn == NULL) {
+ fprintf(stderr, "no input file specified with -i, reading from stdin\n");
+ fh = stdin;
+ }
+ else {
+ fh = fopen(fn, "r");
+ if(fh == NULL) {
+ fprintf(stderr, "Could not open %s - %s\n", fn, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ }
+ if(!kmer_set) {
+ fprintf(stderr, "Error: kmer (-k) must be supplied\n");
+ exit(EXIT_FAILURE);
+ }
+ if(kmer == 0) {
+ fprintf(stderr, "Error: invalid kmer - '%d'.\n", kmer);
+ exit(EXIT_FAILURE);
+ }
+
+ if(specific_mers) {
+ desired_indicies = (size_t *) malloc((width) * sizeof(size_t));
+ if(desired_indicies == NULL)
+ exit(EXIT_FAILURE);
+ num_desired_indicies = load_specific_mers_from_file(mer_fn, kmer, width, desired_indicies);
+ if(num_desired_indicies == 0) {
+ fprintf(stderr, "Error: no mers loaded from file\n");
+ exit(EXIT_FAILURE);
+ }
+ }
+
+
+ unsigned long long sequence = 0;
+ unsigned long long global_position = 0;
+
+ while ((read = getdelim(&line, &len, '>', fh)) != -1) {
+ size_t k = 0;
+
+ // find our first \n, this should be the end of the header
+ char *seq = strchr(line, '\n');
+ if(seq == NULL)
+ continue;
+
+
+ // point to one past that.
+ seq = seq + 1;
+
+ // strip out all other newlines to handle multiline sequences
+ size_t seq_length = strnstrip(seq, '\n', strlen(seq));
+
+ for(k = 0; k < seq_length; k++) {
+ seq[k] = alpha[(int)seq[k]];
+ }
+
+ print_sequence(seq, seq_length, global_position, kmer, desired_indicies, num_desired_indicies, label, false);
+
+ if(count_compliment) {
+ for(k = 0; k < seq_length; k++) {
+ seq[k] = compliment[(int)seq[k]];
+ }
+
+ reverse_string(seq, seq_length);
+ print_sequence(seq, seq_length, global_position, kmer, desired_indicies, num_desired_indicies, label, true);
+ }
+
+ if(seq[seq_length - 1] == '>')
+ seq_length --;
+
+ sequence++;
+ global_position += seq_length;
+ printf("%llu %llu\n", seq_length, global_position);
+ }
+
+ free(line);
+ free(desired_indicies);
+ fclose(fh);
+
+ return EXIT_SUCCESS;
+ }
diff --git a/kmer_utils.h b/kmer_utils.h
index 9fcc03c..f5087f2 100644
--- a/kmer_utils.h
+++ b/kmer_utils.h
@@ -43,9 +43,23 @@ typedef struct {
typedef unordered_map<size_t,unsigned long long, kmer_noHash_hash, kmer_eq> kmer_map;
-unsigned char alpha[256];
-unsigned char reverse_alpha[4];
-unsigned char compliment[5];
+const unsigned char alpha[256] =
+{5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
+const char reverse_alpha[4] = { 'A', 'C', 'G', 'T' };
+
+ // compliments
+ // A C G T E E
+ // T G C A E E
+const char compliment[6] = { 3, 2, 1, 0, 5, 5};
// open file from filename in char array *fn, and try and parse in one mer per
// line, of size kmer, and store the indicies of those mers in the *arr