aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--kmer_counts_per_sequence.c143
1 files changed, 113 insertions, 30 deletions
diff --git a/kmer_counts_per_sequence.c b/kmer_counts_per_sequence.c
index d871ae5..0cab5e7 100644
--- a/kmer_counts_per_sequence.c
+++ b/kmer_counts_per_sequence.c
@@ -1,5 +1,6 @@
// Copyright 2013 Calvin Morrison
#include <errno.h>
+#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -32,43 +33,112 @@ void help() {
int main(int argc, char **argv) {
// getdelim variables
+ char *fn = NULL;
+ FILE *fh = NULL;
+
+ // for specific mers
+ char *mer_fn = NULL;
+ size_t num_desired_indicies = 0;
+ size_t *desired_indicies = NULL;
+
+ // for getdelim
char *line = NULL;
size_t len = 0;
ssize_t read;
- // specific mer variables
- bool specific_mers = false;
- size_t num_desired_indicies = 0;
- size_t *desired_indicies = NULL;
+ // sizes
+ unsigned int kmer = 0;
+ size_t width = 0;
+ bool sparse = false;
+ bool kmer_set = false;
+ bool specific_mers = false;
- if(argc < 3) {
- printf("Please supply a filename and a kmer\n");
+ static struct option long_options[] = {
+ {"input", required_argument, 0, 'i'},
+ {"kmer", required_argument, 0, 'k'},
+ {"sparse", no_argument, 0, 's'},
+ {"mer-file", required_argument, 0, 'm'},
+ {"help", no_argument, 0, 'h'},
+ {0, 0, 0, 0}
+ };
+
+ while (1) {
+
+ int option_index = 0;
+ int c = 0;
+
+ c = getopt_long (argc, argv, "i:k:m:vsh", long_options, &option_index);
+
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'i':
+ fn = optarg;
+ break;
+ case 'k':
+ kmer = atoi(optarg);
+ kmer_set = true;
+ break;
+ case 's':
+ sparse = true;
+ break;
+ case 'm':
+ mer_fn = optarg;
+ specific_mers = true;
+ break;
+ case 'h':
+ help();
+ exit(EXIT_SUCCESS);
+ break;
+ case 'v':
+ printf("dna-utils version " VERSION "\n");
+ exit(EXIT_SUCCESS);
+ break;
+ default:
+ break;
+ }
+ }
+ if(argc == 1) {
+ help();
+ exit(EXIT_FAILURE);
+ }
+ if(fn == NULL) {
+ fprintf(stderr, "no input file specified with -i, reading from stdin\n");
+ fh = stdin;
+ }
+ else {
+ fh = fopen(fn, "r");
+ if(fh == NULL) {
+ fprintf(stderr, "Could not open %s - %s\n", fn, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ }
+ if(!kmer_set) {
+ fprintf(stderr, "Error: kmer (-k) must be supplied\n");
exit(EXIT_FAILURE);
}
-
- unsigned long kmer = atoi(argv[2]);
if(kmer == 0) {
- fprintf(stderr, "Error: invalid kmer.\n");
+ fprintf(stderr, "Error: invalid kmer - '%d'.\n", kmer);
exit(EXIT_FAILURE);
}
- const unsigned long long width = (unsigned long long)1 << (kmer * 2);
+ width = (unsigned long long)1 << (kmer * 2);
- if(argc == 4) {
- specific_mers = true;
+ if(specific_mers) {
+ sparse = false;
desired_indicies = malloc((width) * sizeof(size_t));
if(desired_indicies == NULL)
exit(EXIT_FAILURE);
- num_desired_indicies = load_specific_mers_from_file(argv[3], kmer, width, desired_indicies);
-
+ num_desired_indicies = load_specific_mers_from_file(mer_fn, kmer, width, desired_indicies);
+ if(num_desired_indicies == 0) {
+ fprintf(stderr, "Error: no mers loaded from file");
+ exit(EXIT_FAILURE);
+ }
}
- FILE *fh = fopen(argv[1], "r" );
- if(fh == NULL) {
- fprintf(stderr, "Error opening %s - %s\n", argv[1], strerror(errno));
- exit(EXIT_FAILURE);
- }
+
unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long));
if(counts == NULL)
exit(EXIT_FAILURE);
@@ -79,10 +149,11 @@ int main(int argc, char **argv) {
exit(EXIT_FAILURE);
}
- unsigned long long str_size = BUFSIZ;
+ size_t str_size = BUFSIZ;
unsigned long long sequence = 0;
while ((read = getdelim(&line, &len, '>', fh)) != -1) {
- size_t i = 0;
+ long long i = 0;
+ size_t k = 0;
memset(counts, 0, width * sizeof(unsigned long long));
@@ -113,30 +184,42 @@ int main(int argc, char **argv) {
// reset our count matrix to zero
- for(i = 0; i < seq_length; i++) {
- str[i] = alpha[(int)str[i]];
+ for(k = 0; k < seq_length; k++) {
+ str[k] = alpha[(int)str[k]];
}
- for(i = 0; i < (seq_length - kmer + 1); i++) {
+ for(i = 0; i < (signed long long)(seq_length - kmer + 1); i++) {
size_t mer = num_to_index(&str[i],kmer, width, &i);
counts[mer]++;
}
if(specific_mers) {
- for(i = 0; i < num_desired_indicies; i++)
- printf("%llu\t%zu\t%llu\n", sequence, desired_indicies[i], counts[desired_indicies[i]]);
- sequence++;
+ for(k = 0; k < num_desired_indicies; k++) {
+ if(desired_indicies[k] != 0)
+ fprintf(stdout, "%llu\t%zu\t%llu\n", sequence, desired_indicies[k], counts[desired_indicies[k]]);
+ }
}
+ else if(sparse) {
+ for(k = 0; k < width; k++) {
+ if(counts[k] != 0)
+ fprintf(stdout, "%llu\t%zu\t%llu\n", sequence, k, counts[k]);
+ }
+ }
else {
- for(i = 0; i < width - 1; i++)
- printf("%llu\t", counts[i]);
- printf("%llu\n", counts[width - 1]);
+ for(k = 0; k < width - 1; k++)
+ fprintf(stdout, "%llu\t", counts[k]);
+ fprintf(stdout, "%llu\n", counts[width - 1]);
}
+
+ sequence++;
}
free(counts);
free(line);
+free(desired_indicies);
+free(str);
+fclose(fh);
return EXIT_SUCCESS;