aboutsummaryrefslogtreecommitdiff
path: root/kmer_continuous_count.c
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-03-06 17:05:40 -0500
committerCalvin Morrison <mutantturkey@gmail.com>2014-03-06 17:05:40 -0500
commit2c038ba630c14c7030186c64e9eb92761ddcba74 (patch)
treef4705db1e2603bdc831254eee60800eb10448fcf /kmer_continuous_count.c
parent5d7e67a846ec104da2d7bdb988672fbd02ddda28 (diff)
add kmer_continuous_count
this tool will count continuously, instead of line by line. The way that this works out is something like this: test.fa > header 1 AAAAATTTTT > header 2 GGGGGAAAAA counting 6 mers, the program will count TTTGGG, TTGGGG, TGGGGG, like there was no header separating them. This can be useful for certain types of processing, like when the sequences are continuous from a genome. initial commit
Diffstat (limited to 'kmer_continuous_count.c')
-rw-r--r--kmer_continuous_count.c158
1 files changed, 158 insertions, 0 deletions
diff --git a/kmer_continuous_count.c b/kmer_continuous_count.c
new file mode 100644
index 0000000..5ace9a5
--- /dev/null
+++ b/kmer_continuous_count.c
@@ -0,0 +1,158 @@
+
+// Copyright 2013 Calvin Morrison
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+
+#include "kmer_utils.h"
+
+
+/*
+ * Print the usage/help text for this tool to stdout.
+ *
+ * The usage line names this program (kmer_continuous_count); it previously
+ * carried the name of a different tool.
+ */
+void help(void) {
+	printf("usage: kmer_continuous_count -i input_file -k kmer [-n] [-l] ...\n\n"
+				 "count mers in size k from a fasta file, but do so continuously\n"
+				 "\n"
+				 " --input -i input fasta file to count\n"
+				 " --kmer -k size of mers to count\n"
+				 " --nonzero -n only print non-zero values\n"
+				 " --label -l print mer along with value\n"
+				 "\n"
+				 "Report all bugs to mutantturkey@gmail.com\n"
+				 "\n"
+				 "Copyright 2014 Calvin Morrison, Drexel University.\n"
+				 "\n"
+				 "If you are using any dna-utils tool for a publication\n"
+				 "please cite your usage:\n\n"
+				 "dna-utils. Drexel University, Philadelphia USA, 2014;\n"
+				 "software available at www.github.com/EESI/dna-utils/\n");
+}
+
+
+/*
+ * Entry point for kmer_continuous_count.
+ *
+ * Parses the command line, obtains the per-mer counts for the input via
+ * get_continuous_kmer_counts_from_file(), and prints them to stdout.
+ *
+ * Options:
+ *   -i / --input    input file; when absent the input is read from stdin
+ *   -k / --kmer     mer size (required, 1..31)
+ *   -n / --nonzero  only print entries whose count is non-zero
+ *   -l / --label    print the mer string in front of each count
+ *   -h / --help     print usage and exit
+ *   -v              print the dna-utils version and exit
+ *
+ * Output format, one entry per line:
+ *   label            -> "<mer>\t<count>"
+ *   nonzero only     -> "<index>\t<count>"
+ *   neither          -> "<count>"
+ *
+ * Returns EXIT_SUCCESS on success; exits with EXIT_FAILURE on bad arguments,
+ * an unopenable input file, or a missing counts array.
+ */
+int main(int argc, char **argv) {
+
+	char *fn = NULL;
+	FILE *fh = NULL;
+
+	// raw -k argument; validated after option parsing so that -h / -v
+	// given later on the command line still take effect first
+	char *kmer_arg = NULL;
+	unsigned int kmer = 0;
+
+	bool nonzero = false;
+	bool label = false;
+
+	unsigned long long width = 0;
+	unsigned long long i = 0;
+
+	static struct option long_options[] = {
+		{"input", required_argument, 0, 'i'},
+		{"kmer", required_argument, 0, 'k'},
+		{"nonzero", no_argument, 0, 'n'},
+		{"label", no_argument, 0, 'l'},
+		{"help", no_argument, 0, 'h'},
+		{0, 0, 0, 0}
+	};
+
+	while (1) {
+
+		int c = getopt_long(argc, argv, "i:k:nlvh", long_options, NULL);
+
+		if (c == -1)
+			break;
+
+		switch (c) {
+			case 'i':
+				fn = optarg;
+				break;
+			case 'k':
+				kmer_arg = optarg;
+				break;
+			case 'n':
+				nonzero = true;
+				break;
+			case 'l':
+				label = true;
+				break;
+			case 'h':
+				help();
+				exit(EXIT_SUCCESS);
+				break;
+			case 'v':
+				printf("dna-utils version " VERSION "\n");
+				exit(EXIT_SUCCESS);
+				break;
+			default:
+				break;
+		}
+	}
+	if(argc == 1) {
+		help();
+		exit(EXIT_FAILURE);
+	}
+	if(fn == NULL) {
+		fprintf(stderr, "no input file specified with -i, reading from stdin\n");
+		fh = stdin;
+	}
+	else {
+		fh = fopen(fn, "r");
+		if(fh == NULL) {
+			fprintf(stderr, "Could not open %s - %s\n", fn, strerror(errno));
+			exit(EXIT_FAILURE);
+		}
+	}
+	if(kmer_arg == NULL) {
+		fprintf(stderr, "Error: kmer (-k) must be supplied\n");
+		exit(EXIT_FAILURE);
+	}
+
+	// Parse -k strictly: atoi() would accept trailing garbage ("5x") and
+	// silently wrap negative values into a huge unsigned mer size.
+	{
+		char *end = NULL;
+		long parsed = 0;
+
+		errno = 0;
+		parsed = strtol(kmer_arg, &end, 10);
+
+		// 4^k has to fit in the unsigned long long 'width' below;
+		// 4^32 already needs 65 bits, so 31 is the largest usable size.
+		if(errno != 0 || end == kmer_arg || *end != '\0' || parsed < 1 || parsed > 31) {
+			fprintf(stderr, "Error: invalid kmer - '%s'.\n", kmer_arg);
+			exit(EXIT_FAILURE);
+		}
+		kmer = (unsigned int) parsed;
+	}
+
+	width = pow_four(kmer);
+
+	unsigned long long *counts = get_continuous_kmer_counts_from_file(fh, kmer);
+
+	// the input is fully consumed by now; release it unless it is stdin
+	if(fh != stdin)
+		fclose(fh);
+
+	// NOTE(review): whether the helper can return NULL is not visible from
+	// this file; guard anyway so we never index a NULL array.
+	if(counts == NULL) {
+		fprintf(stderr, "Error: could not count kmers\n");
+		exit(EXIT_FAILURE);
+	}
+
+	for(i = 0; i < width; i++) {
+		// If nonzero is set, only print non zeros
+		if(nonzero && counts[i] == 0)
+			continue;
+
+		if(label) {
+			// the string returned by index_to_kmer is freed here after use
+			char *kmer_str = index_to_kmer(i, kmer);
+			fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]);
+			free(kmer_str);
+		}
+		else if(nonzero) {
+			// without labels a sparse listing still needs the index
+			fprintf(stdout, "%llu\t%llu\n", i, counts[i]);
+		}
+		else {
+			// dense listing: position in the output is the index
+			fprintf(stdout, "%llu\n", counts[i]);
+		}
+	}
+
+	free(counts);
+	return EXIT_SUCCESS;
+}