aboutsummaryrefslogtreecommitdiff
path: root/kmer_counts_per_sequence.c
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-01-30 16:29:47 -0500
committerCalvin Morrison <mutantturkey@gmail.com>2014-01-30 16:29:47 -0500
commit84e6803509f91fc01678f18ec634480463099b10 (patch)
tree44cf7bf75ff7439852c81668a9eb8589a843fcdb /kmer_counts_per_sequence.c
parent59ff0a86e855cd78628af40ade8b2e096eb8833e (diff)
kmer_count_per_sequence: add option to load specific mers from file, add multiline ecounting
Diffstat (limited to 'kmer_counts_per_sequence.c')
-rw-r--r--kmer_counts_per_sequence.c100
1 files changed, 75 insertions, 25 deletions
diff --git a/kmer_counts_per_sequence.c b/kmer_counts_per_sequence.c
index fceb17e..f725d5b 100644
--- a/kmer_counts_per_sequence.c
+++ b/kmer_counts_per_sequence.c
@@ -3,69 +3,119 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <stdbool.h>
#include "kmer_utils.h"
unsigned long position = 0;
int main(int argc, char **argv) {
+ // getdelim variables
char *line = NULL;
size_t len = 0;
ssize_t read;
- if(argc != 3) {
+ // specific mer variables
+ bool specific_mers = false;
+ size_t num_desired_indicies = 0;
+ size_t *desired_indicies = NULL;
+
+ unsigned long kmer = atoi(argv[2]);
+ if(kmer == 0) {
+ fprintf(stderr, "Error: invalid kmer.\n");
+ exit(EXIT_FAILURE);
+ }
+
+ const unsigned long long width = (unsigned long long)1 << (kmer * 2);
+
+ if(argc < 3) {
printf("Please supply a filename and a kmer\n");
exit(EXIT_FAILURE);
}
+ if(argc == 4) {
+ specific_mers = true;
+ desired_indicies = malloc((width) * sizeof(size_t));
+ if(desired_indicies == NULL)
+ exit(EXIT_FAILURE);
+ num_desired_indicies = load_specific_mers_from_file(argv[3], kmer, width, desired_indicies);
+
+ }
+
FILE *fh = fopen(argv[1], "r" );
if(fh == NULL) {
fprintf(stderr, "Error opening %s - %s\n", argv[1], strerror(errno));
exit(EXIT_FAILURE);
}
+ unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long));
+ if(counts == NULL)
+ exit(EXIT_FAILURE);
- unsigned long kmer = atoi(argv[2]);
- if(kmer == 0) {
- fprintf(stderr, "Error: invalid kmer.\n");
+ char *str = malloc(BUFSIZ);
+ if(str == NULL) {
+ fprintf(stderr, strerror(errno));
exit(EXIT_FAILURE);
}
- const unsigned long width = (unsigned long)1 << (kmer * 2);
+ unsigned long long str_size = BUFSIZ;
+ unsigned long long sequence = 0;
+ while ((read = getdelim(&line, &len, '>', fh)) != -1) {
+ size_t i = 0;
- unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long));
- if(counts == NULL)
- exit(EXIT_FAILURE);
+ memset(counts, 0, width * sizeof(unsigned long long));
- while ((read = getline(&line, &len, fh)) != -1) {
- if(line[0] != '>' && (read > kmer)) {
+ // find our first \n, this should be the end of the header
+ char *start = strchr(line, '\n');
+ if(start == NULL)
+ continue;
- unsigned int i = 0;
- unsigned long total = 0;
+ // point to one past that.
+ start = start + 1;
- // reset our count matrix to zero
- memset(counts, 0, width * sizeof(unsigned long long));
+ size_t start_len = strlen(start);
- for(i = 0; i < read; i++) {
- line[i] = alpha[(int)line[i]];
- }
- for(i = 0; i < read - kmer; i++) {
- counts[num_to_index(&line[i],kmer, width)]++;
+ // if our current str buffer isn't big enough, realloc
+ if(start_len + 1 > str_size + 1) {
+ str = realloc(str, start_len + 1);
+ if(str == NULL) {
+ exit(EXIT_FAILURE);
+ fprintf(stderr, strerror(errno));
}
+ }
+
- for(i = 0; i < width; i++)
- total += counts[i];
+ // strip out all other newlines to handle multiline sequences
+ str = strnstrip(start, str, '\n',start_len);
+ size_t seq_length = strlen(str);
+ // reset our count matrix to zero
+
+ for(i = 0; i < seq_length; i++) {
+ str[i] = alpha[(int)str[i]];
+ }
+
+ for(i = 0; i < (seq_length - kmer + 1); i++) {
+ size_t mer = num_to_index(&str[i],kmer, width, &i);
+ counts[mer]++;
+ }
+
+
+ if(specific_mers) {
+ for(i = 0; i < num_desired_indicies; i++)
+ printf("%llu\t%zu\t%llu\n", sequence, desired_indicies[i], counts[desired_indicies[i]]);
+ sequence++;
+ }
+ else {
for(i = 0; i < width - 1; i++)
printf("%llu\t", counts[i]);
printf("%llu\n", counts[width - 1]);
-
}
}
- free(counts);
- free(line);
+free(counts);
+free(line);
- return EXIT_SUCCESS;
+return EXIT_SUCCESS;
}