aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-01-30 16:29:47 -0500
committerCalvin Morrison <mutantturkey@gmail.com>2014-01-30 16:29:47 -0500
commit84e6803509f91fc01678f18ec634480463099b10 (patch)
tree44cf7bf75ff7439852c81668a9eb8589a843fcdb
parent59ff0a86e855cd78628af40ade8b2e096eb8833e (diff)
kmer_count_per_sequence: add option to load specific mers from file, add multiline ecounting
-rw-r--r--kmer_counts_per_sequence.c100
-rw-r--r--kmer_frequency_per_sequence.h1
-rw-r--r--kmer_utils.c64
-rw-r--r--kmer_utils.h6
4 files changed, 134 insertions, 37 deletions
diff --git a/kmer_counts_per_sequence.c b/kmer_counts_per_sequence.c
index fceb17e..f725d5b 100644
--- a/kmer_counts_per_sequence.c
+++ b/kmer_counts_per_sequence.c
@@ -3,69 +3,119 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <stdbool.h>
#include "kmer_utils.h"
unsigned long position = 0;
int main(int argc, char **argv) {
+ // getdelim variables
char *line = NULL;
size_t len = 0;
ssize_t read;
- if(argc != 3) {
+ // specific mer variables
+ bool specific_mers = false;
+ size_t num_desired_indicies = 0;
+ size_t *desired_indicies = NULL;
+
+ unsigned long kmer = atoi(argv[2]);
+ if(kmer == 0) {
+ fprintf(stderr, "Error: invalid kmer.\n");
+ exit(EXIT_FAILURE);
+ }
+
+ const unsigned long long width = (unsigned long long)1 << (kmer * 2);
+
+ if(argc < 3) {
printf("Please supply a filename and a kmer\n");
exit(EXIT_FAILURE);
}
+ if(argc == 4) {
+ specific_mers = true;
+ desired_indicies = malloc((width) * sizeof(size_t));
+ if(desired_indicies == NULL)
+ exit(EXIT_FAILURE);
+ num_desired_indicies = load_specific_mers_from_file(argv[3], kmer, width, desired_indicies);
+
+ }
+
FILE *fh = fopen(argv[1], "r" );
if(fh == NULL) {
fprintf(stderr, "Error opening %s - %s\n", argv[1], strerror(errno));
exit(EXIT_FAILURE);
}
+ unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long));
+ if(counts == NULL)
+ exit(EXIT_FAILURE);
- unsigned long kmer = atoi(argv[2]);
- if(kmer == 0) {
- fprintf(stderr, "Error: invalid kmer.\n");
+ char *str = malloc(BUFSIZ);
+ if(str == NULL) {
+ fprintf(stderr, strerror(errno));
exit(EXIT_FAILURE);
}
- const unsigned long width = (unsigned long)1 << (kmer * 2);
+ unsigned long long str_size = BUFSIZ;
+ unsigned long long sequence = 0;
+ while ((read = getdelim(&line, &len, '>', fh)) != -1) {
+ size_t i = 0;
- unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long));
- if(counts == NULL)
- exit(EXIT_FAILURE);
+ memset(counts, 0, width * sizeof(unsigned long long));
- while ((read = getline(&line, &len, fh)) != -1) {
- if(line[0] != '>' && (read > kmer)) {
+ // find our first \n, this should be the end of the header
+ char *start = strchr(line, '\n');
+ if(start == NULL)
+ continue;
- unsigned int i = 0;
- unsigned long total = 0;
+ // point to one past that.
+ start = start + 1;
- // reset our count matrix to zero
- memset(counts, 0, width * sizeof(unsigned long long));
+ size_t start_len = strlen(start);
- for(i = 0; i < read; i++) {
- line[i] = alpha[(int)line[i]];
- }
- for(i = 0; i < read - kmer; i++) {
- counts[num_to_index(&line[i],kmer, width)]++;
+ // if our current str buffer isn't big enough, realloc
+ if(start_len + 1 > str_size + 1) {
+ str = realloc(str, start_len + 1);
+ if(str == NULL) {
+ exit(EXIT_FAILURE);
+ fprintf(stderr, strerror(errno));
}
+ }
+
- for(i = 0; i < width; i++)
- total += counts[i];
+ // strip out all other newlines to handle multiline sequences
+ str = strnstrip(start, str, '\n',start_len);
+ size_t seq_length = strlen(str);
+ // reset our count matrix to zero
+
+ for(i = 0; i < seq_length; i++) {
+ str[i] = alpha[(int)str[i]];
+ }
+
+ for(i = 0; i < (seq_length - kmer + 1); i++) {
+ size_t mer = num_to_index(&str[i],kmer, width, &i);
+ counts[mer]++;
+ }
+
+
+ if(specific_mers) {
+ for(i = 0; i < num_desired_indicies; i++)
+ printf("%llu\t%zu\t%llu\n", sequence, desired_indicies[i], counts[desired_indicies[i]]);
+ sequence++;
+ }
+ else {
for(i = 0; i < width - 1; i++)
printf("%llu\t", counts[i]);
printf("%llu\n", counts[width - 1]);
-
}
}
- free(counts);
- free(line);
+free(counts);
+free(line);
- return EXIT_SUCCESS;
+return EXIT_SUCCESS;
}
diff --git a/kmer_frequency_per_sequence.h b/kmer_frequency_per_sequence.h
deleted file mode 100644
index e782906..0000000
--- a/kmer_frequency_per_sequence.h
+++ /dev/null
@@ -1 +0,0 @@
-extern long position;
diff --git a/kmer_utils.c b/kmer_utils.c
index 2b3ce67..b0cab47 100644
--- a/kmer_utils.c
+++ b/kmer_utils.c
@@ -26,7 +26,7 @@ inline unsigned long long pow_four(unsigned long long x) {
// convert a string of k-mer size base-4 values into a
// base-10 index
-inline unsigned long num_to_index(const char *str, const int kmer, const long error_pos) {
+inline unsigned long num_to_index(const char *str, const int kmer, const long error_pos, size_t *current_position) {
int i = 0;
unsigned long out = 0;
@@ -35,9 +35,7 @@ inline unsigned long num_to_index(const char *str, const int kmer, const long er
for(i = kmer - 1; i >= 0; i--){
if(str[i] == 5) {
- #ifndef SHARED
- position += i;
- #endif
+ current_position += i;
return error_pos;
}
@@ -48,11 +46,59 @@ inline unsigned long num_to_index(const char *str, const int kmer, const long er
return out;
}
+// return the number of loaded elements
+unsigned long long load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) {
+ FILE *fh;
+ size_t arr_pos = 0;
+ char line[64];
+
+ fh = fopen(fn, "r");
+ if(fh == NULL) {
+ fprintf(stderr, "Error opening %s - %s\n", fn, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ while (fgets(line, sizeof(line), fh) != NULL) {
+ size_t i;
+ size_t len;
+
+ len = strlen(line);
+ if(len == 0)
+ continue;
+
+ len--;
+ line[len] = '\0';
+
+
+ if(len != kmer) {
+ fprintf(stderr, "SKIPPING: '%s' is not length %u\n", line, kmer);
+ continue;
+ }
+
+ for(i = 0; i < len; i++) {
+ line[i] = alpha[(int)line[i]];
+ }
+
+ size_t mer = num_to_index(line, kmer, width, NULL);
+ if(mer == width) {
+ fprintf(stderr, "SKIPPING: '%s' is a unrecognized mer\n", line);
+ continue;
+ }
+ else {
+ arr[arr_pos] = mer;
+ arr_pos++;
+ }
+ }
+
+ fclose(fh);
+ return arr_pos;
+}
+
// convert an index back into a kmer string
char *index_to_kmer(unsigned long long index, long kmer) {
- int i = 0;
- int j = 0;
+ size_t i = 0;
+ size_t j = 0;
char *num_array = calloc(kmer, sizeof(char));
char *ret = calloc(kmer + 1, sizeof(char));
if(ret == NULL)
@@ -74,7 +120,7 @@ char *index_to_kmer(unsigned long long index, long kmer) {
// our offset for how many chars we prepended
int offset = j;
// save i so we can print it
- int start = i ;
+ size_t start = i ;
// decrement i by 1 to reverse the last i++
i--;
@@ -116,8 +162,8 @@ unsigned long long * get_kmer_counts_from_file(const char *fn, const unsigned in
size_t len = 0;
ssize_t read;
- long long i = 0;
- long long position = 0;
+ size_t i = 0;
+ size_t position = 0;
FILE * const fh = fopen(fn, "r");
if(fh == NULL) {
diff --git a/kmer_utils.h b/kmer_utils.h
index 762c20c..cfe664b 100644
--- a/kmer_utils.h
+++ b/kmer_utils.h
@@ -1,12 +1,14 @@
// Kmer functions
void convert_kmer_to_num(char *str, const unsigned long length);
-unsigned long num_to_index(const char *str, const int kmer, const long error_pos);
+unsigned long num_to_index(const char *str, const int kmer, const long error_pos, size_t *current_position);
char *index_to_kmer(unsigned long long index, long kmer);
unsigned long long * get_kmer_counts_from_file(const char *fn, const int kmer);
// Utility functions
char *strnstrip(const char *s, char *dest, int c, int len);
-inline unsigned long long pow_four(unsigned long long x);
+unsigned long long pow_four(unsigned long long x);
// Variables
const unsigned char alpha[256];
+
+unsigned long long load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr);