aboutsummaryrefslogtreecommitdiff
path: root/kmer_utils.c
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2013-10-16 12:43:11 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2013-10-16 12:43:11 -0400
commite0a1e2605920dcfbc500b1cff4b7282210b49744 (patch)
treeb47fdf0ffd72592e24ae60728d72c8cf10e961ca /kmer_utils.c
parent7a5e77d675f78f86ab4486f5240b36ff0f25a066 (diff)
added new functions
Diffstat (limited to 'kmer_utils.c')
-rw-r--r--kmer_utils.c131
1 files changed, 130 insertions, 1 deletions
diff --git a/kmer_utils.c b/kmer_utils.c
index a1ff9b2..111e550 100644
--- a/kmer_utils.c
+++ b/kmer_utils.c
@@ -1,5 +1,27 @@
+// Copyright 2013 Calvin Morrison
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
#include "kmer_total_count.h"
+const unsigned char alpha[256] =
+{5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
+
+inline unsigned long long pow_four(unsigned long long x) {
+ return (unsigned long long)1 << (x * 2);
+}
+
// convert a string of k-mer size base-4 values into a
// base-10 index
inline unsigned long num_to_index(const char *str, const int kmer, const long error_pos) {
@@ -46,10 +68,117 @@ void convert_kmer_to_num(char *str, const unsigned long length) {
default:
// Error Checking: use 4 so we can shift right twice
// to check quickly is there is an non ACGT character
- str[i] = 4;
+ str[i] = 5;
}
}
}
+char *strnstrip(const char *s, char *dest, int c, unsigned long long len) {
+
+ unsigned long long i = 0;
+ unsigned long long j = 0;
+
+ for(i = 0; i < len; i++) {
+ if(s[i] != c) {
+ dest[j] = s[i];
+ j++;
+ }
+ }
+
+ dest[j] = '\0';
+
+ return dest;
+}
+
+unsigned long long * get_kmer_counts_from_file(const char *fn, const unsigned int kmer) {
+
+ char *line = NULL;
+ size_t len = 0;
+ ssize_t read;
+
+ long long i = 0;
+ long long posistion = 0;
+
+ FILE * const fh = fopen(fn, "r");
+ if(fh == NULL) {
+ fprintf(stderr, "Error opening %s - %s\n", fn, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ // width is 4^kmer
+ // there's a sneaky bitshift to avoid pow dependency
+ const unsigned long width = pow_four(kmer);
+
+ // malloc our return array
+ unsigned long long * counts = malloc((width+ 1) * sizeof(unsigned long long));
+ if(counts == NULL) {
+ fprintf(stderr, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ char *str = malloc(4096);
+ if(str == NULL) {
+ fprintf(stderr, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ unsigned long long str_size = 4096;
+
+ while ((read = getdelim(&line, &len, '>', fh)) != -1) {
+
+ // find our first \n, this should be the end of the header
+ char *start = strchr(line, '\n');
+ if(start == NULL)
+ continue;
+
+ size_t start_len = strlen(start);
+
+
+ // if our current str buffer isn't big enough, realloc
+ if(start_len + 1 > str_size + 1) {
+ str = realloc(str, start_len + 1);
+ if(str == NULL) {
+ exit(EXIT_FAILURE);
+ fprintf(stderr, strerror(errno));
+ }
+ }
+
+ // strip out all other newlines to handle multiline sequences
+ str = strnstrip(start, str, '\n',start_len);
+ size_t seq_length = strlen(str);
+
+ // relace A, C, G and T with 0, 1, 2, 3 respectively
+ // everything else is 5
+ for(i = 0; i < seq_length; i++) {
+ str[i] = alpha[(int)str[i]];
+ }
+
+ // loop through our string to process each k-mer
+ for(posistion = 0; posistion < (seq_length - kmer + 1); posistion++) {
+ unsigned long mer = 0;
+ unsigned long multiply = 1;
+
+ // for each char in the k-mer check if it is an error char
+ for(i = posistion + kmer - 1; i >= posistion; i--){
+ if(str[i] >> 2) {
+ mer = width;
+ posistion = i;
+ goto next;
+ }
+
+ // multiply this char in the mer by the multiply
+ // and bitshift the multiply for the next round
+ mer += str[i] * multiply;
+ multiply = multiply << 2;
+ }
+ // use this point to get mer of our loop
+ next:
+ // bump up the mer value in the counts array
+ counts[mer]++;
+ }
+ }
+
+ return counts;
+}