aboutsummaryrefslogtreecommitdiff
path: root/kmer_utils.c
diff options
context:
space:
mode:
Diffstat (limited to 'kmer_utils.c')
-rw-r--r--kmer_utils.c151
1 files changed, 79 insertions, 72 deletions
diff --git a/kmer_utils.c b/kmer_utils.c
index b8d5691..7703f0a 100644
--- a/kmer_utils.c
+++ b/kmer_utils.c
@@ -3,9 +3,10 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <stdbool.h>
-const unsigned char alpha[256] =
+const unsigned char alpha[256] =
{5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5,
@@ -19,6 +20,12 @@ const unsigned char alpha[256] =
const char reverse_alpha[4] = { 'A', 'C', 'G', 'T' };
+ // compliments
+ // A C G T E E
+ // T G C A E E
+const char compliment[6] = { 3, 2, 1, 0, 5, 5};
+
+
unsigned long long pow_four(unsigned long long x) {
return (unsigned long long)1 << (x * 2);
}
@@ -27,14 +34,53 @@ void check_null_ptr(void *ptr, const char *error) {
if (ptr == NULL) {
if(error != NULL) {
fprintf(stderr, "Error: %s - %s\n", error, strerror(errno));
- }
+ }
else {
fprintf(stderr, "Error: %s\n", strerror(errno));
- }
+ }
exit(EXIT_FAILURE);
}
}
+
+void reverse_string(char *s, size_t len) {
+ char t, *d = &(s[len - 1]);
+ while (d > s) {
+ t = *s;
+ *s++ = *d;
+ *d-- = t;
+ }
+}
+
+void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, unsigned long long *counts) {
+ long long position;
+ long long i;
+
+ // loop through our seq to process each k-mer
+ for(position = 0; position < (signed)(seq_length - kmer + 1); position++) {
+ unsigned long long mer = 0;
+ unsigned long long multiply = 1;
+
+ // for each char in the k-mer check if it is an error char
+ for(i = position + kmer - 1; i >= position; i--){
+ if(seq[i] == 5) {
+ position = i;
+ goto next;
+ }
+
+ // multiply this char in the mer by the multiply
+ // and bitshift the multiply for the next round
+ mer += seq[i] * multiply;
+ multiply = multiply << 2;
+ }
+ // bump up the mer value in the counts array
+ counts[mer]++;
+
+ // skip count if error
+ next: ;
+ }
+}
+
// convert a string of k-mer size base-4 values into a
// base-10 index
unsigned long num_to_index(const char *str, const int kmer, const long error_pos, long long *current_position) {
@@ -44,7 +90,7 @@ unsigned long num_to_index(const char *str, const int kmer, const long error_pos
unsigned long multiply = 1;
for(i = kmer - 1; i >= 0; i--){
- if(str[i] == 5) {
+ if(str[i] == 5) {
current_position += i;
return error_pos;
}
@@ -57,7 +103,7 @@ unsigned long num_to_index(const char *str, const int kmer, const long error_pos
}
// return the number of loaded elements
-unsigned long long load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) {
+unsigned long long load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) {
FILE *fh;
size_t arr_pos = 0;
char line[64];
@@ -130,11 +176,11 @@ char *index_to_kmer(unsigned long long index, long kmer) {
size_t start = i ;
// decrement i by 1 to reverse the last i++
- i--;
+ i--;
j = 0;
// reverse the array, as j increases, decrease i
- for(j = 0; j < start; j++, i--)
+ for(j = 0; j < start; j++, i--)
ret[j + offset] = reverse_alpha[(int)num_array[i]];
// set our last character to the null termination byte
@@ -165,7 +211,7 @@ size_t strnstrip(char *s, int c, size_t len) {
}
-unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer) {
+unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment) {
char *line = NULL;
size_t len = 0;
@@ -184,9 +230,6 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer
char *seq;
size_t seq_length;
- long long i = 0;
- long long position = 0;
-
// find our first \n, this should be the end of the header
seq = strchr(line, '\n');
if(seq == NULL)
@@ -204,31 +247,18 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer
for(k = 0; k < seq_length; k++)
seq[k] = alpha[(int)seq[k]];
+ count_sequence(seq, seq_length, kmer, counts);
- // loop through our seq to process each k-mer
- for(position = 0; position < (signed)(seq_length - kmer + 1); position++) {
- unsigned long long mer = 0;
- unsigned long long multiply = 1;
-
- // for each char in the k-mer check if it is an error char
- for(i = position + kmer - 1; i >= position; i--){
- if(seq[i] == 5) {
- mer = width;
- position = i;
- goto next;
- }
-
- // multiply this char in the mer by the multiply
- // and bitshift the multiply for the next round
- mer += seq[i] * multiply;
- multiply = multiply << 2;
+ if(count_compliment) {
+ for(k = 0; k < seq_length; k++) {
+ seq[k] = compliment[(int)seq[k]];
}
- // use this point to get mer of our loop
- next:
- // bump up the mer value in the counts array
- counts[mer]++;
+
+ reverse_string(seq, seq_length);
+ count_sequence(seq, seq_length, kmer, counts);
+
}
- }
+ }
free(line);
@@ -237,15 +267,14 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer
return counts;
}
-unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer) {
+unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment) {
FILE *fh = fopen(fn, "r");
check_null_ptr(fh, fn);
- return get_kmer_counts_from_file(fh, kmer);
+ return get_kmer_counts_from_file(fh, kmer, count_compliment);
}
-
-unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer) {
+unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment) {
char *line = NULL;
size_t len = 0;
@@ -262,13 +291,9 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign
size_t cpy_size = kmer - 1;
char *end_of_previous = malloc(sizeof(char) * cpy_size);
+ check_null_ptr(end_of_previous, NULL);
memset(end_of_previous, 5, cpy_size);
- if(end_of_previous == NULL) {
- fprintf(stderr, "%s\n", strerror(errno));
- exit(EXIT_FAILURE);
- }
-
while ((read = getline(&line, &len, fh)) != -1) {
if(line[0] == '>')
continue;
@@ -276,7 +301,6 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign
char *seq;
size_t j;
size_t k;
- long long position;
size_t seq_length;
seq = malloc(read + cpy_size);
@@ -290,40 +314,23 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign
// everything else is 5
for(k = cpy_size; k < seq_length; k++)
seq[k] = alpha[(int)seq[k]];
-
- // loop through our seq to process each k-mer
- for(position = 0; position < (signed)(seq_length - kmer + 1); position++) {
- unsigned long long mer = 0;
- unsigned long long multiply = 1;
- long long i;
-
- // for each char in the k-mer check if it is an error char
- for(i = position + kmer - 1; i >= position; i--){
- if(seq[i] == 5) {
- mer = width;
- position = i;
- goto next;
- }
-
- // multiply this char in the mer by the multiply
- // and bitshift the multiply for the next round
- mer += seq[i] * multiply;
- multiply = multiply << 2;
- }
- // use this point to get mer of our loop
- next:
- // bump up the mer value in the counts array
- counts[mer]++;
- }
+ count_sequence(seq, seq_length, kmer, counts);
for(j = 0, k = seq_length - (cpy_size + 1); k < seq_length; k++, j++) {
end_of_previous[j] = seq[k];
}
- free(seq);
+ if(count_compliment) {
+ for(k = 0; k < seq_length; k++)
+ seq[k] = compliment[(int)seq[k]];
- }
+ reverse_string(seq, seq_length);
+ count_sequence(seq, seq_length, kmer, counts);
+ }
+
+ free(seq);
+ }
free(end_of_previous);
free(line);
@@ -332,9 +339,9 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign
return counts;
}
-unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer) {
+unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment) {
FILE *fh = fopen(fn, "r");
check_null_ptr(fh, fn);
- return get_continuous_kmer_counts_from_file(fh, kmer);
+ return get_continuous_kmer_counts_from_file(fh, kmer, count_compliment);
}