1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
|
// Copyright 2013 Calvin Morrison
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "quikr.h"
const unsigned char alpha[256] =
{5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
// convert a string of k-mer size base-4 values into a
// base-10 index
unsigned long num_to_index(const char *str, const int kmer, const long error_pos) {
int i = 0;
unsigned long out = 0;
unsigned long multiply = 1;
for(i = kmer - 1; i >= 0; i--){
if(str[i] == 5) {
// position += i;
return error_pos;
}
out += str[i] * multiply;
multiply = multiply << 2;
}
return out;
}
// Strip out any character 'c' from char array 's' into a destination dest (you
// need to allocate that) and copy only len characters.
char *strnstrip(const char *s, char *dest, int c, unsigned long long len) {
unsigned long long i = 0;
unsigned long long j = 0;
for(i = 0; i < len; i++) {
if(s[i] != c) {
dest[j] = s[i];
j++;
}
}
dest[j] = '\0';
return dest;
}
unsigned long long * get_kmer_counts_from_file(const char *fn, const unsigned int kmer) {
char *line = NULL;
size_t len = 0;
ssize_t read;
long long i = 0;
long long position = 0;
FILE * const fh = fopen(fn, "r");
if(fh == NULL) {
fprintf(stderr, "Error opening %s - %s\n", fn, strerror(errno));
exit(EXIT_FAILURE);
}
// width is 4^kmer
// there's a sneaky bitshift to avoid pow dependency
const unsigned long width = pow_four(kmer);
// malloc our return array
unsigned long long * counts = malloc((width+ 1) * sizeof(unsigned long long));
memset(counts, 0, width * sizeof(unsigned long long));
if(counts == NULL) {
fprintf(stderr, strerror(errno));
exit(EXIT_FAILURE);
}
char *str = malloc(4096);
if(str == NULL) {
fprintf(stderr, strerror(errno));
exit(EXIT_FAILURE);
}
unsigned long long str_size = 4096;
while ((read = getdelim(&line, &len, '>', fh)) != -1) {
// find our first \n, this should be the end of the header
char *start = strchr(line, '\n');
if(start == NULL)
continue;
start = start + 1;
size_t start_len = strlen(start);
// if our current str buffer isn't big enough, realloc
if(start_len + 1 > str_size + 1) {
str = realloc(str, start_len + 1);
if(str == NULL) {
exit(EXIT_FAILURE);
fprintf(stderr, strerror(errno));
}
}
// strip out all other newlines to handle multiline sequences
str = strnstrip(start, str, '\n',start_len);
ssize_t seq_length = strlen(str);
if(seq_length < kmer)
continue;
// relace A, C, G and T with 0, 1, 2, 3 respectively
// everything else is 5
for(i = 0; i < seq_length; i++) {
str[i] = alpha[(int)str[i]];
}
// loop through our string to process each k-mer
for(position = 0; position < (seq_length - kmer + 1); position++) {
unsigned long mer = 0;
unsigned long multiply = 1;
// for each char in the k-mer check if it is an error char
for(i = position + kmer - 1; i >= position; i--){
if(str[i] == 5) {
mer = width;
position = i;
goto next;
}
// if it's a newline, we should skip it
// multiply this char in the mer by the multiply
// and bitshift the multiply for the next round
mer += str[i] * multiply;
multiply = multiply << 2;
}
// use this point to get mer of our loop
next:
// bump up the mer value in the counts array
counts[mer]++;
}
}
free(line);
free(str);
return counts;
}
|