about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r--  kmer_counts_per_sequence.c | 37
-rw-r--r--  kmer_utils.c               | 57
-rw-r--r--  kmer_utils.h               |  2
3 files changed, 28 insertions, 68 deletions
diff --git a/kmer_counts_per_sequence.c b/kmer_counts_per_sequence.c
index d4c0a6b..2b95ed1 100644
--- a/kmer_counts_per_sequence.c
+++ b/kmer_counts_per_sequence.c
@@ -30,8 +30,10 @@ void help() {
"software available at www.github.com/EESI/dna-utils/\n");
}
+
int main(int argc, char **argv) {
+
// getdelim variables
char *fn = NULL;
FILE *fh = NULL;
@@ -143,13 +145,6 @@ int main(int argc, char **argv) {
if(counts == NULL)
exit(EXIT_FAILURE);
- char *str = malloc(BUFSIZ);
- if(str == NULL) {
- fprintf(stderr, strerror(errno));
- exit(EXIT_FAILURE);
- }
-
- size_t str_size = BUFSIZ;
unsigned long long sequence = 0;
while ((read = getdelim(&line, &len, '>', fh)) != -1) {
long long i = 0;
@@ -158,38 +153,25 @@ int main(int argc, char **argv) {
memset(counts, 0, width * sizeof(unsigned long long));
// find our first \n, this should be the end of the header
- char *start = strchr(line, '\n');
- if(start == NULL)
+ char *seq = strchr(line, '\n');
+ if(seq == NULL)
continue;
- // point to one past that.
- start = start + 1;
-
- size_t start_len = strlen(start);
-
-
- // if our current str buffer isn't big enough, realloc
- if(start_len + 1 > str_size + 1) {
- str = realloc(str, start_len + 1);
- if(str == NULL) {
- exit(EXIT_FAILURE);
- fprintf(stderr, strerror(errno));
- }
- }
+ // point to one past that.
+ seq = seq + 1;
// strip out all other newlines to handle multiline sequences
- str = strnstrip(start, str, '\n',start_len);
- size_t seq_length = strlen(str);
+ size_t seq_length = strnstrip(seq, '\n', strlen(seq));
// reset our count matrix to zero
for(k = 0; k < seq_length; k++) {
- str[k] = alpha[(int)str[k]];
+ seq[k] = alpha[(int)seq[k]];
}
for(i = 0; i < (signed long long)(seq_length - kmer + 1); i++) {
- size_t mer = num_to_index(&str[i],kmer, width, &i);
+ size_t mer = num_to_index(&seq[i],kmer, width, &i);
counts[mer]++;
}
@@ -218,7 +200,6 @@ int main(int argc, char **argv) {
free(counts);
free(line);
free(desired_indicies);
-free(str);
fclose(fh);
diff --git a/kmer_utils.c b/kmer_utils.c
index 80b82a0..295628a 100644
--- a/kmer_utils.c
+++ b/kmer_utils.c
@@ -135,23 +135,24 @@ char *index_to_kmer(unsigned long long index, long kmer) {
return ret;
}
-// Strip out any character 'c' from char array 's' into a destination dest (you
-// need to allocate that) and copy only len characters.
-char *strnstrip(const char *s, char *dest, int c, unsigned long long len) {
+// Strip out any character 'c' from char array 's'
+size_t strnstrip(char *s, int c, size_t len) {
size_t i = 0;
size_t j = 0;
for(i = 0; i < len; i++) {
if(s[i] != c) {
- dest[j] = s[i];
+ if(j != i)
+ s[j] = s[i];
j++;
}
}
- dest[j] = '\0';
+ s[j] = '\0';
+
+ return j;
- return dest;
}
unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer) {
@@ -174,58 +175,37 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer
exit(EXIT_FAILURE);
}
- char *str = malloc(4096);
- if(str == NULL) {
- fprintf(stderr, strerror(errno));
- exit(EXIT_FAILURE);
- }
-
- unsigned long long str_size = 4096;
-
while ((read = getdelim(&line, &len, '>', fh)) != -1) {
size_t k;
- char *start;
- size_t start_len;
+ char *seq;
+ size_t seq_length;
// find our first \n, this should be the end of the header
- start = strchr(line, '\n');
- if(start == NULL)
+ seq = strchr(line, '\n');
+ if(seq == NULL)
continue;
// point to one past that.
- start = start + 1;
-
- start_len = strlen(start);
-
-
- // if our current str buffer isn't big enough, realloc
- if(start_len + 1 > str_size + 1) {
- str = realloc(str, start_len + 1);
- if(str == NULL) {
- exit(EXIT_FAILURE);
- fprintf(stderr, strerror(errno));
- }
- }
-
+ seq = seq + 1;
// strip out all other newlines to handle multiline sequences
- str = strnstrip(start, str, '\n',start_len);
- size_t seq_length = strlen(str);
+ strnstrip(seq, '\n', strlen(seq));
+ seq_length = strlen(seq);
// relace A, C, G and T with 0, 1, 2, 3 respectively
// everything else is 5
for(k = 0; k < seq_length; k++)
- str[k] = alpha[(int)str[k]];
+ seq[k] = alpha[(int)seq[k]];
- // loop through our string to process each k-mer
+ // loop through our seq to process each k-mer
for(position = 0; position < (signed)(seq_length - kmer + 1); position++) {
unsigned long long mer = 0;
unsigned long long multiply = 1;
// for each char in the k-mer check if it is an error char
for(i = position + kmer - 1; i >= position; i--){
- if(str[i] == 5) {
+ if(seq[i] == 5) {
mer = width;
position = i;
goto next;
@@ -233,7 +213,7 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer
// multiply this char in the mer by the multiply
// and bitshift the multiply for the next round
- mer += str[i] * multiply;
+ mer += seq[i] * multiply;
multiply = multiply << 2;
}
// use this point to get mer of our loop
@@ -244,7 +224,6 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer
}
free(line);
- free(str);
fclose(fh);
return counts;
diff --git a/kmer_utils.h b/kmer_utils.h
index 3caa8de..d71a607 100644
--- a/kmer_utils.h
+++ b/kmer_utils.h
@@ -4,7 +4,7 @@ unsigned long num_to_index(const char *str, const int kmer, const long error_pos
char *index_to_kmer(unsigned long long index, long kmer);
// Utility functions
-char *strnstrip(const char *s, char *dest, int c, int len);
+size_t strnstrip(char *s, int c, size_t len);
unsigned long long pow_four(unsigned long long x);
// Variables