diff options
-rw-r--r-- | kmer_counts_per_sequence.c | 37 | ||||
-rw-r--r-- | kmer_utils.c | 57 | ||||
-rw-r--r-- | kmer_utils.h | 2 |
3 files changed, 28 insertions, 68 deletions
diff --git a/kmer_counts_per_sequence.c b/kmer_counts_per_sequence.c index d4c0a6b..2b95ed1 100644 --- a/kmer_counts_per_sequence.c +++ b/kmer_counts_per_sequence.c @@ -30,8 +30,10 @@ void help() { "software available at www.github.com/EESI/dna-utils/\n"); } + int main(int argc, char **argv) { + // getdelim variables char *fn = NULL; FILE *fh = NULL; @@ -143,13 +145,6 @@ int main(int argc, char **argv) { if(counts == NULL) exit(EXIT_FAILURE); - char *str = malloc(BUFSIZ); - if(str == NULL) { - fprintf(stderr, strerror(errno)); - exit(EXIT_FAILURE); - } - - size_t str_size = BUFSIZ; unsigned long long sequence = 0; while ((read = getdelim(&line, &len, '>', fh)) != -1) { long long i = 0; @@ -158,38 +153,25 @@ int main(int argc, char **argv) { memset(counts, 0, width * sizeof(unsigned long long)); // find our first \n, this should be the end of the header - char *start = strchr(line, '\n'); - if(start == NULL) + char *seq = strchr(line, '\n'); + if(seq == NULL) continue; - // point to one past that. - start = start + 1; - - size_t start_len = strlen(start); - - - // if our current str buffer isn't big enough, realloc - if(start_len + 1 > str_size + 1) { - str = realloc(str, start_len + 1); - if(str == NULL) { - exit(EXIT_FAILURE); - fprintf(stderr, strerror(errno)); - } - } + // point to one past that. + seq = seq + 1; // strip out all other newlines to handle multiline sequences - str = strnstrip(start, str, '\n',start_len); - size_t seq_length = strlen(str); + size_t seq_length = strnstrip(seq, '\n', strlen(seq)); // reset our count matrix to zero for(k = 0; k < seq_length; k++) { - str[k] = alpha[(int)str[k]]; + seq[k] = alpha[(int)seq[k]]; } for(i = 0; i < (signed long long)(seq_length - kmer + 1); i++) { - size_t mer = num_to_index(&str[i],kmer, width, &i); + size_t mer = num_to_index(&seq[i],kmer, width, &i); counts[mer]++; } @@ -218,7 +200,6 @@ int main(int argc, char **argv) { free(counts); free(line); free(desired_indicies); -free(str); fclose(fh); diff --git a/kmer_utils.c b/kmer_utils.c index 80b82a0..295628a 100644 --- a/kmer_utils.c +++ b/kmer_utils.c @@ -135,23 +135,24 @@ char *index_to_kmer(unsigned long long index, long kmer) { return ret; } -// Strip out any character 'c' from char array 's' into a destination dest (you -// need to allocate that) and copy only len characters. -char *strnstrip(const char *s, char *dest, int c, unsigned long long len) { +// Strip out any character 'c' from char array 's' +size_t strnstrip(char *s, int c, size_t len) { size_t i = 0; size_t j = 0; for(i = 0; i < len; i++) { if(s[i] != c) { - dest[j] = s[i]; + if(j != i) + s[j] = s[i]; j++; } } - dest[j] = '\0'; + s[j] = '\0'; + + return j; - return dest; } unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer) { @@ -174,58 +175,37 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer exit(EXIT_FAILURE); } - char *str = malloc(4096); - if(str == NULL) { - fprintf(stderr, strerror(errno)); - exit(EXIT_FAILURE); - } - - unsigned long long str_size = 4096; - while ((read = getdelim(&line, &len, '>', fh)) != -1) { size_t k; - char *start; - size_t start_len; + char *seq; + size_t seq_length; // find our first \n, this should be the end of the header - start = strchr(line, '\n'); - if(start == NULL) + seq = strchr(line, '\n'); + if(seq == NULL) continue; // point to one past that. - start = start + 1; - - start_len = strlen(start); - - - // if our current str buffer isn't big enough, realloc - if(start_len + 1 > str_size + 1) { - str = realloc(str, start_len + 1); - if(str == NULL) { - exit(EXIT_FAILURE); - fprintf(stderr, strerror(errno)); - } - } - + seq = seq + 1; // strip out all other newlines to handle multiline sequences - str = strnstrip(start, str, '\n',start_len); - size_t seq_length = strlen(str); + strnstrip(seq, '\n', strlen(seq)); + seq_length = strlen(seq); // relace A, C, G and T with 0, 1, 2, 3 respectively // everything else is 5 for(k = 0; k < seq_length; k++) - str[k] = alpha[(int)str[k]]; + seq[k] = alpha[(int)seq[k]]; - // loop through our string to process each k-mer + // loop through our seq to process each k-mer for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { unsigned long long mer = 0; unsigned long long multiply = 1; // for each char in the k-mer check if it is an error char for(i = position + kmer - 1; i >= position; i--){ - if(str[i] == 5) { + if(seq[i] == 5) { mer = width; position = i; goto next; @@ -233,7 +213,7 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer // multiply this char in the mer by the multiply // and bitshift the multiply for the next round - mer += str[i] * multiply; + mer += seq[i] * multiply; multiply = multiply << 2; } // use this point to get mer of our loop @@ -244,7 +224,6 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer } free(line); - free(str); fclose(fh); return counts; diff --git a/kmer_utils.h b/kmer_utils.h index 3caa8de..d71a607 100644 --- a/kmer_utils.h +++ b/kmer_utils.h @@ -4,7 +4,7 @@ unsigned long num_to_index(const char *str, const int kmer, const long error_pos char *index_to_kmer(unsigned long long index, long kmer); // Utility functions -char *strnstrip(const char *s, char *dest, int c, int len); +size_t strnstrip(char *s, int c, size_t len); unsigned long long pow_four(unsigned long long x); // Variables |