aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--kmer_continuous_count.c20
-rw-r--r--kmer_counts_per_sequence.c36
-rw-r--r--kmer_total_count.c20
-rw-r--r--kmer_utils.c151
-rw-r--r--kmer_utils.h35
5 files changed, 157 insertions, 105 deletions
diff --git a/kmer_continuous_count.c b/kmer_continuous_count.c
index a7c35f0..7eb617d 100644
--- a/kmer_continuous_count.c
+++ b/kmer_continuous_count.c
@@ -11,13 +11,14 @@
void help() {
- printf("usage: kmer_continuous_count -i input_file -k kmer [-n] [-l] ...\n\n"
+ printf("usage: kmer_continuous_count -i input_file -k kmer [-c] [-n] [-l] ...\n\n"
"count mers in size k from a fasta file, but do so continuously\n"
"\n"
- " --input -i input fasta file to count\n"
- " --kmer -k size of mers to count\n"
- " --nonzero -n only print non-zero values\n"
- " --label -l print mer along with value\n"
+ " --input -i input fasta file to count\n"
+ " --kmer -k size of mers to count\n"
+ " --compliment -c count compliment of sequences\n"
+ " --nonzero -n only print non-zero values\n"
+ " --label -l print mer along with value\n"
"\n"
"Report all bugs to mutantturkey@gmail.com\n"
"\n"
@@ -40,6 +41,7 @@ int main(int argc, char **argv) {
bool nonzero = false;
bool label = false;
bool kmer_set = false;
+ bool count_compliment = false;
unsigned long long width = 0;
@@ -48,6 +50,7 @@ int main(int argc, char **argv) {
static struct option long_options[] = {
{"input", required_argument, 0, 'i'},
{"kmer", required_argument, 0, 'k'},
+ {"compliment", required_argument, 0, 'c'},
{"nonzero", no_argument, 0, 'n'},
{"label", no_argument, 0, 'l'},
{"help", no_argument, 0, 'h'},
@@ -59,7 +62,7 @@ int main(int argc, char **argv) {
int option_index = 0;
int c = 0;
- c = getopt_long (argc, argv, "i:k:nlvh", long_options, &option_index);
+ c = getopt_long (argc, argv, "i:k:cnlvh", long_options, &option_index);
if (c == -1)
break;
@@ -72,6 +75,9 @@ int main(int argc, char **argv) {
kmer = atoi(optarg);
kmer_set = true;
break;
+ case 'c':
+ count_compliment = true;
+ break;
case 'n':
nonzero = true;
break;
@@ -116,7 +122,7 @@ int main(int argc, char **argv) {
width = pow_four(kmer);
- unsigned long long *counts = get_continuous_kmer_counts_from_file(fh, kmer);
+ unsigned long long *counts = get_continuous_kmer_counts_from_file(fh, kmer, count_compliment);
// If nonzero is set, only print non zeros
if(nonzero) {
diff --git a/kmer_counts_per_sequence.c b/kmer_counts_per_sequence.c
index 7e0e119..21aca5a 100644
--- a/kmer_counts_per_sequence.c
+++ b/kmer_counts_per_sequence.c
@@ -12,13 +12,14 @@ void help() {
printf("usage: kmer_counts_per_sequence input_file kmer [kmer-file] ...\n\n"
"count mers in each sequence of size k from a fasta file\n"
"\n"
- " --input -i input fasta file to count\n"
- " --kmer -k size of mers to count\n"
- " --mer-file -m a file containing a list of mers you are interested\n"
- " in opening. this will enable output your results in\n"
- " a sparse format \n"
- " --sparse -s output values in a sparse format. output is in the\n"
- " order sequence_number, mer_index, value\n"
+ " --input -i input fasta file to count\n"
+ " --kmer -k size of mers to count\n"
+ " --compliment -c count compliment of sequences\n"
+ " --mer-file -m a file containing a list of mers you are interested\n"
+ " in opening. this will enable output your results in\n"
+ " a sparse format \n"
+ " --sparse -s output values in a sparse format. output is in the\n"
+ " order sequence_number, mer_index, value\n"
"\n"
"Report all bugs to mutantturkey@gmail.com\n"
"\n"
@@ -55,10 +56,12 @@ int main(int argc, char **argv) {
bool sparse = false;
bool kmer_set = false;
bool specific_mers = false;
+ bool count_compliment = false;
static struct option long_options[] = {
{"input", required_argument, 0, 'i'},
{"kmer", required_argument, 0, 'k'},
+ {"compliment", required_argument, 0, 'c'},
{"sparse", no_argument, 0, 's'},
{"mer-file", required_argument, 0, 'm'},
{"help", no_argument, 0, 'h'},
@@ -70,7 +73,7 @@ int main(int argc, char **argv) {
int option_index = 0;
int c = 0;
- c = getopt_long (argc, argv, "i:k:m:vsh", long_options, &option_index);
+ c = getopt_long (argc, argv, "i:k:m:cvsh", long_options, &option_index);
if (c == -1)
break;
@@ -83,6 +86,8 @@ int main(int argc, char **argv) {
kmer = atoi(optarg);
kmer_set = true;
break;
+ case 'c':
+ count_compliment = true;
case 's':
sparse = true;
break;
@@ -147,7 +152,6 @@ int main(int argc, char **argv) {
unsigned long long sequence = 0;
while ((read = getdelim(&line, &len, '>', fh)) != -1) {
- long long i = 0;
size_t k = 0;
memset(counts, 0, width * sizeof(unsigned long long));
@@ -170,11 +174,17 @@ int main(int argc, char **argv) {
seq[k] = alpha[(int)seq[k]];
}
- for(i = 0; i < (signed long long)(seq_length - kmer + 1); i++) {
- size_t mer = num_to_index(&seq[i],kmer, width, &i);
- counts[mer]++;
+ count_sequence(seq, seq_length, kmer, counts);
+
+ if(count_compliment) {
+ for(k = 0; k < seq_length; k++) {
+ seq[k] = compliment[(int)seq[k]];
+ }
+
+ reverse_string(seq, seq_length);
+ count_sequence(seq, seq_length, kmer, counts);
+
}
-
if(specific_mers) {
for(k = 0; k < num_desired_indicies; k++) {
diff --git a/kmer_total_count.c b/kmer_total_count.c
index 6b627f2..dd29a53 100644
--- a/kmer_total_count.c
+++ b/kmer_total_count.c
@@ -10,13 +10,14 @@
void help() {
- printf("usage: kmer_total_count -i input_file -k kmer [-n] [-l] ...\n\n"
+ printf("usage: kmer_total_count -i input_file -k kmer [-c] [-n] [-l] ...\n\n"
"count mers in size k from a fasta file\n"
"\n"
- " --input -i input fasta file to count\n"
- " --kmer -k size of mers to count\n"
- " --nonzero -n only print non-zero values\n"
- " --label -l print mer along with value\n"
+ " --input -i input fasta file to count\n"
+ " --kmer -k size of mers to count\n"
+ " --compliment -c count compliment of sequences\n"
+ " --nonzero -n only print non-zero values\n"
+ " --label -l print mer along with value\n"
"\n"
"Report all bugs to mutantturkey@gmail.com\n"
"\n"
@@ -39,6 +40,7 @@ int main(int argc, char **argv) {
bool nonzero = false;
bool label = false;
bool kmer_set = false;
+ bool count_compliment = false;
unsigned long long width = 0;
@@ -47,6 +49,7 @@ int main(int argc, char **argv) {
static struct option long_options[] = {
{"input", required_argument, 0, 'i'},
{"kmer", required_argument, 0, 'k'},
+ {"compliment", required_argument, 0, 'c'},
{"nonzero", no_argument, 0, 'n'},
{"label", no_argument, 0, 'l'},
{"help", no_argument, 0, 'h'},
@@ -58,7 +61,7 @@ int main(int argc, char **argv) {
int option_index = 0;
int c = 0;
- c = getopt_long (argc, argv, "i:k:nlvh", long_options, &option_index);
+ c = getopt_long (argc, argv, "i:k:cnlvh", long_options, &option_index);
if (c == -1)
break;
@@ -71,6 +74,9 @@ int main(int argc, char **argv) {
kmer = atoi(optarg);
kmer_set = true;
break;
+ case 'c':
+ count_compliment = true;
+ break;
case 'n':
nonzero = true;
break;
@@ -115,7 +121,7 @@ int main(int argc, char **argv) {
width = pow_four(kmer);
- unsigned long long *counts = get_kmer_counts_from_file(fh, kmer);
+ unsigned long long *counts = get_kmer_counts_from_file(fh, kmer, count_compliment);
// If nonzero is set, only print non zeros
if(nonzero) {
diff --git a/kmer_utils.c b/kmer_utils.c
index b8d5691..7703f0a 100644
--- a/kmer_utils.c
+++ b/kmer_utils.c
@@ -3,9 +3,10 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <stdbool.h>
-const unsigned char alpha[256] =
+const unsigned char alpha[256] =
{5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5,
@@ -19,6 +20,12 @@ const unsigned char alpha[256] =
const char reverse_alpha[4] = { 'A', 'C', 'G', 'T' };
+ // compliments
+ // A C G T E E
+ // T G C A E E
+const char compliment[6] = { 3, 2, 1, 0, 5, 5};
+
+
unsigned long long pow_four(unsigned long long x) {
return (unsigned long long)1 << (x * 2);
}
@@ -27,14 +34,53 @@ void check_null_ptr(void *ptr, const char *error) {
if (ptr == NULL) {
if(error != NULL) {
fprintf(stderr, "Error: %s - %s\n", error, strerror(errno));
- }
+ }
else {
fprintf(stderr, "Error: %s\n", strerror(errno));
- }
+ }
exit(EXIT_FAILURE);
}
}
+
+void reverse_string(char *s, size_t len) {
+ char t, *d = &(s[len - 1]);
+ while (d > s) {
+ t = *s;
+ *s++ = *d;
+ *d-- = t;
+ }
+}
+
+void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, unsigned long long *counts) {
+ long long position;
+ long long i;
+
+ // loop through our seq to process each k-mer
+ for(position = 0; position < (signed)(seq_length - kmer + 1); position++) {
+ unsigned long long mer = 0;
+ unsigned long long multiply = 1;
+
+ // for each char in the k-mer check if it is an error char
+ for(i = position + kmer - 1; i >= position; i--){
+ if(seq[i] == 5) {
+ position = i;
+ goto next;
+ }
+
+ // multiply this char in the mer by the multiply
+ // and bitshift the multiply for the next round
+ mer += seq[i] * multiply;
+ multiply = multiply << 2;
+ }
+ // bump up the mer value in the counts array
+ counts[mer]++;
+
+ // skip count if error
+ next: ;
+ }
+}
+
// convert a string of k-mer size base-4 values into a
// base-10 index
unsigned long num_to_index(const char *str, const int kmer, const long error_pos, long long *current_position) {
@@ -44,7 +90,7 @@ unsigned long num_to_index(const char *str, const int kmer, const long error_pos
unsigned long multiply = 1;
for(i = kmer - 1; i >= 0; i--){
- if(str[i] == 5) {
+ if(str[i] == 5) {
current_position += i;
return error_pos;
}
@@ -57,7 +103,7 @@ unsigned long num_to_index(const char *str, const int kmer, const long error_pos
}
// return the number of loaded elements
-unsigned long long load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) {
+unsigned long long load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) {
FILE *fh;
size_t arr_pos = 0;
char line[64];
@@ -130,11 +176,11 @@ char *index_to_kmer(unsigned long long index, long kmer) {
size_t start = i ;
// decrement i by 1 to reverse the last i++
- i--;
+ i--;
j = 0;
// reverse the array, as j increases, decrease i
- for(j = 0; j < start; j++, i--)
+ for(j = 0; j < start; j++, i--)
ret[j + offset] = reverse_alpha[(int)num_array[i]];
// set our last character to the null termination byte
@@ -165,7 +211,7 @@ size_t strnstrip(char *s, int c, size_t len) {
}
-unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer) {
+unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment) {
char *line = NULL;
size_t len = 0;
@@ -184,9 +230,6 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer
char *seq;
size_t seq_length;
- long long i = 0;
- long long position = 0;
-
// find our first \n, this should be the end of the header
seq = strchr(line, '\n');
if(seq == NULL)
@@ -204,31 +247,18 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer
for(k = 0; k < seq_length; k++)
seq[k] = alpha[(int)seq[k]];
+ count_sequence(seq, seq_length, kmer, counts);
- // loop through our seq to process each k-mer
- for(position = 0; position < (signed)(seq_length - kmer + 1); position++) {
- unsigned long long mer = 0;
- unsigned long long multiply = 1;
-
- // for each char in the k-mer check if it is an error char
- for(i = position + kmer - 1; i >= position; i--){
- if(seq[i] == 5) {
- mer = width;
- position = i;
- goto next;
- }
-
- // multiply this char in the mer by the multiply
- // and bitshift the multiply for the next round
- mer += seq[i] * multiply;
- multiply = multiply << 2;
+ if(count_compliment) {
+ for(k = 0; k < seq_length; k++) {
+ seq[k] = compliment[(int)seq[k]];
}
- // use this point to get mer of our loop
- next:
- // bump up the mer value in the counts array
- counts[mer]++;
+
+ reverse_string(seq, seq_length);
+ count_sequence(seq, seq_length, kmer, counts);
+
}
- }
+ }
free(line);
@@ -237,15 +267,14 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer
return counts;
}
-unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer) {
+unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment) {
FILE *fh = fopen(fn, "r");
check_null_ptr(fh, fn);
- return get_kmer_counts_from_file(fh, kmer);
+ return get_kmer_counts_from_file(fh, kmer, count_compliment);
}
-
-unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer) {
+unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment) {
char *line = NULL;
size_t len = 0;
@@ -262,13 +291,9 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign
size_t cpy_size = kmer - 1;
char *end_of_previous = malloc(sizeof(char) * cpy_size);
+ check_null_ptr(end_of_previous, NULL);
memset(end_of_previous, 5, cpy_size);
- if(end_of_previous == NULL) {
- fprintf(stderr, "%s\n", strerror(errno));
- exit(EXIT_FAILURE);
- }
-
while ((read = getline(&line, &len, fh)) != -1) {
if(line[0] == '>')
continue;
@@ -276,7 +301,6 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign
char *seq;
size_t j;
size_t k;
- long long position;
size_t seq_length;
seq = malloc(read + cpy_size);
@@ -290,40 +314,23 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign
// everything else is 5
for(k = cpy_size; k < seq_length; k++)
seq[k] = alpha[(int)seq[k]];
-
- // loop through our seq to process each k-mer
- for(position = 0; position < (signed)(seq_length - kmer + 1); position++) {
- unsigned long long mer = 0;
- unsigned long long multiply = 1;
- long long i;
-
- // for each char in the k-mer check if it is an error char
- for(i = position + kmer - 1; i >= position; i--){
- if(seq[i] == 5) {
- mer = width;
- position = i;
- goto next;
- }
-
- // multiply this char in the mer by the multiply
- // and bitshift the multiply for the next round
- mer += seq[i] * multiply;
- multiply = multiply << 2;
- }
- // use this point to get mer of our loop
- next:
- // bump up the mer value in the counts array
- counts[mer]++;
- }
+ count_sequence(seq, seq_length, kmer, counts);
for(j = 0, k = seq_length - (cpy_size + 1); k < seq_length; k++, j++) {
end_of_previous[j] = seq[k];
}
- free(seq);
+ if(count_compliment) {
+ for(k = 0; k < seq_length; k++)
+ seq[k] = compliment[(int)seq[k]];
- }
+ reverse_string(seq, seq_length);
+ count_sequence(seq, seq_length, kmer, counts);
+ }
+
+ free(seq);
+ }
free(end_of_previous);
free(line);
@@ -332,9 +339,9 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign
return counts;
}
-unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer) {
+unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment) {
FILE *fh = fopen(fn, "r");
check_null_ptr(fh, fn);
- return get_continuous_kmer_counts_from_file(fh, kmer);
+ return get_continuous_kmer_counts_from_file(fh, kmer, count_compliment);
}
diff --git a/kmer_utils.h b/kmer_utils.h
index 4bc732a..a727da8 100644
--- a/kmer_utils.h
+++ b/kmer_utils.h
@@ -1,22 +1,45 @@
-// Kmer functions
-void convert_kmer_to_num(char *str, const unsigned long length);
+// dna-util's function library.
+
unsigned long num_to_index(const char *str, const int kmer, const long error_pos, long long *current_position);
char *index_to_kmer(unsigned long long index, long kmer);
// Utility functions
+
+
+// strip char 'c' out of char array *s of length len
size_t strnstrip(char *s, int c, size_t len);
+
+// reverse char arry *s of length len
+void reverse_string(char *s, size_t len);
+
+// quicky calculate 4^x
unsigned long long pow_four(unsigned long long x);
+
+// check if pointer is null. a helper for dealing with NULL
+// return values as errors. Calls strerror and quits if
+// ptr is null, optionally takes *error char array as
+// a error to output
void check_null_ptr(void *ptr, const char *error);
+void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, unsigned long long *counts);
+
// Variables
+//
const unsigned char alpha[256];
+const unsigned char reverse_alpha[4];
+const unsigned char compliment[5];
+
// file loading functions
+
+// open file from filename in char array *fn, and try and parse in one mer per
+// line, of size kmer, and store the indicies of those mers in the *arr
+// pointer;
unsigned long long load_specific_mers_from_file(const char *fn, unsigned int kmer, size_t width, size_t *arr);
-unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer);
-unsigned long long * get_kmer_counts_from_file(FILE *fh, const int kmer);
+unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment);
+unsigned long long * get_kmer_counts_from_file(FILE *fh, const int kmer, const bool count_compliment);
-unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer);
-unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer);
+unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment);
+unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment);