diff options
| -rw-r--r-- | kmer_continuous_count.c | 20 | ||||
| -rw-r--r-- | kmer_counts_per_sequence.c | 36 | ||||
| -rw-r--r-- | kmer_total_count.c | 20 | ||||
| -rw-r--r-- | kmer_utils.c | 151 | ||||
| -rw-r--r-- | kmer_utils.h | 35 | 
5 files changed, 157 insertions, 105 deletions
| diff --git a/kmer_continuous_count.c b/kmer_continuous_count.c index a7c35f0..7eb617d 100644 --- a/kmer_continuous_count.c +++ b/kmer_continuous_count.c @@ -11,13 +11,14 @@  void help() { -	printf("usage: kmer_continuous_count -i input_file -k kmer [-n] [-l] ...\n\n" +	printf("usage: kmer_continuous_count -i input_file -k kmer [-c] [-n] [-l] ...\n\n"  				 "count mers in size k from a fasta file, but do so continuously\n"  				 "\n" -				 "  --input    -i  input fasta file to count\n" -				 "  --kmer     -k  size of mers to count\n" -				 "  --nonzero  -n  only print non-zero values\n" -				 "  --label    -l  print mer along with value\n" +				 "  --input      -i  input fasta file to count\n" +				 "  --kmer       -k  size of mers to count\n" +				 "  --compliment -c  count compliment of sequences\n" +				 "  --nonzero    -n  only print non-zero values\n" +				 "  --label      -l  print mer along with value\n"  				 "\n"  				 "Report all bugs to mutantturkey@gmail.com\n"  				 "\n" @@ -40,6 +41,7 @@ int main(int argc, char **argv) {  	bool nonzero = false;  	bool label = false;  	bool kmer_set = false; +	bool count_compliment = false;  	unsigned long long width = 0; @@ -48,6 +50,7 @@ int main(int argc, char **argv) {  	static struct option long_options[] = {  		{"input", required_argument, 0, 'i'},  		{"kmer",  required_argument, 0, 'k'}, +		{"compliment", required_argument, 0, 'c'},  		{"nonzero", no_argument, 0, 'n'},  		{"label", no_argument, 0, 'l'},  		{"help", no_argument, 0, 'h'}, @@ -59,7 +62,7 @@ int main(int argc, char **argv) {  		int option_index = 0;  		int c = 0; -		c = getopt_long (argc, argv, "i:k:nlvh", long_options, &option_index); +		c = getopt_long (argc, argv, "i:k:cnlvh", long_options, &option_index);  		if (c == -1)  			break; @@ -72,6 +75,9 @@ int main(int argc, char **argv) {  				kmer = atoi(optarg);  				kmer_set = true;  				break; +			case 'c': +				count_compliment = true; +				break;  			case 'n':  				nonzero = true;   				break; @@ -116,7 +122,7 @@ int main(int argc, char **argv) {  	width = pow_four(kmer); -	unsigned long long *counts = get_continuous_kmer_counts_from_file(fh, kmer); +	unsigned long long *counts = get_continuous_kmer_counts_from_file(fh, kmer, count_compliment);  	// If nonzero is set, only print non zeros  	if(nonzero) { diff --git a/kmer_counts_per_sequence.c b/kmer_counts_per_sequence.c index 7e0e119..21aca5a 100644 --- a/kmer_counts_per_sequence.c +++ b/kmer_counts_per_sequence.c @@ -12,13 +12,14 @@ void help() {  	printf("usage: kmer_counts_per_sequence input_file kmer [kmer-file] ...\n\n"  				 "count mers in each sequence of size k from a fasta file\n"  				 "\n" -				 "  --input    -i  input fasta file to count\n" -				 "  --kmer     -k  size of mers to count\n" -				 "  --mer-file -m  a file containing a list of mers you are interested\n" -				 "                 in opening. this will enable output your results in\n" -				 "                 a sparse format \n" -				 "  --sparse   -s  output values in a sparse format. output is in the\n" -				 "                 order sequence_number, mer_index, value\n" +				 "  --input      -i  input fasta file to count\n" +				 "  --kmer       -k  size of mers to count\n" +				 "  --compliment -c  count compliment of sequences\n" +				 "  --mer-file   -m  a file containing a list of mers you are interested\n" +				 "                   in opening. this will enable output your results in\n" +				 "                   a sparse format \n" +				 "  --sparse     -s  output values in a sparse format. output is in the\n" +				 "                   order sequence_number, mer_index, value\n"  				 "\n"  				 "Report all bugs to mutantturkey@gmail.com\n"  				 "\n" @@ -55,10 +56,12 @@ int main(int argc, char **argv) {  	bool sparse = false;  	bool kmer_set = false;  	bool specific_mers = false; +	bool count_compliment = false;  	static struct option long_options[] = {  		{"input", required_argument, 0, 'i'},  		{"kmer",  required_argument, 0, 'k'}, +		{"compliment", required_argument, 0, 'c'},  		{"sparse", no_argument, 0, 's'},  		{"mer-file", required_argument, 0, 'm'},  		{"help", no_argument, 0, 'h'}, @@ -70,7 +73,7 @@ int main(int argc, char **argv) {  		int option_index = 0;  		int c = 0; -		c = getopt_long (argc, argv, "i:k:m:vsh", long_options, &option_index); +		c = getopt_long (argc, argv, "i:k:m:cvsh", long_options, &option_index);  		if (c == -1)  			break; @@ -83,6 +86,8 @@ int main(int argc, char **argv) {  				kmer = atoi(optarg);  				kmer_set = true;  				break; +			case 'c': +				count_compliment = true;  			case 's':  				sparse = true;  				break; @@ -147,7 +152,6 @@ int main(int argc, char **argv) {  	unsigned long long sequence = 0;  	while ((read = getdelim(&line, &len, '>', fh)) != -1) { -		long long i = 0;  		size_t k = 0;  		memset(counts, 0, width * sizeof(unsigned long long)); @@ -170,11 +174,17 @@ int main(int argc, char **argv) {  			seq[k] = alpha[(int)seq[k]];  		} -		for(i = 0; i < (signed long long)(seq_length - kmer + 1); i++) { -			size_t mer = num_to_index(&seq[i],kmer, width, &i); -			counts[mer]++; +		count_sequence(seq, seq_length, kmer, counts); + +		if(count_compliment) { +			for(k = 0; k < seq_length; k++) {  +				seq[k] = compliment[(int)seq[k]]; +			} +			 +			reverse_string(seq, seq_length); +			count_sequence(seq, seq_length, kmer, counts); +			  		} -		  		if(specific_mers) {  				for(k = 0; k < num_desired_indicies; k++) { diff --git a/kmer_total_count.c b/kmer_total_count.c index 6b627f2..dd29a53 100644 --- a/kmer_total_count.c +++ b/kmer_total_count.c @@ -10,13 +10,14 @@  void help() { -	printf("usage: kmer_total_count -i input_file -k kmer [-n] [-l] ...\n\n" +	printf("usage: kmer_total_count -i input_file -k kmer [-c] [-n] [-l] ...\n\n"  				 "count mers in size k from a fasta file\n"  				 "\n" -				 "  --input    -i  input fasta file to count\n" -				 "  --kmer     -k  size of mers to count\n" -				 "  --nonzero  -n  only print non-zero values\n" -				 "  --label    -l  print mer along with value\n" +				 "  --input      -i  input fasta file to count\n" +				 "  --kmer       -k  size of mers to count\n" +				 "  --compliment -c  count compliment of sequences\n" +				 "  --nonzero    -n  only print non-zero values\n" +				 "  --label      -l  print mer along with value\n"  				 "\n"  				 "Report all bugs to mutantturkey@gmail.com\n"  				 "\n" @@ -39,6 +40,7 @@ int main(int argc, char **argv) {  	bool nonzero = false;  	bool label = false;  	bool kmer_set = false; +	bool count_compliment = false;  	unsigned long long width = 0; @@ -47,6 +49,7 @@ int main(int argc, char **argv) {  	static struct option long_options[] = {  		{"input", required_argument, 0, 'i'},  		{"kmer",  required_argument, 0, 'k'}, +		{"compliment", required_argument, 0, 'c'},  		{"nonzero", no_argument, 0, 'n'},  		{"label", no_argument, 0, 'l'},  		{"help", no_argument, 0, 'h'}, @@ -58,7 +61,7 @@ int main(int argc, char **argv) {  		int option_index = 0;  		int c = 0; -		c = getopt_long (argc, argv, "i:k:nlvh", long_options, &option_index); +		c = getopt_long (argc, argv, "i:k:cnlvh", long_options, &option_index);  		if (c == -1)  			break; @@ -71,6 +74,9 @@ int main(int argc, char **argv) {  				kmer = atoi(optarg);  				kmer_set = true;  				break; +			case 'c': +				count_compliment = true; +				break;  			case 'n':  				nonzero = true;   				break; @@ -115,7 +121,7 @@ int main(int argc, char **argv) {  	width = pow_four(kmer); -	unsigned long long *counts = get_kmer_counts_from_file(fh, kmer); +	unsigned long long *counts = get_kmer_counts_from_file(fh, kmer, count_compliment);  	// If nonzero is set, only print non zeros  	if(nonzero) { diff --git a/kmer_utils.c b/kmer_utils.c index b8d5691..7703f0a 100644 --- a/kmer_utils.c +++ b/kmer_utils.c @@ -3,9 +3,10 @@  #include <stdio.h>  #include <stdlib.h>  #include <string.h> +#include <stdbool.h> -const unsigned char alpha[256] =  +const unsigned char alpha[256] =  {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, @@ -19,6 +20,12 @@ const unsigned char alpha[256] =  const char reverse_alpha[4] = { 'A', 'C', 'G', 'T' }; +													// compliments +													// A  C  G  T  E  E +													// T  G  C  A  E  E +const char compliment[6] = { 3, 2, 1, 0, 5, 5}; + +  unsigned long long pow_four(unsigned long long x) {  	return (unsigned long long)1 << (x * 2);  } @@ -27,14 +34,53 @@ void check_null_ptr(void *ptr, const char *error) {  	if (ptr == NULL) {  		if(error != NULL)  {  			fprintf(stderr, "Error: %s - %s\n", error, strerror(errno)); -		}  +		}  		else {  			fprintf(stderr, "Error: %s\n", strerror(errno)); -		}  +		}  		exit(EXIT_FAILURE);  	}  } + +void reverse_string(char *s, size_t len) { +	char t, *d = &(s[len - 1]); +	while (d > s) { +		t = *s; +		*s++ = *d; +		*d-- = t; +	} +} + +void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, unsigned long long *counts) { +	long long position; +	long long i; + +	// loop through our seq to process each k-mer +	for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { +		unsigned long long mer = 0; +		unsigned long long multiply = 1; + +		// for each char in the k-mer check if it is an error char +		for(i = position + kmer - 1; i >= position; i--){ +			if(seq[i] == 5) { +				position = i; +				goto next; +			} + +			// multiply this char in the mer by the multiply +			// and bitshift the multiply for the next round +			mer += seq[i] * multiply; +			multiply = multiply << 2; +		} +		// bump up the mer value in the counts array +		counts[mer]++; + +		// skip count if error +		next: ; +	} +} +  // convert a string of k-mer size base-4 values  into a  // base-10 index  unsigned long num_to_index(const char *str, const int kmer, const long error_pos, long long *current_position) { @@ -44,7 +90,7 @@ unsigned long num_to_index(const char *str, const int kmer, const long error_pos  	unsigned long multiply = 1;  	for(i = kmer - 1; i >= 0; i--){ -		if(str[i] == 5) {  +		if(str[i] == 5) {  			current_position += i;  			return error_pos;  		} @@ -57,7 +103,7 @@ unsigned long num_to_index(const char *str, const int kmer, const long error_pos  }  // return the number of loaded elements -unsigned long long load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) {  +unsigned long long load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) {  		FILE *fh;  		size_t arr_pos = 0;  		char line[64]; @@ -130,11 +176,11 @@ char *index_to_kmer(unsigned long long index, long kmer)  {  	size_t start = i ;  	// decrement i by 1 to reverse the last i++ -	i--;   +	i--;  	j = 0;  	// reverse the array, as j increases, decrease i -	for(j = 0; j < start; j++, i--)  +	for(j = 0; j < start; j++, i--)  		ret[j + offset] = reverse_alpha[(int)num_array[i]];    // set our last character to the null termination byte @@ -165,7 +211,7 @@ size_t strnstrip(char *s, int c, size_t len) {  } -unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer) { +unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment) {  	char *line = NULL;  	size_t len = 0; @@ -184,9 +230,6 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer  		char *seq;  		size_t seq_length; -		long long i = 0; -		long long position = 0; -  		// find our first \n, this should be the end of the header  		seq = strchr(line, '\n');	  		if(seq == NULL)  @@ -204,31 +247,18 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer  		for(k = 0; k < seq_length; k++)  			seq[k] = alpha[(int)seq[k]]; +		count_sequence(seq, seq_length, kmer, counts); -		// loop through our seq to process each k-mer -		for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { -			unsigned long long mer = 0; -			unsigned long long multiply = 1; - -			// for each char in the k-mer check if it is an error char -			for(i = position + kmer - 1; i >= position; i--){ -				if(seq[i] == 5) {  -					mer = width; -					position = i; -					goto next; -				} - -				// multiply this char in the mer by the multiply -				// and bitshift the multiply for the next round -				mer += seq[i] * multiply; -				multiply = multiply << 2; +		if(count_compliment) { +			for(k = 0; k < seq_length; k++) {  +				seq[k] = compliment[(int)seq[k]];  			} -			// use this point to get mer of our loop -			next: -			// bump up the mer value in the counts array -			counts[mer]++; +			 +			reverse_string(seq, seq_length); +			count_sequence(seq, seq_length, kmer, counts); +			  		} -	}  +	}    free(line); @@ -237,15 +267,14 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer  	return counts;  } -unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer) { +unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment) {  		FILE *fh = fopen(fn, "r");  		check_null_ptr(fh, fn); -		return get_kmer_counts_from_file(fh, kmer); +		return get_kmer_counts_from_file(fh, kmer, count_compliment);  } - -unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer) { +unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment) {  	char *line = NULL;  	size_t len = 0; @@ -262,13 +291,9 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign  	size_t cpy_size = kmer - 1;  	char *end_of_previous = malloc(sizeof(char) * cpy_size); +	check_null_ptr(end_of_previous, NULL);  	memset(end_of_previous, 5, cpy_size); -	if(end_of_previous == NULL) { -		fprintf(stderr, "%s\n", strerror(errno)); -		exit(EXIT_FAILURE); -	} -  	while ((read = getline(&line, &len, fh)) != -1) {  		if(line[0] == '>')  			continue; @@ -276,7 +301,6 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign  		char *seq;  		size_t j;  		size_t k; -		long long position;  		size_t seq_length;  		seq = malloc(read + cpy_size); @@ -290,40 +314,23 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign  		// everything else is 5   		for(k = cpy_size; k < seq_length; k++)  			seq[k] = alpha[(int)seq[k]]; -		 -		// loop through our seq to process each k-mer -		for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { -			unsigned long long mer = 0; -			unsigned long long multiply = 1; -			long long i; - -			// for each char in the k-mer check if it is an error char -			for(i = position + kmer - 1; i >= position; i--){ -				if(seq[i] == 5) {  -					mer = width; -					position = i; -					goto next; -				} - -				// multiply this char in the mer by the multiply -				// and bitshift the multiply for the next round -				mer += seq[i] * multiply; -				multiply = multiply << 2; -			} -			// use this point to get mer of our loop -			next: -			// bump up the mer value in the counts array -			counts[mer]++; -		} +		count_sequence(seq, seq_length, kmer, counts);  		for(j = 0, k = seq_length - (cpy_size + 1); k < seq_length; k++, j++) {  			end_of_previous[j] = seq[k];  		} -		free(seq); +		if(count_compliment) { +			for(k = 0; k < seq_length; k++)  +				seq[k] = compliment[(int)seq[k]]; -	}  +			reverse_string(seq, seq_length); +			count_sequence(seq, seq_length, kmer, counts); +		} + +				free(seq); +	}  	free(end_of_previous);    free(line); @@ -332,9 +339,9 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign  	return counts;  } -unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer) { +unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment) {  		FILE *fh = fopen(fn, "r");  		check_null_ptr(fh, fn); -		return get_continuous_kmer_counts_from_file(fh, kmer); +		return get_continuous_kmer_counts_from_file(fh, kmer, count_compliment);  } diff --git a/kmer_utils.h b/kmer_utils.h index 4bc732a..a727da8 100644 --- a/kmer_utils.h +++ b/kmer_utils.h @@ -1,22 +1,45 @@ -// Kmer functions -void convert_kmer_to_num(char *str, const unsigned long length); +// dna-util's function library. +  unsigned long num_to_index(const char *str, const int kmer, const long error_pos, long long *current_position);  char *index_to_kmer(unsigned long long index, long kmer);  // Utility functions + + +// strip char 'c' out of char array *s of length len  size_t strnstrip(char *s, int c, size_t len); + +// reverse char arry *s of length len +void reverse_string(char *s, size_t len); + +// quicky calculate 4^x   unsigned long long pow_four(unsigned long long x); + +// check if pointer is null. a helper for dealing with NULL +// return values as errors. Calls strerror and quits if  +// ptr is null, optionally takes *error char array as  +// a error to output  void check_null_ptr(void *ptr, const char *error); +void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, unsigned long long *counts); +  // Variables +//  const unsigned char alpha[256];  +const unsigned char reverse_alpha[4]; +const unsigned char compliment[5]; +  // file loading functions + +// open file from filename in char array *fn, and try and parse in one mer per +// line, of size kmer, and store the indicies of those mers in the *arr +// pointer;  unsigned long long load_specific_mers_from_file(const char *fn, unsigned int kmer, size_t width, size_t *arr); -unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer); -unsigned long long * get_kmer_counts_from_file(FILE *fh, const int kmer); +unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment); +unsigned long long * get_kmer_counts_from_file(FILE *fh, const int kmer, const bool count_compliment); -unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer); -unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer); +unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment); +unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment); | 
