diff options
| author | Calvin Morrison <mutantturkey@gmail.com> | 2014-04-09 17:12:09 -0400 | 
|---|---|---|
| committer | Calvin Morrison <mutantturkey@gmail.com> | 2014-04-09 17:12:09 -0400 | 
| commit | d8578f338104287b4af59cbadb01f0e45843962d (patch) | |
| tree | a44a9a94c53def94c36029d3f9f3c7f9d34311ff | |
| parent | b7c04f4067e3eb51d8542438c4bda6e1a663fff9 (diff) | |
MERGE sparse trunk into master
| -rw-r--r-- | Makefile | 33 | ||||
| -rw-r--r-- | kmer_counts_per_sequence.c | 44 | ||||
| -rw-r--r-- | kmer_total_count.c | 91 | ||||
| -rw-r--r-- | kmer_utils.c | 242 | 
4 files changed, 297 insertions, 113 deletions
| @@ -1,26 +1,27 @@ -VERSION=\"0.0.4\" -CC = gcc -CFLAGS = -O3 -s -mtune=native -Wall -DVERSION=$(VERSION) -Wextra +VERSION=\"0.0.5\" +CC = g++ +CFLAGS = -O3 -s -mtune=native -Wall -Wextra -DVERSION=$(VERSION) -std=c++11  DESTDIR = /usr/local/ +all: libkmer.so kmer_total_count kmer_counts_per_sequence -all: libkmer.o libkmer.so kmer_total_count kmer_counts_per_sequence kmer_continuous_count - -libkmer.o: kmer_utils.c -	$(CC) -c kmer_utils.c -o libkmer.o $(CFLAGS) -fPIC -libkmer.so: libkmer.o +libkmer.so: kmer_utils.o  	$(CC) kmer_utils.c -o libkmer.so $(CFLAGS) -shared -fPIC -kmer_total_count: libkmer.o kmer_total_count.c kmer_utils.h -	$(CC) libkmer.o kmer_total_count.c -o kmer_total_count $(CLIBS) $(CFLAGS) -kmer_counts_per_sequence: libkmer.o kmer_counts_per_sequence.c kmer_utils.h -	$(CC) libkmer.o kmer_counts_per_sequence.c -o kmer_counts_per_sequence $(CLIBS) $(CFLAGS) -kmer_continuous_count: libkmer.o kmer_continuous_count.c kmer_utils.h -	$(CC) libkmer.o kmer_continuous_count.c -o kmer_continuous_count $(CLIBS) $(CFLAGS) + +kmer_total_count: kmer_utils.c kmer_total_count.c kmer_utils.h +	$(CC) kmer_utils.c kmer_total_count.c -o kmer_total_count $(CLIBS) $(CFLAGS) + +kmer_counts_per_sequence: kmer_utils.c kmer_counts_per_sequence.c kmer_utils.h +	$(CC) kmer_utils.c kmer_counts_per_sequence.c -o kmer_counts_per_sequence $(CLIBS) $(CFLAGS) + +kmer_continuous_count: kmer_utils.c kmer_continuous_count.c kmer_utils.h +	$(CC) kmer_utils.c kmer_continuous_count.c -o kmer_continuous_count $(CLIBS) $(CFLAGS)  clean: -	rm -vf kmer_total_count kmer_counts_per_sequence kmer_continuous_count libkmer.so libkmer.o +	rm -vf kmer_total_count kmer_counts_per_sequence kmer_continuous_count kmer_utils.o libkmer.so +	rm -vf kmer_total_count kmer_counts_per_sequence kmer_continuous_count libkmer.so kmer_utils.o -debug: CFLAGS = -ggdb -Wall -Wextra -DVERSION=$(VERSION)\"-debug\" +debug: CFLAGS = -ggdb -Wall -Wextra -DVERSION=$(VERSION)\"-debug\" -std=c++11  debug: all  install: all diff --git a/kmer_counts_per_sequence.c b/kmer_counts_per_sequence.c index 2b2dbef..e9ca812 100644 --- a/kmer_counts_per_sequence.c +++ b/kmer_counts_per_sequence.c @@ -30,7 +30,43 @@ void help() {  				 "dna-utils. Drexel University, Philadelphia USA, 2014;\n"  				 "software available at www.github.com/EESI/dna-utils/\n");  } +static void inc(kmer_map *map, size_t index) { +	(*map)[index]++; +} + +static void inc(unsigned long long *counts, size_t index) { +	counts[index]++; +} + +template <typename array_type> +void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, array_type *counts) { +	long long position; +	long long i; + +	// loop through our seq to process each k-mer +	for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { +		unsigned long long mer = 0; +		unsigned long long multiply = 1; + +		// for each char in the k-mer check if it is an error char +		for(i = position + kmer - 1; i >= position; i--){ +			if(seq[i] == 5) { +				position = i; +				goto next; +			} +			// multiply this char in the mer by the multiply +			// and bitshift the multiply for the next round +			mer += seq[i] * multiply; +			multiply = multiply << 2; +		} +		// bump up the mer value in the counts array +		inc(counts, mer); + +		// skip count if error +		next: ; +	} +}  int main(int argc, char **argv) { @@ -135,8 +171,10 @@ int main(int argc, char **argv) {   	width = pow_four(kmer);  	if(specific_mers) { -		desired_indicies = malloc((width) * sizeof(size_t)); -		check_null_ptr(desired_indicies, NULL); +		sparse = false; +		desired_indicies = (size_t *) malloc((width) * sizeof(size_t)); +		if(desired_indicies == NULL)  +			exit(EXIT_FAILURE);  		num_desired_indicies = load_specific_mers_from_file(mer_fn, kmer, width, desired_indicies);  		if(num_desired_indicies == 0) {  			fprintf(stderr, "Error: no mers loaded from file\n");  @@ -145,7 +183,7 @@ int main(int argc, char **argv) {  	} -	unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long)); +	unsigned long long *counts = (unsigned long long *) malloc((width+ 1) * sizeof(unsigned long long));  	if(counts == NULL)  {  		fprintf(stderr, "%s\n", strerror(errno));  		exit(EXIT_FAILURE); diff --git a/kmer_total_count.c b/kmer_total_count.c index dd29a53..c3ae070 100644 --- a/kmer_total_count.c +++ b/kmer_total_count.c @@ -8,16 +8,46 @@  #include "kmer_utils.h" +template <typename array_type> +void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, array_type *counts) { +	long long position; +	long long i; + +	// loop through our seq to process each k-mer +	for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { +		unsigned long long mer = 0; +		unsigned long long multiply = 1; + +		// for each char in the k-mer check if it is an error char +		for(i = position + kmer - 1; i >= position; i--){ +			if(seq[i] == 5) { +				position = i; +				goto next; +			} + +			// multiply this char in the mer by the multiply +			// and bitshift the multiply for the next round +			mer += seq[i] * multiply; +			multiply = multiply << 2; +		} +		// bump up the mer value in the counts array +		inc(counts, mer); + +		// skip count if error +		next: ; +	} +}  void help() {  	printf("usage: kmer_total_count -i input_file -k kmer [-c] [-n] [-l] ...\n\n"  				 "count mers in size k from a fasta file\n"  				 "\n" -				 "  --input      -i  input fasta file to count\n" -				 "  --kmer       -k  size of mers to count\n" +				 "  --input    -i  input fasta file to count\n" +				 "  --kmer     -k  size of mers to count\n"  				 "  --compliment -c  count compliment of sequences\n" -				 "  --nonzero    -n  only print non-zero values\n" -				 "  --label      -l  print mer along with value\n" +				 "  --nonzero  -n  only print non-zero values\n" +				 "  --label    -l  print mer along with value\n" +				 "  --sparse   -s  force sparse table for any mer\n"  				 "\n"  				 "Report all bugs to mutantturkey@gmail.com\n"  				 "\n" @@ -41,10 +71,7 @@ int main(int argc, char **argv) {  	bool label = false;  	bool kmer_set = false;  	bool count_compliment = false; - -	unsigned long long width = 0; - -	unsigned long long i = 0; +	bool force_sparse = false;  	static struct option long_options[] = {  		{"input", required_argument, 0, 'i'}, @@ -52,6 +79,7 @@ int main(int argc, char **argv) {  		{"compliment", required_argument, 0, 'c'},  		{"nonzero", no_argument, 0, 'n'},  		{"label", no_argument, 0, 'l'}, +		{"sparse", no_argument, 0, 's'},  		{"help", no_argument, 0, 'h'},  		{0, 0, 0, 0}  	}; @@ -61,7 +89,7 @@ int main(int argc, char **argv) {  		int option_index = 0;  		int c = 0; -		c = getopt_long (argc, argv, "i:k:cnlvh", long_options, &option_index); +		c = getopt_long (argc, argv, "i:k:cnslvh", long_options, &option_index);  		if (c == -1)  			break; @@ -83,6 +111,9 @@ int main(int argc, char **argv) {  			case 'l':  				label = true;  				break; +			case 's': +				force_sparse = true; +				break;  			case 'h':  				help();  				exit(EXIT_SUCCESS); @@ -119,45 +150,17 @@ int main(int argc, char **argv) {  		exit(EXIT_FAILURE);  	} -	width = pow_four(kmer); +	if(kmer > 12 || force_sparse) { +		kmer_map *counts = NULL; +		kmer_map *res = get_kmer_counts_from_file(counts, fh, kmer, count_compliment); -	unsigned long long *counts = get_kmer_counts_from_file(fh, kmer, count_compliment); - -	// If nonzero is set, only print non zeros -	if(nonzero) { -		// if labels is set, print out our labels -		if(label) { -			for(i = 0; i < width; i++) -				if(counts[i] != 0) { -					char *kmer_str = index_to_kmer(i, kmer); -					fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]); -					free(kmer_str); -				} - -		} -		else { -			for(i = 0; i < width; i++) -				if(counts[i] != 0)  -					fprintf(stdout, "%llu\t%llu\n", i, counts[i]); - -		} +		print_kmer(res, label, nonzero, kmer);  	} -	// If we aren't printing nonzeros print everything  	else { -		if(label) { -			for(i = 0; i < width; i++) { -				char *kmer_str = index_to_kmer(i, kmer); -				fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]); -				free(kmer_str); -			}  -		} -		else { -			for(i = 0; i < width; i=i+4) { -				fprintf(stdout, "%llu\n%llu\n%llu\n%llu\n", counts[i], counts[i+1], counts[i+2], counts[i+3]); -			} -		} +		unsigned long long *counts = NULL; +		unsigned long long *res = get_kmer_counts_from_file(counts, fh, kmer, count_compliment); +		print_kmer(res, label, nonzero, kmer);  	} -	free(counts);  	return EXIT_SUCCESS;  } diff --git a/kmer_utils.c b/kmer_utils.c index 7703f0a..1b94b08 100644 --- a/kmer_utils.c +++ b/kmer_utils.c @@ -5,6 +5,23 @@  #include <string.h>  #include <stdbool.h> +#include <unordered_map> + +using namespace std; + +typedef struct { +	size_t operator() (const size_t &k) const { +	return k; +	} +} kmer_noHash_hash; + +typedef struct { +	bool operator() (const size_t &x, const size_t &y) const { +		return x == y; +	} +} kmer_eq;  + +typedef unordered_map<size_t, unsigned long long, kmer_noHash_hash, kmer_eq> kmer_map;  const unsigned char alpha[256] =  {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, @@ -52,35 +69,6 @@ void reverse_string(char *s, size_t len) {  	}  } -void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, unsigned long long *counts) { -	long long position; -	long long i; - -	// loop through our seq to process each k-mer -	for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { -		unsigned long long mer = 0; -		unsigned long long multiply = 1; - -		// for each char in the k-mer check if it is an error char -		for(i = position + kmer - 1; i >= position; i--){ -			if(seq[i] == 5) { -				position = i; -				goto next; -			} - -			// multiply this char in the mer by the multiply -			// and bitshift the multiply for the next round -			mer += seq[i] * multiply; -			multiply = multiply << 2; -		} -		// bump up the mer value in the counts array -		counts[mer]++; - -		// skip count if error -		next: ; -	} -} -  // convert a string of k-mer size base-4 values  into a  // base-10 index  unsigned long num_to_index(const char *str, const int kmer, const long error_pos, long long *current_position) { @@ -103,7 +91,7 @@ unsigned long num_to_index(const char *str, const int kmer, const long error_pos  }  // return the number of loaded elements -unsigned long long load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) { +size_t load_specific_mers_from_file(char *fn, unsigned int kmer, size_t width, size_t *arr) {   		FILE *fh;  		size_t arr_pos = 0;  		char line[64]; @@ -152,8 +140,8 @@ char *index_to_kmer(unsigned long long index, long kmer)  {  	size_t i = 0;  	size_t j = 0; -	char *num_array = calloc(kmer,  sizeof(char)); -	char *ret = calloc(kmer + 1, sizeof(char)); +	char *num_array = (char *) calloc(kmer,  sizeof(char)); +	char *ret = (char *)  calloc(kmer + 1, sizeof(char));  	check_null_ptr(num_array, NULL);  	check_null_ptr(ret, NULL); @@ -211,19 +199,74 @@ size_t strnstrip(char *s, int c, size_t len) {  } -unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment) { +static void inc(kmer_map *map, size_t index) { +	(*map)[index]++; +} + +static void inc(unsigned long long *counts, size_t index) { +	counts[index]++; +} + +template <typename array_type> +void count_sequence(const char *seq, const size_t seq_length, const unsigned int kmer, array_type *counts) { +	long long position; +	long long i; + +	// loop through our seq to process each k-mer +	for(position = 0; position < (signed)(seq_length - kmer + 1); position++) { +		unsigned long long mer = 0; +		unsigned long long multiply = 1; + +		// for each char in the k-mer check if it is an error char +		for(i = position + kmer - 1; i >= position; i--){ +			if(seq[i] == 5) { +				position = i; +				goto next; +			} + +			// multiply this char in the mer by the multiply +			// and bitshift the multiply for the next round +			mer += seq[i] * multiply; +			multiply = multiply << 2; +		} +		// bump up the mer value in the counts array +		inc(counts, mer); + +		// skip count if error +		next: ; +	} +} + + +void create_array(kmer_map **counts, int kmer) { +	*counts = new kmer_map(); +	(*counts)->reserve(pow_four(kmer) / 2 ); +} + +void create_array(unsigned long long **counts, const unsigned int kmer) { +	// width is 4^kmer   +	const unsigned long long width = pow_four(kmer);  +	*counts = (unsigned long long *) calloc(width+1, sizeof(unsigned long long)); +	if(counts == NULL) { +		exit(EXIT_FAILURE); +	} +	else { +	} +} + +template <typename array_type> +array_type * get_kmer_counts_from_file(array_type *counts, FILE *fh, const unsigned int kmer, const bool count_compliment) { +  	char *line = NULL;  	size_t len = 0;  	ssize_t read; -	// width is 4^kmer   -	// there's a sneaky bitshift to avoid pow dependency -	const unsigned long width = pow_four(kmer);  - -	// malloc our return array -	unsigned long long * counts = calloc((width+ 1), sizeof(unsigned long long)); -	check_null_ptr(counts, NULL); +	create_array(&counts, kmer); +	if(counts == NULL) { +		puts("Counts is null"); +		exit(EXIT_FAILURE); +	}  	while ((read = getdelim(&line, &len, '>', fh)) != -1) {  		size_t k; @@ -260,18 +303,33 @@ unsigned long long * get_kmer_counts_from_file(FILE *fh, const unsigned int kmer  		}  	} - -  free(line); +  	free(line);  	fclose(fh);  	return counts;  } -unsigned long long * get_kmer_counts_from_filename(const char *fn, const unsigned int kmer, const bool count_compliment) { +kmer_map * get_kmer_counts_from_filename(kmer_map *counts, const char *fn, const unsigned int kmer, const bool count_compliment) {  		FILE *fh = fopen(fn, "r");  		check_null_ptr(fh, fn); -		return get_kmer_counts_from_file(fh, kmer, count_compliment); +		// Why does this work this way!!? +		kmer_map *counts2 = get_kmer_counts_from_file(counts, fh, kmer, count_compliment); +		if(counts == NULL) { +			puts("NULL IN FILENAME"); +		} + +		return counts2; + +} + + +unsigned long long * get_kmer_counts_from_filename(unsigned long long *counts, const char *fn, const unsigned int kmer, const bool count_compliment) { +		FILE *fh = fopen(fn, "r"); +		check_null_ptr(fh, fn); + +		unsigned long long *counts2 = get_kmer_counts_from_file(counts, fh, kmer, count_compliment); +		return counts2;  }  unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsigned int kmer, const bool count_compliment) { @@ -285,12 +343,12 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign  	const unsigned long width = pow_four(kmer);   	// malloc our return array -	unsigned long long * counts = calloc((width+ 1), sizeof(unsigned long long)); +	unsigned long long * counts = (unsigned long long *) calloc((width+ 1), sizeof(unsigned long long));  	check_null_ptr(counts, NULL);  	size_t cpy_size = kmer - 1; -	char *end_of_previous = malloc(sizeof(char) * cpy_size); +	char *end_of_previous = (char*) malloc(sizeof(char) * cpy_size);  	check_null_ptr(end_of_previous, NULL);  	memset(end_of_previous, 5, cpy_size); @@ -298,12 +356,11 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign  		if(line[0] == '>')  			continue; -		char *seq;  		size_t j;  		size_t k;  		size_t seq_length; -		seq = malloc(read + cpy_size); +		char *seq = (char *) malloc(read + cpy_size);  		memcpy(seq, end_of_previous, cpy_size);  		memcpy(seq + cpy_size, line, read); @@ -329,11 +386,12 @@ unsigned long long * get_continuous_kmer_counts_from_file(FILE *fh, const unsign  			count_sequence(seq, seq_length, kmer, counts);  		} -				free(seq); + +		free(seq);  	}  	free(end_of_previous); -  free(line); +	free(line);  	fclose(fh);  	return counts; @@ -345,3 +403,87 @@ unsigned long long * get_continuous_kmer_counts_from_filename(const char *fn, co  		return get_continuous_kmer_counts_from_file(fh, kmer, count_compliment);  } + +void print_kmer(unsigned long long *counts, bool label, bool nonzero, unsigned int kmer) { +	size_t width = pow_four(kmer); +	size_t i = 0; + +	// If nonzero is set, only print non zeros +	if(nonzero) { +		// if labels is set, print out our labels +		if(label) { +			for(i = 0; i < width; i++) +				if(counts[i] != 0) { +					char *kmer_str = index_to_kmer(i, kmer); +					fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]); +					free(kmer_str); +				} + +		} +		else { +			for(i = 0; i < width; i++) +				if(counts[i] != 0)  +					fprintf(stdout, "%zu\t%llu\n", i, counts[i]); + +		} +	} +	// If we aren't printing nonzeros print everything +	else { +		if(label) { +			for(i = 0; i < width; i++) { +				char *kmer_str = index_to_kmer(i, kmer); +				fprintf(stdout, "%s\t%llu\n", kmer_str, counts[i]); +				free(kmer_str); +			}  +		} +		else { +			for(i = 0; i < width; i=i+4) { +				fprintf(stdout, "%llu\n%llu\n%llu\n%llu\n", counts[i], counts[i+1], counts[i+2], counts[i+3]); +			} +		} + +	} +} + +void print_kmer(kmer_map *counts, bool label, bool nonzero, unsigned int kmer) { +	size_t width = pow_four(kmer); +	size_t i = 0; + +	// If nonzero is set, only print non zeros +	if(nonzero) { +		// if labels is set, print out our labels +		if(label) { +			for(auto it = counts->begin(); it != counts->end(); it++ ) { +				char *kmer_str = index_to_kmer(it->first, kmer); +				fprintf(stdout, "%s\t%llu\n", kmer_str, it->second); +				free(kmer_str); +			} +		} +		else { +			for(auto it = counts->begin(); it != counts->end(); it++ ) { +				fprintf(stdout, "%zu\t%llu\n", it->first, it->second); +			} +		} +	} +	// If we aren't printing nonzeros print everything +	else { +		if(label) { +			for(i = 0; i < width; i++) { +				char *kmer_str = index_to_kmer(i, kmer); +				if(counts->count(i) != 0) +					fprintf(stdout, "%s\t%llu\n", kmer_str, counts->at(i)); +				else +					fprintf(stdout, "%s\t0\n", kmer_str); +				free(kmer_str);  +			}  +		} +		else { +			for(i = 0; i < width; i++) { +				if(counts->count(i) != 0) +					fprintf(stdout, "%llu\n", counts->at(i)); +				else +					fprintf(stdout, "0\n"); +			} +		} +	} +} | 
