diff options
author | Calvin Morrison <mutantturkey@gmail.com> | 2013-11-07 15:38:00 -0500 |
---|---|---|
committer | Calvin Morrison <mutantturkey@gmail.com> | 2013-11-07 15:38:00 -0500 |
commit | d7ed90dc545e54a09ef206634bb6042e5893c04b (patch) | |
tree | f83ee7da7816daa642c1f5e73ae4f74e4ce77fed | |
parent | 6a31527ac8a94d12732dda60dbbf0069e5cc1490 (diff) |
update to work with kmer rarity
-rw-r--r-- | src/c/quikr.c | 131 | ||||
-rw-r--r-- | src/c/quikr_functions.c | 82 | ||||
-rw-r--r-- | src/c/quikr_functions.h | 7 |
3 files changed, 150 insertions, 70 deletions
diff --git a/src/c/quikr.c b/src/c/quikr.c index 9fc0759..1b8daa3 100644 --- a/src/c/quikr.c +++ b/src/c/quikr.c @@ -1,4 +1,3 @@ -#include <ctype.h> #include <errno.h> #include <getopt.h> #include <math.h> @@ -24,6 +23,14 @@ int main(int argc, char **argv) { char *output_filename = NULL; unsigned long long x = 0; + unsigned long long y = 0; + unsigned long long z = 0; + + unsigned long long rare_value = 0; + unsigned long long rare_width = 0; + + double percentage = 1.0; + double rare_percent = 1.0; unsigned long long width = 0; @@ -39,6 +46,7 @@ int main(int argc, char **argv) { {"lambda", required_argument, 0, 'l'}, {"output", required_argument, 0, 'o'}, {"sensing-matrix", required_argument, 0, 's'}, + {"rare-percent", required_argument, 0, 'r'}, {"verbose", no_argument, 0, 'v'}, {"version", no_argument, 0, 'V'}, {"help", no_argument, 0, 'h'}, @@ -48,7 +56,7 @@ int main(int argc, char **argv) { int option_index = 0; - c = getopt_long (argc, argv, "k:l:s:i:o:hdvV", long_options, &option_index); + c = getopt_long (argc, argv, "k:l:s:r:i:o:r:hdvV", long_options, &option_index); if (c == -1) break; @@ -60,6 +68,9 @@ int main(int argc, char **argv) { case 'l': lambda = atoi(optarg); break; + case 'r': + rare_percent = atof(optarg); + break; case 's': sensing_matrix_filename = optarg; break; @@ -75,17 +86,14 @@ int main(int argc, char **argv) { case 'V': printf("%s\n", VERSION); exit(EXIT_SUCCESS); - break; case 'h': printf("%s\n", USAGE); exit(EXIT_SUCCESS); - break; default: break; } } - if(sensing_matrix_filename == NULL) { fprintf(stderr, "Error: sensing matrix filename (-s) must be specified\n\n"); fprintf(stderr, "%s\n", USAGE); @@ -102,7 +110,8 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } - if(verbose) { + if(verbose) { + printf("rare width:%ld\n", rare_width); printf("kmer: %u\n", kmer); printf("lambda: %llu\n", lambda); printf("fasta: %s\n", input_fasta_filename); @@ -124,18 +133,109 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } + // 4 "ACGT" ^ Kmer gives us the size of output rows - width = pow(4, kmer); - width = width + 1; + width = pow_four(kmer); // load counts matrix - double *count_matrix = setup_count_matrix(input_fasta_filename, kmer, lambda, width); + double *count_matrix = malloc(width * sizeof(double)); + check_malloc(count_matrix, NULL); + + { + unsigned long long *integer_counts = get_kmer_counts_from_file(input_fasta_filename, kmer); + + for(x = 0; x < width; x++) + count_matrix[x] = (double)integer_counts[x]; + + free(integer_counts); + } // load sensing matrix - struct matrix *sensing_matrix = setup_sensing_matrix(sensing_matrix_filename, kmer, lambda, width); + struct matrix *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, kmer); + + + // get our "rare" counts + while(1) { + rare_width = 0; + for(x = 0; x < width; x++) { + if(count_matrix[x] < rare_value) { + rare_width++; + } + } + percentage = (float)rare_width / (float)width; + + if(percentage >= rare_percent) + break; + + rare_value++; + } + + if(verbose) + printf("there are %llu values less than %llu\n", rare_width, rare_value); + + // add a extra space for our zero's array + rare_width++; + + // store our count matrix + double *count_matrix_rare = calloc(rare_width, sizeof(double)); + check_malloc(count_matrix_rare, NULL); + + double *sensing_matrix_rare = malloc((rare_width) * sensing_matrix->sequences * sizeof(double)); + check_malloc(sensing_matrix_rare, NULL); + + // copy only kmers from our original counts that match our rareness percentage + // in both our count matrix and our sensing matrix + for(x = 0, y = 1; x < width; x++) { + if(count_matrix[x] < rare_value) { + count_matrix_rare[y] = count_matrix[x]; + + for(z = 0; z < sensing_matrix->sequences; z++) + sensing_matrix_rare[z*rare_width + y] = sensing_matrix->matrix[z*width + x]; + + y++; + } + } + + // normalize our kmer counts and our sensing_matrix + normalize_matrix(count_matrix_rare, 1, rare_width); + normalize_matrix(sensing_matrix_rare, sensing_matrix->sequences, rare_width); + + // multiply our kmer counts and sensing matrix by lambda + for(x = 1; x < rare_width; x++) + count_matrix_rare[x] *= lambda; + + for(x = 0; x < sensing_matrix->sequences; x++) { + for(y = 1; y < rare_width; y++) { + sensing_matrix_rare[rare_width*x + y] *= lambda; + } + } + + // count_matrix's first element should be zero + count_matrix_rare[0] = 0; + // stack one's on our first row of our sensing matrix + for(x = 0; x < sensing_matrix->sequences; x++) { + sensing_matrix_rare[x*rare_width] = 1.0; + } + + // DEBUG OUR MATRIX + if(verbose) { + printf("COUNT_MATRIX\n"); + for(x = 0; x < rare_width; x++) + printf("%lf\t", count_matrix_rare[x]); + printf("\n"); + + printf("SENSING_MATRIX\n"); + for(y = 0; y < sensing_matrix->sequences - 1; y++) { + for(x = 0; x < rare_width; x++) { + printf("%lf\t", sensing_matrix_rare[x*rare_width + y]); + } + printf("%lf\n", sensing_matrix_rare[x*rare_width + (rare_width - 1)]); + } + + } // run NNLS - double *solution = nnls(sensing_matrix->matrix, count_matrix, sensing_matrix->sequences, width); + double *solution = nnls(sensing_matrix_rare, count_matrix_rare, sensing_matrix->sequences, rare_width); // normalize our solution vector normalize_matrix(solution, 1, sensing_matrix->sequences); @@ -149,7 +249,16 @@ int main(int argc, char **argv) { for(x = 0; x < sensing_matrix->sequences; x++) { fprintf(output_fh, "%.10lf\n", solution[x]); } + fclose(output_fh); + free(solution); + + free(count_matrix); + free(sensing_matrix); + + free(count_matrix_rare); + free(sensing_matrix_rare); + return EXIT_SUCCESS; } diff --git a/src/c/quikr_functions.c b/src/c/quikr_functions.c index 2bd3065..a6366be 100644 --- a/src/c/quikr_functions.c +++ b/src/c/quikr_functions.c @@ -10,6 +10,17 @@ #include "kmer_utils.h" #include "quikr.h" +void check_malloc(void *ptr, char *error) { + if (ptr == NULL) { + if(error != NULL) { + fprintf(stderr,"Error: %s\n", error); + } + else { fprintf(stderr, "Error: Could not allocate enough memory - %s\n", strerror(errno)); + } + exit(EXIT_FAILURE); + } +} + void debug_arrays(double *count_matrix, struct matrix *sensing_matrix) { FILE *count_fh = fopen("count.mat", "w"); @@ -55,10 +66,7 @@ double *setup_count_matrix(char *filename, unsigned long long kmer, unsigned lon unsigned long long x = 0; unsigned long long *integer_counts = get_kmer_counts_from_file(filename, kmer); double *count_matrix = malloc(width * sizeof(double)); - if(count_matrix == NULL) { - fprintf(stderr, "Could not allocate memory:\n"); - exit(EXIT_FAILURE); - } + check_malloc(count_matrix, NULL); count_matrix[0] = 0; @@ -68,13 +76,6 @@ double *setup_count_matrix(char *filename, unsigned long long kmer, unsigned lon free(integer_counts); - // normalize our kmer counts - normalize_matrix(count_matrix, 1, width); - - // multiply our kmers frequency by lambda - for(x = 0; x < width; x++) - count_matrix[x] = count_matrix[x] * lambda; - return count_matrix; } @@ -103,7 +104,7 @@ unsigned long long count_sequences(const char *filename) { } -struct matrix *load_sensing_matrix(const char *filename) { +struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kmer) { char *line = NULL; char **headers = NULL; @@ -128,8 +129,7 @@ struct matrix *load_sensing_matrix(const char *filename) { } line = malloc(1024 * sizeof(char)); - if(line == NULL) - exit(EXIT_FAILURE); + check_malloc(line, NULL); // Check for quikr line = gzgets(fh, line, 1024); @@ -160,31 +160,28 @@ struct matrix *load_sensing_matrix(const char *filename) { fprintf(stderr, "Error parsing sensing matrix, kmer is zero\n"); exit(EXIT_FAILURE); } + if(kmer != target_kmer) { + fprintf(stderr, "The sensing_matrix was trained with a different kmer than your requested kmer\n"); + exit(EXIT_FAILURE); + } width = pow_four(kmer); // allocate a +1 size for the extra row - matrix = malloc(sequences * (width + 1) * sizeof(double)); - if(matrix == NULL) { - fprintf(stderr, "Could not allocate memory for the sensing matrix\n"); - } + matrix = malloc(sequences * (width) * sizeof(double)); + check_malloc(matrix, NULL); - row = malloc(width * sizeof(unsigned long long)); - if(row == NULL) { - fprintf(stderr, "Could not allocate memory for parsing row\n"); - } + row = malloc((width) * sizeof(unsigned long long)); + check_malloc(row, NULL); headers = malloc(sequences * sizeof(char *)); - if(headers == NULL) { - fprintf(stderr, "could not allocate enough memory for header pointers\n"); - exit(EXIT_FAILURE); - } + check_malloc(headers, NULL); for(i = 0; i < sequences; i++) { - unsigned long long sum = 0; unsigned long long j = 0; // get header and add it to headers array char *header = malloc(256 * sizeof(char)); + check_malloc(header, NULL); gzgets(fh, header, 256); if(header[0] != '>') { fprintf(stderr, "Error parsing sensing matrix, could not read header\n"); @@ -194,7 +191,7 @@ struct matrix *load_sensing_matrix(const char *filename) { header[strlen(header) - 1] = '\0'; headers[i] = header+1; - row = memset(row, 0, (width + 1) * sizeof(unsigned long long)); + row = memset(row, 0, (width) * sizeof(unsigned long long)); for(j = 0; j < width; j++) { line = gzgets(fh, line, 32); @@ -209,10 +206,9 @@ struct matrix *load_sensing_matrix(const char *filename) { exit(EXIT_FAILURE); } - sum += row[j]; } - for(j = 1; j < width+1; j++) { - matrix[i*(width+1) + j] = ((double)row[j-1]) / sum; + for(j = 0; j < width; j++) { + matrix[i*(width) + j] = ((double)row[j]); } } @@ -230,27 +226,3 @@ struct matrix *load_sensing_matrix(const char *filename) { return ret; } - -struct matrix *setup_sensing_matrix(char *filename, unsigned long long kmer, unsigned long long lambda, unsigned long long width) { - unsigned long long x = 0; - unsigned long long y = 0; - - struct matrix *sensing_matrix = load_sensing_matrix(filename); - if(sensing_matrix->kmer != kmer) { - fprintf(stderr, "The sensing_matrix was trained with a different kmer than your requested kmer\n"); - exit(EXIT_FAILURE); - } - - // multiply our sensing matrix by lambda - for(x = 1; x < sensing_matrix->sequences; x++) { - for(y = 0; y < width - 1; y++) { - sensing_matrix->matrix[width*x + y] = sensing_matrix->matrix[width*x + y] * lambda; - } - } - - // set the first column of our sensing matrix to 0 - for(x = 0; x < sensing_matrix->sequences; x++) { - sensing_matrix->matrix[width * x] = 1.0; - } - return sensing_matrix; -} diff --git a/src/c/quikr_functions.h b/src/c/quikr_functions.h index 35e88ca..a5d0601 100644 --- a/src/c/quikr_functions.h +++ b/src/c/quikr_functions.h @@ -1,3 +1,5 @@ +// our malloc checker +void check_malloc(void *ptr, char *error); // count the sequences in a fasta file unsigned long long count_sequences(const char *filename); @@ -5,10 +7,7 @@ unsigned long long count_sequences(const char *filename); void normalize_matrix(double *matrix, int height, int width); // load a sensing matrix -struct matrix *load_sensing_matrix(const char *filename); +struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kmer); // add header and normalize count matrix double *setup_count_matrix(char *filename, unsigned long long kmer, unsigned long long lambda, unsigned long long width); - -// add header and normalize sensing matrix -struct matrix *setup_sensing_matrix(char *filename, unsigned long long kmer, unsigned long long lambda, unsigned long long width); |