From f0917e5c7d35275f810088b9f51536f36fed6969 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Wed, 19 Mar 2014 14:01:03 -0400 Subject: seperate function for getting a rare value --- src/c/multifasta_to_otu.c | 32 +++----------------------------- src/c/quikr.c | 46 +++++++++++++++++++++------------------------- src/c/quikr_functions.c | 39 +++++++++++++++++++++++++++++++++++++++ src/c/quikr_functions.h | 3 +++ 4 files changed, 66 insertions(+), 54 deletions(-) diff --git a/src/c/multifasta_to_otu.c b/src/c/multifasta_to_otu.c index 9a9ee88..1ae1802 100644 --- a/src/c/multifasta_to_otu.c +++ b/src/c/multifasta_to_otu.c @@ -21,10 +21,6 @@ #define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - create a QIIME OTU table based on Quikr results. \n\nOptions:\n\n-i, --input-directory\n\tthe directory containing the samples' fasta files of reads (note each file should correspond to a separate sample)\n\n-f, --input-filelist\n\ta file containing list of fasta files to process seperated by newline (same rules apply as input-directory)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (sensing from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-j, --jobs\n\t specifies how many jobs to run at once. (default value is the number of CPUs)\n\n-o, --output\n\tthe OTU table, with NUM_READS_PRESENT for each sample which is compatible with QIIME's convert_biom.py (or a sequence table if not OTU's)\n\n-v, --verbose\n\tverbose mode.\n\n-V, --version\n\tprint version." -static int cmp (const void * a, const void * b) { - return ( *(double*)a - *(double*)b ); -} - char **get_fasta_files_from_file(char *fn) { char **files; int files_count = 0; @@ -336,7 +332,7 @@ int main(int argc, char **argv) { printf("Beginning to process samples\n"); - #pragma omp parallel for shared(solutions, sensing_matrix_ptr, width, done, sequences) + #pragma omp parallel for shared(solutions, sensing_matrix_ptr, done) for(size_t i = 0; i < dir_count; i++ ) { size_t x = 0; @@ -356,8 +352,6 @@ int main(int argc, char **argv) { // load counts matrix double *count_matrix = malloc(width * sizeof(double)); check_malloc(count_matrix, NULL); - double *sorted_count_matrix = malloc(width * sizeof(double)); - check_malloc(sorted_count_matrix, NULL); // convert our matrix into doubles { @@ -365,33 +359,13 @@ int main(int argc, char **argv) { for(x = 0; x < width; x++) { count_matrix[x] = (double)integer_counts[x]; - sorted_count_matrix[x] = count_matrix[x]; } free(integer_counts); } - // sort our array - qsort(sorted_count_matrix, width, sizeof(double), cmp); - - // get our "rare" counts - for(y = 0; y < width; y++) { - double percentage = 0; - - rare_value = sorted_count_matrix[y]; - rare_width = 0; - for(x = 0; x < width; x++) { - if(count_matrix[x] <= rare_value) { - rare_width++; - } - } - percentage = (double)rare_width / (double)width; - - if(percentage >= rare_percent) - break; - } - - free(sorted_count_matrix); + // get_rare_value + get_rare_value(count_matrix, width, rare_percent, &rare_value, &rare_width); if(verbose) printf("there are %llu values less than %llu\n", rare_width, rare_value); diff --git a/src/c/quikr.c b/src/c/quikr.c index 0e15c5b..fd1fbe5 100644 --- a/src/c/quikr.c +++ b/src/c/quikr.c @@ -11,7 +11,6 @@ #include "quikr_functions.h" #include "quikr.h" -#define sensing_matrix(i,j) (sensing_matrix[width*i + j]) #define USAGE "Usage:\n\tquikr [OPTION...] - Calculate estimated frequencies of bacteria in a sample.\n\nOptions:\n\n-i, --input\n\tthe sample's fasta file of NGS READS (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (trained from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-o, --output\n\tOTU_FRACTION_PRESENT a vector representing the percentage of database sequence's presence in sample. (csv output)\n\n-v, --verbose\n\tverbose mode.\n\n-V, --version\n\tprint version." int main(int argc, char **argv) { @@ -29,7 +28,6 @@ int main(int argc, char **argv) { unsigned long long rare_value = 0; unsigned long long rare_width = 0; - double percentage = 1.0; double rare_percent = 1.0; unsigned long long width = 0; @@ -116,7 +114,6 @@ int main(int argc, char **argv) { } if(verbose) { - printf("rare width:%llu\n", rare_width); printf("kmer: %u\n", kmer); printf("lambda: %llu\n", lambda); printf("fasta: %s\n", input_fasta_filename); @@ -142,53 +139,51 @@ int main(int argc, char **argv) { // 4 "ACGT" ^ Kmer gives us the size of output rows width = pow_four(kmer); + // load sensing matrix + struct matrix *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, kmer); + + if(verbose) { + printf("width: %llu\n", width); + printf("sequences: %llu\n", sensing_matrix->sequences); + } + + + // load counts matrix double *count_matrix = malloc(width * sizeof(double)); check_malloc(count_matrix, NULL); + // convert our matrix into doubles { unsigned long long *integer_counts = get_kmer_counts_from_file(input_fasta_filename, kmer); - for(x = 0; x < width; x++) + for(x = 0; x < width; x++) { count_matrix[x] = (double)integer_counts[x]; + } free(integer_counts); } - // load sensing matrix - struct matrix *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, kmer); - - // get our "rare" counts - while(1) { - rare_width = 0; - for(x = 0; x < width; x++) { - if(count_matrix[x] < rare_value) { - rare_width++; - } - } - percentage = (float)rare_width / (float)width; - - if(percentage >= rare_percent) - break; - - rare_value++; - } + // get_rare_value + get_rare_value(count_matrix, width, rare_percent, &rare_value, &rare_width); if(verbose) printf("there are %llu values less than %llu\n", rare_width, rare_value); - // add a extra space for our zero's array + // add a extra space for our zero's array, so we can set the first column to 1's rare_width++; // store our count matrix double *count_matrix_rare = calloc(rare_width, sizeof(double)); check_malloc(count_matrix_rare, NULL); - double *sensing_matrix_rare = malloc((rare_width) * sensing_matrix->sequences * sizeof(double)); + double *sensing_matrix_rare = calloc(rare_width * sensing_matrix->sequences, sizeof(double)); check_malloc(sensing_matrix_rare, NULL); // copy only kmers from our original counts that match our rareness percentage // in both our count matrix and our sensing matrix + // + // y = 1 because we are offsetting the arrah by 1, so we can set the first row to all 1's for(x = 0, y = 1; x < width; x++) { if(count_matrix[x] <= rare_value) { count_matrix_rare[y] = count_matrix[x]; @@ -200,6 +195,7 @@ int main(int argc, char **argv) { } } + // normalize our kmer counts and our sensing_matrix normalize_matrix(count_matrix_rare, 1, rare_width); normalize_matrix(sensing_matrix_rare, sensing_matrix->sequences, rare_width); @@ -208,6 +204,7 @@ int main(int argc, char **argv) { for(x = 1; x < rare_width; x++) count_matrix_rare[x] *= lambda; + //TODO use one loop for(x = 0; x < sensing_matrix->sequences; x++) { for(y = 1; y < rare_width; y++) { sensing_matrix_rare[rare_width*x + y] *= lambda; @@ -221,7 +218,6 @@ int main(int argc, char **argv) { sensing_matrix_rare[x*rare_width] = 1.0; } - // run NNLS double *solution = nnls(sensing_matrix_rare, count_matrix_rare, sensing_matrix->sequences, rare_width); // normalize our solution vector diff --git a/src/c/quikr_functions.c b/src/c/quikr_functions.c index 061d3b8..5bbe913 100644 --- a/src/c/quikr_functions.c +++ b/src/c/quikr_functions.c @@ -10,6 +10,7 @@ #include "kmer_utils.h" #include "quikr.h" + void check_malloc(void *ptr, char *error) { if (ptr == NULL) { if(error != NULL) { @@ -20,6 +21,44 @@ void check_malloc(void *ptr, char *error) { exit(EXIT_FAILURE); } } +static int double_cmp (const void * a, const void * b) { + return ( *(double*)a - *(double*)b ); +} + +void get_rare_value(double *count_matrix, unsigned long long width, double rare_percent, unsigned long long *ret_rare_value, unsigned long long *ret_rare_width) { + size_t y, x; + unsigned long long rare_width = 0; + double rare_value = 0; + + // allocate * sort a temporary matrix + double *sorted_count_matrix = malloc(width * sizeof(double)); + check_malloc(sorted_count_matrix, NULL); + + memcpy(sorted_count_matrix, count_matrix, width * sizeof(double)); + + qsort(sorted_count_matrix, width, sizeof(double), double_cmp); + + // get our "rare" counts + for(y = 0; y < width; y++) { + double percentage = 0; + + rare_value = sorted_count_matrix[y]; + rare_width = 0; + for(x = 0; x < width; x++) { + if(count_matrix[x] <= rare_value) { + rare_width++; + } + } + percentage = (double)rare_width / (double)width; + + if(percentage >= rare_percent) + break; + } + + free(sorted_count_matrix); + *ret_rare_width = rare_width; + *ret_rare_value = rare_value; +} void debug_arrays(double *count_matrix, struct matrix *sensing_matrix) { FILE *count_fh = fopen("count.mat", "w"); diff --git a/src/c/quikr_functions.h b/src/c/quikr_functions.h index 369d68c..978f614 100644 --- a/src/c/quikr_functions.h +++ b/src/c/quikr_functions.h @@ -9,3 +9,6 @@ void normalize_matrix(double *matrix, int height, int width); // load a sensing matrix struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kmer); + +// get_rare_value +void get_rare_value(double *count_matrix, unsigned long long width, double rare_percent, unsigned long long *ret_rare_value, unsigned long long *ret_rare_width); -- cgit v1.2.3