From 8b543ec38690a7a41297fda56335b7f15cb0be8d Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Thu, 31 Oct 2013 15:42:05 -0400 Subject: finish up removing OCaml Code and general code refactoring --- src/c/multifasta_to_otu.c | 35 +++-------- src/c/quikr.c | 39 +----------- src/c/quikr_functions.c | 155 ++++++++++++++++++++++------------------------ src/c/quikr_functions.h | 10 +-- 4 files changed, 90 insertions(+), 149 deletions(-) (limited to 'src') diff --git a/src/c/multifasta_to_otu.c b/src/c/multifasta_to_otu.c index 72d13b5..fb64981 100644 --- a/src/c/multifasta_to_otu.c +++ b/src/c/multifasta_to_otu.c @@ -20,7 +20,7 @@ #define sensing_matrix(i,j) (sensing_matrix[width*i + j]) #define solutions(i,j) (solutions[sequences*i+ j]) -#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - create a QIIME OTU table based on Quikr results. \n\nOptions:\n\n-i, --input-directory\n\tthe directory containing the samples' fasta files of reads (note each file should correspond to a separate sample)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (sensing from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-j, --jobs\n\t specifies how many jobs to run at once. (default value is the number of CPUs)\n\n-o, --output\n\tthe OTU table, with NUM_READS_PRESENT for each sample which is compatible with QIIME's convert_biom.py (or a sequence table if not OTU's)\n\n-v, --verbose\n\tverbose mode.\n\n-V, --version\n\tprint version." +#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - create a QIIME OTU table based on Quikr results. \n\nOptions:\n\n-i, --input-directory\n\tthe directory containing the samples' fasta files of reads (note each file should correspond to a separate sample)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (sensing from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-j, --jobs\n\t specifies how many jobs to run at once. (default value is the number of CPUs)\n\n-o, --output\n\tthe OTU table, with NUM_READS_PRESENT for each sample which is compatible with QIIME's convert_biom.py (or a sequence table if not OTU's)\n\n-v, --verbose\n\tverbose mode.\n\n-V, --version\n\tprint version." int main(int argc, char **argv) { @@ -173,15 +173,15 @@ int main(int argc, char **argv) { } // multiply our matrix by lambda - for(x = 0; x < sensing_matrix->sequences; x++) { - for(y= 0; y < width; y++) { - sensing_matrix->matrix[x*width + y] *= lambda; + for(x = 1; x < sensing_matrix->sequences; x++) { + for(y = 0; y < width - 1; y++) { + sensing_matrix->matrix[width*x + y] = sensing_matrix->matrix[width*x + y] * lambda; } } // set the first row to be all 1's for(x = 0; x < sensing_matrix->sequences; x++) { - sensing_matrix->matrix[x*width] = 1.0; + sensing_matrix->matrix[width * x] = 1.0; } double *solutions = malloc(dir_count * sensing_matrix->sequences * sizeof(double)); @@ -196,7 +196,7 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } - long *file_sequence_count = malloc(dir_count * sizeof(long)); + long *file_sequence_count = calloc(dir_count, sizeof(long)); if(file_sequence_count == NULL) { fprintf(stderr, "Could not allocate enough memory\n"); exit(EXIT_FAILURE); @@ -237,27 +237,8 @@ int main(int argc, char **argv) { // get individual sequence count file_sequence_count[i] = count_sequences(filename); - // count the kmer amounts, and convert it to a double array - unsigned long long *integer_counts = get_kmer_counts_from_file(filename, kmer); - double *count_matrix = malloc(sizeof(double) * width); - if(count_matrix == NULL) { - fprintf(stderr, "Could not allocate memory:\n"); - exit(EXIT_FAILURE); - } - - count_matrix[0] = 0; - - for(x = 0; x < width - 1 ; x++) - count_matrix[x+1] = (double)integer_counts[x]; - - free(integer_counts); - - // normalize our kmer counts - normalize_matrix(count_matrix, 1, width); - - // multiply our kmers frequency by lambda - for(z = 0; z < width; z++) - count_matrix[z] = count_matrix[z] * lambda; + // get our count matrix + double *count_matrix = setup_count_matrix(filename, kmer, lambda, width); double *sensing_matrix_copy = malloc(sizeof(double) * sensing_matrix->sequences * width); if(sensing_matrix_copy == NULL) { diff --git a/src/c/quikr.c b/src/c/quikr.c index a9b90e0..af6275e 100644 --- a/src/c/quikr.c +++ b/src/c/quikr.c @@ -24,7 +24,6 @@ int main(int argc, char **argv) { char *output_filename = NULL; unsigned long long x = 0; - unsigned long long y = 0; unsigned long long width = 0; @@ -131,45 +130,11 @@ int main(int argc, char **argv) { width = width + 1; // load counts matrix - unsigned long long *integer_counts = get_kmer_counts_from_file(input_fasta_filename, kmer); - double *count_matrix = malloc(sizeof(double) * width); - if(count_matrix == NULL) { - fprintf(stderr, "Could not allocate memory:\n"); - exit(EXIT_FAILURE); - } - - count_matrix[0] = 0; - - for(x = 1; x < width; x++) - count_matrix[x] = (double)integer_counts[x-1]; - - free(integer_counts); - - // normalize our count_matrix - normalize_matrix(count_matrix, 1, width); - - for(x = 0; x < width; x++) - count_matrix[x] = count_matrix[x] * lambda; + double *count_matrix = setup_count_matrix(input_fasta_filename, kmer, lambda, width); // load sensing matrix - struct matrix *sensing_matrix = load_sensing_matrix(sensing_matrix_filename); - if(sensing_matrix->kmer != kmer) { - fprintf(stderr, "The sensing_matrix was trained with a different kmer than your requested kmer\n"); - exit(EXIT_FAILURE); - } - - // multiply our sensing matrix by lambda - for(x = 1; x < sensing_matrix->sequences; x++) { - for(y = 0; y < width - 1; y++) { - sensing_matrix->matrix[width*x + y] = sensing_matrix->matrix[width*x + y] * lambda; - } - } + struct matrix *sensing_matrix = setup_sensing_matrix(input_fasta_filename, kmer, lambda, width); - // set the first column of our sensing matrix to 0 - for(x = 0; x < sensing_matrix->sequences; x++) { - sensing_matrix->matrix[width * x] = 1.0; - } - // run NNLS double *solution = nnls(sensing_matrix->matrix, count_matrix, sensing_matrix->sequences, width); diff --git a/src/c/quikr_functions.c b/src/c/quikr_functions.c index fab05ab..3697aa2 100644 --- a/src/c/quikr_functions.c +++ b/src/c/quikr_functions.c @@ -7,30 +7,30 @@ #include #include +#include "kmer_utils.h" #include "quikr.h" -unsigned long long count_sequences(const char *filename) { - char *line = NULL; - size_t len = 0; - ssize_t read; +void debug_arrays(double *count_matrix, struct matrix *sensing_matrix) { + FILE *count_fh = fopen("count.mat", "w"); + FILE *sensing_fh = fopen("sensing.mat", "w"); - unsigned long long sequences = 0; + unsigned long long width = pow_four(sensing_matrix->kmer); + unsigned long long i = 0; + unsigned long long j = 0; - FILE *fh = fopen(filename, "r"); - if(fh == NULL) { - fprintf(stderr, "could not open \"%s\"\n", filename ); - return 0; - } + for(i = 0; i < sensing_matrix->sequences; i++) { + for(j = 0; j < width - 1; j++) + fprintf(sensing_fh, "%lf\t", sensing_matrix->matrix[width*i + j]); + fprintf(sensing_fh, "%lf\n", sensing_matrix->matrix[width*i + width-1]); + } - while ((read = getline(&line, &len, fh)) != -1) { - if(line[0] == '>') - sequences++; - } + fclose(sensing_fh); - free(line); - fclose(fh); + for(j = 0; j < width - 1; j++) + fprintf(count_fh, "%lf\t", count_matrix[j]); + fprintf(count_fh, "%lf\n", count_matrix[width - 1]); - return sequences; + fclose(count_fh); } @@ -49,45 +49,56 @@ void normalize_matrix(double *matrix, unsigned long long height, unsigned long l } } +double *setup_count_matrix(char *filename, unsigned long long kmer, unsigned long long lambda, unsigned long long width) { -double *load_count_matrix(const char *filename, const unsigned long long width, const unsigned int kmer) { + unsigned long long x = 0; + unsigned long long *integer_counts = get_kmer_counts_from_file(filename, kmer); + double *count_matrix = malloc(width * sizeof(double)); + if(count_matrix == NULL) { + fprintf(stderr, "Could not allocate memory:\n"); + exit(EXIT_FAILURE); + } - double *count_matrix = malloc(width*sizeof(double)); - char count_command[1024]; - unsigned long long x = 0; + count_matrix[0] = 0; + + for(x = 1; x < width; x++) { + count_matrix[x] = (double)integer_counts[x-1]; + } + + free(integer_counts); + + // normalize our kmer counts + normalize_matrix(count_matrix, 1, width); + + // multiply our kmers frequency by lambda + for(x = 0; x < width; x++) + count_matrix[x] = count_matrix[x] * lambda; + + return count_matrix; +} + +unsigned long long count_sequences(const char *filename) { char *line = NULL; size_t len = 0; + ssize_t read; - if(count_matrix == NULL) { - fprintf(stderr, "could not allocate enough memory for the count matrix\n"); - exit(EXIT_FAILURE); - } + unsigned long long sequences = 0; - // create out count matrix - sprintf(count_command, "count-kmers -r %d -1 -u %s", kmer, filename); - FILE *count_output = popen(count_command, "r"); - if(count_output == NULL) { - fprintf(stderr, "could not execute \"%s\"", count_command); - exit(EXIT_FAILURE); + FILE *fh = fopen(filename, "r"); + if(fh == NULL) { + fprintf(stderr, "could not open \"%s\"\n", filename ); + return 0; } - // set first element to zero. - count_matrix[0] = 0; - - // get our first line - getline(&line, &len, count_output); - count_matrix[1] = atoi(line); - - // iterate over the rest of the lines - for(x = 2; x < width; x++) { - getline(&line, &len, count_output); - count_matrix[x] = atoi(line); - } + while ((read = getline(&line, &len, fh)) != -1) { + if(line[0] == '>') + sequences++; + } free(line); - pclose(count_output); + fclose(fh); - return count_matrix; + return sequences; } @@ -219,42 +230,26 @@ struct matrix *load_sensing_matrix(const char *filename) { return ret; } +struct matrix *setup_sensing_matrix(char *filename, unsigned long long kmer, unsigned long long lambda, unsigned long long width) { + unsigned long long x = 0; + unsigned long long y = 0; -char **load_headers(char *filename, long sequences) { - char command[512]; - char *line= NULL; - long x = 0; - FILE *grep_output; - size_t len = 0; - - sprintf(command, "grep ^\\> %s", filename); - grep_output = popen(command, "r"); - if(grep_output == NULL) { - fprintf(stderr, "Could not execute %s\n", command); - exit(EXIT_FAILURE); - } - - char **headers = malloc(sequences * sizeof(char *)); - if(headers == NULL) { - fprintf(stderr, "could not allocated enough memory\n"); - exit(EXIT_FAILURE); - } - - for(x = 0; x < sequences; x++) { - - char *header = malloc(256 * sizeof(char)); - if(header == NULL) { - fprintf(stderr, "could not allocated enough memory\n"); - exit(EXIT_FAILURE); - } - getline(&line, &len, grep_output); - sscanf(line + 1, "%s", header); - headers[x] = header; - } + struct matrix *sensing_matrix = load_sensing_matrix(filename); + if(sensing_matrix->kmer != kmer) { + fprintf(stderr, "The sensing_matrix was trained with a different kmer than your requested kmer\n"); + exit(EXIT_FAILURE); + } - free(line); - pclose(grep_output); + // multiply our sensing matrix by lambda + for(x = 1; x < sensing_matrix->sequences; x++) { + for(y = 0; y < width - 1; y++) { + sensing_matrix->matrix[width*x + y] = sensing_matrix->matrix[width*x + y] * lambda; + } + } - return headers; + // set the first column of our sensing matrix to 0 + for(x = 0; x < sensing_matrix->sequences; x++) { + sensing_matrix->matrix[width * x] = 1.0; + } + return sensing_matrix; } - diff --git a/src/c/quikr_functions.h b/src/c/quikr_functions.h index 8e0a525..35e88ca 100644 --- a/src/c/quikr_functions.h +++ b/src/c/quikr_functions.h @@ -4,11 +4,11 @@ unsigned long long count_sequences(const char *filename); // normalize a matrix by dividing each element by the sum of it's column void normalize_matrix(double *matrix, int height, int width); -// load a sensing matrix +// load a sensing matrix struct matrix *load_sensing_matrix(const char *filename); -// load a count matrix -double *load_count_matrix(char *filename, int width, int kmer); +// add header and normalize count matrix +double *setup_count_matrix(char *filename, unsigned long long kmer, unsigned long long lambda, unsigned long long width); -// load headers -char **load_headers(char *filename, int sequences); +// add header and normalize sensing matrix +struct matrix *setup_sensing_matrix(char *filename, unsigned long long kmer, unsigned long long lambda, unsigned long long width); -- cgit v1.2.3