aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2013-10-31 15:42:05 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2013-10-31 15:42:05 -0400
commit8b543ec38690a7a41297fda56335b7f15cb0be8d (patch)
treec8e325566ba89568ab41d242a4d1c96c5bb4f441
parent87cf0208484b33d3194add9570b936998b44e9a3 (diff)
finish up removing OCaml Code and general code refactoring
-rw-r--r--src/c/multifasta_to_otu.c35
-rw-r--r--src/c/quikr.c39
-rw-r--r--src/c/quikr_functions.c155
-rw-r--r--src/c/quikr_functions.h10
4 files changed, 90 insertions, 149 deletions
diff --git a/src/c/multifasta_to_otu.c b/src/c/multifasta_to_otu.c
index 72d13b5..fb64981 100644
--- a/src/c/multifasta_to_otu.c
+++ b/src/c/multifasta_to_otu.c
@@ -20,7 +20,7 @@
#define sensing_matrix(i,j) (sensing_matrix[width*i + j])
#define solutions(i,j) (solutions[sequences*i+ j])
-#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - create a QIIME OTU table based on Quikr results. \n\nOptions:\n\n-i, --input-directory\n\tthe directory containing the samples' fasta files of reads (note each file should correspond to a separate sample)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (sensing from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-j, --jobs\n\t specifies how many jobs to run at once. (default value is the number of CPUs)\n\n-o, --output\n\tthe OTU table, with NUM_READS_PRESENT for each sample which is compatible with QIIME's convert_biom.py (or a sequence table if not OTU's)\n\n-v, --verbose\n\tverbose mode.\n\n-V, --version\n\tprint version."
+#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - create a QIIME OTU table based on Quikr results. \n\nOptions:\n\n-i, --input-directory\n\tthe directory containing the samples' fasta files of reads (note each file should correspond to a separate sample)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (sensing from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-j, --jobs\n\t specifies how many jobs to run at once. (default value is the number of CPUs)\n\n-o, --output\n\tthe OTU table, with NUM_READS_PRESENT for each sample which is compatible with QIIME's convert_biom.py (or a sequence table if not OTU's)\n\n-v, --verbose\n\tverbose mode.\n\n-V, --version\n\tprint version."
int main(int argc, char **argv) {
@@ -173,15 +173,15 @@ int main(int argc, char **argv) {
}
// multiply our matrix by lambda
- for(x = 0; x < sensing_matrix->sequences; x++) {
- for(y= 0; y < width; y++) {
- sensing_matrix->matrix[x*width + y] *= lambda;
+ for(x = 1; x < sensing_matrix->sequences; x++) {
+ for(y = 0; y < width - 1; y++) {
+ sensing_matrix->matrix[width*x + y] = sensing_matrix->matrix[width*x + y] * lambda;
}
}
// set the first row to be all 1's
for(x = 0; x < sensing_matrix->sequences; x++) {
- sensing_matrix->matrix[x*width] = 1.0;
+ sensing_matrix->matrix[width * x] = 1.0;
}
double *solutions = malloc(dir_count * sensing_matrix->sequences * sizeof(double));
@@ -196,7 +196,7 @@ int main(int argc, char **argv) {
exit(EXIT_FAILURE);
}
- long *file_sequence_count = malloc(dir_count * sizeof(long));
+ long *file_sequence_count = calloc(dir_count, sizeof(long));
if(file_sequence_count == NULL) {
fprintf(stderr, "Could not allocate enough memory\n");
exit(EXIT_FAILURE);
@@ -237,27 +237,8 @@ int main(int argc, char **argv) {
// get individual sequence count
file_sequence_count[i] = count_sequences(filename);
- // count the kmer amounts, and convert it to a double array
- unsigned long long *integer_counts = get_kmer_counts_from_file(filename, kmer);
- double *count_matrix = malloc(sizeof(double) * width);
- if(count_matrix == NULL) {
- fprintf(stderr, "Could not allocate memory:\n");
- exit(EXIT_FAILURE);
- }
-
- count_matrix[0] = 0;
-
- for(x = 0; x < width - 1 ; x++)
- count_matrix[x+1] = (double)integer_counts[x];
-
- free(integer_counts);
-
- // normalize our kmer counts
- normalize_matrix(count_matrix, 1, width);
-
- // multiply our kmers frequency by lambda
- for(z = 0; z < width; z++)
- count_matrix[z] = count_matrix[z] * lambda;
+ // get our count matrix
+ double *count_matrix = setup_count_matrix(filename, kmer, lambda, width);
double *sensing_matrix_copy = malloc(sizeof(double) * sensing_matrix->sequences * width);
if(sensing_matrix_copy == NULL) {
diff --git a/src/c/quikr.c b/src/c/quikr.c
index a9b90e0..af6275e 100644
--- a/src/c/quikr.c
+++ b/src/c/quikr.c
@@ -24,7 +24,6 @@ int main(int argc, char **argv) {
char *output_filename = NULL;
unsigned long long x = 0;
- unsigned long long y = 0;
unsigned long long width = 0;
@@ -131,45 +130,11 @@ int main(int argc, char **argv) {
width = width + 1;
// load counts matrix
- unsigned long long *integer_counts = get_kmer_counts_from_file(input_fasta_filename, kmer);
- double *count_matrix = malloc(sizeof(double) * width);
- if(count_matrix == NULL) {
- fprintf(stderr, "Could not allocate memory:\n");
- exit(EXIT_FAILURE);
- }
-
- count_matrix[0] = 0;
-
- for(x = 1; x < width; x++)
- count_matrix[x] = (double)integer_counts[x-1];
-
- free(integer_counts);
-
- // normalize our count_matrix
- normalize_matrix(count_matrix, 1, width);
-
- for(x = 0; x < width; x++)
- count_matrix[x] = count_matrix[x] * lambda;
+ double *count_matrix = setup_count_matrix(input_fasta_filename, kmer, lambda, width);
// load sensing matrix
- struct matrix *sensing_matrix = load_sensing_matrix(sensing_matrix_filename);
- if(sensing_matrix->kmer != kmer) {
- fprintf(stderr, "The sensing_matrix was trained with a different kmer than your requested kmer\n");
- exit(EXIT_FAILURE);
- }
-
- // multiply our sensing matrix by lambda
- for(x = 1; x < sensing_matrix->sequences; x++) {
- for(y = 0; y < width - 1; y++) {
- sensing_matrix->matrix[width*x + y] = sensing_matrix->matrix[width*x + y] * lambda;
- }
- }
+ struct matrix *sensing_matrix = setup_sensing_matrix(input_fasta_filename, kmer, lambda, width);
- // set the first column of our sensing matrix to 0
- for(x = 0; x < sensing_matrix->sequences; x++) {
- sensing_matrix->matrix[width * x] = 1.0;
- }
-
// run NNLS
double *solution = nnls(sensing_matrix->matrix, count_matrix, sensing_matrix->sequences, width);
diff --git a/src/c/quikr_functions.c b/src/c/quikr_functions.c
index fab05ab..3697aa2 100644
--- a/src/c/quikr_functions.c
+++ b/src/c/quikr_functions.c
@@ -7,30 +7,30 @@
#include <unistd.h>
#include <zlib.h>
+#include "kmer_utils.h"
#include "quikr.h"
-unsigned long long count_sequences(const char *filename) {
- char *line = NULL;
- size_t len = 0;
- ssize_t read;
+void debug_arrays(double *count_matrix, struct matrix *sensing_matrix) {
+ FILE *count_fh = fopen("count.mat", "w");
+ FILE *sensing_fh = fopen("sensing.mat", "w");
- unsigned long long sequences = 0;
+ unsigned long long width = pow_four(sensing_matrix->kmer);
+ unsigned long long i = 0;
+ unsigned long long j = 0;
- FILE *fh = fopen(filename, "r");
- if(fh == NULL) {
- fprintf(stderr, "could not open \"%s\"\n", filename );
- return 0;
- }
+ for(i = 0; i < sensing_matrix->sequences; i++) {
+ for(j = 0; j < width - 1; j++)
+ fprintf(sensing_fh, "%lf\t", sensing_matrix->matrix[width*i + j]);
+ fprintf(sensing_fh, "%lf\n", sensing_matrix->matrix[width*i + width-1]);
+ }
- while ((read = getline(&line, &len, fh)) != -1) {
- if(line[0] == '>')
- sequences++;
- }
+ fclose(sensing_fh);
- free(line);
- fclose(fh);
+ for(j = 0; j < width - 1; j++)
+ fprintf(count_fh, "%lf\t", count_matrix[j]);
+ fprintf(count_fh, "%lf\n", count_matrix[width - 1]);
- return sequences;
+ fclose(count_fh);
}
@@ -49,45 +49,56 @@ void normalize_matrix(double *matrix, unsigned long long height, unsigned long l
}
}
+double *setup_count_matrix(char *filename, unsigned long long kmer, unsigned long long lambda, unsigned long long width) {
-double *load_count_matrix(const char *filename, const unsigned long long width, const unsigned int kmer) {
+ unsigned long long x = 0;
+ unsigned long long *integer_counts = get_kmer_counts_from_file(filename, kmer);
+ double *count_matrix = malloc(width * sizeof(double));
+ if(count_matrix == NULL) {
+ fprintf(stderr, "Could not allocate memory:\n");
+ exit(EXIT_FAILURE);
+ }
- double *count_matrix = malloc(width*sizeof(double));
- char count_command[1024];
- unsigned long long x = 0;
+ count_matrix[0] = 0;
+
+ for(x = 1; x < width; x++) {
+ count_matrix[x] = (double)integer_counts[x-1];
+ }
+
+ free(integer_counts);
+
+ // normalize our kmer counts
+ normalize_matrix(count_matrix, 1, width);
+
+ // multiply our kmers frequency by lambda
+ for(x = 0; x < width; x++)
+ count_matrix[x] = count_matrix[x] * lambda;
+
+ return count_matrix;
+}
+
+unsigned long long count_sequences(const char *filename) {
char *line = NULL;
size_t len = 0;
+ ssize_t read;
- if(count_matrix == NULL) {
- fprintf(stderr, "could not allocate enough memory for the count matrix\n");
- exit(EXIT_FAILURE);
- }
+ unsigned long long sequences = 0;
- // create out count matrix
- sprintf(count_command, "count-kmers -r %d -1 -u %s", kmer, filename);
- FILE *count_output = popen(count_command, "r");
- if(count_output == NULL) {
- fprintf(stderr, "could not execute \"%s\"", count_command);
- exit(EXIT_FAILURE);
+ FILE *fh = fopen(filename, "r");
+ if(fh == NULL) {
+ fprintf(stderr, "could not open \"%s\"\n", filename );
+ return 0;
}
- // set first element to zero.
- count_matrix[0] = 0;
-
- // get our first line
- getline(&line, &len, count_output);
- count_matrix[1] = atoi(line);
-
- // iterate over the rest of the lines
- for(x = 2; x < width; x++) {
- getline(&line, &len, count_output);
- count_matrix[x] = atoi(line);
- }
+ while ((read = getline(&line, &len, fh)) != -1) {
+ if(line[0] == '>')
+ sequences++;
+ }
free(line);
- pclose(count_output);
+ fclose(fh);
- return count_matrix;
+ return sequences;
}
@@ -219,42 +230,26 @@ struct matrix *load_sensing_matrix(const char *filename) {
return ret;
}
+struct matrix *setup_sensing_matrix(char *filename, unsigned long long kmer, unsigned long long lambda, unsigned long long width) {
+ unsigned long long x = 0;
+ unsigned long long y = 0;
-char **load_headers(char *filename, long sequences) {
- char command[512];
- char *line= NULL;
- long x = 0;
- FILE *grep_output;
- size_t len = 0;
-
- sprintf(command, "grep ^\\> %s", filename);
- grep_output = popen(command, "r");
- if(grep_output == NULL) {
- fprintf(stderr, "Could not execute %s\n", command);
- exit(EXIT_FAILURE);
- }
-
- char **headers = malloc(sequences * sizeof(char *));
- if(headers == NULL) {
- fprintf(stderr, "could not allocated enough memory\n");
- exit(EXIT_FAILURE);
- }
-
- for(x = 0; x < sequences; x++) {
-
- char *header = malloc(256 * sizeof(char));
- if(header == NULL) {
- fprintf(stderr, "could not allocated enough memory\n");
- exit(EXIT_FAILURE);
- }
- getline(&line, &len, grep_output);
- sscanf(line + 1, "%s", header);
- headers[x] = header;
- }
+ struct matrix *sensing_matrix = load_sensing_matrix(filename);
+ if(sensing_matrix->kmer != kmer) {
+ fprintf(stderr, "The sensing_matrix was trained with a different kmer than your requested kmer\n");
+ exit(EXIT_FAILURE);
+ }
- free(line);
- pclose(grep_output);
+ // multiply our sensing matrix by lambda
+ for(x = 1; x < sensing_matrix->sequences; x++) {
+ for(y = 0; y < width - 1; y++) {
+ sensing_matrix->matrix[width*x + y] = sensing_matrix->matrix[width*x + y] * lambda;
+ }
+ }
- return headers;
+ // set the first column of our sensing matrix to 0
+ for(x = 0; x < sensing_matrix->sequences; x++) {
+ sensing_matrix->matrix[width * x] = 1.0;
+ }
+ return sensing_matrix;
}
-
diff --git a/src/c/quikr_functions.h b/src/c/quikr_functions.h
index 8e0a525..35e88ca 100644
--- a/src/c/quikr_functions.h
+++ b/src/c/quikr_functions.h
@@ -4,11 +4,11 @@ unsigned long long count_sequences(const char *filename);
// normalize a matrix by dividing each element by the sum of it's column
void normalize_matrix(double *matrix, int height, int width);
-// load a sensing matrix
+// load a sensing matrix
struct matrix *load_sensing_matrix(const char *filename);
-// load a count matrix
-double *load_count_matrix(char *filename, int width, int kmer);
+// add header and normalize count matrix
+double *setup_count_matrix(char *filename, unsigned long long kmer, unsigned long long lambda, unsigned long long width);
-// load headers
-char **load_headers(char *filename, int sequences);
+// add header and normalize sensing matrix
+struct matrix *setup_sensing_matrix(char *filename, unsigned long long kmer, unsigned long long lambda, unsigned long long width);