aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2013-11-07 15:38:00 -0500
committerCalvin Morrison <mutantturkey@gmail.com>2013-11-07 15:38:00 -0500
commitd7ed90dc545e54a09ef206634bb6042e5893c04b (patch)
treef83ee7da7816daa642c1f5e73ae4f74e4ce77fed
parent6a31527ac8a94d12732dda60dbbf0069e5cc1490 (diff)
update to work with kmer rarity
-rw-r--r--src/c/quikr.c131
-rw-r--r--src/c/quikr_functions.c82
-rw-r--r--src/c/quikr_functions.h7
3 files changed, 150 insertions, 70 deletions
diff --git a/src/c/quikr.c b/src/c/quikr.c
index 9fc0759..1b8daa3 100644
--- a/src/c/quikr.c
+++ b/src/c/quikr.c
@@ -1,4 +1,3 @@
-#include <ctype.h>
#include <errno.h>
#include <getopt.h>
#include <math.h>
@@ -24,6 +23,14 @@ int main(int argc, char **argv) {
char *output_filename = NULL;
unsigned long long x = 0;
+ unsigned long long y = 0;
+ unsigned long long z = 0;
+
+ unsigned long long rare_value = 0;
+ unsigned long long rare_width = 0;
+
+ double percentage = 1.0;
+ double rare_percent = 1.0;
unsigned long long width = 0;
@@ -39,6 +46,7 @@ int main(int argc, char **argv) {
{"lambda", required_argument, 0, 'l'},
{"output", required_argument, 0, 'o'},
{"sensing-matrix", required_argument, 0, 's'},
+ {"rare-percent", required_argument, 0, 'r'},
{"verbose", no_argument, 0, 'v'},
{"version", no_argument, 0, 'V'},
{"help", no_argument, 0, 'h'},
@@ -48,7 +56,7 @@ int main(int argc, char **argv) {
int option_index = 0;
- c = getopt_long (argc, argv, "k:l:s:i:o:hdvV", long_options, &option_index);
+ c = getopt_long (argc, argv, "k:l:s:r:i:o:r:hdvV", long_options, &option_index);
if (c == -1)
break;
@@ -60,6 +68,9 @@ int main(int argc, char **argv) {
case 'l':
lambda = atoi(optarg);
break;
+ case 'r':
+ rare_percent = atof(optarg);
+ break;
case 's':
sensing_matrix_filename = optarg;
break;
@@ -75,17 +86,14 @@ int main(int argc, char **argv) {
case 'V':
printf("%s\n", VERSION);
exit(EXIT_SUCCESS);
- break;
case 'h':
printf("%s\n", USAGE);
exit(EXIT_SUCCESS);
- break;
default:
break;
}
}
-
if(sensing_matrix_filename == NULL) {
fprintf(stderr, "Error: sensing matrix filename (-s) must be specified\n\n");
fprintf(stderr, "%s\n", USAGE);
@@ -102,7 +110,8 @@ int main(int argc, char **argv) {
exit(EXIT_FAILURE);
}
- if(verbose) {
+ if(verbose) {
+ printf("rare width:%ld\n", rare_width);
printf("kmer: %u\n", kmer);
printf("lambda: %llu\n", lambda);
printf("fasta: %s\n", input_fasta_filename);
@@ -124,18 +133,109 @@ int main(int argc, char **argv) {
exit(EXIT_FAILURE);
}
+
// 4 "ACGT" ^ Kmer gives us the size of output rows
- width = pow(4, kmer);
- width = width + 1;
+ width = pow_four(kmer);
// load counts matrix
- double *count_matrix = setup_count_matrix(input_fasta_filename, kmer, lambda, width);
+ double *count_matrix = malloc(width * sizeof(double));
+ check_malloc(count_matrix, NULL);
+
+ {
+ unsigned long long *integer_counts = get_kmer_counts_from_file(input_fasta_filename, kmer);
+
+ for(x = 0; x < width; x++)
+ count_matrix[x] = (double)integer_counts[x];
+
+ free(integer_counts);
+ }
// load sensing matrix
- struct matrix *sensing_matrix = setup_sensing_matrix(sensing_matrix_filename, kmer, lambda, width);
+ struct matrix *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, kmer);
+
+
+ // get our "rare" counts
+ while(1) {
+ rare_width = 0;
+ for(x = 0; x < width; x++) {
+ if(count_matrix[x] < rare_value) {
+ rare_width++;
+ }
+ }
+ percentage = (float)rare_width / (float)width;
+
+ if(percentage >= rare_percent)
+ break;
+
+ rare_value++;
+ }
+
+ if(verbose)
+ printf("there are %llu values less than %llu\n", rare_width, rare_value);
+
+ // add a extra space for our zero's array
+ rare_width++;
+
+ // store our count matrix
+ double *count_matrix_rare = calloc(rare_width, sizeof(double));
+ check_malloc(count_matrix_rare, NULL);
+
+ double *sensing_matrix_rare = malloc((rare_width) * sensing_matrix->sequences * sizeof(double));
+ check_malloc(sensing_matrix_rare, NULL);
+
+ // copy only kmers from our original counts that match our rareness percentage
+ // in both our count matrix and our sensing matrix
+ for(x = 0, y = 1; x < width; x++) {
+ if(count_matrix[x] < rare_value) {
+ count_matrix_rare[y] = count_matrix[x];
+
+ for(z = 0; z < sensing_matrix->sequences; z++)
+ sensing_matrix_rare[z*rare_width + y] = sensing_matrix->matrix[z*width + x];
+
+ y++;
+ }
+ }
+
+ // normalize our kmer counts and our sensing_matrix
+ normalize_matrix(count_matrix_rare, 1, rare_width);
+ normalize_matrix(sensing_matrix_rare, sensing_matrix->sequences, rare_width);
+
+ // multiply our kmer counts and sensing matrix by lambda
+ for(x = 1; x < rare_width; x++)
+ count_matrix_rare[x] *= lambda;
+
+ for(x = 0; x < sensing_matrix->sequences; x++) {
+ for(y = 1; y < rare_width; y++) {
+ sensing_matrix_rare[rare_width*x + y] *= lambda;
+ }
+ }
+
+ // count_matrix's first element should be zero
+ count_matrix_rare[0] = 0;
+ // stack one's on our first row of our sensing matrix
+ for(x = 0; x < sensing_matrix->sequences; x++) {
+ sensing_matrix_rare[x*rare_width] = 1.0;
+ }
+
+ // DEBUG OUR MATRIX
+ if(verbose) {
+ printf("COUNT_MATRIX\n");
+ for(x = 0; x < rare_width; x++)
+ printf("%lf\t", count_matrix_rare[x]);
+ printf("\n");
+
+ printf("SENSING_MATRIX\n");
+ for(y = 0; y < sensing_matrix->sequences - 1; y++) {
+ for(x = 0; x < rare_width; x++) {
+ printf("%lf\t", sensing_matrix_rare[x*rare_width + y]);
+ }
+ printf("%lf\n", sensing_matrix_rare[x*rare_width + (rare_width - 1)]);
+ }
+
+ }
// run NNLS
- double *solution = nnls(sensing_matrix->matrix, count_matrix, sensing_matrix->sequences, width);
+ double *solution = nnls(sensing_matrix_rare, count_matrix_rare, sensing_matrix->sequences, rare_width);
// normalize our solution vector
normalize_matrix(solution, 1, sensing_matrix->sequences);
@@ -149,7 +249,16 @@ int main(int argc, char **argv) {
for(x = 0; x < sensing_matrix->sequences; x++) {
fprintf(output_fh, "%.10lf\n", solution[x]);
}
+
fclose(output_fh);
+ free(solution);
+
+ free(count_matrix);
+ free(sensing_matrix);
+
+ free(count_matrix_rare);
+ free(sensing_matrix_rare);
+
return EXIT_SUCCESS;
}
diff --git a/src/c/quikr_functions.c b/src/c/quikr_functions.c
index 2bd3065..a6366be 100644
--- a/src/c/quikr_functions.c
+++ b/src/c/quikr_functions.c
@@ -10,6 +10,17 @@
#include "kmer_utils.h"
#include "quikr.h"
+void check_malloc(void *ptr, char *error) {
+ if (ptr == NULL) {
+ if(error != NULL) {
+ fprintf(stderr,"Error: %s\n", error);
+ }
+ else { fprintf(stderr, "Error: Could not allocate enough memory - %s\n", strerror(errno));
+ }
+ exit(EXIT_FAILURE);
+ }
+}
+
void debug_arrays(double *count_matrix, struct matrix *sensing_matrix) {
FILE *count_fh = fopen("count.mat", "w");
@@ -55,10 +66,7 @@ double *setup_count_matrix(char *filename, unsigned long long kmer, unsigned lon
unsigned long long x = 0;
unsigned long long *integer_counts = get_kmer_counts_from_file(filename, kmer);
double *count_matrix = malloc(width * sizeof(double));
- if(count_matrix == NULL) {
- fprintf(stderr, "Could not allocate memory:\n");
- exit(EXIT_FAILURE);
- }
+ check_malloc(count_matrix, NULL);
count_matrix[0] = 0;
@@ -68,13 +76,6 @@ double *setup_count_matrix(char *filename, unsigned long long kmer, unsigned lon
free(integer_counts);
- // normalize our kmer counts
- normalize_matrix(count_matrix, 1, width);
-
- // multiply our kmers frequency by lambda
- for(x = 0; x < width; x++)
- count_matrix[x] = count_matrix[x] * lambda;
-
return count_matrix;
}
@@ -103,7 +104,7 @@ unsigned long long count_sequences(const char *filename) {
}
-struct matrix *load_sensing_matrix(const char *filename) {
+struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kmer) {
char *line = NULL;
char **headers = NULL;
@@ -128,8 +129,7 @@ struct matrix *load_sensing_matrix(const char *filename) {
}
line = malloc(1024 * sizeof(char));
- if(line == NULL)
- exit(EXIT_FAILURE);
+ check_malloc(line, NULL);
// Check for quikr
line = gzgets(fh, line, 1024);
@@ -160,31 +160,28 @@ struct matrix *load_sensing_matrix(const char *filename) {
fprintf(stderr, "Error parsing sensing matrix, kmer is zero\n");
exit(EXIT_FAILURE);
}
+ if(kmer != target_kmer) {
+ fprintf(stderr, "The sensing_matrix was trained with a different kmer than your requested kmer\n");
+ exit(EXIT_FAILURE);
+ }
width = pow_four(kmer);
// allocate a +1 size for the extra row
- matrix = malloc(sequences * (width + 1) * sizeof(double));
- if(matrix == NULL) {
- fprintf(stderr, "Could not allocate memory for the sensing matrix\n");
- }
+ matrix = malloc(sequences * (width) * sizeof(double));
+ check_malloc(matrix, NULL);
- row = malloc(width * sizeof(unsigned long long));
- if(row == NULL) {
- fprintf(stderr, "Could not allocate memory for parsing row\n");
- }
+ row = malloc((width) * sizeof(unsigned long long));
+ check_malloc(row, NULL);
headers = malloc(sequences * sizeof(char *));
- if(headers == NULL) {
- fprintf(stderr, "could not allocate enough memory for header pointers\n");
- exit(EXIT_FAILURE);
- }
+ check_malloc(headers, NULL);
for(i = 0; i < sequences; i++) {
- unsigned long long sum = 0;
unsigned long long j = 0;
// get header and add it to headers array
char *header = malloc(256 * sizeof(char));
+ check_malloc(header, NULL);
gzgets(fh, header, 256);
if(header[0] != '>') {
fprintf(stderr, "Error parsing sensing matrix, could not read header\n");
@@ -194,7 +191,7 @@ struct matrix *load_sensing_matrix(const char *filename) {
header[strlen(header) - 1] = '\0';
headers[i] = header+1;
- row = memset(row, 0, (width + 1) * sizeof(unsigned long long));
+ row = memset(row, 0, (width) * sizeof(unsigned long long));
for(j = 0; j < width; j++) {
line = gzgets(fh, line, 32);
@@ -209,10 +206,9 @@ struct matrix *load_sensing_matrix(const char *filename) {
exit(EXIT_FAILURE);
}
- sum += row[j];
}
- for(j = 1; j < width+1; j++) {
- matrix[i*(width+1) + j] = ((double)row[j-1]) / sum;
+ for(j = 0; j < width; j++) {
+ matrix[i*(width) + j] = ((double)row[j]);
}
}
@@ -230,27 +226,3 @@ struct matrix *load_sensing_matrix(const char *filename) {
return ret;
}
-
-struct matrix *setup_sensing_matrix(char *filename, unsigned long long kmer, unsigned long long lambda, unsigned long long width) {
- unsigned long long x = 0;
- unsigned long long y = 0;
-
- struct matrix *sensing_matrix = load_sensing_matrix(filename);
- if(sensing_matrix->kmer != kmer) {
- fprintf(stderr, "The sensing_matrix was trained with a different kmer than your requested kmer\n");
- exit(EXIT_FAILURE);
- }
-
- // multiply our sensing matrix by lambda
- for(x = 1; x < sensing_matrix->sequences; x++) {
- for(y = 0; y < width - 1; y++) {
- sensing_matrix->matrix[width*x + y] = sensing_matrix->matrix[width*x + y] * lambda;
- }
- }
-
- // set the first column of our sensing matrix to 0
- for(x = 0; x < sensing_matrix->sequences; x++) {
- sensing_matrix->matrix[width * x] = 1.0;
- }
- return sensing_matrix;
-}
diff --git a/src/c/quikr_functions.h b/src/c/quikr_functions.h
index 35e88ca..a5d0601 100644
--- a/src/c/quikr_functions.h
+++ b/src/c/quikr_functions.h
@@ -1,3 +1,5 @@
+// our malloc checker
+void check_malloc(void *ptr, char *error);
// count the sequences in a fasta file
unsigned long long count_sequences(const char *filename);
@@ -5,10 +7,7 @@ unsigned long long count_sequences(const char *filename);
void normalize_matrix(double *matrix, int height, int width);
// load a sensing matrix
-struct matrix *load_sensing_matrix(const char *filename);
+struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kmer);
// add header and normalize count matrix
double *setup_count_matrix(char *filename, unsigned long long kmer, unsigned long long lambda, unsigned long long width);
-
-// add header and normalize sensing matrix
-struct matrix *setup_sensing_matrix(char *filename, unsigned long long kmer, unsigned long long lambda, unsigned long long width);