aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-03-19 14:01:03 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2014-03-19 14:01:03 -0400
commitf0917e5c7d35275f810088b9f51536f36fed6969 (patch)
tree25befe379e596ab758bb606180c2dbc47967392a
parent603a90c1d0d22acaeeac4c8fb6bdad730f4591f3 (diff)
seperate function for getting a rare value
-rw-r--r--src/c/multifasta_to_otu.c32
-rw-r--r--src/c/quikr.c46
-rw-r--r--src/c/quikr_functions.c39
-rw-r--r--src/c/quikr_functions.h3
4 files changed, 66 insertions, 54 deletions
diff --git a/src/c/multifasta_to_otu.c b/src/c/multifasta_to_otu.c
index 9a9ee88..1ae1802 100644
--- a/src/c/multifasta_to_otu.c
+++ b/src/c/multifasta_to_otu.c
@@ -21,10 +21,6 @@
#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - create a QIIME OTU table based on Quikr results. \n\nOptions:\n\n-i, --input-directory\n\tthe directory containing the samples' fasta files of reads (note each file should correspond to a separate sample)\n\n-f, --input-filelist\n\ta file containing list of fasta files to process seperated by newline (same rules apply as input-directory)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (sensing from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-j, --jobs\n\t specifies how many jobs to run at once. (default value is the number of CPUs)\n\n-o, --output\n\tthe OTU table, with NUM_READS_PRESENT for each sample which is compatible with QIIME's convert_biom.py (or a sequence table if not OTU's)\n\n-v, --verbose\n\tverbose mode.\n\n-V, --version\n\tprint version."
-static int cmp (const void * a, const void * b) {
- return ( *(double*)a - *(double*)b );
-}
-
char **get_fasta_files_from_file(char *fn) {
char **files;
int files_count = 0;
@@ -336,7 +332,7 @@ int main(int argc, char **argv) {
printf("Beginning to process samples\n");
- #pragma omp parallel for shared(solutions, sensing_matrix_ptr, width, done, sequences)
+ #pragma omp parallel for shared(solutions, sensing_matrix_ptr, done)
for(size_t i = 0; i < dir_count; i++ ) {
size_t x = 0;
@@ -356,8 +352,6 @@ int main(int argc, char **argv) {
// load counts matrix
double *count_matrix = malloc(width * sizeof(double));
check_malloc(count_matrix, NULL);
- double *sorted_count_matrix = malloc(width * sizeof(double));
- check_malloc(sorted_count_matrix, NULL);
// convert our matrix into doubles
{
@@ -365,33 +359,13 @@ int main(int argc, char **argv) {
for(x = 0; x < width; x++) {
count_matrix[x] = (double)integer_counts[x];
- sorted_count_matrix[x] = count_matrix[x];
}
free(integer_counts);
}
- // sort our array
- qsort(sorted_count_matrix, width, sizeof(double), cmp);
-
- // get our "rare" counts
- for(y = 0; y < width; y++) {
- double percentage = 0;
-
- rare_value = sorted_count_matrix[y];
- rare_width = 0;
- for(x = 0; x < width; x++) {
- if(count_matrix[x] <= rare_value) {
- rare_width++;
- }
- }
- percentage = (double)rare_width / (double)width;
-
- if(percentage >= rare_percent)
- break;
- }
-
- free(sorted_count_matrix);
+ // get_rare_value
+ get_rare_value(count_matrix, width, rare_percent, &rare_value, &rare_width);
if(verbose)
printf("there are %llu values less than %llu\n", rare_width, rare_value);
diff --git a/src/c/quikr.c b/src/c/quikr.c
index 0e15c5b..fd1fbe5 100644
--- a/src/c/quikr.c
+++ b/src/c/quikr.c
@@ -11,7 +11,6 @@
#include "quikr_functions.h"
#include "quikr.h"
-#define sensing_matrix(i,j) (sensing_matrix[width*i + j])
#define USAGE "Usage:\n\tquikr [OPTION...] - Calculate estimated frequencies of bacteria in a sample.\n\nOptions:\n\n-i, --input\n\tthe sample's fasta file of NGS READS (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (trained from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-o, --output\n\tOTU_FRACTION_PRESENT a vector representing the percentage of database sequence's presence in sample. (csv output)\n\n-v, --verbose\n\tverbose mode.\n\n-V, --version\n\tprint version."
int main(int argc, char **argv) {
@@ -29,7 +28,6 @@ int main(int argc, char **argv) {
unsigned long long rare_value = 0;
unsigned long long rare_width = 0;
- double percentage = 1.0;
double rare_percent = 1.0;
unsigned long long width = 0;
@@ -116,7 +114,6 @@ int main(int argc, char **argv) {
}
if(verbose) {
- printf("rare width:%llu\n", rare_width);
printf("kmer: %u\n", kmer);
printf("lambda: %llu\n", lambda);
printf("fasta: %s\n", input_fasta_filename);
@@ -142,53 +139,51 @@ int main(int argc, char **argv) {
// 4 "ACGT" ^ Kmer gives us the size of output rows
width = pow_four(kmer);
+ // load sensing matrix
+ struct matrix *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, kmer);
+
+ if(verbose) {
+ printf("width: %llu\n", width);
+ printf("sequences: %llu\n", sensing_matrix->sequences);
+ }
+
+
+
// load counts matrix
double *count_matrix = malloc(width * sizeof(double));
check_malloc(count_matrix, NULL);
+ // convert our matrix into doubles
{
unsigned long long *integer_counts = get_kmer_counts_from_file(input_fasta_filename, kmer);
- for(x = 0; x < width; x++)
+ for(x = 0; x < width; x++) {
count_matrix[x] = (double)integer_counts[x];
+ }
free(integer_counts);
}
- // load sensing matrix
- struct matrix *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, kmer);
-
- // get our "rare" counts
- while(1) {
- rare_width = 0;
- for(x = 0; x < width; x++) {
- if(count_matrix[x] < rare_value) {
- rare_width++;
- }
- }
- percentage = (float)rare_width / (float)width;
-
- if(percentage >= rare_percent)
- break;
-
- rare_value++;
- }
+ // get_rare_value
+ get_rare_value(count_matrix, width, rare_percent, &rare_value, &rare_width);
if(verbose)
printf("there are %llu values less than %llu\n", rare_width, rare_value);
- // add a extra space for our zero's array
+ // add a extra space for our zero's array, so we can set the first column to 1's
rare_width++;
// store our count matrix
double *count_matrix_rare = calloc(rare_width, sizeof(double));
check_malloc(count_matrix_rare, NULL);
- double *sensing_matrix_rare = malloc((rare_width) * sensing_matrix->sequences * sizeof(double));
+ double *sensing_matrix_rare = calloc(rare_width * sensing_matrix->sequences, sizeof(double));
check_malloc(sensing_matrix_rare, NULL);
// copy only kmers from our original counts that match our rareness percentage
// in both our count matrix and our sensing matrix
+ //
+ // y = 1 because we are offsetting the arrah by 1, so we can set the first row to all 1's
for(x = 0, y = 1; x < width; x++) {
if(count_matrix[x] <= rare_value) {
count_matrix_rare[y] = count_matrix[x];
@@ -200,6 +195,7 @@ int main(int argc, char **argv) {
}
}
+
// normalize our kmer counts and our sensing_matrix
normalize_matrix(count_matrix_rare, 1, rare_width);
normalize_matrix(sensing_matrix_rare, sensing_matrix->sequences, rare_width);
@@ -208,6 +204,7 @@ int main(int argc, char **argv) {
for(x = 1; x < rare_width; x++)
count_matrix_rare[x] *= lambda;
+ //TODO use one loop
for(x = 0; x < sensing_matrix->sequences; x++) {
for(y = 1; y < rare_width; y++) {
sensing_matrix_rare[rare_width*x + y] *= lambda;
@@ -221,7 +218,6 @@ int main(int argc, char **argv) {
sensing_matrix_rare[x*rare_width] = 1.0;
}
- // run NNLS
double *solution = nnls(sensing_matrix_rare, count_matrix_rare, sensing_matrix->sequences, rare_width);
// normalize our solution vector
diff --git a/src/c/quikr_functions.c b/src/c/quikr_functions.c
index 061d3b8..5bbe913 100644
--- a/src/c/quikr_functions.c
+++ b/src/c/quikr_functions.c
@@ -10,6 +10,7 @@
#include "kmer_utils.h"
#include "quikr.h"
+
void check_malloc(void *ptr, char *error) {
if (ptr == NULL) {
if(error != NULL) {
@@ -20,6 +21,44 @@ void check_malloc(void *ptr, char *error) {
exit(EXIT_FAILURE);
}
}
+static int double_cmp (const void * a, const void * b) {
+ return ( *(double*)a - *(double*)b );
+}
+
+void get_rare_value(double *count_matrix, unsigned long long width, double rare_percent, unsigned long long *ret_rare_value, unsigned long long *ret_rare_width) {
+ size_t y, x;
+ unsigned long long rare_width = 0;
+ double rare_value = 0;
+
+ // allocate * sort a temporary matrix
+ double *sorted_count_matrix = malloc(width * sizeof(double));
+ check_malloc(sorted_count_matrix, NULL);
+
+ memcpy(sorted_count_matrix, count_matrix, width * sizeof(double));
+
+ qsort(sorted_count_matrix, width, sizeof(double), double_cmp);
+
+ // get our "rare" counts
+ for(y = 0; y < width; y++) {
+ double percentage = 0;
+
+ rare_value = sorted_count_matrix[y];
+ rare_width = 0;
+ for(x = 0; x < width; x++) {
+ if(count_matrix[x] <= rare_value) {
+ rare_width++;
+ }
+ }
+ percentage = (double)rare_width / (double)width;
+
+ if(percentage >= rare_percent)
+ break;
+ }
+
+ free(sorted_count_matrix);
+ *ret_rare_width = rare_width;
+ *ret_rare_value = rare_value;
+}
void debug_arrays(double *count_matrix, struct matrix *sensing_matrix) {
FILE *count_fh = fopen("count.mat", "w");
diff --git a/src/c/quikr_functions.h b/src/c/quikr_functions.h
index 369d68c..978f614 100644
--- a/src/c/quikr_functions.h
+++ b/src/c/quikr_functions.h
@@ -9,3 +9,6 @@ void normalize_matrix(double *matrix, int height, int width);
// load a sensing matrix
struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kmer);
+
+// get_rare_value
+void get_rare_value(double *count_matrix, unsigned long long width, double rare_percent, unsigned long long *ret_rare_value, unsigned long long *ret_rare_width);