aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2013-10-29 16:15:25 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2013-10-29 16:15:25 -0400
commitf026759c02e2f537e597be550f9ce0bb9d16455d (patch)
treed1e03399daeaae810e97a8dc36e1d95f4cb8ced4
parentd7f79dfe426c8357cce324980ab90e788671354d (diff)
initial commit of multifasta_to_otu.c working with new format
-rw-r--r--src/c/multifasta_to_otu.c105
1 files changed, 61 insertions, 44 deletions
diff --git a/src/c/multifasta_to_otu.c b/src/c/multifasta_to_otu.c
index 8b26ee5..d4d1588 100644
--- a/src/c/multifasta_to_otu.c
+++ b/src/c/multifasta_to_otu.c
@@ -8,7 +8,10 @@
#include <stdio.h>
#include <string.h>
#include <unistd.h>
+
+#include "kmer_utils.h"
#include "nnls.h"
+#include "quikr.h"
#include "quikr_functions.h"
#ifdef Linux
@@ -28,18 +31,19 @@ int main(int argc, char **argv) {
char *sensing_fasta_filename = NULL;
char *output_filename = NULL;
- double *sensing_matrix;
+ unsigned long long x = 0;
+ unsigned long long y = 0;
+
+ long long i = 0;
+
+ unsigned long long width = 0;
- long width = 0;
- long sequences = 0;
+ unsigned int kmer = 6;
+ unsigned long long lambda = 10000;
- int kmer = 6;
- int lambda = 10000;
- long x = 0;
- long y = 0;
+ unsigned int jobs = 1;
- int jobs = 1;
#ifdef Linux
jobs = get_nprocs();
#endif
@@ -133,8 +137,8 @@ int main(int argc, char **argv) {
}
if(verbose) {
- printf("kmer: %d\n", kmer);
- printf("lambda: %d\n", lambda);
+ printf("kmer: %u\n", kmer);
+ printf("lambda: %llu\n", lambda);
printf("input directory: %s\n", input_fasta_directory);
printf("sensing database: %s\n", sensing_matrix_filename);
printf("sensing database fasta: %s\n", sensing_fasta_filename);
@@ -159,6 +163,11 @@ int main(int argc, char **argv) {
exit(EXIT_FAILURE);
}
+ if(kmer == 0) {
+ fprintf(stderr, "Error: zero is not a valid kmer\n");
+ exit(EXIT_FAILURE);
+ }
+
// do a directory count
long dir_count = -2; // -2 for ../ and ./
while(entry = readdir(input_directory_dh))
@@ -171,31 +180,27 @@ int main(int argc, char **argv) {
// 4 "ACGT" ^ Kmer gives us the size of output rows
width = pow(4, kmer) + 1;
- sequences = count_sequences(sensing_fasta_filename);
- if(sequences == 0) {
- fprintf(stderr, "Error: %s contains 0 fasta sequences\n", sensing_fasta_filename);
- }
+
+ struct matrix *sensing_matrix = load_sensing_matrix(sensing_matrix_filename);
if(verbose) {
printf("directory count: %ld\n", dir_count);
- printf("width: %ld\nsequences %ld\n", width, sequences);
+ printf("width: %llu\nsequences %llu\n", width, sensing_matrix->sequences);
}
- sensing_matrix = load_sensing_matrix(sensing_matrix_filename, sequences, width);
-
// multiply our matrix by lambda
- for(x = 0; x < sequences; x++) {
+ for(x = 0; x < sensing_matrix->sequences; x++) {
for(y= 0; y < width; y++) {
- sensing_matrix(x, y) = sensing_matrix(x, y) * lambda;
+ sensing_matrix->matrix[x*width + y] *= lambda;
}
}
// set the first row to be all 1's
- for(x = 0; x < sequences; x++) {
- sensing_matrix(x, 0) = 1.0;
+ for(x = 0; x < sensing_matrix->sequences; x++) {
+ sensing_matrix->matrix[x*width] = 1.0;
}
- double *solutions = malloc(dir_count * sequences * sizeof(double));
+ double *solutions = malloc(dir_count * sensing_matrix->sequences * sizeof(double));
if(solutions == NULL) {
fprintf(stderr, "Could not allocate enough memory for solutions vector\n");
exit(EXIT_FAILURE);
@@ -216,10 +221,10 @@ int main(int argc, char **argv) {
omp_set_num_threads(jobs);
long done = 0;
printf("Beginning to process samples\n");
- #pragma omp parallel for shared(solutions, sequences, width, done)
+ #pragma omp parallel for shared(solutions, sensing_matrix, width, done)
for(long i = 0; i < dir_count; i++ ) {
- long z = 0;
+ unsigned long long z = 0;
struct dirent *directory_entry;
char *filename = malloc(256 * sizeof(char));
char *base_filename = malloc(256 * sizeof(char));
@@ -248,8 +253,20 @@ int main(int argc, char **argv) {
// get individual sequence count
file_sequence_count[i] = count_sequences(filename);
- // count the kmer amounts
- double *count_matrix = load_count_matrix(filename, width, kmer);
+ // count the kmer amounts, and convert it to a double array
+ unsigned long long *integer_counts = get_kmer_counts_from_file(filename, kmer);
+ double *count_matrix = malloc(sizeof(double) * width);
+ if(count_matrix == NULL) {
+ fprintf(stderr, "Could not allocate memory:\n");
+ exit(EXIT_FAILURE);
+ }
+
+ count_matrix[0] = 0;
+
+ for(x = 0; x < width - 1 ; x++)
+ count_matrix[x+1] = (double)integer_counts[x];
+
+ free(integer_counts);
// normalize our kmer counts
normalize_matrix(count_matrix, 1, width);
@@ -258,24 +275,24 @@ int main(int argc, char **argv) {
for(z = 0; z < width; z++)
count_matrix[z] = count_matrix[z] * lambda;
- double *sensing_matrix_copy = malloc(sizeof(double) * sequences * width);
+ double *sensing_matrix_copy = malloc(sizeof(double) * sensing_matrix->sequences * width);
if(sensing_matrix_copy == NULL) {
fprintf(stderr, "Could not allocate enough memory\n");
exit(EXIT_FAILURE);
}
- memcpy(sensing_matrix_copy, sensing_matrix, sequences * width * sizeof(double));
+ memcpy(sensing_matrix_copy, sensing_matrix->matrix, sensing_matrix->sequences * width * sizeof(double));
// run nnls
- double *solution = nnls(sensing_matrix_copy, count_matrix, sequences, width);
+ double *solution = nnls(sensing_matrix_copy, count_matrix, sensing_matrix->sequences, width);
// normalize our solution
- normalize_matrix(solution, 1, sequences);
+ normalize_matrix(solution, 1, sensing_matrix->sequences);
// add the current solution to the solutions array
- for(z = 0; z < sequences; z++ ) {
- solutions(i, z) = solution[z];
+ for(z = 0; z < sensing_matrix->sequences; z++ ) {
+ solutions[i*sensing_matrix->sequences + z] = solution[z];
}
done++;
@@ -286,7 +303,7 @@ int main(int argc, char **argv) {
free(sensing_matrix_copy);
}
- char **headers = load_headers(sensing_fasta_filename, sequences);
+ char **headers = load_headers(sensing_fasta_filename, sensing_matrix->sequences);
// output our matrix
FILE *output_fh = fopen(output_filename, "w");
@@ -299,33 +316,33 @@ int main(int argc, char **argv) {
fprintf(output_fh, "#OTU_ID\t");
// print our filename headers
- for(x = 0; x < dir_count - 1; x++) {
- fprintf(output_fh, "%s\t", filenames[x]);
+ for(i = 0; i < dir_count - 1; i++) {
+ fprintf(output_fh, "%s\t", filenames[i]);
}
fprintf(output_fh, "%s\n", filenames[dir_count - 1]);
// get our actual values
- for(y = 0; y < sequences; y++) {
- for(x = 0; x < dir_count; x++) {
- solutions(x, y) = round(solutions(x, y) * file_sequence_count[x]);
+ for(y = 0; y < sensing_matrix->sequences; y++) {
+ for(i = 0; i < dir_count; i++) {
+ solutions[sensing_matrix->sequences*i + y] = round(solutions[sensing_matrix->sequences*i + y] * file_sequence_count[i]);
}
}
- for(y = 0; y < sequences; y++) {
+ for(y = 0; y < sensing_matrix->sequences; y++) {
double column_sum = 0.;
- for(x = 0; x < dir_count; x++) {
- column_sum += solutions(x, y);
+ for(i = 0; i < dir_count; i++) {
+ column_sum += solutions[sensing_matrix->sequences*i + y];
}
// if our column is zero, don't bother printing the row
if(column_sum != 0) {
fprintf(output_fh, "%s\t", headers[y]);
- for(x = 0; x < dir_count - 1; x++) {
- fprintf(output_fh, "%d\t", (int)solutions(x, y));
+ for(i = 0; i < dir_count - 1; i++) {
+ fprintf(output_fh, "%d\t", (int)solutions[sensing_matrix->sequences*i + y]);
}
- fprintf(output_fh, "%d\n", (int)solutions[sequences*(dir_count - 1) + y]);
+ fprintf(output_fh, "%d\n", (int)solutions[sensing_matrix->sequences*(dir_count - 1) + y]);
}
}
fclose(output_fh);