diff options
-rw-r--r-- | CHANGELOG | 3 | ||||
-rw-r--r-- | src/c/multifasta_to_otu.1 | 2 | ||||
-rw-r--r-- | src/c/multifasta_to_otu.c | 22 | ||||
-rw-r--r-- | src/c/nnls.c | 33 | ||||
-rw-r--r-- | src/c/quikr.1 | 2 | ||||
-rw-r--r-- | src/c/quikr.c | 14 | ||||
-rw-r--r-- | src/c/quikr_functions.c | 27 | ||||
-rw-r--r-- | src/c/quikr_functions.c.1 | 153 | ||||
-rw-r--r-- | src/c/quikr_train.1 | 2 | ||||
-rw-r--r-- | src/c/quikr_train.c | 5 |
10 files changed, 203 insertions, 60 deletions
@@ -8,9 +8,10 @@ May 16th 2013, 1.0.1 Released May 20th 2013, 1.0.2 Released - correted usage strings - install manpages globally (to {PREFIX}/share/man/man1) -- added native gzip decoding for quikr and multifasta_to_otu Current Development Branch - Licensing files and documentation added - Removed unused code from the NBC project - Fixed a overflow bug in quikr_train +- added native gzip decoding for quikr and multifasta_to_otu +- documentation fixes diff --git a/src/c/multifasta_to_otu.1 b/src/c/multifasta_to_otu.1 index 29fe5d7..2a00d5c 100644 --- a/src/c/multifasta_to_otu.1 +++ b/src/c/multifasta_to_otu.1 @@ -79,7 +79,7 @@ make_3d_plots.py -i <quikr_otu_project_name>_weighted.txt -o <3d_pcoa_plotdirect .SH AUTHORS .B multifasta2otu was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison -<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot +<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, and Jean-Luc Bouchot. .SH REPORTING BUGS .TP Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \ diff --git a/src/c/multifasta_to_otu.c b/src/c/multifasta_to_otu.c index 2f4bf87..e0fb128 100644 --- a/src/c/multifasta_to_otu.c +++ b/src/c/multifasta_to_otu.c @@ -26,14 +26,14 @@ int main(int argc, char **argv) { double *sensing_matrix; - long int width = 0; - long int sequences = 0; + long width = 0; + long sequences = 0; int kmer = 6; int lambda = 10000; - int x = 0; - int y = 0; + long x = 0; + long y = 0; int jobs = 1; #ifdef Linux @@ -150,7 +150,7 @@ int main(int argc, char **argv) { } // do a directory count - int dir_count = -2; // -2 for ../ and ./ + long dir_count = -2; // -2 for ../ and ./ while(entry = readdir(input_directory_dh)) dir_count++; rewinddir(input_directory_dh); @@ -167,7 +167,7 @@ int main(int argc, char **argv) { } if(verbose) { - printf("directory count: %d\n", dir_count); + printf("directory count: %ld\n", dir_count); printf("width: %ld\nsequences %ld\n", width, sequences); } @@ -197,7 +197,7 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } - int *file_sequence_count = malloc(dir_count * sizeof(int)); + long *file_sequence_count = malloc(dir_count * sizeof(long)); if(file_sequence_count == NULL) { fprintf(stderr, "Could not allocate enough memory\n"); exit(EXIT_FAILURE); @@ -206,12 +206,12 @@ int main(int argc, char **argv) { struct dirent result; omp_set_num_threads(jobs); - int done = 0; + long done = 0; printf("Beginning to process samples\n"); #pragma omp parallel for shared(solutions, sequences, width, result, done) - for(int i = 0; i < dir_count; i++ ) { + for(long i = 0; i < dir_count; i++ ) { - int z = 0; + long z = 0; struct dirent *directory_entry; char *filename = malloc(256 * sizeof(char)); char *base_filename = malloc(256 * sizeof(char)); @@ -269,7 +269,7 @@ int main(int argc, char **argv) { } done++; - printf("%d/%d samples processed\n", done, dir_count); + printf("%ld/%ld samples processed\n", done, dir_count); free(solution); free(count_matrix); free(filename); diff --git a/src/c/nnls.c b/src/c/nnls.c index e512388..f45bfec 100644 --- a/src/c/nnls.c +++ b/src/c/nnls.c @@ -1,5 +1,5 @@ /* - * nnls.c (c) 2002-2009 Turku PET Centre + * nnls.c (c) 2003-2009 Turku PET Centre * This file contains the routine NNLS (nonnegative least squares) * and the subroutines required by it, except h12, which is in * file 'lss_h12.c'. @@ -30,15 +30,16 @@ * Checking for exceeding iteration count is corrected in nnls(). */ #include <stdio.h> +#include <stdint.h> #include <assert.h> #include <stdlib.h> #include <math.h> #define MAX(a,b) ((a) >= (b) ? (a) : (b)) #define ABS(x) ((x) >= 0 ? (x) : -(x)) -int h12( int mode, int lpivot, int l1, int m, double *u, int u_dim1, double *up, double *cm, int ice, int icv, int ncv) { +int64_t h12( int64_t mode, int64_t lpivot, int64_t l1, int64_t m, double *u, int64_t u_dim1, double *up, double *cm, int64_t ice, int64_t icv, int64_t ncv) { double d1, b, clinv, cl, sm; - int k, j; + int64_t k, j; /* Check parameters */ if (mode!=1 && mode!=2) @@ -166,12 +167,12 @@ void g1(double a, double b, double *cterm, double *sterm, double *sig) } -int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm) { - int pfeas; +int64_t nnls_algorithm(double *a, int64_t m,int64_t n, double *b, double *x, double *rnorm) { + int64_t pfeas; int ret=0; - int iz; - int jz; - int k, j=0, l, itmax, izmax=0, ii, jj=0, ip; + int64_t iz; + int64_t jz; + int64_t k, j=0, l, itmax, izmax=0, ii, jj=0, ip; double d1, d2, sm, up, ss; double temp, wmax, t, alpha, asave, dummy, unorm, ztest, cc; @@ -183,7 +184,7 @@ int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm) /* Allocate memory for working space, if required */ double *w = (double*)calloc(n, sizeof(double)); double *zz = (double*)calloc(m, sizeof(double)); - int *index = (int*)calloc(n, sizeof(int)); + int64_t *index = (int64_t*)calloc(n, sizeof(int64_t)); if(w == NULL || zz == NULL || index == NULL) return(2); @@ -193,18 +194,18 @@ int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm) index[k]=k; } - int iz2 = n - 1; - int iz1 = 0; - int iter=0; - int nsetp=0; - int npp1=0; + int64_t iz2 = n - 1; + int64_t iz1 = 0; + int64_t iter=0; + int64_t nsetp=0; + int64_t npp1=0; /* Main loop; quit if all coeffs are already in the solution or */ /* if M cols of A have been triangularized */ if(n < 3) itmax=n*3; else - itmax=n*n*n; + itmax=n*n; while(iz1 <= iz2 && nsetp < m) { @@ -395,7 +396,7 @@ int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm) /* nnls_ */ -double *nnls(double *a_matrix, double *b_matrix, int height, int width) { +double *nnls(double *a_matrix, double *b_matrix, int64_t height, int64_t width) { double *solution = (double*)calloc(height, sizeof(double)); diff --git a/src/c/quikr.1 b/src/c/quikr.1 index 937109b..9b32667 100644 --- a/src/c/quikr.1 +++ b/src/c/quikr.1 @@ -58,7 +58,7 @@ quikr -i sample.fa -f rdp7.fa -s rdp_sensing_matrix.gz -o frequencies.txt .SH AUTHORS .B quikr was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison -<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot +<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, and Jean-Luc Bouchot. .SH REPORTING BUGS .TP Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \ diff --git a/src/c/quikr.c b/src/c/quikr.c index 17d6480..f588c5f 100644 --- a/src/c/quikr.c +++ b/src/c/quikr.c @@ -12,7 +12,7 @@ #include "quikr_functions.h" #define sensing_matrix(i,j) (sensing_matrix[width*i + j]) -#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - Calculate estimated frequencies of bacteria in a sample.\n\nOptions:\n\n-i, --input\n\tthe sample's fasta file of NGS READS (fasta format)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (trained from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-o, --output\n\tOTU_FRACTION_PRESENT a vector representing the percentage of database sequence's presence in sample. (csv output)\n\n-v, --verbose\n\tverbose mode.\n\n-d, --debug\n\tdebug mode, read manpage for more details" +#define USAGE "Usage:\n\tquikr [OPTION...] - Calculate estimated frequencies of bacteria in a sample.\n\nOptions:\n\n-i, --input\n\tthe sample's fasta file of NGS READS (fasta format)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (trained from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-o, --output\n\tOTU_FRACTION_PRESENT a vector representing the percentage of database sequence's presence in sample. (csv output)\n\n-v, --verbose\n\tverbose mode.\n\n-d, --debug\n\tdebug mode, read manpage for more details" int main(int argc, char **argv) { @@ -24,8 +24,8 @@ int main(int argc, char **argv) { char *sensing_fasta_filename = NULL; char *output_filename = NULL; - int x = 0; - int y = 0; + long x = 0; + long y = 0; int verbose = 0; int debug = 0; @@ -33,8 +33,8 @@ int main(int argc, char **argv) { int lambda = 10000; int kmer = 6; - int width = 0; - int sequences = 0; + long width = 0; + long sequences = 0; while (1) { static struct option long_options[] = { @@ -144,7 +144,7 @@ int main(int argc, char **argv) { } if(verbose) { - printf("width: %d\nsequences %d\n", width, sequences); + printf("width: %ld\nsequences %ld\n", width, sequences); } double *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, sequences, width); @@ -157,7 +157,7 @@ int main(int argc, char **argv) { } } - for(x= 0; x < sequences; x++) { + for(x = 0; x < sequences; x++) { sensing_matrix(x, 0) = 1.0; } // normalize our count_matrix diff --git a/src/c/quikr_functions.c b/src/c/quikr_functions.c index e55b28c..7bdbb9e 100644 --- a/src/c/quikr_functions.c +++ b/src/c/quikr_functions.c @@ -6,6 +6,7 @@ #include <unistd.h> #include <string.h> #include <math.h> +#include <zlib.h> int count_sequences(char *filename) { char command[512]; @@ -87,19 +88,14 @@ double *load_sensing_matrix(char *filename, int height, int width) { int x = 0; int y = 0; - char *line = NULL; - char *val; - char command[512]; - size_t len = 0; - FILE *sensing_matrix_fh = NULL; + gzFile sensing_matrix_fh = NULL; double *sensing_matrix = malloc(height * width * sizeof(double)); if(sensing_matrix == NULL) { fprintf(stderr, "Could not allocate memory for the sensing matrix\n"); } - sprintf(command, "gzip -cd %s", filename); - sensing_matrix_fh = popen(command, "r"); + sensing_matrix_fh = gzopen(filename, "r"); if(sensing_matrix_fh == NULL) { fprintf(stderr, "could not open %s", filename); exit(EXIT_FAILURE); @@ -107,21 +103,14 @@ double *load_sensing_matrix(char *filename, int height, int width) { // read our sensing matrix in for(x = 0; x < height; x++) { - getline(&line, &len, sensing_matrix_fh); - char *ptr; - - // Read our first element in outside of the loop - val = strtok_r(line,"\t", &ptr); - sensing_matrix[width*x + 1] = atof(val); - // iterate through and load the array - for (y = 2; y < width; y++) { - val = strtok_r(NULL, "\t", &ptr); - sensing_matrix[width*x + y] = atof(val); + for (y = 1; y < width; y++) { + char buffer[14]; + gzgets(sensing_matrix_fh, buffer, 14); + sensing_matrix[width*x + y] = atof(buffer); } } - free(line); - pclose(sensing_matrix_fh); + gzclose(sensing_matrix_fh); return sensing_matrix; } diff --git a/src/c/quikr_functions.c.1 b/src/c/quikr_functions.c.1 new file mode 100644 index 0000000..31afb3b --- /dev/null +++ b/src/c/quikr_functions.c.1 @@ -0,0 +1,153 @@ +#include <stdio.h> +#include <stdio.h> +#include <errno.h> +#include <ctype.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <math.h> +#include <zlib.h> + +int count_sequences(char *filename) { + char command[512]; + int sequences = 0; + FILE *grep_output; + + sprintf(command, "grep -c ^\\> %s", filename); + grep_output = popen(command, "r"); + if(grep_output == NULL) { + fprintf(stderr, "Could not execute %s\n", command); + exit(EXIT_FAILURE); + } + + fscanf(grep_output, "%d", &sequences); + + pclose(grep_output); + return sequences; +} + + +void normalize_matrix(double *matrix, int height, int width) { + int x = 0; + int y = 0; + for(x = 0; x < height; x++) { + + double row_sum = 0; + + for(y = 0; y < (width); y++) + row_sum = row_sum + matrix[width * x + y]; + for(y = 0; y < (width); y++) + matrix[width * x + y] = matrix[width * x + y] / row_sum; + } +} + + +double *load_count_matrix(char *filename, int width, int kmer) { + + double *count_matrix = malloc((width)*sizeof(double)); + char count_command[512]; + int x = 0; + char *line = NULL; + size_t len = 0; + + if(count_matrix == NULL) { + fprintf(stderr, "could not allocate enough memory for the count matrix (%d x double) \n", width); + exit(EXIT_FAILURE); + } + + // create out count matrix + sprintf(count_command, "count-kmers -r %d -1 -u %s", kmer, filename); + FILE *count_output = popen(count_command, "r"); + if(count_output == NULL) { + fprintf(stderr, "could not execute \"%s\"", count_command); + exit(EXIT_FAILURE); + } + + // set first element to zero. + count_matrix[0] = 0; + + // get our first line + getline(&line, &len, count_output); + count_matrix[1] = atoi(line); + + // iterate over the rest of the lines + for(x = 2; x < width; x++) { + getline(&line, &len, count_output); + count_matrix[x] = atoi(line); + } + + pclose(count_output); + + return count_matrix; +} + + +double *load_sensing_matrix(char *filename, int height, int width) { + + int x = 0; + int y = 0; + + gzFile sensing_matrix_fh = NULL; + + double *sensing_matrix = malloc(height * width * sizeof(double)); + if(sensing_matrix == NULL) { + fprintf(stderr, "Could not allocate memory for the sensing matrix\n"); + } + + sensing_matrix_fh = gzopen(filename, "r"); + if(sensing_matrix_fh == NULL) { + fprintf(stderr, "could not open %s", filename); + exit(EXIT_FAILURE); + } + + // read our sensing matrix in + for(x = 0; x < height; x++) { + for (y = 1; y < width; y++) { + char buffer[14]; + gzgets(sensing_matrix_fh, buffer, 14); + sensing_matrix[width*x + y] = atof(buffer); + } + } + + gzclose(sensing_matrix_fh); + + return sensing_matrix; +} + +char **load_headers(char *filename, int sequences) { + char command[512]; + char *line= NULL; + int x = 0; + FILE *grep_output; + size_t len = 0; + + sprintf(command, "grep ^\\> %s", filename); + grep_output = popen(command, "r"); + if(grep_output == NULL) { + fprintf(stderr, "Could not execute %s\n", command); + exit(EXIT_FAILURE); + } + + char **headers = malloc(sequences * sizeof(char *)); + if(headers == NULL) { + fprintf(stderr, "could not allocated enough memory\n"); + exit(EXIT_FAILURE); + } + + for(x = 0; x < sequences; x++) { + + char *header = malloc(256 * sizeof(char)); + if(header == NULL) { + fprintf(stderr, "could not allocated enough memory\n"); + exit(EXIT_FAILURE); + } + getline(&line, &len, grep_output); + sscanf(line + 1, "%s", header); + headers[x] = header; + } + + pclose(grep_output); + + return headers; +} + diff --git a/src/c/quikr_train.1 b/src/c/quikr_train.1 index dd1493f..6a058cd 100644 --- a/src/c/quikr_train.1 +++ b/src/c/quikr_train.1 @@ -45,7 +45,7 @@ If you do not have a .gz file on your output matrix name, it will be appended. .SH AUTHORS .B quikr was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison -<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot +<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, and Jean-Luc Bouchot. .SH REPORTING BUGS .TP Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \ diff --git a/src/c/quikr_train.c b/src/c/quikr_train.c index a2f132e..c728a2c 100644 --- a/src/c/quikr_train.c +++ b/src/c/quikr_train.c @@ -15,7 +15,6 @@ int main(int argc, char **argv) { char probabilities_command[512]; - char kmers_file[256]; char *line = NULL; char *val; size_t len = 0; @@ -169,10 +168,10 @@ int main(int argc, char **argv) { trained_matrix[y] = trained_matrix[y] / row_sum; } - for( y = 0; y < width; y++) { + for( y = 0; y < (width - 1); y++) { gzprintf(output, "%.10f\t", trained_matrix[y]); } - gzprintf(output, "\n"); + gzprintf(output, "%.10f\n", trained_matrix[width]); } free(trained_matrix); |