diff options
| -rw-r--r-- | CHANGELOG | 3 | ||||
| -rw-r--r-- | src/c/multifasta_to_otu.1 | 2 | ||||
| -rw-r--r-- | src/c/multifasta_to_otu.c | 22 | ||||
| -rw-r--r-- | src/c/nnls.c | 33 | ||||
| -rw-r--r-- | src/c/quikr.1 | 2 | ||||
| -rw-r--r-- | src/c/quikr.c | 14 | ||||
| -rw-r--r-- | src/c/quikr_functions.c | 27 | ||||
| -rw-r--r-- | src/c/quikr_functions.c.1 | 153 | ||||
| -rw-r--r-- | src/c/quikr_train.1 | 2 | ||||
| -rw-r--r-- | src/c/quikr_train.c | 5 | 
10 files changed, 203 insertions, 60 deletions
| @@ -8,9 +8,10 @@ May 16th 2013, 1.0.1 Released  May 20th 2013, 1.0.2 Released  - correted usage strings  - install manpages globally (to {PREFIX}/share/man/man1) -- added native gzip decoding for quikr and multifasta_to_otu  Current Development Branch  - Licensing files and documentation added  - Removed unused code from the NBC project  - Fixed a overflow bug in quikr_train +- added native gzip decoding for quikr and multifasta_to_otu +- documentation fixes diff --git a/src/c/multifasta_to_otu.1 b/src/c/multifasta_to_otu.1 index 29fe5d7..2a00d5c 100644 --- a/src/c/multifasta_to_otu.1 +++ b/src/c/multifasta_to_otu.1 @@ -79,7 +79,7 @@ make_3d_plots.py -i <quikr_otu_project_name>_weighted.txt -o <3d_pcoa_plotdirect  .SH AUTHORS  .B multifasta2otu   was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison  -<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot +<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, and Jean-Luc Bouchot.  .SH REPORTING BUGS  .TP  Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \ diff --git a/src/c/multifasta_to_otu.c b/src/c/multifasta_to_otu.c index 2f4bf87..e0fb128 100644 --- a/src/c/multifasta_to_otu.c +++ b/src/c/multifasta_to_otu.c @@ -26,14 +26,14 @@ int main(int argc, char **argv) {    double *sensing_matrix; -  long int width = 0; -  long int sequences = 0; +  long width = 0; +  long sequences = 0;    int kmer = 6;    int lambda = 10000; -  int x = 0; -  int y = 0; +  long x = 0; +  long y = 0;    int jobs = 1;    #ifdef Linux @@ -150,7 +150,7 @@ int main(int argc, char **argv) {    }     // do a directory count -  int dir_count = -2; // -2 for ../ and ./ +  long dir_count = -2; // -2 for ../ and ./    while(entry = readdir(input_directory_dh))       dir_count++;     rewinddir(input_directory_dh); @@ -167,7 +167,7 @@ int main(int argc, char **argv) {    }    if(verbose) { -    printf("directory count: %d\n", dir_count); +    printf("directory count: %ld\n", dir_count);      printf("width: %ld\nsequences %ld\n", width, sequences);    } @@ -197,7 +197,7 @@ int main(int argc, char **argv) {      exit(EXIT_FAILURE);    } -  int *file_sequence_count = malloc(dir_count * sizeof(int)); +  long *file_sequence_count = malloc(dir_count * sizeof(long));    if(file_sequence_count == NULL) {      fprintf(stderr, "Could not allocate enough memory\n");      exit(EXIT_FAILURE); @@ -206,12 +206,12 @@ int main(int argc, char **argv) {    struct dirent result;    omp_set_num_threads(jobs); -  int done = 0; +  long done = 0;    printf("Beginning to process samples\n");   #pragma omp parallel for shared(solutions, sequences, width, result, done) -  for(int i = 0; i < dir_count; i++ ) { +  for(long i = 0; i < dir_count; i++ ) { -    int z = 0; +    long z = 0;      struct dirent *directory_entry;      char *filename = malloc(256 * sizeof(char));      char *base_filename = malloc(256 * sizeof(char)); @@ -269,7 +269,7 @@ int main(int argc, char **argv) {      }      done++; -    printf("%d/%d samples processed\n", done, dir_count);  +    printf("%ld/%ld samples processed\n", done, dir_count);       free(solution);      free(count_matrix);      free(filename); diff --git a/src/c/nnls.c b/src/c/nnls.c index e512388..f45bfec 100644 --- a/src/c/nnls.c +++ b/src/c/nnls.c @@ -1,5 +1,5 @@  /* - *   nnls.c  (c) 2002-2009 Turku PET Centre + *   nnls.c  (c) 2003-2009 Turku PET Centre   *   This file contains the routine NNLS (nonnegative least squares)   *   and the subroutines required by it, except h12, which is in   *   file 'lss_h12.c'. @@ -30,15 +30,16 @@   *  Checking for exceeding iteration count is corrected in nnls().   */   #include <stdio.h> +#include <stdint.h>  #include <assert.h>  #include <stdlib.h>  #include <math.h>  #define MAX(a,b) ((a) >= (b) ? (a) : (b))  #define ABS(x) ((x) >= 0 ? (x) : -(x)) -int h12( int mode, int lpivot, int l1, int m, double *u, int u_dim1, double *up, double *cm, int ice, int icv, int ncv) { +int64_t h12( int64_t mode, int64_t lpivot, int64_t l1, int64_t m, double *u, int64_t u_dim1, double *up, double *cm, int64_t ice, int64_t icv, int64_t ncv) {    double d1,  b, clinv, cl, sm; -  int k, j; +  int64_t k, j;    /* Check parameters */    if (mode!=1 && mode!=2)  @@ -166,12 +167,12 @@ void g1(double a, double b, double *cterm, double *sterm, double *sig)  }  -int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm) { -  int pfeas; +int64_t nnls_algorithm(double *a, int64_t m,int64_t n, double *b, double *x, double *rnorm) { +  int64_t pfeas;    int ret=0; -  int iz; -  int jz; -  int k, j=0, l, itmax, izmax=0, ii, jj=0, ip; +  int64_t iz; +  int64_t jz; +  int64_t k, j=0, l, itmax, izmax=0, ii, jj=0, ip;    double d1, d2, sm, up, ss;     double temp, wmax, t, alpha, asave, dummy, unorm, ztest, cc; @@ -183,7 +184,7 @@ int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm)    /* Allocate memory for working space, if required */    double *w = (double*)calloc(n, sizeof(double));    double *zz = (double*)calloc(m, sizeof(double)); -  int *index = (int*)calloc(n, sizeof(int)); +  int64_t *index = (int64_t*)calloc(n, sizeof(int64_t));    if(w == NULL || zz == NULL || index == NULL)       return(2); @@ -193,18 +194,18 @@ int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm)      index[k]=k;    } -  int iz2 = n - 1; -  int iz1 = 0; -  int iter=0;  -  int nsetp=0; -  int npp1=0; +  int64_t iz2 = n - 1; +  int64_t iz1 = 0; +  int64_t iter=0;  +  int64_t nsetp=0; +  int64_t npp1=0;    /* Main loop; quit if all coeffs are already in the solution or */    /* if M cols of A have been triangularized */    if(n < 3)       itmax=n*3;    else  -    itmax=n*n*n; +    itmax=n*n;    while(iz1 <= iz2 && nsetp < m) { @@ -395,7 +396,7 @@ int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm)  /* nnls_ */ -double *nnls(double *a_matrix, double *b_matrix, int height, int width) { +double *nnls(double *a_matrix, double *b_matrix, int64_t height, int64_t width) {    double *solution = (double*)calloc(height, sizeof(double)); diff --git a/src/c/quikr.1 b/src/c/quikr.1 index 937109b..9b32667 100644 --- a/src/c/quikr.1 +++ b/src/c/quikr.1 @@ -58,7 +58,7 @@ quikr -i sample.fa -f rdp7.fa -s rdp_sensing_matrix.gz -o frequencies.txt  .SH AUTHORS  .B quikr  was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison  -<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot +<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, and Jean-Luc Bouchot.  .SH REPORTING BUGS  .TP  Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \ diff --git a/src/c/quikr.c b/src/c/quikr.c index 17d6480..f588c5f 100644 --- a/src/c/quikr.c +++ b/src/c/quikr.c @@ -12,7 +12,7 @@  #include "quikr_functions.h"  #define sensing_matrix(i,j) (sensing_matrix[width*i + j]) -#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - Calculate estimated frequencies of bacteria in a sample.\n\nOptions:\n\n-i, --input\n\tthe sample's fasta file of NGS READS (fasta format)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (trained from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-o, --output\n\tOTU_FRACTION_PRESENT a vector representing the percentage of database sequence's presence in sample. (csv output)\n\n-v, --verbose\n\tverbose mode.\n\n-d, --debug\n\tdebug mode, read manpage for more details" +#define USAGE "Usage:\n\tquikr [OPTION...] - Calculate estimated frequencies of bacteria in a sample.\n\nOptions:\n\n-i, --input\n\tthe sample's fasta file of NGS READS (fasta format)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (trained from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-o, --output\n\tOTU_FRACTION_PRESENT a vector representing the percentage of database sequence's presence in sample. (csv output)\n\n-v, --verbose\n\tverbose mode.\n\n-d, --debug\n\tdebug mode, read manpage for more details"  int main(int argc, char **argv) { @@ -24,8 +24,8 @@ int main(int argc, char **argv) {    char *sensing_fasta_filename = NULL;    char *output_filename = NULL; -  int x = 0; -  int y = 0; +  long x = 0; +  long y = 0;    int verbose = 0;    int debug = 0; @@ -33,8 +33,8 @@ int main(int argc, char **argv) {    int lambda = 10000;    int kmer = 6; -  int width = 0; -  int sequences = 0; +  long width = 0; +  long sequences = 0;    while (1) {      static struct option long_options[] = { @@ -144,7 +144,7 @@ int main(int argc, char **argv) {    }    if(verbose) { -    printf("width: %d\nsequences %d\n", width, sequences); +    printf("width: %ld\nsequences %ld\n", width, sequences);    }    double *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, sequences, width); @@ -157,7 +157,7 @@ int main(int argc, char **argv) {      }    } -  for(x= 0; x < sequences; x++) { +  for(x = 0; x < sequences; x++) {      sensing_matrix(x, 0) = 1.0;    }    // normalize our count_matrix diff --git a/src/c/quikr_functions.c b/src/c/quikr_functions.c index e55b28c..7bdbb9e 100644 --- a/src/c/quikr_functions.c +++ b/src/c/quikr_functions.c @@ -6,6 +6,7 @@  #include <unistd.h>  #include <string.h>  #include <math.h> +#include <zlib.h>  int count_sequences(char *filename) {     char command[512]; @@ -87,19 +88,14 @@ double *load_sensing_matrix(char *filename, int height, int width) {    int x = 0;    int y = 0; -  char *line = NULL; -  char *val; -  char command[512]; -  size_t len = 0; -  FILE *sensing_matrix_fh = NULL; +  gzFile sensing_matrix_fh = NULL;    double *sensing_matrix = malloc(height * width * sizeof(double));    if(sensing_matrix == NULL) {      fprintf(stderr, "Could not allocate memory for the sensing matrix\n");    } -  sprintf(command, "gzip -cd %s", filename); -  sensing_matrix_fh = popen(command, "r"); +  sensing_matrix_fh = gzopen(filename, "r");    if(sensing_matrix_fh == NULL) {      fprintf(stderr, "could not open %s", filename);      exit(EXIT_FAILURE); @@ -107,21 +103,14 @@ double *load_sensing_matrix(char *filename, int height, int width) {    // read our sensing matrix in    for(x = 0; x < height; x++) { -    getline(&line, &len, sensing_matrix_fh); -    char *ptr; - -    // Read our first element in outside of the loop -    val = strtok_r(line,"\t", &ptr); -    sensing_matrix[width*x + 1] = atof(val); -    // iterate through and load the array -    for (y = 2; y < width; y++) { -      val = strtok_r(NULL, "\t", &ptr); -      sensing_matrix[width*x + y] = atof(val); +    for (y = 1; y < width; y++) { +      char buffer[14]; +      gzgets(sensing_matrix_fh, buffer, 14);  +      sensing_matrix[width*x + y] = atof(buffer);      }    } -  free(line); -  pclose(sensing_matrix_fh); +  gzclose(sensing_matrix_fh);    return sensing_matrix;  } diff --git a/src/c/quikr_functions.c.1 b/src/c/quikr_functions.c.1 new file mode 100644 index 0000000..31afb3b --- /dev/null +++ b/src/c/quikr_functions.c.1 @@ -0,0 +1,153 @@ +#include <stdio.h> +#include <stdio.h> +#include <errno.h> +#include <ctype.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <math.h> +#include <zlib.h> + +int count_sequences(char *filename) {  +  char command[512]; +  int sequences = 0; +  FILE *grep_output; + +  sprintf(command, "grep -c ^\\> %s", filename); +  grep_output = popen(command, "r"); +  if(grep_output == NULL) { +    fprintf(stderr, "Could not execute %s\n", command); +    exit(EXIT_FAILURE); +  } +   +  fscanf(grep_output, "%d", &sequences); + +  pclose(grep_output); +  return sequences; +} + + +void normalize_matrix(double *matrix, int height, int width) { +  int x = 0; +  int y = 0; +  for(x = 0; x < height; x++) { + +    double row_sum = 0; + +    for(y = 0; y < (width); y++) +      row_sum = row_sum + matrix[width * x + y]; +    for(y = 0; y < (width); y++)  +      matrix[width * x + y] = matrix[width * x + y] / row_sum;  +  } +} + + +double *load_count_matrix(char *filename, int width, int kmer) { + +  double *count_matrix = malloc((width)*sizeof(double)); +  char count_command[512]; +  int x = 0; +  char *line = NULL; +  size_t len = 0; + +  if(count_matrix == NULL) { +    fprintf(stderr, "could not allocate enough memory for the count matrix (%d x double) \n", width); +    exit(EXIT_FAILURE); +  } + +  // create out count matrix +  sprintf(count_command, "count-kmers -r %d -1 -u %s", kmer, filename); +  FILE *count_output = popen(count_command, "r"); +  if(count_output == NULL) { +    fprintf(stderr, "could not execute \"%s\"", count_command); +    exit(EXIT_FAILURE); +  } + +  // set first element to zero. +  count_matrix[0] = 0; + +  // get our first line +  getline(&line, &len, count_output); +  count_matrix[1] = atoi(line); + +  // iterate over the rest of the lines +  for(x = 2; x < width; x++) {  +    getline(&line, &len, count_output); +    count_matrix[x] = atoi(line); +  }  + +  pclose(count_output); + +  return count_matrix; +} + + +double *load_sensing_matrix(char *filename, int height, int width) { + +  int x = 0; +  int y = 0; + +  gzFile sensing_matrix_fh = NULL; + +  double *sensing_matrix = malloc(height * width * sizeof(double)); +  if(sensing_matrix == NULL) { +    fprintf(stderr, "Could not allocate memory for the sensing matrix\n"); +  } + +  sensing_matrix_fh = gzopen(filename, "r"); +  if(sensing_matrix_fh == NULL) { +    fprintf(stderr, "could not open %s", filename); +    exit(EXIT_FAILURE); +  } + +  // read our sensing matrix in +  for(x = 0; x < height; x++) { +    for (y = 1; y < width; y++) { +      char buffer[14]; +      gzgets(sensing_matrix_fh, buffer, 14);  +      sensing_matrix[width*x + y] = atof(buffer); +    } +  } + +  gzclose(sensing_matrix_fh); + +  return sensing_matrix; +} + +char **load_headers(char *filename, int sequences) { +  char command[512]; +  char *line= NULL; +  int x = 0; +  FILE *grep_output; +  size_t len = 0; + +  sprintf(command, "grep ^\\> %s", filename); +  grep_output = popen(command, "r"); +  if(grep_output == NULL) { +    fprintf(stderr, "Could not execute %s\n", command); +    exit(EXIT_FAILURE); +  } +   +  char **headers = malloc(sequences * sizeof(char *)); +  if(headers == NULL) { +    fprintf(stderr, "could not allocated enough memory\n"); +    exit(EXIT_FAILURE); +  } + +  for(x = 0; x < sequences; x++) { + +    char *header = malloc(256 * sizeof(char)); +    if(header == NULL) { +      fprintf(stderr, "could not allocated enough memory\n"); +      exit(EXIT_FAILURE); +    } +    getline(&line, &len, grep_output); +    sscanf(line + 1, "%s", header); +    headers[x] = header; +  } + +  pclose(grep_output); + +  return headers; +} + diff --git a/src/c/quikr_train.1 b/src/c/quikr_train.1 index dd1493f..6a058cd 100644 --- a/src/c/quikr_train.1 +++ b/src/c/quikr_train.1 @@ -45,7 +45,7 @@ If you do not have a .gz file on your output matrix name, it will be appended.  .SH AUTHORS  .B quikr  was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison  -<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot +<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, and Jean-Luc Bouchot.  .SH REPORTING BUGS  .TP  Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \ diff --git a/src/c/quikr_train.c b/src/c/quikr_train.c index a2f132e..c728a2c 100644 --- a/src/c/quikr_train.c +++ b/src/c/quikr_train.c @@ -15,7 +15,6 @@  int main(int argc, char **argv) {    char probabilities_command[512]; -  char kmers_file[256];    char *line = NULL;    char *val;    size_t len = 0; @@ -169,10 +168,10 @@ int main(int argc, char **argv) {        trained_matrix[y] = trained_matrix[y] / row_sum;      } -    for( y = 0; y < width; y++) { +    for( y = 0; y < (width - 1); y++) {        gzprintf(output, "%.10f\t", trained_matrix[y]);      } -    gzprintf(output, "\n"); +    gzprintf(output, "%.10f\n", trained_matrix[width]);    }    free(trained_matrix); | 
