diff options
Diffstat (limited to 'src/c')
| -rw-r--r-- | src/c/Makefile | 14 | ||||
| -rw-r--r-- | src/c/multifasta_to_otu.1 | 95 | ||||
| -rw-r--r-- | src/c/multifasta_to_otu.c | 314 | ||||
| -rw-r--r-- | src/c/nnls.c | 416 | ||||
| -rw-r--r-- | src/c/nnls.h | 1 | ||||
| -rw-r--r-- | src/c/quikr.1 | 70 | ||||
| -rw-r--r-- | src/c/quikr.c | 192 | ||||
| -rw-r--r-- | src/c/quikr_functions.c | 163 | ||||
| -rw-r--r-- | src/c/quikr_functions.h | 14 | ||||
| -rw-r--r-- | src/c/quikr_train.1 | 61 | ||||
| -rw-r--r-- | src/c/quikr_train.c | 176 | 
11 files changed, 1516 insertions, 0 deletions
| diff --git a/src/c/Makefile b/src/c/Makefile new file mode 100644 index 0000000..ba0c799 --- /dev/null +++ b/src/c/Makefile @@ -0,0 +1,14 @@ +UNAME := $(shell uname) +CC = gcc +QUIKR_TRAIN_CFLAGS = -O3 -s -mtune=native -Wall -lm -lz -D$(UNAME) +QUIKR_CFLAGS = -O3 -s -mtune=native -Wextra -Wall -lm -pthread -L../ -I../ -std=gnu99 -fopenmp -D$(UNAME) +all: quikr_train_ quikr_ multifasta_to_otu_ + +multifasta_to_otu_: +	$(CC) multifasta_to_otu.c quikr_functions.c nnls.c -o multifasta_to_otu $(QUIKR_CFLAGS) +quikr_train_: +	$(CC) quikr_train.c quikr_functions.c -o quikr_train $(QUIKR_TRAIN_CFLAGS) +quikr_: +	$(CC) quikr.c quikr_functions.c nnls.c -o quikr $(QUIKR_CFLAGS) -I../ +clean: +	rm -vf quikr_train quikr multifasta_to_otu diff --git a/src/c/multifasta_to_otu.1 b/src/c/multifasta_to_otu.1 new file mode 100644 index 0000000..7878ef0 --- /dev/null +++ b/src/c/multifasta_to_otu.1 @@ -0,0 +1,95 @@ +.TH multifasta_to_otu 1 multifasta_to_otu-2013-05 +.SH NAME +multifasta_to_otu \- create a QIIME OTU table based on Quikr results. +.SH SYNOPSIS +.B multifasta_to_otu +.RB [ \-i +.IR input-directory ] +.RB [ \-f +.IR sensing-fasta] +.RB [ \-s +.IR sensing-matrix] +.RB [ \-k +.IR kmer ] +.RB [ \-l +.IR lambda ] +.RB [ \-j +.IR jobs ] +.RB [ \-o +.IR otu-table] +.RB [ \-v ] +.P +.BR multifasta_to_otu " ..." +.SH DESCRIPTION +.B multifasta_to_otu +The multifasta_to_otu tool is a handy wrapper for quikr which lets the user +to input as many fasta files as they like, and then returns an OTU table of the +number of times a specimen was seen in all of the samples. +.P +.SH OPTIONS +.TP +.B \-i, --input-directory +the directory containing the samples' fasta files of reads (note each fasta file should correspond to a separate sample) +.TP +.B \-f, --sensing-fasta +location of the fasta file database used to create the sensing matrix. +.TP +.B \-s, --sensing-matrix +location of the sensing matrix. +.TP +.b \-k, --kmer +specify what size of kmer to use. (default value is 6) +.TP +.B \-l, --lambda +lambda value to use. (default value is 10000) +.TP +.B \-j, --jobs +specifies how many jobs to run at once. (default value is the number of CPUs) +.TP +.B \-o, --otu-table +the OTU table, with OTU_FRACTION_PRESENT for each sample which is compatible with QIIME's convert_biom.py (or sequence table if not OTU's) +.TP +.B \-v, --verbose +verbose mode. +.SH USAGE +This program will use a large amount of memory, and CPU time.  +You can reduce the number of cores used, and thus memory, by specifying the -j flag with aspecified number of jobs. Otherwise multifasta_to_otu will run one job per cpu core. +.SH POSTPROCESSING +.B Note: When making your QIIME Metadata file, the sample id's must match the sample fasta file prefix names +.P +4-step QIIME procedure after using Quikr to obtain 3D PCoA graphs: (Note: Our code works much better with WEIGHTED Unifrac as opposed to Unweighted.) +.TP +Pre-requisites: +1. multifasta output file "quikr_output_table.txt" for our example. +.br +2. the tree of the database sequences that were used (e.g.dp7_mafft.fasttree, gg_94_otus_4feb2011.tre, etc.) +.br +3. your-defined <qiime_metadata_file.txt> +.TP +The QIIME procedue: +convert_biom.py -i <quikr_otu_table.txt> -o <quikr_otu>.biom --biom_table_type="otu table" +.br +beta_diversity.py -i <quikr_otu>.biom -m weighted_unifrac -o beta_div -t <tree file> +.br +principal_coordinates.py -i beta_div/weighted_unifrac_<quikr_otu>.txt -o <quikr_otu_project_name>_weighted.txt +.br +make_3d_plots.py -i <quikr_otu_project_name>_weighted.txt -o <3d_pcoa_plotdirectory> -m <qiime_metadata_file> +.SH "SEE ALSO" +\fBquikr\fP(1), \fBquikr_train\fP(1). +.SH AUTHORS +.B multifasta2otu  +was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison  +<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot +.SH REPORTING BUGS +.TP +Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \ +operating system, current compiler, and test files to reproduce your issue. +.SH COPYRIGHT. +Copyright \(co 2013 by Calvin Morrison and Gail Rosen.  Permission to use,  +copy, modify, distribute, and sell this software and its documentation for +any purpose is hereby granted without fee, provided that the above copyright  +notice appear in all copies and that both that copyright notice and this  +permission noticeappear in supporting documentation.  No representations are +made about the suitability of this software for any purpose.  It is provided +"as is" without express or implied warranty. + diff --git a/src/c/multifasta_to_otu.c b/src/c/multifasta_to_otu.c new file mode 100644 index 0000000..268dec6 --- /dev/null +++ b/src/c/multifasta_to_otu.c @@ -0,0 +1,314 @@ +#include <ctype.h> +#include <dirent.h> +#include <errno.h> +#include <getopt.h> +#include <math.h> +#include <omp.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include "nnls.h" +#include "quikr_functions.h" + +#define sensing_matrix(i,j) (sensing_matrix[width*i + j]) +#define solutions(i,j) (solutions[sequences*i+ j]) +#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - create a QIIME OTU table based on Quikr results. \n\nOptions:\n\n-i, --input-directory\n\tthe directory containing the samples' fasta files of reads (note each file should correspond to a separate sample)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (sensing from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-j, --jobs\n\t specifies how many jobs to run at once. (default value is the number of CPUs)\n\n-o, --output\n\tthe OTU table, with OTU_FRACTION_PRESENT for each sample which is compatible with QIIME's convert_biom.py (or a sequence table if not OTU's)\n\n-v, --verbose\n\tverbose mode." + +int main(int argc, char **argv) { + +  int c; + +  char *input_fasta_directory = NULL; +  char *sensing_matrix_filename = NULL; +  char *sensing_fasta_filename = NULL; +  char *output_filename = NULL; + +  double *sensing_matrix; + +  long int width = 0; +  long int sequences = 0; + +  int kmer = 6; +  int lambda = 10000; + +  int x = 0; +  int y = 0; + +  int jobs = 1; +  #ifdef Linux +  jobs = get_nprocs(); +  #endif +  #ifdef Darwin +  jobs = sysconf (_SC_NPROCESSORS_ONLN); +  #endif + +  int verbose = 0; + +  DIR *input_directory_dh; +  struct dirent *entry; + +  while (1) { +    static struct option long_options[] = { +      {"input-directory", required_argument, 0, 'i'}, +      {"kmer",  required_argument, 0, 'k'}, +      {"lambda",  required_argument, 0, 'l'}, +      {"jobs",  required_argument, 0, 'j'}, +      {"output", required_argument, 0, 'o'}, +      {"sensing-fasta",  required_argument, 0, 'f'}, +      {"sensing-matrix", required_argument, 0, 's'}, +      {"verbose", no_argument, 0, 'v'}, +      {0, 0, 0, 0} +    }; +    int option_index = 0; + +    c = getopt_long (argc, argv, "k:l:f:s:i:o:j:hv", long_options, &option_index); + +    if (c == -1) +      break; + +    switch (c) { +      case 'k': +        kmer = atoi(optarg); +        break; +      case 'l': +        lambda = atoi(optarg); +        break; +      case 'f': +        sensing_fasta_filename = optarg; +        break; +      case 's': +        sensing_matrix_filename = optarg; +        break; +      case 'j': +        jobs = atoi(optarg); +        break; +      case 'i': +        input_fasta_directory = optarg; +        break; +      case 'o': +        output_filename = optarg; +        break; +      case 'v': +        verbose = 1; +        break; +      case 'h': +        puts(USAGE); +        exit(EXIT_SUCCESS); +        break; +      default: +        break; +    } +  } + +  if(sensing_matrix_filename == NULL) { +    fprintf(stderr, "Error: sensing matrix filename (-s) must be specified\n\n"); +    fprintf(stderr, "%s\n", USAGE); +    exit(EXIT_FAILURE); +  } +  if(sensing_fasta_filename == NULL) { +    fprintf(stderr, "Error: sensing fasta filename (-f) must be specified\n\n"); +    fprintf(stderr, "%s\n", USAGE); +    exit(EXIT_FAILURE); +  } +  if(output_filename == NULL) { +    fprintf(stderr, "Error: Output Filename (-o) must be specified\n\n"); +    fprintf(stderr, "%s\n", USAGE); +    exit(EXIT_FAILURE); +  } +  if(input_fasta_directory == NULL) { +    fprintf(stderr, "Error: input fasta directory (-i) must be specified\n\n"); +    fprintf(stderr, "%s\n", USAGE); +    exit(EXIT_FAILURE); +  } + +  // set defaults + + +  if(verbose) {  +    printf("kmer: %d\n", kmer); +    printf("lambda: %d\n", lambda); +    printf("input directory: %s\n", input_fasta_directory); +    printf("sensing database: %s\n", sensing_matrix_filename); +    printf("sensing database fasta: %s\n", sensing_fasta_filename); +    printf("output: %s\n", output_filename); +    printf("number of jobs to run at once: %d\n", jobs);  +  } + + +  input_directory_dh = opendir(input_fasta_directory); +  if(input_fasta_directory == NULL) { +    fprintf(stderr, "could not open %s\n", input_fasta_directory); +    exit(EXIT_FAILURE); +  }  + +  // do a directory count +  int dir_count = -2; // -2 for ../ and ./ +  while(entry = readdir(input_directory_dh))  +    dir_count++;  +  rewinddir(input_directory_dh); +  if(dir_count == 0) { +    fprintf(stderr, "%s is empty\n", input_fasta_directory); +    exit(EXIT_FAILURE); +  } + +  // 4 "ACGT" ^ Kmer gives us the size of output rows +  width = pow(4, kmer) + 1; +  sequences = count_sequences(sensing_fasta_filename); + +  if(verbose) { +    printf("directory count: %d\n", dir_count); +    printf("width: %ld\nsequences %ld\n", width, sequences); +  } + +  sensing_matrix = load_sensing_matrix(sensing_matrix_filename, sequences, width); + +  // multiply our matrix by lambda +  for(x = 0; x < sequences; x++) { +    for(y= 0; y < width; y++) { +      sensing_matrix(x, y) = sensing_matrix(x, y) * lambda; +    } +  } + +  // set the first row to be all 1's +  for(x = 0; x < sequences; x++) { +    sensing_matrix(x, 0) = 1.0; +  } + +  double *solutions = malloc(dir_count * sequences * sizeof(double)); +  if(solutions == NULL) { +    fprintf(stderr, "Could not allocate enough memory for solutions vector\n"); +    exit(EXIT_FAILURE); +  } + +  char **filenames = malloc(dir_count * sizeof(char *)); +  if(filenames == NULL) { +    fprintf(stderr, "Could not allocate enough memory\n"); +    exit(EXIT_FAILURE); +  } + +  int *file_sequence_count = malloc(dir_count * sizeof(int)); +  if(file_sequence_count == NULL) { +    fprintf(stderr, "Could not allocate enough memory\n"); +    exit(EXIT_FAILURE); +  } + +  struct dirent result; + +  omp_set_num_threads(jobs); +  int done = 0; +  printf("Beginning to process samples\n");  +#pragma omp parallel for shared(solutions, sequences, width, result, done) +  for(int i = 0; i < dir_count; i++ ) { + +    int z = 0; +    struct dirent *directory_entry; +    char *filename = malloc(256 * sizeof(char)); +    char *base_filename = malloc(256 * sizeof(char)); +    if(filename == NULL || base_filename == NULL) { +      fprintf(stderr, "Could not allocate enough memory\n"); +      exit(EXIT_FAILURE); +    } + +#pragma omp critical +    readdir_r(input_directory_dh, &result, &directory_entry); + +    if(strcmp(directory_entry->d_name, "..") == 0 || strcmp(directory_entry->d_name, ".") == 0) { +      i--; +      continue; +    } + +    // get our base filenames +    strcpy(base_filename, directory_entry->d_name); +    filenames[i] = base_filename;  + +    // get our real filename +    sprintf(filename, "%s/%s", input_fasta_directory, directory_entry->d_name); + +    // get individual sequence count +    file_sequence_count[i] =  count_sequences(filename); + +    // count the kmer amounts  +    double *count_matrix = load_count_matrix(filename, width, kmer); + +    // normalize our kmer counts +    normalize_matrix(count_matrix, 1, width); + +    // multiply our kmers frequency by lambda +    for(z = 0; z < width; z++)  +      count_matrix[z] = count_matrix[z] * lambda; + +    double *sensing_matrix_copy = malloc(sizeof(double) * sequences * width); +    if(sensing_matrix_copy == NULL) { +      fprintf(stderr, "Could not allocate enough memory\n"); +      exit(EXIT_FAILURE); +    } + +    memcpy(sensing_matrix_copy, sensing_matrix, sequences * width * sizeof(double)); + + +    // run nnls +    double *solution = nnls(sensing_matrix_copy, count_matrix, sequences, width); + +    // normalize our solution +    normalize_matrix(solution, 1, sequences); + +    // add the current solution to the solutions array +    for(z = 0; z < sequences; z++ )  { +      solutions(i, z) = solution[z];  +    } + +    done++; +    printf("%d/%d samples processed\n", done, dir_count);  +    free(solution); +    free(count_matrix); +    free(filename); +    free(sensing_matrix_copy); +  } + +  char **headers = load_headers(sensing_fasta_filename, sequences); + +  // output our matrix +  FILE *output_fh = fopen(output_filename, "w"); +  if(output_fh == NULL) {  +    fprintf(stderr, "Could not open %s for writing\n", output_filename); +    exit(EXIT_FAILURE); +  } + +  fprintf(output_fh, "# QIIME vQuikr OTU table\n"); +  fprintf(output_fh, "#OTU_ID\t"); + +  // print our filename headers +  for(x = 0; x < dir_count - 1; x++) { +    fprintf(output_fh, "%s\t", filenames[x]); +  } +  fprintf(output_fh, "%s\n", filenames[dir_count - 1]); + +  // get our actual values  +  for(y = 0; y < sequences; y++) { +    for(x = 0; x < dir_count; x++) { +      solutions(x, y) = round(solutions(x, y) * file_sequence_count[x]); +    } +  } + +  for(y = 0; y < sequences; y++) { + +    double column_sum = 0.; +    for(x = 0; x < dir_count; x++) { +      column_sum += solutions(x, y); +    } + +    // if our column is zero, don't bother printing the row +    if(column_sum != 0) { +      fprintf(output_fh, "%s\t", headers[y]); + +      for(x = 0; x < dir_count - 1; x++) { +        fprintf(output_fh, "%d\t", (int)solutions(x, y)); +      } +      fprintf(output_fh, "%d\n", (int)solutions[sequences*(dir_count - 1) + y]); +    } +  } +  fclose(output_fh); + +  return EXIT_SUCCESS; +} diff --git a/src/c/nnls.c b/src/c/nnls.c new file mode 100644 index 0000000..ddd9f2f --- /dev/null +++ b/src/c/nnls.c @@ -0,0 +1,416 @@ +/* + *   nnls.c  (c) 2002-2009 Turku PET Centre + *   This file contains the routine NNLS (nonnegative least squares) + *   and the subroutines required by it, except h12, which is in + *   file 'lss_h12.c'. + *    + *  This routine is based on the text and fortran code in + *  C.L. Lawson and R.J. Hanson, Solving Least Squares Problems, + *  Prentice-Hall, Englewood Cliffs, New Jersey, 1974. + *  Version: + *  2002-08-19 Vesa Oikonen + *  2003-05-08 Kaisa Sederholm & VO + *  Included function nnlsWght(). + *  2003-05-12 KS + *  Variable a_dim1 excluded + *   + *  Usage of the coefficient matrix altered so that it is + *  given in a[][] instead of a[]. + *  2003-11-06 VO + *  If n<2, then itmax is set to n*n instead of previous n*3. + *  2004-09-17 VO + *  Doxygen style comments. + *  2006-24-04 Pauli Sundberg + *  Added some debuging output, and made some comments more precise. + *  2007-05-17 VO + *  2009-04-16 VO + *  Corrected a bug in nnls() which may have caused an infinite loop. + *  009-04-27 VO + *  Added function nnlsWghtSquared() for faster pixel-by-pixel calculations. + *  Checking for exceeding iteration count is corrected in nnls(). + */  +#include <stdio.h> +#include <assert.h> +#include <stdlib.h> +#include <math.h> +#define MAX(a,b) ((a) >= (b) ? (a) : (b)) +#define ABS(x) ((x) >= 0 ? (x) : -(x)) + +int h12( int mode, int lpivot, int l1, int m, double *u, int u_dim1, double *up, double *cm, int ice, int icv, int ncv) { +  double d1,  b, clinv, cl, sm; +  int k, j; + +  /* Check parameters */ +  if (mode!=1 && mode!=2)  +    return(1); +  if (m<1 || u==NULL || u_dim1<1 || cm==NULL)  +    assert(0); +  //     return(1); +  if (lpivot<0 || lpivot>=l1 || l1>m)  +    //     assert(0); +    return(1); + +  /* Function Body */ +  cl = ABS( u[lpivot*u_dim1] ); +  // cl= (d1 = u[lpivot*u_dim1], fabs(d1)); + +  if (mode==2)  +  { /* Apply transformation I+U*(U**T)/B to cm[] */ +    if(cl<=0.)  +      //       assert(0); +      return(0); +  }  +  else  +  {   /* Construct the transformation */ + + +    /* This is the way provided in the original pseudocode +       sm = 0; +       for (j = l1; j < m; j++) +       { +       d1 =  u[j * u_dim1]; +       sm += d1*d1; +       } +       d1 = u[lpivot * u_dim1]; +       sm += d1*d1; +       sm = sqrt(sm); + +       if (u[lpivot*u_dim1] > 0)  +       sm=-sm; + +       up[0] = u[lpivot*u_dim1] - sm;  +       u[lpivot*u_dim1]=sm; +       printf("Got sum: %f\n",sm); +       */ + +    /* and this trying to compensate overflow */ +    for (j=l1; j<m; j++)  +    {  // Computing MAX  +      cl = MAX( ABS( u[j*u_dim1] ), cl ); +    } +    // zero vector?    + +    if (cl<=0.)  +      return(0); + +    clinv=1.0/cl; + +    // Computing 2nd power  +    d1=u[lpivot*u_dim1]*clinv;  +    sm=d1*d1; + +    for (j=l1; j<m; j++)  +    { +      d1=u[j*u_dim1]*clinv;  +      sm+=d1*d1; +    } +    cl *= sqrt(sm);  + +    if (u[lpivot*u_dim1] > 0.)  +      cl=-cl; +    up[0] = u[lpivot*u_dim1] - cl;  +    u[lpivot*u_dim1]=cl; +  } + +  // no vectors where to apply? only change pivot vector!  +  b=up[0] * u[lpivot*u_dim1]; + +  /* b must be nonpositive here; if b>=0., then return */ +  if (b == 0)  +    return(0); + +  // ok, for all vectors we want to apply +  for (j =0; j < ncv; j++) { +    sm = cm[ lpivot * ice + j * icv ] * (up[0]); + +    for (k=l1; k<m; k++)  +      sm += cm[ k * ice + j*icv ] * u[ k*u_dim1 ];  + +    if (sm != 0.0) { +      sm *= (1/b);  +      // cm[lpivot, j] = .. +      cm[ lpivot * ice + j*icv] += sm * (up[0]); +      for (k= l1; k<m; k++)  +      { +        cm[ k*ice + j*icv] += u[k * u_dim1]*sm; +      } +    } +  } + +  return(0); +}  + + +void g1(double a, double b, double *cterm, double *sterm, double *sig) +{ +  double d1, xr, yr; + +  if( fabs(a) > fabs(b) ) { +    xr = b / a; +    d1 = xr; +    yr = sqrt(d1*d1 + 1.); +    d1 = 1./yr; +    *cterm=(a>=0.0 ? fabs(d1) : -fabs(d1)); +    *sterm=(*cterm)*xr; +    *sig=fabs(a)*yr; +  } else if( b != 0.) { +    xr = a / b; +    d1 = xr; +    yr = sqrt(d1 * d1 + 1.); +    d1 = 1. / yr; +    *sterm=(b>=0.0 ? fabs(d1) : -fabs(d1)); +    *cterm=(*sterm)*xr; *sig=fabs(b)*yr; +  } else { +    *sig=0.; *cterm=0.; *sterm=1.; +  } +}  + + +int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm) { +  int pfeas; +  int ret=0; +  int iz; +  int jz; +  int k, j=0, l, itmax, izmax=0, ii, jj=0, ip; +  double d1, d2, sm, up, ss;  +  double temp, wmax, t, alpha, asave, dummy, unorm, ztest, cc; + + +  /* Check the parameters and data */ +  if(m <= 0 || n <= 0 || a == NULL || b == NULL || x == NULL)  +    return(2); + +  /* Allocate memory for working space, if required */ +  double *w = (double*)calloc(n, sizeof(double)); +  double *zz = (double*)calloc(m, sizeof(double)); +  int *index = (int*)calloc(n, sizeof(int)); +  if(w == NULL || zz == NULL || index == NULL)  +    return(2); + +  /* Initialize the arrays INDEX[] and X[] */ +  for(k=0; k<n; k++) { +    x[k]=0.; +    index[k]=k; +  } + +  int iz2 = n - 1; +  int iz1 = 0; +  int iter=0;  +  int nsetp=0; +  int npp1=0; + +  /* Main loop; quit if all coeffs are already in the solution or */ +  /* if M cols of A have been triangularized */ +  if(n < 3)  +    itmax=n*3; +  else  +    itmax=n*n; +  + +  while(iz1 <= iz2 && nsetp < m) { +    /* Compute components of the dual (negative gradient) vector W[] */ +    for(iz=iz1; iz<=iz2; iz++) { +      j=index[iz]; +      sm=0.; +      for(l=npp1; l<m; l++)  +        sm+=a[j*m + l]*b[l]; +      w[j]=sm; +    } + +    while(1) { +      /* Find largest positive W[j] */ +      for(wmax=0., iz=iz1; iz<=iz2; iz++) { +        j=index[iz]; if(w[j]>wmax) {wmax=w[j]; izmax=iz;}} + +      /* Terminate if wmax<=0.; */ +      /* it indicates satisfaction of the Kuhn-Tucker conditions */ +      if(wmax<=0.0) +        break; + +      iz=izmax;  +      j=index[iz]; + +      /* The sign of W[j] is ok for j to be moved to set P. */ +      /* Begin the transformation and check new diagonal element to avoid */ +      /* near linear dependence. */ +      asave=a[j*m + npp1]; +      h12(1, npp1, npp1+1, m, &a[j*m +0], 1, &up, &dummy, 1, 1, 0); +      unorm=0.; +      if(nsetp!=0){ +        for(l=0; l<nsetp; l++) { +          d1=a[j*m + l];  +          unorm+=d1*d1; +        } +      } + +      unorm=sqrt(unorm); +      d2=unorm+(d1=a[j*m + npp1], fabs(d1)) * 0.01; +      if((d2-unorm)>0.) { +        /* Col j is sufficiently independent. Copy B into ZZ, update ZZ */ +        /* and solve for ztest ( = proposed new value for X[j] ) */ +        for(l=0; l<m; l++) zz[l]=b[l]; +        h12(2, npp1, npp1+1, m, &a[j*m + 0], 1, &up, zz, 1, 1, 1); +        ztest=zz[npp1]/a[j*m +npp1]; +        /* See if ztest is positive */ +        if(ztest>0.) break; +      } + +      /* Reject j as a candidate to be moved from set Z to set P. Restore */ +      /* A[npp1,j], set W[j]=0., and loop back to test dual coeffs again */ +      a[j*m+ npp1]=asave; w[j]=0.; +    } /* while(1) */ + +    if(wmax<=0.0) +      break; + +    /* Index j=INDEX[iz] has been selected to be moved from set Z to set P. */ +    /* Update B and indices, apply householder transformations to cols in */ +    /* new set Z, zero subdiagonal elts in col j, set W[j]=0. */ +    for(l=0; l<m; ++l)  +      b[l]=zz[l]; + +    index[iz]=index[iz1]; +    index[iz1]=j; +    iz1++; +    npp1++; +    nsetp=npp1; + +    if(iz1<=iz2) { +     for(jz=iz1; jz<=iz2; jz++) { +        jj=index[jz]; +        h12(2, nsetp-1, npp1, m, &a[j*m +0], 1, &up, &a[jj*m +0], 1, m, 1); +      } +    } + +    if(nsetp!=m) { +      for(l=npp1; l<m; l++)  +        a[j*m +l]=0.; +    } + +    w[j]=0.; + +    /* Solve the triangular system; store the solution temporarily in Z[] */ +    for(l=0; l<nsetp; l++) { +      ip=nsetp-(l+1); +      if(l!=0) for(ii=0; ii<=ip; ii++) zz[ii]-=a[jj*m + ii]*zz[ip+1]; +      jj=index[ip]; zz[ip]/=a[jj*m +ip]; +    } + +    /* Secondary loop begins here */ +    while(++iter < itmax) { +      /* See if all new constrained coeffs are feasible; if not, compute alpha */ +      for(alpha = 2.0, ip = 0; ip < nsetp; ip++) { +        l=index[ip]; +        if(zz[ip]<=0.) { +          t = -x[l]/(zz[ip]-x[l]); +          if(alpha > t) { +            alpha = t;  +            jj = ip - 1; +          } +        } +      } + +      /* If all new constrained coeffs are feasible then still alpha==2. */ +      /* If so, then exit from the secondary loop to main loop */ +      if(alpha==2.0)  +        break; + +      /* Use alpha (0.<alpha<1.) to interpolate between old X and new ZZ */ +      for(ip=0; ip<nsetp; ip++) { +        l = index[ip]; +        x[l] += alpha*(zz[ip]-x[l]); +      } + +      /* Modify A and B and the INDEX arrays to move coefficient i */ +      /* from set P to set Z. */ +      k=index[jj+1]; pfeas=1; +      do { +        x[k]=0.; +        if(jj!=(nsetp-1)) { +          jj++; +          for(j=jj+1; j<nsetp; j++) { +            ii=index[j]; index[j-1]=ii; +            g1(a[ii*m + (j-1)], a[ii*m + j], &cc, &ss, &a[ii*m + j-1]); +            for(a[ii*m + j]=0., l=0; l<n; l++) if(l!=ii) { +              /* Apply procedure G2 (CC,SS,A(J-1,L),A(J,L)) */ +              temp=a[l*m + j-1]; +              a[l*m + j-1]=cc*temp+ss*a[l*m + j]; +              a[l*m + j]=-ss*temp+cc*a[l*m + j]; +            } +            /* Apply procedure G2 (CC,SS,B(J-1),B(J)) */ +            temp=b[j-1]; b[j-1]=cc*temp+ss*b[j]; b[j]=-ss*temp+cc*b[j]; +          } +        } +        npp1=nsetp-1; nsetp--; iz1--; index[iz1]=k; + +        /* See if the remaining coeffs in set P are feasible; they should */ +        /* be because of the way alpha was determined. If any are */ +        /* infeasible it is due to round-off error. Any that are */ +        /* nonpositive will be set to zero and moved from set P to set Z */ +        for(jj=0, pfeas=1; jj<nsetp; jj++) { +          k=index[jj]; if(x[k]<=0.) {pfeas=0; break;} +        } +      } while(pfeas==0); + +      /* Copy B[] into zz[], then solve again and loop back */ +      for(k=0; k<m; k++)  +        zz[k]=b[k]; +      for(l=0; l<nsetp; l++) { +        ip=nsetp-(l+1); +        if(l!=0) for(ii=0; ii<=ip; ii++) zz[ii]-=a[jj*m + ii]*zz[ip+1]; +        jj=index[ip]; zz[ip]/=a[jj*m + ip]; +      } +    } /* end of secondary loop */ + +    if(iter>=itmax) { +      ret = 1; +      break; +    } + +    for(ip=0; ip<nsetp; ip++) { +      k=index[ip]; +      x[k]=zz[ip]; +      } +  } /* end of main loop */ + +  /* Compute the norm of the final residual vector */ +  sm=0.; + +  if (rnorm != NULL) { +    if (npp1<m)  +      for (k=npp1; k<m; k++)  +        sm+=(b[k] * b[k]); +    else  +      for (j=0; j<n; j++)  +        w[j]=0.; +    *rnorm=sqrt(sm); +  }  + +  /* Free working space, if it was allocated here */ +  free(w); +  free(zz); +  free(index); +  return(ret); +} +/* nnls_ */ + + +double *nnls(double *a_matrix, double *b_matrix, int height, int width) { + +  double *solution = (double*)calloc(height, sizeof(double)); + +  if(solution == NULL) { +    fprintf(stderr, "could not allocate enough memory for nnls\n"); +    exit(EXIT_FAILURE); +  } + +  int ret = nnls_algorithm(a_matrix, width, height, b_matrix, solution, NULL); +  if(ret == 1) { +    printf("NNLS has reached the maximum iterations\n"); +  } else if(ret == 2) { +    fprintf(stderr, "NNLS could not allocate enough memory\n"); +    exit(EXIT_FAILURE); +  } + +  return solution; +} diff --git a/src/c/nnls.h b/src/c/nnls.h new file mode 100644 index 0000000..ded8d01 --- /dev/null +++ b/src/c/nnls.h @@ -0,0 +1 @@ +double *nnls(double *a_matrix, double *b_matrix, int height, int width); diff --git a/src/c/quikr.1 b/src/c/quikr.1 new file mode 100644 index 0000000..9982d94 --- /dev/null +++ b/src/c/quikr.1 @@ -0,0 +1,70 @@ +.TH quikr 1 quikr-2013-05 +.SH NAME +quikr \- Calculate estimated frequencies of bacteria in a sample. +.SH SYNOPSIS +.B quikr +.RB [ \-i +.IR input] +.RB [ \-f +.IR sensing-fasta] +.RB [ \-s +.IR sensing-matrix] +.RB [ \-l +.IR lambda ] +.RB [ \-k +.IR kmer ] +.RB [ \-o +.IR output ] +.RB [ \-v ] +.P +.BR quikr " ..." +.SH DESCRIPTION +.B quikr +Quikr returns the estimated frequencies of batcteria present when given a +input FASTA file and a trained sensing matrix +.P +.SH OPTIONS +.TP +.B \-i, --input +the sample's fasta file of NGS. READS (fasta format) +.TP +.B \-f, --sensing-fasta +location of the fasta file database used to create the sensing matrix. (fasta format) +.TP +.B \-s, --sensing-matrix +location of the sensing matrix. (trained from quikr_train) +.TP +.B \-k, --kmer +specify what size of kmer to use. (default value is 6) +.TP +.B \-l, --lambda +lambda value to use. (default value is 10000) +.TP +.B \-o, --output +OTU_FRACTION_PRESENT a vector representing the percentage of database sequence's presence in sample. (csv output) +.TP +.B \-v, --verbose +verbose mode. +.SH EXAMPLES +Use quikr to calculate the estimated frequencies for sample.fa, using rdp7.fasta as the sensing matrix we generated with quikr_train. This uses 6-mers by default, and a lambda value of 10000: +.P +quikr -i sample.fa -f rdp7.fa -s rdp_sensing_matrix.gz -o frequencies.txt +.SH "SEE ALSO" +\fBmultifasta_to_otu\fP(1), \fBquikr_train\fP(1). +.SH AUTHORS +.B quikr +was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison  +<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot +.SH REPORTING BUGS +.TP +Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \ +operating system, current compiler, and test files to reproduce your issue. +.SH COPYRIGHT. +Copyright \(co 2013 by Calvin Morrison and Gail Rosen.  Permission to use,  +copy, modify, distribute, and sell this software and its documentation for +any purpose is hereby granted without fee, provided that the above copyright  +notice appear in all copies and that both that copyright notice and this  +permission noticeappear in supporting documentation.  No representations are +made about the suitability of this software for any purpose.  It is provided +"as is" without express or implied warranty. + diff --git a/src/c/quikr.c b/src/c/quikr.c new file mode 100644 index 0000000..c73e0dd --- /dev/null +++ b/src/c/quikr.c @@ -0,0 +1,192 @@ +#include <ctype.h> +#include <errno.h> +#include <getopt.h> +#include <math.h> +#include <stdio.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "nnls.h" +#include "quikr_functions.h" + +#define sensing_matrix(i,j) (sensing_matrix[width*i + j]) +#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - Calculate estimated frequencies of bacteria in a sample.\n\nOptions:\n\n-i, --input\n\tthe sample's fasta file of NGS READS (fasta format)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (trained from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-o, --output\n\tthe sensing matrix. (a gzip'd text file)\n\n-v, --verbose\n\tverbose mode." + +int main(int argc, char **argv) { + + +  int c; +  int kmer = 0; + +  char *input_fasta_filename = NULL; +  char *sensing_matrix_filename = NULL; +  char *sensing_fasta_filename = NULL; +  char *output_filename = NULL; + +  int x = 0; +  int y = 0; +  int verbose = 0; +  int lambda = 0; +   + + +  while (1) { +    static struct option long_options[] = { +      {"input", required_argument, 0, 'i'}, +      {"kmer",  required_argument, 0, 'k'}, +      {"lambda",  required_argument, 0, 'l'}, +      {"output", required_argument, 0, 'o'}, +      {"sensing-fasta",  required_argument, 0, 'f'}, +      {"sensing-matrix", required_argument, 0, 's'}, +      {"verbose", no_argument, 0, 'v'}, +      {0, 0, 0, 0} +    }; + +    int option_index = 0; + +    c = getopt_long (argc, argv, "k:l:f:s:i:o:hdv", long_options, &option_index); + +    if (c == -1) +      break; + +    switch (c) { +      case 0: +      case 'k': +        kmer = atoi(optarg); +        break; +      case 'l': +        lambda = atoi(optarg); +        break; +      case 'f': +        sensing_fasta_filename = optarg; +        break; +      case 's': +        sensing_matrix_filename = optarg; +        break; +      case 'i': +        input_fasta_filename = optarg; +        break; +      case 'o': +        output_filename = optarg; +        break; +      case 'v': +        verbose = 1; +        break; +      case 'h': +        printf("%s\n", USAGE); +        exit(EXIT_SUCCESS); +        break; +      default: +        break; +    } +  } + + +  if(sensing_matrix_filename == NULL) { +    fprintf(stderr, "Error: sensing matrix filename (-s) must be specified\n\n"); +    fprintf(stderr, "%s\n", USAGE); +    exit(EXIT_FAILURE); +  } +  if(sensing_fasta_filename == NULL) { +    fprintf(stderr, "Error: sensing matrix filename (-f) must be specified\n\n"); +    fprintf(stderr, "%s\n", USAGE); +    exit(EXIT_FAILURE); +  } +  if(output_filename == NULL) { +    fprintf(stderr, "Error: Output Filename (-o) must be specified\n\n"); +    fprintf(stderr, "%s\n", USAGE); +    exit(EXIT_FAILURE); +  } +  if(input_fasta_filename == NULL) { +    fprintf(stderr, "Error: input fasta file (-i) must be specified\n\n"); +    fprintf(stderr, "%s\n", USAGE); +    exit(EXIT_FAILURE); +  } + +  if(lambda == 0)  +    lambda = 10000; +  if(kmer == 0)  +    kmer = 6; + +  if(verbose) {  +    printf("kmer: %d\n", kmer); +    printf("lambda: %d\n", lambda); +    printf("fasta: %s\n", input_fasta_filename); +    printf("sensing database: %s\n", sensing_matrix_filename); +    printf("sensing database fasta: %s\n", sensing_fasta_filename); +    printf("output: %s\n", output_filename); +  } +  // 4 "ACGT" ^ Kmer gives us the size of output rows +  int width = pow(4, kmer); +  width = width + 1; + +  int sequences = count_sequences(sensing_fasta_filename); + +  if(verbose) { +    printf("width: %d\nsequences %d\n", width, sequences); +  } + +  double *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, sequences, width); +  double *count_matrix = load_count_matrix(input_fasta_filename, width, kmer); + +  // multiply our matrix by lambda +  for(x = 1; x < sequences; x++) { +    for(y= 0; y < width - 1; y++) { +      sensing_matrix(x, y) = sensing_matrix(x, y) * lambda; +    } +  } + +  for(x= 0; x < sequences; x++) { +    sensing_matrix(x, 0) = 1.0; +  } +  // normalize our count_matrix +  normalize_matrix(count_matrix, 1, width); +  for(x = 0; x < width; x++)  +    count_matrix[x] = count_matrix[x] * lambda; +   +  // output our matricies if we are in verbose mode +  if(verbose) {  +    FILE *sensing_matrix_fh = fopen( "sensing.matrix", "w"); +    if(sensing_matrix_fh == NULL) { +      fprintf(stderr, "could not open sensing.matrix for writing.\n"); +      exit(EXIT_FAILURE); +    } +    for(x = 0; x < sequences; x++) { +      for( y = 0; y < width; y++) { +        fprintf(sensing_matrix_fh, "%.10f\t", sensing_matrix(x, y)); +      } +      fprintf(sensing_matrix_fh, "\n"); +    } +    fclose(sensing_matrix_fh); + +    FILE *count_matrix_fh = fopen("count.matrix", "w"); +    if(count_matrix_fh == NULL) { +      fprintf(stderr, "could not open sensing.matrix for writing.\n"); +      exit(EXIT_FAILURE); +    } +    for(x = 0; x < width; x++) { +      fprintf(count_matrix_fh, "%.10f\n", count_matrix[x]); +    } +    fclose(count_matrix_fh); +  } + +  double *solution = nnls(sensing_matrix, count_matrix, sequences, width); + +  // normalize our solution vector +  normalize_matrix(solution, 1, sequences); + +  // output our matrix +  FILE *output_fh = fopen(output_filename, "w"); +  if(output_fh == NULL) {  +    fprintf(stderr, "Could not open %s for writing\n", output_filename); +    exit(EXIT_FAILURE); +  } +  for(x = 0; x < sequences; x++) { +      fprintf(output_fh, "%.10lf\n", solution[x]); +  } +  fclose(output_fh); + +  return EXIT_SUCCESS; +} diff --git a/src/c/quikr_functions.c b/src/c/quikr_functions.c new file mode 100644 index 0000000..7e18e64 --- /dev/null +++ b/src/c/quikr_functions.c @@ -0,0 +1,163 @@ +#include <stdio.h> +#include <stdio.h> +#include <errno.h> +#include <ctype.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <math.h> + +int count_sequences(char *filename) {  +  char command[512]; +  int sequences = 0; +  FILE *grep_output; + +  sprintf(command, "grep -c ^\\> %s", filename); +  grep_output = popen(command, "r"); +  if(grep_output == NULL) { +    fprintf(stderr, "Could not execute %s\n", command); +    exit(EXIT_FAILURE); +  } +   +  fscanf(grep_output, "%d", &sequences); + +  pclose(grep_output); +  return sequences; +} + + +void normalize_matrix(double *matrix, int height, int width) { +  int x = 0; +  int y = 0; +  for(x = 0; x < height; x++) { + +    double row_sum = 0; + +    for(y = 0; y < (width); y++) +      row_sum = row_sum + matrix[width * x + y]; +    for(y = 0; y < (width); y++)  +      matrix[width * x + y] = matrix[width * x + y] / row_sum;  +  } +} + + +double *load_count_matrix(char *filename, int width, int kmer) { + +  double *count_matrix = malloc((width)*sizeof(double)); +  char count_command[512]; +  int x = 0; +  char *line = NULL; +  size_t len = 0; + +  if(count_matrix == NULL) { +    fprintf(stderr, "could not allocate enough memory for the count matrix (%d x double) \n", width); +    exit(EXIT_FAILURE); +  } + +  // create out count matrix +  sprintf(count_command, "count-kmers -r %d -1 -u %s", kmer, filename); +  FILE *count_output = popen(count_command, "r"); +  if(count_output == NULL) { +    fprintf(stderr, "could not execute \"%s\"", count_command); +    exit(EXIT_FAILURE); +  } + +  // set first element to zero. +  count_matrix[0] = 0; + +  // get our first line +  getline(&line, &len, count_output); +  count_matrix[1] = atoi(line); + +  // iterate over the rest of the lines +  for(x = 2; x < width; x++) {  +    getline(&line, &len, count_output); +    count_matrix[x] = atoi(line); +  }  + +  pclose(count_output); + +  return count_matrix; +} + + +double *load_sensing_matrix(char *filename, int height, int width) { + +  int x = 0; +  int y = 0; + +  char *line = NULL; +  char *val; +  char command[512]; +  size_t len = 0; +  FILE *sensing_matrix_fh = NULL; + +  double *sensing_matrix = malloc(height * width * sizeof(double)); +  if(sensing_matrix == NULL) { +    fprintf(stderr, "Could not allocate memory for the sensing matrix\n"); +  } + +  sprintf(command, "gzip -cd %s", filename); +  sensing_matrix_fh = popen(command, "r"); +  if(sensing_matrix_fh == NULL) { +    fprintf(stderr, "could not open %s", filename); +    exit(EXIT_FAILURE); +  } + +  // read our sensing matrix in +  for(x = 0; x < height; x++) { +    getline(&line, &len, sensing_matrix_fh); +    char *ptr; + +    // Read our first element in outside of the loop +    val = strtok_r(line,"\t", &ptr); +    sensing_matrix[width*x + 1] = atof(val); +    // iterate through and load the array +    for (y = 2; y < width; y++) { +      val = strtok_r(NULL, "\t", &ptr); +      sensing_matrix[width*x + y] = atof(val); +    } +  } + +  pclose(sensing_matrix_fh); + +  return sensing_matrix; +} + +char **load_headers(char *filename, int sequences) { +  char command[512]; +  char *line= NULL; +  int x = 0; +  FILE *grep_output; +  size_t len = 0; + +  sprintf(command, "grep ^\\> %s", filename); +  grep_output = popen(command, "r"); +  if(grep_output == NULL) { +    fprintf(stderr, "Could not execute %s\n", command); +    exit(EXIT_FAILURE); +  } +   +  char **headers = malloc(sequences * sizeof(char *)); +  if(headers == NULL) { +    fprintf(stderr, "could not allocated enough memory\n"); +    exit(EXIT_FAILURE); +  } + +  for(x = 0; x < sequences; x++) { + +    char *header = malloc(256 * sizeof(char)); +    if(header == NULL) { +      fprintf(stderr, "could not allocated enough memory\n"); +      exit(EXIT_FAILURE); +    } +    getline(&line, &len, grep_output); +    sscanf(line + 1, "%s", header); +    headers[x] = header; +  } + +  pclose(grep_output); + +  return headers; +} + diff --git a/src/c/quikr_functions.h b/src/c/quikr_functions.h new file mode 100644 index 0000000..1826427 --- /dev/null +++ b/src/c/quikr_functions.h @@ -0,0 +1,14 @@ +// count the sequences in a fasta file +int count_sequences(char *filename); + +// normalize a matrix by dividing each element by the sum of it's column +void normalize_matrix(double *matrix, int height, int width); + +// load a sensing matrix +double *load_sensing_matrix(char *filename, int height, int width); + +// load a count matrix +double *load_count_matrix(char *filename, int width, int kmer); + +// load headers +char **load_headers(char *filename, int sequences);  diff --git a/src/c/quikr_train.1 b/src/c/quikr_train.1 new file mode 100644 index 0000000..dd1493f --- /dev/null +++ b/src/c/quikr_train.1 @@ -0,0 +1,61 @@ +.TH quikr_train 1 quikr_train-2013-05 +.SH NAME +quikr_train \- train databases for use with Quikr +.SH SYNOPSIS +.B quikr_train +.RB [ \-i +.IR input] +.RB [ \-o +.IR output] +.RB [ \-k +.IR kmer ] +.RB [ \-v ] +.P +.BR quikr " ..." +.SH DESCRIPTION +.B quikr +The quikr_train is a utility to train a database for use with quikr. +Before running the quikr utility, you need to generate the sensing matrix or +download a pretrained one from our project's homepage. + +quikr_train returns a sensing matrix that can be used with the quikr +function.  +.P +.SH OPTIONS +.TP +.B \-i, --input +the database of sequences to create the sensing matrix. (fasta format) +.TP +.B \-k, --kmer +specify what size of kmer to use. (default value is 6) +.TP +.B \-o, --output +the sensing matrix. (a gzip'd text file) +.TP +.B \-v, --verbose +verbose mode. +.SH EXAMPLES +Use quikr_train to generate a sensing matrix from rdp7.fasta. This uses 6mers by default. +.P +quikr_train -i rdp7.fa -o rd7_sensing_matrix.gz +.SH USAGE +If you do not have a .gz file on your output matrix name, it will be appended. +.SH "SEE ALSO" +\fBmultifasta_to_otu\fP(1), \fBquikr\fP(1). +.SH AUTHORS +.B quikr +was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison  +<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot +.SH REPORTING BUGS +.TP +Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \ +operating system, current compiler, and test files to reproduce your issue. +.SH COPYRIGHT. +Copyright \(co 2013 by Calvin Morrison and Gail Rosen.  Permission to use,  +copy, modify, distribute, and sell this software and its documentation for +any purpose is hereby granted without fee, provided that the above copyright  +notice appear in all copies and that both that copyright notice and this  +permission noticeappear in supporting documentation.  No representations are +made about the suitability of this software for any purpose.  It is provided +"as is" without express or implied warranty. + diff --git a/src/c/quikr_train.c b/src/c/quikr_train.c new file mode 100644 index 0000000..d2a83ef --- /dev/null +++ b/src/c/quikr_train.c @@ -0,0 +1,176 @@ +#include <ctype.h> +#include <errno.h> +#include <getopt.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <zlib.h> + +#include "quikr_functions.h" + +#define AWK_KMER_PERMUTATIONS "awk 'function p(l,v,i){for(i in A) {if(l<%d) p(l+1, (v?v\"\":x)i); else print v\"\"i;}} {A[$0]} END {p(1);} ' <<<$'A\nC\nG\nT'" +#define USAGE "Usage:\n\tquikr_train [OPTION...] - to train a database for use with quikr.\n\nOptions:\n\n-i, --input\n\tthe database of sequences to create the sensing matrix (fasta format)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-o, --output\n\tthe sensing matrix. (a gzip'd text file)\n\n-v, --verbose\n\tverbose mode." + +int main(int argc, char **argv) { + +  char probabilities_command[512]; +  char kmers_file[256]; +  char *line = NULL; +  char *val; +  size_t len = 0; + + +  int c; +  int kmer = 0; + +  char *fasta_file = NULL; +  char *output_file = NULL; + +  int x = 0; +  int y = 0; + +  int verbose = 0; + +  gzFile output = NULL; + +  while (1) { +    static struct option long_options[] = { +      {"verbose", no_argument, 0, 'v'}, +      {"input", required_argument, 0, 'i'}, +      {"kmer",  required_argument, 0, 'k'}, +      {"output", required_argument, 0, 'o'}, +      {0, 0, 0, 0} +    }; + +    int option_index = 0; + +    c = getopt_long (argc, argv, "i:o:k:hv", long_options, &option_index); + +    if (c == -1) +      break; + +    switch (c) { +      case 'i': +        fasta_file = optarg; +        break; +      case 'k': +        kmer = atoi(optarg); +        break; +      case 'o': +        output_file = optarg; +        break; +      case 'v': +        verbose = 1; +        break; +      case 'h': +        printf("%s\n", USAGE); +        exit(EXIT_SUCCESS); +        break; +      case '?': +        /* getopt_long already printed an error message. */ +        break; +      default: +        exit(EXIT_FAILURE); +    } +  } + + +  if(fasta_file == NULL) { +    fprintf(stderr, "Error: input fasta file (-i) must be specified\n\n"); +    fprintf(stderr, "%s\n", USAGE); +    exit(EXIT_FAILURE); +  } + +  if(output_file == NULL) { +    fprintf(stderr, "Error: output directory (-o) must be specified\n\n"); +    fprintf(stderr, "%s\n", USAGE); +    exit(EXIT_FAILURE); +  } + +  if(kmer == 0) +    kmer = 6; + +  if(verbose) { +    printf("kmer size: %d\n", kmer); +    printf("fasta file: %s\n", fasta_file); +    printf("output file: %s\n", output_file); +  } + +  if(strcmp(&output_file[strlen(output_file) - 3], ".gz") != 0) { +    char *temp = malloc(sizeof(strlen(output_file) + 4)); +    sprintf(temp, "%s.gz", output_file); +    output_file = temp; +    printf("appending a .gz to our output file: %s\n", output_file); +  } + +  // 4 ^ Kmer gives us the width, or the number of permutations of ACTG with kmer length +  int width = pow(4, kmer); +  int sequences = count_sequences(fasta_file); + +  if(verbose) +    printf("sequences: %d\nwidth: %d\n", sequences, width); + +  // Allocate our matrix with the appropriate size, just one row +  double *trained_matrix = malloc(width*sizeof(double)); +  if(trained_matrix == NULL) { +    fprintf(stderr, "Could not allocated enough memory\n"); +    exit(EXIT_FAILURE); +  } + +  // call the probabilities-by-read command +  sprintf(kmers_file, AWK_KMER_PERMUTATIONS, kmer); +  sprintf(probabilities_command, "%s | probabilities-by-read %d %s /dev/stdin", kmers_file, kmer, fasta_file); +  FILE *probabilities_output = popen(probabilities_command, "r"); +  if(probabilities_output == NULL) { +    fprintf(stderr, "Error could not execute: %s\n", probabilities_command); +    exit(EXIT_FAILURE); +  } + +  // open our output file +  output = gzopen(output_file, "w6"); +  if(output == NULL) { +    fprintf(stderr, "Error: could not open output file, error code: %d", errno); +    exit(EXIT_FAILURE); +  } + +  if(verbose) +    printf("Writing our sensing matrix to %s\n", output_file); + +  // read normalize and write our matrix  in one go +  for(x = 0; x < sequences; x++) { + +    getline(&line, &len, probabilities_output); + +    // Read our first element in outside of the loop +    val = strtok(line,"\t\n\r"); +    trained_matrix[0] = atof(val); +    // iterate through and load the array +    for (y = 1; y < width; y++) { +      val = strtok (NULL, "\t\n\r"); +      trained_matrix[y] = atof(val); +    } + +    double row_sum = 0; + +    for( y = 0; y < width; y++) { +      row_sum = row_sum + trained_matrix[y]; +    } + +    for( y = 0; y < width; y++) { +      trained_matrix[y] = trained_matrix[y] / row_sum; +    } + +    for( y = 0; y < width; y++) { +      gzprintf(output, "%.10f\t", trained_matrix[y]); +    } +    gzprintf(output, "\n"); +  } + +  free(trained_matrix); +  gzclose(output); +  pclose(probabilities_output); + +  return 0; +} | 
