release1.0

author: Calvin <calvin@EESI> 2013-05-14 21:51:40 -0400
committer: Calvin <calvin@EESI> 2013-05-14 21:51:40 -0400
commit: 0773aaf89678b967588a902df1f5e6f9ccea393d (patch)
tree: 40762e5df1da876d460d8695357ab0835645e8c6 /src/c
parent: 1d2becc9af591d37badfe0e77751bbb80932472f (diff)
11 files changed, 1516 insertions, 0 deletions
diff --git a/src/c/Makefile b/src/c/Makefile
new file mode 100644
index 0000000..ba0c799
--- /dev/null
+++ b/src/c/Makefile
@@ -0,0 +1,14 @@
+UNAME := $(shell uname)
+CC = gcc
+QUIKR_TRAIN_CFLAGS = -O3 -s -mtune=native -Wall -lm -lz -D$(UNAME)
+QUIKR_CFLAGS = -O3 -s -mtune=native -Wextra -Wall -lm -pthread -L../ -I../ -std=gnu99 -fopenmp -D$(UNAME)
+all: quikr_train_ quikr_ multifasta_to_otu_
+
+multifasta_to_otu_:
+	$(CC) multifasta_to_otu.c quikr_functions.c nnls.c -o multifasta_to_otu $(QUIKR_CFLAGS)
+quikr_train_:
+	$(CC) quikr_train.c quikr_functions.c -o quikr_train $(QUIKR_TRAIN_CFLAGS)
+quikr_:
+	$(CC) quikr.c quikr_functions.c nnls.c -o quikr $(QUIKR_CFLAGS) -I../
+clean:
+	rm -vf quikr_train quikr multifasta_to_otu
diff --git a/src/c/multifasta_to_otu.1 b/src/c/multifasta_to_otu.1
new file mode 100644
index 0000000..7878ef0
--- /dev/null
+++ b/src/c/multifasta_to_otu.1
@@ -0,0 +1,95 @@
+.TH multifasta_to_otu 1 multifasta_to_otu-2013-05
+.SH NAME
+multifasta_to_otu \- create a QIIME OTU table based on Quikr results.
+.SH SYNOPSIS
+.B multifasta_to_otu
+.RB [ \-i
+.IR input-directory ]
+.RB [ \-f
+.IR sensing-fasta]
+.RB [ \-s
+.IR sensing-matrix]
+.RB [ \-k
+.IR kmer ]
+.RB [ \-l
+.IR lambda ]
+.RB [ \-j
+.IR jobs ]
+.RB [ \-o
+.IR otu-table]
+.RB [ \-v ]
+.P
+.BR multifasta_to_otu " ..."
+.SH DESCRIPTION
+.B multifasta_to_otu
+The multifasta_to_otu tool is a handy wrapper for quikr which lets the user
+to input as many fasta files as they like, and then returns an OTU table of the
+number of times a specimen was seen in all of the samples.
+.P
+.SH OPTIONS
+.TP
+.B \-i, --input-directory
+the directory containing the samples' fasta files of reads (note each fasta file should correspond to a separate sample)
+.TP
+.B \-f, --sensing-fasta
+location of the fasta file database used to create the sensing matrix.
+.TP
+.B \-s, --sensing-matrix
+location of the sensing matrix.
+.TP
+.b \-k, --kmer
+specify what size of kmer to use. (default value is 6)
+.TP
+.B \-l, --lambda
+lambda value to use. (default value is 10000)
+.TP
+.B \-j, --jobs
+specifies how many jobs to run at once. (default value is the number of CPUs)
+.TP
+.B \-o, --otu-table
+the OTU table, with OTU_FRACTION_PRESENT for each sample which is compatible with QIIME's convert_biom.py (or sequence table if not OTU's)
+.TP
+.B \-v, --verbose
+verbose mode.
+.SH USAGE
+This program will use a large amount of memory, and CPU time. 
+You can reduce the number of cores used, and thus memory, by specifying the -j flag with aspecified number of jobs. Otherwise multifasta_to_otu will run one job per cpu core.
+.SH POSTPROCESSING
+.B Note: When making your QIIME Metadata file, the sample id's must match the sample fasta file prefix names
+.P
+4-step QIIME procedure after using Quikr to obtain 3D PCoA graphs: (Note: Our code works much better with WEIGHTED Unifrac as opposed to Unweighted.)
+.TP
+Pre-requisites:
+1. multifasta output file "quikr_output_table.txt" for our example.
+.br
+2. the tree of the database sequences that were used (e.g.dp7_mafft.fasttree, gg_94_otus_4feb2011.tre, etc.)
+.br
+3. your-defined <qiime_metadata_file.txt>
+.TP
+The QIIME procedue:
+convert_biom.py -i <quikr_otu_table.txt> -o <quikr_otu>.biom --biom_table_type="otu table"
+.br
+beta_diversity.py -i <quikr_otu>.biom -m weighted_unifrac -o beta_div -t <tree file>
+.br
+principal_coordinates.py -i beta_div/weighted_unifrac_<quikr_otu>.txt -o <quikr_otu_project_name>_weighted.txt
+.br
+make_3d_plots.py -i <quikr_otu_project_name>_weighted.txt -o <3d_pcoa_plotdirectory> -m <qiime_metadata_file>
+.SH "SEE ALSO"
+\fBquikr\fP(1), \fBquikr_train\fP(1).
+.SH AUTHORS
+.B multifasta2otu 
+was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison 
+<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot
+.SH REPORTING BUGS
+.TP
+Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \
+operating system, current compiler, and test files to reproduce your issue.
+.SH COPYRIGHT.
+Copyright \(co 2013 by Calvin Morrison and Gail Rosen.  Permission to use, 
+copy, modify, distribute, and sell this software and its documentation for
+any purpose is hereby granted without fee, provided that the above copyright 
+notice appear in all copies and that both that copyright notice and this 
+permission noticeappear in supporting documentation.  No representations are
+made about the suitability of this software for any purpose.  It is provided
+"as is" without express or implied warranty.
+
diff --git a/src/c/multifasta_to_otu.c b/src/c/multifasta_to_otu.c
new file mode 100644
index 0000000..268dec6
--- /dev/null
+++ b/src/c/multifasta_to_otu.c
@@ -0,0 +1,314 @@
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <getopt.h>
+#include <math.h>
+#include <omp.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "nnls.h"
+#include "quikr_functions.h"
+
+#define sensing_matrix(i,j) (sensing_matrix[width*i + j])
+#define solutions(i,j) (solutions[sequences*i+ j])
+#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - create a QIIME OTU table based on Quikr results. \n\nOptions:\n\n-i, --input-directory\n\tthe directory containing the samples' fasta files of reads (note each file should correspond to a separate sample)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (sensing from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-j, --jobs\n\t specifies how many jobs to run at once. (default value is the number of CPUs)\n\n-o, --output\n\tthe OTU table, with OTU_FRACTION_PRESENT for each sample which is compatible with QIIME's convert_biom.py (or a sequence table if not OTU's)\n\n-v, --verbose\n\tverbose mode."
+
+int main(int argc, char **argv) {
+
+  int c;
+
+  char *input_fasta_directory = NULL;
+  char *sensing_matrix_filename = NULL;
+  char *sensing_fasta_filename = NULL;
+  char *output_filename = NULL;
+
+  double *sensing_matrix;
+
+  long int width = 0;
+  long int sequences = 0;
+
+  int kmer = 6;
+  int lambda = 10000;
+
+  int x = 0;
+  int y = 0;
+
+  int jobs = 1;
+  #ifdef Linux
+  jobs = get_nprocs();
+  #endif
+  #ifdef Darwin
+  jobs = sysconf (_SC_NPROCESSORS_ONLN);
+  #endif
+
+  int verbose = 0;
+
+  DIR *input_directory_dh;
+  struct dirent *entry;
+
+  while (1) {
+    static struct option long_options[] = {
+      {"input-directory", required_argument, 0, 'i'},
+      {"kmer",  required_argument, 0, 'k'},
+      {"lambda",  required_argument, 0, 'l'},
+      {"jobs",  required_argument, 0, 'j'},
+      {"output", required_argument, 0, 'o'},
+      {"sensing-fasta",  required_argument, 0, 'f'},
+      {"sensing-matrix", required_argument, 0, 's'},
+      {"verbose", no_argument, 0, 'v'},
+      {0, 0, 0, 0}
+    };
+    int option_index = 0;
+
+    c = getopt_long (argc, argv, "k:l:f:s:i:o:j:hv", long_options, &option_index);
+
+    if (c == -1)
+      break;
+
+    switch (c) {
+      case 'k':
+        kmer = atoi(optarg);
+        break;
+      case 'l':
+        lambda = atoi(optarg);
+        break;
+      case 'f':
+        sensing_fasta_filename = optarg;
+        break;
+      case 's':
+        sensing_matrix_filename = optarg;
+        break;
+      case 'j':
+        jobs = atoi(optarg);
+        break;
+      case 'i':
+        input_fasta_directory = optarg;
+        break;
+      case 'o':
+        output_filename = optarg;
+        break;
+      case 'v':
+        verbose = 1;
+        break;
+      case 'h':
+        puts(USAGE);
+        exit(EXIT_SUCCESS);
+        break;
+      default:
+        break;
+    }
+  }
+
+  if(sensing_matrix_filename == NULL) {
+    fprintf(stderr, "Error: sensing matrix filename (-s) must be specified\n\n");
+    fprintf(stderr, "%s\n", USAGE);
+    exit(EXIT_FAILURE);
+  }
+  if(sensing_fasta_filename == NULL) {
+    fprintf(stderr, "Error: sensing fasta filename (-f) must be specified\n\n");
+    fprintf(stderr, "%s\n", USAGE);
+    exit(EXIT_FAILURE);
+  }
+  if(output_filename == NULL) {
+    fprintf(stderr, "Error: Output Filename (-o) must be specified\n\n");
+    fprintf(stderr, "%s\n", USAGE);
+    exit(EXIT_FAILURE);
+  }
+  if(input_fasta_directory == NULL) {
+    fprintf(stderr, "Error: input fasta directory (-i) must be specified\n\n");
+    fprintf(stderr, "%s\n", USAGE);
+    exit(EXIT_FAILURE);
+  }
+
+  // set defaults
+
+
+  if(verbose) { 
+    printf("kmer: %d\n", kmer);
+    printf("lambda: %d\n", lambda);
+    printf("input directory: %s\n", input_fasta_directory);
+    printf("sensing database: %s\n", sensing_matrix_filename);
+    printf("sensing database fasta: %s\n", sensing_fasta_filename);
+    printf("output: %s\n", output_filename);
+    printf("number of jobs to run at once: %d\n", jobs); 
+  }
+
+
+  input_directory_dh = opendir(input_fasta_directory);
+  if(input_fasta_directory == NULL) {
+    fprintf(stderr, "could not open %s\n", input_fasta_directory);
+    exit(EXIT_FAILURE);
+  } 
+
+  // do a directory count
+  int dir_count = -2; // -2 for ../ and ./
+  while(entry = readdir(input_directory_dh)) 
+    dir_count++; 
+  rewinddir(input_directory_dh);
+  if(dir_count == 0) {
+    fprintf(stderr, "%s is empty\n", input_fasta_directory);
+    exit(EXIT_FAILURE);
+  }
+
+  // 4 "ACGT" ^ Kmer gives us the size of output rows
+  width = pow(4, kmer) + 1;
+  sequences = count_sequences(sensing_fasta_filename);
+
+  if(verbose) {
+    printf("directory count: %d\n", dir_count);
+    printf("width: %ld\nsequences %ld\n", width, sequences);
+  }
+
+  sensing_matrix = load_sensing_matrix(sensing_matrix_filename, sequences, width);
+
+  // multiply our matrix by lambda
+  for(x = 0; x < sequences; x++) {
+    for(y= 0; y < width; y++) {
+      sensing_matrix(x, y) = sensing_matrix(x, y) * lambda;
+    }
+  }
+
+  // set the first row to be all 1's
+  for(x = 0; x < sequences; x++) {
+    sensing_matrix(x, 0) = 1.0;
+  }
+
+  double *solutions = malloc(dir_count * sequences * sizeof(double));
+  if(solutions == NULL) {
+    fprintf(stderr, "Could not allocate enough memory for solutions vector\n");
+    exit(EXIT_FAILURE);
+  }
+
+  char **filenames = malloc(dir_count * sizeof(char *));
+  if(filenames == NULL) {
+    fprintf(stderr, "Could not allocate enough memory\n");
+    exit(EXIT_FAILURE);
+  }
+
+  int *file_sequence_count = malloc(dir_count * sizeof(int));
+  if(file_sequence_count == NULL) {
+    fprintf(stderr, "Could not allocate enough memory\n");
+    exit(EXIT_FAILURE);
+  }
+
+  struct dirent result;
+
+  omp_set_num_threads(jobs);
+  int done = 0;
+  printf("Beginning to process samples\n"); 
+#pragma omp parallel for shared(solutions, sequences, width, result, done)
+  for(int i = 0; i < dir_count; i++ ) {
+
+    int z = 0;
+    struct dirent *directory_entry;
+    char *filename = malloc(256 * sizeof(char));
+    char *base_filename = malloc(256 * sizeof(char));
+    if(filename == NULL || base_filename == NULL) {
+      fprintf(stderr, "Could not allocate enough memory\n");
+      exit(EXIT_FAILURE);
+    }
+
+#pragma omp critical
+    readdir_r(input_directory_dh, &result, &directory_entry);
+
+    if(strcmp(directory_entry->d_name, "..") == 0 || strcmp(directory_entry->d_name, ".") == 0) {
+      i--;
+      continue;
+    }
+
+    // get our base filenames
+    strcpy(base_filename, directory_entry->d_name);
+    filenames[i] = base_filename; 
+
+    // get our real filename
+    sprintf(filename, "%s/%s", input_fasta_directory, directory_entry->d_name);
+
+    // get individual sequence count
+    file_sequence_count[i] =  count_sequences(filename);
+
+    // count the kmer amounts 
+    double *count_matrix = load_count_matrix(filename, width, kmer);
+
+    // normalize our kmer counts
+    normalize_matrix(count_matrix, 1, width);
+
+    // multiply our kmers frequency by lambda
+    for(z = 0; z < width; z++) 
+      count_matrix[z] = count_matrix[z] * lambda;
+
+    double *sensing_matrix_copy = malloc(sizeof(double) * sequences * width);
+    if(sensing_matrix_copy == NULL) {
+      fprintf(stderr, "Could not allocate enough memory\n");
+      exit(EXIT_FAILURE);
+    }
+
+    memcpy(sensing_matrix_copy, sensing_matrix, sequences * width * sizeof(double));
+
+
+    // run nnls
+    double *solution = nnls(sensing_matrix_copy, count_matrix, sequences, width);
+
+    // normalize our solution
+    normalize_matrix(solution, 1, sequences);
+
+    // add the current solution to the solutions array
+    for(z = 0; z < sequences; z++ )  {
+      solutions(i, z) = solution[z]; 
+    }
+
+    done++;
+    printf("%d/%d samples processed\n", done, dir_count); 
+    free(solution);
+    free(count_matrix);
+    free(filename);
+    free(sensing_matrix_copy);
+  }
+
+  char **headers = load_headers(sensing_fasta_filename, sequences);
+
+  // output our matrix
+  FILE *output_fh = fopen(output_filename, "w");
+  if(output_fh == NULL) { 
+    fprintf(stderr, "Could not open %s for writing\n", output_filename);
+    exit(EXIT_FAILURE);
+  }
+
+  fprintf(output_fh, "# QIIME vQuikr OTU table\n");
+  fprintf(output_fh, "#OTU_ID\t");
+
+  // print our filename headers
+  for(x = 0; x < dir_count - 1; x++) {
+    fprintf(output_fh, "%s\t", filenames[x]);
+  }
+  fprintf(output_fh, "%s\n", filenames[dir_count - 1]);
+
+  // get our actual values 
+  for(y = 0; y < sequences; y++) {
+    for(x = 0; x < dir_count; x++) {
+      solutions(x, y) = round(solutions(x, y) * file_sequence_count[x]);
+    }
+  }
+
+  for(y = 0; y < sequences; y++) {
+
+    double column_sum = 0.;
+    for(x = 0; x < dir_count; x++) {
+      column_sum += solutions(x, y);
+    }
+
+    // if our column is zero, don't bother printing the row
+    if(column_sum != 0) {
+      fprintf(output_fh, "%s\t", headers[y]);
+
+      for(x = 0; x < dir_count - 1; x++) {
+        fprintf(output_fh, "%d\t", (int)solutions(x, y));
+      }
+      fprintf(output_fh, "%d\n", (int)solutions[sequences*(dir_count - 1) + y]);
+    }
+  }
+  fclose(output_fh);
+
+  return EXIT_SUCCESS;
+}
diff --git a/src/c/nnls.c b/src/c/nnls.c
new file mode 100644
index 0000000..ddd9f2f
--- /dev/null
+++ b/src/c/nnls.c
@@ -0,0 +1,416 @@
+/*
+ *   nnls.c  (c) 2002-2009 Turku PET Centre
+ *   This file contains the routine NNLS (nonnegative least squares)
+ *   and the subroutines required by it, except h12, which is in
+ *   file 'lss_h12.c'.
+ *   
+ *  This routine is based on the text and fortran code in
+ *  C.L. Lawson and R.J. Hanson, Solving Least Squares Problems,
+ *  Prentice-Hall, Englewood Cliffs, New Jersey, 1974.
+ *  Version:
+ *  2002-08-19 Vesa Oikonen
+ *  2003-05-08 Kaisa Sederholm & VO
+ *  Included function nnlsWght().
+ *  2003-05-12 KS
+ *  Variable a_dim1 excluded
+ *  
+ *  Usage of the coefficient matrix altered so that it is
+ *  given in a[][] instead of a[].
+ *  2003-11-06 VO
+ *  If n<2, then itmax is set to n*n instead of previous n*3.
+ *  2004-09-17 VO
+ *  Doxygen style comments.
+ *  2006-24-04 Pauli Sundberg
+ *  Added some debuging output, and made some comments more precise.
+ *  2007-05-17 VO
+ *  2009-04-16 VO
+ *  Corrected a bug in nnls() which may have caused an infinite loop.
+ *  009-04-27 VO
+ *  Added function nnlsWghtSquared() for faster pixel-by-pixel calculations.
+ *  Checking for exceeding iteration count is corrected in nnls().
+ */ 
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+#define MAX(a,b) ((a) >= (b) ? (a) : (b))
+#define ABS(x) ((x) >= 0 ? (x) : -(x))
+
+int h12( int mode, int lpivot, int l1, int m, double *u, int u_dim1, double *up, double *cm, int ice, int icv, int ncv) {
+  double d1,  b, clinv, cl, sm;
+  int k, j;
+
+  /* Check parameters */
+  if (mode!=1 && mode!=2) 
+    return(1);
+  if (m<1 || u==NULL || u_dim1<1 || cm==NULL) 
+    assert(0);
+  //     return(1);
+  if (lpivot<0 || lpivot>=l1 || l1>m) 
+    //     assert(0);
+    return(1);
+
+  /* Function Body */
+  cl = ABS( u[lpivot*u_dim1] );
+  // cl= (d1 = u[lpivot*u_dim1], fabs(d1));
+
+  if (mode==2) 
+  { /* Apply transformation I+U*(U**T)/B to cm[] */
+    if(cl<=0.) 
+      //       assert(0);
+      return(0);
+  } 
+  else 
+  {   /* Construct the transformation */
+
+
+    /* This is the way provided in the original pseudocode
+       sm = 0;
+       for (j = l1; j < m; j++)
+       {
+       d1 =  u[j * u_dim1];
+       sm += d1*d1;
+       }
+       d1 = u[lpivot * u_dim1];
+       sm += d1*d1;
+       sm = sqrt(sm);
+
+       if (u[lpivot*u_dim1] > 0) 
+       sm=-sm;
+
+       up[0] = u[lpivot*u_dim1] - sm; 
+       u[lpivot*u_dim1]=sm;
+       printf("Got sum: %f\n",sm);
+       */
+
+    /* and this trying to compensate overflow */
+    for (j=l1; j<m; j++) 
+    {  // Computing MAX 
+      cl = MAX( ABS( u[j*u_dim1] ), cl );
+    }
+    // zero vector?   
+
+    if (cl<=0.) 
+      return(0);
+
+    clinv=1.0/cl;
+
+    // Computing 2nd power 
+    d1=u[lpivot*u_dim1]*clinv; 
+    sm=d1*d1;
+
+    for (j=l1; j<m; j++) 
+    {
+      d1=u[j*u_dim1]*clinv; 
+      sm+=d1*d1;
+    }
+    cl *= sqrt(sm); 
+
+    if (u[lpivot*u_dim1] > 0.) 
+      cl=-cl;
+    up[0] = u[lpivot*u_dim1] - cl; 
+    u[lpivot*u_dim1]=cl;
+  }
+
+  // no vectors where to apply? only change pivot vector! 
+  b=up[0] * u[lpivot*u_dim1];
+
+  /* b must be nonpositive here; if b>=0., then return */
+  if (b == 0) 
+    return(0);
+
+  // ok, for all vectors we want to apply
+  for (j =0; j < ncv; j++) {
+    sm = cm[ lpivot * ice + j * icv ] * (up[0]);
+
+    for (k=l1; k<m; k++) 
+      sm += cm[ k * ice + j*icv ] * u[ k*u_dim1 ]; 
+
+    if (sm != 0.0) {
+      sm *= (1/b); 
+      // cm[lpivot, j] = ..
+      cm[ lpivot * ice + j*icv] += sm * (up[0]);
+      for (k= l1; k<m; k++) 
+      {
+        cm[ k*ice + j*icv] += u[k * u_dim1]*sm;
+      }
+    }
+  }
+
+  return(0);
+} 
+
+
+void g1(double a, double b, double *cterm, double *sterm, double *sig)
+{
+  double d1, xr, yr;
+
+  if( fabs(a) > fabs(b) ) {
+    xr = b / a;
+    d1 = xr;
+    yr = sqrt(d1*d1 + 1.);
+    d1 = 1./yr;
+    *cterm=(a>=0.0 ? fabs(d1) : -fabs(d1));
+    *sterm=(*cterm)*xr;
+    *sig=fabs(a)*yr;
+  } else if( b != 0.) {
+    xr = a / b;
+    d1 = xr;
+    yr = sqrt(d1 * d1 + 1.);
+    d1 = 1. / yr;
+    *sterm=(b>=0.0 ? fabs(d1) : -fabs(d1));
+    *cterm=(*sterm)*xr; *sig=fabs(b)*yr;
+  } else {
+    *sig=0.; *cterm=0.; *sterm=1.;
+  }
+} 
+
+
+int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm) {
+  int pfeas;
+  int ret=0;
+  int iz;
+  int jz;
+  int k, j=0, l, itmax, izmax=0, ii, jj=0, ip;
+  double d1, d2, sm, up, ss; 
+  double temp, wmax, t, alpha, asave, dummy, unorm, ztest, cc;
+
+
+  /* Check the parameters and data */
+  if(m <= 0 || n <= 0 || a == NULL || b == NULL || x == NULL) 
+    return(2);
+
+  /* Allocate memory for working space, if required */
+  double *w = (double*)calloc(n, sizeof(double));
+  double *zz = (double*)calloc(m, sizeof(double));
+  int *index = (int*)calloc(n, sizeof(int));
+  if(w == NULL || zz == NULL || index == NULL) 
+    return(2);
+
+  /* Initialize the arrays INDEX[] and X[] */
+  for(k=0; k<n; k++) {
+    x[k]=0.;
+    index[k]=k;
+  }
+
+  int iz2 = n - 1;
+  int iz1 = 0;
+  int iter=0; 
+  int nsetp=0;
+  int npp1=0;
+
+  /* Main loop; quit if all coeffs are already in the solution or */
+  /* if M cols of A have been triangularized */
+  if(n < 3) 
+    itmax=n*3;
+  else 
+    itmax=n*n;
+ 
+
+  while(iz1 <= iz2 && nsetp < m) {
+    /* Compute components of the dual (negative gradient) vector W[] */
+    for(iz=iz1; iz<=iz2; iz++) {
+      j=index[iz];
+      sm=0.;
+      for(l=npp1; l<m; l++) 
+        sm+=a[j*m + l]*b[l];
+      w[j]=sm;
+    }
+
+    while(1) {
+      /* Find largest positive W[j] */
+      for(wmax=0., iz=iz1; iz<=iz2; iz++) {
+        j=index[iz]; if(w[j]>wmax) {wmax=w[j]; izmax=iz;}}
+
+      /* Terminate if wmax<=0.; */
+      /* it indicates satisfaction of the Kuhn-Tucker conditions */
+      if(wmax<=0.0)
+        break;
+
+      iz=izmax; 
+      j=index[iz];
+
+      /* The sign of W[j] is ok for j to be moved to set P. */
+      /* Begin the transformation and check new diagonal element to avoid */
+      /* near linear dependence. */
+      asave=a[j*m + npp1];
+      h12(1, npp1, npp1+1, m, &a[j*m +0], 1, &up, &dummy, 1, 1, 0);
+      unorm=0.;
+      if(nsetp!=0){
+        for(l=0; l<nsetp; l++) {
+          d1=a[j*m + l]; 
+          unorm+=d1*d1;
+        }
+      }
+
+      unorm=sqrt(unorm);
+      d2=unorm+(d1=a[j*m + npp1], fabs(d1)) * 0.01;
+      if((d2-unorm)>0.) {
+        /* Col j is sufficiently independent. Copy B into ZZ, update ZZ */
+        /* and solve for ztest ( = proposed new value for X[j] ) */
+        for(l=0; l<m; l++) zz[l]=b[l];
+        h12(2, npp1, npp1+1, m, &a[j*m + 0], 1, &up, zz, 1, 1, 1);
+        ztest=zz[npp1]/a[j*m +npp1];
+        /* See if ztest is positive */
+        if(ztest>0.) break;
+      }
+
+      /* Reject j as a candidate to be moved from set Z to set P. Restore */
+      /* A[npp1,j], set W[j]=0., and loop back to test dual coeffs again */
+      a[j*m+ npp1]=asave; w[j]=0.;
+    } /* while(1) */
+
+    if(wmax<=0.0)
+      break;
+
+    /* Index j=INDEX[iz] has been selected to be moved from set Z to set P. */
+    /* Update B and indices, apply householder transformations to cols in */
+    /* new set Z, zero subdiagonal elts in col j, set W[j]=0. */
+    for(l=0; l<m; ++l) 
+      b[l]=zz[l];
+
+    index[iz]=index[iz1];
+    index[iz1]=j;
+    iz1++;
+    npp1++;
+    nsetp=npp1;
+
+    if(iz1<=iz2) {
+     for(jz=iz1; jz<=iz2; jz++) {
+        jj=index[jz];
+        h12(2, nsetp-1, npp1, m, &a[j*m +0], 1, &up, &a[jj*m +0], 1, m, 1);
+      }
+    }
+
+    if(nsetp!=m) {
+      for(l=npp1; l<m; l++) 
+        a[j*m +l]=0.;
+    }
+
+    w[j]=0.;
+
+    /* Solve the triangular system; store the solution temporarily in Z[] */
+    for(l=0; l<nsetp; l++) {
+      ip=nsetp-(l+1);
+      if(l!=0) for(ii=0; ii<=ip; ii++) zz[ii]-=a[jj*m + ii]*zz[ip+1];
+      jj=index[ip]; zz[ip]/=a[jj*m +ip];
+    }
+
+    /* Secondary loop begins here */
+    while(++iter < itmax) {
+      /* See if all new constrained coeffs are feasible; if not, compute alpha */
+      for(alpha = 2.0, ip = 0; ip < nsetp; ip++) {
+        l=index[ip];
+        if(zz[ip]<=0.) {
+          t = -x[l]/(zz[ip]-x[l]);
+          if(alpha > t) {
+            alpha = t; 
+            jj = ip - 1;
+          }
+        }
+      }
+
+      /* If all new constrained coeffs are feasible then still alpha==2. */
+      /* If so, then exit from the secondary loop to main loop */
+      if(alpha==2.0) 
+        break;
+
+      /* Use alpha (0.<alpha<1.) to interpolate between old X and new ZZ */
+      for(ip=0; ip<nsetp; ip++) {
+        l = index[ip];
+        x[l] += alpha*(zz[ip]-x[l]);
+      }
+
+      /* Modify A and B and the INDEX arrays to move coefficient i */
+      /* from set P to set Z. */
+      k=index[jj+1]; pfeas=1;
+      do {
+        x[k]=0.;
+        if(jj!=(nsetp-1)) {
+          jj++;
+          for(j=jj+1; j<nsetp; j++) {
+            ii=index[j]; index[j-1]=ii;
+            g1(a[ii*m + (j-1)], a[ii*m + j], &cc, &ss, &a[ii*m + j-1]);
+            for(a[ii*m + j]=0., l=0; l<n; l++) if(l!=ii) {
+              /* Apply procedure G2 (CC,SS,A(J-1,L),A(J,L)) */
+              temp=a[l*m + j-1];
+              a[l*m + j-1]=cc*temp+ss*a[l*m + j];
+              a[l*m + j]=-ss*temp+cc*a[l*m + j];
+            }
+            /* Apply procedure G2 (CC,SS,B(J-1),B(J)) */
+            temp=b[j-1]; b[j-1]=cc*temp+ss*b[j]; b[j]=-ss*temp+cc*b[j];
+          }
+        }
+        npp1=nsetp-1; nsetp--; iz1--; index[iz1]=k;
+
+        /* See if the remaining coeffs in set P are feasible; they should */
+        /* be because of the way alpha was determined. If any are */
+        /* infeasible it is due to round-off error. Any that are */
+        /* nonpositive will be set to zero and moved from set P to set Z */
+        for(jj=0, pfeas=1; jj<nsetp; jj++) {
+          k=index[jj]; if(x[k]<=0.) {pfeas=0; break;}
+        }
+      } while(pfeas==0);
+
+      /* Copy B[] into zz[], then solve again and loop back */
+      for(k=0; k<m; k++) 
+        zz[k]=b[k];
+      for(l=0; l<nsetp; l++) {
+        ip=nsetp-(l+1);
+        if(l!=0) for(ii=0; ii<=ip; ii++) zz[ii]-=a[jj*m + ii]*zz[ip+1];
+        jj=index[ip]; zz[ip]/=a[jj*m + ip];
+      }
+    } /* end of secondary loop */
+
+    if(iter>=itmax) {
+      ret = 1;
+      break;
+    }
+
+    for(ip=0; ip<nsetp; ip++) {
+      k=index[ip];
+      x[k]=zz[ip];
+      }
+  } /* end of main loop */
+
+  /* Compute the norm of the final residual vector */
+  sm=0.;
+
+  if (rnorm != NULL) {
+    if (npp1<m) 
+      for (k=npp1; k<m; k++) 
+        sm+=(b[k] * b[k]);
+    else 
+      for (j=0; j<n; j++) 
+        w[j]=0.;
+    *rnorm=sqrt(sm);
+  } 
+
+  /* Free working space, if it was allocated here */
+  free(w);
+  free(zz);
+  free(index);
+  return(ret);
+}
+/* nnls_ */
+
+
+double *nnls(double *a_matrix, double *b_matrix, int height, int width) {
+
+  double *solution = (double*)calloc(height, sizeof(double));
+
+  if(solution == NULL) {
+    fprintf(stderr, "could not allocate enough memory for nnls\n");
+    exit(EXIT_FAILURE);
+  }
+
+  int ret = nnls_algorithm(a_matrix, width, height, b_matrix, solution, NULL);
+  if(ret == 1) {
+    printf("NNLS has reached the maximum iterations\n");
+  } else if(ret == 2) {
+    fprintf(stderr, "NNLS could not allocate enough memory\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return solution;
+}
diff --git a/src/c/nnls.h b/src/c/nnls.h
new file mode 100644
index 0000000..ded8d01
--- /dev/null
+++ b/src/c/nnls.h
@@ -0,0 +1 @@
+double *nnls(double *a_matrix, double *b_matrix, int height, int width);
diff --git a/src/c/quikr.1 b/src/c/quikr.1
new file mode 100644
index 0000000..9982d94
--- /dev/null
+++ b/src/c/quikr.1
@@ -0,0 +1,70 @@
+.TH quikr 1 quikr-2013-05
+.SH NAME
+quikr \- Calculate estimated frequencies of bacteria in a sample.
+.SH SYNOPSIS
+.B quikr
+.RB [ \-i
+.IR input]
+.RB [ \-f
+.IR sensing-fasta]
+.RB [ \-s
+.IR sensing-matrix]
+.RB [ \-l
+.IR lambda ]
+.RB [ \-k
+.IR kmer ]
+.RB [ \-o
+.IR output ]
+.RB [ \-v ]
+.P
+.BR quikr " ..."
+.SH DESCRIPTION
+.B quikr
+Quikr returns the estimated frequencies of batcteria present when given a
+input FASTA file and a trained sensing matrix
+.P
+.SH OPTIONS
+.TP
+.B \-i, --input
+the sample's fasta file of NGS. READS (fasta format)
+.TP
+.B \-f, --sensing-fasta
+location of the fasta file database used to create the sensing matrix. (fasta format)
+.TP
+.B \-s, --sensing-matrix
+location of the sensing matrix. (trained from quikr_train)
+.TP
+.B \-k, --kmer
+specify what size of kmer to use. (default value is 6)
+.TP
+.B \-l, --lambda
+lambda value to use. (default value is 10000)
+.TP
+.B \-o, --output
+OTU_FRACTION_PRESENT a vector representing the percentage of database sequence's presence in sample. (csv output)
+.TP
+.B \-v, --verbose
+verbose mode.
+.SH EXAMPLES
+Use quikr to calculate the estimated frequencies for sample.fa, using rdp7.fasta as the sensing matrix we generated with quikr_train. This uses 6-mers by default, and a lambda value of 10000:
+.P
+quikr -i sample.fa -f rdp7.fa -s rdp_sensing_matrix.gz -o frequencies.txt
+.SH "SEE ALSO"
+\fBmultifasta_to_otu\fP(1), \fBquikr_train\fP(1).
+.SH AUTHORS
+.B quikr
+was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison 
+<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot
+.SH REPORTING BUGS
+.TP
+Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \
+operating system, current compiler, and test files to reproduce your issue.
+.SH COPYRIGHT.
+Copyright \(co 2013 by Calvin Morrison and Gail Rosen.  Permission to use, 
+copy, modify, distribute, and sell this software and its documentation for
+any purpose is hereby granted without fee, provided that the above copyright 
+notice appear in all copies and that both that copyright notice and this 
+permission noticeappear in supporting documentation.  No representations are
+made about the suitability of this software for any purpose.  It is provided
+"as is" without express or implied warranty.
+
diff --git a/src/c/quikr.c b/src/c/quikr.c
new file mode 100644
index 0000000..c73e0dd
--- /dev/null
+++ b/src/c/quikr.c
@@ -0,0 +1,192 @@
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "nnls.h"
+#include "quikr_functions.h"
+
+#define sensing_matrix(i,j) (sensing_matrix[width*i + j])
+#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - Calculate estimated frequencies of bacteria in a sample.\n\nOptions:\n\n-i, --input\n\tthe sample's fasta file of NGS READS (fasta format)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (trained from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-o, --output\n\tthe sensing matrix. (a gzip'd text file)\n\n-v, --verbose\n\tverbose mode."
+
+int main(int argc, char **argv) {
+
+
+  int c;
+  int kmer = 0;
+
+  char *input_fasta_filename = NULL;
+  char *sensing_matrix_filename = NULL;
+  char *sensing_fasta_filename = NULL;
+  char *output_filename = NULL;
+
+  int x = 0;
+  int y = 0;
+  int verbose = 0;
+  int lambda = 0;
+  
+
+
+  while (1) {
+    static struct option long_options[] = {
+      {"input", required_argument, 0, 'i'},
+      {"kmer",  required_argument, 0, 'k'},
+      {"lambda",  required_argument, 0, 'l'},
+      {"output", required_argument, 0, 'o'},
+      {"sensing-fasta",  required_argument, 0, 'f'},
+      {"sensing-matrix", required_argument, 0, 's'},
+      {"verbose", no_argument, 0, 'v'},
+      {0, 0, 0, 0}
+    };
+
+    int option_index = 0;
+
+    c = getopt_long (argc, argv, "k:l:f:s:i:o:hdv", long_options, &option_index);
+
+    if (c == -1)
+      break;
+
+    switch (c) {
+      case 0:
+      case 'k':
+        kmer = atoi(optarg);
+        break;
+      case 'l':
+        lambda = atoi(optarg);
+        break;
+      case 'f':
+        sensing_fasta_filename = optarg;
+        break;
+      case 's':
+        sensing_matrix_filename = optarg;
+        break;
+      case 'i':
+        input_fasta_filename = optarg;
+        break;
+      case 'o':
+        output_filename = optarg;
+        break;
+      case 'v':
+        verbose = 1;
+        break;
+      case 'h':
+        printf("%s\n", USAGE);
+        exit(EXIT_SUCCESS);
+        break;
+      default:
+        break;
+    }
+  }
+
+
+  if(sensing_matrix_filename == NULL) {
+    fprintf(stderr, "Error: sensing matrix filename (-s) must be specified\n\n");
+    fprintf(stderr, "%s\n", USAGE);
+    exit(EXIT_FAILURE);
+  }
+  if(sensing_fasta_filename == NULL) {
+    fprintf(stderr, "Error: sensing matrix filename (-f) must be specified\n\n");
+    fprintf(stderr, "%s\n", USAGE);
+    exit(EXIT_FAILURE);
+  }
+  if(output_filename == NULL) {
+    fprintf(stderr, "Error: Output Filename (-o) must be specified\n\n");
+    fprintf(stderr, "%s\n", USAGE);
+    exit(EXIT_FAILURE);
+  }
+  if(input_fasta_filename == NULL) {
+    fprintf(stderr, "Error: input fasta file (-i) must be specified\n\n");
+    fprintf(stderr, "%s\n", USAGE);
+    exit(EXIT_FAILURE);
+  }
+
+  if(lambda == 0) 
+    lambda = 10000;
+  if(kmer == 0) 
+    kmer = 6;
+
+  if(verbose) { 
+    printf("kmer: %d\n", kmer);
+    printf("lambda: %d\n", lambda);
+    printf("fasta: %s\n", input_fasta_filename);
+    printf("sensing database: %s\n", sensing_matrix_filename);
+    printf("sensing database fasta: %s\n", sensing_fasta_filename);
+    printf("output: %s\n", output_filename);
+  }
+  // 4 "ACGT" ^ Kmer gives us the size of output rows
+  int width = pow(4, kmer);
+  width = width + 1;
+
+  int sequences = count_sequences(sensing_fasta_filename);
+
+  if(verbose) {
+    printf("width: %d\nsequences %d\n", width, sequences);
+  }
+
+  double *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, sequences, width);
+  double *count_matrix = load_count_matrix(input_fasta_filename, width, kmer);
+
+  // multiply our matrix by lambda
+  for(x = 1; x < sequences; x++) {
+    for(y= 0; y < width - 1; y++) {
+      sensing_matrix(x, y) = sensing_matrix(x, y) * lambda;
+    }
+  }
+
+  for(x= 0; x < sequences; x++) {
+    sensing_matrix(x, 0) = 1.0;
+  }
+  // normalize our count_matrix
+  normalize_matrix(count_matrix, 1, width);
+  for(x = 0; x < width; x++) 
+    count_matrix[x] = count_matrix[x] * lambda;
+  
+  // output our matricies if we are in verbose mode
+  if(verbose) { 
+    FILE *sensing_matrix_fh = fopen( "sensing.matrix", "w");
+    if(sensing_matrix_fh == NULL) {
+      fprintf(stderr, "could not open sensing.matrix for writing.\n");
+      exit(EXIT_FAILURE);
+    }
+    for(x = 0; x < sequences; x++) {
+      for( y = 0; y < width; y++) {
+        fprintf(sensing_matrix_fh, "%.10f\t", sensing_matrix(x, y));
+      }
+      fprintf(sensing_matrix_fh, "\n");
+    }
+    fclose(sensing_matrix_fh);
+
+    FILE *count_matrix_fh = fopen("count.matrix", "w");
+    if(count_matrix_fh == NULL) {
+      fprintf(stderr, "could not open sensing.matrix for writing.\n");
+      exit(EXIT_FAILURE);
+    }
+    for(x = 0; x < width; x++) {
+      fprintf(count_matrix_fh, "%.10f\n", count_matrix[x]);
+    }
+    fclose(count_matrix_fh);
+  }
+
+  double *solution = nnls(sensing_matrix, count_matrix, sequences, width);
+
+  // normalize our solution vector
+  normalize_matrix(solution, 1, sequences);
+
+  // output our matrix
+  FILE *output_fh = fopen(output_filename, "w");
+  if(output_fh == NULL) { 
+    fprintf(stderr, "Could not open %s for writing\n", output_filename);
+    exit(EXIT_FAILURE);
+  }
+  for(x = 0; x < sequences; x++) {
+      fprintf(output_fh, "%.10lf\n", solution[x]);
+  }
+  fclose(output_fh);
+
+  return EXIT_SUCCESS;
+}
diff --git a/src/c/quikr_functions.c b/src/c/quikr_functions.c
new file mode 100644
index 0000000..7e18e64
--- /dev/null
+++ b/src/c/quikr_functions.c
@@ -0,0 +1,163 @@
+#include <stdio.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <math.h>
+
+int count_sequences(char *filename) { 
+  char command[512];
+  int sequences = 0;
+  FILE *grep_output;
+
+  sprintf(command, "grep -c ^\\> %s", filename);
+  grep_output = popen(command, "r");
+  if(grep_output == NULL) {
+    fprintf(stderr, "Could not execute %s\n", command);
+    exit(EXIT_FAILURE);
+  }
+  
+  fscanf(grep_output, "%d", &sequences);
+
+  pclose(grep_output);
+  return sequences;
+}
+
+
+void normalize_matrix(double *matrix, int height, int width) {
+  int x = 0;
+  int y = 0;
+  for(x = 0; x < height; x++) {
+
+    double row_sum = 0;
+
+    for(y = 0; y < (width); y++)
+      row_sum = row_sum + matrix[width * x + y];
+    for(y = 0; y < (width); y++) 
+      matrix[width * x + y] = matrix[width * x + y] / row_sum; 
+  }
+}
+
+
+double *load_count_matrix(char *filename, int width, int kmer) {
+
+  double *count_matrix = malloc((width)*sizeof(double));
+  char count_command[512];
+  int x = 0;
+  char *line = NULL;
+  size_t len = 0;
+
+  if(count_matrix == NULL) {
+    fprintf(stderr, "could not allocate enough memory for the count matrix (%d x double) \n", width);
+    exit(EXIT_FAILURE);
+  }
+
+  // create out count matrix
+  sprintf(count_command, "count-kmers -r %d -1 -u %s", kmer, filename);
+  FILE *count_output = popen(count_command, "r");
+  if(count_output == NULL) {
+    fprintf(stderr, "could not execute \"%s\"", count_command);
+    exit(EXIT_FAILURE);
+  }
+
+  // set first element to zero.
+  count_matrix[0] = 0;
+
+  // get our first line
+  getline(&line, &len, count_output);
+  count_matrix[1] = atoi(line);
+
+  // iterate over the rest of the lines
+  for(x = 2; x < width; x++) { 
+    getline(&line, &len, count_output);
+    count_matrix[x] = atoi(line);
+  } 
+
+  pclose(count_output);
+
+  return count_matrix;
+}
+
+
+double *load_sensing_matrix(char *filename, int height, int width) {
+
+  int x = 0;
+  int y = 0;
+
+  char *line = NULL;
+  char *val;
+  char command[512];
+  size_t len = 0;
+  FILE *sensing_matrix_fh = NULL;
+
+  double *sensing_matrix = malloc(height * width * sizeof(double));
+  if(sensing_matrix == NULL) {
+    fprintf(stderr, "Could not allocate memory for the sensing matrix\n");
+  }
+
+  sprintf(command, "gzip -cd %s", filename);
+  sensing_matrix_fh = popen(command, "r");
+  if(sensing_matrix_fh == NULL) {
+    fprintf(stderr, "could not open %s", filename);
+    exit(EXIT_FAILURE);
+  }
+
+  // read our sensing matrix in
+  for(x = 0; x < height; x++) {
+    getline(&line, &len, sensing_matrix_fh);
+    char *ptr;
+
+    // Read our first element in outside of the loop
+    val = strtok_r(line,"\t", &ptr);
+    sensing_matrix[width*x + 1] = atof(val);
+    // iterate through and load the array
+    for (y = 2; y < width; y++) {
+      val = strtok_r(NULL, "\t", &ptr);
+      sensing_matrix[width*x + y] = atof(val);
+    }
+  }
+
+  pclose(sensing_matrix_fh);
+
+  return sensing_matrix;
+}
+
+char **load_headers(char *filename, int sequences) {
+  char command[512];
+  char *line= NULL;
+  int x = 0;
+  FILE *grep_output;
+  size_t len = 0;
+
+  sprintf(command, "grep ^\\> %s", filename);
+  grep_output = popen(command, "r");
+  if(grep_output == NULL) {
+    fprintf(stderr, "Could not execute %s\n", command);
+    exit(EXIT_FAILURE);
+  }
+  
+  char **headers = malloc(sequences * sizeof(char *));
+  if(headers == NULL) {
+    fprintf(stderr, "could not allocated enough memory\n");
+    exit(EXIT_FAILURE);
+  }
+
+  for(x = 0; x < sequences; x++) {
+
+    char *header = malloc(256 * sizeof(char));
+    if(header == NULL) {
+      fprintf(stderr, "could not allocated enough memory\n");
+      exit(EXIT_FAILURE);
+    }
+    getline(&line, &len, grep_output);
+    sscanf(line + 1, "%s", header);
+    headers[x] = header;
+  }
+
+  pclose(grep_output);
+
+  return headers;
+}
+
diff --git a/src/c/quikr_functions.h b/src/c/quikr_functions.h
new file mode 100644
index 0000000..1826427
--- /dev/null
+++ b/src/c/quikr_functions.h
@@ -0,0 +1,14 @@
+// count the sequences in a fasta file
+int count_sequences(char *filename);
+
+// normalize a matrix by dividing each element by the sum of it's column
+void normalize_matrix(double *matrix, int height, int width);
+
+// load a sensing matrix
+double *load_sensing_matrix(char *filename, int height, int width);
+
+// load a count matrix
+double *load_count_matrix(char *filename, int width, int kmer);
+
+// load headers
+char **load_headers(char *filename, int sequences); 
diff --git a/src/c/quikr_train.1 b/src/c/quikr_train.1
new file mode 100644
index 0000000..dd1493f
--- /dev/null
+++ b/src/c/quikr_train.1
@@ -0,0 +1,61 @@
+.TH quikr_train 1 quikr_train-2013-05
+.SH NAME
+quikr_train \- train databases for use with Quikr
+.SH SYNOPSIS
+.B quikr_train
+.RB [ \-i
+.IR input]
+.RB [ \-o
+.IR output]
+.RB [ \-k
+.IR kmer ]
+.RB [ \-v ]
+.P
+.BR quikr " ..."
+.SH DESCRIPTION
+.B quikr
+The quikr_train is a utility to train a database for use with quikr.
+Before running the quikr utility, you need to generate the sensing matrix or
+download a pretrained one from our project's homepage.
+
+quikr_train returns a sensing matrix that can be used with the quikr
+function. 
+.P
+.SH OPTIONS
+.TP
+.B \-i, --input
+the database of sequences to create the sensing matrix. (fasta format)
+.TP
+.B \-k, --kmer
+specify what size of kmer to use. (default value is 6)
+.TP
+.B \-o, --output
+the sensing matrix. (a gzip'd text file)
+.TP
+.B \-v, --verbose
+verbose mode.
+.SH EXAMPLES
+Use quikr_train to generate a sensing matrix from rdp7.fasta. This uses 6mers by default.
+.P
+quikr_train -i rdp7.fa -o rd7_sensing_matrix.gz
+.SH USAGE
+If you do not have a .gz file on your output matrix name, it will be appended.
+.SH "SEE ALSO"
+\fBmultifasta_to_otu\fP(1), \fBquikr\fP(1).
+.SH AUTHORS
+.B quikr
+was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison 
+<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot
+.SH REPORTING BUGS
+.TP
+Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \
+operating system, current compiler, and test files to reproduce your issue.
+.SH COPYRIGHT.
+Copyright \(co 2013 by Calvin Morrison and Gail Rosen.  Permission to use, 
+copy, modify, distribute, and sell this software and its documentation for
+any purpose is hereby granted without fee, provided that the above copyright 
+notice appear in all copies and that both that copyright notice and this 
+permission noticeappear in supporting documentation.  No representations are
+made about the suitability of this software for any purpose.  It is provided
+"as is" without express or implied warranty.
+
diff --git a/src/c/quikr_train.c b/src/c/quikr_train.c
new file mode 100644
index 0000000..d2a83ef
--- /dev/null
+++ b/src/c/quikr_train.c
@@ -0,0 +1,176 @@
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <zlib.h>
+
+#include "quikr_functions.h"
+
+#define AWK_KMER_PERMUTATIONS "awk 'function p(l,v,i){for(i in A) {if(l<%d) p(l+1, (v?v\"\":x)i); else print v\"\"i;}} {A[$0]} END {p(1);} ' <<<$'A\nC\nG\nT'"
+#define USAGE "Usage:\n\tquikr_train [OPTION...] - to train a database for use with quikr.\n\nOptions:\n\n-i, --input\n\tthe database of sequences to create the sensing matrix (fasta format)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-o, --output\n\tthe sensing matrix. (a gzip'd text file)\n\n-v, --verbose\n\tverbose mode."
+
+int main(int argc, char **argv) {
+
+  char probabilities_command[512];
+  char kmers_file[256];
+  char *line = NULL;
+  char *val;
+  size_t len = 0;
+
+
+  int c;
+  int kmer = 0;
+
+  char *fasta_file = NULL;
+  char *output_file = NULL;
+
+  int x = 0;
+  int y = 0;
+
+  int verbose = 0;
+
+  gzFile output = NULL;
+
+  while (1) {
+    static struct option long_options[] = {
+      {"verbose", no_argument, 0, 'v'},
+      {"input", required_argument, 0, 'i'},
+      {"kmer",  required_argument, 0, 'k'},
+      {"output", required_argument, 0, 'o'},
+      {0, 0, 0, 0}
+    };
+
+    int option_index = 0;
+
+    c = getopt_long (argc, argv, "i:o:k:hv", long_options, &option_index);
+
+    if (c == -1)
+      break;
+
+    switch (c) {
+      case 'i':
+        fasta_file = optarg;
+        break;
+      case 'k':
+        kmer = atoi(optarg);
+        break;
+      case 'o':
+        output_file = optarg;
+        break;
+      case 'v':
+        verbose = 1;
+        break;
+      case 'h':
+        printf("%s\n", USAGE);
+        exit(EXIT_SUCCESS);
+        break;
+      case '?':
+        /* getopt_long already printed an error message. */
+        break;
+      default:
+        exit(EXIT_FAILURE);
+    }
+  }
+
+
+  if(fasta_file == NULL) {
+    fprintf(stderr, "Error: input fasta file (-i) must be specified\n\n");
+    fprintf(stderr, "%s\n", USAGE);
+    exit(EXIT_FAILURE);
+  }
+
+  if(output_file == NULL) {
+    fprintf(stderr, "Error: output directory (-o) must be specified\n\n");
+    fprintf(stderr, "%s\n", USAGE);
+    exit(EXIT_FAILURE);
+  }
+
+  if(kmer == 0)
+    kmer = 6;
+
+  if(verbose) {
+    printf("kmer size: %d\n", kmer);
+    printf("fasta file: %s\n", fasta_file);
+    printf("output file: %s\n", output_file);
+  }
+
+  if(strcmp(&output_file[strlen(output_file) - 3], ".gz") != 0) {
+    char *temp = malloc(sizeof(strlen(output_file) + 4));
+    sprintf(temp, "%s.gz", output_file);
+    output_file = temp;
+    printf("appending a .gz to our output file: %s\n", output_file);
+  }
+
+  // 4 ^ Kmer gives us the width, or the number of permutations of ACTG with kmer length
+  int width = pow(4, kmer);
+  int sequences = count_sequences(fasta_file);
+
+  if(verbose)
+    printf("sequences: %d\nwidth: %d\n", sequences, width);
+
+  // Allocate our matrix with the appropriate size, just one row
+  double *trained_matrix = malloc(width*sizeof(double));
+  if(trained_matrix == NULL) {
+    fprintf(stderr, "Could not allocated enough memory\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // call the probabilities-by-read command
+  sprintf(kmers_file, AWK_KMER_PERMUTATIONS, kmer);
+  sprintf(probabilities_command, "%s | probabilities-by-read %d %s /dev/stdin", kmers_file, kmer, fasta_file);
+  FILE *probabilities_output = popen(probabilities_command, "r");
+  if(probabilities_output == NULL) {
+    fprintf(stderr, "Error could not execute: %s\n", probabilities_command);
+    exit(EXIT_FAILURE);
+  }
+
+  // open our output file
+  output = gzopen(output_file, "w6");
+  if(output == NULL) {
+    fprintf(stderr, "Error: could not open output file, error code: %d", errno);
+    exit(EXIT_FAILURE);
+  }
+
+  if(verbose)
+    printf("Writing our sensing matrix to %s\n", output_file);
+
+  // read normalize and write our matrix  in one go
+  for(x = 0; x < sequences; x++) {
+
+    getline(&line, &len, probabilities_output);
+
+    // Read our first element in outside of the loop
+    val = strtok(line,"\t\n\r");
+    trained_matrix[0] = atof(val);
+    // iterate through and load the array
+    for (y = 1; y < width; y++) {
+      val = strtok (NULL, "\t\n\r");
+      trained_matrix[y] = atof(val);
+    }
+
+    double row_sum = 0;
+
+    for( y = 0; y < width; y++) {
+      row_sum = row_sum + trained_matrix[y];
+    }
+
+    for( y = 0; y < width; y++) {
+      trained_matrix[y] = trained_matrix[y] / row_sum;
+    }
+
+    for( y = 0; y < width; y++) {
+      gzprintf(output, "%.10f\t", trained_matrix[y]);
+    }
+    gzprintf(output, "\n");
+  }
+
+  free(trained_matrix);
+  gzclose(output);
+  pclose(probabilities_output);
+
+  return 0;
+}
author	Calvin <calvin@EESI>	2013-05-14 21:51:40 -0400
committer	Calvin <calvin@EESI>	2013-05-14 21:51:40 -0400
commit	0773aaf89678b967588a902df1f5e6f9ccea393d (patch)
tree	40762e5df1da876d460d8695357ab0835645e8c6 /src/c
parent	1d2becc9af591d37badfe0e77751bbb80932472f (diff)