aboutsummaryrefslogtreecommitdiff
path: root/src/c
diff options
context:
space:
mode:
Diffstat (limited to 'src/c')
-rw-r--r--src/c/Makefile14
-rw-r--r--src/c/multifasta_to_otu.195
-rw-r--r--src/c/multifasta_to_otu.c314
-rw-r--r--src/c/nnls.c416
-rw-r--r--src/c/nnls.h1
-rw-r--r--src/c/quikr.170
-rw-r--r--src/c/quikr.c192
-rw-r--r--src/c/quikr_functions.c163
-rw-r--r--src/c/quikr_functions.h14
-rw-r--r--src/c/quikr_train.161
-rw-r--r--src/c/quikr_train.c176
11 files changed, 1516 insertions, 0 deletions
diff --git a/src/c/Makefile b/src/c/Makefile
new file mode 100644
index 0000000..ba0c799
--- /dev/null
+++ b/src/c/Makefile
@@ -0,0 +1,14 @@
+UNAME := $(shell uname)
+CC = gcc
+QUIKR_TRAIN_CFLAGS = -O3 -s -mtune=native -Wall -lm -lz -D$(UNAME)
+QUIKR_CFLAGS = -O3 -s -mtune=native -Wextra -Wall -lm -pthread -L../ -I../ -std=gnu99 -fopenmp -D$(UNAME)
+all: quikr_train_ quikr_ multifasta_to_otu_
+
+multifasta_to_otu_:
+ $(CC) multifasta_to_otu.c quikr_functions.c nnls.c -o multifasta_to_otu $(QUIKR_CFLAGS)
+quikr_train_:
+ $(CC) quikr_train.c quikr_functions.c -o quikr_train $(QUIKR_TRAIN_CFLAGS)
+quikr_:
+ $(CC) quikr.c quikr_functions.c nnls.c -o quikr $(QUIKR_CFLAGS) -I../
+clean:
+ rm -vf quikr_train quikr multifasta_to_otu
diff --git a/src/c/multifasta_to_otu.1 b/src/c/multifasta_to_otu.1
new file mode 100644
index 0000000..7878ef0
--- /dev/null
+++ b/src/c/multifasta_to_otu.1
@@ -0,0 +1,95 @@
+.TH multifasta_to_otu 1 multifasta_to_otu-2013-05
+.SH NAME
+multifasta_to_otu \- create a QIIME OTU table based on Quikr results.
+.SH SYNOPSIS
+.B multifasta_to_otu
+.RB [ \-i
+.IR input-directory ]
+.RB [ \-f
+.IR sensing-fasta]
+.RB [ \-s
+.IR sensing-matrix]
+.RB [ \-k
+.IR kmer ]
+.RB [ \-l
+.IR lambda ]
+.RB [ \-j
+.IR jobs ]
+.RB [ \-o
+.IR otu-table]
+.RB [ \-v ]
+.P
+.BR multifasta_to_otu " ..."
+.SH DESCRIPTION
+.B multifasta_to_otu
+The multifasta_to_otu tool is a handy wrapper for quikr which lets the user
+to input as many fasta files as they like, and then returns an OTU table of the
+number of times a specimen was seen in all of the samples.
+.P
+.SH OPTIONS
+.TP
+.B \-i, --input-directory
+the directory containing the samples' fasta files of reads (note each fasta file should correspond to a separate sample)
+.TP
+.B \-f, --sensing-fasta
+location of the fasta file database used to create the sensing matrix.
+.TP
+.B \-s, --sensing-matrix
+location of the sensing matrix.
+.TP
+.b \-k, --kmer
+specify what size of kmer to use. (default value is 6)
+.TP
+.B \-l, --lambda
+lambda value to use. (default value is 10000)
+.TP
+.B \-j, --jobs
+specifies how many jobs to run at once. (default value is the number of CPUs)
+.TP
+.B \-o, --otu-table
+the OTU table, with OTU_FRACTION_PRESENT for each sample which is compatible with QIIME's convert_biom.py (or sequence table if not OTU's)
+.TP
+.B \-v, --verbose
+verbose mode.
+.SH USAGE
+This program will use a large amount of memory, and CPU time.
+You can reduce the number of cores used, and thus memory, by specifying the -j flag with aspecified number of jobs. Otherwise multifasta_to_otu will run one job per cpu core.
+.SH POSTPROCESSING
+.B Note: When making your QIIME Metadata file, the sample id's must match the sample fasta file prefix names
+.P
+4-step QIIME procedure after using Quikr to obtain 3D PCoA graphs: (Note: Our code works much better with WEIGHTED Unifrac as opposed to Unweighted.)
+.TP
+Pre-requisites:
+1. multifasta output file "quikr_output_table.txt" for our example.
+.br
+2. the tree of the database sequences that were used (e.g.dp7_mafft.fasttree, gg_94_otus_4feb2011.tre, etc.)
+.br
+3. your-defined <qiime_metadata_file.txt>
+.TP
+The QIIME procedue:
+convert_biom.py -i <quikr_otu_table.txt> -o <quikr_otu>.biom --biom_table_type="otu table"
+.br
+beta_diversity.py -i <quikr_otu>.biom -m weighted_unifrac -o beta_div -t <tree file>
+.br
+principal_coordinates.py -i beta_div/weighted_unifrac_<quikr_otu>.txt -o <quikr_otu_project_name>_weighted.txt
+.br
+make_3d_plots.py -i <quikr_otu_project_name>_weighted.txt -o <3d_pcoa_plotdirectory> -m <qiime_metadata_file>
+.SH "SEE ALSO"
+\fBquikr\fP(1), \fBquikr_train\fP(1).
+.SH AUTHORS
+.B multifasta2otu
+was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison
+<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot
+.SH REPORTING BUGS
+.TP
+Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \
+operating system, current compiler, and test files to reproduce your issue.
+.SH COPYRIGHT.
+Copyright \(co 2013 by Calvin Morrison and Gail Rosen. Permission to use,
+copy, modify, distribute, and sell this software and its documentation for
+any purpose is hereby granted without fee, provided that the above copyright
+notice appear in all copies and that both that copyright notice and this
+permission noticeappear in supporting documentation. No representations are
+made about the suitability of this software for any purpose. It is provided
+"as is" without express or implied warranty.
+
diff --git a/src/c/multifasta_to_otu.c b/src/c/multifasta_to_otu.c
new file mode 100644
index 0000000..268dec6
--- /dev/null
+++ b/src/c/multifasta_to_otu.c
@@ -0,0 +1,314 @@
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <getopt.h>
+#include <math.h>
+#include <omp.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "nnls.h"
+#include "quikr_functions.h"
+
+#define sensing_matrix(i,j) (sensing_matrix[width*i + j])
+#define solutions(i,j) (solutions[sequences*i+ j])
+#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - create a QIIME OTU table based on Quikr results. \n\nOptions:\n\n-i, --input-directory\n\tthe directory containing the samples' fasta files of reads (note each file should correspond to a separate sample)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (sensing from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-j, --jobs\n\t specifies how many jobs to run at once. (default value is the number of CPUs)\n\n-o, --output\n\tthe OTU table, with OTU_FRACTION_PRESENT for each sample which is compatible with QIIME's convert_biom.py (or a sequence table if not OTU's)\n\n-v, --verbose\n\tverbose mode."
+
+int main(int argc, char **argv) {
+
+ int c;
+
+ char *input_fasta_directory = NULL;
+ char *sensing_matrix_filename = NULL;
+ char *sensing_fasta_filename = NULL;
+ char *output_filename = NULL;
+
+ double *sensing_matrix;
+
+ long int width = 0;
+ long int sequences = 0;
+
+ int kmer = 6;
+ int lambda = 10000;
+
+ int x = 0;
+ int y = 0;
+
+ int jobs = 1;
+ #ifdef Linux
+ jobs = get_nprocs();
+ #endif
+ #ifdef Darwin
+ jobs = sysconf (_SC_NPROCESSORS_ONLN);
+ #endif
+
+ int verbose = 0;
+
+ DIR *input_directory_dh;
+ struct dirent *entry;
+
+ while (1) {
+ static struct option long_options[] = {
+ {"input-directory", required_argument, 0, 'i'},
+ {"kmer", required_argument, 0, 'k'},
+ {"lambda", required_argument, 0, 'l'},
+ {"jobs", required_argument, 0, 'j'},
+ {"output", required_argument, 0, 'o'},
+ {"sensing-fasta", required_argument, 0, 'f'},
+ {"sensing-matrix", required_argument, 0, 's'},
+ {"verbose", no_argument, 0, 'v'},
+ {0, 0, 0, 0}
+ };
+ int option_index = 0;
+
+ c = getopt_long (argc, argv, "k:l:f:s:i:o:j:hv", long_options, &option_index);
+
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'k':
+ kmer = atoi(optarg);
+ break;
+ case 'l':
+ lambda = atoi(optarg);
+ break;
+ case 'f':
+ sensing_fasta_filename = optarg;
+ break;
+ case 's':
+ sensing_matrix_filename = optarg;
+ break;
+ case 'j':
+ jobs = atoi(optarg);
+ break;
+ case 'i':
+ input_fasta_directory = optarg;
+ break;
+ case 'o':
+ output_filename = optarg;
+ break;
+ case 'v':
+ verbose = 1;
+ break;
+ case 'h':
+ puts(USAGE);
+ exit(EXIT_SUCCESS);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if(sensing_matrix_filename == NULL) {
+ fprintf(stderr, "Error: sensing matrix filename (-s) must be specified\n\n");
+ fprintf(stderr, "%s\n", USAGE);
+ exit(EXIT_FAILURE);
+ }
+ if(sensing_fasta_filename == NULL) {
+ fprintf(stderr, "Error: sensing fasta filename (-f) must be specified\n\n");
+ fprintf(stderr, "%s\n", USAGE);
+ exit(EXIT_FAILURE);
+ }
+ if(output_filename == NULL) {
+ fprintf(stderr, "Error: Output Filename (-o) must be specified\n\n");
+ fprintf(stderr, "%s\n", USAGE);
+ exit(EXIT_FAILURE);
+ }
+ if(input_fasta_directory == NULL) {
+ fprintf(stderr, "Error: input fasta directory (-i) must be specified\n\n");
+ fprintf(stderr, "%s\n", USAGE);
+ exit(EXIT_FAILURE);
+ }
+
+ // set defaults
+
+
+ if(verbose) {
+ printf("kmer: %d\n", kmer);
+ printf("lambda: %d\n", lambda);
+ printf("input directory: %s\n", input_fasta_directory);
+ printf("sensing database: %s\n", sensing_matrix_filename);
+ printf("sensing database fasta: %s\n", sensing_fasta_filename);
+ printf("output: %s\n", output_filename);
+ printf("number of jobs to run at once: %d\n", jobs);
+ }
+
+
+ input_directory_dh = opendir(input_fasta_directory);
+ if(input_fasta_directory == NULL) {
+ fprintf(stderr, "could not open %s\n", input_fasta_directory);
+ exit(EXIT_FAILURE);
+ }
+
+ // do a directory count
+ int dir_count = -2; // -2 for ../ and ./
+ while(entry = readdir(input_directory_dh))
+ dir_count++;
+ rewinddir(input_directory_dh);
+ if(dir_count == 0) {
+ fprintf(stderr, "%s is empty\n", input_fasta_directory);
+ exit(EXIT_FAILURE);
+ }
+
+ // 4 "ACGT" ^ Kmer gives us the size of output rows
+ width = pow(4, kmer) + 1;
+ sequences = count_sequences(sensing_fasta_filename);
+
+ if(verbose) {
+ printf("directory count: %d\n", dir_count);
+ printf("width: %ld\nsequences %ld\n", width, sequences);
+ }
+
+ sensing_matrix = load_sensing_matrix(sensing_matrix_filename, sequences, width);
+
+ // multiply our matrix by lambda
+ for(x = 0; x < sequences; x++) {
+ for(y= 0; y < width; y++) {
+ sensing_matrix(x, y) = sensing_matrix(x, y) * lambda;
+ }
+ }
+
+ // set the first row to be all 1's
+ for(x = 0; x < sequences; x++) {
+ sensing_matrix(x, 0) = 1.0;
+ }
+
+ double *solutions = malloc(dir_count * sequences * sizeof(double));
+ if(solutions == NULL) {
+ fprintf(stderr, "Could not allocate enough memory for solutions vector\n");
+ exit(EXIT_FAILURE);
+ }
+
+ char **filenames = malloc(dir_count * sizeof(char *));
+ if(filenames == NULL) {
+ fprintf(stderr, "Could not allocate enough memory\n");
+ exit(EXIT_FAILURE);
+ }
+
+ int *file_sequence_count = malloc(dir_count * sizeof(int));
+ if(file_sequence_count == NULL) {
+ fprintf(stderr, "Could not allocate enough memory\n");
+ exit(EXIT_FAILURE);
+ }
+
+ struct dirent result;
+
+ omp_set_num_threads(jobs);
+ int done = 0;
+ printf("Beginning to process samples\n");
+#pragma omp parallel for shared(solutions, sequences, width, result, done)
+ for(int i = 0; i < dir_count; i++ ) {
+
+ int z = 0;
+ struct dirent *directory_entry;
+ char *filename = malloc(256 * sizeof(char));
+ char *base_filename = malloc(256 * sizeof(char));
+ if(filename == NULL || base_filename == NULL) {
+ fprintf(stderr, "Could not allocate enough memory\n");
+ exit(EXIT_FAILURE);
+ }
+
+#pragma omp critical
+ readdir_r(input_directory_dh, &result, &directory_entry);
+
+ if(strcmp(directory_entry->d_name, "..") == 0 || strcmp(directory_entry->d_name, ".") == 0) {
+ i--;
+ continue;
+ }
+
+ // get our base filenames
+ strcpy(base_filename, directory_entry->d_name);
+ filenames[i] = base_filename;
+
+ // get our real filename
+ sprintf(filename, "%s/%s", input_fasta_directory, directory_entry->d_name);
+
+ // get individual sequence count
+ file_sequence_count[i] = count_sequences(filename);
+
+ // count the kmer amounts
+ double *count_matrix = load_count_matrix(filename, width, kmer);
+
+ // normalize our kmer counts
+ normalize_matrix(count_matrix, 1, width);
+
+ // multiply our kmers frequency by lambda
+ for(z = 0; z < width; z++)
+ count_matrix[z] = count_matrix[z] * lambda;
+
+ double *sensing_matrix_copy = malloc(sizeof(double) * sequences * width);
+ if(sensing_matrix_copy == NULL) {
+ fprintf(stderr, "Could not allocate enough memory\n");
+ exit(EXIT_FAILURE);
+ }
+
+ memcpy(sensing_matrix_copy, sensing_matrix, sequences * width * sizeof(double));
+
+
+ // run nnls
+ double *solution = nnls(sensing_matrix_copy, count_matrix, sequences, width);
+
+ // normalize our solution
+ normalize_matrix(solution, 1, sequences);
+
+ // add the current solution to the solutions array
+ for(z = 0; z < sequences; z++ ) {
+ solutions(i, z) = solution[z];
+ }
+
+ done++;
+ printf("%d/%d samples processed\n", done, dir_count);
+ free(solution);
+ free(count_matrix);
+ free(filename);
+ free(sensing_matrix_copy);
+ }
+
+ char **headers = load_headers(sensing_fasta_filename, sequences);
+
+ // output our matrix
+ FILE *output_fh = fopen(output_filename, "w");
+ if(output_fh == NULL) {
+ fprintf(stderr, "Could not open %s for writing\n", output_filename);
+ exit(EXIT_FAILURE);
+ }
+
+ fprintf(output_fh, "# QIIME vQuikr OTU table\n");
+ fprintf(output_fh, "#OTU_ID\t");
+
+ // print our filename headers
+ for(x = 0; x < dir_count - 1; x++) {
+ fprintf(output_fh, "%s\t", filenames[x]);
+ }
+ fprintf(output_fh, "%s\n", filenames[dir_count - 1]);
+
+ // get our actual values
+ for(y = 0; y < sequences; y++) {
+ for(x = 0; x < dir_count; x++) {
+ solutions(x, y) = round(solutions(x, y) * file_sequence_count[x]);
+ }
+ }
+
+ for(y = 0; y < sequences; y++) {
+
+ double column_sum = 0.;
+ for(x = 0; x < dir_count; x++) {
+ column_sum += solutions(x, y);
+ }
+
+ // if our column is zero, don't bother printing the row
+ if(column_sum != 0) {
+ fprintf(output_fh, "%s\t", headers[y]);
+
+ for(x = 0; x < dir_count - 1; x++) {
+ fprintf(output_fh, "%d\t", (int)solutions(x, y));
+ }
+ fprintf(output_fh, "%d\n", (int)solutions[sequences*(dir_count - 1) + y]);
+ }
+ }
+ fclose(output_fh);
+
+ return EXIT_SUCCESS;
+}
diff --git a/src/c/nnls.c b/src/c/nnls.c
new file mode 100644
index 0000000..ddd9f2f
--- /dev/null
+++ b/src/c/nnls.c
@@ -0,0 +1,416 @@
+/*
+ * nnls.c (c) 2002-2009 Turku PET Centre
+ * This file contains the routine NNLS (nonnegative least squares)
+ * and the subroutines required by it, except h12, which is in
+ * file 'lss_h12.c'.
+ *
+ * This routine is based on the text and fortran code in
+ * C.L. Lawson and R.J. Hanson, Solving Least Squares Problems,
+ * Prentice-Hall, Englewood Cliffs, New Jersey, 1974.
+ * Version:
+ * 2002-08-19 Vesa Oikonen
+ * 2003-05-08 Kaisa Sederholm & VO
+ * Included function nnlsWght().
+ * 2003-05-12 KS
+ * Variable a_dim1 excluded
+ *
+ * Usage of the coefficient matrix altered so that it is
+ * given in a[][] instead of a[].
+ * 2003-11-06 VO
+ * If n<2, then itmax is set to n*n instead of previous n*3.
+ * 2004-09-17 VO
+ * Doxygen style comments.
+ * 2006-24-04 Pauli Sundberg
+ * Added some debuging output, and made some comments more precise.
+ * 2007-05-17 VO
+ * 2009-04-16 VO
+ * Corrected a bug in nnls() which may have caused an infinite loop.
+ * 009-04-27 VO
+ * Added function nnlsWghtSquared() for faster pixel-by-pixel calculations.
+ * Checking for exceeding iteration count is corrected in nnls().
+ */
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+#define MAX(a,b) ((a) >= (b) ? (a) : (b))
+#define ABS(x) ((x) >= 0 ? (x) : -(x))
+
+int h12( int mode, int lpivot, int l1, int m, double *u, int u_dim1, double *up, double *cm, int ice, int icv, int ncv) {
+ double d1, b, clinv, cl, sm;
+ int k, j;
+
+ /* Check parameters */
+ if (mode!=1 && mode!=2)
+ return(1);
+ if (m<1 || u==NULL || u_dim1<1 || cm==NULL)
+ assert(0);
+ // return(1);
+ if (lpivot<0 || lpivot>=l1 || l1>m)
+ // assert(0);
+ return(1);
+
+ /* Function Body */
+ cl = ABS( u[lpivot*u_dim1] );
+ // cl= (d1 = u[lpivot*u_dim1], fabs(d1));
+
+ if (mode==2)
+ { /* Apply transformation I+U*(U**T)/B to cm[] */
+ if(cl<=0.)
+ // assert(0);
+ return(0);
+ }
+ else
+ { /* Construct the transformation */
+
+
+ /* This is the way provided in the original pseudocode
+ sm = 0;
+ for (j = l1; j < m; j++)
+ {
+ d1 = u[j * u_dim1];
+ sm += d1*d1;
+ }
+ d1 = u[lpivot * u_dim1];
+ sm += d1*d1;
+ sm = sqrt(sm);
+
+ if (u[lpivot*u_dim1] > 0)
+ sm=-sm;
+
+ up[0] = u[lpivot*u_dim1] - sm;
+ u[lpivot*u_dim1]=sm;
+ printf("Got sum: %f\n",sm);
+ */
+
+ /* and this trying to compensate overflow */
+ for (j=l1; j<m; j++)
+ { // Computing MAX
+ cl = MAX( ABS( u[j*u_dim1] ), cl );
+ }
+ // zero vector?
+
+ if (cl<=0.)
+ return(0);
+
+ clinv=1.0/cl;
+
+ // Computing 2nd power
+ d1=u[lpivot*u_dim1]*clinv;
+ sm=d1*d1;
+
+ for (j=l1; j<m; j++)
+ {
+ d1=u[j*u_dim1]*clinv;
+ sm+=d1*d1;
+ }
+ cl *= sqrt(sm);
+
+ if (u[lpivot*u_dim1] > 0.)
+ cl=-cl;
+ up[0] = u[lpivot*u_dim1] - cl;
+ u[lpivot*u_dim1]=cl;
+ }
+
+ // no vectors where to apply? only change pivot vector!
+ b=up[0] * u[lpivot*u_dim1];
+
+ /* b must be nonpositive here; if b>=0., then return */
+ if (b == 0)
+ return(0);
+
+ // ok, for all vectors we want to apply
+ for (j =0; j < ncv; j++) {
+ sm = cm[ lpivot * ice + j * icv ] * (up[0]);
+
+ for (k=l1; k<m; k++)
+ sm += cm[ k * ice + j*icv ] * u[ k*u_dim1 ];
+
+ if (sm != 0.0) {
+ sm *= (1/b);
+ // cm[lpivot, j] = ..
+ cm[ lpivot * ice + j*icv] += sm * (up[0]);
+ for (k= l1; k<m; k++)
+ {
+ cm[ k*ice + j*icv] += u[k * u_dim1]*sm;
+ }
+ }
+ }
+
+ return(0);
+}
+
+
+void g1(double a, double b, double *cterm, double *sterm, double *sig)
+{
+ double d1, xr, yr;
+
+ if( fabs(a) > fabs(b) ) {
+ xr = b / a;
+ d1 = xr;
+ yr = sqrt(d1*d1 + 1.);
+ d1 = 1./yr;
+ *cterm=(a>=0.0 ? fabs(d1) : -fabs(d1));
+ *sterm=(*cterm)*xr;
+ *sig=fabs(a)*yr;
+ } else if( b != 0.) {
+ xr = a / b;
+ d1 = xr;
+ yr = sqrt(d1 * d1 + 1.);
+ d1 = 1. / yr;
+ *sterm=(b>=0.0 ? fabs(d1) : -fabs(d1));
+ *cterm=(*sterm)*xr; *sig=fabs(b)*yr;
+ } else {
+ *sig=0.; *cterm=0.; *sterm=1.;
+ }
+}
+
+
+int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm) {
+ int pfeas;
+ int ret=0;
+ int iz;
+ int jz;
+ int k, j=0, l, itmax, izmax=0, ii, jj=0, ip;
+ double d1, d2, sm, up, ss;
+ double temp, wmax, t, alpha, asave, dummy, unorm, ztest, cc;
+
+
+ /* Check the parameters and data */
+ if(m <= 0 || n <= 0 || a == NULL || b == NULL || x == NULL)
+ return(2);
+
+ /* Allocate memory for working space, if required */
+ double *w = (double*)calloc(n, sizeof(double));
+ double *zz = (double*)calloc(m, sizeof(double));
+ int *index = (int*)calloc(n, sizeof(int));
+ if(w == NULL || zz == NULL || index == NULL)
+ return(2);
+
+ /* Initialize the arrays INDEX[] and X[] */
+ for(k=0; k<n; k++) {
+ x[k]=0.;
+ index[k]=k;
+ }
+
+ int iz2 = n - 1;
+ int iz1 = 0;
+ int iter=0;
+ int nsetp=0;
+ int npp1=0;
+
+ /* Main loop; quit if all coeffs are already in the solution or */
+ /* if M cols of A have been triangularized */
+ if(n < 3)
+ itmax=n*3;
+ else
+ itmax=n*n;
+
+
+ while(iz1 <= iz2 && nsetp < m) {
+ /* Compute components of the dual (negative gradient) vector W[] */
+ for(iz=iz1; iz<=iz2; iz++) {
+ j=index[iz];
+ sm=0.;
+ for(l=npp1; l<m; l++)
+ sm+=a[j*m + l]*b[l];
+ w[j]=sm;
+ }
+
+ while(1) {
+ /* Find largest positive W[j] */
+ for(wmax=0., iz=iz1; iz<=iz2; iz++) {
+ j=index[iz]; if(w[j]>wmax) {wmax=w[j]; izmax=iz;}}
+
+ /* Terminate if wmax<=0.; */
+ /* it indicates satisfaction of the Kuhn-Tucker conditions */
+ if(wmax<=0.0)
+ break;
+
+ iz=izmax;
+ j=index[iz];
+
+ /* The sign of W[j] is ok for j to be moved to set P. */
+ /* Begin the transformation and check new diagonal element to avoid */
+ /* near linear dependence. */
+ asave=a[j*m + npp1];
+ h12(1, npp1, npp1+1, m, &a[j*m +0], 1, &up, &dummy, 1, 1, 0);
+ unorm=0.;
+ if(nsetp!=0){
+ for(l=0; l<nsetp; l++) {
+ d1=a[j*m + l];
+ unorm+=d1*d1;
+ }
+ }
+
+ unorm=sqrt(unorm);
+ d2=unorm+(d1=a[j*m + npp1], fabs(d1)) * 0.01;
+ if((d2-unorm)>0.) {
+ /* Col j is sufficiently independent. Copy B into ZZ, update ZZ */
+ /* and solve for ztest ( = proposed new value for X[j] ) */
+ for(l=0; l<m; l++) zz[l]=b[l];
+ h12(2, npp1, npp1+1, m, &a[j*m + 0], 1, &up, zz, 1, 1, 1);
+ ztest=zz[npp1]/a[j*m +npp1];
+ /* See if ztest is positive */
+ if(ztest>0.) break;
+ }
+
+ /* Reject j as a candidate to be moved from set Z to set P. Restore */
+ /* A[npp1,j], set W[j]=0., and loop back to test dual coeffs again */
+ a[j*m+ npp1]=asave; w[j]=0.;
+ } /* while(1) */
+
+ if(wmax<=0.0)
+ break;
+
+ /* Index j=INDEX[iz] has been selected to be moved from set Z to set P. */
+ /* Update B and indices, apply householder transformations to cols in */
+ /* new set Z, zero subdiagonal elts in col j, set W[j]=0. */
+ for(l=0; l<m; ++l)
+ b[l]=zz[l];
+
+ index[iz]=index[iz1];
+ index[iz1]=j;
+ iz1++;
+ npp1++;
+ nsetp=npp1;
+
+ if(iz1<=iz2) {
+ for(jz=iz1; jz<=iz2; jz++) {
+ jj=index[jz];
+ h12(2, nsetp-1, npp1, m, &a[j*m +0], 1, &up, &a[jj*m +0], 1, m, 1);
+ }
+ }
+
+ if(nsetp!=m) {
+ for(l=npp1; l<m; l++)
+ a[j*m +l]=0.;
+ }
+
+ w[j]=0.;
+
+ /* Solve the triangular system; store the solution temporarily in Z[] */
+ for(l=0; l<nsetp; l++) {
+ ip=nsetp-(l+1);
+ if(l!=0) for(ii=0; ii<=ip; ii++) zz[ii]-=a[jj*m + ii]*zz[ip+1];
+ jj=index[ip]; zz[ip]/=a[jj*m +ip];
+ }
+
+ /* Secondary loop begins here */
+ while(++iter < itmax) {
+ /* See if all new constrained coeffs are feasible; if not, compute alpha */
+ for(alpha = 2.0, ip = 0; ip < nsetp; ip++) {
+ l=index[ip];
+ if(zz[ip]<=0.) {
+ t = -x[l]/(zz[ip]-x[l]);
+ if(alpha > t) {
+ alpha = t;
+ jj = ip - 1;
+ }
+ }
+ }
+
+ /* If all new constrained coeffs are feasible then still alpha==2. */
+ /* If so, then exit from the secondary loop to main loop */
+ if(alpha==2.0)
+ break;
+
+ /* Use alpha (0.<alpha<1.) to interpolate between old X and new ZZ */
+ for(ip=0; ip<nsetp; ip++) {
+ l = index[ip];
+ x[l] += alpha*(zz[ip]-x[l]);
+ }
+
+ /* Modify A and B and the INDEX arrays to move coefficient i */
+ /* from set P to set Z. */
+ k=index[jj+1]; pfeas=1;
+ do {
+ x[k]=0.;
+ if(jj!=(nsetp-1)) {
+ jj++;
+ for(j=jj+1; j<nsetp; j++) {
+ ii=index[j]; index[j-1]=ii;
+ g1(a[ii*m + (j-1)], a[ii*m + j], &cc, &ss, &a[ii*m + j-1]);
+ for(a[ii*m + j]=0., l=0; l<n; l++) if(l!=ii) {
+ /* Apply procedure G2 (CC,SS,A(J-1,L),A(J,L)) */
+ temp=a[l*m + j-1];
+ a[l*m + j-1]=cc*temp+ss*a[l*m + j];
+ a[l*m + j]=-ss*temp+cc*a[l*m + j];
+ }
+ /* Apply procedure G2 (CC,SS,B(J-1),B(J)) */
+ temp=b[j-1]; b[j-1]=cc*temp+ss*b[j]; b[j]=-ss*temp+cc*b[j];
+ }
+ }
+ npp1=nsetp-1; nsetp--; iz1--; index[iz1]=k;
+
+ /* See if the remaining coeffs in set P are feasible; they should */
+ /* be because of the way alpha was determined. If any are */
+ /* infeasible it is due to round-off error. Any that are */
+ /* nonpositive will be set to zero and moved from set P to set Z */
+ for(jj=0, pfeas=1; jj<nsetp; jj++) {
+ k=index[jj]; if(x[k]<=0.) {pfeas=0; break;}
+ }
+ } while(pfeas==0);
+
+ /* Copy B[] into zz[], then solve again and loop back */
+ for(k=0; k<m; k++)
+ zz[k]=b[k];
+ for(l=0; l<nsetp; l++) {
+ ip=nsetp-(l+1);
+ if(l!=0) for(ii=0; ii<=ip; ii++) zz[ii]-=a[jj*m + ii]*zz[ip+1];
+ jj=index[ip]; zz[ip]/=a[jj*m + ip];
+ }
+ } /* end of secondary loop */
+
+ if(iter>=itmax) {
+ ret = 1;
+ break;
+ }
+
+ for(ip=0; ip<nsetp; ip++) {
+ k=index[ip];
+ x[k]=zz[ip];
+ }
+ } /* end of main loop */
+
+ /* Compute the norm of the final residual vector */
+ sm=0.;
+
+ if (rnorm != NULL) {
+ if (npp1<m)
+ for (k=npp1; k<m; k++)
+ sm+=(b[k] * b[k]);
+ else
+ for (j=0; j<n; j++)
+ w[j]=0.;
+ *rnorm=sqrt(sm);
+ }
+
+ /* Free working space, if it was allocated here */
+ free(w);
+ free(zz);
+ free(index);
+ return(ret);
+}
+/* nnls_ */
+
+
+double *nnls(double *a_matrix, double *b_matrix, int height, int width) {
+
+ double *solution = (double*)calloc(height, sizeof(double));
+
+ if(solution == NULL) {
+ fprintf(stderr, "could not allocate enough memory for nnls\n");
+ exit(EXIT_FAILURE);
+ }
+
+ int ret = nnls_algorithm(a_matrix, width, height, b_matrix, solution, NULL);
+ if(ret == 1) {
+ printf("NNLS has reached the maximum iterations\n");
+ } else if(ret == 2) {
+ fprintf(stderr, "NNLS could not allocate enough memory\n");
+ exit(EXIT_FAILURE);
+ }
+
+ return solution;
+}
diff --git a/src/c/nnls.h b/src/c/nnls.h
new file mode 100644
index 0000000..ded8d01
--- /dev/null
+++ b/src/c/nnls.h
@@ -0,0 +1 @@
+double *nnls(double *a_matrix, double *b_matrix, int height, int width);
diff --git a/src/c/quikr.1 b/src/c/quikr.1
new file mode 100644
index 0000000..9982d94
--- /dev/null
+++ b/src/c/quikr.1
@@ -0,0 +1,70 @@
+.TH quikr 1 quikr-2013-05
+.SH NAME
+quikr \- Calculate estimated frequencies of bacteria in a sample.
+.SH SYNOPSIS
+.B quikr
+.RB [ \-i
+.IR input]
+.RB [ \-f
+.IR sensing-fasta]
+.RB [ \-s
+.IR sensing-matrix]
+.RB [ \-l
+.IR lambda ]
+.RB [ \-k
+.IR kmer ]
+.RB [ \-o
+.IR output ]
+.RB [ \-v ]
+.P
+.BR quikr " ..."
+.SH DESCRIPTION
+.B quikr
+Quikr returns the estimated frequencies of batcteria present when given a
+input FASTA file and a trained sensing matrix
+.P
+.SH OPTIONS
+.TP
+.B \-i, --input
+the sample's fasta file of NGS. READS (fasta format)
+.TP
+.B \-f, --sensing-fasta
+location of the fasta file database used to create the sensing matrix. (fasta format)
+.TP
+.B \-s, --sensing-matrix
+location of the sensing matrix. (trained from quikr_train)
+.TP
+.B \-k, --kmer
+specify what size of kmer to use. (default value is 6)
+.TP
+.B \-l, --lambda
+lambda value to use. (default value is 10000)
+.TP
+.B \-o, --output
+OTU_FRACTION_PRESENT a vector representing the percentage of database sequence's presence in sample. (csv output)
+.TP
+.B \-v, --verbose
+verbose mode.
+.SH EXAMPLES
+Use quikr to calculate the estimated frequencies for sample.fa, using rdp7.fasta as the sensing matrix we generated with quikr_train. This uses 6-mers by default, and a lambda value of 10000:
+.P
+quikr -i sample.fa -f rdp7.fa -s rdp_sensing_matrix.gz -o frequencies.txt
+.SH "SEE ALSO"
+\fBmultifasta_to_otu\fP(1), \fBquikr_train\fP(1).
+.SH AUTHORS
+.B quikr
+was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison
+<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot
+.SH REPORTING BUGS
+.TP
+Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \
+operating system, current compiler, and test files to reproduce your issue.
+.SH COPYRIGHT.
+Copyright \(co 2013 by Calvin Morrison and Gail Rosen. Permission to use,
+copy, modify, distribute, and sell this software and its documentation for
+any purpose is hereby granted without fee, provided that the above copyright
+notice appear in all copies and that both that copyright notice and this
+permission noticeappear in supporting documentation. No representations are
+made about the suitability of this software for any purpose. It is provided
+"as is" without express or implied warranty.
+
diff --git a/src/c/quikr.c b/src/c/quikr.c
new file mode 100644
index 0000000..c73e0dd
--- /dev/null
+++ b/src/c/quikr.c
@@ -0,0 +1,192 @@
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "nnls.h"
+#include "quikr_functions.h"
+
+#define sensing_matrix(i,j) (sensing_matrix[width*i + j])
+#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - Calculate estimated frequencies of bacteria in a sample.\n\nOptions:\n\n-i, --input\n\tthe sample's fasta file of NGS READS (fasta format)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (trained from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-o, --output\n\tthe sensing matrix. (a gzip'd text file)\n\n-v, --verbose\n\tverbose mode."
+
+int main(int argc, char **argv) {
+
+
+ int c;
+ int kmer = 0;
+
+ char *input_fasta_filename = NULL;
+ char *sensing_matrix_filename = NULL;
+ char *sensing_fasta_filename = NULL;
+ char *output_filename = NULL;
+
+ int x = 0;
+ int y = 0;
+ int verbose = 0;
+ int lambda = 0;
+
+
+
+ while (1) {
+ static struct option long_options[] = {
+ {"input", required_argument, 0, 'i'},
+ {"kmer", required_argument, 0, 'k'},
+ {"lambda", required_argument, 0, 'l'},
+ {"output", required_argument, 0, 'o'},
+ {"sensing-fasta", required_argument, 0, 'f'},
+ {"sensing-matrix", required_argument, 0, 's'},
+ {"verbose", no_argument, 0, 'v'},
+ {0, 0, 0, 0}
+ };
+
+ int option_index = 0;
+
+ c = getopt_long (argc, argv, "k:l:f:s:i:o:hdv", long_options, &option_index);
+
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 0:
+ case 'k':
+ kmer = atoi(optarg);
+ break;
+ case 'l':
+ lambda = atoi(optarg);
+ break;
+ case 'f':
+ sensing_fasta_filename = optarg;
+ break;
+ case 's':
+ sensing_matrix_filename = optarg;
+ break;
+ case 'i':
+ input_fasta_filename = optarg;
+ break;
+ case 'o':
+ output_filename = optarg;
+ break;
+ case 'v':
+ verbose = 1;
+ break;
+ case 'h':
+ printf("%s\n", USAGE);
+ exit(EXIT_SUCCESS);
+ break;
+ default:
+ break;
+ }
+ }
+
+
+ if(sensing_matrix_filename == NULL) {
+ fprintf(stderr, "Error: sensing matrix filename (-s) must be specified\n\n");
+ fprintf(stderr, "%s\n", USAGE);
+ exit(EXIT_FAILURE);
+ }
+ if(sensing_fasta_filename == NULL) {
+ fprintf(stderr, "Error: sensing matrix filename (-f) must be specified\n\n");
+ fprintf(stderr, "%s\n", USAGE);
+ exit(EXIT_FAILURE);
+ }
+ if(output_filename == NULL) {
+ fprintf(stderr, "Error: Output Filename (-o) must be specified\n\n");
+ fprintf(stderr, "%s\n", USAGE);
+ exit(EXIT_FAILURE);
+ }
+ if(input_fasta_filename == NULL) {
+ fprintf(stderr, "Error: input fasta file (-i) must be specified\n\n");
+ fprintf(stderr, "%s\n", USAGE);
+ exit(EXIT_FAILURE);
+ }
+
+ if(lambda == 0)
+ lambda = 10000;
+ if(kmer == 0)
+ kmer = 6;
+
+ if(verbose) {
+ printf("kmer: %d\n", kmer);
+ printf("lambda: %d\n", lambda);
+ printf("fasta: %s\n", input_fasta_filename);
+ printf("sensing database: %s\n", sensing_matrix_filename);
+ printf("sensing database fasta: %s\n", sensing_fasta_filename);
+ printf("output: %s\n", output_filename);
+ }
+ // 4 "ACGT" ^ Kmer gives us the size of output rows
+ int width = pow(4, kmer);
+ width = width + 1;
+
+ int sequences = count_sequences(sensing_fasta_filename);
+
+ if(verbose) {
+ printf("width: %d\nsequences %d\n", width, sequences);
+ }
+
+ double *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, sequences, width);
+ double *count_matrix = load_count_matrix(input_fasta_filename, width, kmer);
+
+ // multiply our matrix by lambda
+ for(x = 1; x < sequences; x++) {
+ for(y= 0; y < width - 1; y++) {
+ sensing_matrix(x, y) = sensing_matrix(x, y) * lambda;
+ }
+ }
+
+ for(x= 0; x < sequences; x++) {
+ sensing_matrix(x, 0) = 1.0;
+ }
+ // normalize our count_matrix
+ normalize_matrix(count_matrix, 1, width);
+ for(x = 0; x < width; x++)
+ count_matrix[x] = count_matrix[x] * lambda;
+
+ // output our matricies if we are in verbose mode
+ if(verbose) {
+ FILE *sensing_matrix_fh = fopen( "sensing.matrix", "w");
+ if(sensing_matrix_fh == NULL) {
+ fprintf(stderr, "could not open sensing.matrix for writing.\n");
+ exit(EXIT_FAILURE);
+ }
+ for(x = 0; x < sequences; x++) {
+ for( y = 0; y < width; y++) {
+ fprintf(sensing_matrix_fh, "%.10f\t", sensing_matrix(x, y));
+ }
+ fprintf(sensing_matrix_fh, "\n");
+ }
+ fclose(sensing_matrix_fh);
+
+ FILE *count_matrix_fh = fopen("count.matrix", "w");
+ if(count_matrix_fh == NULL) {
+ fprintf(stderr, "could not open sensing.matrix for writing.\n");
+ exit(EXIT_FAILURE);
+ }
+ for(x = 0; x < width; x++) {
+ fprintf(count_matrix_fh, "%.10f\n", count_matrix[x]);
+ }
+ fclose(count_matrix_fh);
+ }
+
+ double *solution = nnls(sensing_matrix, count_matrix, sequences, width);
+
+ // normalize our solution vector
+ normalize_matrix(solution, 1, sequences);
+
+ // output our matrix
+ FILE *output_fh = fopen(output_filename, "w");
+ if(output_fh == NULL) {
+ fprintf(stderr, "Could not open %s for writing\n", output_filename);
+ exit(EXIT_FAILURE);
+ }
+ for(x = 0; x < sequences; x++) {
+ fprintf(output_fh, "%.10lf\n", solution[x]);
+ }
+ fclose(output_fh);
+
+ return EXIT_SUCCESS;
+}
diff --git a/src/c/quikr_functions.c b/src/c/quikr_functions.c
new file mode 100644
index 0000000..7e18e64
--- /dev/null
+++ b/src/c/quikr_functions.c
@@ -0,0 +1,163 @@
+#include <stdio.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <math.h>
+
+int count_sequences(char *filename) {
+ char command[512];
+ int sequences = 0;
+ FILE *grep_output;
+
+ sprintf(command, "grep -c ^\\> %s", filename);
+ grep_output = popen(command, "r");
+ if(grep_output == NULL) {
+ fprintf(stderr, "Could not execute %s\n", command);
+ exit(EXIT_FAILURE);
+ }
+
+ fscanf(grep_output, "%d", &sequences);
+
+ pclose(grep_output);
+ return sequences;
+}
+
+
+void normalize_matrix(double *matrix, int height, int width) {
+ int x = 0;
+ int y = 0;
+ for(x = 0; x < height; x++) {
+
+ double row_sum = 0;
+
+ for(y = 0; y < (width); y++)
+ row_sum = row_sum + matrix[width * x + y];
+ for(y = 0; y < (width); y++)
+ matrix[width * x + y] = matrix[width * x + y] / row_sum;
+ }
+}
+
+
+double *load_count_matrix(char *filename, int width, int kmer) {
+
+ double *count_matrix = malloc((width)*sizeof(double));
+ char count_command[512];
+ int x = 0;
+ char *line = NULL;
+ size_t len = 0;
+
+ if(count_matrix == NULL) {
+ fprintf(stderr, "could not allocate enough memory for the count matrix (%d x double) \n", width);
+ exit(EXIT_FAILURE);
+ }
+
+ // create out count matrix
+ sprintf(count_command, "count-kmers -r %d -1 -u %s", kmer, filename);
+ FILE *count_output = popen(count_command, "r");
+ if(count_output == NULL) {
+ fprintf(stderr, "could not execute \"%s\"", count_command);
+ exit(EXIT_FAILURE);
+ }
+
+ // set first element to zero.
+ count_matrix[0] = 0;
+
+ // get our first line
+ getline(&line, &len, count_output);
+ count_matrix[1] = atoi(line);
+
+ // iterate over the rest of the lines
+ for(x = 2; x < width; x++) {
+ getline(&line, &len, count_output);
+ count_matrix[x] = atoi(line);
+ }
+
+ pclose(count_output);
+
+ return count_matrix;
+}
+
+
+double *load_sensing_matrix(char *filename, int height, int width) {
+
+ int x = 0;
+ int y = 0;
+
+ char *line = NULL;
+ char *val;
+ char command[512];
+ size_t len = 0;
+ FILE *sensing_matrix_fh = NULL;
+
+ double *sensing_matrix = malloc(height * width * sizeof(double));
+ if(sensing_matrix == NULL) {
+ fprintf(stderr, "Could not allocate memory for the sensing matrix\n");
+ }
+
+ sprintf(command, "gzip -cd %s", filename);
+ sensing_matrix_fh = popen(command, "r");
+ if(sensing_matrix_fh == NULL) {
+ fprintf(stderr, "could not open %s", filename);
+ exit(EXIT_FAILURE);
+ }
+
+ // read our sensing matrix in
+ for(x = 0; x < height; x++) {
+ getline(&line, &len, sensing_matrix_fh);
+ char *ptr;
+
+ // Read our first element in outside of the loop
+ val = strtok_r(line,"\t", &ptr);
+ sensing_matrix[width*x + 1] = atof(val);
+ // iterate through and load the array
+ for (y = 2; y < width; y++) {
+ val = strtok_r(NULL, "\t", &ptr);
+ sensing_matrix[width*x + y] = atof(val);
+ }
+ }
+
+ pclose(sensing_matrix_fh);
+
+ return sensing_matrix;
+}
+
+char **load_headers(char *filename, int sequences) {
+ char command[512];
+ char *line= NULL;
+ int x = 0;
+ FILE *grep_output;
+ size_t len = 0;
+
+ sprintf(command, "grep ^\\> %s", filename);
+ grep_output = popen(command, "r");
+ if(grep_output == NULL) {
+ fprintf(stderr, "Could not execute %s\n", command);
+ exit(EXIT_FAILURE);
+ }
+
+ char **headers = malloc(sequences * sizeof(char *));
+ if(headers == NULL) {
+ fprintf(stderr, "could not allocated enough memory\n");
+ exit(EXIT_FAILURE);
+ }
+
+ for(x = 0; x < sequences; x++) {
+
+ char *header = malloc(256 * sizeof(char));
+ if(header == NULL) {
+ fprintf(stderr, "could not allocated enough memory\n");
+ exit(EXIT_FAILURE);
+ }
+ getline(&line, &len, grep_output);
+ sscanf(line + 1, "%s", header);
+ headers[x] = header;
+ }
+
+ pclose(grep_output);
+
+ return headers;
+}
+
diff --git a/src/c/quikr_functions.h b/src/c/quikr_functions.h
new file mode 100644
index 0000000..1826427
--- /dev/null
+++ b/src/c/quikr_functions.h
@@ -0,0 +1,14 @@
+// count the sequences in a fasta file
+int count_sequences(char *filename);
+
+// normalize a matrix by dividing each element by the sum of it's column
+void normalize_matrix(double *matrix, int height, int width);
+
+// load a sensing matrix
+double *load_sensing_matrix(char *filename, int height, int width);
+
+// load a count matrix
+double *load_count_matrix(char *filename, int width, int kmer);
+
+// load headers
+char **load_headers(char *filename, int sequences);
diff --git a/src/c/quikr_train.1 b/src/c/quikr_train.1
new file mode 100644
index 0000000..dd1493f
--- /dev/null
+++ b/src/c/quikr_train.1
@@ -0,0 +1,61 @@
+.TH quikr_train 1 quikr_train-2013-05
+.SH NAME
+quikr_train \- train databases for use with Quikr
+.SH SYNOPSIS
+.B quikr_train
+.RB [ \-i
+.IR input]
+.RB [ \-o
+.IR output]
+.RB [ \-k
+.IR kmer ]
+.RB [ \-v ]
+.P
+.BR quikr " ..."
+.SH DESCRIPTION
+.B quikr
+The quikr_train is a utility to train a database for use with quikr.
+Before running the quikr utility, you need to generate the sensing matrix or
+download a pretrained one from our project's homepage.
+
+quikr_train returns a sensing matrix that can be used with the quikr
+function.
+.P
+.SH OPTIONS
+.TP
+.B \-i, --input
+the database of sequences to create the sensing matrix. (fasta format)
+.TP
+.B \-k, --kmer
+specify what size of kmer to use. (default value is 6)
+.TP
+.B \-o, --output
+the sensing matrix. (a gzip'd text file)
+.TP
+.B \-v, --verbose
+verbose mode.
+.SH EXAMPLES
+Use quikr_train to generate a sensing matrix from rdp7.fasta. This uses 6mers by default.
+.P
+quikr_train -i rdp7.fa -o rd7_sensing_matrix.gz
+.SH USAGE
+If you do not have a .gz file on your output matrix name, it will be appended.
+.SH "SEE ALSO"
+\fBmultifasta_to_otu\fP(1), \fBquikr\fP(1).
+.SH AUTHORS
+.B quikr
+was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison
+<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot
+.SH REPORTING BUGS
+.TP
+Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \
+operating system, current compiler, and test files to reproduce your issue.
+.SH COPYRIGHT.
+Copyright \(co 2013 by Calvin Morrison and Gail Rosen. Permission to use,
+copy, modify, distribute, and sell this software and its documentation for
+any purpose is hereby granted without fee, provided that the above copyright
+notice appear in all copies and that both that copyright notice and this
+permission noticeappear in supporting documentation. No representations are
+made about the suitability of this software for any purpose. It is provided
+"as is" without express or implied warranty.
+
diff --git a/src/c/quikr_train.c b/src/c/quikr_train.c
new file mode 100644
index 0000000..d2a83ef
--- /dev/null
+++ b/src/c/quikr_train.c
@@ -0,0 +1,176 @@
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <zlib.h>
+
+#include "quikr_functions.h"
+
+#define AWK_KMER_PERMUTATIONS "awk 'function p(l,v,i){for(i in A) {if(l<%d) p(l+1, (v?v\"\":x)i); else print v\"\"i;}} {A[$0]} END {p(1);} ' <<<$'A\nC\nG\nT'"
+#define USAGE "Usage:\n\tquikr_train [OPTION...] - to train a database for use with quikr.\n\nOptions:\n\n-i, --input\n\tthe database of sequences to create the sensing matrix (fasta format)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-o, --output\n\tthe sensing matrix. (a gzip'd text file)\n\n-v, --verbose\n\tverbose mode."
+
+int main(int argc, char **argv) {
+
+ char probabilities_command[512];
+ char kmers_file[256];
+ char *line = NULL;
+ char *val;
+ size_t len = 0;
+
+
+ int c;
+ int kmer = 0;
+
+ char *fasta_file = NULL;
+ char *output_file = NULL;
+
+ int x = 0;
+ int y = 0;
+
+ int verbose = 0;
+
+ gzFile output = NULL;
+
+ while (1) {
+ static struct option long_options[] = {
+ {"verbose", no_argument, 0, 'v'},
+ {"input", required_argument, 0, 'i'},
+ {"kmer", required_argument, 0, 'k'},
+ {"output", required_argument, 0, 'o'},
+ {0, 0, 0, 0}
+ };
+
+ int option_index = 0;
+
+ c = getopt_long (argc, argv, "i:o:k:hv", long_options, &option_index);
+
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'i':
+ fasta_file = optarg;
+ break;
+ case 'k':
+ kmer = atoi(optarg);
+ break;
+ case 'o':
+ output_file = optarg;
+ break;
+ case 'v':
+ verbose = 1;
+ break;
+ case 'h':
+ printf("%s\n", USAGE);
+ exit(EXIT_SUCCESS);
+ break;
+ case '?':
+ /* getopt_long already printed an error message. */
+ break;
+ default:
+ exit(EXIT_FAILURE);
+ }
+ }
+
+
+ if(fasta_file == NULL) {
+ fprintf(stderr, "Error: input fasta file (-i) must be specified\n\n");
+ fprintf(stderr, "%s\n", USAGE);
+ exit(EXIT_FAILURE);
+ }
+
+ if(output_file == NULL) {
+ fprintf(stderr, "Error: output directory (-o) must be specified\n\n");
+ fprintf(stderr, "%s\n", USAGE);
+ exit(EXIT_FAILURE);
+ }
+
+ if(kmer == 0)
+ kmer = 6;
+
+ if(verbose) {
+ printf("kmer size: %d\n", kmer);
+ printf("fasta file: %s\n", fasta_file);
+ printf("output file: %s\n", output_file);
+ }
+
+ if(strcmp(&output_file[strlen(output_file) - 3], ".gz") != 0) {
+ char *temp = malloc(sizeof(strlen(output_file) + 4));
+ sprintf(temp, "%s.gz", output_file);
+ output_file = temp;
+ printf("appending a .gz to our output file: %s\n", output_file);
+ }
+
+ // 4 ^ Kmer gives us the width, or the number of permutations of ACTG with kmer length
+ int width = pow(4, kmer);
+ int sequences = count_sequences(fasta_file);
+
+ if(verbose)
+ printf("sequences: %d\nwidth: %d\n", sequences, width);
+
+ // Allocate our matrix with the appropriate size, just one row
+ double *trained_matrix = malloc(width*sizeof(double));
+ if(trained_matrix == NULL) {
+ fprintf(stderr, "Could not allocated enough memory\n");
+ exit(EXIT_FAILURE);
+ }
+
+ // call the probabilities-by-read command
+ sprintf(kmers_file, AWK_KMER_PERMUTATIONS, kmer);
+ sprintf(probabilities_command, "%s | probabilities-by-read %d %s /dev/stdin", kmers_file, kmer, fasta_file);
+ FILE *probabilities_output = popen(probabilities_command, "r");
+ if(probabilities_output == NULL) {
+ fprintf(stderr, "Error could not execute: %s\n", probabilities_command);
+ exit(EXIT_FAILURE);
+ }
+
+ // open our output file
+ output = gzopen(output_file, "w6");
+ if(output == NULL) {
+ fprintf(stderr, "Error: could not open output file, error code: %d", errno);
+ exit(EXIT_FAILURE);
+ }
+
+ if(verbose)
+ printf("Writing our sensing matrix to %s\n", output_file);
+
+ // read normalize and write our matrix in one go
+ for(x = 0; x < sequences; x++) {
+
+ getline(&line, &len, probabilities_output);
+
+ // Read our first element in outside of the loop
+ val = strtok(line,"\t\n\r");
+ trained_matrix[0] = atof(val);
+ // iterate through and load the array
+ for (y = 1; y < width; y++) {
+ val = strtok (NULL, "\t\n\r");
+ trained_matrix[y] = atof(val);
+ }
+
+ double row_sum = 0;
+
+ for( y = 0; y < width; y++) {
+ row_sum = row_sum + trained_matrix[y];
+ }
+
+ for( y = 0; y < width; y++) {
+ trained_matrix[y] = trained_matrix[y] / row_sum;
+ }
+
+ for( y = 0; y < width; y++) {
+ gzprintf(output, "%.10f\t", trained_matrix[y]);
+ }
+ gzprintf(output, "\n");
+ }
+
+ free(trained_matrix);
+ gzclose(output);
+ pclose(probabilities_output);
+
+ return 0;
+}