diff options
Diffstat (limited to 'src/c')
-rw-r--r-- | src/c/Makefile | 14 | ||||
-rw-r--r-- | src/c/multifasta_to_otu.1 | 95 | ||||
-rw-r--r-- | src/c/multifasta_to_otu.c | 314 | ||||
-rw-r--r-- | src/c/nnls.c | 416 | ||||
-rw-r--r-- | src/c/nnls.h | 1 | ||||
-rw-r--r-- | src/c/quikr.1 | 70 | ||||
-rw-r--r-- | src/c/quikr.c | 192 | ||||
-rw-r--r-- | src/c/quikr_functions.c | 163 | ||||
-rw-r--r-- | src/c/quikr_functions.h | 14 | ||||
-rw-r--r-- | src/c/quikr_train.1 | 61 | ||||
-rw-r--r-- | src/c/quikr_train.c | 176 |
11 files changed, 1516 insertions, 0 deletions
diff --git a/src/c/Makefile b/src/c/Makefile new file mode 100644 index 0000000..ba0c799 --- /dev/null +++ b/src/c/Makefile @@ -0,0 +1,14 @@ +UNAME := $(shell uname) +CC = gcc +QUIKR_TRAIN_CFLAGS = -O3 -s -mtune=native -Wall -lm -lz -D$(UNAME) +QUIKR_CFLAGS = -O3 -s -mtune=native -Wextra -Wall -lm -pthread -L../ -I../ -std=gnu99 -fopenmp -D$(UNAME) +all: quikr_train_ quikr_ multifasta_to_otu_ + +multifasta_to_otu_: + $(CC) multifasta_to_otu.c quikr_functions.c nnls.c -o multifasta_to_otu $(QUIKR_CFLAGS) +quikr_train_: + $(CC) quikr_train.c quikr_functions.c -o quikr_train $(QUIKR_TRAIN_CFLAGS) +quikr_: + $(CC) quikr.c quikr_functions.c nnls.c -o quikr $(QUIKR_CFLAGS) -I../ +clean: + rm -vf quikr_train quikr multifasta_to_otu diff --git a/src/c/multifasta_to_otu.1 b/src/c/multifasta_to_otu.1 new file mode 100644 index 0000000..7878ef0 --- /dev/null +++ b/src/c/multifasta_to_otu.1 @@ -0,0 +1,95 @@ +.TH multifasta_to_otu 1 multifasta_to_otu-2013-05 +.SH NAME +multifasta_to_otu \- create a QIIME OTU table based on Quikr results. +.SH SYNOPSIS +.B multifasta_to_otu +.RB [ \-i +.IR input-directory ] +.RB [ \-f +.IR sensing-fasta] +.RB [ \-s +.IR sensing-matrix] +.RB [ \-k +.IR kmer ] +.RB [ \-l +.IR lambda ] +.RB [ \-j +.IR jobs ] +.RB [ \-o +.IR otu-table] +.RB [ \-v ] +.P +.BR multifasta_to_otu " ..." +.SH DESCRIPTION +.B multifasta_to_otu +The multifasta_to_otu tool is a handy wrapper for quikr which lets the user +to input as many fasta files as they like, and then returns an OTU table of the +number of times a specimen was seen in all of the samples. +.P +.SH OPTIONS +.TP +.B \-i, --input-directory +the directory containing the samples' fasta files of reads (note each fasta file should correspond to a separate sample) +.TP +.B \-f, --sensing-fasta +location of the fasta file database used to create the sensing matrix. +.TP +.B \-s, --sensing-matrix +location of the sensing matrix. +.TP +.b \-k, --kmer +specify what size of kmer to use. (default value is 6) +.TP +.B \-l, --lambda +lambda value to use. (default value is 10000) +.TP +.B \-j, --jobs +specifies how many jobs to run at once. (default value is the number of CPUs) +.TP +.B \-o, --otu-table +the OTU table, with OTU_FRACTION_PRESENT for each sample which is compatible with QIIME's convert_biom.py (or sequence table if not OTU's) +.TP +.B \-v, --verbose +verbose mode. +.SH USAGE +This program will use a large amount of memory, and CPU time. +You can reduce the number of cores used, and thus memory, by specifying the -j flag with aspecified number of jobs. Otherwise multifasta_to_otu will run one job per cpu core. +.SH POSTPROCESSING +.B Note: When making your QIIME Metadata file, the sample id's must match the sample fasta file prefix names +.P +4-step QIIME procedure after using Quikr to obtain 3D PCoA graphs: (Note: Our code works much better with WEIGHTED Unifrac as opposed to Unweighted.) +.TP +Pre-requisites: +1. multifasta output file "quikr_output_table.txt" for our example. +.br +2. the tree of the database sequences that were used (e.g.dp7_mafft.fasttree, gg_94_otus_4feb2011.tre, etc.) +.br +3. your-defined <qiime_metadata_file.txt> +.TP +The QIIME procedue: +convert_biom.py -i <quikr_otu_table.txt> -o <quikr_otu>.biom --biom_table_type="otu table" +.br +beta_diversity.py -i <quikr_otu>.biom -m weighted_unifrac -o beta_div -t <tree file> +.br +principal_coordinates.py -i beta_div/weighted_unifrac_<quikr_otu>.txt -o <quikr_otu_project_name>_weighted.txt +.br +make_3d_plots.py -i <quikr_otu_project_name>_weighted.txt -o <3d_pcoa_plotdirectory> -m <qiime_metadata_file> +.SH "SEE ALSO" +\fBquikr\fP(1), \fBquikr_train\fP(1). +.SH AUTHORS +.B multifasta2otu +was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison +<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot +.SH REPORTING BUGS +.TP +Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \ +operating system, current compiler, and test files to reproduce your issue. +.SH COPYRIGHT. +Copyright \(co 2013 by Calvin Morrison and Gail Rosen. Permission to use, +copy, modify, distribute, and sell this software and its documentation for +any purpose is hereby granted without fee, provided that the above copyright +notice appear in all copies and that both that copyright notice and this +permission noticeappear in supporting documentation. No representations are +made about the suitability of this software for any purpose. It is provided +"as is" without express or implied warranty. + diff --git a/src/c/multifasta_to_otu.c b/src/c/multifasta_to_otu.c new file mode 100644 index 0000000..268dec6 --- /dev/null +++ b/src/c/multifasta_to_otu.c @@ -0,0 +1,314 @@ +#include <ctype.h> +#include <dirent.h> +#include <errno.h> +#include <getopt.h> +#include <math.h> +#include <omp.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include "nnls.h" +#include "quikr_functions.h" + +#define sensing_matrix(i,j) (sensing_matrix[width*i + j]) +#define solutions(i,j) (solutions[sequences*i+ j]) +#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - create a QIIME OTU table based on Quikr results. \n\nOptions:\n\n-i, --input-directory\n\tthe directory containing the samples' fasta files of reads (note each file should correspond to a separate sample)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (sensing from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-j, --jobs\n\t specifies how many jobs to run at once. (default value is the number of CPUs)\n\n-o, --output\n\tthe OTU table, with OTU_FRACTION_PRESENT for each sample which is compatible with QIIME's convert_biom.py (or a sequence table if not OTU's)\n\n-v, --verbose\n\tverbose mode." + +int main(int argc, char **argv) { + + int c; + + char *input_fasta_directory = NULL; + char *sensing_matrix_filename = NULL; + char *sensing_fasta_filename = NULL; + char *output_filename = NULL; + + double *sensing_matrix; + + long int width = 0; + long int sequences = 0; + + int kmer = 6; + int lambda = 10000; + + int x = 0; + int y = 0; + + int jobs = 1; + #ifdef Linux + jobs = get_nprocs(); + #endif + #ifdef Darwin + jobs = sysconf (_SC_NPROCESSORS_ONLN); + #endif + + int verbose = 0; + + DIR *input_directory_dh; + struct dirent *entry; + + while (1) { + static struct option long_options[] = { + {"input-directory", required_argument, 0, 'i'}, + {"kmer", required_argument, 0, 'k'}, + {"lambda", required_argument, 0, 'l'}, + {"jobs", required_argument, 0, 'j'}, + {"output", required_argument, 0, 'o'}, + {"sensing-fasta", required_argument, 0, 'f'}, + {"sensing-matrix", required_argument, 0, 's'}, + {"verbose", no_argument, 0, 'v'}, + {0, 0, 0, 0} + }; + int option_index = 0; + + c = getopt_long (argc, argv, "k:l:f:s:i:o:j:hv", long_options, &option_index); + + if (c == -1) + break; + + switch (c) { + case 'k': + kmer = atoi(optarg); + break; + case 'l': + lambda = atoi(optarg); + break; + case 'f': + sensing_fasta_filename = optarg; + break; + case 's': + sensing_matrix_filename = optarg; + break; + case 'j': + jobs = atoi(optarg); + break; + case 'i': + input_fasta_directory = optarg; + break; + case 'o': + output_filename = optarg; + break; + case 'v': + verbose = 1; + break; + case 'h': + puts(USAGE); + exit(EXIT_SUCCESS); + break; + default: + break; + } + } + + if(sensing_matrix_filename == NULL) { + fprintf(stderr, "Error: sensing matrix filename (-s) must be specified\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } + if(sensing_fasta_filename == NULL) { + fprintf(stderr, "Error: sensing fasta filename (-f) must be specified\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } + if(output_filename == NULL) { + fprintf(stderr, "Error: Output Filename (-o) must be specified\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } + if(input_fasta_directory == NULL) { + fprintf(stderr, "Error: input fasta directory (-i) must be specified\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } + + // set defaults + + + if(verbose) { + printf("kmer: %d\n", kmer); + printf("lambda: %d\n", lambda); + printf("input directory: %s\n", input_fasta_directory); + printf("sensing database: %s\n", sensing_matrix_filename); + printf("sensing database fasta: %s\n", sensing_fasta_filename); + printf("output: %s\n", output_filename); + printf("number of jobs to run at once: %d\n", jobs); + } + + + input_directory_dh = opendir(input_fasta_directory); + if(input_fasta_directory == NULL) { + fprintf(stderr, "could not open %s\n", input_fasta_directory); + exit(EXIT_FAILURE); + } + + // do a directory count + int dir_count = -2; // -2 for ../ and ./ + while(entry = readdir(input_directory_dh)) + dir_count++; + rewinddir(input_directory_dh); + if(dir_count == 0) { + fprintf(stderr, "%s is empty\n", input_fasta_directory); + exit(EXIT_FAILURE); + } + + // 4 "ACGT" ^ Kmer gives us the size of output rows + width = pow(4, kmer) + 1; + sequences = count_sequences(sensing_fasta_filename); + + if(verbose) { + printf("directory count: %d\n", dir_count); + printf("width: %ld\nsequences %ld\n", width, sequences); + } + + sensing_matrix = load_sensing_matrix(sensing_matrix_filename, sequences, width); + + // multiply our matrix by lambda + for(x = 0; x < sequences; x++) { + for(y= 0; y < width; y++) { + sensing_matrix(x, y) = sensing_matrix(x, y) * lambda; + } + } + + // set the first row to be all 1's + for(x = 0; x < sequences; x++) { + sensing_matrix(x, 0) = 1.0; + } + + double *solutions = malloc(dir_count * sequences * sizeof(double)); + if(solutions == NULL) { + fprintf(stderr, "Could not allocate enough memory for solutions vector\n"); + exit(EXIT_FAILURE); + } + + char **filenames = malloc(dir_count * sizeof(char *)); + if(filenames == NULL) { + fprintf(stderr, "Could not allocate enough memory\n"); + exit(EXIT_FAILURE); + } + + int *file_sequence_count = malloc(dir_count * sizeof(int)); + if(file_sequence_count == NULL) { + fprintf(stderr, "Could not allocate enough memory\n"); + exit(EXIT_FAILURE); + } + + struct dirent result; + + omp_set_num_threads(jobs); + int done = 0; + printf("Beginning to process samples\n"); +#pragma omp parallel for shared(solutions, sequences, width, result, done) + for(int i = 0; i < dir_count; i++ ) { + + int z = 0; + struct dirent *directory_entry; + char *filename = malloc(256 * sizeof(char)); + char *base_filename = malloc(256 * sizeof(char)); + if(filename == NULL || base_filename == NULL) { + fprintf(stderr, "Could not allocate enough memory\n"); + exit(EXIT_FAILURE); + } + +#pragma omp critical + readdir_r(input_directory_dh, &result, &directory_entry); + + if(strcmp(directory_entry->d_name, "..") == 0 || strcmp(directory_entry->d_name, ".") == 0) { + i--; + continue; + } + + // get our base filenames + strcpy(base_filename, directory_entry->d_name); + filenames[i] = base_filename; + + // get our real filename + sprintf(filename, "%s/%s", input_fasta_directory, directory_entry->d_name); + + // get individual sequence count + file_sequence_count[i] = count_sequences(filename); + + // count the kmer amounts + double *count_matrix = load_count_matrix(filename, width, kmer); + + // normalize our kmer counts + normalize_matrix(count_matrix, 1, width); + + // multiply our kmers frequency by lambda + for(z = 0; z < width; z++) + count_matrix[z] = count_matrix[z] * lambda; + + double *sensing_matrix_copy = malloc(sizeof(double) * sequences * width); + if(sensing_matrix_copy == NULL) { + fprintf(stderr, "Could not allocate enough memory\n"); + exit(EXIT_FAILURE); + } + + memcpy(sensing_matrix_copy, sensing_matrix, sequences * width * sizeof(double)); + + + // run nnls + double *solution = nnls(sensing_matrix_copy, count_matrix, sequences, width); + + // normalize our solution + normalize_matrix(solution, 1, sequences); + + // add the current solution to the solutions array + for(z = 0; z < sequences; z++ ) { + solutions(i, z) = solution[z]; + } + + done++; + printf("%d/%d samples processed\n", done, dir_count); + free(solution); + free(count_matrix); + free(filename); + free(sensing_matrix_copy); + } + + char **headers = load_headers(sensing_fasta_filename, sequences); + + // output our matrix + FILE *output_fh = fopen(output_filename, "w"); + if(output_fh == NULL) { + fprintf(stderr, "Could not open %s for writing\n", output_filename); + exit(EXIT_FAILURE); + } + + fprintf(output_fh, "# QIIME vQuikr OTU table\n"); + fprintf(output_fh, "#OTU_ID\t"); + + // print our filename headers + for(x = 0; x < dir_count - 1; x++) { + fprintf(output_fh, "%s\t", filenames[x]); + } + fprintf(output_fh, "%s\n", filenames[dir_count - 1]); + + // get our actual values + for(y = 0; y < sequences; y++) { + for(x = 0; x < dir_count; x++) { + solutions(x, y) = round(solutions(x, y) * file_sequence_count[x]); + } + } + + for(y = 0; y < sequences; y++) { + + double column_sum = 0.; + for(x = 0; x < dir_count; x++) { + column_sum += solutions(x, y); + } + + // if our column is zero, don't bother printing the row + if(column_sum != 0) { + fprintf(output_fh, "%s\t", headers[y]); + + for(x = 0; x < dir_count - 1; x++) { + fprintf(output_fh, "%d\t", (int)solutions(x, y)); + } + fprintf(output_fh, "%d\n", (int)solutions[sequences*(dir_count - 1) + y]); + } + } + fclose(output_fh); + + return EXIT_SUCCESS; +} diff --git a/src/c/nnls.c b/src/c/nnls.c new file mode 100644 index 0000000..ddd9f2f --- /dev/null +++ b/src/c/nnls.c @@ -0,0 +1,416 @@ +/* + * nnls.c (c) 2002-2009 Turku PET Centre + * This file contains the routine NNLS (nonnegative least squares) + * and the subroutines required by it, except h12, which is in + * file 'lss_h12.c'. + * + * This routine is based on the text and fortran code in + * C.L. Lawson and R.J. Hanson, Solving Least Squares Problems, + * Prentice-Hall, Englewood Cliffs, New Jersey, 1974. + * Version: + * 2002-08-19 Vesa Oikonen + * 2003-05-08 Kaisa Sederholm & VO + * Included function nnlsWght(). + * 2003-05-12 KS + * Variable a_dim1 excluded + * + * Usage of the coefficient matrix altered so that it is + * given in a[][] instead of a[]. + * 2003-11-06 VO + * If n<2, then itmax is set to n*n instead of previous n*3. + * 2004-09-17 VO + * Doxygen style comments. + * 2006-24-04 Pauli Sundberg + * Added some debuging output, and made some comments more precise. + * 2007-05-17 VO + * 2009-04-16 VO + * Corrected a bug in nnls() which may have caused an infinite loop. + * 009-04-27 VO + * Added function nnlsWghtSquared() for faster pixel-by-pixel calculations. + * Checking for exceeding iteration count is corrected in nnls(). + */ +#include <stdio.h> +#include <assert.h> +#include <stdlib.h> +#include <math.h> +#define MAX(a,b) ((a) >= (b) ? (a) : (b)) +#define ABS(x) ((x) >= 0 ? (x) : -(x)) + +int h12( int mode, int lpivot, int l1, int m, double *u, int u_dim1, double *up, double *cm, int ice, int icv, int ncv) { + double d1, b, clinv, cl, sm; + int k, j; + + /* Check parameters */ + if (mode!=1 && mode!=2) + return(1); + if (m<1 || u==NULL || u_dim1<1 || cm==NULL) + assert(0); + // return(1); + if (lpivot<0 || lpivot>=l1 || l1>m) + // assert(0); + return(1); + + /* Function Body */ + cl = ABS( u[lpivot*u_dim1] ); + // cl= (d1 = u[lpivot*u_dim1], fabs(d1)); + + if (mode==2) + { /* Apply transformation I+U*(U**T)/B to cm[] */ + if(cl<=0.) + // assert(0); + return(0); + } + else + { /* Construct the transformation */ + + + /* This is the way provided in the original pseudocode + sm = 0; + for (j = l1; j < m; j++) + { + d1 = u[j * u_dim1]; + sm += d1*d1; + } + d1 = u[lpivot * u_dim1]; + sm += d1*d1; + sm = sqrt(sm); + + if (u[lpivot*u_dim1] > 0) + sm=-sm; + + up[0] = u[lpivot*u_dim1] - sm; + u[lpivot*u_dim1]=sm; + printf("Got sum: %f\n",sm); + */ + + /* and this trying to compensate overflow */ + for (j=l1; j<m; j++) + { // Computing MAX + cl = MAX( ABS( u[j*u_dim1] ), cl ); + } + // zero vector? + + if (cl<=0.) + return(0); + + clinv=1.0/cl; + + // Computing 2nd power + d1=u[lpivot*u_dim1]*clinv; + sm=d1*d1; + + for (j=l1; j<m; j++) + { + d1=u[j*u_dim1]*clinv; + sm+=d1*d1; + } + cl *= sqrt(sm); + + if (u[lpivot*u_dim1] > 0.) + cl=-cl; + up[0] = u[lpivot*u_dim1] - cl; + u[lpivot*u_dim1]=cl; + } + + // no vectors where to apply? only change pivot vector! + b=up[0] * u[lpivot*u_dim1]; + + /* b must be nonpositive here; if b>=0., then return */ + if (b == 0) + return(0); + + // ok, for all vectors we want to apply + for (j =0; j < ncv; j++) { + sm = cm[ lpivot * ice + j * icv ] * (up[0]); + + for (k=l1; k<m; k++) + sm += cm[ k * ice + j*icv ] * u[ k*u_dim1 ]; + + if (sm != 0.0) { + sm *= (1/b); + // cm[lpivot, j] = .. + cm[ lpivot * ice + j*icv] += sm * (up[0]); + for (k= l1; k<m; k++) + { + cm[ k*ice + j*icv] += u[k * u_dim1]*sm; + } + } + } + + return(0); +} + + +void g1(double a, double b, double *cterm, double *sterm, double *sig) +{ + double d1, xr, yr; + + if( fabs(a) > fabs(b) ) { + xr = b / a; + d1 = xr; + yr = sqrt(d1*d1 + 1.); + d1 = 1./yr; + *cterm=(a>=0.0 ? fabs(d1) : -fabs(d1)); + *sterm=(*cterm)*xr; + *sig=fabs(a)*yr; + } else if( b != 0.) { + xr = a / b; + d1 = xr; + yr = sqrt(d1 * d1 + 1.); + d1 = 1. / yr; + *sterm=(b>=0.0 ? fabs(d1) : -fabs(d1)); + *cterm=(*sterm)*xr; *sig=fabs(b)*yr; + } else { + *sig=0.; *cterm=0.; *sterm=1.; + } +} + + +int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm) { + int pfeas; + int ret=0; + int iz; + int jz; + int k, j=0, l, itmax, izmax=0, ii, jj=0, ip; + double d1, d2, sm, up, ss; + double temp, wmax, t, alpha, asave, dummy, unorm, ztest, cc; + + + /* Check the parameters and data */ + if(m <= 0 || n <= 0 || a == NULL || b == NULL || x == NULL) + return(2); + + /* Allocate memory for working space, if required */ + double *w = (double*)calloc(n, sizeof(double)); + double *zz = (double*)calloc(m, sizeof(double)); + int *index = (int*)calloc(n, sizeof(int)); + if(w == NULL || zz == NULL || index == NULL) + return(2); + + /* Initialize the arrays INDEX[] and X[] */ + for(k=0; k<n; k++) { + x[k]=0.; + index[k]=k; + } + + int iz2 = n - 1; + int iz1 = 0; + int iter=0; + int nsetp=0; + int npp1=0; + + /* Main loop; quit if all coeffs are already in the solution or */ + /* if M cols of A have been triangularized */ + if(n < 3) + itmax=n*3; + else + itmax=n*n; + + + while(iz1 <= iz2 && nsetp < m) { + /* Compute components of the dual (negative gradient) vector W[] */ + for(iz=iz1; iz<=iz2; iz++) { + j=index[iz]; + sm=0.; + for(l=npp1; l<m; l++) + sm+=a[j*m + l]*b[l]; + w[j]=sm; + } + + while(1) { + /* Find largest positive W[j] */ + for(wmax=0., iz=iz1; iz<=iz2; iz++) { + j=index[iz]; if(w[j]>wmax) {wmax=w[j]; izmax=iz;}} + + /* Terminate if wmax<=0.; */ + /* it indicates satisfaction of the Kuhn-Tucker conditions */ + if(wmax<=0.0) + break; + + iz=izmax; + j=index[iz]; + + /* The sign of W[j] is ok for j to be moved to set P. */ + /* Begin the transformation and check new diagonal element to avoid */ + /* near linear dependence. */ + asave=a[j*m + npp1]; + h12(1, npp1, npp1+1, m, &a[j*m +0], 1, &up, &dummy, 1, 1, 0); + unorm=0.; + if(nsetp!=0){ + for(l=0; l<nsetp; l++) { + d1=a[j*m + l]; + unorm+=d1*d1; + } + } + + unorm=sqrt(unorm); + d2=unorm+(d1=a[j*m + npp1], fabs(d1)) * 0.01; + if((d2-unorm)>0.) { + /* Col j is sufficiently independent. Copy B into ZZ, update ZZ */ + /* and solve for ztest ( = proposed new value for X[j] ) */ + for(l=0; l<m; l++) zz[l]=b[l]; + h12(2, npp1, npp1+1, m, &a[j*m + 0], 1, &up, zz, 1, 1, 1); + ztest=zz[npp1]/a[j*m +npp1]; + /* See if ztest is positive */ + if(ztest>0.) break; + } + + /* Reject j as a candidate to be moved from set Z to set P. Restore */ + /* A[npp1,j], set W[j]=0., and loop back to test dual coeffs again */ + a[j*m+ npp1]=asave; w[j]=0.; + } /* while(1) */ + + if(wmax<=0.0) + break; + + /* Index j=INDEX[iz] has been selected to be moved from set Z to set P. */ + /* Update B and indices, apply householder transformations to cols in */ + /* new set Z, zero subdiagonal elts in col j, set W[j]=0. */ + for(l=0; l<m; ++l) + b[l]=zz[l]; + + index[iz]=index[iz1]; + index[iz1]=j; + iz1++; + npp1++; + nsetp=npp1; + + if(iz1<=iz2) { + for(jz=iz1; jz<=iz2; jz++) { + jj=index[jz]; + h12(2, nsetp-1, npp1, m, &a[j*m +0], 1, &up, &a[jj*m +0], 1, m, 1); + } + } + + if(nsetp!=m) { + for(l=npp1; l<m; l++) + a[j*m +l]=0.; + } + + w[j]=0.; + + /* Solve the triangular system; store the solution temporarily in Z[] */ + for(l=0; l<nsetp; l++) { + ip=nsetp-(l+1); + if(l!=0) for(ii=0; ii<=ip; ii++) zz[ii]-=a[jj*m + ii]*zz[ip+1]; + jj=index[ip]; zz[ip]/=a[jj*m +ip]; + } + + /* Secondary loop begins here */ + while(++iter < itmax) { + /* See if all new constrained coeffs are feasible; if not, compute alpha */ + for(alpha = 2.0, ip = 0; ip < nsetp; ip++) { + l=index[ip]; + if(zz[ip]<=0.) { + t = -x[l]/(zz[ip]-x[l]); + if(alpha > t) { + alpha = t; + jj = ip - 1; + } + } + } + + /* If all new constrained coeffs are feasible then still alpha==2. */ + /* If so, then exit from the secondary loop to main loop */ + if(alpha==2.0) + break; + + /* Use alpha (0.<alpha<1.) to interpolate between old X and new ZZ */ + for(ip=0; ip<nsetp; ip++) { + l = index[ip]; + x[l] += alpha*(zz[ip]-x[l]); + } + + /* Modify A and B and the INDEX arrays to move coefficient i */ + /* from set P to set Z. */ + k=index[jj+1]; pfeas=1; + do { + x[k]=0.; + if(jj!=(nsetp-1)) { + jj++; + for(j=jj+1; j<nsetp; j++) { + ii=index[j]; index[j-1]=ii; + g1(a[ii*m + (j-1)], a[ii*m + j], &cc, &ss, &a[ii*m + j-1]); + for(a[ii*m + j]=0., l=0; l<n; l++) if(l!=ii) { + /* Apply procedure G2 (CC,SS,A(J-1,L),A(J,L)) */ + temp=a[l*m + j-1]; + a[l*m + j-1]=cc*temp+ss*a[l*m + j]; + a[l*m + j]=-ss*temp+cc*a[l*m + j]; + } + /* Apply procedure G2 (CC,SS,B(J-1),B(J)) */ + temp=b[j-1]; b[j-1]=cc*temp+ss*b[j]; b[j]=-ss*temp+cc*b[j]; + } + } + npp1=nsetp-1; nsetp--; iz1--; index[iz1]=k; + + /* See if the remaining coeffs in set P are feasible; they should */ + /* be because of the way alpha was determined. If any are */ + /* infeasible it is due to round-off error. Any that are */ + /* nonpositive will be set to zero and moved from set P to set Z */ + for(jj=0, pfeas=1; jj<nsetp; jj++) { + k=index[jj]; if(x[k]<=0.) {pfeas=0; break;} + } + } while(pfeas==0); + + /* Copy B[] into zz[], then solve again and loop back */ + for(k=0; k<m; k++) + zz[k]=b[k]; + for(l=0; l<nsetp; l++) { + ip=nsetp-(l+1); + if(l!=0) for(ii=0; ii<=ip; ii++) zz[ii]-=a[jj*m + ii]*zz[ip+1]; + jj=index[ip]; zz[ip]/=a[jj*m + ip]; + } + } /* end of secondary loop */ + + if(iter>=itmax) { + ret = 1; + break; + } + + for(ip=0; ip<nsetp; ip++) { + k=index[ip]; + x[k]=zz[ip]; + } + } /* end of main loop */ + + /* Compute the norm of the final residual vector */ + sm=0.; + + if (rnorm != NULL) { + if (npp1<m) + for (k=npp1; k<m; k++) + sm+=(b[k] * b[k]); + else + for (j=0; j<n; j++) + w[j]=0.; + *rnorm=sqrt(sm); + } + + /* Free working space, if it was allocated here */ + free(w); + free(zz); + free(index); + return(ret); +} +/* nnls_ */ + + +double *nnls(double *a_matrix, double *b_matrix, int height, int width) { + + double *solution = (double*)calloc(height, sizeof(double)); + + if(solution == NULL) { + fprintf(stderr, "could not allocate enough memory for nnls\n"); + exit(EXIT_FAILURE); + } + + int ret = nnls_algorithm(a_matrix, width, height, b_matrix, solution, NULL); + if(ret == 1) { + printf("NNLS has reached the maximum iterations\n"); + } else if(ret == 2) { + fprintf(stderr, "NNLS could not allocate enough memory\n"); + exit(EXIT_FAILURE); + } + + return solution; +} diff --git a/src/c/nnls.h b/src/c/nnls.h new file mode 100644 index 0000000..ded8d01 --- /dev/null +++ b/src/c/nnls.h @@ -0,0 +1 @@ +double *nnls(double *a_matrix, double *b_matrix, int height, int width); diff --git a/src/c/quikr.1 b/src/c/quikr.1 new file mode 100644 index 0000000..9982d94 --- /dev/null +++ b/src/c/quikr.1 @@ -0,0 +1,70 @@ +.TH quikr 1 quikr-2013-05 +.SH NAME +quikr \- Calculate estimated frequencies of bacteria in a sample. +.SH SYNOPSIS +.B quikr +.RB [ \-i +.IR input] +.RB [ \-f +.IR sensing-fasta] +.RB [ \-s +.IR sensing-matrix] +.RB [ \-l +.IR lambda ] +.RB [ \-k +.IR kmer ] +.RB [ \-o +.IR output ] +.RB [ \-v ] +.P +.BR quikr " ..." +.SH DESCRIPTION +.B quikr +Quikr returns the estimated frequencies of batcteria present when given a +input FASTA file and a trained sensing matrix +.P +.SH OPTIONS +.TP +.B \-i, --input +the sample's fasta file of NGS. READS (fasta format) +.TP +.B \-f, --sensing-fasta +location of the fasta file database used to create the sensing matrix. (fasta format) +.TP +.B \-s, --sensing-matrix +location of the sensing matrix. (trained from quikr_train) +.TP +.B \-k, --kmer +specify what size of kmer to use. (default value is 6) +.TP +.B \-l, --lambda +lambda value to use. (default value is 10000) +.TP +.B \-o, --output +OTU_FRACTION_PRESENT a vector representing the percentage of database sequence's presence in sample. (csv output) +.TP +.B \-v, --verbose +verbose mode. +.SH EXAMPLES +Use quikr to calculate the estimated frequencies for sample.fa, using rdp7.fasta as the sensing matrix we generated with quikr_train. This uses 6-mers by default, and a lambda value of 10000: +.P +quikr -i sample.fa -f rdp7.fa -s rdp_sensing_matrix.gz -o frequencies.txt +.SH "SEE ALSO" +\fBmultifasta_to_otu\fP(1), \fBquikr_train\fP(1). +.SH AUTHORS +.B quikr +was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison +<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot +.SH REPORTING BUGS +.TP +Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \ +operating system, current compiler, and test files to reproduce your issue. +.SH COPYRIGHT. +Copyright \(co 2013 by Calvin Morrison and Gail Rosen. Permission to use, +copy, modify, distribute, and sell this software and its documentation for +any purpose is hereby granted without fee, provided that the above copyright +notice appear in all copies and that both that copyright notice and this +permission noticeappear in supporting documentation. No representations are +made about the suitability of this software for any purpose. It is provided +"as is" without express or implied warranty. + diff --git a/src/c/quikr.c b/src/c/quikr.c new file mode 100644 index 0000000..c73e0dd --- /dev/null +++ b/src/c/quikr.c @@ -0,0 +1,192 @@ +#include <ctype.h> +#include <errno.h> +#include <getopt.h> +#include <math.h> +#include <stdio.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "nnls.h" +#include "quikr_functions.h" + +#define sensing_matrix(i,j) (sensing_matrix[width*i + j]) +#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - Calculate estimated frequencies of bacteria in a sample.\n\nOptions:\n\n-i, --input\n\tthe sample's fasta file of NGS READS (fasta format)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (trained from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-o, --output\n\tthe sensing matrix. (a gzip'd text file)\n\n-v, --verbose\n\tverbose mode." + +int main(int argc, char **argv) { + + + int c; + int kmer = 0; + + char *input_fasta_filename = NULL; + char *sensing_matrix_filename = NULL; + char *sensing_fasta_filename = NULL; + char *output_filename = NULL; + + int x = 0; + int y = 0; + int verbose = 0; + int lambda = 0; + + + + while (1) { + static struct option long_options[] = { + {"input", required_argument, 0, 'i'}, + {"kmer", required_argument, 0, 'k'}, + {"lambda", required_argument, 0, 'l'}, + {"output", required_argument, 0, 'o'}, + {"sensing-fasta", required_argument, 0, 'f'}, + {"sensing-matrix", required_argument, 0, 's'}, + {"verbose", no_argument, 0, 'v'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + + c = getopt_long (argc, argv, "k:l:f:s:i:o:hdv", long_options, &option_index); + + if (c == -1) + break; + + switch (c) { + case 0: + case 'k': + kmer = atoi(optarg); + break; + case 'l': + lambda = atoi(optarg); + break; + case 'f': + sensing_fasta_filename = optarg; + break; + case 's': + sensing_matrix_filename = optarg; + break; + case 'i': + input_fasta_filename = optarg; + break; + case 'o': + output_filename = optarg; + break; + case 'v': + verbose = 1; + break; + case 'h': + printf("%s\n", USAGE); + exit(EXIT_SUCCESS); + break; + default: + break; + } + } + + + if(sensing_matrix_filename == NULL) { + fprintf(stderr, "Error: sensing matrix filename (-s) must be specified\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } + if(sensing_fasta_filename == NULL) { + fprintf(stderr, "Error: sensing matrix filename (-f) must be specified\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } + if(output_filename == NULL) { + fprintf(stderr, "Error: Output Filename (-o) must be specified\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } + if(input_fasta_filename == NULL) { + fprintf(stderr, "Error: input fasta file (-i) must be specified\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } + + if(lambda == 0) + lambda = 10000; + if(kmer == 0) + kmer = 6; + + if(verbose) { + printf("kmer: %d\n", kmer); + printf("lambda: %d\n", lambda); + printf("fasta: %s\n", input_fasta_filename); + printf("sensing database: %s\n", sensing_matrix_filename); + printf("sensing database fasta: %s\n", sensing_fasta_filename); + printf("output: %s\n", output_filename); + } + // 4 "ACGT" ^ Kmer gives us the size of output rows + int width = pow(4, kmer); + width = width + 1; + + int sequences = count_sequences(sensing_fasta_filename); + + if(verbose) { + printf("width: %d\nsequences %d\n", width, sequences); + } + + double *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, sequences, width); + double *count_matrix = load_count_matrix(input_fasta_filename, width, kmer); + + // multiply our matrix by lambda + for(x = 1; x < sequences; x++) { + for(y= 0; y < width - 1; y++) { + sensing_matrix(x, y) = sensing_matrix(x, y) * lambda; + } + } + + for(x= 0; x < sequences; x++) { + sensing_matrix(x, 0) = 1.0; + } + // normalize our count_matrix + normalize_matrix(count_matrix, 1, width); + for(x = 0; x < width; x++) + count_matrix[x] = count_matrix[x] * lambda; + + // output our matricies if we are in verbose mode + if(verbose) { + FILE *sensing_matrix_fh = fopen( "sensing.matrix", "w"); + if(sensing_matrix_fh == NULL) { + fprintf(stderr, "could not open sensing.matrix for writing.\n"); + exit(EXIT_FAILURE); + } + for(x = 0; x < sequences; x++) { + for( y = 0; y < width; y++) { + fprintf(sensing_matrix_fh, "%.10f\t", sensing_matrix(x, y)); + } + fprintf(sensing_matrix_fh, "\n"); + } + fclose(sensing_matrix_fh); + + FILE *count_matrix_fh = fopen("count.matrix", "w"); + if(count_matrix_fh == NULL) { + fprintf(stderr, "could not open sensing.matrix for writing.\n"); + exit(EXIT_FAILURE); + } + for(x = 0; x < width; x++) { + fprintf(count_matrix_fh, "%.10f\n", count_matrix[x]); + } + fclose(count_matrix_fh); + } + + double *solution = nnls(sensing_matrix, count_matrix, sequences, width); + + // normalize our solution vector + normalize_matrix(solution, 1, sequences); + + // output our matrix + FILE *output_fh = fopen(output_filename, "w"); + if(output_fh == NULL) { + fprintf(stderr, "Could not open %s for writing\n", output_filename); + exit(EXIT_FAILURE); + } + for(x = 0; x < sequences; x++) { + fprintf(output_fh, "%.10lf\n", solution[x]); + } + fclose(output_fh); + + return EXIT_SUCCESS; +} diff --git a/src/c/quikr_functions.c b/src/c/quikr_functions.c new file mode 100644 index 0000000..7e18e64 --- /dev/null +++ b/src/c/quikr_functions.c @@ -0,0 +1,163 @@ +#include <stdio.h> +#include <stdio.h> +#include <errno.h> +#include <ctype.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <math.h> + +int count_sequences(char *filename) { + char command[512]; + int sequences = 0; + FILE *grep_output; + + sprintf(command, "grep -c ^\\> %s", filename); + grep_output = popen(command, "r"); + if(grep_output == NULL) { + fprintf(stderr, "Could not execute %s\n", command); + exit(EXIT_FAILURE); + } + + fscanf(grep_output, "%d", &sequences); + + pclose(grep_output); + return sequences; +} + + +void normalize_matrix(double *matrix, int height, int width) { + int x = 0; + int y = 0; + for(x = 0; x < height; x++) { + + double row_sum = 0; + + for(y = 0; y < (width); y++) + row_sum = row_sum + matrix[width * x + y]; + for(y = 0; y < (width); y++) + matrix[width * x + y] = matrix[width * x + y] / row_sum; + } +} + + +double *load_count_matrix(char *filename, int width, int kmer) { + + double *count_matrix = malloc((width)*sizeof(double)); + char count_command[512]; + int x = 0; + char *line = NULL; + size_t len = 0; + + if(count_matrix == NULL) { + fprintf(stderr, "could not allocate enough memory for the count matrix (%d x double) \n", width); + exit(EXIT_FAILURE); + } + + // create out count matrix + sprintf(count_command, "count-kmers -r %d -1 -u %s", kmer, filename); + FILE *count_output = popen(count_command, "r"); + if(count_output == NULL) { + fprintf(stderr, "could not execute \"%s\"", count_command); + exit(EXIT_FAILURE); + } + + // set first element to zero. + count_matrix[0] = 0; + + // get our first line + getline(&line, &len, count_output); + count_matrix[1] = atoi(line); + + // iterate over the rest of the lines + for(x = 2; x < width; x++) { + getline(&line, &len, count_output); + count_matrix[x] = atoi(line); + } + + pclose(count_output); + + return count_matrix; +} + + +double *load_sensing_matrix(char *filename, int height, int width) { + + int x = 0; + int y = 0; + + char *line = NULL; + char *val; + char command[512]; + size_t len = 0; + FILE *sensing_matrix_fh = NULL; + + double *sensing_matrix = malloc(height * width * sizeof(double)); + if(sensing_matrix == NULL) { + fprintf(stderr, "Could not allocate memory for the sensing matrix\n"); + } + + sprintf(command, "gzip -cd %s", filename); + sensing_matrix_fh = popen(command, "r"); + if(sensing_matrix_fh == NULL) { + fprintf(stderr, "could not open %s", filename); + exit(EXIT_FAILURE); + } + + // read our sensing matrix in + for(x = 0; x < height; x++) { + getline(&line, &len, sensing_matrix_fh); + char *ptr; + + // Read our first element in outside of the loop + val = strtok_r(line,"\t", &ptr); + sensing_matrix[width*x + 1] = atof(val); + // iterate through and load the array + for (y = 2; y < width; y++) { + val = strtok_r(NULL, "\t", &ptr); + sensing_matrix[width*x + y] = atof(val); + } + } + + pclose(sensing_matrix_fh); + + return sensing_matrix; +} + +char **load_headers(char *filename, int sequences) { + char command[512]; + char *line= NULL; + int x = 0; + FILE *grep_output; + size_t len = 0; + + sprintf(command, "grep ^\\> %s", filename); + grep_output = popen(command, "r"); + if(grep_output == NULL) { + fprintf(stderr, "Could not execute %s\n", command); + exit(EXIT_FAILURE); + } + + char **headers = malloc(sequences * sizeof(char *)); + if(headers == NULL) { + fprintf(stderr, "could not allocated enough memory\n"); + exit(EXIT_FAILURE); + } + + for(x = 0; x < sequences; x++) { + + char *header = malloc(256 * sizeof(char)); + if(header == NULL) { + fprintf(stderr, "could not allocated enough memory\n"); + exit(EXIT_FAILURE); + } + getline(&line, &len, grep_output); + sscanf(line + 1, "%s", header); + headers[x] = header; + } + + pclose(grep_output); + + return headers; +} + diff --git a/src/c/quikr_functions.h b/src/c/quikr_functions.h new file mode 100644 index 0000000..1826427 --- /dev/null +++ b/src/c/quikr_functions.h @@ -0,0 +1,14 @@ +// count the sequences in a fasta file +int count_sequences(char *filename); + +// normalize a matrix by dividing each element by the sum of it's column +void normalize_matrix(double *matrix, int height, int width); + +// load a sensing matrix +double *load_sensing_matrix(char *filename, int height, int width); + +// load a count matrix +double *load_count_matrix(char *filename, int width, int kmer); + +// load headers +char **load_headers(char *filename, int sequences); diff --git a/src/c/quikr_train.1 b/src/c/quikr_train.1 new file mode 100644 index 0000000..dd1493f --- /dev/null +++ b/src/c/quikr_train.1 @@ -0,0 +1,61 @@ +.TH quikr_train 1 quikr_train-2013-05 +.SH NAME +quikr_train \- train databases for use with Quikr +.SH SYNOPSIS +.B quikr_train +.RB [ \-i +.IR input] +.RB [ \-o +.IR output] +.RB [ \-k +.IR kmer ] +.RB [ \-v ] +.P +.BR quikr " ..." +.SH DESCRIPTION +.B quikr +The quikr_train is a utility to train a database for use with quikr. +Before running the quikr utility, you need to generate the sensing matrix or +download a pretrained one from our project's homepage. + +quikr_train returns a sensing matrix that can be used with the quikr +function. +.P +.SH OPTIONS +.TP +.B \-i, --input +the database of sequences to create the sensing matrix. (fasta format) +.TP +.B \-k, --kmer +specify what size of kmer to use. (default value is 6) +.TP +.B \-o, --output +the sensing matrix. (a gzip'd text file) +.TP +.B \-v, --verbose +verbose mode. +.SH EXAMPLES +Use quikr_train to generate a sensing matrix from rdp7.fasta. This uses 6mers by default. +.P +quikr_train -i rdp7.fa -o rd7_sensing_matrix.gz +.SH USAGE +If you do not have a .gz file on your output matrix name, it will be appended. +.SH "SEE ALSO" +\fBmultifasta_to_otu\fP(1), \fBquikr\fP(1). +.SH AUTHORS +.B quikr +was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison +<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot +.SH REPORTING BUGS +.TP +Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \ +operating system, current compiler, and test files to reproduce your issue. +.SH COPYRIGHT. +Copyright \(co 2013 by Calvin Morrison and Gail Rosen. Permission to use, +copy, modify, distribute, and sell this software and its documentation for +any purpose is hereby granted without fee, provided that the above copyright +notice appear in all copies and that both that copyright notice and this +permission noticeappear in supporting documentation. No representations are +made about the suitability of this software for any purpose. It is provided +"as is" without express or implied warranty. + diff --git a/src/c/quikr_train.c b/src/c/quikr_train.c new file mode 100644 index 0000000..d2a83ef --- /dev/null +++ b/src/c/quikr_train.c @@ -0,0 +1,176 @@ +#include <ctype.h> +#include <errno.h> +#include <getopt.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <zlib.h> + +#include "quikr_functions.h" + +#define AWK_KMER_PERMUTATIONS "awk 'function p(l,v,i){for(i in A) {if(l<%d) p(l+1, (v?v\"\":x)i); else print v\"\"i;}} {A[$0]} END {p(1);} ' <<<$'A\nC\nG\nT'" +#define USAGE "Usage:\n\tquikr_train [OPTION...] - to train a database for use with quikr.\n\nOptions:\n\n-i, --input\n\tthe database of sequences to create the sensing matrix (fasta format)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-o, --output\n\tthe sensing matrix. (a gzip'd text file)\n\n-v, --verbose\n\tverbose mode." + +int main(int argc, char **argv) { + + char probabilities_command[512]; + char kmers_file[256]; + char *line = NULL; + char *val; + size_t len = 0; + + + int c; + int kmer = 0; + + char *fasta_file = NULL; + char *output_file = NULL; + + int x = 0; + int y = 0; + + int verbose = 0; + + gzFile output = NULL; + + while (1) { + static struct option long_options[] = { + {"verbose", no_argument, 0, 'v'}, + {"input", required_argument, 0, 'i'}, + {"kmer", required_argument, 0, 'k'}, + {"output", required_argument, 0, 'o'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + + c = getopt_long (argc, argv, "i:o:k:hv", long_options, &option_index); + + if (c == -1) + break; + + switch (c) { + case 'i': + fasta_file = optarg; + break; + case 'k': + kmer = atoi(optarg); + break; + case 'o': + output_file = optarg; + break; + case 'v': + verbose = 1; + break; + case 'h': + printf("%s\n", USAGE); + exit(EXIT_SUCCESS); + break; + case '?': + /* getopt_long already printed an error message. */ + break; + default: + exit(EXIT_FAILURE); + } + } + + + if(fasta_file == NULL) { + fprintf(stderr, "Error: input fasta file (-i) must be specified\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } + + if(output_file == NULL) { + fprintf(stderr, "Error: output directory (-o) must be specified\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } + + if(kmer == 0) + kmer = 6; + + if(verbose) { + printf("kmer size: %d\n", kmer); + printf("fasta file: %s\n", fasta_file); + printf("output file: %s\n", output_file); + } + + if(strcmp(&output_file[strlen(output_file) - 3], ".gz") != 0) { + char *temp = malloc(sizeof(strlen(output_file) + 4)); + sprintf(temp, "%s.gz", output_file); + output_file = temp; + printf("appending a .gz to our output file: %s\n", output_file); + } + + // 4 ^ Kmer gives us the width, or the number of permutations of ACTG with kmer length + int width = pow(4, kmer); + int sequences = count_sequences(fasta_file); + + if(verbose) + printf("sequences: %d\nwidth: %d\n", sequences, width); + + // Allocate our matrix with the appropriate size, just one row + double *trained_matrix = malloc(width*sizeof(double)); + if(trained_matrix == NULL) { + fprintf(stderr, "Could not allocated enough memory\n"); + exit(EXIT_FAILURE); + } + + // call the probabilities-by-read command + sprintf(kmers_file, AWK_KMER_PERMUTATIONS, kmer); + sprintf(probabilities_command, "%s | probabilities-by-read %d %s /dev/stdin", kmers_file, kmer, fasta_file); + FILE *probabilities_output = popen(probabilities_command, "r"); + if(probabilities_output == NULL) { + fprintf(stderr, "Error could not execute: %s\n", probabilities_command); + exit(EXIT_FAILURE); + } + + // open our output file + output = gzopen(output_file, "w6"); + if(output == NULL) { + fprintf(stderr, "Error: could not open output file, error code: %d", errno); + exit(EXIT_FAILURE); + } + + if(verbose) + printf("Writing our sensing matrix to %s\n", output_file); + + // read normalize and write our matrix in one go + for(x = 0; x < sequences; x++) { + + getline(&line, &len, probabilities_output); + + // Read our first element in outside of the loop + val = strtok(line,"\t\n\r"); + trained_matrix[0] = atof(val); + // iterate through and load the array + for (y = 1; y < width; y++) { + val = strtok (NULL, "\t\n\r"); + trained_matrix[y] = atof(val); + } + + double row_sum = 0; + + for( y = 0; y < width; y++) { + row_sum = row_sum + trained_matrix[y]; + } + + for( y = 0; y < width; y++) { + trained_matrix[y] = trained_matrix[y] / row_sum; + } + + for( y = 0; y < width; y++) { + gzprintf(output, "%.10f\t", trained_matrix[y]); + } + gzprintf(output, "\n"); + } + + free(trained_matrix); + gzclose(output); + pclose(probabilities_output); + + return 0; +} |