summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG3
-rw-r--r--src/c/multifasta_to_otu.12
-rw-r--r--src/c/multifasta_to_otu.c22
-rw-r--r--src/c/nnls.c33
-rw-r--r--src/c/quikr.12
-rw-r--r--src/c/quikr.c14
-rw-r--r--src/c/quikr_functions.c27
-rw-r--r--src/c/quikr_functions.c.1153
-rw-r--r--src/c/quikr_train.12
-rw-r--r--src/c/quikr_train.c5
10 files changed, 203 insertions, 60 deletions
diff --git a/CHANGELOG b/CHANGELOG
index d8d3b79..002b265 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -8,9 +8,10 @@ May 16th 2013, 1.0.1 Released
May 20th 2013, 1.0.2 Released
- correted usage strings
- install manpages globally (to {PREFIX}/share/man/man1)
-- added native gzip decoding for quikr and multifasta_to_otu
Current Development Branch
- Licensing files and documentation added
- Removed unused code from the NBC project
- Fixed a overflow bug in quikr_train
+- added native gzip decoding for quikr and multifasta_to_otu
+- documentation fixes
diff --git a/src/c/multifasta_to_otu.1 b/src/c/multifasta_to_otu.1
index 29fe5d7..2a00d5c 100644
--- a/src/c/multifasta_to_otu.1
+++ b/src/c/multifasta_to_otu.1
@@ -79,7 +79,7 @@ make_3d_plots.py -i <quikr_otu_project_name>_weighted.txt -o <3d_pcoa_plotdirect
.SH AUTHORS
.B multifasta2otu
was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison
-<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot
+<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, and Jean-Luc Bouchot.
.SH REPORTING BUGS
.TP
Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \
diff --git a/src/c/multifasta_to_otu.c b/src/c/multifasta_to_otu.c
index 2f4bf87..e0fb128 100644
--- a/src/c/multifasta_to_otu.c
+++ b/src/c/multifasta_to_otu.c
@@ -26,14 +26,14 @@ int main(int argc, char **argv) {
double *sensing_matrix;
- long int width = 0;
- long int sequences = 0;
+ long width = 0;
+ long sequences = 0;
int kmer = 6;
int lambda = 10000;
- int x = 0;
- int y = 0;
+ long x = 0;
+ long y = 0;
int jobs = 1;
#ifdef Linux
@@ -150,7 +150,7 @@ int main(int argc, char **argv) {
}
// do a directory count
- int dir_count = -2; // -2 for ../ and ./
+ long dir_count = -2; // -2 for ../ and ./
while(entry = readdir(input_directory_dh))
dir_count++;
rewinddir(input_directory_dh);
@@ -167,7 +167,7 @@ int main(int argc, char **argv) {
}
if(verbose) {
- printf("directory count: %d\n", dir_count);
+ printf("directory count: %ld\n", dir_count);
printf("width: %ld\nsequences %ld\n", width, sequences);
}
@@ -197,7 +197,7 @@ int main(int argc, char **argv) {
exit(EXIT_FAILURE);
}
- int *file_sequence_count = malloc(dir_count * sizeof(int));
+ long *file_sequence_count = malloc(dir_count * sizeof(long));
if(file_sequence_count == NULL) {
fprintf(stderr, "Could not allocate enough memory\n");
exit(EXIT_FAILURE);
@@ -206,12 +206,12 @@ int main(int argc, char **argv) {
struct dirent result;
omp_set_num_threads(jobs);
- int done = 0;
+ long done = 0;
printf("Beginning to process samples\n");
#pragma omp parallel for shared(solutions, sequences, width, result, done)
- for(int i = 0; i < dir_count; i++ ) {
+ for(long i = 0; i < dir_count; i++ ) {
- int z = 0;
+ long z = 0;
struct dirent *directory_entry;
char *filename = malloc(256 * sizeof(char));
char *base_filename = malloc(256 * sizeof(char));
@@ -269,7 +269,7 @@ int main(int argc, char **argv) {
}
done++;
- printf("%d/%d samples processed\n", done, dir_count);
+ printf("%ld/%ld samples processed\n", done, dir_count);
free(solution);
free(count_matrix);
free(filename);
diff --git a/src/c/nnls.c b/src/c/nnls.c
index e512388..f45bfec 100644
--- a/src/c/nnls.c
+++ b/src/c/nnls.c
@@ -1,5 +1,5 @@
/*
- * nnls.c (c) 2002-2009 Turku PET Centre
+ * nnls.c (c) 2003-2009 Turku PET Centre
* This file contains the routine NNLS (nonnegative least squares)
* and the subroutines required by it, except h12, which is in
* file 'lss_h12.c'.
@@ -30,15 +30,16 @@
* Checking for exceeding iteration count is corrected in nnls().
*/
#include <stdio.h>
+#include <stdint.h>
#include <assert.h>
#include <stdlib.h>
#include <math.h>
#define MAX(a,b) ((a) >= (b) ? (a) : (b))
#define ABS(x) ((x) >= 0 ? (x) : -(x))
-int h12( int mode, int lpivot, int l1, int m, double *u, int u_dim1, double *up, double *cm, int ice, int icv, int ncv) {
+int64_t h12( int64_t mode, int64_t lpivot, int64_t l1, int64_t m, double *u, int64_t u_dim1, double *up, double *cm, int64_t ice, int64_t icv, int64_t ncv) {
double d1, b, clinv, cl, sm;
- int k, j;
+ int64_t k, j;
/* Check parameters */
if (mode!=1 && mode!=2)
@@ -166,12 +167,12 @@ void g1(double a, double b, double *cterm, double *sterm, double *sig)
}
-int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm) {
- int pfeas;
+int64_t nnls_algorithm(double *a, int64_t m,int64_t n, double *b, double *x, double *rnorm) {
+ int64_t pfeas;
int ret=0;
- int iz;
- int jz;
- int k, j=0, l, itmax, izmax=0, ii, jj=0, ip;
+ int64_t iz;
+ int64_t jz;
+ int64_t k, j=0, l, itmax, izmax=0, ii, jj=0, ip;
double d1, d2, sm, up, ss;
double temp, wmax, t, alpha, asave, dummy, unorm, ztest, cc;
@@ -183,7 +184,7 @@ int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm)
/* Allocate memory for working space, if required */
double *w = (double*)calloc(n, sizeof(double));
double *zz = (double*)calloc(m, sizeof(double));
- int *index = (int*)calloc(n, sizeof(int));
+ int64_t *index = (int64_t*)calloc(n, sizeof(int64_t));
if(w == NULL || zz == NULL || index == NULL)
return(2);
@@ -193,18 +194,18 @@ int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm)
index[k]=k;
}
- int iz2 = n - 1;
- int iz1 = 0;
- int iter=0;
- int nsetp=0;
- int npp1=0;
+ int64_t iz2 = n - 1;
+ int64_t iz1 = 0;
+ int64_t iter=0;
+ int64_t nsetp=0;
+ int64_t npp1=0;
/* Main loop; quit if all coeffs are already in the solution or */
/* if M cols of A have been triangularized */
if(n < 3)
itmax=n*3;
else
- itmax=n*n*n;
+ itmax=n*n;
while(iz1 <= iz2 && nsetp < m) {
@@ -395,7 +396,7 @@ int nnls_algorithm(double *a, int m,int n, double *b, double *x, double *rnorm)
/* nnls_ */
-double *nnls(double *a_matrix, double *b_matrix, int height, int width) {
+double *nnls(double *a_matrix, double *b_matrix, int64_t height, int64_t width) {
double *solution = (double*)calloc(height, sizeof(double));
diff --git a/src/c/quikr.1 b/src/c/quikr.1
index 937109b..9b32667 100644
--- a/src/c/quikr.1
+++ b/src/c/quikr.1
@@ -58,7 +58,7 @@ quikr -i sample.fa -f rdp7.fa -s rdp_sensing_matrix.gz -o frequencies.txt
.SH AUTHORS
.B quikr
was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison
-<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot
+<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, and Jean-Luc Bouchot.
.SH REPORTING BUGS
.TP
Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \
diff --git a/src/c/quikr.c b/src/c/quikr.c
index 17d6480..f588c5f 100644
--- a/src/c/quikr.c
+++ b/src/c/quikr.c
@@ -12,7 +12,7 @@
#include "quikr_functions.h"
#define sensing_matrix(i,j) (sensing_matrix[width*i + j])
-#define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - Calculate estimated frequencies of bacteria in a sample.\n\nOptions:\n\n-i, --input\n\tthe sample's fasta file of NGS READS (fasta format)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (trained from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-o, --output\n\tOTU_FRACTION_PRESENT a vector representing the percentage of database sequence's presence in sample. (csv output)\n\n-v, --verbose\n\tverbose mode.\n\n-d, --debug\n\tdebug mode, read manpage for more details"
+#define USAGE "Usage:\n\tquikr [OPTION...] - Calculate estimated frequencies of bacteria in a sample.\n\nOptions:\n\n-i, --input\n\tthe sample's fasta file of NGS READS (fasta format)\n\n-f, --sensing-fasta\n\tlocation of the fasta file database used to create the sensing matrix (fasta format)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (trained from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-o, --output\n\tOTU_FRACTION_PRESENT a vector representing the percentage of database sequence's presence in sample. (csv output)\n\n-v, --verbose\n\tverbose mode.\n\n-d, --debug\n\tdebug mode, read manpage for more details"
int main(int argc, char **argv) {
@@ -24,8 +24,8 @@ int main(int argc, char **argv) {
char *sensing_fasta_filename = NULL;
char *output_filename = NULL;
- int x = 0;
- int y = 0;
+ long x = 0;
+ long y = 0;
int verbose = 0;
int debug = 0;
@@ -33,8 +33,8 @@ int main(int argc, char **argv) {
int lambda = 10000;
int kmer = 6;
- int width = 0;
- int sequences = 0;
+ long width = 0;
+ long sequences = 0;
while (1) {
static struct option long_options[] = {
@@ -144,7 +144,7 @@ int main(int argc, char **argv) {
}
if(verbose) {
- printf("width: %d\nsequences %d\n", width, sequences);
+ printf("width: %ld\nsequences %ld\n", width, sequences);
}
double *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, sequences, width);
@@ -157,7 +157,7 @@ int main(int argc, char **argv) {
}
}
- for(x= 0; x < sequences; x++) {
+ for(x = 0; x < sequences; x++) {
sensing_matrix(x, 0) = 1.0;
}
// normalize our count_matrix
diff --git a/src/c/quikr_functions.c b/src/c/quikr_functions.c
index e55b28c..7bdbb9e 100644
--- a/src/c/quikr_functions.c
+++ b/src/c/quikr_functions.c
@@ -6,6 +6,7 @@
#include <unistd.h>
#include <string.h>
#include <math.h>
+#include <zlib.h>
int count_sequences(char *filename) {
char command[512];
@@ -87,19 +88,14 @@ double *load_sensing_matrix(char *filename, int height, int width) {
int x = 0;
int y = 0;
- char *line = NULL;
- char *val;
- char command[512];
- size_t len = 0;
- FILE *sensing_matrix_fh = NULL;
+ gzFile sensing_matrix_fh = NULL;
double *sensing_matrix = malloc(height * width * sizeof(double));
if(sensing_matrix == NULL) {
fprintf(stderr, "Could not allocate memory for the sensing matrix\n");
}
- sprintf(command, "gzip -cd %s", filename);
- sensing_matrix_fh = popen(command, "r");
+ sensing_matrix_fh = gzopen(filename, "r");
if(sensing_matrix_fh == NULL) {
fprintf(stderr, "could not open %s", filename);
exit(EXIT_FAILURE);
@@ -107,21 +103,14 @@ double *load_sensing_matrix(char *filename, int height, int width) {
// read our sensing matrix in
for(x = 0; x < height; x++) {
- getline(&line, &len, sensing_matrix_fh);
- char *ptr;
-
- // Read our first element in outside of the loop
- val = strtok_r(line,"\t", &ptr);
- sensing_matrix[width*x + 1] = atof(val);
- // iterate through and load the array
- for (y = 2; y < width; y++) {
- val = strtok_r(NULL, "\t", &ptr);
- sensing_matrix[width*x + y] = atof(val);
+ for (y = 1; y < width; y++) {
+ char buffer[14];
+ gzgets(sensing_matrix_fh, buffer, 14);
+ sensing_matrix[width*x + y] = atof(buffer);
}
}
- free(line);
- pclose(sensing_matrix_fh);
+ gzclose(sensing_matrix_fh);
return sensing_matrix;
}
diff --git a/src/c/quikr_functions.c.1 b/src/c/quikr_functions.c.1
new file mode 100644
index 0000000..31afb3b
--- /dev/null
+++ b/src/c/quikr_functions.c.1
@@ -0,0 +1,153 @@
+#include <stdio.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <math.h>
+#include <zlib.h>
+
+int count_sequences(char *filename) {
+ char command[512];
+ int sequences = 0;
+ FILE *grep_output;
+
+ sprintf(command, "grep -c ^\\> %s", filename);
+ grep_output = popen(command, "r");
+ if(grep_output == NULL) {
+ fprintf(stderr, "Could not execute %s\n", command);
+ exit(EXIT_FAILURE);
+ }
+
+ fscanf(grep_output, "%d", &sequences);
+
+ pclose(grep_output);
+ return sequences;
+}
+
+
+void normalize_matrix(double *matrix, int height, int width) {
+ int x = 0;
+ int y = 0;
+ for(x = 0; x < height; x++) {
+
+ double row_sum = 0;
+
+ for(y = 0; y < (width); y++)
+ row_sum = row_sum + matrix[width * x + y];
+ for(y = 0; y < (width); y++)
+ matrix[width * x + y] = matrix[width * x + y] / row_sum;
+ }
+}
+
+
+double *load_count_matrix(char *filename, int width, int kmer) {
+
+ double *count_matrix = malloc((width)*sizeof(double));
+ char count_command[512];
+ int x = 0;
+ char *line = NULL;
+ size_t len = 0;
+
+ if(count_matrix == NULL) {
+ fprintf(stderr, "could not allocate enough memory for the count matrix (%d x double) \n", width);
+ exit(EXIT_FAILURE);
+ }
+
+ // create out count matrix
+ sprintf(count_command, "count-kmers -r %d -1 -u %s", kmer, filename);
+ FILE *count_output = popen(count_command, "r");
+ if(count_output == NULL) {
+ fprintf(stderr, "could not execute \"%s\"", count_command);
+ exit(EXIT_FAILURE);
+ }
+
+ // set first element to zero.
+ count_matrix[0] = 0;
+
+ // get our first line
+ getline(&line, &len, count_output);
+ count_matrix[1] = atoi(line);
+
+ // iterate over the rest of the lines
+ for(x = 2; x < width; x++) {
+ getline(&line, &len, count_output);
+ count_matrix[x] = atoi(line);
+ }
+
+ pclose(count_output);
+
+ return count_matrix;
+}
+
+
+double *load_sensing_matrix(char *filename, int height, int width) {
+
+ int x = 0;
+ int y = 0;
+
+ gzFile sensing_matrix_fh = NULL;
+
+ double *sensing_matrix = malloc(height * width * sizeof(double));
+ if(sensing_matrix == NULL) {
+ fprintf(stderr, "Could not allocate memory for the sensing matrix\n");
+ }
+
+ sensing_matrix_fh = gzopen(filename, "r");
+ if(sensing_matrix_fh == NULL) {
+ fprintf(stderr, "could not open %s", filename);
+ exit(EXIT_FAILURE);
+ }
+
+ // read our sensing matrix in
+ for(x = 0; x < height; x++) {
+ for (y = 1; y < width; y++) {
+ char buffer[14];
+ gzgets(sensing_matrix_fh, buffer, 14);
+ sensing_matrix[width*x + y] = atof(buffer);
+ }
+ }
+
+ gzclose(sensing_matrix_fh);
+
+ return sensing_matrix;
+}
+
+char **load_headers(char *filename, int sequences) {
+ char command[512];
+ char *line= NULL;
+ int x = 0;
+ FILE *grep_output;
+ size_t len = 0;
+
+ sprintf(command, "grep ^\\> %s", filename);
+ grep_output = popen(command, "r");
+ if(grep_output == NULL) {
+ fprintf(stderr, "Could not execute %s\n", command);
+ exit(EXIT_FAILURE);
+ }
+
+ char **headers = malloc(sequences * sizeof(char *));
+ if(headers == NULL) {
+ fprintf(stderr, "could not allocated enough memory\n");
+ exit(EXIT_FAILURE);
+ }
+
+ for(x = 0; x < sequences; x++) {
+
+ char *header = malloc(256 * sizeof(char));
+ if(header == NULL) {
+ fprintf(stderr, "could not allocated enough memory\n");
+ exit(EXIT_FAILURE);
+ }
+ getline(&line, &len, grep_output);
+ sscanf(line + 1, "%s", header);
+ headers[x] = header;
+ }
+
+ pclose(grep_output);
+
+ return headers;
+}
+
diff --git a/src/c/quikr_train.1 b/src/c/quikr_train.1
index dd1493f..6a058cd 100644
--- a/src/c/quikr_train.1
+++ b/src/c/quikr_train.1
@@ -45,7 +45,7 @@ If you do not have a .gz file on your output matrix name, it will be appended.
.SH AUTHORS
.B quikr
was written by Gail Rosen <gailr@ece.drexel.edu>, Calvin Morrison
-<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, Jean-Luc Bouchot
+<mutantturkey@gmail.com>, David Koslicki, Simon Foucart, and Jean-Luc Bouchot.
.SH REPORTING BUGS
.TP
Please report all bugs to Gail Rosen <gailr@ece.drexel.edu>. Include your \
diff --git a/src/c/quikr_train.c b/src/c/quikr_train.c
index a2f132e..c728a2c 100644
--- a/src/c/quikr_train.c
+++ b/src/c/quikr_train.c
@@ -15,7 +15,6 @@
int main(int argc, char **argv) {
char probabilities_command[512];
- char kmers_file[256];
char *line = NULL;
char *val;
size_t len = 0;
@@ -169,10 +168,10 @@ int main(int argc, char **argv) {
trained_matrix[y] = trained_matrix[y] / row_sum;
}
- for( y = 0; y < width; y++) {
+ for( y = 0; y < (width - 1); y++) {
gzprintf(output, "%.10f\t", trained_matrix[y]);
}
- gzprintf(output, "\n");
+ gzprintf(output, "%.10f\n", trained_matrix[width]);
}
free(trained_matrix);