aboutsummaryrefslogtreecommitdiff
path: root/src/c/quikr_train.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/c/quikr_train.c')
-rw-r--r--src/c/quikr_train.c176
1 files changed, 176 insertions, 0 deletions
diff --git a/src/c/quikr_train.c b/src/c/quikr_train.c
new file mode 100644
index 0000000..d2a83ef
--- /dev/null
+++ b/src/c/quikr_train.c
@@ -0,0 +1,176 @@
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <zlib.h>
+
+#include "quikr_functions.h"
+
+#define AWK_KMER_PERMUTATIONS "awk 'function p(l,v,i){for(i in A) {if(l<%d) p(l+1, (v?v\"\":x)i); else print v\"\"i;}} {A[$0]} END {p(1);} ' <<<$'A\nC\nG\nT'"
+#define USAGE "Usage:\n\tquikr_train [OPTION...] - to train a database for use with quikr.\n\nOptions:\n\n-i, --input\n\tthe database of sequences to create the sensing matrix (fasta format)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-o, --output\n\tthe sensing matrix. (a gzip'd text file)\n\n-v, --verbose\n\tverbose mode."
+
+int main(int argc, char **argv) {
+
+ char probabilities_command[512];
+ char kmers_file[256];
+ char *line = NULL;
+ char *val;
+ size_t len = 0;
+
+
+ int c;
+ int kmer = 0;
+
+ char *fasta_file = NULL;
+ char *output_file = NULL;
+
+ int x = 0;
+ int y = 0;
+
+ int verbose = 0;
+
+ gzFile output = NULL;
+
+ while (1) {
+ static struct option long_options[] = {
+ {"verbose", no_argument, 0, 'v'},
+ {"input", required_argument, 0, 'i'},
+ {"kmer", required_argument, 0, 'k'},
+ {"output", required_argument, 0, 'o'},
+ {0, 0, 0, 0}
+ };
+
+ int option_index = 0;
+
+ c = getopt_long (argc, argv, "i:o:k:hv", long_options, &option_index);
+
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'i':
+ fasta_file = optarg;
+ break;
+ case 'k':
+ kmer = atoi(optarg);
+ break;
+ case 'o':
+ output_file = optarg;
+ break;
+ case 'v':
+ verbose = 1;
+ break;
+ case 'h':
+ printf("%s\n", USAGE);
+ exit(EXIT_SUCCESS);
+ break;
+ case '?':
+ /* getopt_long already printed an error message. */
+ break;
+ default:
+ exit(EXIT_FAILURE);
+ }
+ }
+
+
+ if(fasta_file == NULL) {
+ fprintf(stderr, "Error: input fasta file (-i) must be specified\n\n");
+ fprintf(stderr, "%s\n", USAGE);
+ exit(EXIT_FAILURE);
+ }
+
+ if(output_file == NULL) {
+ fprintf(stderr, "Error: output directory (-o) must be specified\n\n");
+ fprintf(stderr, "%s\n", USAGE);
+ exit(EXIT_FAILURE);
+ }
+
+ if(kmer == 0)
+ kmer = 6;
+
+ if(verbose) {
+ printf("kmer size: %d\n", kmer);
+ printf("fasta file: %s\n", fasta_file);
+ printf("output file: %s\n", output_file);
+ }
+
+ if(strcmp(&output_file[strlen(output_file) - 3], ".gz") != 0) {
+ char *temp = malloc(sizeof(strlen(output_file) + 4));
+ sprintf(temp, "%s.gz", output_file);
+ output_file = temp;
+ printf("appending a .gz to our output file: %s\n", output_file);
+ }
+
+ // 4 ^ Kmer gives us the width, or the number of permutations of ACTG with kmer length
+ int width = pow(4, kmer);
+ int sequences = count_sequences(fasta_file);
+
+ if(verbose)
+ printf("sequences: %d\nwidth: %d\n", sequences, width);
+
+ // Allocate our matrix with the appropriate size, just one row
+ double *trained_matrix = malloc(width*sizeof(double));
+ if(trained_matrix == NULL) {
+ fprintf(stderr, "Could not allocated enough memory\n");
+ exit(EXIT_FAILURE);
+ }
+
+ // call the probabilities-by-read command
+ sprintf(kmers_file, AWK_KMER_PERMUTATIONS, kmer);
+ sprintf(probabilities_command, "%s | probabilities-by-read %d %s /dev/stdin", kmers_file, kmer, fasta_file);
+ FILE *probabilities_output = popen(probabilities_command, "r");
+ if(probabilities_output == NULL) {
+ fprintf(stderr, "Error could not execute: %s\n", probabilities_command);
+ exit(EXIT_FAILURE);
+ }
+
+ // open our output file
+ output = gzopen(output_file, "w6");
+ if(output == NULL) {
+ fprintf(stderr, "Error: could not open output file, error code: %d", errno);
+ exit(EXIT_FAILURE);
+ }
+
+ if(verbose)
+ printf("Writing our sensing matrix to %s\n", output_file);
+
+ // read normalize and write our matrix in one go
+ for(x = 0; x < sequences; x++) {
+
+ getline(&line, &len, probabilities_output);
+
+ // Read our first element in outside of the loop
+ val = strtok(line,"\t\n\r");
+ trained_matrix[0] = atof(val);
+ // iterate through and load the array
+ for (y = 1; y < width; y++) {
+ val = strtok (NULL, "\t\n\r");
+ trained_matrix[y] = atof(val);
+ }
+
+ double row_sum = 0;
+
+ for( y = 0; y < width; y++) {
+ row_sum = row_sum + trained_matrix[y];
+ }
+
+ for( y = 0; y < width; y++) {
+ trained_matrix[y] = trained_matrix[y] / row_sum;
+ }
+
+ for( y = 0; y < width; y++) {
+ gzprintf(output, "%.10f\t", trained_matrix[y]);
+ }
+ gzprintf(output, "\n");
+ }
+
+ free(trained_matrix);
+ gzclose(output);
+ pclose(probabilities_output);
+
+ return 0;
+}