diff options
Diffstat (limited to 'src/c/quikr_train.c')
-rw-r--r-- | src/c/quikr_train.c | 176 |
1 files changed, 176 insertions, 0 deletions
diff --git a/src/c/quikr_train.c b/src/c/quikr_train.c new file mode 100644 index 0000000..d2a83ef --- /dev/null +++ b/src/c/quikr_train.c @@ -0,0 +1,176 @@ +#include <ctype.h> +#include <errno.h> +#include <getopt.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <zlib.h> + +#include "quikr_functions.h" + +#define AWK_KMER_PERMUTATIONS "awk 'function p(l,v,i){for(i in A) {if(l<%d) p(l+1, (v?v\"\":x)i); else print v\"\"i;}} {A[$0]} END {p(1);} ' <<<$'A\nC\nG\nT'" +#define USAGE "Usage:\n\tquikr_train [OPTION...] - to train a database for use with quikr.\n\nOptions:\n\n-i, --input\n\tthe database of sequences to create the sensing matrix (fasta format)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-o, --output\n\tthe sensing matrix. (a gzip'd text file)\n\n-v, --verbose\n\tverbose mode." + +int main(int argc, char **argv) { + + char probabilities_command[512]; + char kmers_file[256]; + char *line = NULL; + char *val; + size_t len = 0; + + + int c; + int kmer = 0; + + char *fasta_file = NULL; + char *output_file = NULL; + + int x = 0; + int y = 0; + + int verbose = 0; + + gzFile output = NULL; + + while (1) { + static struct option long_options[] = { + {"verbose", no_argument, 0, 'v'}, + {"input", required_argument, 0, 'i'}, + {"kmer", required_argument, 0, 'k'}, + {"output", required_argument, 0, 'o'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + + c = getopt_long (argc, argv, "i:o:k:hv", long_options, &option_index); + + if (c == -1) + break; + + switch (c) { + case 'i': + fasta_file = optarg; + break; + case 'k': + kmer = atoi(optarg); + break; + case 'o': + output_file = optarg; + break; + case 'v': + verbose = 1; + break; + case 'h': + printf("%s\n", USAGE); + exit(EXIT_SUCCESS); + break; + case '?': + /* getopt_long already printed an error message. */ + break; + default: + exit(EXIT_FAILURE); + } + } + + + if(fasta_file == NULL) { + fprintf(stderr, "Error: input fasta file (-i) must be specified\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } + + if(output_file == NULL) { + fprintf(stderr, "Error: output directory (-o) must be specified\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } + + if(kmer == 0) + kmer = 6; + + if(verbose) { + printf("kmer size: %d\n", kmer); + printf("fasta file: %s\n", fasta_file); + printf("output file: %s\n", output_file); + } + + if(strcmp(&output_file[strlen(output_file) - 3], ".gz") != 0) { + char *temp = malloc(sizeof(strlen(output_file) + 4)); + sprintf(temp, "%s.gz", output_file); + output_file = temp; + printf("appending a .gz to our output file: %s\n", output_file); + } + + // 4 ^ Kmer gives us the width, or the number of permutations of ACTG with kmer length + int width = pow(4, kmer); + int sequences = count_sequences(fasta_file); + + if(verbose) + printf("sequences: %d\nwidth: %d\n", sequences, width); + + // Allocate our matrix with the appropriate size, just one row + double *trained_matrix = malloc(width*sizeof(double)); + if(trained_matrix == NULL) { + fprintf(stderr, "Could not allocated enough memory\n"); + exit(EXIT_FAILURE); + } + + // call the probabilities-by-read command + sprintf(kmers_file, AWK_KMER_PERMUTATIONS, kmer); + sprintf(probabilities_command, "%s | probabilities-by-read %d %s /dev/stdin", kmers_file, kmer, fasta_file); + FILE *probabilities_output = popen(probabilities_command, "r"); + if(probabilities_output == NULL) { + fprintf(stderr, "Error could not execute: %s\n", probabilities_command); + exit(EXIT_FAILURE); + } + + // open our output file + output = gzopen(output_file, "w6"); + if(output == NULL) { + fprintf(stderr, "Error: could not open output file, error code: %d", errno); + exit(EXIT_FAILURE); + } + + if(verbose) + printf("Writing our sensing matrix to %s\n", output_file); + + // read normalize and write our matrix in one go + for(x = 0; x < sequences; x++) { + + getline(&line, &len, probabilities_output); + + // Read our first element in outside of the loop + val = strtok(line,"\t\n\r"); + trained_matrix[0] = atof(val); + // iterate through and load the array + for (y = 1; y < width; y++) { + val = strtok (NULL, "\t\n\r"); + trained_matrix[y] = atof(val); + } + + double row_sum = 0; + + for( y = 0; y < width; y++) { + row_sum = row_sum + trained_matrix[y]; + } + + for( y = 0; y < width; y++) { + trained_matrix[y] = trained_matrix[y] / row_sum; + } + + for( y = 0; y < width; y++) { + gzprintf(output, "%.10f\t", trained_matrix[y]); + } + gzprintf(output, "\n"); + } + + free(trained_matrix); + gzclose(output); + pclose(probabilities_output); + + return 0; +} |