diff options
Diffstat (limited to 'src/c/quikr_train.c')
-rw-r--r-- | src/c/quikr_train.c | 203 |
1 files changed, 136 insertions, 67 deletions
diff --git a/src/c/quikr_train.c b/src/c/quikr_train.c index 1cf6be9..4d66b9d 100644 --- a/src/c/quikr_train.c +++ b/src/c/quikr_train.c @@ -8,30 +8,38 @@ #include <unistd.h> #include <zlib.h> +#include "kmer_utils.h" #include "quikr_functions.h" #define USAGE "Usage:\n\tquikr_train [OPTION...] - train a database for use with quikr.\n\nOptions:\n\n-i, --input\n\tthe database of sequences to create the sensing matrix (fasta format)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-o, --output\n\tthe sensing matrix. (a gzip'd text file)\n\n-v, --verbose\n\tverbose mode.\n\n-V, --version\n\tprint version." int main(int argc, char **argv) { - char probabilities_command[512]; + // getline variables char *line = NULL; - char *val; size_t len = 0; + ssize_t read; int c; - int kmer = 6; - char *fasta_file = NULL; - char *output_file = NULL; + // k-mer is 6 by default + int kmer = 6; - int x = 0; - int y = 0; + // revision number + int revision = 0; + // iterators + long long i = 0; + long long position = 0; int verbose = 0; + + char *fasta_filename = NULL; + char *output_file = NULL; + gzFile output = NULL; + FILE *input = NULL; while (1) { static struct option long_options[] = { @@ -53,7 +61,7 @@ int main(int argc, char **argv) { switch (c) { case 'i': - fasta_file = optarg; + fasta_filename = optarg; break; case 'k': kmer = atoi(optarg); @@ -80,8 +88,7 @@ int main(int argc, char **argv) { } } - - if(fasta_file == NULL) { + if(fasta_filename == NULL) { fprintf(stderr, "Error: input fasta file (-i) must be specified\n\n"); fprintf(stderr, "%s\n", USAGE); exit(EXIT_FAILURE); @@ -95,15 +102,20 @@ int main(int argc, char **argv) { if(verbose) { printf("kmer size: %d\n", kmer); - printf("fasta file: %s\n", fasta_file); + printf("fasta file: %s\n", fasta_filename); printf("output file: %s\n", output_file); } - if(access (fasta_file, F_OK) == -1) { - fprintf(stderr, "Error: could not find %s\n", fasta_file); + if(access (fasta_filename, F_OK) == -1) { + fprintf(stderr, "Error: could not find %s\n", fasta_filename); exit(EXIT_FAILURE); } + if(kmer == 0) { + fprintf(stderr, "Error: zero is not a valid kmer\n"); + exit(EXIT_FAILURE); + } + if(strcmp(&output_file[strlen(output_file) - 3], ".gz") != 0) { char *temp = malloc(strlen(output_file) + 4); if(temp == NULL) { @@ -115,74 +127,131 @@ int main(int argc, char **argv) { printf("appending a .gz to our output file: %s\n", output_file); } - // 4 ^ Kmer gives us the width, or the number of permutations of ACTG with kmer length - int width = pow(4, kmer); - int sequences = count_sequences(fasta_file); + // 4 ^ Kmer gives us the width, or the number of permutations of ACTG with + // kmer length + long width = pow(4, kmer); + unsigned long sequences = count_sequences(fasta_filename); if(sequences == 0) { - fprintf(stderr, "Error: %s contains 0 fasta sequences\n", fasta_file); + fprintf(stderr, "Error: %s contains 0 fasta sequences\n", fasta_filename); } - if(verbose) - printf("sequences: %d\nwidth: %d\n", sequences, width); - - // Allocate our matrix with the appropriate size, just one row - double *trained_matrix = malloc(width*sizeof(double)); - if(trained_matrix == NULL) { - fprintf(stderr, "Could not allocate enough memory\n"); - exit(EXIT_FAILURE); - } + if(verbose) { + printf("sequences: %ld\nwidth: %ld\n", sequences, width); + printf("Writing our sensing matrix to %s\n", output_file); + } - // call the probabilities-by-read command - sprintf(probabilities_command, "generate_kmers %d | probabilities-by-read %d %s /dev/stdin", kmer, kmer, fasta_file); - FILE *probabilities_output = popen(probabilities_command, "r"); - if(probabilities_output == NULL) { - fprintf(stderr, "Error could not execute: %s\n", probabilities_command); + input = fopen(fasta_filename, "r" ); + if(input == NULL) { + fprintf(stderr, "Error opening %s - %s\n", fasta_filename, strerror(errno)); exit(EXIT_FAILURE); } // open our output file - output = gzopen(output_file, "w6"); + output = gzopen(output_file, "w"); if(output == NULL) { - fprintf(stderr, "Error: could not open output file, error code: %d", errno); + fprintf(stderr, "Error: could not open output file, error code: %s\n", strerror(errno)); exit(EXIT_FAILURE); } - if(verbose) - printf("Writing our sensing matrix to %s\n", output_file); - - // read normalize and write our matrix in one go - for(x = 0; x < sequences; x++) { + // create our header + gzprintf(output, "quikr\n"); + gzprintf(output, "%ld\n", revision); + gzprintf(output, "%ld\n", sequences); + gzprintf(output, "%d\n", kmer); - getline(&line, &len, probabilities_output); - - // Read our first element in outside of the loop - val = strtok(line,"\t\n\r"); - trained_matrix[0] = atof(val); - // iterate through and load the array - for (y = 1; y < width; y++) { - val = strtok (NULL, "\t\n\r"); - trained_matrix[y] = atof(val); - } - - double row_sum = 0; - - for( y = 0; y < width; y++) { - row_sum = row_sum + trained_matrix[y]; - } - - for( y = 0; y < width; y++) { - trained_matrix[y] = trained_matrix[y] / row_sum; - } - - for( y = 0; y < (width - 1); y++) { - gzprintf(output, "%.10f\t", trained_matrix[y]); - } - gzprintf(output, "%.10f\n", trained_matrix[width]); - } + // malloc our return array + unsigned long long * counts = malloc((width+ 1) * sizeof(unsigned long long)); + if(counts == NULL) { + fprintf(stderr, strerror(errno)); + exit(EXIT_FAILURE); + } + + + char *str = malloc(4096); + if(str == NULL) { + fprintf(stderr, strerror(errno)); + exit(EXIT_FAILURE); + } + + unsigned long long str_size = 4096; + + // seek the first character, and skip over it + fseek(input, 1, SEEK_CUR); + while ((read = getdelim(&line, &len, '>', input)) != -1) { + + // find first whitespace + for(i = 0; i < read; i ++) { + if(line[i] == ' ' || line[i] == '\t' || line[i] == '\n') + break; + } + + // write our header + gzprintf(output, ">%.*s\n", i, line); + + // find our first \n, this should be the end of the header + char *start = strchr(line, '\n'); + if(start == NULL) + continue; + + size_t start_len = strlen(start); + + + // if our current str buffer isn't big enough, realloc + if(start_len + 1 > str_size + 1) { + str = realloc(str, start_len + 1); + if(str == NULL) { + exit(EXIT_FAILURE); + fprintf(stderr, strerror(errno)); + } + } + + // strip out all other newlines to handle multiline sequences + str = strnstrip(start, str, '\n',start_len); + size_t seq_length = strlen(str); + + // relace A, C, G and T with 0, 1, 2, 3 respectively + // everything else is 5 + for(i = 0; i < seq_length; i++) { + str[i] = alpha[(int)str[i]]; + } + + // set counts to zero + memset(counts, 0, width); + // loop through our string to process each k-mer + for(position = 0; position < (seq_length - kmer + 1); position++) { + unsigned long mer = 0; + unsigned long multiply = 1; + + // for each char in the k-mer check if it is an error char + for(i = position + kmer - 1; i >= position; i--){ + if(str[i] >> 2) { + mer = width; + position = i; + goto next; + } + + // multiply this char in the mer by the multiply + // and bitshift the multiply for the next round + mer += str[i] * multiply; + multiply = multiply << 2; + } + // use this point to get mer of our loop + next: + // bump up the mer value in the counts array + counts[mer]++; + } + + for(i = 0; i < width; i++) { + gzprintf(output, "%lld\n", counts[i]); + } + + } + + free(counts); + free(line); - free(trained_matrix); gzclose(output); - pclose(probabilities_output); + fclose(input); - return 0; + return EXIT_SUCCESS; } |