aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2013-10-28 12:06:45 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2013-10-28 12:06:45 -0400
commit3f20f3ba6b9b7375290ea8342dcca4e3e977829e (patch)
tree56f8ace90207e4f02a6a5089587af10dbf2f8c4e
parent1c84ef12c76848ebddee31b4aec7f66768f59b3b (diff)
New sensing matrix format
quikr_train now has a new format that should allow for a more sane approach in terms of usability. headers are now packed into the format, and counts, not probabilities are included as each row the header looks like quikr version of format sequences kmer >header 2 4 12 22 ... n-mers >header2 ....
-rw-r--r--src/c/quikr_train.c203
1 files changed, 136 insertions, 67 deletions
diff --git a/src/c/quikr_train.c b/src/c/quikr_train.c
index 1cf6be9..4d66b9d 100644
--- a/src/c/quikr_train.c
+++ b/src/c/quikr_train.c
@@ -8,30 +8,38 @@
#include <unistd.h>
#include <zlib.h>
+#include "kmer_utils.h"
#include "quikr_functions.h"
#define USAGE "Usage:\n\tquikr_train [OPTION...] - train a database for use with quikr.\n\nOptions:\n\n-i, --input\n\tthe database of sequences to create the sensing matrix (fasta format)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-o, --output\n\tthe sensing matrix. (a gzip'd text file)\n\n-v, --verbose\n\tverbose mode.\n\n-V, --version\n\tprint version."
int main(int argc, char **argv) {
- char probabilities_command[512];
+ // getline variables
char *line = NULL;
- char *val;
size_t len = 0;
+ ssize_t read;
int c;
- int kmer = 6;
- char *fasta_file = NULL;
- char *output_file = NULL;
+ // k-mer is 6 by default
+ int kmer = 6;
- int x = 0;
- int y = 0;
+ // revision number
+ int revision = 0;
+ // iterators
+ long long i = 0;
+ long long position = 0;
int verbose = 0;
+
+ char *fasta_filename = NULL;
+ char *output_file = NULL;
+
gzFile output = NULL;
+ FILE *input = NULL;
while (1) {
static struct option long_options[] = {
@@ -53,7 +61,7 @@ int main(int argc, char **argv) {
switch (c) {
case 'i':
- fasta_file = optarg;
+ fasta_filename = optarg;
break;
case 'k':
kmer = atoi(optarg);
@@ -80,8 +88,7 @@ int main(int argc, char **argv) {
}
}
-
- if(fasta_file == NULL) {
+ if(fasta_filename == NULL) {
fprintf(stderr, "Error: input fasta file (-i) must be specified\n\n");
fprintf(stderr, "%s\n", USAGE);
exit(EXIT_FAILURE);
@@ -95,15 +102,20 @@ int main(int argc, char **argv) {
if(verbose) {
printf("kmer size: %d\n", kmer);
- printf("fasta file: %s\n", fasta_file);
+ printf("fasta file: %s\n", fasta_filename);
printf("output file: %s\n", output_file);
}
- if(access (fasta_file, F_OK) == -1) {
- fprintf(stderr, "Error: could not find %s\n", fasta_file);
+ if(access (fasta_filename, F_OK) == -1) {
+ fprintf(stderr, "Error: could not find %s\n", fasta_filename);
exit(EXIT_FAILURE);
}
+ if(kmer == 0) {
+ fprintf(stderr, "Error: zero is not a valid kmer\n");
+ exit(EXIT_FAILURE);
+ }
+
if(strcmp(&output_file[strlen(output_file) - 3], ".gz") != 0) {
char *temp = malloc(strlen(output_file) + 4);
if(temp == NULL) {
@@ -115,74 +127,131 @@ int main(int argc, char **argv) {
printf("appending a .gz to our output file: %s\n", output_file);
}
- // 4 ^ Kmer gives us the width, or the number of permutations of ACTG with kmer length
- int width = pow(4, kmer);
- int sequences = count_sequences(fasta_file);
+ // 4 ^ Kmer gives us the width, or the number of permutations of ACTG with
+ // kmer length
+ long width = pow(4, kmer);
+ unsigned long sequences = count_sequences(fasta_filename);
if(sequences == 0) {
- fprintf(stderr, "Error: %s contains 0 fasta sequences\n", fasta_file);
+ fprintf(stderr, "Error: %s contains 0 fasta sequences\n", fasta_filename);
}
- if(verbose)
- printf("sequences: %d\nwidth: %d\n", sequences, width);
-
- // Allocate our matrix with the appropriate size, just one row
- double *trained_matrix = malloc(width*sizeof(double));
- if(trained_matrix == NULL) {
- fprintf(stderr, "Could not allocate enough memory\n");
- exit(EXIT_FAILURE);
- }
+ if(verbose) {
+ printf("sequences: %ld\nwidth: %ld\n", sequences, width);
+ printf("Writing our sensing matrix to %s\n", output_file);
+ }
- // call the probabilities-by-read command
- sprintf(probabilities_command, "generate_kmers %d | probabilities-by-read %d %s /dev/stdin", kmer, kmer, fasta_file);
- FILE *probabilities_output = popen(probabilities_command, "r");
- if(probabilities_output == NULL) {
- fprintf(stderr, "Error could not execute: %s\n", probabilities_command);
+ input = fopen(fasta_filename, "r" );
+ if(input == NULL) {
+ fprintf(stderr, "Error opening %s - %s\n", fasta_filename, strerror(errno));
exit(EXIT_FAILURE);
}
// open our output file
- output = gzopen(output_file, "w6");
+ output = gzopen(output_file, "w");
if(output == NULL) {
- fprintf(stderr, "Error: could not open output file, error code: %d", errno);
+ fprintf(stderr, "Error: could not open output file, error code: %s\n", strerror(errno));
exit(EXIT_FAILURE);
}
- if(verbose)
- printf("Writing our sensing matrix to %s\n", output_file);
-
- // read normalize and write our matrix in one go
- for(x = 0; x < sequences; x++) {
+ // create our header
+ gzprintf(output, "quikr\n");
+ gzprintf(output, "%ld\n", revision);
+ gzprintf(output, "%ld\n", sequences);
+ gzprintf(output, "%d\n", kmer);
- getline(&line, &len, probabilities_output);
-
- // Read our first element in outside of the loop
- val = strtok(line,"\t\n\r");
- trained_matrix[0] = atof(val);
- // iterate through and load the array
- for (y = 1; y < width; y++) {
- val = strtok (NULL, "\t\n\r");
- trained_matrix[y] = atof(val);
- }
-
- double row_sum = 0;
-
- for( y = 0; y < width; y++) {
- row_sum = row_sum + trained_matrix[y];
- }
-
- for( y = 0; y < width; y++) {
- trained_matrix[y] = trained_matrix[y] / row_sum;
- }
-
- for( y = 0; y < (width - 1); y++) {
- gzprintf(output, "%.10f\t", trained_matrix[y]);
- }
- gzprintf(output, "%.10f\n", trained_matrix[width]);
- }
+ // malloc our return array
+ unsigned long long * counts = malloc((width+ 1) * sizeof(unsigned long long));
+ if(counts == NULL) {
+ fprintf(stderr, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+
+ char *str = malloc(4096);
+ if(str == NULL) {
+ fprintf(stderr, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ unsigned long long str_size = 4096;
+
+ // seek the first character, and skip over it
+ fseek(input, 1, SEEK_CUR);
+ while ((read = getdelim(&line, &len, '>', input)) != -1) {
+
+ // find first whitespace
+ for(i = 0; i < read; i ++) {
+ if(line[i] == ' ' || line[i] == '\t' || line[i] == '\n')
+ break;
+ }
+
+ // write our header
+ gzprintf(output, ">%.*s\n", i, line);
+
+ // find our first \n, this should be the end of the header
+ char *start = strchr(line, '\n');
+ if(start == NULL)
+ continue;
+
+ size_t start_len = strlen(start);
+
+
+ // if our current str buffer isn't big enough, realloc
+ if(start_len + 1 > str_size + 1) {
+ str = realloc(str, start_len + 1);
+ if(str == NULL) {
+ exit(EXIT_FAILURE);
+ fprintf(stderr, strerror(errno));
+ }
+ }
+
+ // strip out all other newlines to handle multiline sequences
+ str = strnstrip(start, str, '\n',start_len);
+ size_t seq_length = strlen(str);
+
+ // relace A, C, G and T with 0, 1, 2, 3 respectively
+ // everything else is 5
+ for(i = 0; i < seq_length; i++) {
+ str[i] = alpha[(int)str[i]];
+ }
+
+ // set counts to zero
+ memset(counts, 0, width);
+ // loop through our string to process each k-mer
+ for(position = 0; position < (seq_length - kmer + 1); position++) {
+ unsigned long mer = 0;
+ unsigned long multiply = 1;
+
+ // for each char in the k-mer check if it is an error char
+ for(i = position + kmer - 1; i >= position; i--){
+ if(str[i] >> 2) {
+ mer = width;
+ position = i;
+ goto next;
+ }
+
+ // multiply this char in the mer by the multiply
+ // and bitshift the multiply for the next round
+ mer += str[i] * multiply;
+ multiply = multiply << 2;
+ }
+ // use this point to get mer of our loop
+ next:
+ // bump up the mer value in the counts array
+ counts[mer]++;
+ }
+
+ for(i = 0; i < width; i++) {
+ gzprintf(output, "%lld\n", counts[i]);
+ }
+
+ }
+
+ free(counts);
+ free(line);
- free(trained_matrix);
gzclose(output);
- pclose(probabilities_output);
+ fclose(input);
- return 0;
+ return EXIT_SUCCESS;
}