diff options
Diffstat (limited to 'src/c/multifasta_to_otu.c')
-rw-r--r-- | src/c/multifasta_to_otu.c | 373 |
1 files changed, 230 insertions, 143 deletions
diff --git a/src/c/multifasta_to_otu.c b/src/c/multifasta_to_otu.c index 1abe787..e060fcf 100644 --- a/src/c/multifasta_to_otu.c +++ b/src/c/multifasta_to_otu.c @@ -23,28 +23,86 @@ #define USAGE "Usage:\n\tmultifasta_to_otu [OPTION...] - create a QIIME OTU table based on Quikr results. \n\nOptions:\n\n-i, --input-directory\n\tthe directory containing the samples' fasta files of reads (note each file should correspond to a separate sample)\n\n-s, --sensing-matrix\n\t location of the sensing matrix. (sensing from quikr_train)\n\n-k, --kmer\n\tspecify what size of kmer to use. (default value is 6)\n\n-l, --lambda\n\tlambda value to use. (default value is 10000)\n\n-j, --jobs\n\t specifies how many jobs to run at once. (default value is the number of CPUs)\n\n-o, --output\n\tthe OTU table, with NUM_READS_PRESENT for each sample which is compatible with QIIME's convert_biom.py (or a sequence table if not OTU's)\n\n-v, --verbose\n\tverbose mode.\n\n-V, --version\n\tprint version." static int cmp (const void * a, const void * b) { - return ( *(double*)a - *(double*)b ); - } + return ( *(double*)a - *(double*)b ); +} + +char **get_fasta_files_from_file(char *fn) { + char **files; + int files_count = 0; + + // getline stuff + ssize_t read; + size_t len = 0; + char *line = NULL; + + FILE *fh = fopen(fn, "r"); + if(fh == NULL) { + fprintf(stderr, "Error opening %s - %s\n", fn, strerror(errno)); + exit(EXIT_FAILURE); + } + + files = malloc(sizeof(char **)); + + while ((read = getline(&line, &len, fh)) != -1) { + char *file = malloc(sizeof(char) * (strlen(line))); + if(file == NULL) { + exit(EXIT_FAILURE); + } + strncpy(file, line, strlen(line) + 1 ); + printf("%zu\n", strlen(line)); + printf("%zu\n", strlen(file)); + file[strlen(file)- 1] = '\0'; + printf(">%s< %zu >%s< %zu\n", line, strlen(line), file, strlen(file)); + + if(access(file, F_OK) == 0) { + + files[files_count] = file; + files_count++; + + files = realloc(files, sizeof(char *) * (files_count + 1)); + if(files == NULL) { + fprintf(stderr, "could not realloc keys\n"); + exit(EXIT_FAILURE); + } + } + else { + fprintf(stderr, "Warning: ignoring %s (%s)\n", file, strerror(errno)); + errno = 0; + free(file); + } + } -char **get_fasta_files(char *directory) { + files_count++; + files = realloc(files, sizeof(char *) * files_count); + if(files == NULL) { + fprintf(stderr, "could not realloc keys\n"); + exit(EXIT_FAILURE); + } + files[files_count] = NULL; + + fclose(fh); + return files; +} + +char **get_fasta_files_from_directory(char *directory) { - DIR *dh; - struct dirent *e; + DIR *dh; + struct dirent *e; char **headers; long long count = -2; // -2 for ../ and ./ long long i = 0; // open our directory - dh = opendir(directory); - if(dh == NULL) { - fprintf(stderr, "could not open %s\n", directory); - exit(EXIT_FAILURE); - } + dh = opendir(directory); + if(dh == NULL) { + fprintf(stderr, "could not open %s\n", directory); + exit(EXIT_FAILURE); + } while((e = readdir(dh))) - count++; - + count++; + e = NULL; rewinddir(dh); @@ -61,7 +119,7 @@ char **get_fasta_files(char *directory) { for(i = 0; i < count; i++) { char *ext = NULL; e = readdir(dh); - + if(strcmp(e->d_name, "..") == 0 || strcmp(e->d_name, ".") == 0) { i--; continue; @@ -70,12 +128,13 @@ char **get_fasta_files(char *directory) { ext = strrchr(e->d_name, '.'); if(str_eq(ext, ".fasta") || - str_eq(ext, ".fa") || - str_eq(ext, ".fna")) + str_eq(ext, ".fa") || + str_eq(ext, ".fna")) { - char *header = malloc(strlen(e->d_name) + 1); + + char *header = malloc(strlen(directory) + strlen(e->d_name) + 1); check_malloc(header, NULL); - strcpy(header, e->d_name); + sprintf(header, "%s/%s", directory, e->d_name); headers[array_pos] = header; } else { @@ -90,151 +149,181 @@ char **get_fasta_files(char *directory) { closedir(dh); return headers; } + + int main(int argc, char **argv) { - int c; + int c; - char *input_fasta_directory = NULL; - char *sensing_matrix_filename = NULL; - char *output_filename = NULL; + char *input_fasta_directory = NULL; + char *input_fasta_filelist = NULL; + char *sensing_matrix_filename = NULL; + char *output_filename = NULL; - unsigned long long i = 0; - unsigned long long j = 0; + unsigned long long i = 0; + unsigned long long j = 0; - unsigned long long width = 0; + unsigned long long width = 0; - unsigned int kmer = 6; - unsigned long long lambda = 10000; + unsigned int kmer = 6; + unsigned long long lambda = 10000; - double rare_percent = 1.0; + double rare_percent = 1.0; - unsigned int jobs = 1; + unsigned int jobs = 1; long done = 0; unsigned long long dir_count = 0; - #ifdef Linux - jobs = get_nprocs(); - #endif - #ifdef Darwin - jobs = sysconf (_SC_NPROCESSORS_ONLN); - #endif - - int verbose = 0; - - while (1) { - static struct option long_options[] = { - {"input-directory", required_argument, 0, 'i'}, - {"kmer", required_argument, 0, 'k'}, - {"lambda", required_argument, 0, 'l'}, - {"jobs", required_argument, 0, 'j'}, - {"output", required_argument, 0, 'o'}, - {"sensing-matrix", required_argument, 0, 's'}, - {"rare-percent", required_argument, 0, 'r'}, - {"verbose", no_argument, 0, 'v'}, - {"help", no_argument, 0, 'h'}, - {"version", no_argument, 0, 'V'}, - {0, 0, 0, 0} - }; - int option_index = 0; - - c = getopt_long (argc, argv, "k:l:s:i:o:j:r:hvV", long_options, &option_index); - - if (c == -1) - break; - - switch (c) { - case 'i': - input_fasta_directory = optarg; - break; - case 'j': - jobs = atoi(optarg); - break; - case 'k': - kmer = atoi(optarg); - break; - case 'l': - lambda = atoi(optarg); - break; - case 'o': - output_filename = optarg; - break; - case 'r': - rare_percent = atof(optarg); - break; - case 's': - sensing_matrix_filename = optarg; - break; - case 'v': - verbose = 1; - break; - case 'V': - printf("%s\n", VERSION); - exit(EXIT_SUCCESS); - break; - case 'h': - puts(USAGE); - exit(EXIT_SUCCESS); - break; - default: - break; - } - } + #ifdef Linux + jobs = get_nprocs(); + #endif - if(sensing_matrix_filename == NULL) { - fprintf(stderr, "Error: sensing matrix filename (-s) must be specified\n\n"); - fprintf(stderr, "%s\n", USAGE); - exit(EXIT_FAILURE); - } - if(output_filename == NULL) { - fprintf(stderr, "Error: output filename (-o) must be specified\n\n"); - fprintf(stderr, "%s\n", USAGE); - exit(EXIT_FAILURE); - } - if(input_fasta_directory == NULL) { - fprintf(stderr, "Error: input fasta directory (-i) must be specified\n\n"); - fprintf(stderr, "%s\n", USAGE); - exit(EXIT_FAILURE); - } + #ifdef Darwin + jobs = sysconf (_SC_NPROCESSORS_ONLN); + #endif + + int verbose = 0; + + static struct option long_options[] = { + {"input-directory", required_argument, 0, 'i'}, + {"input-filelist", required_argument, 0, 'f'}, + {"kmer", required_argument, 0, 'k'}, + {"lambda", required_argument, 0, 'l'}, + {"jobs", required_argument, 0, 'j'}, + {"output", required_argument, 0, 'o'}, + {"sensing-matrix", required_argument, 0, 's'}, + {"rare-percent", required_argument, 0, 'r'}, + {"verbose", no_argument, 0, 'v'}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + {0, 0, 0, 0} + }; + + while (1) { + int option_index = 0; + + c = getopt_long (argc, argv, "f:k:l:s:i:o:j:r:hvV", long_options, &option_index); + + if (c == -1) + break; + + switch (c) { + case 'i': + input_fasta_directory = optarg; + break; + case 'f': + input_fasta_filelist = optarg; + break; + case 'j': + jobs = atoi(optarg); + break; + case 'k': + kmer = atoi(optarg); + break; + case 'l': + lambda = atoi(optarg); + break; + case 'o': + output_filename = optarg; + break; + case 'r': + rare_percent = atof(optarg); + break; + case 's': + sensing_matrix_filename = optarg; + break; + case 'v': + verbose = 1; + break; + case 'V': + printf("%s\n", VERSION); + exit(EXIT_SUCCESS); + break; + case 'h': + puts(USAGE); + exit(EXIT_SUCCESS); + break; + default: + break; + } + } + + if(sensing_matrix_filename == NULL) { + fprintf(stderr, "Error: sensing matrix filename (-s) must be specified\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } + + if(output_filename == NULL) { + fprintf(stderr, "Error: output filename (-o) must be specified\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } + + // input fasta parsing + if(input_fasta_directory == NULL && input_fasta_filelist == NULL) { + fprintf(stderr, "Error: input fasta directory (-i) or input fasta filelist (-f) must be specified\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } + + if(input_fasta_directory != NULL && input_fasta_filelist != NULL) { + fprintf(stderr, "Error: input fasta directory (-i) and input fasta filelist (-f) cannot be used concurrently\n\n"); + fprintf(stderr, "%s\n", USAGE); + exit(EXIT_FAILURE); + } if(rare_percent <= 0 || rare_percent > 1.0) { - fprintf(stderr, "Error: rare percent must be between 0 and 1\n"); - exit(EXIT_FAILURE); + fprintf(stderr, "Error: rare percent must be between 0 and 1\n"); + exit(EXIT_FAILURE); } - if(verbose) { - printf("kmer: %u\n", kmer); - printf("lambda: %llu\n", lambda); - printf("input directory: %s\n", input_fasta_directory); - printf("sensing database: %s\n", sensing_matrix_filename); - printf("output: %s\n", output_filename); - printf("number of jobs to run at once: %d\n", jobs); - } - if(access (sensing_matrix_filename, F_OK) == -1) { - fprintf(stderr, "Error: could not find %s\n", sensing_matrix_filename); - exit(EXIT_FAILURE); - } + if(verbose) { + printf("kmer: %u\n", kmer); + printf("lambda: %llu\n", lambda); + printf("input directory: %s\n", input_fasta_directory); + printf("input filelist: %s\n", input_fasta_filelist); + printf("sensing database: %s\n", sensing_matrix_filename); + printf("output: %s\n", output_filename); + printf("number of jobs to run at once: %d\n", jobs); + } + + if(access (sensing_matrix_filename, F_OK) == -1) { + fprintf(stderr, "Error: could not find %s\n", sensing_matrix_filename); + exit(EXIT_FAILURE); + } if(kmer == 0) { - fprintf(stderr, "Error: zero is not a valid kmer\n"); - exit(EXIT_FAILURE); + fprintf(stderr, "Error: zero is not a valid kmer\n"); + exit(EXIT_FAILURE); } - char **filenames = get_fasta_files(input_fasta_directory); + // load filenames + char **filenames = NULL; + if(input_fasta_directory != NULL) + filenames = get_fasta_files_from_directory(input_fasta_directory); + else + filenames = get_fasta_files_from_file(input_fasta_filelist); + while(filenames[dir_count] != NULL) dir_count++; + + if(dir_count == 0) { + fprintf(stderr, "Error: No files loaded from input\n"); + exit(EXIT_FAILURE); + } + // 4 "ACGT" ^ Kmer gives us the size of output rows + width = pow(4, kmer); - // 4 "ACGT" ^ Kmer gives us the size of output rows - width = pow(4, kmer); - - struct matrix *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, kmer); + struct matrix *sensing_matrix = load_sensing_matrix(sensing_matrix_filename, kmer); double *sensing_matrix_ptr = sensing_matrix->matrix; unsigned long long sequences = sensing_matrix->sequences; - if(verbose) { - printf("directory count: %llu\n", dir_count); - printf("width: %llu\n", width); + if(verbose) { + printf("directory count: %llu\n", dir_count); + printf("width: %llu\n", width); printf("sequences: %llu\n", sequences); } @@ -245,9 +334,12 @@ int main(int argc, char **argv) { check_malloc(file_sequence_count, NULL); #ifdef OMP - omp_set_num_threads(jobs); + omp_set_num_threads(jobs); #endif + + printf("Beginning to process samples\n"); + #pragma omp parallel for shared(solutions, sensing_matrix_ptr, width, done, sequences) for(size_t i = 0; i < dir_count; i++ ) { @@ -261,12 +353,8 @@ int main(int argc, char **argv) { double rare_percent = 1.0; - char *full_filename = malloc(strlen(input_fasta_directory) + strlen(filenames[i]) + 2); - check_malloc(full_filename, NULL); - - sprintf(full_filename, "%s/%s", input_fasta_directory, filenames[i]); - printf("processing %s\n", full_filename); - file_sequence_count = count_sequences(full_filename); + printf("processing %s\n", filenames[i]); + file_sequence_count = count_sequences(filenames[i]); // load counts matrix double *count_matrix = malloc(width * sizeof(double)); @@ -276,7 +364,7 @@ int main(int argc, char **argv) { // convert our matrix into doubles { - unsigned long long *integer_counts = get_kmer_counts_from_file(full_filename, kmer); + unsigned long long *integer_counts = get_kmer_counts_from_file(filenames[i], kmer); for(x = 0; x < width; x++) { sorted_count_matrix[x] = count_matrix[x] = (double)integer_counts[x]; @@ -364,7 +452,6 @@ int main(int argc, char **argv) { done++; printf("%ld/%llu samples processed\n", done, dir_count); free(solution); - free(full_filename); free(count_matrix_rare); free(count_matrix); free(sensing_matrix_rare); |