diff options
author | Calvin Morrison <mutantturkey@gmail.com> | 2014-05-06 12:58:09 -0400 |
---|---|---|
committer | Calvin Morrison <mutantturkey@gmail.com> | 2014-05-06 12:58:09 -0400 |
commit | f251dc46c40501704bfebf2d778577b1f1dc3484 (patch) | |
tree | f09e5c15a0b2ce33f7020310161d3f44cfe1c62f | |
parent | 151602f582746f8374a3f8bc4ea1ffdbcc4a513a (diff) | |
parent | 958b167c741c2928021216fddadf24ecdabfbf61 (diff) |
werks
-rw-r--r-- | Makefile | 1 | ||||
-rw-r--r-- | src/c/Makefile | 4 | ||||
-rw-r--r-- | src/c/kmer_utils.c | 2 | ||||
-rw-r--r-- | src/c/quikr_functions.c | 160 | ||||
-rw-r--r-- | src/c/quikr_functions.h | 4 | ||||
-rw-r--r-- | src/c/quikr_train.c | 2 |
6 files changed, 161 insertions, 12 deletions
@@ -33,5 +33,4 @@ install_python: clean: @echo "cleaning up" @cd src/python; rm build -Rvf - @cd src/nbc; make clean @cd src/c; make clean diff --git a/src/c/Makefile b/src/c/Makefile index c63c1bc..4c62aa6 100644 --- a/src/c/Makefile +++ b/src/c/Makefile @@ -16,8 +16,8 @@ all: nnls.o kmer_utils.o quikr_functions.o quikr_train quikr multifasta_to_otu t nnls.o: nnls.c $(CC) -c nnls.c -o nnls.o $(CFLAGS) -kmer_utils.o: kmer_utils.c - $(CC) -c kmer_utils.c -o kmer_utils.o $(CFLAGS) +kmer_utils.o: kmer_utils.c quikr_functions.o + $(CC) -c kmer_utils.c quikr_functions.o -o kmer_utils.o $(CFLAGS) quikr_functions.o: quikr_functions.c $(CC) -c quikr_functions.c -o quikr_functions.o $(CFLAGS) multifasta_to_otu: kmer_utils.o nnls.o quikr_functions.o multifasta_to_otu.c diff --git a/src/c/kmer_utils.c b/src/c/kmer_utils.c index 1ca522a..b85dbda 100644 --- a/src/c/kmer_utils.c +++ b/src/c/kmer_utils.c @@ -95,7 +95,7 @@ unsigned long long * get_kmer_counts_from_file(const char *fn, const unsigned in unsigned long long str_size = 4096; - while ((read = getdelim(&line, &len, '>', fh)) != -1) { + while ((read = getseq(&line, &len, fh)) != -1) { // find our first \n, this should be the end of the header char *start = strchr(line, '\n'); diff --git a/src/c/quikr_functions.c b/src/c/quikr_functions.c index f690549..5715024 100644 --- a/src/c/quikr_functions.c +++ b/src/c/quikr_functions.c @@ -6,11 +6,106 @@ #include <string.h> #include <unistd.h> #include <zlib.h> +#include <assert.h> #include "kmer_utils.h" #include "quikr.h" +/* getdelim.c --- Implementation of replacement getdelim function. + Copyright (C) 1994, 1996, 1997, 1998, 2001, 2003, 2005 Free + Software Foundation, Inc. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2, or (at + your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. */ + +ssize_t getseq(char **lineptr, size_t *n, FILE *fp) { + int result = 0; + ssize_t cur_len = 0; + + if (lineptr == NULL || n == NULL || fp == NULL) + { + errno = EINVAL; + return -1; + } + + flockfile (fp); + + if (*lineptr == NULL || *n == 0) + { + *n = 120; + *lineptr = (char *) malloc (*n); + if (*lineptr == NULL) + { + result = -1; + goto unlock_return; + } + } + + int newline = 0; + for (;;) { + int i; + + i = getc (fp); + if (i == EOF) + { + result = -1; + break; + } + + /* Make enough space for len+1 (for final NUL) bytes. */ + if (cur_len + 2 >= *n) + { + size_t needed = 2 * (cur_len + 1) + 2; /* Be generous. */ + char *new_lineptr; + + if (needed < cur_len) + { + result = -1; + goto unlock_return; + } + + new_lineptr = (char *) realloc (*lineptr, needed); + if (new_lineptr == NULL){ + result = -1; + goto unlock_return; + } + + *lineptr = new_lineptr; + *n = needed; + } + + (*lineptr)[cur_len] = i; + cur_len++; + + if (i == '\n') { + newline = 1; + } + + if(i == '>' && newline == 1) { + break; + } + } + (*lineptr)[cur_len] = '\0'; + result = cur_len ? cur_len : result; + + unlock_return: + funlockfile (fp); + return result; +} + void check_malloc(void *ptr, char *error) { if (ptr == NULL) { if(error != NULL) { @@ -124,6 +219,42 @@ unsigned long long count_sequences(const char *filename) { } +/* + * * Sally - A Tool for Embedding Strings in Vector Spaces + * * Copyright (C) 2010 Konrad Rieck (konrad@mlsec.org) + * * -- + * * This program is free software; you can redistribute it and/or modify it + * * under the terms of the GNU General Public License as published by the + * * Free Software Foundation; either version 3 of the License, or (at your + * * option) any later version. This program is distributed without any + * * warranty. See the GNU General Public License for more details. + * */ +size_t gzgetline(char **s, size_t * n, gzFile f) { + assert(f); + int c = 0; + *n = 0; + + if (gzeof(f)) + return -1; + + while (c != '\n') { + if (!*s || *n % 256 == 0) { + *s = realloc(*s, *n + 256 + 1); + if (!*s) + return -1; + } + + c = gzgetc(f); + if (c == -1) + return -1; + + (*s)[(*n)++] = c; + } + + (*s)[*n] = 0; + return *n; +} + struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kmer) { char *line = NULL; @@ -139,6 +270,7 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme unsigned long long width = 0; struct matrix *ret = NULL; + size_t lineno = 0; gzFile fh = NULL; @@ -153,6 +285,7 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme // Check for quikr line = gzgets(fh, line, 1024); + lineno++; if(strcmp(line, "quikr\n") != 0) { fprintf(stderr, "This does not look like a quikr sensing matrix. Please check your path: %s\n", filename); exit(EXIT_FAILURE); @@ -164,6 +297,7 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme fprintf(stderr, "Sensing Matrix uses an unsupported version, please retrain your matrix\n"); exit(EXIT_FAILURE); } + lineno++; // get number of sequences line = gzgets(fh, line, 1024); @@ -172,6 +306,7 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme fprintf(stderr, "Error parsing sensing matrix, sequence count is zero\n"); exit(EXIT_FAILURE); } + lineno++; // get kmer gzgets(fh, line, 1024); @@ -180,6 +315,7 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme fprintf(stderr, "Error parsing sensing matrix, kmer is zero\n"); exit(EXIT_FAILURE); } + lineno++; if(kmer != target_kmer) { fprintf(stderr, "The sensing_matrix was trained with a different kmer than your requested kmer\n"); @@ -198,20 +334,29 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme headers = malloc(sequences * sizeof(char *)); check_malloc(headers, NULL); - unsigned long long lineno = 0; + char *buf = NULL; + size_t len = 0; + size_t read = 0; for(i = 0; i < sequences; i++) { unsigned long long j = 0; // get header and add it to headers array - char *header = malloc(512 * sizeof(char)); - check_malloc(header, NULL); - gzgets(fh, header, 512); - lineno++; + // + read = gzgetline(&buf, &len, fh); + if(read == 0) { + fprintf(stderr, "Error parsing sensing matrix, could not read header\n"); + exit(EXIT_FAILURE); + } + + char *header = malloc(sizeof(char) * read + 1); + check_malloc(header, NULL); + header = strncpy(header, buf, read - 1); if(header[0] != '>') { fprintf(stderr, "Error parsing sensing matrix, could not read header in line %llu\n", lineno); exit(EXIT_FAILURE); } + lineno++; - header[strlen(header) - 1] = '\0'; + header[read - 1] = '\0'; headers[i] = header+1; row = memset(row, 0, (width) * sizeof(unsigned long long)); @@ -220,9 +365,10 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme line = gzgets(fh, line, 32); lineno++; if(line == NULL || line[0] == '>') { - fprintf(stderr, "Error parsing sensing matrix, line does not look like a value in line %llu\n", lineno); + fprintf(stderr, "Error parsing sensing matrix, line %zu does not look like a value\n", lineno); exit(EXIT_FAILURE); } + lineno++; row[j] = strtoull(line, NULL, 10); if(errno) { diff --git a/src/c/quikr_functions.h b/src/c/quikr_functions.h index 978f614..f65edbf 100644 --- a/src/c/quikr_functions.h +++ b/src/c/quikr_functions.h @@ -12,3 +12,7 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme // get_rare_value void get_rare_value(double *count_matrix, unsigned long long width, double rare_percent, unsigned long long *ret_rare_value, unsigned long long *ret_rare_width); + + +// getline reimpl +ssize_t getseq(char **lineptr, size_t *n, FILE *fp); diff --git a/src/c/quikr_train.c b/src/c/quikr_train.c index 6eabd4f..08a1f0a 100644 --- a/src/c/quikr_train.c +++ b/src/c/quikr_train.c @@ -178,7 +178,7 @@ int main(int argc, char **argv) { // seek the first character, and skip over it fseek(input, 1, SEEK_CUR); - while ((read = getdelim(&line, &len, '>', input)) != -1) { + while ((read = getseq(&line, &len, input)) != -1) { // find first whitespace for(i = 0; i < read; i ++) { |