aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-04-29 19:30:17 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2014-04-29 19:30:17 -0400
commit958b167c741c2928021216fddadf24ecdabfbf61 (patch)
treee8a240a30bf635db5209e630b173e53a95d1cc96
parent65709a225fe0816be6d345e1401a8d67f3a0c9d1 (diff)
fix make clean; push changes to fix > bug
-rw-r--r--Makefile1
-rw-r--r--src/c/Makefile4
-rw-r--r--src/c/kmer_utils.c2
-rw-r--r--src/c/quikr_functions.c160
-rw-r--r--src/c/quikr_functions.h4
-rw-r--r--src/c/quikr_train.c2
6 files changed, 163 insertions, 10 deletions
diff --git a/Makefile b/Makefile
index d6f0df7..48564ca 100644
--- a/Makefile
+++ b/Makefile
@@ -33,5 +33,4 @@ install_python:
clean:
@echo "cleaning up"
@cd src/python; rm build -Rvf
- @cd src/nbc; make clean
@cd src/c; make clean
diff --git a/src/c/Makefile b/src/c/Makefile
index c63c1bc..4c62aa6 100644
--- a/src/c/Makefile
+++ b/src/c/Makefile
@@ -16,8 +16,8 @@ all: nnls.o kmer_utils.o quikr_functions.o quikr_train quikr multifasta_to_otu t
nnls.o: nnls.c
$(CC) -c nnls.c -o nnls.o $(CFLAGS)
-kmer_utils.o: kmer_utils.c
- $(CC) -c kmer_utils.c -o kmer_utils.o $(CFLAGS)
+kmer_utils.o: kmer_utils.c quikr_functions.o
+ $(CC) -c kmer_utils.c quikr_functions.o -o kmer_utils.o $(CFLAGS)
quikr_functions.o: quikr_functions.c
$(CC) -c quikr_functions.c -o quikr_functions.o $(CFLAGS)
multifasta_to_otu: kmer_utils.o nnls.o quikr_functions.o multifasta_to_otu.c
diff --git a/src/c/kmer_utils.c b/src/c/kmer_utils.c
index aa341f3..20a79f4 100644
--- a/src/c/kmer_utils.c
+++ b/src/c/kmer_utils.c
@@ -95,7 +95,7 @@ unsigned long long * get_kmer_counts_from_file(const char *fn, const unsigned in
unsigned long long str_size = 4096;
- while ((read = getdelim(&line, &len, '>', fh)) != -1) {
+ while ((read = getseq(&line, &len, fh)) != -1) {
// find our first \n, this should be the end of the header
char *start = strchr(line, '\n');
diff --git a/src/c/quikr_functions.c b/src/c/quikr_functions.c
index 5bbe913..82af029 100644
--- a/src/c/quikr_functions.c
+++ b/src/c/quikr_functions.c
@@ -6,11 +6,108 @@
#include <string.h>
#include <unistd.h>
#include <zlib.h>
+#include <assert.h>
#include "kmer_utils.h"
#include "quikr.h"
+/* getdelim.c --- Implementation of replacement getdelim function.
+ Copyright (C) 1994, 1996, 1997, 1998, 2001, 2003, 2005 Free
+ Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2, or (at
+ your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+/*
+ * getseq - read one '>'-delimited record from a stream (getdelim variant).
+ *
+ * Bytes are read from fp into *lineptr until a '>' is read after at least
+ * one '\n' has been seen in the current record, or until end of file.  A
+ * '>' that appears before the first newline is stored as ordinary data.
+ * The terminating '>' is kept in the buffer, and the buffer is
+ * NUL-terminated on every non-error return.
+ *
+ * lineptr  address of the buffer pointer; *lineptr may be NULL, in which
+ *          case a buffer is allocated.  The buffer is grown with realloc()
+ *          as needed and must be released by the caller with free().
+ * n        address of the buffer capacity; updated whenever the buffer is
+ *          allocated or grown.
+ * fp       stream to read from; locked for the duration of the call.
+ *
+ * Returns the number of bytes stored (not counting the NUL), or -1 when
+ * nothing could be read before end of file, when an argument is NULL
+ * (errno is set to EINVAL), or when memory could not be obtained.
+ */
+ssize_t getseq(char **lineptr, size_t *n, FILE *fp) {
+	/* ssize_t, not int: a record may be longer than INT_MAX bytes. */
+	ssize_t result = -1;
+	size_t cur_len = 0;
+	int seen_newline = 0;
+
+	if (lineptr == NULL || n == NULL || fp == NULL) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	flockfile(fp);
+
+	if (*lineptr == NULL || *n == 0) {
+		char *fresh = malloc(120);
+
+		if (fresh == NULL) {
+			goto unlock_return;
+		}
+		/* Publish the capacity only once the allocation succeeded. */
+		*lineptr = fresh;
+		*n = 120;
+	}
+
+	for (;;) {
+		int c = getc(fp);
+
+		if (c == EOF) {
+			break;
+		}
+
+		/* Keep room for this byte plus the final NUL. */
+		if (cur_len + 2 >= *n) {
+			size_t needed = 2 * (cur_len + 1) + 2; /* Be generous. */
+			char *grown;
+
+			/* Unsigned arithmetic wrapped around: the request is too big. */
+			if (needed < cur_len) {
+				errno = ENOMEM;
+				goto unlock_return;
+			}
+
+			/* Use a temporary so the old buffer survives a failed realloc. */
+			grown = realloc(*lineptr, needed);
+			if (grown == NULL) {
+				goto unlock_return;
+			}
+
+			*lineptr = grown;
+			*n = needed;
+		}
+
+		(*lineptr)[cur_len++] = (char) c;
+
+		if (c == '\n') {
+			seen_newline = 1;
+		} else if (c == '>' && seen_newline) {
+			/* Delimiter reached; it stays in the buffer, as with getdelim. */
+			break;
+		}
+	}
+
+	(*lineptr)[cur_len] = '\0';
+	if (cur_len > 0) {
+		result = (ssize_t) cur_len;
+	}
+
+ unlock_return:
+	funlockfile(fp);
+	return result;
+}
+
void check_malloc(void *ptr, char *error) {
if (ptr == NULL) {
if(error != NULL) {
@@ -124,6 +221,42 @@ unsigned long long count_sequences(const char *filename) {
}
+/*
+ * * Sally - A Tool for Embedding Strings in Vector Spaces
+ * * Copyright (C) 2010 Konrad Rieck (konrad@mlsec.org)
+ * * --
+ * * This program is free software; you can redistribute it and/or modify it
+ * * under the terms of the GNU General Public License as published by the
+ * * Free Software Foundation; either version 3 of the License, or (at your
+ * * option) any later version. This program is distributed without any
+ * * warranty. See the GNU General Public License for more details.
+ * */
+/*
+ * gzgetline - read one newline-terminated line from a gzip stream.
+ *
+ * The line, including its trailing '\n', is stored in *s and
+ * NUL-terminated.  *s may be NULL or a buffer filled by an earlier call;
+ * it is resized with realloc() in 256-byte steps and must be released by
+ * the caller with free().  *n is reset to 0 on entry and holds the line
+ * length on success.
+ *
+ * Returns the line length, which is always >= 1 because the '\n' is
+ * counted, or 0 when no complete line could be read: s or n is NULL, the
+ * stream is at end of file, gzgetc() reports EOF or an error before a
+ * newline is seen, or memory could not be obtained.
+ *
+ * Failure used to be reported as -1, which in a size_t is SIZE_MAX; the
+ * `read == 0` check in load_sensing_matrix could never see it and went on
+ * to size a buffer from the wrapped value.
+ */
+size_t gzgetline(char **s, size_t * n, gzFile f) {
+	int c = 0;
+
+	assert(f);
+	if (!s || !n)
+		return 0;
+
+	*n = 0;
+
+	if (gzeof(f))
+		return 0;
+
+	while (c != '\n') {
+		/* Grow at the start and every 256 bytes; +1 keeps room for the NUL. */
+		if (!*s || *n % 256 == 0) {
+			char *grown = realloc(*s, *n + 256 + 1);
+
+			/* On failure *s is left untouched so the caller can still free it. */
+			if (!grown)
+				return 0;
+			*s = grown;
+		}
+
+		c = gzgetc(f);
+		if (c == -1)
+			return 0;
+
+		(*s)[(*n)++] = (char) c;
+	}
+
+	(*s)[*n] = '\0';
+	return *n;
+}
+
struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kmer) {
char *line = NULL;
@@ -139,6 +272,7 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme
unsigned long long width = 0;
struct matrix *ret = NULL;
+ size_t lineno = 0;
gzFile fh = NULL;
@@ -153,6 +287,7 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme
// Check for quikr
line = gzgets(fh, line, 1024);
+ lineno++;
if(strcmp(line, "quikr\n") != 0) {
fprintf(stderr, "This does not look like a quikr sensing matrix. Please check your path: %s\n", filename);
exit(EXIT_FAILURE);
@@ -164,6 +299,7 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme
fprintf(stderr, "Sensing Matrix uses an unsupported version, please retrain your matrix\n");
exit(EXIT_FAILURE);
}
+ lineno++;
// get number of sequences
line = gzgets(fh, line, 1024);
@@ -172,6 +308,7 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme
fprintf(stderr, "Error parsing sensing matrix, sequence count is zero\n");
exit(EXIT_FAILURE);
}
+ lineno++;
// get kmer
gzgets(fh, line, 1024);
@@ -180,6 +317,7 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme
fprintf(stderr, "Error parsing sensing matrix, kmer is zero\n");
exit(EXIT_FAILURE);
}
+ lineno++;
if(kmer != target_kmer) {
fprintf(stderr, "The sensing_matrix was trained with a different kmer than your requested kmer\n");
@@ -198,18 +336,29 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme
headers = malloc(sequences * sizeof(char *));
check_malloc(headers, NULL);
+ char *buf = NULL;
+ size_t len = 0;
+ size_t read = 0;
for(i = 0; i < sequences; i++) {
unsigned long long j = 0;
// get header and add it to headers array
- char *header = malloc(256 * sizeof(char));
- check_malloc(header, NULL);
- gzgets(fh, header, 256);
+ //
+ read = gzgetline(&buf, &len, fh);
+ if(read == 0) {
+ fprintf(stderr, "Error parsing sensing matrix, could not read header\n");
+ exit(EXIT_FAILURE);
+ }
+
+ char *header = malloc(sizeof(char) * read + 1);
+ check_malloc(header, NULL);
+ header = strncpy(header, buf, read - 1);
if(header[0] != '>') {
fprintf(stderr, "Error parsing sensing matrix, could not read header\n");
exit(EXIT_FAILURE);
}
+ lineno++;
- header[strlen(header) - 1] = '\0';
+ header[read - 1] = '\0';
headers[i] = header+1;
row = memset(row, 0, (width) * sizeof(unsigned long long));
@@ -217,9 +366,10 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme
for(j = 0; j < width; j++) {
line = gzgets(fh, line, 32);
if(line == NULL || line[0] == '>') {
- fprintf(stderr, "Error parsing sensing matrix, line does not look like a value\n");
+ fprintf(stderr, "Error parsing sensing matrix, line %zu does not look like a value\n", lineno);
exit(EXIT_FAILURE);
}
+ lineno++;
row[j] = strtoull(line, NULL, 10);
if(errno) {
diff --git a/src/c/quikr_functions.h b/src/c/quikr_functions.h
index 978f614..f65edbf 100644
--- a/src/c/quikr_functions.h
+++ b/src/c/quikr_functions.h
@@ -12,3 +12,7 @@ struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kme
// get_rare_value
void get_rare_value(double *count_matrix, unsigned long long width, double rare_percent, unsigned long long *ret_rare_value, unsigned long long *ret_rare_width);
+
+
+// getdelim-style reader: returns one record ending at the first '>' read after a newline
+ssize_t getseq(char **lineptr, size_t *n, FILE *fp);
diff --git a/src/c/quikr_train.c b/src/c/quikr_train.c
index 6eabd4f..08a1f0a 100644
--- a/src/c/quikr_train.c
+++ b/src/c/quikr_train.c
@@ -178,7 +178,7 @@ int main(int argc, char **argv) {
// seek the first character, and skip over it
fseek(input, 1, SEEK_CUR);
- while ((read = getdelim(&line, &len, '>', input)) != -1) {
+ while ((read = getseq(&line, &len, input)) != -1) {
// find first whitespace
for(i = 0; i < read; i ++) {