From 5b53a787e4d1cefa5660c891791cd0df4a8fd89c Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Tue, 10 Sep 2013 23:43:56 -0400 Subject: Initial commit of some kmer utilities. there are two utilties included. one is kmer_frequency_per_sequence, which outputs a (m x n) matrix where m is the sequence, and n is the frequency of that nmer to occur in the given sequence. the other tool is kmer_total_count, which counts kmers for the total file, not just one sequence --- Makefile | 13 ++++++++++ kmer_frequency_per_sequence.c | 58 +++++++++++++++++++++++++++++++++++++++++++ kmer_total_count.c | 49 ++++++++++++++++++++++++++++++++++++ kmer_utils.c | 54 ++++++++++++++++++++++++++++++++++++++++ kmer_utils.h | 1 + 5 files changed, 175 insertions(+) create mode 100644 Makefile create mode 100644 kmer_frequency_per_sequence.c create mode 100644 kmer_total_count.c create mode 100644 kmer_utils.c create mode 100644 kmer_utils.h diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..df96037 --- /dev/null +++ b/Makefile @@ -0,0 +1,13 @@ +VERSION=\"v0.0.1\" +CC = gcc +CFLAGS = -O3 -s -mtune=native -Wall -lm -DVERSION=$(VERSION) + +all: kmer_total_count kmer_frequency_per_sequence + +kmer_total_count: + $(CC) kmer_total_count.c kmer_utils.c -o kmer_count_total $(CFLAGS) +kmer_frequency_per_sequence: + $(CC) kmer_frequency_per_sequence.c kmer_utils.c -o kmer_frequency_per_sequence $(CFLAGS) + +clean: + rm -vf kmer_total_count kmer_frequency_per_sequence diff --git a/kmer_frequency_per_sequence.c b/kmer_frequency_per_sequence.c new file mode 100644 index 0000000..e0995f9 --- /dev/null +++ b/kmer_frequency_per_sequence.c @@ -0,0 +1,58 @@ +// Copyright 2013 Calvin Morrison +#include +#include +#include +#include +#include + +#include "kmer_utils.h" + +int main(int argc, char **argv) { + + char *line = NULL; + long kmer = 6; + size_t len = 0; + ssize_t read; + + if(argc != 3) { + printf("Please supply a filename, and only a filename\n"); + exit(EXIT_FAILURE); + } + + FILE *fh = fopen(argv[1], "r" ); + if(fh == NULL) { + fprintf(stderr, "Couldn't open: %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + + int width = (int)pow(4, kmer); + while ((read = getline(&line, &len, fh)) != -1) { + if(line[0] != '>') { + + unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long)); + if(counts == NULL) + exit(EXIT_FAILURE); + + unsigned int i = 0; + for(i = 0; i < strlen(line) - kmer; i++) { + counts[convert_kmer_to_index(&line[i], kmer, width)]++; + } + + unsigned long total = 0; + for(i = 0; i < width; i++) + total += counts[i]; + + for(i = 0; i < width - 1; i++) + printf("%.12f\t", (double)counts[i] / total); + printf("%.12f\n", (double)counts[width - 1] / total); + + free(counts); + } + } + + free(line); + + + return EXIT_SUCCESS; +} diff --git a/kmer_total_count.c b/kmer_total_count.c new file mode 100644 index 0000000..4d1ab87 --- /dev/null +++ b/kmer_total_count.c @@ -0,0 +1,49 @@ +// Copyright 2013 Calvin Morrison +#include +#include +#include +#include +#include + +#include "kmer_utils.h" + +int main(int argc, char **argv) { + + char *line = NULL; + size_t len = 0; + ssize_t read; + unsigned int i = 0; + + if(argc != 3) { + printf("Please supply a filename, and only a filename\n"); + exit(EXIT_FAILURE); + } + + FILE *fh = fopen(argv[1], "r" ); + if(fh == NULL) { + fprintf(stderr, "Couldn't open: %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + int kmer = atoi(argv[2]); + int width = (int)pow(4, kmer); + + unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long)); + if(counts == NULL) + exit(EXIT_FAILURE); + + while ((read = getline(&line, &len, fh)) != -1) { + if(line[0] != '>') { + + for(i = 0; i < strlen(line) - kmer; i++) { + counts[convert_kmer_to_index(&line[i],kmer, width)]++; + } + } + } + + for(i = 0; i < width; i++) + printf("%llu\n", counts[i]); + + + return EXIT_SUCCESS; +} diff --git a/kmer_utils.c b/kmer_utils.c new file mode 100644 index 0000000..b6f66b5 --- /dev/null +++ b/kmer_utils.c @@ -0,0 +1,54 @@ +#include +#include +#include +#include + +// This function takes a char array containing sequences and converts it +// into a kmer index (essentially a base 4 radix converstion to base 10, with +// some mumbo-jumbo of taking each of the characters we want (ACGT) and turning +// them into a radix-4 number we can use. +// +// kind of convoluted but it works. +// +// Arguemnts: +// char *str - a NULL terminated character array +// long kmer - how long of a index value you want to return +// long error_pos - what index to return for a non ACGT character +// +long convert_kmer_to_index(char *str, long kmer, long error_pos) { + + char **ptr = NULL; + char vals[kmer]; + long i = 0; + long ret = 0; + + for(i = 0; i < kmer; i++) { + int val = 0; + switch(str[i]) { + case 'a': + case 'A': + val = 48; + break; + case 'c': + case 'C': + val = 49; + break; + case 'g': + case 'G': + val = 50; + break; + case 't': + case 'T': + val = 51; + break; + default: + return error_pos; + } + + + vals[i] = val; + } + + ret = strtol(vals, ptr, 4); + return ret; +} diff --git a/kmer_utils.h b/kmer_utils.h new file mode 100644 index 0000000..a93216e --- /dev/null +++ b/kmer_utils.h @@ -0,0 +1 @@ +long convert_kmer_to_index(char *str, long kmer, long error_pos); -- cgit v1.2.3