diff options
| author | Calvin Morrison <mutantturkey@gmail.com> | 2013-09-10 23:43:56 -0400 | 
|---|---|---|
| committer | Calvin Morrison <mutantturkey@gmail.com> | 2013-09-10 23:43:56 -0400 | 
| commit | 5b53a787e4d1cefa5660c891791cd0df4a8fd89c (patch) | |
| tree | 005983f5d068409c9be44e614b4c1722cde37dd5 | |
Initial commit of some kmer utilities.
there are two utilties included.
one is kmer_frequency_per_sequence,
which outputs a (m x n) matrix where m is the sequence, and n is the
frequency of that nmer to occur in the given sequence.
the other tool is kmer_total_count, which counts kmers for the total
file, not just one sequence
| -rw-r--r-- | Makefile | 13 | ||||
| -rw-r--r-- | kmer_frequency_per_sequence.c | 58 | ||||
| -rw-r--r-- | kmer_total_count.c | 49 | ||||
| -rw-r--r-- | kmer_utils.c | 54 | ||||
| -rw-r--r-- | kmer_utils.h | 1 | 
5 files changed, 175 insertions, 0 deletions
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..df96037 --- /dev/null +++ b/Makefile @@ -0,0 +1,13 @@ +VERSION=\"v0.0.1\" +CC = gcc +CFLAGS = -O3 -s -mtune=native -Wall -lm -DVERSION=$(VERSION) + +all: kmer_total_count kmer_frequency_per_sequence + +kmer_total_count: +	$(CC) kmer_total_count.c kmer_utils.c -o kmer_count_total $(CFLAGS) +kmer_frequency_per_sequence: +	$(CC) kmer_frequency_per_sequence.c kmer_utils.c -o kmer_frequency_per_sequence $(CFLAGS) + +clean: +	rm -vf kmer_total_count kmer_frequency_per_sequence diff --git a/kmer_frequency_per_sequence.c b/kmer_frequency_per_sequence.c new file mode 100644 index 0000000..e0995f9 --- /dev/null +++ b/kmer_frequency_per_sequence.c @@ -0,0 +1,58 @@ +// Copyright 2013 Calvin Morrison +#include <stdio.h> +#include <math.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdint.h> + +#include "kmer_utils.h" + +int main(int argc, char **argv) { + +  char *line = NULL; +  long kmer = 6; +  size_t len = 0; +  ssize_t read; + +  if(argc != 3) { +    printf("Please supply a filename, and only a filename\n"); +    exit(EXIT_FAILURE); +  } + +  FILE *fh = fopen(argv[1], "r" ); +  if(fh == NULL) { +    fprintf(stderr, "Couldn't open: %s\n", argv[1]); +    exit(EXIT_FAILURE); +  } + + +  int width = (int)pow(4, kmer); +  while ((read = getline(&line, &len, fh)) != -1) { +    if(line[0] != '>')   { + +      unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long)); +      if(counts == NULL)  +        exit(EXIT_FAILURE); + +      unsigned int i = 0; +      for(i = 0; i < strlen(line) - kmer; i++) { +        counts[convert_kmer_to_index(&line[i], kmer, width)]++; +      } + +      unsigned long total = 0; +      for(i = 0; i < width; i++) +        total += counts[i]; + +      for(i = 0; i < width - 1; i++) +        printf("%.12f\t", (double)counts[i] / total); +      printf("%.12f\n", (double)counts[width - 1] / total); + +      free(counts); +    } +  } + +  free(line); + + +  return EXIT_SUCCESS; +} diff --git a/kmer_total_count.c b/kmer_total_count.c new file mode 100644 index 0000000..4d1ab87 --- /dev/null +++ b/kmer_total_count.c @@ -0,0 +1,49 @@ +// Copyright 2013 Calvin Morrison +#include <stdio.h> +#include <math.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdint.h> + +#include "kmer_utils.h" + +int main(int argc, char **argv) { + +  char *line = NULL; +  size_t len = 0; +  ssize_t read; +  unsigned int i = 0; + +  if(argc != 3) { +    printf("Please supply a filename, and only a filename\n"); +    exit(EXIT_FAILURE); +  } + +  FILE *fh = fopen(argv[1], "r" ); +  if(fh == NULL) { +    fprintf(stderr, "Couldn't open: %s\n", argv[1]); +    exit(EXIT_FAILURE); +  } +   +  int kmer = atoi(argv[2]); +  int width = (int)pow(4, kmer); + +  unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long)); +  if(counts == NULL)  +    exit(EXIT_FAILURE); + +  while ((read = getline(&line, &len, fh)) != -1) { +    if(line[0] != '>')   { + +      for(i = 0; i < strlen(line) - kmer; i++) { +        counts[convert_kmer_to_index(&line[i],kmer, width)]++; +      } +    } +  } + +  for(i = 0; i < width; i++) +    printf("%llu\n", counts[i]); + + +  return EXIT_SUCCESS; +} diff --git a/kmer_utils.c b/kmer_utils.c new file mode 100644 index 0000000..b6f66b5 --- /dev/null +++ b/kmer_utils.c @@ -0,0 +1,54 @@ +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +// This function takes a char array containing sequences and converts it +// into a kmer index (essentially a base 4 radix converstion to base 10, with +// some mumbo-jumbo of taking each of the characters we want (ACGT) and turning +// them into a radix-4 number we can use. +// +// kind of convoluted but it works. +// +// Arguemnts:  +//  char *str - a NULL terminated character array  +//  long kmer - how long of a index value you want to return +//  long error_pos - what index to return for a non ACGT character +// +long convert_kmer_to_index(char *str, long kmer, long error_pos) { + +  char **ptr = NULL; +  char vals[kmer]; +  long i = 0; +  long ret = 0; + +  for(i = 0; i < kmer; i++) { +    int val = 0; +    switch(str[i]) { +      case 'a': +      case 'A': +        val = 48; +        break; +      case 'c': +      case 'C': +        val = 49; +        break; +      case 'g': +      case 'G': +        val = 50; +        break; +      case 't': +      case 'T': +        val = 51; +        break; +      default: +        return error_pos;  +    } + + +    vals[i] = val;  +  } + +  ret = strtol(vals, ptr, 4); +  return ret; +} diff --git a/kmer_utils.h b/kmer_utils.h new file mode 100644 index 0000000..a93216e --- /dev/null +++ b/kmer_utils.h @@ -0,0 +1 @@ +long convert_kmer_to_index(char *str, long kmer, long error_pos);  | 
