aboutsummaryrefslogtreecommitdiff
path: root/kmer_utils.c
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2013-09-10 23:43:56 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2013-09-10 23:43:56 -0400
commit5b53a787e4d1cefa5660c891791cd0df4a8fd89c (patch)
tree005983f5d068409c9be44e614b4c1722cde37dd5 /kmer_utils.c
Initial commit of some kmer utilities.
There are two utilities included. One is kmer_frequency_per_sequence, which outputs an (m x n) matrix where each of the m rows corresponds to a sequence and each of the n columns holds the frequency with which that kmer occurs in the given sequence. The other tool is kmer_total_count, which counts kmers for the total file, not just one sequence.
Diffstat (limited to 'kmer_utils.c')
-rw-r--r--kmer_utils.c54
1 files changed, 54 insertions, 0 deletions
diff --git a/kmer_utils.c b/kmer_utils.c
new file mode 100644
index 0000000..b6f66b5
--- /dev/null
+++ b/kmer_utils.c
@@ -0,0 +1,54 @@
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

// Convert the first `kmer` characters of a nucleotide sequence into a kmer
// index.
//
// Each base is read as one radix-4 digit (A = 0, C = 1, G = 2, T = 3, upper or
// lower case), with str[0] as the most significant digit. For example
// "AC" -> 1, "CA" -> 4 and "TT" -> 15.
//
// Arguments:
// char *str - the sequence to read; it must either hold at least `kmer`
//             characters or be NUL terminated (the terminator is not an ACGT
//             character, so scanning stops there and error_pos is returned)
// long kmer - how many characters (radix-4 digits) to convert
// long error_pos - what index to return if any of those characters is not
//                  one of ACGT/acgt
//
// Returns the index, error_pos on a non ACGT character, 0 when kmer <= 0, and
// LONG_MAX when the index is too large to be represented in a long.
//
long convert_kmer_to_index(char *str, long kmer, long error_pos) {
	long ret = 0;
	int saturated = 0;

	for (long i = 0; i < kmer; i++) {
		long digit;

		switch (str[i]) {
		case 'a':
		case 'A':
			digit = 0;
			break;
		case 'c':
		case 'C':
			digit = 1;
			break;
		case 'g':
		case 'G':
			digit = 2;
			break;
		case 't':
		case 'T':
			digit = 3;
			break;
		default:
			return error_pos;
		}

		// Once the value has overflowed we only keep scanning so that a
		// later non ACGT character is still reported via error_pos.
		if (saturated)
			continue;

		// Check before multiplying: signed overflow is undefined behavior.
		if (ret > (LONG_MAX - digit) / 4) {
			ret = LONG_MAX;
			saturated = 1;
		} else {
			ret = ret * 4 + digit;
		}
	}

	return ret;
}