aboutsummaryrefslogtreecommitdiff
path: root/kmer_counts_per_sequence.c
blob: ac7ff44c4a75be6d51b5317956f6dbcfa40d2146 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
// Copyright 2013 Calvin Morrison
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

#include "kmer_utils.h"

unsigned long position = 0;
int main(int argc, char **argv) {

	// getdelim variables
	char *line = NULL;
	size_t len = 0;
	ssize_t read;

	// specific mer variables
	bool specific_mers = false;
	size_t num_desired_indicies = 0;
	size_t *desired_indicies = NULL;


	if(argc < 3) {
		printf("Please supply a filename and a kmer\n");
		exit(EXIT_FAILURE);
	}

	unsigned long kmer = atoi(argv[2]);
	if(kmer == 0) { 
		fprintf(stderr, "Error: invalid kmer.\n");
		exit(EXIT_FAILURE);
	}

	const unsigned long long width = (unsigned long long)1 << (kmer * 2);

	if(argc == 4) {
		specific_mers = true;
		desired_indicies = malloc((width) * sizeof(size_t));
		if(desired_indicies == NULL) 
			exit(EXIT_FAILURE);
		num_desired_indicies = load_specific_mers_from_file(argv[3], kmer, width, desired_indicies);

	}

	FILE *fh = fopen(argv[1], "r" );
	if(fh == NULL) {
		fprintf(stderr, "Error opening %s - %s\n", argv[1], strerror(errno));
		exit(EXIT_FAILURE);
	}
	unsigned long long *counts = malloc((width+ 1) * sizeof(unsigned long long));
	if(counts == NULL) 
		exit(EXIT_FAILURE);

	char *str = malloc(BUFSIZ);
	if(str == NULL) { 
		fprintf(stderr, strerror(errno));
		exit(EXIT_FAILURE);
	}

	unsigned long long str_size = BUFSIZ;
	unsigned long long sequence = 0;
	while ((read = getdelim(&line, &len, '>', fh)) != -1) {
		size_t i = 0;

		memset(counts, 0, width * sizeof(unsigned long long));

		// find our first \n, this should be the end of the header
		char *start = strchr(line, '\n');	
		if(start == NULL) 
			continue;

		// point to one past that.
		start = start + 1;

		size_t start_len = strlen(start);


		// if our current str buffer isn't big enough, realloc
		if(start_len + 1 > str_size + 1) { 
			str = realloc(str, start_len + 1);
			if(str == NULL) { 
				exit(EXIT_FAILURE);
				fprintf(stderr, strerror(errno));
			}
		}


		// strip out all other newlines to handle multiline sequences
		str = strnstrip(start, str, '\n',start_len);
		size_t seq_length = strlen(str);

		// reset our count matrix to zero

		for(i = 0; i < seq_length; i++) {
			str[i] = alpha[(int)str[i]];
		}

		for(i = 0; i < (seq_length - kmer + 1); i++) {
			size_t mer = num_to_index(&str[i],kmer, width, &i);
			counts[mer]++;
		}
		

		if(specific_mers) {
			for(i = 0; i < num_desired_indicies; i++)
				printf("%llu\t%zu\t%llu\n", sequence, desired_indicies[i], counts[desired_indicies[i]]);
			sequence++;
		} 
		else {
			for(i = 0; i < width - 1; i++)
				printf("%llu\t", counts[i]);
			printf("%llu\n", counts[width - 1]);
		}
	}

free(counts);
free(line);


return EXIT_SUCCESS;
}