From 872e0ad9563c6d53201eb337458d108dd57d9e94 Mon Sep 17 00:00:00 2001
From: Calvin Morrison
Date: Fri, 21 Mar 2014 14:23:12 -0400
Subject: add more tools

---
 Makefile              | 10 +++++++---
 README                |  7 ++++++-
 sequence_end_points.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++
 sequence_lengths.c    | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 122 insertions(+), 4 deletions(-)
 create mode 100644 sequence_end_points.c
 create mode 100644 sequence_lengths.c

diff --git a/Makefile b/Makefile
index 255c1fb..8193e1d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,10 +1,14 @@
 CC = gcc
 CFLAGS= -O3 -s -mtune=native -Wextra -Wall
 
-all: count_nucleobases_
+all: count_nucleobases sequence_lengths
 
-count_nucleobases_:
+count_nucleobases: count_nucleobases.c
 	$(CC) $(CFLAGS) count_nucleobases.c -o count_nucleobases
+sequence_lengths: sequence_lengths.c
+	$(CC) $(CFLAGS) sequence_lengths.c -o sequence_lengths
+sequence_end_points: sequence_end_points.c
+	$(CC) $(CFLAGS) sequence_end_points.c -o sequence_end_points
 
 clean:
-	rm -vf count_nucleobases
+	rm -vf count_nucleobases sequence_lengths sequence_end_points
diff --git a/README b/README
index 19d14ab..83f27d2 100644
--- a/README
+++ b/README
@@ -1 +1,6 @@
-This will count nucleobases of argv[1]
+random fasta tools:
+
+count_nucleobases - counts nucleobases (A, C, G, T) in fasta files
+sequence_lengths - print out length of each sequences
+sequence_end_points - print out the end position of each sequence (continuous)
+
diff --git a/sequence_end_points.c b/sequence_end_points.c
new file mode 100644
index 0000000..dc03dd9
--- /dev/null
+++ b/sequence_end_points.c
@@ -0,0 +1,54 @@
+// Copyright 2013 Calvin Morrison
+#include
+#include
+#include
+#include
+#include
+#include
+int main() {
+
+  size_t len = 0;
+
+  char buffer[4096];
+  bool header = false;
+
+  len = fread(&buffer, 1, 1, stdin);
+
+  unsigned long long seq_length = 0;
+  if(!errno) {
+    if(buffer[0] == '>') {
+      header = true;
+
+      while((len = fread(&buffer, 1, 4096, stdin)) != 0) {
+        size_t i = 0;
+        for(i = 0; i < len; i++) {
+          if(buffer[i] == '>') {
+            printf("%llu\n", seq_length);
+            header = true;
+            continue;
+          }
+          else if(buffer[i] == '\n' && header == true) {
+            header = false;
+            continue;
+          }
+          if(header == false && buffer[i] != '\n') {
+            seq_length++;
+          }
+        }
+      }
+    }
+    else {
+      fprintf(stderr, "this does not look like a fasta file\n");
+      return EXIT_FAILURE;
+    }
+  }
+  else {
+    fprintf(stderr, "could not read file\n");
+    return EXIT_FAILURE;
+  }
+
+  printf("%llu\n", seq_length);
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/sequence_lengths.c b/sequence_lengths.c
new file mode 100644
index 0000000..70ed43e
--- /dev/null
+++ b/sequence_lengths.c
@@ -0,0 +1,55 @@
+// Copyright 2013 Calvin Morrison
+#include
+#include
+#include
+#include
+#include
+#include
+int main() {
+
+  size_t len = 0;
+
+  char buffer[4096];
+  bool header = false;
+
+  len = fread(&buffer, 1, 1, stdin);
+
+  unsigned long long seq_length = 0;
+  if(!errno) {
+    if(buffer[0] == '>') {
+      header = true;
+
+      while((len = fread(&buffer, 1, 4096, stdin)) != 0) {
+        size_t i = 0;
+        for(i = 0; i < len; i++) {
+          if(buffer[i] == '>') {
+            printf("%llu\n", seq_length);
+            header = true;
+            seq_length = 0;
+            continue;
+          }
+          else if(buffer[i] == '\n' && header == true) {
+            header = false;
+            continue;
+          }
+          if(header == false && buffer[i] != '\n') {
+            seq_length++;
+          }
+        }
+      }
+    }
+    else {
+      fprintf(stderr, "this does not look like a fasta file\n");
+      return EXIT_FAILURE;
+    }
+  }
+  else {
+    fprintf(stderr, "could not read file\n");
+    return EXIT_FAILURE;
+  }
+
+  printf("%llu\n", seq_length);
+
+  return EXIT_SUCCESS;
+}
+
-- 
cgit v1.2.3