From 6a7295a258932a78c616ea4f8f591eb5f0191da4 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Wed, 8 Jan 2014 11:20:39 -0500 Subject: initial commit --- Makefile | 11 +++ below_melting_temperature.py | 33 +++++++ select_kmers.sh | 86 +++++++++++++++++ select_mers.py | 213 +++++++++++++++++++++++++++++++++++++++++++ strstream.c | 52 +++++++++++ 5 files changed, 395 insertions(+) create mode 100644 Makefile create mode 100644 below_melting_temperature.py create mode 100755 select_kmers.sh create mode 100644 select_mers.py create mode 100644 strstream.c diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..1f8d839 --- /dev/null +++ b/Makefile @@ -0,0 +1,11 @@ +VERSION=\"v0.0.1\" +CC = gcc +CFLAGS = -O3 -s -mtune=native -Wall -DVERSION=$(VERSION) -Wextra + +all: strstream + +strstream: strstream.c + $(CC) strstream.c -o strstream $(CLIBS) $(CFLAGS) + +clean: + rm -vf kmer_total_count kmer_counts_per_sequence libkmer.so libkmer.a libkmer.o diff --git a/below_melting_temperature.py b/below_melting_temperature.py new file mode 100644 index 0000000..bc4157e --- /dev/null +++ b/below_melting_temperature.py @@ -0,0 +1,33 @@ +import sys +from multiprocessing import Pool + +def above_melting_temperature(kmer_with_count): + kmer = kmer_with_count.split("\t")[0] + + A = kmer.count('A') + C = kmer.count('C') + G = kmer.count('G') + T = kmer.count('T') + + melt_temp = 0.0; + + if len(kmer) < 13: + melt_temp = ((A+T) * 2) + ((C+G) * 4) + else: + melt_temp = 64.9 + 41*(G+C-16.4)/(A+T+G+C) + + if melt_temp < max_melting_temp: + return kmer_with_count + else: + return 0 + + +lines = sys.stdin.readlines() +max_melting_temp = float(sys.argv[1]) + +p = Pool() +output = p.map(above_melting_temperature, lines) + +for line in output: + if line != 0: + sys.stdout.write(line) diff --git a/select_kmers.sh b/select_kmers.sh new file mode 100755 index 0000000..11ee124 --- /dev/null +++ b/select_kmers.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# range of mers, min and max +: ${min_mer_range=6} +: ${max_mer_range=10} +# directory to store our counts and sorted counts +: ${counts_directory=$PWD/counts} +# temp directory +: ${tmp_directory=$PWD/tmp} +# maximum kmer melting point +: ${max_melting_temp=20} +# minimum mer count +# ! you can supply either a percentage like this .5 +# ! or you can supply a raw number (100) +: ${min_mer_count=1000} +# maximum mers to pick +: ${max_select=15} + +export min_mer_range +export max_mer_range +export max_select +export min_mer_count + + +if [ ! -d $counts_directory ]; then + mkdir $counts_directory +fi + +if [ ! -d $tmp_directory ]; then + mkdir $tmp_directory +fi + +foreground=$1 +background=$2 + +if [[ ! -f $foreground ]]; then + echo "Could not open $foreground." + exit 1 +fi + +if [[ ! -f $background ]]; then + echo "Could not open $background." + exit 1 +fi + +for fasta_file in $foreground $background; do + + counts=$counts_directory/$(basename $fasta_file) + tmp=$tmp_directory/$(basename $fasta_file) + + echo pre-processing $fasta_file + + # check if our preprocessed file exists + if [[ ! -f $tmp ]]; then + echo "> pre processed $fasta_file" >> $tmp + cat $fasta_file | grep -v "^>" | tr -d '\n' >> $tmp + fi + + # run counts if they haven't been created + rm $counts-counts + for mer in `seq $min_mer_range $max_mer_range`; do + if [ ! -e $counts-counts-$mer ]; then + echo checking $mer mers for $fasta_file + kmer_total_count -i $tmp -k $mer -l -n >> $counts-counts-$mer + else + echo "$mer mers already done for $fasta_file" + fi + + cat $counts-counts-$mer >> $counts-counts + + done +done + + +fg_counts=$counts_directory/$(basename $foreground)-counts +bg_counts=$counts_directory/$(basename $background)-counts + +fg_tmp=$tmp_directory/$(basename $foreground) +bg_tmp=$tmp_directory/$(basename $background) + +echo "checking if mers are below melting temperature in the foreground" + +rm $fg_counts-fg-non-melting + +python below_melting_temperature.py $max_melting_temp < $fg_counts > $fg_counts-fg-non-melting + +python ./select_mers.py $fg_counts-fg-non-melting $fg_tmp $bg_counts $bg_tmp # > $(basename $foreground)_$(basename $background)_final_mers diff --git a/select_mers.py b/select_mers.py new file mode 100644 index 0000000..9dea7fe --- /dev/null +++ b/select_mers.py @@ -0,0 +1,213 @@ +import sys +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot +import os + +from multiprocessing import Pool +from subprocess import * +import numpy as np +import pdb + +fg_mers = {} +bg_mers = {} +# empty class to fill up mer information with +class Mer: + pass + +class Score: + pass + +# import our variables +min_mer_range = int(os.getenv("min_mer_range")); +max_mer_range = int(os.getenv("max_mer_range")); +min_mer_count = int(os.getenv("min_mer_count")); +max_select = int(os.getenv("max_select")); + +def populate_locations(input_fn, mers, selected): + + mer_str = ' '.join(selected) + cmd = './strstream ' + mer_str + " < " + input_fn + + strstream = Popen(cmd, stdout=PIPE, shell=True) + for line in strstream.stdout: + (index, pos) = line.split(' ') + mers[selected[int(index)]].pts.append(int(pos)) + + + return None + +def select_mers(selected): + from itertools import combinations + + scores = [] + + p = Pool() + + for select_n in range(1, max_select): + print "scoring size ", select_n + scores_it = [] + scores_it = p.map(score, combinations(selected, select_n)) + scores = scores + scores_it + + + p.close() + + + scores = sorted(scores, key=lambda score: score.score) + return scores + +def score(combination): + + # print "scoring", combination + + fg_pts_arr = [] + fg_dist_arr = [] + + bg_pts_arr = [] + bg_dist_arr = [] + + for mer in combination: + fg_pts_arr = fg_pts_arr + fg_mers[mer].pts + bg_pts_arr = bg_pts_arr + bg_mers[mer].pts + + fg_pts_arr.sort() + bg_pts_arr.sort() + + # remove any exact duplicates + fg_pts_arr = list(set(fg_pts_arr)) + bg_pts_arr = list(set(bg_pts_arr)) + + # distances + for i in range(1, len(fg_pts_arr)): + fg_dist_arr.append(abs(fg_pts_arr[i-1] - fg_pts_arr[i])); + + for i in range(1, len(bg_pts_arr)): + bg_dist_arr.append(abs(bg_pts_arr[i-1] - bg_pts_arr[i])); + + fg_dist_arr = np.array(fg_dist_arr) + bg_dist_arr = np.array(bg_dist_arr) + + nb_primers = len(combination) + fg_mean_dist = np.mean(fg_dist_arr) + fg_variance_dist = np.var(fg_dist_arr) + bg_mean_dist = np.mean(bg_dist_arr) + bg_variance_dist = np.var(bg_dist_arr) + + # this is our equation + + score = Score() + score.mers = combination + score.score = (nb_primers * (fg_mean_dist * fg_variance_dist)) / (bg_mean_dist * bg_variance_dist) + # score.bg_dist_arr = bg_dist_arr + # score.fg_dist_arr = fg_dist_arr + return score + + + +def most_frequent(mers, select_nb): + print "selecting the top ", select_nb, " mers" + selected = [] + for mer in mers.keys(): + selected.append([mer, mers[mer].count]) + + selected = sorted(selected, key=lambda mer: mer[1]) + selected = selected[-select_nb:] + selected = list(row[0] for row in selected) + + return selected + + +def main(): + fg_count_fn = sys.argv[1] + fg_fasta_fn = sys.argv[2] + + bg_count_fn = sys.argv[3] + bg_fasta_fn = sys.argv[4] + + fg_count_fh = open(fg_count_fn, "r") + bg_count_fh = open(bg_count_fn, "r") + + # get our genome length + fg_genome_length = os.path.getsize(fg_fasta_fn) + bg_genome_length = os.path.getsize(bg_fasta_fn) + + # copy in our fg_mers and counts + for mers,fh in [(fg_mers, fg_count_fh), (bg_mers, bg_count_fh)]: + + for line in fh: + (mer, count) = line.split() + count = int(count) + mers[mer] = Mer() + mers[mer].count = count + mers[mer].pts = [] + + # if min_mer_count is less than one, then we are looking at a bottom cutoff threshold + if(min_mer_count < 1 and min_mer_count > 0): + pass + # counts = [] + # for mer in fg_mers.keys(): + # counts.append(fg_mers[mer]['count']) + # + # counts = counts.sort() + # + # count_set = set(counts): + # len(count_set) + # count_set( + # if it is 1 or great then use it as a numerical cutoff + print "removing mers that don't meet our minimum count" + if min_mer_count >= 1: + for mer in fg_mers.keys(): + if(fg_mers[mer].count < min_mer_count): + del fg_mers[mer] + if mer in bg_mers: + del bg_mers[mer] + + print "removing useless mers from the background" + for mer in bg_mers.keys(): + if mer not in fg_mers: + del bg_mers[mer] + + print "adding empty mers to the background" + for mer in fg_mers: + if mer not in bg_mers: + bg_mers[mer] = Mer() + bg_mers[mer].count = 2 + bg_mers[mer].pts = [0, bg_genome_length] + + + exhaustive = True + + if exhaustive: + selected = fg_mers.keys() + if not exhaustive: + selected = most_frequent(fg_mers, max_select) + + print "selected the top ", max_select, " mers" + print "selected:", ", ".join(selected) + + print "Populating foreground locations" + populate_locations(fg_fasta_fn, fg_mers, selected) + print "Populating background locations" + populate_locations(bg_fasta_fn, bg_mers, selected) + + scores = select_mers(selected) + + print "fg_genome_length", fg_genome_length + print "bg_genome_length", bg_genome_length + + + for score in scores: + sys.stdout.write(str(score.score) + "\t") + sys.stdout.write(", ".join(list(score.mers)) + "\t") + # sys.stdout.write("\t".join(score.dist_arr)); + sys.stdout.write("\n"); + + #matplotlib.pyplot.hist(score.fg_dist_arr) + #matplotlib.pyplot.savefig('graph/' + str(x) + ".png") + #matplotlib.pyplot.clf() + import pdb + pdb.set_trace() + +if __name__ == "__main__": + sys.exit(main()) diff --git a/strstream.c b/strstream.c new file mode 100644 index 0000000..f4a296e --- /dev/null +++ b/strstream.c @@ -0,0 +1,52 @@ +// find string in +#include +#include +#include +#include + +int main(int argc, char **argv){ + + char buffer[BUFSIZ] = { 0 }; + char *buf, *start; + ssize_t len = 0; + + int save_size = 0; + int cpy = 0; + + unsigned long long pos = 0; + unsigned long long cpy_size = 0; + + int i = 0; + + // get max argument length + for(i = 1; i < argc; i++) { + int len = strlen(argv[i]); + if(len > save_size) + save_size = len; + } + + cpy = save_size - 1; + cpy_size = BUFSIZ - cpy; + + buf = buffer; + start = buf + cpy; + + // copy our first cpy length into the first part of our buffer + len = fread(buffer, 1, cpy, stdin); + if(len == 0) + exit(EXIT_FAILURE); + + // read into "start" (buf + cpy) from stdin + while((len = fread(start, 1, cpy_size, stdin)) != 0) { + for(i = 1; i < argc; i++) { + char *p = buffer; + while((p = strstr(p, argv[i])) != NULL) { + printf("%d %llu\n", i - 1, pos + (p - buffer)); + p++; + } + } + memcpy(buffer, buffer + len, cpy); + pos = pos + len; + } + return 0; +} -- cgit v1.2.3