aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-01-08 11:20:39 -0500
committerCalvin Morrison <mutantturkey@gmail.com>2014-01-08 11:20:39 -0500
commit6a7295a258932a78c616ea4f8f591eb5f0191da4 (patch)
tree6a32de9133d65c194f84e111ccf889021b2ca440
initial commit
-rw-r--r--Makefile11
-rw-r--r--below_melting_temperature.py33
-rwxr-xr-xselect_kmers.sh86
-rw-r--r--select_mers.py213
-rw-r--r--strstream.c52
5 files changed, 395 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..1f8d839
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,11 @@
+VERSION=\"v0.0.1\"
+CC = gcc
+CFLAGS = -O3 -s -mtune=native -Wall -DVERSION=$(VERSION) -Wextra
+
+all: strstream
+
+strstream: strstream.c
+ $(CC) strstream.c -o strstream $(CLIBS) $(CFLAGS)
+
+clean:
+ rm -vf kmer_total_count kmer_counts_per_sequence libkmer.so libkmer.a libkmer.o
diff --git a/below_melting_temperature.py b/below_melting_temperature.py
new file mode 100644
index 0000000..bc4157e
--- /dev/null
+++ b/below_melting_temperature.py
@@ -0,0 +1,33 @@
+import sys
+from multiprocessing import Pool
+
+def above_melting_temperature(kmer_with_count):
+ kmer = kmer_with_count.split("\t")[0]
+
+ A = kmer.count('A')
+ C = kmer.count('C')
+ G = kmer.count('G')
+ T = kmer.count('T')
+
+ melt_temp = 0.0;
+
+ if len(kmer) < 13:
+ melt_temp = ((A+T) * 2) + ((C+G) * 4)
+ else:
+ melt_temp = 64.9 + 41*(G+C-16.4)/(A+T+G+C)
+
+ if melt_temp < max_melting_temp:
+ return kmer_with_count
+ else:
+ return 0
+
+
+lines = sys.stdin.readlines()
+max_melting_temp = float(sys.argv[1])
+
+p = Pool()
+output = p.map(above_melting_temperature, lines)
+
+for line in output:
+ if line != 0:
+ sys.stdout.write(line)
diff --git a/select_kmers.sh b/select_kmers.sh
new file mode 100755
index 0000000..11ee124
--- /dev/null
+++ b/select_kmers.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# range of mers, min and max
+: ${min_mer_range=6}
+: ${max_mer_range=10}
+# directory to store our counts and sorted counts
+: ${counts_directory=$PWD/counts}
+# temp directory
+: ${tmp_directory=$PWD/tmp}
+# maximum kmer melting point
+: ${max_melting_temp=20}
+# minimum mer count
+# ! you can supply either a percentage like this .5
+# ! or you can supply a raw number (100)
+: ${min_mer_count=1000}
+# maximum mers to pick
+: ${max_select=15}
+
+export min_mer_range
+export max_mer_range
+export max_select
+export min_mer_count
+
+
+if [ ! -d $counts_directory ]; then
+ mkdir $counts_directory
+fi
+
+if [ ! -d $tmp_directory ]; then
+ mkdir $tmp_directory
+fi
+
+foreground=$1
+background=$2
+
+if [[ ! -f $foreground ]]; then
+ echo "Could not open $foreground."
+ exit 1
+fi
+
+if [[ ! -f $background ]]; then
+ echo "Could not open $background."
+ exit 1
+fi
+
+for fasta_file in $foreground $background; do
+
+ counts=$counts_directory/$(basename $fasta_file)
+ tmp=$tmp_directory/$(basename $fasta_file)
+
+ echo pre-processing $fasta_file
+
+ # check if our preprocessed file exists
+ if [[ ! -f $tmp ]]; then
+ echo "> pre processed $fasta_file" >> $tmp
+ cat $fasta_file | grep -v "^>" | tr -d '\n' >> $tmp
+ fi
+
+ # run counts if they haven't been created
+ rm $counts-counts
+ for mer in `seq $min_mer_range $max_mer_range`; do
+ if [ ! -e $counts-counts-$mer ]; then
+ echo checking $mer mers for $fasta_file
+ kmer_total_count -i $tmp -k $mer -l -n >> $counts-counts-$mer
+ else
+ echo "$mer mers already done for $fasta_file"
+ fi
+
+ cat $counts-counts-$mer >> $counts-counts
+
+ done
+done
+
+
+fg_counts=$counts_directory/$(basename $foreground)-counts
+bg_counts=$counts_directory/$(basename $background)-counts
+
+fg_tmp=$tmp_directory/$(basename $foreground)
+bg_tmp=$tmp_directory/$(basename $background)
+
+echo "checking if mers are below melting temperature in the foreground"
+
+rm $fg_counts-fg-non-melting
+
+python below_melting_temperature.py $max_melting_temp < $fg_counts > $fg_counts-fg-non-melting
+
+python ./select_mers.py $fg_counts-fg-non-melting $fg_tmp $bg_counts $bg_tmp # > $(basename $foreground)_$(basename $background)_final_mers
diff --git a/select_mers.py b/select_mers.py
new file mode 100644
index 0000000..9dea7fe
--- /dev/null
+++ b/select_mers.py
@@ -0,0 +1,213 @@
+import sys
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot
+import os
+
+from multiprocessing import Pool
+from subprocess import *
+import numpy as np
+import pdb
+
+fg_mers = {}
+bg_mers = {}
+# empty class to fill up mer information with
+class Mer:
+ pass
+
+class Score:
+ pass
+
+# import our variables
+min_mer_range = int(os.getenv("min_mer_range"));
+max_mer_range = int(os.getenv("max_mer_range"));
+min_mer_count = int(os.getenv("min_mer_count"));
+max_select = int(os.getenv("max_select"));
+
+def populate_locations(input_fn, mers, selected):
+
+ mer_str = ' '.join(selected)
+ cmd = './strstream ' + mer_str + " < " + input_fn
+
+ strstream = Popen(cmd, stdout=PIPE, shell=True)
+ for line in strstream.stdout:
+ (index, pos) = line.split(' ')
+ mers[selected[int(index)]].pts.append(int(pos))
+
+
+ return None
+
+def select_mers(selected):
+ from itertools import combinations
+
+ scores = []
+
+ p = Pool()
+
+ for select_n in range(1, max_select):
+ print "scoring size ", select_n
+ scores_it = []
+ scores_it = p.map(score, combinations(selected, select_n))
+ scores = scores + scores_it
+
+
+ p.close()
+
+
+ scores = sorted(scores, key=lambda score: score.score)
+ return scores
+
+def score(combination):
+
+ # print "scoring", combination
+
+ fg_pts_arr = []
+ fg_dist_arr = []
+
+ bg_pts_arr = []
+ bg_dist_arr = []
+
+ for mer in combination:
+ fg_pts_arr = fg_pts_arr + fg_mers[mer].pts
+ bg_pts_arr = bg_pts_arr + bg_mers[mer].pts
+
+ fg_pts_arr.sort()
+ bg_pts_arr.sort()
+
+ # remove any exact duplicates
+ fg_pts_arr = list(set(fg_pts_arr))
+ bg_pts_arr = list(set(bg_pts_arr))
+
+ # distances
+ for i in range(1, len(fg_pts_arr)):
+ fg_dist_arr.append(abs(fg_pts_arr[i-1] - fg_pts_arr[i]));
+
+ for i in range(1, len(bg_pts_arr)):
+ bg_dist_arr.append(abs(bg_pts_arr[i-1] - bg_pts_arr[i]));
+
+ fg_dist_arr = np.array(fg_dist_arr)
+ bg_dist_arr = np.array(bg_dist_arr)
+
+ nb_primers = len(combination)
+ fg_mean_dist = np.mean(fg_dist_arr)
+ fg_variance_dist = np.var(fg_dist_arr)
+ bg_mean_dist = np.mean(bg_dist_arr)
+ bg_variance_dist = np.var(bg_dist_arr)
+
+ # this is our equation
+
+ score = Score()
+ score.mers = combination
+ score.score = (nb_primers * (fg_mean_dist * fg_variance_dist)) / (bg_mean_dist * bg_variance_dist)
+ # score.bg_dist_arr = bg_dist_arr
+ # score.fg_dist_arr = fg_dist_arr
+ return score
+
+
+
+def most_frequent(mers, select_nb):
+ print "selecting the top ", select_nb, " mers"
+ selected = []
+ for mer in mers.keys():
+ selected.append([mer, mers[mer].count])
+
+ selected = sorted(selected, key=lambda mer: mer[1])
+ selected = selected[-select_nb:]
+ selected = list(row[0] for row in selected)
+
+ return selected
+
+
+def main():
+ fg_count_fn = sys.argv[1]
+ fg_fasta_fn = sys.argv[2]
+
+ bg_count_fn = sys.argv[3]
+ bg_fasta_fn = sys.argv[4]
+
+ fg_count_fh = open(fg_count_fn, "r")
+ bg_count_fh = open(bg_count_fn, "r")
+
+ # get our genome length
+ fg_genome_length = os.path.getsize(fg_fasta_fn)
+ bg_genome_length = os.path.getsize(bg_fasta_fn)
+
+ # copy in our fg_mers and counts
+ for mers,fh in [(fg_mers, fg_count_fh), (bg_mers, bg_count_fh)]:
+
+ for line in fh:
+ (mer, count) = line.split()
+ count = int(count)
+ mers[mer] = Mer()
+ mers[mer].count = count
+ mers[mer].pts = []
+
+ # if min_mer_count is less than one, then we are looking at a bottom cutoff threshold
+ if(min_mer_count < 1 and min_mer_count > 0):
+ pass
+ # counts = []
+ # for mer in fg_mers.keys():
+ # counts.append(fg_mers[mer]['count'])
+ #
+ # counts = counts.sort()
+ #
+ # count_set = set(counts):
+ # len(count_set)
+ # count_set(
+ # if it is 1 or great then use it as a numerical cutoff
+ print "removing mers that don't meet our minimum count"
+ if min_mer_count >= 1:
+ for mer in fg_mers.keys():
+ if(fg_mers[mer].count < min_mer_count):
+ del fg_mers[mer]
+ if mer in bg_mers:
+ del bg_mers[mer]
+
+ print "removing useless mers from the background"
+ for mer in bg_mers.keys():
+ if mer not in fg_mers:
+ del bg_mers[mer]
+
+ print "adding empty mers to the background"
+ for mer in fg_mers:
+ if mer not in bg_mers:
+ bg_mers[mer] = Mer()
+ bg_mers[mer].count = 2
+ bg_mers[mer].pts = [0, bg_genome_length]
+
+
+ exhaustive = True
+
+ if exhaustive:
+ selected = fg_mers.keys()
+ if not exhaustive:
+ selected = most_frequent(fg_mers, max_select)
+
+ print "selected the top ", max_select, " mers"
+ print "selected:", ", ".join(selected)
+
+ print "Populating foreground locations"
+ populate_locations(fg_fasta_fn, fg_mers, selected)
+ print "Populating background locations"
+ populate_locations(bg_fasta_fn, bg_mers, selected)
+
+ scores = select_mers(selected)
+
+ print "fg_genome_length", fg_genome_length
+ print "bg_genome_length", bg_genome_length
+
+
+ for score in scores:
+ sys.stdout.write(str(score.score) + "\t")
+ sys.stdout.write(", ".join(list(score.mers)) + "\t")
+ # sys.stdout.write("\t".join(score.dist_arr));
+ sys.stdout.write("\n");
+
+ #matplotlib.pyplot.hist(score.fg_dist_arr)
+ #matplotlib.pyplot.savefig('graph/' + str(x) + ".png")
+ #matplotlib.pyplot.clf()
+ import pdb
+ pdb.set_trace()
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/strstream.c b/strstream.c
new file mode 100644
index 0000000..f4a296e
--- /dev/null
+++ b/strstream.c
@@ -0,0 +1,52 @@
+// find string in
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+int main(int argc, char **argv){
+
+ char buffer[BUFSIZ] = { 0 };
+ char *buf, *start;
+ ssize_t len = 0;
+
+ int save_size = 0;
+ int cpy = 0;
+
+ unsigned long long pos = 0;
+ unsigned long long cpy_size = 0;
+
+ int i = 0;
+
+ // get max argument length
+ for(i = 1; i < argc; i++) {
+ int len = strlen(argv[i]);
+ if(len > save_size)
+ save_size = len;
+ }
+
+ cpy = save_size - 1;
+ cpy_size = BUFSIZ - cpy;
+
+ buf = buffer;
+ start = buf + cpy;
+
+ // copy our first cpy length into the first part of our buffer
+ len = fread(buffer, 1, cpy, stdin);
+ if(len == 0)
+ exit(EXIT_FAILURE);
+
+ // read into "start" (buf + cpy) from stdin
+ while((len = fread(start, 1, cpy_size, stdin)) != 0) {
+ for(i = 1; i < argc; i++) {
+ char *p = buffer;
+ while((p = strstr(p, argv[i])) != NULL) {
+ printf("%d %llu\n", i - 1, pos + (p - buffer));
+ p++;
+ }
+ }
+ memcpy(buffer, buffer + len, cpy);
+ pos = pos + len;
+ }
+ return 0;
+}