From 94d04a1e503121a98b403f882c18a4f0799267d7 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Wed, 29 Jan 2014 11:53:30 -0500 Subject: add filtering based on consecutive mer lengths --- src/filter_max_consecutive_binding.py | 72 +++++++++++++ src/filter_melting_range.c | 58 +++++++++++ src/max_consecutive_bindings.py | 54 ---------- src/melting_range.c | 58 ----------- src/score_mers.py | 184 ++++++++++++++++++++++++++++++++++ src/select_mers.py | 83 +++++++++++++++ 6 files changed, 397 insertions(+), 112 deletions(-) create mode 100755 src/filter_max_consecutive_binding.py create mode 100644 src/filter_melting_range.c delete mode 100644 src/max_consecutive_bindings.py delete mode 100644 src/melting_range.c create mode 100755 src/score_mers.py create mode 100755 src/select_mers.py (limited to 'src') diff --git a/src/filter_max_consecutive_binding.py b/src/filter_max_consecutive_binding.py new file mode 100755 index 0000000..daebee4 --- /dev/null +++ b/src/filter_max_consecutive_binding.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python +import sys, os + +binding = { 'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', '_': False } + + +def max_consecutive_binding(mer1, mer2): + if len(mer2) > len(mer1): + mer1, mer2 = mer2, mer1 + + # reverse mer2, + mer2 = mer2[::-1] + # pad mer one to avoid errors + mer1 = mer1.ljust(len(mer1) + len(mer1), "_") + + max_bind = 0; + for offset in range(len(mer2)): + consecutive = 0 + for x in range(len(mer2)): + if binding[mer1[offset+x]] == mer2[x]: + consecutive = consecutive + 1 + else: + consecutive = 0 + + max_bind = max(consecutive,max_bind) + + return max_bind + +def test(): + # mer 1 mer 2 # correct ans + arr = [ + ("ATATAT", "TATATA", 5), + ("ACAGGGAT", "ATATATAT", 2), + ("CATATATAT", "ATATATATATAT", 8), + ("ATATATATATAT", "ATATATATAT", 10), + ("ATATAT", "TATAT", 5), + ("AACGATACCATG", "GGATCATACGTA", 3), + ("CGT", "ACG", 3), + ("ACG", "CGT", 3), + ("CACC", "GGTGT", 4), + ("GGTGT", "CACC", 4), + ] + + print 'pass\tmer1\tmer2\tres\tcorr' + for mer_combination in arr: + response = [] + ans = max_consecutive_binding(mer_combination[0], mer_combination[1]) + + response.append(str(ans == mer_combination[2])) + response.append(mer_combination[0]) + response.append(mer_combination[1]) + response.append(str(ans)) + response.append(str(mer_combination[2])) + + print '\t'.join(response) + +def main(): + + if(len(sys.argv) < 2): + print "cutoff is expected as an argument" + exit() + else: + cutoff = int(sys.argv[1]) + + for line in sys.stdin: + mer = line.split()[0] + if max_consecutive_binding(mer, mer) < cutoff: + sys.stdout.write(line) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/filter_melting_range.c b/src/filter_melting_range.c new file mode 100644 index 0000000..2c89195 --- /dev/null +++ b/src/filter_melting_range.c @@ -0,0 +1,58 @@ +#include +#include +#include +#include + +float melting_temperature(char *mer) { + + float a = 0; + float c = 0; + float g = 0; + float t = 0; + int i = 0; + + for(i = 0; i < strlen(mer); i++) { + switch(mer[i]) { + case 'A': + a++; + break; + case 'C': + c++; + break; + case 'G': + g++; + break; + case 'T': + t++; + break; + default: + break; + } + } + + if(strlen(mer) < 13) + return ((a+t) * 2) + ((c+g) * 4); + else + return 64.9 + 41.0*(g+c-16.4)/(a+t+g+c); +} + +int main(int argc, char **argv){ + + if(argc < 3) { + printf("please supply the min and max as stdargs"); + exit(EXIT_FAILURE); + } + float min = atof(argv[1]); + float max = atof(argv[2]); + + char mer[24] = { 0 }; + int count = 0; + + while(fscanf(stdin, "%s\t%d\n", &mer, &count) == 2) { + float temp = melting_temperature(mer); + if( (temp > min) && (temp < max) ) + printf("%s\t%d\n", mer, count); + } + + exit(EXIT_SUCCESS); +} diff --git a/src/max_consecutive_bindings.py b/src/max_consecutive_bindings.py deleted file mode 100644 index a2e6c8d..0000000 --- a/src/max_consecutive_bindings.py +++ /dev/null @@ -1,54 +0,0 @@ -import sys - -binding = { 'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', '_': False } - - -def max_consecutive_bindings(mer1, mer2): - if len(mer2) > len(mer1): - mer1, mer2 = mer2, mer1 - - # reverse mer2, - mer2 = mer2[::-1] - mer1 = mer1.ljust(len(mer1) + len(mer1), "_") - - max_bind = 0; - for offset in range(len(mer2)): - consecutive = 0 - for x in range(len(mer2)): - if binding[mer1[offset+x]] == mer2[x]: - consecutive = consecutive + 1 - else: - consecutive = 0 - - max_bind = max(consecutive,max_bind) - - return max_bind - -def test(): - # mer 1 mer 2 # correct ans - arr = [ - ("ATATAT", "TATATA", 5), - ("ACAGGGAT", "ATATATAT", 2), - ("CATATATAT", "ATATATATATAT", 8), - ("ATATATATATAT", "ATATATATAT", 10), - ("ATATAT", "TATAT", 5), - ("AACGATACCATG", "GGATCATACGTA", 3), - ("CGT", "ACG", 3), - ("ACG", "CGT", 3), - ("CACC", "GGTGT", 4), - ("GGTGT", "CACC", 4), - ] - - print 'pass\tmer1\tmer2\tres\tcorr' - for mer_combination in arr: - response = [] - ans = max_consecutive_bindings(mer_combination[0], mer_combination[1]) - - response.append(str(ans == mer_combination[2])) - response.append(mer_combination[0]) - response.append(mer_combination[1]) - response.append(str(ans)) - response.append(str(mer_combination[2])) - - print '\t'.join(response) - diff --git a/src/melting_range.c b/src/melting_range.c deleted file mode 100644 index 2c89195..0000000 --- a/src/melting_range.c +++ /dev/null @@ -1,58 +0,0 @@ -#include -#include -#include -#include - -float melting_temperature(char *mer) { - - float a = 0; - float c = 0; - float g = 0; - float t = 0; - int i = 0; - - for(i = 0; i < strlen(mer); i++) { - switch(mer[i]) { - case 'A': - a++; - break; - case 'C': - c++; - break; - case 'G': - g++; - break; - case 'T': - t++; - break; - default: - break; - } - } - - if(strlen(mer) < 13) - return ((a+t) * 2) + ((c+g) * 4); - else - return 64.9 + 41.0*(g+c-16.4)/(a+t+g+c); -} - -int main(int argc, char **argv){ - - if(argc < 3) { - printf("please supply the min and max as stdargs"); - exit(EXIT_FAILURE); - } - float min = atof(argv[1]); - float max = atof(argv[2]); - - char mer[24] = { 0 }; - int count = 0; - - while(fscanf(stdin, "%s\t%d\n", &mer, &count) == 2) { - float temp = melting_temperature(mer); - if( (temp > min) && (temp < max) ) - printf("%s\t%d\n", mer, count); - } - - exit(EXIT_SUCCESS); -} diff --git a/src/score_mers.py b/src/score_mers.py new file mode 100755 index 0000000..0a73cfb --- /dev/null +++ b/src/score_mers.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python +import sys +import os + +from multiprocessing import Pool +from subprocess import * +import numpy as np +import pdb + +fg_mers = {} +bg_mers = {} + +if(len(sys.argv) == 5): + selectivity_fn = sys.argv[1] + fg_fasta_fn = sys.argv[2] + bg_fasta_fn = sys.argv[3] + output_file = sys.argv[4] +else: + print "please specify your inputs" + print "ex: select_mers.py fg_counts_file fg_fasta_file bg_counts_file bg_fasta_file output_file" + exit() + +# empty class to fill up mer information with +class Mer: + pass + +# import our variables +min_mer_range = int(os.environ.get("min_mer_range", 6)); +max_mer_range = int(os.environ.get("max_mer_range", 10)); +min_mer_count = int(os.environ.get("min_mer_count", 0)); +max_select = int(os.environ.get("max_select", 15)); +max_mer_distance = int(os.environ.get("max_mer_distance", 5000)); + + +def populate_locations(input_fn, mers, mer): + ''' Run the strstreamone command, and parse in the integers that are output + by the command, and add it to mers[mer].pts + ''' + + cmd = 'strstreamone ' + mer + " < " + input_fn + + strstream = Popen(cmd, stdout=PIPE, shell=True) + for line in strstream.stdout: + mers[mer].pts.append(int(line)) + + +def score_mers(selected): + from itertools import combinations + import time + + scores = [] + + p = Pool() + + fh = open(output_file, 'w'); + fh.write("scores:\n"); + for select_n in range(1, max_select+1): + print "scoring size ", select_n, + t = time.time() + scores_it = p.imap_unordered(score, combinations(selected, select_n)) + for score_res in scores_it: + fh.write(str(score_res) + "\n"); + print "size ", select_n, "took:", t - time.time() + return scores + + +def score(combination): +# input is a string of mers like +# ['ACCAA', 'ACCCGA', 'ACGTATA'] + + ret = [combination] + + for mer in combination: + for other_mer in combination: + if not mer == other_mer: + if mer in other_mer: + ret.append("duplicates") + return ret + + fg_pts = [] + fg_dist = [] + + bg_pts = [] + bg_dist = [] + + for mer in combination: + fg_pts = fg_pts + fg_mers[mer].pts + bg_pts = bg_pts + bg_mers[mer].pts + + fg_pts.sort() + bg_pts.sort() + + # remove any exact duplicates + # fg_pts = list(set(fg_pts)) + # bg_pts = list(set(bg_pts)) + + # distances + min_mer_distance = max(len(i) for i in combination) + fg_dist = np.array([abs(fg_pts[i] - fg_pts[i-1]) for i in range(1, len(fg_pts))]) + bg_dist = np.array([abs(bg_pts[i] - bg_pts[i-1]) for i in range(1, len(bg_pts))]) + + # return without calculating scores if any objects are higher than our max distance + if any(dist > max_mer_distance for dist in fg_dist): + ret.append("max") + ret.append(max(fg_dist)) + return ret + + # return without calculating scores if any mers are closer than the length of our longest mer in the combination + if any(dist < min_mer_distance for dist in fg_dist): + ret.append("min") + ret.append(min(fg_dist)) + return ret + + nb_primers = len(combination) + fg_mean_dist = np.mean(fg_dist) + fg_variance_dist = np.var(fg_dist) + bg_mean_dist = np.mean(bg_dist) + bg_variance_dist = np.var(bg_dist) + + # this is our equation + score = (nb_primers * fg_mean_dist * fg_variance_dist) / ((bg_mean_dist * bg_variance_dist) + .000001) + + ret.append(score) + ret.append(fg_mean_dist) + ret.append(fg_variance_dist) + ret.append(bg_mean_dist) + ret.append(bg_variance_dist) + + return ret + +def pop_fg(mer): + ''' helper for map function ''' + populate_locations(fg_fasta_fn, fg_mers, mer) + +def pop_bg(mer): + ''' helper for map function ''' + populate_locations(bg_fasta_fn, bg_mers, mer) + +def main(): + import time + selected = [] + selectivity_fh = open(selectivity_fn, "r") + + # get our genome length + fg_genome_length = os.path.getsize(fg_fasta_fn) + bg_genome_length = os.path.getsize(bg_fasta_fn) + + for row in selectivity_fh: + (mer, fg_count, bg_count, selectivity) = row.split() + fg_mers[mer] = Mer() + fg_mers[mer].pts = [] + fg_mers[mer].count = fg_count + bg_mers[mer] = Mer() + bg_mers[mer].pts = [] + bg_mers[mer].count = bg_count + selected.append([mer, selectivity]) + + # exhaustive = False + # + # if exhaustive: + # selected = fg_mers.keys() + # else: + # selected = select_mers(fg_mers, bg_mers, max_select) + selected = selected[-100:] + selected_mers = [row[0] for row in selected] + pdb.set_trace() + # print "searching through combinations of" + # print selected + + print "Populating foreground locations" + + + map(pop_fg, selected_mers) + map(pop_bg, selected_mers) + + scores = score_mers(selected_mers) + + print "fg_genome_length", fg_genome_length + print "bg_genome_length", bg_genome_length + print "output_file:", output_file + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/select_mers.py b/src/select_mers.py new file mode 100755 index 0000000..21306cc --- /dev/null +++ b/src/select_mers.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +import sys +import os + +fg_mers = {} +bg_mers = {} + +min_mer_count = int(os.environ.get("min_mer_count", 0)); + +if(len(sys.argv) == 5): + fg_count_fn = sys.argv[1] + fg_fasta_fn = sys.argv[2] + bg_count_fn = sys.argv[3] + bg_fasta_fn = sys.argv[4] + + fg_genome_length = os.path.getsize(fg_fasta_fn) + bg_genome_length = os.path.getsize(bg_fasta_fn) +else: + print len(sys.argv) + print "please specify your inputs" + print "ex: select_mers.py fg_counts fg_fasta bg_counts bg_fasta" + exit() + + +# select mers based on our 'selectivity' measure. (count in fg) / (count in bg) +def select_mers(fg_mers, bg_mers): + import numpy as np + + mers = [] # contains mer strings + fg_arr = [] # contains fg counts + bg_arr = [] # contains bg counts + + # populate our bg_arr and fg_arr as well as our mer arr. + for mer in fg_mers.keys(): + mers.append(mer); + bg_arr.append(bg_mers.get(mer, 1)); + fg_arr.append(fg_mers[mer]); + + fg_arr = np.array(fg_arr, dtype='f'); + bg_arr = np.array(bg_arr, dtype='f'); + + selectivity = (fg_arr / bg_arr) + + arr = [(mers[i], fg_arr[i], bg_arr[i], selectivity[i]) for i in range(len(mers))] + + # filter results less than 1 ( indicates that the bg is more present than the fg) + # arr = filter(lambda i: i[3] > 1, arr) + + # sort by the selectivity + arr = sorted(arr, key = lambda row: row[3]) + + # return only our mers, without our selectivity scores + return arr + + +def main(): + + fg_count_fh = open(fg_count_fn, "r") + bg_count_fh = open(bg_count_fn, "r") + + # copy in our fg_mers and counts + for mers,fh in [(fg_mers, fg_count_fh), (bg_mers, bg_count_fh)]: + for line in fh: + (mer, count) = line.split() + mers[mer] = int(count) + + if min_mer_count >= 1: + for mer in fg_mers.keys(): + if(fg_mers[mer] < min_mer_count): + del fg_mers[mer] + if mer in bg_mers: + del bg_mers[mer] + + for mer in bg_mers.keys(): + if mer not in fg_mers: + del bg_mers[mer] + + selected = select_mers(fg_mers, bg_mers) + for row in selected: + print row[0] +"\t"+str("%d" % row[1]) + "\t" + str("%d" % row[2]) + "\t" + str("%.5f" % row[3]) + +if __name__ == "__main__": + sys.exit(main()) -- cgit v1.2.3