From 94d04a1e503121a98b403f882c18a4f0799267d7 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Wed, 29 Jan 2014 11:53:30 -0500 Subject: add filtering based on consecutive mer lengths --- score_mers.py | 184 ---------------------------------------------------------- 1 file changed, 184 deletions(-) delete mode 100755 score_mers.py (limited to 'score_mers.py') diff --git a/score_mers.py b/score_mers.py deleted file mode 100755 index 0a73cfb..0000000 --- a/score_mers.py +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env python -import sys -import os - -from multiprocessing import Pool -from subprocess import * -import numpy as np -import pdb - -fg_mers = {} -bg_mers = {} - -if(len(sys.argv) == 5): - selectivity_fn = sys.argv[1] - fg_fasta_fn = sys.argv[2] - bg_fasta_fn = sys.argv[3] - output_file = sys.argv[4] -else: - print "please specify your inputs" - print "ex: select_mers.py fg_counts_file fg_fasta_file bg_counts_file bg_fasta_file output_file" - exit() - -# empty class to fill up mer information with -class Mer: - pass - -# import our variables -min_mer_range = int(os.environ.get("min_mer_range", 6)); -max_mer_range = int(os.environ.get("max_mer_range", 10)); -min_mer_count = int(os.environ.get("min_mer_count", 0)); -max_select = int(os.environ.get("max_select", 15)); -max_mer_distance = int(os.environ.get("max_mer_distance", 5000)); - - -def populate_locations(input_fn, mers, mer): - ''' Run the strstreamone command, and parse in the integers that are output - by the command, and add it to mers[mer].pts - ''' - - cmd = 'strstreamone ' + mer + " < " + input_fn - - strstream = Popen(cmd, stdout=PIPE, shell=True) - for line in strstream.stdout: - mers[mer].pts.append(int(line)) - - -def score_mers(selected): - from itertools import combinations - import time - - scores = [] - - p = Pool() - - fh = open(output_file, 'w'); - fh.write("scores:\n"); - for select_n in range(1, max_select+1): - print "scoring size ", select_n, - t = time.time() - scores_it = p.imap_unordered(score, combinations(selected, select_n)) - for score_res in scores_it: - fh.write(str(score_res) + "\n"); - print "size ", select_n, "took:", t - time.time() - return scores - - -def score(combination): -# input is a string of mers like -# ['ACCAA', 'ACCCGA', 'ACGTATA'] - - ret = [combination] - - for mer in combination: - for other_mer in combination: - if not mer == other_mer: - if mer in other_mer: - ret.append("duplicates") - return ret - - fg_pts = [] - fg_dist = [] - - bg_pts = [] - bg_dist = [] - - for mer in combination: - fg_pts = fg_pts + fg_mers[mer].pts - bg_pts = bg_pts + bg_mers[mer].pts - - fg_pts.sort() - bg_pts.sort() - - # remove any exact duplicates - # fg_pts = list(set(fg_pts)) - # bg_pts = list(set(bg_pts)) - - # distances - min_mer_distance = max(len(i) for i in combination) - fg_dist = np.array([abs(fg_pts[i] - fg_pts[i-1]) for i in range(1, len(fg_pts))]) - bg_dist = np.array([abs(bg_pts[i] - bg_pts[i-1]) for i in range(1, len(bg_pts))]) - - # return without calculating scores if any objects are higher than our max distance - if any(dist > max_mer_distance for dist in fg_dist): - ret.append("max") - ret.append(max(fg_dist)) - return ret - - # return without calculating scores if any mers are closer than the length of our longest mer in the combination - if any(dist < min_mer_distance for dist in fg_dist): - ret.append("min") - ret.append(min(fg_dist)) - return ret - - nb_primers = len(combination) - fg_mean_dist = np.mean(fg_dist) - fg_variance_dist = np.var(fg_dist) - bg_mean_dist = np.mean(bg_dist) - bg_variance_dist = np.var(bg_dist) - - # this is our equation - score = (nb_primers * fg_mean_dist * fg_variance_dist) / ((bg_mean_dist * bg_variance_dist) + .000001) - - ret.append(score) - ret.append(fg_mean_dist) - ret.append(fg_variance_dist) - ret.append(bg_mean_dist) - ret.append(bg_variance_dist) - - return ret - -def pop_fg(mer): - ''' helper for map function ''' - populate_locations(fg_fasta_fn, fg_mers, mer) - -def pop_bg(mer): - ''' helper for map function ''' - populate_locations(bg_fasta_fn, bg_mers, mer) - -def main(): - import time - selected = [] - selectivity_fh = open(selectivity_fn, "r") - - # get our genome length - fg_genome_length = os.path.getsize(fg_fasta_fn) - bg_genome_length = os.path.getsize(bg_fasta_fn) - - for row in selectivity_fh: - (mer, fg_count, bg_count, selectivity) = row.split() - fg_mers[mer] = Mer() - fg_mers[mer].pts = [] - fg_mers[mer].count = fg_count - bg_mers[mer] = Mer() - bg_mers[mer].pts = [] - bg_mers[mer].count = bg_count - selected.append([mer, selectivity]) - - # exhaustive = False - # - # if exhaustive: - # selected = fg_mers.keys() - # else: - # selected = select_mers(fg_mers, bg_mers, max_select) - selected = selected[-100:] - selected_mers = [row[0] for row in selected] - pdb.set_trace() - # print "searching through combinations of" - # print selected - - print "Populating foreground locations" - - - map(pop_fg, selected_mers) - map(pop_bg, selected_mers) - - scores = score_mers(selected_mers) - - print "fg_genome_length", fg_genome_length - print "bg_genome_length", bg_genome_length - print "output_file:", output_file - - -if __name__ == "__main__": - sys.exit(main()) -- cgit v1.2.3