From f5ac0df9d42de657a64e15d3f0cfa2198a1921c6 Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Thu, 27 Mar 2014 15:10:04 -0400 Subject: Add scoring for specific combinations. - add scoring for specific combos - update readme - add CLI args for score_mers.py - update score_wrapper - use more generic print functions in score_mers --- README.md | 51 ++++++++++++-- src/score_mers.py | 195 ++++++++++++++++++++++++++++++++------------------- src/score_wrapper.sh | 2 +- 3 files changed, 170 insertions(+), 78 deletions(-) diff --git a/README.md b/README.md index 81f7fc6..f6a4087 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,9 @@ SelectiveGenomeAmplification PI: http://brisson.bio.upenn.edu/ + + +## Requirements To use this you'll need: - A unix environment @@ -10,31 +13,69 @@ To use this you'll need: - bash or compliant shell. -Setup: +## Setup git clone git@github.com:mutantturkey/SelectiveGenomeAmplification.git cd SelectiveGenomeAmplification make sudo make install -Example Usage: +## Usage Examples +Standard use of (SGA) SelectiveGenomeAmplification is easy. it takes two arguments, +the foreground and background + SelectiveGenomeAmplification PfalciparumGenome.fasta HumanGenome.fasta; less PfalciparumGenome_HumanGenome/final_mers -For user customizable variables: +SGA allows for many tunable parameters, which are all explained in the chart +below. For user customizable variables, they need to be passed in as +environmental variables like so: max_mer_distance=5000 max_select=6 min_mer_range=6 max_mer_range=12 \ SelectiveGenomeAmplification.sh PfalciparumGenome.fasta half.fasta +SGA also comes with a easy to use user prompt called SelectiveGenomeAmplificationUI. +It allows for a less expereienced user to use +SGA without issue. + +### Running individual steps By default SelectiveGenomeAmplification runs all four steps, but you can -specify the program to run other steps, like score in this example. +specify the program to run other steps, like in these examples. current_run=run_1 SelectiveGenomeAmplification target.fasta bg.fasta score + current_run=run_1 SelectiveGenomeAmplification target.fasta bg.fasta select score + + current_run=run_1 SelectiveGenomeAmplification target.fasta bg.fasta 3 4 + +valid steps are these: + +- count (1) +- filter (2) +- select (3) +- score (4) + This function does not try to be smart, so use it wisely + +### Manually scoring mer combinations + +Users can manually score combinations of mers they choose using the +score\_mers.py script. + + score_mers.py -f foreground.fa -b background.fa -c combination file -o output + + +The combination file should look like this: + + ACGATATAT TACATAGA TATATATAT ACGTACCAT ATATTA + AAATTATCAGT ATACATA ATATACAT ATATACATA ACATA + ATATACATA ATCATGATA CCAGATACATAT + +each row is combination to be scored. + ## Customizable variables range of mers, min and max @@ -45,7 +86,7 @@ current\_run | Not Enabled | specify the run you want to run steps on min\_mer\_range | 6 | minimum mer size to use max\_mer\_range | 12 | maximum mer size to use max\_mer\_distance | 5000 | maximum distance between mers in foreground -output\_directory | $PWD/$foreground\_$background/ | ex. if fg is Bacillus.fasta and bg is HumanGenome.fasta then folder would be $PWD/Bacillus.fasta\_HumanGenome\_output.fasta/ +output\_directory | $foreground\_$background/ | ex. if fg is Bacillus.fasta and bg is HumanGenome.fasta then folder would be $PWD/Bacillus.fasta\_HumanGenome\_output.fasta/ counts\_directory | $output\_directory/.tmp | directory for counts directory tmp\_directory | $output\_directory/.tmp | temporary files directory max\_melting\_temp | 30° | maximum melting temp of mers diff --git a/src/score_mers.py b/src/score_mers.py index aaf8e13..5c5416a 100755 --- a/src/score_mers.py +++ b/src/score_mers.py @@ -2,6 +2,8 @@ import sys import os +import argparse + from multiprocessing import Pool from multiprocessing import cpu_count @@ -26,9 +28,6 @@ output_file = "" # import our variables cpus = int(os.environ.get("cpus", cpu_count())) debug = os.environ.get("debug", False) -min_mer_range = int(os.environ.get("min_mer_range", 6)) -max_mer_range = int(os.environ.get("max_mer_range", 12)) -min_mer_count = int(os.environ.get("min_mer_count", 0)) max_select = int(os.environ.get("max_select", 15)) max_check = int(os.environ.get("max_check", 35)) max_mer_distance = int(os.environ.get("max_mer_distance", 5000)) @@ -115,26 +114,7 @@ def populate_locations(selected_mers, mer_dic, input_fn): mer_dic[selected_mers[int(mer)]].append(int(pos)) merlist_fh.close() - - - -def filter_mers(combination): - ''' - filter out mers that are either inside other mers, - or don't fit the heterodimer requirement. - ''' - for combo in combinations(combination, 2): - if heterodimer_dic[combo]: - return True - - for mer in combination: - for other_mer in combination: - if not mer == other_mer: - if mer in other_mer: - return True - - return False def check_feasible(selected): total = 0 @@ -159,9 +139,53 @@ def percentage(part, whole, precision=2): return str(percent) + "%" -def score_mers(selected): - import time +def write_header(fh): + fh.write("Combination\tScore\tFG_mean_dist\tFG_stdev_dist\tBG_ratio\n") +def write_result(fh, score_res): + combination, score_val, fg_mean_dist, fg_stddev_dist, bg_ratio = score_res + fh.write(str(combination) + "\t") + fh.write(str(score_val) + "\t") + fh.write(str(fg_mean_dist) + "\t") + fh.write(str(fg_stddev_dist) + "\t") + fh.write(str(bg_ratio) + "\n") + +def print_rejected(total_reject, total_checked, total_scored, excluded): + print "" + print "Reasons mers were excluded:\n" + print " max distance: " + percentage(excluded[0], total_reject) + " (" + str(excluded[0]) + ")" + print " mers overlap: " + percentage(excluded[1], total_reject) + " (" + str(excluded[1]) + ")" + print " heterodimers: " + percentage(excluded[2], total_reject) + " (" + str(excluded[2]) + ")" + print "" + print " total combinations checked: ", total_checked + print " total combinations scored: ", total_scored + print " percent rejected: " + percentage(total_reject, total_checked) + print "" + +def score_specific_combinations(mers): + + total_scored = 0 + total_checked = 0 + excluded = [0, 0, 0] + + p = Pool(cpus) + + fh = open(output_file, 'wb') + write_header(fh) + + score_it = p.map(score, mers) + for score_res in score_it: + if type(score_res) is list: + total_scored += 1 + write_result(fh, score_res) + else: + excluded[score_res] += 1; + + total_reject = len(mers) - total_scored + print_rejected(total_reject, len(mers), total_scored, excluded) + +def score_all_combinations(selected): + import time total_scored = 0 total_checked = 0 @@ -172,7 +196,7 @@ def score_mers(selected): p = Pool(cpus) fh = open(output_file, 'wb') - fh.write("Combination\tScore\tFG_mean_dist\tFG_stdev_dist\tBG_ratio\n") + write_header(fh) for select_n in range(1, max_select+1): print "scoring size ", select_n, @@ -182,28 +206,15 @@ def score_mers(selected): total_checked += 1 if type(score_res) is list: total_scored += 1 - combination, score_val, fg_mean_dist, fg_stddev_dist, bg_ratio = score_res - fh.write(str(combination) + "\t") - fh.write(str(score_val) + "\t") - fh.write(str(fg_mean_dist) + "\t") - fh.write(str(fg_stddev_dist) + "\t") - fh.write(str(bg_ratio) + "\n") + write_result(fh, score_res) else: excluded[score_res] += 1; print "size ", select_n, "took:", time.time() - t total_reject = total_checked - total_scored - print "" - print "Reasons mers were excluded:\n" - print " max distance: " + percentage(excluded[0], total_reject) + " (" + str(excluded[0]) + ")" - print " mers overlap: " + percentage(excluded[1], total_reject) + " (" + str(excluded[1]) + ")" - print " heterodimers: " + percentage(excluded[2], total_reject) + " (" + str(excluded[2]) + ")" - print "" - print " total combinations checked: ", total_checked - print " total combinations scored: ", total_scored - print " percent rejected: " + percentage(total_reject, total_checked) - print "" + + print_rejected(total_reject, total_checked, total_scored, excluded) if(total_scored == 0): print "NO RESULTS FOUND" @@ -329,52 +340,92 @@ def main(): ''' global fg_genome_length global bg_genome_length + global seq_ends global output_file - if len(sys.argv) == 5: - selectivity_fn = sys.argv[1] - fg_fasta_fn = sys.argv[2] - bg_fasta_fn = sys.argv[3] - output_file = sys.argv[4] - else: - print "please specify your inputs" - print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file" + parser = argparse.ArgumentParser(description="score mers") + parser.add_argument("-f", "--foreground", help="foreground fasta file", required=True) + parser.add_argument("-b", "--background", help="background fasta file", required=True) + parser.add_argument("-o", "--output", help="output fasta with UIDs in the file", required=True) + parser.add_argument("-s", "--selectivity-file", help="mer selectivity file generated by select_mers.py", required=False) + parser.add_argument("-c", "--combination-file", help="a set of combinations you want to score", required=False) + + args = parser.parse_args() + + if args.selectivity_file is None and args.combination_file is None: + print "you must either have a selectivity file or a combination file to score from" exit() + if args.selectivity_file is not None and args.combination_file is not None: + print "you can only select either a selectivity file or a combination file to score from" + exit() + + output_file = args.output + + print "Getting genome length" + fg_genome_length = get_length(args.foreground) + bg_genome_length = get_length(args.background) + + print "Populating sequence end points" + seq_ends = load_end_points(args.foreground) - fg_genome_length = get_length(fg_fasta_fn) - bg_genome_length = get_length(bg_fasta_fn) - selectivity_fh = open(selectivity_fn, "r") + if(args.selectivity_file is not None): + + print "Scoring all mer combinations" + + selectivity_fh = open(args.selectivity_file, "r") - # load our mer list into python - mer_selectivity = selectivity_fh.readlines() + # load our mer list into python + mer_selectivity = selectivity_fh.readlines() - # get the last max_check (it's sorted) - selected_mers = mer_selectivity[-max_check:] + # get the last max_check (it's sorted) + selected_mers = mer_selectivity[-max_check:] - # load it into our fg and bg counts into their dictionaries - for mer in selected_mers: - split_mer = mer.split() - fg_mers[split_mer[0]] = [] - bg_mers[split_mer[0]] = int(split_mer[2]) + # load it into our fg and bg counts into their dictionaries + for mer in selected_mers: + split_mer = mer.split() + fg_mers[split_mer[0]] = [] + bg_mers[split_mer[0]] = int(split_mer[2]) - selected_mers = [x.split()[0] for x in selected_mers] + selected_mers = [x.split()[0] for x in selected_mers] - print "Populating sequence end points" - global seq_ends - seq_ends = load_end_points(fg_fasta_fn) + print "Populating foreground locations" + populate_locations(selected_mers, fg_mers, args.foreground) - print "Populating foreground locations" - populate_locations(selected_mers, fg_mers, fg_fasta_fn) + print "calculating heterodimer distances" + load_heterodimer_dic(selected_mers) - print "calculating heterodimer distances" - load_heterodimer_dic(selected_mers) + print "scoring mer combinations" + score_all_combinations(selected_mers) + + else: + print "Scoring specific mer combinations" + + combinations = [] + + combination_fh = open(args.combination_file, "r") + for line in combination_fh: + mers = line.split() + combinations.append(mers) + for mer in mers: + fg_mers[mer] = [] + bg_mers[mer] = [] + + print "calculating heterodimer distances" + load_heterodimer_dic(fg_mers.keys()) - print "scoring mer combinations" - score_mers(selected_mers) + print "Populating foreground locations" + populate_locations(fg_mers.keys(), fg_mers, args.foreground) + + print "Populating background locations" + populate_locations(fg_mers.keys(), bg_mers, args.background) + + for mer in bg_mers: + bg_mers[mer] = len(bg_mers[mer]) + + score_specific_combinations(combinations) print "output_file:", output_file - if __name__ == "__main__": sys.exit(main()) diff --git a/src/score_wrapper.sh b/src/score_wrapper.sh index 1c6f856..ee7e40d 100755 --- a/src/score_wrapper.sh +++ b/src/score_wrapper.sh @@ -10,7 +10,7 @@ function clean_exit { } trap clean_exit SIGINT -score_mers.py $1 $2 $3 $4 &