aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCalvin Morrison <mutantturkey@gmail.com>2014-03-27 15:10:04 -0400
committerCalvin Morrison <mutantturkey@gmail.com>2014-03-27 15:10:04 -0400
commitf5ac0df9d42de657a64e15d3f0cfa2198a1921c6 (patch)
tree68609ff01dfac4a0f402d7b5e259b3a29bf23596
parent495228f7167a6df24a139022e7a0560a4dd07b56 (diff)
Add scoring for specific combinations.
- add scoring for specific combos - update readme - add CLI args for score_mers.py - update score_wrapper - use more generic print functions in score_mers
-rw-r--r--README.md51
-rwxr-xr-xsrc/score_mers.py195
-rwxr-xr-xsrc/score_wrapper.sh2
3 files changed, 170 insertions, 78 deletions
diff --git a/README.md b/README.md
index 81f7fc6..f6a4087 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,9 @@ SelectiveGenomeAmplification
PI: http://brisson.bio.upenn.edu/
+
+
+## Requirements
To use this you'll need:
- A unix environment
@@ -10,31 +13,69 @@ To use this you'll need:
- bash or compliant shell.
-Setup:
+## Setup
git clone git@github.com:mutantturkey/SelectiveGenomeAmplification.git
cd SelectiveGenomeAmplification
make
sudo make install
-Example Usage:
+## Usage Examples
+Standard use of (SGA) SelectiveGenomeAmplification is easy. it takes two arguments,
+the foreground and background
+
SelectiveGenomeAmplification PfalciparumGenome.fasta HumanGenome.fasta;
less PfalciparumGenome_HumanGenome/final_mers
-For user customizable variables:
+SGA allows for many tunable parameters, which are all explained in the chart
+below. For user customizable variables, they need to be passed in as
+environmental variables like so:
max_mer_distance=5000 max_select=6 min_mer_range=6 max_mer_range=12 \
SelectiveGenomeAmplification.sh PfalciparumGenome.fasta half.fasta
+SGA also comes with a easy to use user prompt called SelectiveGenomeAmplificationUI.
+It allows for a less expereienced user to use
+SGA without issue.
+
+### Running individual steps
By default SelectiveGenomeAmplification runs all four steps, but you can
-specify the program to run other steps, like score in this example.
+specify the program to run other steps, like in these examples.
current_run=run_1 SelectiveGenomeAmplification target.fasta bg.fasta score
+ current_run=run_1 SelectiveGenomeAmplification target.fasta bg.fasta select score
+
+ current_run=run_1 SelectiveGenomeAmplification target.fasta bg.fasta 3 4
+
+valid steps are these:
+
+- count (1)
+- filter (2)
+- select (3)
+- score (4)
+
This function does not try to be smart, so use it wisely
+
+### Manually scoring mer combinations
+
+Users can manually score combinations of mers they choose using the
+score\_mers.py script.
+
+ score_mers.py -f foreground.fa -b background.fa -c combination file -o output
+
+
+The combination file should look like this:
+
+ ACGATATAT TACATAGA TATATATAT ACGTACCAT ATATTA
+ AAATTATCAGT ATACATA ATATACAT ATATACATA ACATA
+ ATATACATA ATCATGATA CCAGATACATAT
+
+each row is combination to be scored.
+
## Customizable variables
range of mers, min and max
@@ -45,7 +86,7 @@ current\_run | Not Enabled | specify the run you want to run steps on
min\_mer\_range | 6 | minimum mer size to use
max\_mer\_range | 12 | maximum mer size to use
max\_mer\_distance | 5000 | maximum distance between mers in foreground
-output\_directory | $PWD/$foreground\_$background/ | ex. if fg is Bacillus.fasta and bg is HumanGenome.fasta then folder would be $PWD/Bacillus.fasta\_HumanGenome\_output.fasta/
+output\_directory | $foreground\_$background/ | ex. if fg is Bacillus.fasta and bg is HumanGenome.fasta then folder would be $PWD/Bacillus.fasta\_HumanGenome\_output.fasta/
counts\_directory | $output\_directory/.tmp | directory for counts directory
tmp\_directory | $output\_directory/.tmp | temporary files directory
max\_melting\_temp | 30° | maximum melting temp of mers
diff --git a/src/score_mers.py b/src/score_mers.py
index aaf8e13..5c5416a 100755
--- a/src/score_mers.py
+++ b/src/score_mers.py
@@ -2,6 +2,8 @@
import sys
import os
+import argparse
+
from multiprocessing import Pool
from multiprocessing import cpu_count
@@ -26,9 +28,6 @@ output_file = ""
# import our variables
cpus = int(os.environ.get("cpus", cpu_count()))
debug = os.environ.get("debug", False)
-min_mer_range = int(os.environ.get("min_mer_range", 6))
-max_mer_range = int(os.environ.get("max_mer_range", 12))
-min_mer_count = int(os.environ.get("min_mer_count", 0))
max_select = int(os.environ.get("max_select", 15))
max_check = int(os.environ.get("max_check", 35))
max_mer_distance = int(os.environ.get("max_mer_distance", 5000))
@@ -115,26 +114,7 @@ def populate_locations(selected_mers, mer_dic, input_fn):
mer_dic[selected_mers[int(mer)]].append(int(pos))
merlist_fh.close()
-
-
-
-def filter_mers(combination):
- '''
- filter out mers that are either inside other mers,
- or don't fit the heterodimer requirement.
- '''
- for combo in combinations(combination, 2):
- if heterodimer_dic[combo]:
- return True
-
- for mer in combination:
- for other_mer in combination:
- if not mer == other_mer:
- if mer in other_mer:
- return True
-
- return False
def check_feasible(selected):
total = 0
@@ -159,9 +139,53 @@ def percentage(part, whole, precision=2):
return str(percent) + "%"
-def score_mers(selected):
- import time
+def write_header(fh):
+ fh.write("Combination\tScore\tFG_mean_dist\tFG_stdev_dist\tBG_ratio\n")
+def write_result(fh, score_res):
+ combination, score_val, fg_mean_dist, fg_stddev_dist, bg_ratio = score_res
+ fh.write(str(combination) + "\t")
+ fh.write(str(score_val) + "\t")
+ fh.write(str(fg_mean_dist) + "\t")
+ fh.write(str(fg_stddev_dist) + "\t")
+ fh.write(str(bg_ratio) + "\n")
+
+def print_rejected(total_reject, total_checked, total_scored, excluded):
+ print ""
+ print "Reasons mers were excluded:\n"
+ print " max distance: " + percentage(excluded[0], total_reject) + " (" + str(excluded[0]) + ")"
+ print " mers overlap: " + percentage(excluded[1], total_reject) + " (" + str(excluded[1]) + ")"
+ print " heterodimers: " + percentage(excluded[2], total_reject) + " (" + str(excluded[2]) + ")"
+ print ""
+ print " total combinations checked: ", total_checked
+ print " total combinations scored: ", total_scored
+ print " percent rejected: " + percentage(total_reject, total_checked)
+ print ""
+
+def score_specific_combinations(mers):
+
+ total_scored = 0
+ total_checked = 0
+ excluded = [0, 0, 0]
+
+ p = Pool(cpus)
+
+ fh = open(output_file, 'wb')
+ write_header(fh)
+
+ score_it = p.map(score, mers)
+ for score_res in score_it:
+ if type(score_res) is list:
+ total_scored += 1
+ write_result(fh, score_res)
+ else:
+ excluded[score_res] += 1;
+
+ total_reject = len(mers) - total_scored
+ print_rejected(total_reject, len(mers), total_scored, excluded)
+
+def score_all_combinations(selected):
+ import time
total_scored = 0
total_checked = 0
@@ -172,7 +196,7 @@ def score_mers(selected):
p = Pool(cpus)
fh = open(output_file, 'wb')
- fh.write("Combination\tScore\tFG_mean_dist\tFG_stdev_dist\tBG_ratio\n")
+ write_header(fh)
for select_n in range(1, max_select+1):
print "scoring size ", select_n,
@@ -182,28 +206,15 @@ def score_mers(selected):
total_checked += 1
if type(score_res) is list:
total_scored += 1
- combination, score_val, fg_mean_dist, fg_stddev_dist, bg_ratio = score_res
- fh.write(str(combination) + "\t")
- fh.write(str(score_val) + "\t")
- fh.write(str(fg_mean_dist) + "\t")
- fh.write(str(fg_stddev_dist) + "\t")
- fh.write(str(bg_ratio) + "\n")
+ write_result(fh, score_res)
else:
excluded[score_res] += 1;
print "size ", select_n, "took:", time.time() - t
total_reject = total_checked - total_scored
- print ""
- print "Reasons mers were excluded:\n"
- print " max distance: " + percentage(excluded[0], total_reject) + " (" + str(excluded[0]) + ")"
- print " mers overlap: " + percentage(excluded[1], total_reject) + " (" + str(excluded[1]) + ")"
- print " heterodimers: " + percentage(excluded[2], total_reject) + " (" + str(excluded[2]) + ")"
- print ""
- print " total combinations checked: ", total_checked
- print " total combinations scored: ", total_scored
- print " percent rejected: " + percentage(total_reject, total_checked)
- print ""
+
+ print_rejected(total_reject, total_checked, total_scored, excluded)
if(total_scored == 0):
print "NO RESULTS FOUND"
@@ -329,52 +340,92 @@ def main():
'''
global fg_genome_length
global bg_genome_length
+ global seq_ends
global output_file
- if len(sys.argv) == 5:
- selectivity_fn = sys.argv[1]
- fg_fasta_fn = sys.argv[2]
- bg_fasta_fn = sys.argv[3]
- output_file = sys.argv[4]
- else:
- print "please specify your inputs"
- print "ex: score_mers.py selectivity_file fg_fasta bg_fasta output_file"
+ parser = argparse.ArgumentParser(description="score mers")
+ parser.add_argument("-f", "--foreground", help="foreground fasta file", required=True)
+ parser.add_argument("-b", "--background", help="background fasta file", required=True)
+ parser.add_argument("-o", "--output", help="output fasta with UIDs in the file", required=True)
+ parser.add_argument("-s", "--selectivity-file", help="mer selectivity file generated by select_mers.py", required=False)
+ parser.add_argument("-c", "--combination-file", help="a set of combinations you want to score", required=False)
+
+ args = parser.parse_args()
+
+ if args.selectivity_file is None and args.combination_file is None:
+ print "you must either have a selectivity file or a combination file to score from"
exit()
+ if args.selectivity_file is not None and args.combination_file is not None:
+ print "you can only select either a selectivity file or a combination file to score from"
+ exit()
+
+ output_file = args.output
+
+ print "Getting genome length"
+ fg_genome_length = get_length(args.foreground)
+ bg_genome_length = get_length(args.background)
+
+ print "Populating sequence end points"
+ seq_ends = load_end_points(args.foreground)
- fg_genome_length = get_length(fg_fasta_fn)
- bg_genome_length = get_length(bg_fasta_fn)
- selectivity_fh = open(selectivity_fn, "r")
+ if(args.selectivity_file is not None):
+
+ print "Scoring all mer combinations"
+
+ selectivity_fh = open(args.selectivity_file, "r")
- # load our mer list into python
- mer_selectivity = selectivity_fh.readlines()
+ # load our mer list into python
+ mer_selectivity = selectivity_fh.readlines()
- # get the last max_check (it's sorted)
- selected_mers = mer_selectivity[-max_check:]
+ # get the last max_check (it's sorted)
+ selected_mers = mer_selectivity[-max_check:]
- # load it into our fg and bg counts into their dictionaries
- for mer in selected_mers:
- split_mer = mer.split()
- fg_mers[split_mer[0]] = []
- bg_mers[split_mer[0]] = int(split_mer[2])
+ # load it into our fg and bg counts into their dictionaries
+ for mer in selected_mers:
+ split_mer = mer.split()
+ fg_mers[split_mer[0]] = []
+ bg_mers[split_mer[0]] = int(split_mer[2])
- selected_mers = [x.split()[0] for x in selected_mers]
+ selected_mers = [x.split()[0] for x in selected_mers]
- print "Populating sequence end points"
- global seq_ends
- seq_ends = load_end_points(fg_fasta_fn)
+ print "Populating foreground locations"
+ populate_locations(selected_mers, fg_mers, args.foreground)
- print "Populating foreground locations"
- populate_locations(selected_mers, fg_mers, fg_fasta_fn)
+ print "calculating heterodimer distances"
+ load_heterodimer_dic(selected_mers)
- print "calculating heterodimer distances"
- load_heterodimer_dic(selected_mers)
+ print "scoring mer combinations"
+ score_all_combinations(selected_mers)
+
+ else:
+ print "Scoring specific mer combinations"
+
+ combinations = []
+
+ combination_fh = open(args.combination_file, "r")
+ for line in combination_fh:
+ mers = line.split()
+ combinations.append(mers)
+ for mer in mers:
+ fg_mers[mer] = []
+ bg_mers[mer] = []
+
+ print "calculating heterodimer distances"
+ load_heterodimer_dic(fg_mers.keys())
- print "scoring mer combinations"
- score_mers(selected_mers)
+ print "Populating foreground locations"
+ populate_locations(fg_mers.keys(), fg_mers, args.foreground)
+
+ print "Populating background locations"
+ populate_locations(fg_mers.keys(), bg_mers, args.background)
+
+ for mer in bg_mers:
+ bg_mers[mer] = len(bg_mers[mer])
+
+ score_specific_combinations(combinations)
print "output_file:", output_file
-
if __name__ == "__main__":
sys.exit(main())
diff --git a/src/score_wrapper.sh b/src/score_wrapper.sh
index 1c6f856..ee7e40d 100755
--- a/src/score_wrapper.sh
+++ b/src/score_wrapper.sh
@@ -10,7 +10,7 @@ function clean_exit {
}
trap clean_exit SIGINT
-score_mers.py $1 $2 $3 $4 & </dev/null
+score_mers.py -s $1 -f $2 -b $3 -o $4 & </dev/null
pid=$!
wait $pid