From d4ec5459d0fc141d20a4bbbf0a7dc40742e0372f Mon Sep 17 00:00:00 2001 From: Calvin Morrison Date: Wed, 13 Aug 2014 16:59:37 -0400 Subject: add filter max bg mers --- SelectiveWholeGenomeAmplification | 10 +++++++++- src/filter_max_bg_mers.py | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100755 src/filter_max_bg_mers.py diff --git a/SelectiveWholeGenomeAmplification b/SelectiveWholeGenomeAmplification index 7a3d46b..8304387 100755 --- a/SelectiveWholeGenomeAmplification +++ b/SelectiveWholeGenomeAmplification @@ -182,6 +182,9 @@ done # bg_ratio : ${min_bg_ratio=0} +# max_bg_mers +: ${max_bg_mers=-1} + export ignore_mers export min_mer_range export max_mer_range @@ -246,6 +249,7 @@ ignore_all_mers_counts="$output_directory/$current_run/passes-filter/2-$fg_basen average_binding="$output_directory/$current_run/passes-filter/3-$fg_basename-average-binding" non_melting="$output_directory/$current_run/passes-filter/4-$fg_basename-non-melting" consecutive_binding="$output_directory/$current_run/passes-filter/5-$fg_basename-consecutive-binding" +bg_filtered="$output_directory/$current_run/passes-filter/6-$fg_basename-bg-filtered" # Make our output directory if [[ ! 
#!/usr/bin/env python2.7
"""Drop foreground mers that occur too often in the background.

Usage: filter_max_bg_mers.py CUTOFF BG_COUNTS < fg_counts > filtered_counts

Foreground "mer count" lines are read from stdin.  A mer is written to
stdout (as "mer<TAB>count") when it is absent from the BG_COUNTS file or
when its background count is less than or equal to CUTOFF.  A negative
CUTOFF disables the filter and copies stdin to stdout unchanged.
"""
import os
import sys


def _parse_counts(lines):
    """Yield (mer, count) pairs from whitespace-separated "mer count" lines.

    Blank lines are skipped.  Any other line that does not hold exactly two
    fields, or whose count is not an integer, raises ValueError.
    """
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        mer, count = fields
        yield mer, int(count)


def main():
    """Run the filter using sys.argv, sys.stdin and sys.stdout.

    Returns:
        0 on success, 1 when the cutoff or background counts file argument
        is missing.  The value is meant to be handed to sys.exit().

    Raises:
        ValueError: if the cutoff or a count field is not an integer, or a
            non-blank input line does not have exactly two fields.
        IOError/OSError: if the background counts file cannot be opened.
    """
    # Both positional arguments are required: argv[1] and argv[2] are read.
    if len(sys.argv) < 3:
        # stdout carries the filtered data, so diagnostics go to stderr and
        # the non-zero status lets the calling script notice the failure.
        sys.stderr.write("cutoff and bg_counts is expected as an argument\n")
        return 1

    cutoff = int(sys.argv[1])
    bg_count_fn = sys.argv[2]

    # A cutoff below zero means "do not filter" (so -1 can be the default);
    # zero cannot be used for that because it is a valid cutoff.
    if cutoff < 0:
        for line in sys.stdin:
            sys.stdout.write(line)
        return 0

    # Remember the foreground counts and the order in which mers first
    # appeared so the output order is deterministic and mirrors the input.
    fg_counts = {}
    fg_order = []
    for mer, count in _parse_counts(sys.stdin):
        if mer not in fg_counts:
            fg_order.append(mer)
        fg_counts[mer] = count

    # Stream the background file, keeping counts only for foreground mers so
    # memory use is bounded by the foreground set.
    bg_counts = {}
    with open(bg_count_fn, "r") as bg_count_fh:
        for mer, count in _parse_counts(bg_count_fh):
            if mer in fg_counts:
                bg_counts[mer] = count

    for mer in fg_order:
        bg_count = bg_counts.get(mer)
        # Mers never seen in the background always pass the filter.
        if bg_count is None or bg_count <= cutoff:
            sys.stdout.write(mer + '\t' + str(fg_counts[mer]) + '\n')

    return 0


if __name__ == "__main__":
    sys.exit(main())