#!/usr/bin/env bash set -e # arguments: # remove lock if error exit exit_handler() { rmdir $tmp_directory/counts-lock exit 1 } arg() { echo -e "\e[32m$@\e[39m" } # check_non_empty check_non_empty() { if [[ ! -s $1 ]]; then echo "Warning: no mers remain after the '$2' filter!" echo "Exiting..." exit 1 fi } # check_mers filename check_mers() { local fasta_file="$1" local counts="$2" local mer=0 echo " counting mers in $fasta_file:" echo -e "\e[32m" # remove the counts file so we can concatenate if [[ -e "$counts"-counts ]]; then echo " removing $counts-counts" rm "$counts"-counts fi # check each mer size and process if not already run lock $tmp_directory/counts-lock for (( mer = min_mer_range; mer <= max_mer_range; mer++)) ; do if [[ ! -e "$counts"-counts-"$mer" ]]; then echo " checking $mer mers for $fasta_file" kmer_total_count -c -i "$fasta_file" -k "$mer" -l -n > "$counts"-counts-"$mer" || exit_handler else echo " $mer-mers already done for $fasta_file (assuming no change)" fi # concatentate cat "$counts"-counts-"$mer" >> "$counts"-counts echo -e "\e[39m" done rmdir $tmp_directory/counts-lock } all=1 # Parse in our arguments if [[ -z "$foreground" ]] && [[ -z "$background" ]]; then if (( $# < 2 )); then echo "please supply two genomes, foreground and background" exit 1 fi; : ${foreground=$1} : ${background=$2} start=3 else start=1 fi if [[ -n "$step_mers" ]] || [[ -n "$step_filters" ]] || [[ -n "$step_select" ]] || [[ -n "$step_score" ]]; then unset all fi if (( $# != 0 )); then for i in "${@:$start}"; do if [[ "$i" = "1" ]] || [[ "$i" = "count" ]]; then step=1; step_mers=1 fi if [[ "$i" = "2" ]] || [[ "$i" = "filter" ]]; then step=1; step_filters=1 fi if [[ "$i" = "3" ]] || [[ "$i" = "select" ]]; then step=1; step_select=1 fi if [[ "$i" = "4" ]] || [[ "$i" = "score" ]]; then step=1; step_score=1 fi if [[ $step ]] && [[ ! "$current_run" ]] && [[ ! $step_mers ]]; then echo "Error: If you are going to step through your program, and aren't starting" \ "at the first step, you better specify what previous run you want to use" \ "as your base" exit fi if [[ -n $step ]]; then unset all fi done; fi; if [[ -n "$step" ]] && [[ -z "$step_mers" ]] && [[ -z "$step_filters" ]] && [[ -z $step_select ]] && [[ -z "$step_score" ]]; then echo "Error: you need to select at least one step to run." exit fi echo echo "Planning on running these steps:" for var in step_mers step_filters step_select step_score all; do if [[ -n "${!var}" ]]; then echo ' '$var fi done # output directory : ${output_directory=$(basename "$foreground")_$(basename "$background")} # temp directory : ${tmp_directory="$output_directory"/.tmp} # directory to store our counts and sorted counts : ${counts_directory="$tmp_directory"} # range of mers, min and max : ${min_mer_range=6} : ${max_mer_range=12} # max mer distance, the distance between two mers in our selected outputs : ${max_mer_distance=5000} # min/maximum kmer meling point : ${max_melting_temp=30} : ${min_melting_temp=0} # Concentrations of solution : ${dna_con=5000} : ${na_con=10} : ${mg_con=20} : ${dntps_con=10} # minimum average binding distance in the foreground : ${min_foreground_binding_average=50000} # maximum mers to pick : ${max_select=15} # maximum mers to check : ${max_check=35} # mers to specifically IGNORE, space delimited : ${ignore_mers=''} # IGNORE all mers that are in these files, space delimited : ${ignore_all_mers_from_files} # maximum number of mers that are consecutively binding : ${max_consecutive_binding=4} # fg_weight, how much to weight to give the higher bindnig primers : ${fg_weight=0} # primer_weight, how much weight to give to sets with a higher number of primers. (between 0 and 1) : ${primer_weight=0} # output_top_nb, How many scored sets would you like in the top_scored_sets output file (Default = 10000)? : ${output_top_nb=10000} # score_func: A custom scoring function. disable by default. See README.md : ${score_func="(nb_primers**primer_weight) * (fg_mean_dist * fg_std_dist) / bg_ratio"} # sort score by the minimum or maximum value. acceptable parameters are min or max. : ${sort_by="min"} # bg_ratio : ${min_bg_ratio=0} # max_bg_mers : ${max_bg_mers=-1} export ignore_mers export min_mer_range export max_mer_range export max_select export min_foreground_binding_average export max_mer_distance export max_melting_temp export min_melting_temp export fg_weight export primer_weight # check foreground and background if [[ ! -f "$foreground" ]]; then echo "Error: could not open foreground: $foreground" exit 1 fi if [[ ! -f "$background" ]]; then echo "Error: could not open background: $background" exit 1 fi if [[ -n "$current_run" ]] && [[ ! -d "$output_directory/$current_run" ]]; then echo -n "run $current_run was not found, it should be a folder here: " echo "$output_directory/$current_run" exit fi if [[ "$sort_by" = "min" ]]; then sort='' elif [[ "$sort_by" = "max" ]]; then sort="-r" else echo "Error: \$sort_by must either be set to max or min" exit fi num=1 if [[ -z "$current_run" ]]; then while [[ -d $output_directory/run_$num ]] ; do let num++ done current_run=run_$num fi fg_basename=$(basename "$foreground") bg_basename=$(basename "$background") fg_counts=$counts_directory/$fg_basename-counts bg_counts=$counts_directory/$bg_basename-counts final_fg_counts=$output_directory/$current_run/$fg_basename-filtered-counts selected=$output_directory/$current_run/selected-mers ignore_mers_counts="$output_directory/$current_run/passes-filter/1-$fg_basename-ignore-mers" ignore_all_mers_counts="$output_directory/$current_run/passes-filter/2-$fg_basename-ignore-all-mers" average_binding="$output_directory/$current_run/passes-filter/3-$fg_basename-average-binding" non_melting="$output_directory/$current_run/passes-filter/4-$fg_basename-non-melting" consecutive_binding="$output_directory/$current_run/passes-filter/5-$fg_basename-consecutive-binding" bg_filtered="$output_directory/$current_run/passes-filter/6-$fg_basename-bg-filtered" # Make our output directory if [[ ! -d "$output_directory" ]]; then mkdir "$output_directory" fi # Make our counts directory if [[ ! -d "$counts_directory" ]]; then mkdir "$counts_directory" fi # Make our temporary directory if [[ ! -d $tmp_directory ]]; then mkdir "$tmp_directory" fi # Make our current run directory if [[ ! -d $output_directory/$current_run ]]; then mkdir "$output_directory"/"$current_run" fi # Make our filter directory if [[ ! -d "$output_directory/$current_run/passes-filter" ]]; then mkdir "$output_directory/$current_run/passes-filter" fi echo "Outputting current run parameters" for var in score_func ignore_mers ignore_all_mers_from_files min_mer_range max_mer_range max_check cpus max_consecutive_binding max_select min_foreground_binding_average max_mer_distance min_melting_temp max_melting_temp dna_con na_con mg_con dntps_con foreground background; do echo "$var" "${!var}" >> "$output_directory"/"$current_run"/parameters done; echo "current run is: $current_run" echo if [[ -n "$step_mers" ]] || [[ -n "$all" ]]; then # to continue this project you need to use the current run. echo "Step 1: counting primers in foreground and background" check_mers "$foreground" "$counts_directory/$(basename "$foreground")" check_mers "$background" "$counts_directory/$(basename "$background")" fi if [[ -n "$step_filters" ]] || [[ -n "$all" ]]; then if [[ ! -f "$fg_counts" ]]; then echo "Error: you need to run your count step before filtration" exit fi echo "Step 2: filtering mers" # remove ignored mers if [[ "$ignore_mers" ]]; then echo " filtering explicitly ignored mers: $ignore_mers" cat "$fg_counts" | remove_mers.py $ignore_mers > "$ignore_mers_counts" else cp "$fg_counts" "$ignore_mers_counts" fi check_non_empty "$ignore_mers_counts" "ignore mers" # create full ignore_all_counts cp "$ignore_mers_counts" "$ignore_all_mers_counts" # remove ignored mers if [[ "$ignore_all_mers_from_files" ]]; then for ignore_file in $ignore_all_mers_from_files; do if [[ -f "$ignore_file" ]]; then # check mers from next ignore file counts="$counts_directory/ignore-"$(basename "$ignore_file") check_mers "$ignore_file" "$counts" counts="$counts-counts" echo " filtering ignored mers from: $ignore_file" cat "$ignore_all_mers_counts" | remove_mers_from_file.py "$counts"> "$ignore_all_mers_counts-tmp" mv "$ignore_all_mers_counts-tmp" "$ignore_all_mers_counts" check_non_empty "$ignore_all_mers_counts" "ignore all mers from file $ignore_file" else echo " $ignore_file not found, continuing..." fi done fi check_non_empty "$ignore_all_mers_counts" "ignore all mers from file " echo " filtering mers that appear less frequently than the average binding site distance ($min_foreground_binding_average)" filter_average_binding.py --fasta "$foreground" --minimum "$min_foreground_binding_average" --counts "$ignore_all_mers_counts" > "$average_binding" || exit 1 check_non_empty "$average_binding" "average binding" echo " filtering mers that are not in the melting range ($min_melting_temp-$max_melting_temp)" filter_melting_temperature.py -m "$min_melting_temp" -x "$max_melting_temp" -d "$dna_con" -g "$mg_con" -n "$na_con" -p "$dntps_con" < "$average_binding" > "$non_melting" || exit 1 check_non_empty "$non_melting" "melting temperature" echo " filtering mers that have more consecutive binding mers than allowed ($max_consecutive_binding)" filter_max_consecutive_binding.py "$max_consecutive_binding" < "$non_melting" > "$consecutive_binding" || exit 1 check_non_empty "$consecutive_binding" "consecutive binding" echo " filtering mers that have more bg mers than allowed ($max_bg_mers)" filter_max_bg_mers.py "$max_bg_mers" "$bg_counts" < "$consecutive_binding" > "$bg_filtered" || exit 1 check_non_empty "$bg_filtered" "background filtered" cp $bg_filtered $final_fg_counts fi if [[ -n "$step_select" ]] || [[ -n "$all" ]]; then if [[ ! -f "$final_fg_counts" ]]; then echo "Error: you need to run your filtration step before selection" exit fi echo "Step 3: Scoring mer selectivity" select_mers.py "$final_fg_counts" "$bg_counts" > "$selected" || exit 1 fi if [[ -n "$step_score" ]] || [[ -n "$all" ]]; then if [[ ! -f "$selected" ]]; then echo "Error: you need to run your selection step before you run your scoring" exit fi echo "Step 4: Scoring top mers based on selectivity" score_wrapper.sh "$selected" "$foreground" "$background" "$output_directory/$current_run/all-scores" || exit 1 # output our sorted scores echo "sorting and outputting top $output_top_nb scores" echo "top scores output file: $output_directory/$current_run/top-scores" head -n 3 $output_directory/$current_run/all-scores > $output_directory/$current_run/top-scores tail -n +4 $output_directory/$current_run/all-scores | sort $sort -t $'\t' -nk 3 | head -n $output_top_nb >> $output_directory/$current_run/top-scores fi