SelectiveWholeGenomeAmplification


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379

#!/usr/bin/env bash

set -e
# arguments:

# remove lock if error exit
exit_handler() {
	rmdir $tmp_directory/counts-lock
	exit 1
}

arg() {
		echo -e "\e[32m$@\e[39m"
}
# check_non_empty
check_non_empty() {
	if [[ ! -s $1 ]]; then
		echo "Warning: no mers remain after the '$2' filter!"
		echo "Exiting..."
		exit 1
	fi
}
# check_mers filename
check_mers() {

	local fasta_file="$1"
	local counts="$2"
	local mer=0

	echo "  counting mers in $fasta_file:"
	echo -e "\e[32m"

	#  remove the counts file so we can concatenate
	if [[ -e "$counts"-counts ]]; then
		echo "   removing $counts-counts"
		rm "$counts"-counts
	fi

	# check each mer size and process if not already run
	lock $tmp_directory/counts-lock
	for (( mer = min_mer_range; mer <= max_mer_range; mer++)) ; do 
		if [[ ! -e "$counts"-counts-"$mer" ]]; then
			echo "   checking $mer mers for $fasta_file"
			kmer_total_count -c -i "$fasta_file" -k "$mer" -l -n > "$counts"-counts-"$mer" || exit_handler
		else 
			echo "   $mer-mers already done for $fasta_file (assuming no change)"
		fi
		
		# concatentate 
		cat "$counts"-counts-"$mer" >> "$counts"-counts

	echo -e "\e[39m"
	done
	rmdir $tmp_directory/counts-lock
}

all=1

# Parse in our arguments
if [[ -z "$foreground" ]] && [[ -z "$background" ]]; then 
	if (( $# < 2 )); then  
		echo "please supply two genomes, foreground and background"
		exit 1
	fi;

	: ${foreground=$1}
	: ${background=$2}
	start=3
else
	start=1
fi

if [[ -n "$step_mers" ]] || [[ -n "$step_filters" ]] || [[ -n "$step_select" ]] || [[ -n "$step_score" ]]; then
	unset all
fi

if (( $# != 0 )); then

	for i in "${@:$start}"; do 
		if [[ "$i" = "1" ]] || [[ "$i" = "count" ]]; then
			step=1;	step_mers=1
		fi

		if [[ "$i" = "2" ]] || [[ "$i" = "filter" ]]; then
			step=1;	step_filters=1
		fi

		if [[ "$i" = "3" ]] || [[ "$i" = "select" ]]; then
			step=1;	step_select=1
		fi

		if [[ "$i" = "4" ]] || [[ "$i" = "score" ]]; then
			step=1; step_score=1
		fi

		if [[ $step ]] && [[ ! "$current_run" ]] && [[ ! $step_mers ]]; then
			echo "Error: If you are going to step through your program, and aren't starting" \
			"at the first step, you better specify what previous run you want to use" \
			"as your base"
			exit
		fi
		
		if [[ -n $step ]]; then
			unset all
		fi

		done;
fi;

if [[ -n "$step" ]] && [[ -z "$step_mers" ]] &&  [[ -z "$step_filters" ]] && [[ -z $step_select ]] && [[ -z "$step_score" ]]; then
	echo "Error: you need to select at least one step to run."
	exit 
fi

echo 
echo "Planning on running these steps:"
for var in step_mers step_filters step_select step_score all; do
	if [[ -n "${!var}" ]]; then
		echo '  '$var
	fi
done

# output directory 
: ${output_directory=$(basename "$foreground")_$(basename "$background")}

# temp directory 
: ${tmp_directory="$output_directory"/.tmp}

# directory to store our counts and sorted counts
: ${counts_directory="$tmp_directory"}

# range of mers, min and max 
: ${min_mer_range=6}
: ${max_mer_range=12}

# max mer distance, the distance between two mers in our selected outputs
: ${max_mer_distance=5000}

# min/maximum kmer meling point
: ${max_melting_temp=30}
: ${min_melting_temp=0}

# Concentrations of solution
: ${dna_con=5000}
: ${na_con=10}
: ${mg_con=20}
: ${dntps_con=10}

# minimum average binding distance in the foreground
: ${min_foreground_binding_average=50000}

# maximum mers to pick
: ${max_select=15}

# maximum mers to check
: ${max_check=35}

# mers to specifically IGNORE, space delimited
: ${ignore_mers=''}

# IGNORE all mers that are in these files, space delimited
: ${ignore_all_mers_from_files}

# maximum number of mers that are consecutively binding
: ${max_consecutive_binding=4}

# fg_weight, how much to weight to give the higher bindnig primers
: ${fg_weight=0}

# primer_weight, how much weight to give to sets with a higher number of primers. (between 0 and 1)
: ${primer_weight=0}

# output_top_nb, How many scored sets would you like in the top_scored_sets output file (Default = 10000)?
: ${output_top_nb=10000}

# score_func: A custom scoring function. disable by default. See README.md
: ${score_func="(nb_primers**primer_weight) * (fg_mean_dist * fg_std_dist) / bg_ratio"}

# sort score by the minimum or maximum value. acceptable parameters are min or max.
: ${sort_by="min"}

# bg_ratio
: ${min_bg_ratio=0}

# max_bg_mers
: ${max_bg_mers=-1}

export ignore_mers
export min_mer_range
export max_mer_range

export max_select

export min_foreground_binding_average
export max_mer_distance

export max_melting_temp 
export min_melting_temp 

export fg_weight
export primer_weight

# check foreground and background
if [[ ! -f "$foreground" ]]; then
	echo "Error: could not open foreground: $foreground"
	exit 1
fi

if [[ ! -f "$background" ]]; then
	echo "Error: could not open background: $background"
	exit 1
fi


if [[ -n "$current_run" ]] && [[ ! -d "$output_directory/$current_run" ]]; then
	echo -n "run $current_run was not found, it should be a folder here: "
	echo "$output_directory/$current_run"
	exit
fi

if [[ "$sort_by" = "min" ]]; then
	sort=''
elif [[ "$sort_by" = "max" ]]; then
	sort="-r"
else
	echo "Error: \$sort_by must either be set to max or min"
	exit
fi

num=1
if [[ -z "$current_run" ]]; then 
	while [[ -d $output_directory/run_$num ]] ; do
		let num++
	done
	current_run=run_$num
fi

fg_basename=$(basename "$foreground")
bg_basename=$(basename "$background")

fg_counts=$counts_directory/$fg_basename-counts
bg_counts=$counts_directory/$bg_basename-counts

final_fg_counts=$output_directory/$current_run/$fg_basename-filtered-counts
selected=$output_directory/$current_run/selected-mers

ignore_mers_counts="$output_directory/$current_run/passes-filter/1-$fg_basename-ignore-mers"
ignore_all_mers_counts="$output_directory/$current_run/passes-filter/2-$fg_basename-ignore-all-mers"
average_binding="$output_directory/$current_run/passes-filter/3-$fg_basename-average-binding"
non_melting="$output_directory/$current_run/passes-filter/4-$fg_basename-non-melting"
consecutive_binding="$output_directory/$current_run/passes-filter/5-$fg_basename-consecutive-binding"
bg_filtered="$output_directory/$current_run/passes-filter/6-$fg_basename-bg-filtered"

# Make our output directory
if [[ ! -d "$output_directory" ]]; then
	mkdir "$output_directory"
fi

# Make our counts directory
if [[ ! -d "$counts_directory" ]]; then
	mkdir "$counts_directory"
fi

# Make our temporary directory
if [[ ! -d $tmp_directory ]]; then
	mkdir "$tmp_directory"
fi

# Make our current run directory 
if [[ ! -d $output_directory/$current_run ]]; then
	mkdir "$output_directory"/"$current_run" 
fi

# Make our filter directory 
if [[ ! -d "$output_directory/$current_run/passes-filter" ]]; then
	mkdir "$output_directory/$current_run/passes-filter"
fi

echo "Outputting current run parameters"
	for var in score_func ignore_mers ignore_all_mers_from_files min_mer_range max_mer_range max_check cpus max_consecutive_binding max_select min_foreground_binding_average max_mer_distance min_melting_temp max_melting_temp dna_con na_con mg_con dntps_con foreground background; do 
	echo "$var" "${!var}" >> "$output_directory"/"$current_run"/parameters
done;

echo "current run is: $current_run"
echo

if [[ -n "$step_mers" ]] || [[ -n "$all" ]]; then
	# to continue this project you need to use the current run.

	echo "Step 1: counting primers in foreground and background"
	check_mers "$foreground" "$counts_directory/$(basename "$foreground")"
	check_mers "$background" "$counts_directory/$(basename "$background")"
fi

if [[ -n "$step_filters" ]] || [[ -n "$all" ]]; then
	if [[ ! -f "$fg_counts" ]]; then
		echo "Error: you need to run your count step before filtration"
		exit
	fi
	echo "Step 2: filtering mers"


	# remove ignored mers
	if [[ "$ignore_mers" ]]; then
		echo "  filtering explicitly ignored mers: $ignore_mers"
		cat "$fg_counts" | remove_mers.py $ignore_mers > "$ignore_mers_counts"
	else
		cp "$fg_counts" "$ignore_mers_counts"
	fi

	check_non_empty "$ignore_mers_counts" "ignore mers" 

	# create full ignore_all_counts
	cp "$ignore_mers_counts" "$ignore_all_mers_counts"
	# remove ignored mers
	if [[ "$ignore_all_mers_from_files" ]]; then
		for ignore_file in $ignore_all_mers_from_files; do
			if [[ -f "$ignore_file" ]]; then 

				# check mers from next ignore file
				counts="$counts_directory/ignore-"$(basename "$ignore_file")
				check_mers "$ignore_file" "$counts"
				counts="$counts-counts"

				echo "  filtering ignored mers from: $ignore_file"
				cat "$ignore_all_mers_counts" | remove_mers_from_file.py "$counts"> "$ignore_all_mers_counts-tmp"
				mv "$ignore_all_mers_counts-tmp" "$ignore_all_mers_counts"
				check_non_empty "$ignore_all_mers_counts" "ignore all mers from file $ignore_file"
			else 
				echo "  $ignore_file not found, continuing..."
			fi

		done
	fi
	check_non_empty "$ignore_all_mers_counts" "ignore all mers from file "
	
	echo "  filtering mers that appear less frequently than the average binding site distance ($min_foreground_binding_average)"
	filter_average_binding.py "$ignore_all_mers_counts" "$min_foreground_binding_average" < "$fg_counts"  > "$average_binding" || exit 1
	check_non_empty "$average_binding" "average binding"

	echo "  filtering mers that are not in the melting range ($min_melting_temp-$max_melting_temp)"
  filter_melting_temperature.py -m "$min_melting_temp" -x "$max_melting_temp" -d "$dna_con" -g "$mg_con" -n "$na_con" -p "$dntps_con" < "$average_binding" > "$non_melting" || exit 1
	check_non_empty "$non_melting" "melting temperature"

	echo "  filtering mers that have more consecutive binding mers than allowed ($max_consecutive_binding)"
	filter_max_consecutive_binding.py "$max_consecutive_binding" < "$non_melting" > "$consecutive_binding" || exit 1
	check_non_empty "$consecutive_binding" "consecutive binding"

	echo "  filtering mers that have more bg mers than allowed ($max_bg_mers)"
	filter_max_bg_mers.py "$max_bg_mers" "$bg_counts" < "$consecutive_binding" > "$bg_filtered" || exit 1
	check_non_empty "$bg_filtered" "background filtered"

	cp $bg_filtered $final_fg_counts
fi

if [[ -n "$step_select" ]] || [[ -n "$all" ]]; then
	if [[ ! -f "$final_fg_counts" ]]; then
		echo "Error: you need to run your filtration step before selection"
		exit
	fi
	echo "Step 3: Scoring mer selectivity"
	select_mers.py "$final_fg_counts" "$bg_counts" > "$selected" || exit 1
fi

if [[ -n "$step_score" ]] || [[ -n "$all" ]]; then
	if [[ ! -f "$selected" ]]; then
		echo "Error: you need to run your selection step before you run your scoring"
		exit
	fi

	echo "Step 4: Scoring top mers based on selectivity"
	score_wrapper.sh "$selected" "$foreground" "$background" "$output_directory/$current_run/all-scores" || exit 1

	# output our sorted scores
	echo "sorting and outputting top $output_top_nb scores"
	echo "top scores output file: $output_directory/$current_run/top-scores"
	head -n 3 $output_directory/$current_run/all-scores > $output_directory/$current_run/top-scores
	tail -n +4 $output_directory/$current_run/all-scores | sort $sort -t $'\t' -nk 3 | head -n $output_top_nb >> $output_directory/$current_run/top-scores
fi