blob: 85a166ffd1386fb1b9ba0eace32ba1e1ef882d12 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
|
#!/usr/bin/env bash
# Run the whole pipeline by default; the step-selection code further down
# unsets this when individual steps are requested.
all=run

# Parse in our arguments.
# The two genomes may be supplied through the environment (foreground /
# background). When neither is, they are taken from the first two positional
# arguments. "start" records the index of the first argument that is not a
# genome, i.e. where step selectors begin.
if [[ -z "$foreground" ]] && [[ -z "$background" ]]; then
  if (( $# < 2 )); then
    echo "please supply two genomes, foreground and background"
    exit 1
  fi
  # Plain assignment: both variables are known to be empty here, and a
  # "${var=...}" default would leave a set-but-empty variable untouched.
  foreground=$1
  background=$2
  start=3
else
  start=1
fi
#######################################
# Work out which pipeline steps were requested on the command line.
# With no step arguments nothing is changed, so "all" stays set and the whole
# pipeline runs.
# Globals:
#   start        (read)  index of the first step argument; treated as 1 if unset
#   current_run  (read)  name of an existing run to build on, if any
#   all          (unset) cleared as soon as any step argument is present
#   step, step_mers, step_filters, step_select, step_score (written)
# Arguments:
#   The script's positional parameters. From index "start" onwards each one
#   may be 1|count, 2|filter, 3|select or 4|score; other values set nothing.
# Outputs:
#   An error message on stdout when the selection cannot be used.
# Returns:
#   0 if the selection is usable, 1 otherwise.
#######################################
select_steps() {
  local first=${start:-1}
  local selector

  # Compare against "start" instead of a fixed 2, so that step arguments are
  # honoured when the genomes came from the environment (start=1).
  if (( $# < first )); then
    return 0
  fi

  unset all
  for selector in "${@:$first}"; do
    case "$selector" in
      1 | count) step=1; step_mers=1 ;;
      2 | filter) step=1; step_filters=1 ;;
      3 | select) step=1; step_select=1 ;;
      4 | score) step=1; step_score=1 ;;
    esac
  done

  # Arguments were given but none of them named a step.
  if [[ -z "$step" ]]; then
    echo "Error: you need to select at least one step to run."
    return 1
  fi

  # Checked once, after every argument has been read, so the order in which
  # the steps are listed does not matter.
  if [[ -z "$current_run" ]] && [[ -z "$step_mers" ]]; then
    echo "Error: If you are going to step through your program, and aren't starting" \
      "at the first step, you better specify what previous run you want to use" \
      "as your base"
    return 1
  fi

  return 0
}

# A bad selection is an error, so leave with a non-zero status.
select_steps "$@" || exit 1
# Tell the user which steps are about to run: a blank line, a heading, then
# the name of every selector variable that currently holds a value.
printf '\n%s\n' "Planning on running these steps:"
for var in step_mers step_filters step_select step_score all; do
  # ${!var} is the value of the variable whose name is stored in var
  [[ -z "${!var}" ]] || printf ' %s\n' "$var"
done
# Tunable parameters. Each one keeps a value that is already set by the
# caller (for example through the environment) and only otherwise falls back
# to the default below.

# output directory
: "${output_directory=$(basename "$foreground")_$(basename "$background")}"
# temp directory
: "${tmp_directory=$output_directory/.tmp}"
# directory to store our counts and sorted counts
: "${counts_directory=$tmp_directory}"
# range of mers, min and max
: "${min_mer_range=6}"
: "${max_mer_range=12}"
# max mer distance, the distance between two mers in our selected outputs
: "${max_mer_distance=5000}"
# min/maximum kmer melting point
: "${max_melting_temp=30}"
: "${min_melting_temp=0}"
# minimum average binding distance in the foreground
: "${min_foreground_binding_average=50000}"
# maximum mers to pick
: "${max_select=15}"
# maximum mers to check
: "${max_check=35}"
# mers to specifically IGNORE, space delimited
: "${ignore_mers=}"
# maximum number of mers that are consecutively binding
: "${max_consecutive_binding=4}"
# fg_weight, how much weight to give the higher binding primers
: "${fg_weight=0}"
# primer_weight, how much weight to give to sets with a higher number of primers. (between 0 and 1)
: "${primer_weight=0}"

# Make the parameters visible to the programs this script launches.
# max_check is exported as well: nothing in this script reads it, so without
# the export its default could never take effect.
# NOTE(review): presumably the scoring tools read max_check from the
# environment - confirm against score_wrapper.sh.
export ignore_mers
export min_mer_range
export max_mer_range
export max_select
export max_check
export min_foreground_binding_average
export max_mer_distance
export max_melting_temp
export min_melting_temp
export fg_weight
export primer_weight
printf '\n'

# Both genomes have to be existing regular files before any work starts;
# the foreground is checked first, then the background.
[[ -f "$foreground" ]] || {
  echo "Error: could not open $foreground"
  exit 1
}
[[ -f "$background" ]] || {
  echo "Error: could not open $background"
  exit 1
}
#######################################
# Settle on the run directory name for this invocation.
# A run named by the caller must already exist below the output directory.
# Otherwise the first unused name run_1, run_2, ... is chosen.
# Globals:
#   output_directory (read)
#   current_run      (read; written when it was empty)
# Outputs:
#   An error message on stdout when the requested run does not exist.
# Returns:
#   0 on success, 1 if the requested run directory is missing.
#######################################
resolve_current_run() {
  local num=1

  if [[ -n "$current_run" ]]; then
    if [[ ! -d "$output_directory/$current_run" ]]; then
      echo -n "run $current_run was not found, it should be a folder here: "
      echo "$output_directory/$current_run"
      return 1
    fi
    return 0
  fi

  # Skip over run numbers that are already taken.
  while [[ -d "$output_directory/run_$num" ]]; do
    (( num++ ))
  done
  current_run=run_$num
}

# A missing run is an error, so leave with a non-zero status.
resolve_current_run || exit 1
# File names of the two genomes with any leading directories removed; every
# derived file below is labelled with one of them.
fg_basename="$(basename "${foreground}")"
bg_basename="$(basename "${background}")"

# Combined k-mer count tables, kept in the counts directory.
fg_counts="${counts_directory}/${fg_basename}-counts"
bg_counts="${counts_directory}/${bg_basename}-counts"

# Files that belong to the current run: the selection result and the output
# of each of the three filters.
selected="${output_directory}/${current_run}/selected-mers"
average_binding="${output_directory}/${current_run}/${fg_basename}-counts-average-binding"
non_melting="${output_directory}/${current_run}/${fg_basename}-counts-non-melting"
consecutive_binding="${output_directory}/${current_run}/${fg_basename}-counts-consecutive-binding"
# Make the directories this run writes into, in this order: the output
# directory, the counts directory, the temporary directory and the directory
# of the current run.
# mkdir -p creates missing parent directories and succeeds when the directory
# already exists, so custom nested locations work as well. A directory that
# cannot be created stops the script here rather than failing later.
for dir in \
  "$output_directory" \
  "$counts_directory" \
  "$tmp_directory" \
  "$output_directory/$current_run"; do
  if ! mkdir -p -- "$dir"; then
    echo "Error: could not create directory $dir"
    exit 1
  fi
done
# Record the settings of this invocation: one "name value" line per parameter
# is appended to the "parameters" file inside the current run directory.
printf '%s\n' "Outputting current run parameters"
for var in ignore_mers min_mer_range max_mer_range max_check cpus max_consecutive_binding max_select min_foreground_binding_average max_mer_distance min_melting_temp max_melting_temp foreground background; do
  # ${!var} is the value of the variable whose name is stored in var
  printf '%s %s\n' "$var" "${!var}" >> "${output_directory}/${current_run}/parameters"
done

printf 'current run is: %s\n\n' "$current_run"
#######################################
# Step 1: count k-mers in the foreground and the background genome.
# For every k from min_mer_range to max_mer_range a per-k counts file
# "<counts_directory>/<genome file name>-counts-<k>" is produced with
# kmer_continuous_count, unless it already exists. The per-k files are then
# concatenated, in ascending k order, into
# "<counts_directory>/<genome file name>-counts".
# Globals (read):
#   foreground, background, counts_directory, min_mer_range, max_mer_range
# Outputs:
#   Progress messages on stdout; the counts files described above.
# Returns:
#   0 on success, 1 if kmer_continuous_count fails.
#######################################
run_count_step() {
  local fasta_file counts mer per_mer_counts

  echo "Step 1: counting primers in foreground and background"
  for fasta_file in "$foreground" "$background"; do
    counts="$counts_directory"/$(basename "$fasta_file")
    echo "counting mers in $fasta_file"

    # Remove the combined counts file once per genome, before the loop, so
    # the appends below rebuild it with every mer size. Removing it inside
    # the loop would leave only the last mer size in the file.
    rm -f -- "$counts"-counts

    # check each mer size and process if not already run
    for (( mer = min_mer_range; mer <= max_mer_range; mer++ )); do
      per_mer_counts="$counts"-counts-"$mer"
      if [[ ! -e "$per_mer_counts" ]]; then
        echo "checking $mer mers for $fasta_file"
        if ! kmer_continuous_count -c -i "$fasta_file" -k "$mer" -l -n > "$per_mer_counts"; then
          # Do not leave a partial file behind: the next run would take it
          # for a finished count and skip this mer size.
          rm -f -- "$per_mer_counts"
          return 1
        fi
      else
        echo "$mer mers already done for $fasta_file (assuming $fasta_file didn't change)"
      fi
      # concatenate
      cat "$per_mer_counts" >> "$counts"-counts
    done
  done
}

if [[ -n "$step_mers" ]] || [[ -n "$all" ]]; then
  # to continue this project you need to use the current run.
  run_count_step || exit 1
fi
#######################################
# Step 2: pass the foreground counts through the three filters.
# Lines for the mers listed in ignore_mers are first deleted, in place, from
# both counts files. The foreground counts then go through
# filter_average_binding.py, filter_melting_temperature.py and
# filter_max_consecutive_binding.py; each filter reads the previous filter's
# output file.
# Globals (read):
#   fg_counts, bg_counts, ignore_mers, foreground,
#   min_foreground_binding_average, min_melting_temp, max_melting_temp,
#   max_consecutive_binding, average_binding, non_melting, consecutive_binding
# Outputs:
#   Progress and error messages on stdout; the three filter output files.
# Returns:
#   0 on success, 1 if the counts are missing or a filter fails.
#######################################
run_filter_step() {
  local mer

  if [[ ! -f "$fg_counts" ]]; then
    echo "Error: you need to run your count step before filtration"
    return 1
  fi

  echo "Step 2: Filtering mer combinations based on parameters"

  # remove ignored mers
  if [[ -n "$ignore_mers" ]]; then
    echo "removing ignored mers: $ignore_mers"
    # left unquoted on purpose: ignore_mers is a space delimited list
    for mer in $ignore_mers; do
      sed -i '/^'"$mer"'\t/d' "$fg_counts"
      sed -i '/^'"$mer"'\t/d' "$bg_counts"
    done
  fi

  echo "checking if mers appear at least as often in the fg as the average binding site or more $min_foreground_binding_average"
  filter_average_binding.py "$foreground" "$min_foreground_binding_average" < "$fg_counts" > "$average_binding" || return 1

  echo "checking if mers are within the melting range $min_melting_temp $max_melting_temp"
  filter_melting_temperature.py "$min_melting_temp" "$max_melting_temp" < "$average_binding" > "$non_melting" || return 1

  echo "filtering out elements that have more consecutive binding mers than allowed by \$max_consecutive_binding $max_consecutive_binding"
  filter_max_consecutive_binding.py "$max_consecutive_binding" < "$non_melting" > "$consecutive_binding" || return 1
}

if [[ -n "$step_filters" ]] || [[ -n "$all" ]]; then
  # Any failure, including missing counts, ends the script with a non-zero status.
  run_filter_step || exit 1
fi
#######################################
# Step 3: run select_mers.py on the filtered foreground mers and the
# background counts, saving its output as the selection for this run.
# Globals (read):
#   consecutive_binding, bg_counts, selected
# Outputs:
#   Progress and error messages on stdout; the file named by "selected".
# Returns:
#   0 on success, non-zero if the filter output is missing or
#   select_mers.py fails.
#######################################
run_select_step() {
  if [[ ! -f "$consecutive_binding" ]]; then
    echo "Error: you need to run your filtration step before selection"
    return 1
  fi

  echo "Step 3: Scoring mer selectivity"
  select_mers.py "$consecutive_binding" "$bg_counts" > "$selected"
}

if [[ -n "$step_select" ]] || [[ -n "$all" ]]; then
  # Any failure, including a missing input, ends the script with a non-zero status.
  run_select_step || exit 1
fi
#######################################
# Step 4: hand the selected mers and both genomes to score_wrapper.sh.
# The fourth argument passed to it is "scores-output" inside the current run
# directory.
# Globals (read):
#   selected, foreground, background, output_directory, current_run
# Outputs:
#   Progress and error messages on stdout, plus whatever score_wrapper.sh
#   produces.
# Returns:
#   0 on success, non-zero if the selection is missing or score_wrapper.sh
#   fails.
#######################################
run_score_step() {
  if [[ ! -f "$selected" ]]; then
    echo "Error: you need to run your selection step before you run your scoring"
    return 1
  fi

  echo "Step 4: Scoring top mers based on selectivity"
  score_wrapper.sh "$selected" "$foreground" "$background" "$output_directory"/"$current_run"/scores-output
}

if [[ -n "$step_score" ]] || [[ -n "$all" ]]; then
  # Any failure, including a missing input, ends the script with a non-zero status.
  run_score_step || exit 1
fi
|