blob: d03ff96d9c54402c412d96b19e4899e686588bc7 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
|
#!/usr/bin/env bash
#
# Usage: <script> FOREGROUND BACKGROUND
#
# The two genome files may also be supplied through the `foreground` and
# `background` environment variables. The positional arguments are only
# consulted when BOTH variables are empty or unset.

if [[ -z "$foreground" && -z "$background" ]]; then
  if (( $# < 2 )); then
    echo "please supply two genomes, foreground and background" >&2
    exit 1
  fi
  # ":=" rather than "=": the guard above treats an empty value like an unset
  # one, so an empty-but-set variable must be replaced by the argument too.
  : "${foreground:=$1}"
  : "${background:=$2}"
fi

# Both genomes must be existing regular files; foreground is checked first.
for genome in "$foreground" "$background"; do
  if [[ ! -f "$genome" ]]; then
    echo "Could not open $genome" >&2
    exit 1
  fi
done
# Tunable settings. Every assignment below uses "${name=default}", so a value
# already present in the environment (even an empty one) is kept and the
# default only applies when the variable is unset.

# output directory (default: <foreground file name>_<background file name>)
: "${output_directory=$(basename "$foreground")_$(basename "$background")}"

# temp directory
: "${tmp_directory=$output_directory/.tmp}"

# directory to store our counts and sorted counts
: "${counts_directory=$tmp_directory}"

# range of mers, min and max
: "${min_mer_range=6}"
: "${max_mer_range=10}"

# max mer distance, the distance between two mers in our selected outputs
: "${max_mer_distance=5000}"

# min/maximum kmer melting point
: "${max_melting_temp=30}"
: "${min_melting_temp=18}"

# minimum mer count
: "${min_mer_count=0}"

# maximum mers to pick
: "${max_select=15}"

# mers to specifically IGNORE, space delimited
: "${ignore_mers=}"

# maximum number of mers that are consecutively binding
: "${max_consecutive_binding=4}"

# Make the settings visible in the environment of child processes.
# max_consecutive_binding is not exported; later in the script it is passed to
# filter_max_consecutive_binding.py as a command-line argument instead.
export ignore_mers \
  min_mer_range max_mer_range \
  max_select min_mer_count max_mer_distance \
  max_melting_temp min_melting_temp
# Create the output, counts and temporary directories.
# "mkdir -p" succeeds when a directory already exists and also creates missing
# parents, so nested paths supplied through the environment work and no
# separate existence test is needed.
mkdir -p -- "$output_directory" "$counts_directory" "$tmp_directory"

# Every invocation writes its results into a sub-directory named after the
# current Unix timestamp. This used to be prefixed with "$output_", a variable
# that is never assigned, so the effective name was already the bare timestamp.
current_run=$(date +%s)
mkdir -p -- "$output_directory/$current_run"
# For each genome: write a pre-processed copy (one header line followed by the
# sequence joined into a single line), then produce k-mer counts for every
# size from min_mer_range to max_mer_range and concatenate them into
# "<counts>-counts".
for fasta_file in "$foreground" "$background"; do

  counts=$counts_directory/$(basename "$fasta_file")
  tmp=$tmp_directory/$(basename "$fasta_file")

  echo "pre-processing $fasta_file"

  # check if our preprocessed file exists; an existing one is reused as-is
  if [[ ! -f "$tmp" ]]; then
    echo "> pre processed $fasta_file" >> "$tmp"
    # drop the ">" header lines and strip all newlines from the sequence
    grep -v "^>" "$fasta_file" | tr -d '\n' >> "$tmp"
  fi

  # The combined counts file is rebuilt on every run. -f keeps the very first
  # run, when the file does not exist yet, from printing an error.
  rm -f -- "$counts-counts"

  # run counts if they haven't been created
  for mer in $(seq "$min_mer_range" "$max_mer_range"); do
    if [[ ! -e "$counts-counts-$mer" ]]; then
      echo "checking $mer mers for $fasta_file"
      kmer_total_count -i "$tmp" -k "$mer" -l -n >> "$counts-counts-$mer"
    else
      echo "$mer mers already done for $fasta_file"
    fi

    # per-size results (fresh or cached) are appended to the combined file
    cat "$counts-counts-$mer" >> "$counts-counts"
  done
done
# Paths used by the remaining steps of the script.
fg_counts=$counts_directory/$(basename "$foreground")-counts
bg_counts=$counts_directory/$(basename "$background")-counts
fg_tmp=$tmp_directory/$(basename "$foreground")
bg_tmp=$tmp_directory/$(basename "$background")
selected=$output_directory/$current_run/selected-mers

# remove ignored mers
if [[ -n "$ignore_mers" ]]; then
  echo "removing ignored mers: $ignore_mers"
  # ignore_mers is a space-delimited list; it is intentionally left unquoted
  # so that the shell splits it into one word per mer.
  for mer in $ignore_mers; do
    # Delete every line that starts with the mer followed by a tab, in both
    # combined counts files.
    # NOTE: "\t" in the pattern and a suffix-less "-i" rely on GNU sed.
    sed -i "/^${mer}\t/d" "$fg_counts" "$bg_counts"
  done
fi
echo "outputing current run parameters"
# Append one "<name> <value>" line per setting to this run's parameters file.
# "${!var}" is indirect expansion: the value of the variable named by $var.
for var in \
  ignore_mers min_mer_range max_consecutive_binding max_mer_range \
  max_select min_mer_count max_mer_distance \
  max_melting_temp min_melting_temp \
  foreground background; do
  printf '%s %s\n' "$var" "${!var}"
done >> "$output_directory/$current_run/parameters"
# Intermediate files are named after each genome's file name only. Embedding
# the full $foreground / $background path would point into sub-directories of
# $output_directory that are never created (or outside of it for "../" paths)
# whenever a genome is not given as a bare file name in the current directory.
fg_prefix=$output_directory/$(basename "$foreground")-counts
bg_prefix=$output_directory/$(basename "$background")-counts

# The filter_*, select_mers.py and score_mers.py commands are invoked by bare
# name and therefore have to be reachable through PATH.

echo "checking if mers are within the melting range $min_melting_temp $max_melting_temp"
filter_melting_range "$min_melting_temp" "$max_melting_temp" \
  < "$fg_counts" > "$fg_prefix-non-melting"
filter_melting_range "$min_melting_temp" "$max_melting_temp" \
  < "$bg_counts" > "$bg_prefix-non-melting"

echo "filtering out elements that have more consecutive binding mers than allowed by default $max_consecutive_binding"
filter_max_consecutive_binding.py "$max_consecutive_binding" \
  < "$fg_prefix-non-melting" > "$fg_prefix-filtered-binding"
filter_max_consecutive_binding.py "$max_consecutive_binding" \
  < "$bg_prefix-non-melting" > "$bg_prefix-filtered-binding"

echo "scoring mer selectivity"
# arguments: foreground counts, foreground sequence, background counts,
# background sequence; stdout is captured as this run's selected-mers file
select_mers.py \
  "$fg_prefix-filtered-binding" "$fg_tmp" \
  "$bg_prefix-filtered-binding" "$bg_tmp" \
  > "$selected"

echo "scoring top mers based on selectivity"
score_mers.py "$selected" "$fg_tmp" "$bg_tmp" \
  "$output_directory/$current_run/scores-output"
|