aboutsummaryrefslogtreecommitdiff
path: root/SelectiveGenomeAmplification
blob: b66d06c18be6ca9f4ca6353edcab848a1d7425aa (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env bash

# Resolve the two input genomes.
#
# Each genome may be given through the environment ($foreground, $background)
# or positionally ($1 = foreground, $2 = background). A non-empty value from
# the environment takes precedence over the positional argument. Each variable
# is resolved on its own, so supplying only one of them through the
# environment still lets the other one come from the command line.
: "${foreground:=${1-}}"
: "${background:=${2-}}"

if [[ -z $foreground || -z $background ]]; then
	echo "please supply two genomes, foreground and background" >&2
	exit 1
fi

# Both genomes must be existing regular files; diagnostics go to stderr.
for genome in "$foreground" "$background"; do
	if [[ ! -f $genome ]]; then
		echo "Could not open $genome" >&2
		exit 1
	fi
done

# Run configuration.
#
# Every assignment below uses the ${name=default} form, so a variable that is
# already set in the caller's environment (even to an empty string) is kept
# and only unset variables receive the default.

# output directory, named <foreground file name>_<background file name>.
# The paths are quoted so that a genome path containing whitespace is handed
# to basename as a single operand.
: "${output_directory=$(basename -- "$foreground")_$(basename -- "$background")}"

# temp directory
: "${tmp_directory=$output_directory/.tmp}"

# directory to store our counts and sorted counts
: "${counts_directory=$tmp_directory}"

# range of mers, min and max
: "${min_mer_range=6}"
: "${max_mer_range=12}"

# max mer distance, the distance between two mers in our selected outputs
: "${max_mer_distance=5000}"

# min/maximum kmer melting point
: "${max_melting_temp=30}"
: "${min_melting_temp=0}"

# minimum average binding distance in the foreground
: "${min_foreground_binding_average=50000}"

# maximum mers to pick
: "${max_select=15}"

# maximum mers to check
: "${max_check=35}"

# mers to specifically IGNORE, space delimited
: "${ignore_mers=}"

# maximum number of mers that are consecutively binding
: "${max_consecutive_binding=4}"

# Make the settings visible to the child processes started by this script.
# NOTE(review): max_check and max_consecutive_binding receive defaults above
# but are not exported here; confirm no child script expects to inherit them.
export ignore_mers
export min_mer_range max_mer_range
export max_select
export min_foreground_binding_average max_mer_distance
export max_melting_temp min_melting_temp


# Make our output, counts and temporary directories.
# `mkdir -p` succeeds when the directory already exists and creates missing
# parents, so user-supplied nested paths work. The script stops if a
# directory cannot be created instead of failing later in a confusing way.
for dir in "$output_directory" "$counts_directory" "$tmp_directory"; do
	mkdir -p -- "$dir" || exit 1
done

# Every invocation writes its results into a sub-directory of the output
# directory named after the current Unix timestamp (seconds since the epoch).
# The previous `$output_` prefix referred to a variable that is never defined
# and therefore contributed nothing to the name.
current_run=$(date +%s)
mkdir -p -- "$output_directory/$current_run" || exit 1

# Pre-process both genomes: for every mer size in
# [$min_mer_range, $max_mer_range] make sure a per-size counts file
# "<counts_directory>/<genome file name>-counts-<size>" exists, then
# concatenate all sizes into "<counts_directory>/<genome file name>-counts".
for fasta_file in "$foreground" "$background"; do

	counts=$counts_directory/$(basename -- "$fasta_file")

	echo "pre-processing $fasta_file"

	# the combined counts file is rebuilt from the per-size files on every run
	rm -f -- "$counts-counts"

	for mer in $(seq "$min_mer_range" "$max_mer_range"); do
		mer_counts=$counts-counts-$mer

		# run counts if they haven't been created
		if [ ! -e "$mer_counts" ]; then
			echo "checking $mer mers for $fasta_file"
			# Count into a scratch file and move it into place only on
			# success. Otherwise a failed or interrupted count would leave
			# an empty or partial file that later runs treat as a valid
			# cached result.
			if ! kmer_continuous_count -c -i "$fasta_file" -k "$mer" -l -n > "$mer_counts.partial"; then
				rm -f -- "$mer_counts.partial"
				echo "kmer_continuous_count failed on $mer mers for $fasta_file" >&2
				exit 1
			fi
			mv -- "$mer_counts.partial" "$mer_counts" || exit 1
		else
			echo "$mer mers already done for $fasta_file (assuming $fasta_file didn't change)"
		fi

		cat -- "$mer_counts" >> "$counts-counts"

	done
done

# Combined (all mer sizes) count files written by the pre-processing loop
fg_counts=$counts_directory/$(basename -- "$foreground")-counts
bg_counts=$counts_directory/$(basename -- "$background")-counts

# file that receives the output of select_mers.py for this run
selected=$output_directory/$current_run/selected-mers

# remove ignored mers
if [ -n "$ignore_mers" ]; then
	echo "removing ignored mers: $ignore_mers"
	# $ignore_mers is intentionally left unquoted: it is a space delimited
	# list and word splitting yields one mer per iteration.
	for mer in $ignore_mers; do
		# Delete, in place, every line that starts with this mer followed
		# by a tab. Both `-i` without a suffix and `\t` rely on GNU sed.
		sed -i "/^${mer}\t/d" "$fg_counts"
		sed -i "/^${mer}\t/d" "$bg_counts"
	done
fi

# Record the settings used for this run in "<run directory>/parameters", one
# "name value" pair per line. ${!var} is bash indirect expansion: it yields
# the value of the variable whose name is stored in $var.
# min_foreground_binding_average is included because it is used as a filter
# threshold later in the run.
# NOTE(review): cpus and min_mer_count are not assigned anywhere in the
# visible part of this script; unless they are inherited from the environment
# they are recorded with an empty value.
echo "outputing current run parameters"
for var in ignore_mers min_mer_range max_check cpus max_consecutive_binding max_mer_range max_select min_mer_count min_foreground_binding_average max_mer_distance max_melting_temp min_melting_temp foreground background; do
	printf '%s %s\n' "$var" "${!var}"
done >> "$output_directory/$current_run/parameters"
 
# Selection pipeline. Each stage reads the file produced by the previous one
# and writes its own file into the run directory; the script stops at the
# first stage that exits with a non-zero status. Inputs are attached with `<`
# rather than `cat |`, so a missing input file also stops the run.
run_directory=$output_directory/$current_run
fg_name=$(basename -- "$foreground")

average_binding=$run_directory/${fg_name}-counts-average-binding
consecutive_binding=$run_directory/${fg_name}-counts-consecutive-binding
non_melting=$run_directory/${fg_name}-counts-non-melting

echo "checking if mers appear at least as often in the fg as the average binding site or more $min_foreground_binding_average"
filter_average_binding.py "$foreground" "$min_foreground_binding_average" \
	< "$fg_counts" > "$average_binding" || exit 1

echo "checking if mers are within the melting range $min_melting_temp $max_melting_temp"
filter_melting_temperature.py "$min_melting_temp" "$max_melting_temp" \
	< "$average_binding" > "$non_melting" || exit 1

echo "filtering out elements that have more consecutive binding mers than allowed by \$max_consecutive_binding $max_consecutive_binding"
filter_max_consecutive_binding.py "$max_consecutive_binding" \
	< "$non_melting" > "$consecutive_binding" || exit 1

echo "scoring mer selectivity"
select_mers.py "$consecutive_binding" "$bg_counts" > "$selected" || exit 1

echo "scoring top mers based on selectivity"
score_wrapper.sh "$selected" "$foreground" "$background" "$run_directory/scores-output" || exit 1