aboutsummaryrefslogtreecommitdiff
path: root/src/filter_melting_temperature.py
blob: 26f6b54267b2f70ae82e49703cd57cd65e983471 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python
import sys

"""Calculate the thermodynamic melting temperatures of nucleotide sequences."""

import math
def Tm_staluc(s, dnac=5000, na=10, mg=20, dNTPs=10):
		"""Returns DNA/DNA tm using nearest neighbor thermodynamics.

		dnac is DNA concentration [nM]
		na is salt concentration [mM].
		mg, the concentration of magnesium in mM
		dNTPs, the concentration of dNTPs in mM
		
		Sebastian Bassi <sbassi@genesdigitales.com>"""
		
		#Credits: 
		#Main author: Sebastian Bassi <sbassi@genesdigitales.com>
		#Overcount function: Greg Singer <singerg@tcd.ie>
		#Based on the work of Nicolas Le Novere <lenov@ebi.ac.uk> Bioinformatics.
		#17:1226-1227(2001)

		#This function returns better results than EMBOSS DAN because it uses
		#updated thermodynamics values and takes into account inicialization
		#parameters from the work of SantaLucia (1998).
		
		#Things to do:
		#+Detect complementary sequences. Change K according to result.
		#+Add support for heteroduplex (see Sugimoto et al. 1995).
		#+Correction for Mg2+. Now supports only monovalent ions.
		#+Put thermodinamics table in a external file for users to change at will
		#+Add support for danglings ends (see Le Novele. 2001) and mismatches.
		
		dh = 0 #DeltaH. Enthalpy
		ds = 0 #deltaS Entropy

		def tercorr(stri):
				deltah = 0
				deltas = 0
				#DNA/DNA
				#Allawi and SantaLucia (1997). Biochemistry 36 : 10581-10594
				if stri.startswith('G') or stri.startswith('C'):
						deltah -= 0.1
						deltas += 2.8
				elif stri.startswith('A') or stri.startswith('T'):
						deltah -= 2.3
						deltas -= 4.1
				if stri.endswith('G') or stri.endswith('C'):
						deltah -= 0.1
						deltas += 2.8
				elif stri.endswith('A') or stri.endswith('T'):
						deltah -= 2.3
						deltas -= 4.1
				dhL = dh + deltah
				dsL = ds + deltas
				return dsL,dhL

		def overcount(st,p):
				"""Returns how many p are on st, works even for overlapping"""
				ocu = 0
				x = 0
				while 1:
						try:
								i = st.index(p,x)
						except ValueError:
								break
						ocu += 1
						x = i + 1
				return ocu

		R = 1.987 # universal gas constant in Cal/degrees C*Mol
		sup = s.upper()
		vsTC,vh = tercorr(sup)
		vs = vsTC
		
		k = (dnac/4.0)*1e-9
		#With complementary check on, the 4.0 should be changed to a variable.
		
		#DNA/DNA
		#Allawi and SantaLucia (1997). Biochemistry 36 : 10581-10594
		vh = vh + (overcount(sup,"AA"))*7.9 + (overcount(sup,"TT"))*\
		7.9 + (overcount(sup,"AT"))*7.2 + (overcount(sup,"TA"))*7.2 \
		+ (overcount(sup,"CA"))*8.5 + (overcount(sup,"TG"))*8.5 + \
		(overcount(sup,"GT"))*8.4 + (overcount(sup,"AC"))*8.4
		vh = vh + (overcount(sup,"CT"))*7.8+(overcount(sup,"AG"))*\
		7.8 + (overcount(sup,"GA"))*8.2 + (overcount(sup,"TC"))*8.2
		vh = vh + (overcount(sup,"CG"))*10.6+(overcount(sup,"GC"))*\
		9.8 + (overcount(sup,"GG"))*8 + (overcount(sup,"CC"))*8
		vs = vs + (overcount(sup,"AA"))*22.2+(overcount(sup,"TT"))*\
		22.2 + (overcount(sup,"AT"))*20.4 + (overcount(sup,"TA"))*21.3
		vs = vs + (overcount(sup,"CA"))*22.7+(overcount(sup,"TG"))*\
		22.7 + (overcount(sup,"GT"))*22.4 + (overcount(sup,"AC"))*22.4
		vs = vs + (overcount(sup,"CT"))*21.0+(overcount(sup,"AG"))*\
		21.0 + (overcount(sup,"GA"))*22.2 + (overcount(sup,"TC"))*22.2
		vs = vs + (overcount(sup,"CG"))*27.2+(overcount(sup,"GC"))*\
		24.4 + (overcount(sup,"GG"))*19.9 + (overcount(sup,"CC"))*19.9
		ds = vs
		dh = vh
		
		fgc = lambda s: len(filter(lambda x: x =='G' or x =='C', s)) / float(len(s))

		tm = ((1000* (-dh))/(-ds+(R * (math.log(k)))))-273.15
		Mmg = mg * 1E-3
		Mna = na * 1E-3
		Mdntp = dNTPs * 1E-3

		Fmg = ((3E4 * Mdntp - 3E4 * Mmg + 1) + ((3E4 * Mdntp - 3E4 * Mmg + 1)**2 + 4 * 3E4 * Mmg)**0.5 )/ (2 * 3E4)
		cationratio = - Fmg**0.5 / Mna
		
		if cationratio < 0.22:
			SaltCorrectedTm = 1 / ( (1 / tm) + ((4.29 * fgc(s) - 3.95) * math.log(Mna) + 0.940 * (math.log(Mna))**2) * 1E-5 )
		else:
			a = 0; d = 0; g = 0;
			if cationratio < 6.0:
				a = 3.92 * (0.843 - 0.352 * (Mna)^0.5 * math.log(Mna) )
				d = 1.42 * (1.279 - 4.03 * math.log(Mna) * 1E-3 - 8.03 * (math.log(Mna))^2 * 1E-3 )
				g = 8.31 * (0.486 - 0.258 * math.log(Mna) + 5.25 * (math.log(Mna))^3 * 1E-3)
			elif cationratio > 6.0:
				a = 3.92
				d = 1.42
				g = 8.31

			SaltcorrectedTm = 1 / ( (1 / tm) + (a - 0.91 * math.log(Fmg) + fgc(s) * (6.26 + d * math.log(Fmg)) + 1/(2 * (len(s) - 1)) * (-48.2 + 52.5 * math.log(Fmg) + g * (math.log(Fmg))**2)) * 1E-5 )

		return tm

# naiive
def in_temp_range(kmer):

	A = kmer.count('A')
	C = kmer.count('C')
	G = kmer.count('G')
	T = kmer.count('T')

	melt_temp = 0.0;

	if len(kmer) < 13:
		melt_temp = ((A+T) * 2) + ((C+G) * 4)
	else:
		melt_temp = 64.9 + 41*(G+C-16.4)/(A+T+G+C)

	return min_melting_temp < melt_temp < max_melting_temp

min_melting_temp = float(sys.argv[1])
max_melting_temp = float(sys.argv[2])


if __name__ == "__main__":
	for line in sys.stdin:
		if min_melting_temp < Tm_staluc(line.split("\t")[0]) < max_melting_temp:
			sys.stdout.write(line)