#!/usr/bin/env python
"""Filter mers by their maximum run of consecutively binding bases.

Reads lines from stdin, takes the first whitespace-separated field of each
line as a mer, and echoes the line to stdout only when the mer's maximum
consecutive self-binding is strictly below the cutoff given as the first
command-line argument.

Originally added as src/filter_max_consecutive_binding.py in commit
94d04a1e503121a98b403f882c18a4f0799267d7 ("add filtering based on
consecutive mer lengths", Calvin Morrison, 2014-01-29).
"""
import os
import sys

# Base -> base it binds to.  '_' is a padding placeholder that never binds:
# its "complement" is False, which compares unequal to every character.
binding = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', '_': False}


def max_consecutive_binding(mer1, mer2):
    """Return the longest run of consecutive binding bases between two mers.

    The longer mer is complemented and the shorter mer is reversed; the two
    are then slid past each other through every overlapping alignment, and
    the longest stretch of position-by-position matches is returned.  The
    result does not depend on the order of the arguments.

    Every character of the longer mer must be a key of ``binding``;
    otherwise a KeyError is raised.  Unknown characters in the shorter mer
    simply never bind.  If either mer is empty the result is 0.
    """
    # Work with the longer mer as the fixed strand.
    if len(mer2) > len(mer1):
        mer1, mer2 = mer2, mer1

    if not mer2:
        return 0

    complement = [binding[base] for base in mer1]
    probe = mer2[::-1]
    strand_len = len(complement)
    probe_len = len(probe)

    max_bind = 0
    # ``offset`` is the index in ``complement`` that lines up with probe[0].
    # Negative offsets let the probe overhang the left end of the strand and
    # large offsets let it overhang the right end; both are needed so that
    # no overlapping alignment is skipped.
    for offset in range(1 - probe_len, strand_len):
        first = max(0, -offset)
        last = min(probe_len, strand_len - offset)
        consecutive = 0
        for x in range(first, last):
            if complement[offset + x] == probe[x]:
                consecutive += 1
                if consecutive > max_bind:
                    max_bind = consecutive
            else:
                consecutive = 0

    return max_bind


def test():
    """Print a tab-separated pass/fail table for known mer pairs."""
    #     mer 1            mer 2            correct ans
    arr = [
        ("ATATAT", "TATATA", 5),
        ("ACAGGGAT", "ATATATAT", 2),
        ("CATATATAT", "ATATATATATAT", 8),
        ("ATATATATATAT", "ATATATATAT", 10),
        ("ATATAT", "TATAT", 5),
        ("AACGATACCATG", "GGATCATACGTA", 3),
        ("CGT", "ACG", 3),
        ("ACG", "CGT", 3),
        ("CACC", "GGTGT", 4),
        ("GGTGT", "CACC", 4),
        # Alignments that need a left overhang or a large offset.
        ("AAC", "TTG", 2),
        ("GGGGGAT", "AT", 2),
    ]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('pass\tmer1\tmer2\tres\tcorr')
    for mer1, mer2, expected in arr:
        ans = max_consecutive_binding(mer1, mer2)
        response = [str(ans == expected), mer1, mer2, str(ans), str(expected)]
        print('\t'.join(response))


def main():
    """Run the stdin -> stdout filter; return the process exit status.

    Returns 1 when the cutoff argument is missing and 0 otherwise.  A
    cutoff that is not an integer raises ValueError from ``int``.
    """
    if len(sys.argv) < 2:
        # Report on stderr so the message never mixes with filtered output.
        sys.stderr.write("cutoff is expected as an argument\n")
        return 1

    cutoff = int(sys.argv[1])

    for line in sys.stdin:
        fields = line.split()
        if not fields:
            # Blank line: there is no mer to evaluate, so drop it.
            continue
        mer = fields[0]
        if max_consecutive_binding(mer, mer) < cutoff:
            sys.stdout.write(line)

    return 0


if __name__ == "__main__":
    sys.exit(main())