Friday, November 12, 2010

Code to find restriction sites

Having downloaded a list of restriction enzymes from REBASE earlier today for another post, I couldn't resist writing a short program to find restriction sites in a DNA sequence. The input enzyme data looks like this:

AarI                           CACCTGC (4/8)
AatII GACGT^C
AbsI CC^TCGAGG
AccI GT^MKAC
AceIII CAGCTC (7/11)
AciI CCGC (-3/-1)

We ignore the numbers (4/8) etc. To make it more interesting, I used a simple form of Python's regular expressions (docs here), which are pre-compiled.

The results can be limited to non-degenerate six-cutters (or longer) and sorted by the index of the match. It took a bit more than an hour, and was great fun. Naturally, the code hasn't been tested carefully. (Note: 0-based indexing).

output:

$ python script.py
Eco47III AGCGCT 11 AGCGCT
HaeII RGCGCY 11 AGCGCT
TsoI TARCCA 23 TAACCA
HgiCI GGYRCC 35 GGCACC
AflIII ACRYGT 56 ACGCGT
MluI ACGCGT 56 ACGCGT
AclI AACGTT 62 AACGTT
BclI TGATCA 83 TGATCA
BspGI CTGGAC 93 CTGGAC
MstI TGCGCA 107 TGCGCA
BsgI GTGCAG 120 GTGCAG
HindII GTYRAC 140 GTCAAC
BspMI ACCTGC 190 ACCTGC

code listing:

import re
sample_dna = '''
ATGACCCTTTTAGCGCTCGGTATTAACCATAAAACGGCACCTGTATCGCT
GCGAGAACGCGTAACGTTTTCGCCGGACACGCTTGATCAGGCGCTGGACA
GCCTGCTTGCGCAGCCAATGGTGCAGGGCGGGGTCGTGCTGTCAACCTGT
AACCGTACAGAGCTGTATCTGAGCGTGGAAGAGCAGGATAACCTGCAAGA'''

# load data from downloaded file
# http://rebase.neb.com/rebase/link_proto
def load_data(fn = 'link_proto.txt'):
FH = open(fn,'r')
data = FH.read()
FH.close()
L = data.strip().split('\n\n')
return L

# parse data into names and sites
def preprocess(data):
L = data.strip().split('\n')
names = list()
sites = list()
for e in L:
n,s = e.split(' ',1)
names.append(n)
words = s.strip().split()
if words[0][0] == '(':
sites.append(words[1])
else:
sites.append(words[0])
return names,sites

# codes for degenerate positions
# http://www.bioinformatics.org/sms/iupac.html
def get_pattern(s):
D = { 'A':'A','C':'C','G':'G','T':'T',
'R':'[AG]','Y':'[CT]','N':'.',
'S':'[GC]','W':'[AT]','K':'[GT]',
'M':'[AC]','B':'[CGT]','D':'[AGT]',
'H':'[ACT]','V':'[ACG]',
'^':''}
rL = [D[c] for c in s]
return ''.join(rL)

# dictionary of pre-compiled regexps
def make_dict(data):
names,sites = preprocess(data)
D = dict()
for n,s in zip(names,sites):
p = get_pattern(s)
p = re.compile(p)
i = s.find('^')
s = s.replace('^','')
rD = { 'name':n,'site':s,
'pattern':p,'i':i }
D[n] = rD
return D

def search(dna,D,minlength=4,ignore_ambig=True):
N = max([len(n) for n in D.keys()])
M = max([len(D[n]['site']) for n in D])
rL = list()
for n in D:
rD = D[n]
n,p,s = rD['name'], rD['pattern'],rD['site']
if len(s) < minlength:
continue
if ignore_ambig and 'N' in s:
continue
m = p.search(dna)
if m:
i = m.start()
j = i + len(rD['site'])
e = [n.ljust(N),s.ljust(M)]
e += [i,dna[i:j]]
rL.append(e)
rL.sort()
return rL

def show(result):
def f(s): return s[2] # index of match
result = sorted(result, key=f)
for line in result:
# left i as int, so convert to str
i = line[2]
line[2] = str(i).rjust(4)
print ' '.join(line)

if __name__ == '__main__':
dna = ''.join([c for c in sample_dna if c in 'ACGT'])
data = load_data()
L = data[3].strip() # type II enzymes only
D = make_dict(L)

result = search(dna,D,minlength=6)
show(result)