Python for Bioinformatics: Code to find restriction sites

Friday, November 12, 2010

Code to find restriction sites

Having downloaded a list of restriction enzymes from REBASE earlier today for another post, I couldn't resist writing a short program to find restriction sites in a DNA sequence. The input enzyme data looks like this:

AarI                           CACCTGC (4/8)
AatII                          GACGT^C
AbsI                           CC^TCGAGG
AccI                           GT^MKAC
AceIII                         CAGCTC (7/11)
AciI                           CCGC (-3/-1)

We ignore the numbers (4/8) etc. To make it more interesting, I used a simple form of Python's regular expressions (docs here), which are pre-compiled.

The results can be limited to non-degenerate six-cutters (or longer) and sorted by the index of the match. It took a bit more than an hour, and was great fun. Naturally, the code hasn't been tested carefully. (Note: 0-based indexing).

output:

$ python script.py
Eco47III   AGCGCT             11  AGCGCT
HaeII      RGCGCY             11  AGCGCT
TsoI       TARCCA             23  TAACCA
HgiCI      GGYRCC             35  GGCACC
AflIII     ACRYGT             56  ACGCGT
MluI       ACGCGT             56  ACGCGT
AclI       AACGTT             62  AACGTT
BclI       TGATCA             83  TGATCA
BspGI      CTGGAC             93  CTGGAC
MstI       TGCGCA            107  TGCGCA
BsgI       GTGCAG            120  GTGCAG
HindII     GTYRAC            140  GTCAAC
BspMI      ACCTGC            190  ACCTGC

code listing:

import re
sample_dna = '''
 ATGACCCTTTTAGCGCTCGGTATTAACCATAAAACGGCACCTGTATCGCT
 GCGAGAACGCGTAACGTTTTCGCCGGACACGCTTGATCAGGCGCTGGACA
 GCCTGCTTGCGCAGCCAATGGTGCAGGGCGGGGTCGTGCTGTCAACCTGT
 AACCGTACAGAGCTGTATCTGAGCGTGGAAGAGCAGGATAACCTGCAAGA'''

# load data from downloaded file
# http://rebase.neb.com/rebase/link_proto
def load_data(fn = 'link_proto.txt'):
    FH = open(fn,'r')
    data = FH.read()
    FH.close()
    L = data.strip().split('\n\n')
    return L

# parse data into names and sites
def preprocess(data):
    L = data.strip().split('\n')
    names = list()
    sites = list()
    for e in L:
        n,s = e.split(' ',1)
        names.append(n)
        words = s.strip().split()
        if words[0][0] == '(':
            sites.append(words[1])
        else:
            sites.append(words[0])
    return names,sites

# codes for degenerate positions
# http://www.bioinformatics.org/sms/iupac.html 
def get_pattern(s):
    D = { 'A':'A','C':'C','G':'G','T':'T',
          'R':'[AG]','Y':'[CT]','N':'.',
          'S':'[GC]','W':'[AT]','K':'[GT]',
          'M':'[AC]','B':'[CGT]','D':'[AGT]',
          'H':'[ACT]','V':'[ACG]',
          '^':''}
    rL = [D[c] for c in s]
    return ''.join(rL)

# dictionary of pre-compiled regexps
def make_dict(data):
    names,sites = preprocess(data)
    D = dict()
    for n,s in zip(names,sites):
        p = get_pattern(s)
        p = re.compile(p)
        i = s.find('^')
        s = s.replace('^','')
        rD = { 'name':n,'site':s,
               'pattern':p,'i':i }
        D[n] = rD
    return D
    
def search(dna,D,minlength=4,ignore_ambig=True):
    N = max([len(n) for n in D.keys()])
    M = max([len(D[n]['site']) for n in D])
    rL = list()
    for n in D:
        rD = D[n]
        n,p,s = rD['name'], rD['pattern'],rD['site']
        if len(s) < minlength:  
            continue
        if ignore_ambig and 'N' in s:
            continue
        m = p.search(dna)
        if m:
            i = m.start()
            j = i + len(rD['site'])
            e = [n.ljust(N),s.ljust(M)]
            e += [i,dna[i:j]]
            rL.append(e)
        rL.sort()
    return rL

def show(result):
    def f(s):  return s[2]   # index of match
    result = sorted(result, key=f)
    for line in result:
        # left i as int, so convert to str
        i = line[2]
        line[2] = str(i).rjust(4)
        print '  '.join(line)
            
if __name__ == '__main__':
    dna = ''.join([c for c in sample_dna if c in 'ACGT'])
    data = load_data()
    L = data[3].strip()  # type II enzymes only
    D = make_dict(L)
    
    result = search(dna,D,minlength=6)
    show(result)