import re sample_dna = ''' ATGACCCTTTTAGCGCTCGGTATTAACCATAAAACGGCACCTGTATCGCT GCGAGAACGCGTAACGTTTTCGCCGGACACGCTTGATCAGGCGCTGGACA GCCTGCTTGCGCAGCCAATGGTGCAGGGCGGGGTCGTGCTGTCAACCTGT AACCGTACAGAGCTGTATCTGAGCGTGGAAGAGCAGGATAACCTGCAAGA'''
# load data from downloaded file # http://rebase.neb.com/rebase/link_proto def load_data(fn = 'link_proto.txt'): FH = open(fn,'r') data = FH.read() FH.close() L = data.strip().split('\n\n') return L
# parse data into names and sites def preprocess(data): L = data.strip().split('\n') names = list() sites = list() for e in L: n,s = e.split(' ',1) names.append(n) words = s.strip().split() if words[0][0] == '(': sites.append(words[1]) else: sites.append(words[0]) return names,sites
# codes for degenerate positions # http://www.bioinformatics.org/sms/iupac.html def get_pattern(s): D = { 'A':'A','C':'C','G':'G','T':'T', 'R':'[AG]','Y':'[CT]','N':'.', 'S':'[GC]','W':'[AT]','K':'[GT]', 'M':'[AC]','B':'[CGT]','D':'[AGT]', 'H':'[ACT]','V':'[ACG]', '^':''} rL = [D[c] for c in s] return ''.join(rL)
# dictionary of pre-compiled regexps def make_dict(data): names,sites = preprocess(data) D = dict() for n,s in zip(names,sites): p = get_pattern(s) p = re.compile(p) i = s.find('^') s = s.replace('^','') rD = { 'name':n,'site':s, 'pattern':p,'i':i } D[n] = rD return D def search(dna,D,minlength=4,ignore_ambig=True): N = max([len(n) for n in D.keys()]) M = max([len(D[n]['site']) for n in D]) rL = list() for n in D: rD = D[n] n,p,s = rD['name'], rD['pattern'],rD['site'] if len(s) < minlength: continue if ignore_ambig and 'N' in s: continue m = p.search(dna) if m: i = m.start() j = i + len(rD['site']) e = [n.ljust(N),s.ljust(M)] e += [i,dna[i:j]] rL.append(e) rL.sort() return rL
def show(result): def f(s): return s[2] # index of match result = sorted(result, key=f) for line in result: # left i as int, so convert to str i = line[2] line[2] = str(i).rjust(4) print ' '.join(line) if __name__ == '__main__': dna = ''.join([c for c in sample_dna if c in 'ACGT']) data = load_data() L = data[3].strip() # type II enzymes only D = make_dict(L) result = search(dna,D,minlength=6) show(result) |