# Lawrence et al. Gibbs Sampler algorithm # version 0.1 # the alignment is stored as a list of # the motif's start point in each sequence
import random,sys,time from motif import getTestSet from scoring import hammingScore #---------------------------------------------- def showStartSeqs(sL): for i,m,seq in sL: print str(i).rjust(5),m,' ', print seq[:50], if len(seq) > 50: print '...' else: print #---------------------------------------------- # aL holds positions of the motifs in seqs in L # assign elements in aL randomly to start def init_aL(D): N = D['N'] SZ = D['SZ'] M = D['M'] aL = list() for i in range(N): aL.append(random.choice(range(SZ-M))) return aL #---------------------------------------------- # utility method def showMotifAlignment(L,D,aL): M = D['M'] retL = list() for i in range(len(aL)): j = aL[i] retL.append(L[i][j:j+M]) return '\n'.join(retL) #---------------------------------------------- # given a list of sequences, # alignment start points # index of one sequence to leave out # and motif length
def getMotifSeqs(L,M,aL,currSeq): retL = list() for i,seq in enumerate(L): if i == currSeq: continue j = aL[i] retL.append(seq[j:j+M]) return retL #---------------------------------------------- # the sampler: # consider one seq at a time # slide to find the best score v. current motif # repeat R times def slideSeqs(L,aL,D,v=False): N = D['N'] M = D['M'] SZ = D['SZ'] R = D['R'] currSeq = 0 for j in range(R): if v: print aL sL = getMotifSeqs(L,M,aL,currSeq) # for currSeq find best match to alignment maxI = 0 maxS = 0 for i in range(SZ-M): seq = L[currSeq][i:i+M] score = hammingScore(sL + [seq]) if score > maxS: maxS = score maxI = i aL[currSeq] = maxI
if v: print maxS #for e in sL: print e if v: print 'currSeq', currSeq, if v: print 'best', maxI if v: print 'aL:', aL #print L[currSeq][maxI:maxI+M] currSeq = random.choice(range(N)) if v: print '-' * 30 return currSeq,aL,maxS #---------------------------------------------- def main(D): print 'numSeq ='.rjust(15), print str(D['N']).rjust(4) print 'motif length ='.rjust(15), print str(D['M']).rjust(4) print 'seq length ='.rjust(15), print str(D['SZ']).rjust(4) print 'slide cycles ='.rjust(15), print str(D['R']).rjust(4) print '# runs ='.rjust(15), print str(D['T']).rjust(4) # returns i,m,seq # N = D['N'] startL = getTestSet(D) # save the actual sequences separately in L L = [t[2] for t in startL]
# r for results, L for list rL = list() # run sliding test from independent starts print '\nruns:' for i in range(T): # display something to show progress if not i: print '.', if i: if not (i+1)%10: print str(i+1).rjust(3) else: print '.', aL = init_aL(D) # do the work here rL.append(slideSeqs(L,aL,D)) print '\n\n'
# extract all slider end scores to a new list sL = [t[2] for t in rL] # scan for final result with best score maxI = -1 maxS = 0 for i,score in enumerate(sL): if i and not i % 8: print print score, if score > maxS: maxS = score maxI = i currSeq,aL,score = rL[maxI] print '\nbest score =', print score n = sL.count(score) print 'recovered',n, if n > 1: print 'times\n\n' else: print 'time\n\n' print 'motif found:' print aL print showMotifAlignment(L,D,aL)
# display original motif print '\n\noriginal:' print hammingScore([t[1] for t in startL]) showStartSeqs(startL) print
if __name__ == "__main__": random.seed(157) start = time.time() N = 10 # number of sequences M = 10 # motif length SZ = 50 # sequence length R = 25 # cycles of basic sliding T = 100 # number of independent starts D = {'N':N,'M':M,'SZ':SZ,'R':R,'T':T } main(D) t = time.time() - start print 'elapsed time =', round(t,2), 'sec'
|