Python for Bioinformatics: Motif Discovery 3

I realize that reading other people's code is [fill in the blank with your favorite expletive] impossible, and mind-numbing as well. It's part of the reason I find it difficult to enjoy books like Jones & Pevzner and Kinser's book, Python for Bioinformatics.

But we need to have some code to examine if we want to talk about algorithms. So, if you would like to follow the discussion that's coming later, please paste the code below into a textfile on your desktop (my copy is named gibbs.py but you can do whatever you want). You will also need the code from here, saved as motif.py, and the code from here, saved as scoring.py. You can run it in the usual way by doing "python gibbs.py."

Here is the code, and output for the particular settings. Discussion coming soon.

# Lawrence et al. Gibbs Sampler algorithm
# version 0.1
# the alignment is stored as a list of 
# the motif's start point in each sequence

import random,sys,time
from motif import getTestSet
from scoring import hammingScore
#----------------------------------------------
def showStartSeqs(sL):
    for i,m,seq in sL:
        print str(i).rjust(5),m,'  ',
        print seq[:50],
        if len(seq) > 50:  print '...'
        else:  print
#----------------------------------------------
# aL holds positions of the motifs in seqs in L
# assign elements in aL randomly to start
def init_aL(D):
    N = D['N']
    SZ = D['SZ']
    M = D['M']
    aL = list()
    for i in range(N):
        aL.append(random.choice(range(SZ-M)))
    return aL
#----------------------------------------------
# utility method
def showMotifAlignment(L,D,aL):
    M = D['M']
    retL = list()
    for i in range(len(aL)):
        j = aL[i]
        retL.append(L[i][j:j+M])
    return '\n'.join(retL)
#----------------------------------------------
# given a list of sequences,
# alignment start points
# index of one sequence to leave out
# and motif length

def getMotifSeqs(L,M,aL,currSeq):
    retL = list()
    for i,seq in enumerate(L):
        if i == currSeq:  continue
        j = aL[i]
        retL.append(seq[j:j+M])
    return retL
#----------------------------------------------
# the sampler:  
# consider one seq at a time
# slide to find the best score v. current motif
# repeat R times
def slideSeqs(L,aL,D,v=False):
    N = D['N']
    M = D['M']
    SZ = D['SZ']
    R = D['R']
    currSeq = 0
    for j in range(R):
        if v:  print aL
        sL = getMotifSeqs(L,M,aL,currSeq)
        
        # for currSeq find best match to alignment
        maxI = 0
        maxS = 0
        for i in range(SZ-M):
            seq = L[currSeq][i:i+M]
            score = hammingScore(sL + [seq])
            if score > maxS:
                maxS = score
                maxI = i
        aL[currSeq] = maxI

        if v:  print maxS 
        #for e in sL:  print e
        if v:  print 'currSeq', currSeq, 
        if v:  print 'best', maxI
        if v:  print 'aL:', aL
        #print L[currSeq][maxI:maxI+M]
        
        currSeq = random.choice(range(N))
        if v:  print '-' * 30
    return currSeq,aL,maxS
#----------------------------------------------
def main(D):
    print 'numSeq ='.rjust(15),
    print str(D['N']).rjust(4)
    print 'motif length ='.rjust(15),
    print str(D['M']).rjust(4)
    print 'seq length ='.rjust(15),
    print str(D['SZ']).rjust(4)
    print 'slide cycles ='.rjust(15),
    print str(D['R']).rjust(4)
    print '# runs ='.rjust(15),
    print str(D['T']).rjust(4)
    
    # returns i,m,seq
    # N = D['N']
    startL = getTestSet(D)
    # save the actual sequences separately in L               
    L = [t[2] for t in startL]

    # r for results, L for list
    rL = list()
    # run sliding test from independent starts
    
    print '\nruns:'
    for i in range(T):
        # display something to show progress
        if not i:  print '.',
        if i:
            if not (i+1)%10:  
                print str(i+1).rjust(3)        
            else:  print '.',
        aL = init_aL(D)
        # do the work here
        rL.append(slideSeqs(L,aL,D))
    print '\n\n'

    # extract all slider end scores to a new list
    sL = [t[2] for t in rL]
    # scan for final result with best score
    maxI = -1
    maxS = 0
    for i,score in enumerate(sL):
        if i and not i % 8:  print
        print score,
        if score > maxS:
            maxS = score
            maxI = i 
                
    currSeq,aL,score = rL[maxI]
    print '\nbest score =',
    print score
    n = sL.count(score)
    print 'recovered',n,
    if n > 1:  print 'times\n\n'
    else:  print 'time\n\n'
    
    print 'motif found:'
    print aL
    print showMotifAlignment(L,D,aL)

    # display original motif
    print '\n\noriginal:'
    print hammingScore([t[1] for t in startL])
    showStartSeqs(startL)
    print

if __name__ == "__main__":
    random.seed(157)
    start = time.time()    
    N = 10     # number of sequences
    M = 10     # motif length
    SZ = 50    # sequence length
    R = 25    # cycles of basic sliding
    T = 100     # number of independent starts
    D = {'N':N,'M':M,'SZ':SZ,'R':R,'T':T }
    main(D)
    t = time.time() - start
    print 'elapsed time =', round(t,2), 'sec'

Output:

       numSeq =   10
 motif length =   10
   seq length =   50
 slide cycles =   25
       # runs =  100

runs:
. . . . . . . . .  10
. . . . . . . . .  20
. . . . . . . . .  30
. . . . . . . . .  40
. . . . . . . . .  50
. . . . . . . . .  60
. . . . . . . . .  70
. . . . . . . . .  80
. . . . . . . . .  90
. . . . . . . . . 100



235 227 212 257 254 229 212 220
206 226 228 231 191 208 212 236
198 201 242 211 202 210 171 216
218 209 237 206 201 180 209 206
212 256 239 257 207 207 242 258
202 252 210 192 203 249 199 197
192 211 238 218 221 195 259 222
225 265 250 222 214 214 218 191
215 208 215 249 200 227 211 239
233 207 204 218 196 215 218 196
228 205 221 221 213 207 201 208
239 232 252 245 206 252 214 213
192 213 204 210 
best score = 265
recovered 1 time


motif found:
[35, 17, 24, 32, 18, 5, 33, 24, 18, 36]
GCACATAGCA
GAATTTGAAA
GAAAACACCT
GAACATAAGA
GAAAATAAAA
GATCATAATA
GGACATGCCA
GAAAATTCCA
GAAGTTAACT
GAATATAAGA


original:
252
   35 GCACATAGCA    TCGAAGACAAACTGCCGGCCTGGGTCCCCGAACGAGCACATAGCAATCAT
   17 GAATTTGAAA    TGCGACTGGGTGTTTCCGAATTTGAAATTCTCCACGCTATCGACTCTCAT
   19 GTGCCGAAAA    AATTGGCTGAGTGAATCGAGTGCCGAAAACACCTAGAATTATAGGGTACA
   32 GAACATAAGA    GATGGGCTCAACGTGCCGGGGATGATGGCACGGAACATAAGAGAGCGGAG
   18 GAAAATAAAA    ACGCAACGTACAATCACTGAAAATAAAACAATCCCGCTAGACAACTTCCG
    5 GATCATAATA    TGTATGATCATAATAAACGTACTCGCTCAACTAACGGTTACTGCTTAAAG
   33 GGACATGCCA    TAGTAAGCATAATTGTTACCTGGTGAAATCTGCGGACATGCCAGCGCTGA
   39 GACCTTAAAC    AGCAGGACACTCAGGCGAATATTCGAAAATTCCAAACCGGACCTTAAACT
   18 GAAGTTAACT    ATTTGATGTACTGGTGTAGAAGTTAACTGAAAGACGGTATAGAATGTATC
   36 GAATATAAGA    CTGACCCCATTTGTACTGCCCCCAAGGTACCAGTAGGAATATAAGAGTCC

elapsed time = 17.05 sec

Tuesday, June 16, 2009

Motif Discovery 3