Python for Bioinformatics: Python version of simple Markov chain

Saturday, August 9, 2008

Python version of simple Markov chain

Here is Python code to generate a simple Markov chain. The first part is what we were given for the Deonier example. The second part is mainly a utility function to simulate what R's "sample" does, choose an item randomly from a sequence, based on a frequency distribution. The third part constructs the model and tests it.

import random,sys
nt = 'ACGT'
dinuc = [n1 + n2 for n1 in nt for n2 in nt]
R = random.Random(1531)

# what do we need for a Markov process?
# example:  dinucleotide frequencies

# start with a list of dinucleotide frequencies
# nt order is ACGT, row 1..4 are A..T as 1st nt
# cols 1..4 are A..T as 2nd nt

def initDiNucleotideDict():
    L = [   0.146, 0.052, 0.058, 0.089,
            0.063, 0.029, 0.010, 0.056,
            0.050, 0.030, 0.028, 0.051,
            0.087, 0.047, 0.063, 0.140  ]
    # organize L into a dictionary of dinucleotides
    return dict(zip(dinuc,L))
    
#-------------------------------------------------
    
# compute individual nt probabilities pN

def initMonoNucleotideDict(nnD):
    D = dict(zip(list(nt), [0]*len(nt)))
    for nn in dinuc:
        n = nn[0]
        D[n] += nnD[nn]
    return D
    
#-------------------------------------------------
    
# P(nn) is the observed dinucleotide freq in nnD
# P(nn) = P(n1) * P for transition from n1 to n2
# i.e. P(n1 => n2) = P(n1+n2) / P(n1)

# since we will choose given n1
# we need a dict of dicts

def initTransitionDict(nD,nnD):
    bigD = dict()
    for n1 in nt:
        D = dict()
        for n2 in nt:
            D[n2] = nnD[n1+n2] / nD[n1]
        bigD[n1] = D
    return bigD

#-------------------------------------------------

# utility function:  randomly choose one item
# from a list of tuples of (item,freq)
# not necessarily summing to 1
# allow a dict argument

def randomChoice(L):
    if type(L) == type({}):
        L = L.items()[:]
    floor = 0
    ceiling = sum([t[1] for t in L])
    f = R.random()
    while f > ceiling:  f = R.random()
    for t in L:
        floor += t[1]
        if f < floor:   return t[0]

def testRandomChoice(L):
    R = range(int(1E5))
    L2 = [randomChoice(L) for i in R]
    for n in nt:   
        print n, L2.count(n)
    
def doInits():
    nnD = initDiNucleotideDict()
    nD = initMonoNucleotideDict(nnD)
    tD = initTransitionDict(nD,nnD)
    return nD,nnD,tD

def testAll():
    nD,nnD,tD = doInits()
    for n in nt:  
        print n,'%3.3f' % nD[n]
    print
    testRandomChoice(nD.items())
    print
    for nn in dinuc: 
        print nn,'%3.4f' % nnD[nn]
    print
    for n1 in nt:
        for n2 in nt:
            print n1+n2,'%3.4f' % tD[n1][n2]
    
# testAll();  sys.exit()

#-------------------------------------------------

def markov(nD,tD,SZ):
    L = [randomChoice(nD.items())]
    for i in range(SZ-1):
        D = tD[L[-1]]
        L.append(randomChoice(D.items()))
    return ''.join(L)
        
def testmarkov(seq,nnD):
    D = dict(zip(dinuc, [0]*len(dinuc)))
    SZ = len(seq)
    for i in range(1,SZ):
        D[seq[i-1:i+1]] += 1
    for k in nnD.keys():
        print k.ljust(4), '%3.4f' % nnD[k], '\t', 
        print '%3.4f' % (D[k] * 1.0 / SZ)


nD,nnD,tD = doInits()
L = markov(nD,tD,int(1E5))
print L[:30] + '\n'
testmarkov(L,nnD)

'''
prints:

TAGTTTGTTTACAATGTCATTACAGATGTT

AA   0.1460  0.1458
AC   0.0520  0.0512
GT   0.0510  0.0518
AG   0.0580  0.0576
CC   0.0290  0.0290
CA   0.0630  0.0624
CG   0.0100  0.0095
TT   0.1400  0.1444
GG   0.0280  0.0276
GC   0.0300  0.0303
AT   0.0890  0.0888
GA   0.0500  0.0484
TG   0.0630  0.0635
TA   0.0870  0.0867
TC   0.0470  0.0467
CT   0.0560  0.0563
'''