## Saturday, August 9, 2008

### Python version of simple Markov chain

Here is Python code to generate a simple Markov chain. The first part is what we were given for the Deonier example. The second part is mainly a utility function to simulate what R's "sample" does, choose an item randomly from a sequence, based on a frequency distribution. The third part constructs the model and tests it.
 `import random,sysnt = 'ACGT'dinuc = [n1 + n2 for n1 in nt for n2 in nt]R = random.Random(1531)# what do we need for a Markov process?# example: dinucleotide frequencies# start with a list of dinucleotide frequencies# nt order is ACGT, row 1..4 are A..T as 1st nt# cols 1..4 are A..T as 2nd ntdef initDiNucleotideDict(): L = [ 0.146, 0.052, 0.058, 0.089, 0.063, 0.029, 0.010, 0.056, 0.050, 0.030, 0.028, 0.051, 0.087, 0.047, 0.063, 0.140 ] # organize L into a dictionary of dinucleotides return dict(zip(dinuc,L)) #------------------------------------------------- # compute individual nt probabilities pNdef initMonoNucleotideDict(nnD): D = dict(zip(list(nt), [0]*len(nt))) for nn in dinuc: n = nn[0] D[n] += nnD[nn] return D #------------------------------------------------- # P(nn) is the observed dinucleotide freq in nnD# P(nn) = P(n1) * P for transition from n1 to n2# i.e. P(n1 => n2) = P(n1+n2) / P(n1)# since we will choose given n1# we need a dict of dictsdef initTransitionDict(nD,nnD): bigD = dict() for n1 in nt: D = dict() for n2 in nt: D[n2] = nnD[n1+n2] / nD[n1] bigD[n1] = D return bigD#-------------------------------------------------# utility function: randomly choose one item# from a list of tuples of (item,freq)# not necessarily summing to 1# allow a dict argumentdef randomChoice(L): if type(L) == type({}): L = L.items()[:] floor = 0 ceiling = sum([t[1] for t in L]) f = R.random() while f > ceiling: f = R.random() for t in L: floor += t[1] if f < floor: return t[0]def testRandomChoice(L): R = range(int(1E5)) L2 = [randomChoice(L) for i in R] for n in nt: print n, L2.count(n) def doInits(): nnD = initDiNucleotideDict() nD = initMonoNucleotideDict(nnD) tD = initTransitionDict(nD,nnD) return nD,nnD,tDdef testAll(): nD,nnD,tD = doInits() for n in nt: print n,'%3.3f' % nD[n] print testRandomChoice(nD.items()) print for nn in dinuc: print nn,'%3.4f' % nnD[nn] print for n1 in nt: for n2 in nt: print n1+n2,'%3.4f' % tD[n1][n2] # testAll(); sys.exit()#-------------------------------------------------def markov(nD,tD,SZ): L = [randomChoice(nD.items())] for i in range(SZ-1): D = tD[L[-1]] L.append(randomChoice(D.items())) return ''.join(L) def testmarkov(seq,nnD): D = dict(zip(dinuc, [0]*len(dinuc))) SZ = len(seq) for i in range(1,SZ): D[seq[i-1:i+1]] += 1 for k in nnD.keys(): print k.ljust(4), '%3.4f' % nnD[k], '\t', print '%3.4f' % (D[k] * 1.0 / SZ)nD,nnD,tD = doInits()L = markov(nD,tD,int(1E5))print L[:30] + '\n'testmarkov(L,nnD)'''prints:TAGTTTGTTTACAATGTCATTACAGATGTTAA 0.1460 0.1458AC 0.0520 0.0512GT 0.0510 0.0518AG 0.0580 0.0576CC 0.0290 0.0290CA 0.0630 0.0624CG 0.0100 0.0095TT 0.1400 0.1444GG 0.0280 0.0276GC 0.0300 0.0303AT 0.0890 0.0888GA 0.0500 0.0484TG 0.0630 0.0635TA 0.0870 0.0867TC 0.0470 0.0467CT 0.0560 0.0563'''`