import random,sys nt = 'ACGT' dinuc = [n1 + n2 for n1 in nt for n2 in nt] R = random.Random(1531)
# what do we need for a Markov process? # example: dinucleotide frequencies
# start with a list of dinucleotide frequencies # nt order is ACGT, row 1..4 are A..T as 1st nt # cols 1..4 are A..T as 2nd nt
def initDiNucleotideDict(): L = [ 0.146, 0.052, 0.058, 0.089, 0.063, 0.029, 0.010, 0.056, 0.050, 0.030, 0.028, 0.051, 0.087, 0.047, 0.063, 0.140 ] # organize L into a dictionary of dinucleotides return dict(zip(dinuc,L)) # # compute individual nt probabilities pN
def initMonoNucleotideDict(nnD): D = dict(zip(list(nt), [0]*len(nt))) for nn in dinuc: n = nn[0] D[n] += nnD[nn] return D # # P(nn) is the observed dinucleotide freq in nnD # P(nn) = P(n1) * P for transition from n1 to n2 # i.e. P(n1 => n2) = P(n1+n2) / P(n1)
# since we will choose given n1 # we need a dict of dicts
def initTransitionDict(nD,nnD): bigD = dict() for n1 in nt: D = dict() for n2 in nt: D[n2] = nnD[n1+n2] / nD[n1] bigD[n1] = D return bigD
#
# utility function: randomly choose one item # from a list of tuples of (item,freq) # not necessarily summing to 1 # allow a dict argument
def randomChoice(L): if type(L) == type({}): L = L.items()[:] floor = 0 ceiling = sum([t[1] for t in L]) f = R.random() while f > ceiling: f = R.random() for t in L: floor += t[1] if f < floor: return t[0]
def testRandomChoice(L): R = range(int(1E5)) L2 = [randomChoice(L) for i in R] for n in nt: print n, L2.count(n) def doInits(): nnD = initDiNucleotideDict() nD = initMonoNucleotideDict(nnD) tD = initTransitionDict(nD,nnD) return nD,nnD,tD
def testAll(): nD,nnD,tD = doInits() for n in nt: print n,'%3.3f' % nD[n] print testRandomChoice(nD.items()) print for nn in dinuc: print nn,'%3.4f' % nnD[nn] print for n1 in nt: for n2 in nt: print n1+n2,'%3.4f' % tD[n1][n2] # testAll(); sys.exit()
#
def markov(nD,tD,SZ): L = [randomChoice(nD.items())] for i in range(SZ1): D = tD[L[1]] L.append(randomChoice(D.items())) return ''.join(L) def testmarkov(seq,nnD): D = dict(zip(dinuc, [0]*len(dinuc))) SZ = len(seq) for i in range(1,SZ): D[seq[i1:i+1]] += 1 for k in nnD.keys(): print k.ljust(4), '%3.4f' % nnD[k], '\t', print '%3.4f' % (D[k] * 1.0 / SZ)
nD,nnD,tD = doInits() L = markov(nD,tD,int(1E5)) print L[:30] + '\n' testmarkov(L,nnD)
''' prints:
TAGTTTGTTTACAATGTCATTACAGATGTT
AA 0.1460 0.1458 AC 0.0520 0.0512 GT 0.0510 0.0518 AG 0.0580 0.0576 CC 0.0290 0.0290 CA 0.0630 0.0624 CG 0.0100 0.0095 TT 0.1400 0.1444 GG 0.0280 0.0276 GC 0.0300 0.0303 AT 0.0890 0.0888 GA 0.0500 0.0484 TG 0.0630 0.0635 TA 0.0870 0.0867 TC 0.0470 0.0467 CT 0.0560 0.0563 '''
