Wednesday, August 12, 2009

Python simulation of Needleman-Wunsch 1

I want to show you my Python code for doing Needleman-Wunsch global alignments. I hasten to remind you that I don't really expect anyone to study this stuff, but I want to encourage you to try the problem yourself. It's fun, and will increase your understanding of the method. The purpose of posting the code is to give you a working example to fall back on if you encounter difficulty.

Let's do some (boring) utility functions first. I have the following code in a file called The setUp function makes a matrix of the type we used in the demo:

bases = 'ACGT'

def initScore(g=-6,e=-6):
class Score: pass
score = Score() = g
score.ext = e
return score

def seqType(s1,s2):
for c in s1+s2:
if not c in bases: return 'protein'
return 'DNA'

def newDict(score, path):
return {'score':score,

def setUp(s1,s2,sc):
R = len(s1) + 1 # items per row or numCols
C = len(s2) + 1 # items per col or numRows
# a list of D with keys = score,path
# just use a flat list, not array
L = [None]*R*C
L[0] = { 'score':0,'path':None }

L[1] = newDict(, 'L')
for c in range(2,R):
score = L[c-1]['score'] + sc.ext
L[c] = newDict(score, 'L')

L[R] = newDict(, 'U')
for r in range(2, C):
prev = (r-1)*R
next = r*R
score = L[prev]['score'] + sc.ext
L[next] = newDict(score, 'U')
#print len(L)
return L

And I have functions to load the sequences (filename can be passed in on the command line), and for pretty printing of alignments, saved in the following module as

# utility:  pretty print alignment
def pprint(L,s1,s2):
N = len(s1)+1 # row length
pad = 5
# print title row using s1
print ' '*pad*2,
for c in s1: print c.rjust(pad),
i = 0
L = L[:]
while L:
# print col 1 using chars from s2
if i == 0: print ' '.rjust(pad),
else: print s2[i-1].rjust(pad),
i += 1
sub = L[:N]
L = L[N:]
for D in sub:
if D:
score = str(D['score'])
path = D['path']
if path:
print (score+path).rjust(pad),
print score.rjust(pad),
else: print '.'.rjust(pad),
# for debugging
def report(r,c,i,up,left,diag):
print 'r', r, 'c' ,c,
print 'i',i
if up: print 'up', up
if left: print 'left', left
if diag: print 'diag', diag
print s1[c-2]
print s2[r-2]

def printAlignment(t):
L1,L2,L3 = t
SZ = 50
for i in range(0,len(L1),SZ):
print L1[i:i+SZ]
print L2[i:i+SZ]
print L3[i:i+SZ]

def load(fn):
FH = open(fn,'r')
data =
if not data.strip()[0] == '>':
return 'data is not FASTA'
s1,s2 = data.strip().split('\n\n')
except ValueError:
return 'need two sequences to compare'
s1 = s1.split('\n',1)[1]
s2 = s2.split('\n',1)[1]
s1 = s1.replace('\n','')
s2 = s2.replace('\n','')
return s1,s2

No comments: