import random, os, sys random.seed(137) from cogent import LoadSeqs, DNA from cogent.db.ncbi import EFetch from cogent.app.muscle import align_unaligned_seqs from cogent.app.fasttree import build_tree_from_alignment
def fetch_ncbi_data(ofile,s): # get the seqs from Genbank input = [e.split() for e in s.strip().split('\n')] id_list = [t[0] for t in input] names = [t[1] for t in input] ef = EFetch(id=','.join(id_list), rettype='fasta') data = ef.read().strip() # title lines are too long, replace by genus_species rL = list() for i,e in enumerate(data.split('\n\n')): old_title, seq = e.strip().split('\n',1) new_title = '>' + names[i] seq = seq[:500] rL.append('\n'.join([new_title,seq])) FH = open(ofile,'w') FH.write('\n\n'.join(rL)) FH.close()
def mutagenize(seq, percent=5): L = list(seq) D = { 'A':'CGT', 'C':'AGT', 'G':'ACT', 'T':'ACG' } N = int(percent / 100.0 * len(seq)) X = len(seq) for i in range(N): j = random.choice(range(X)) nt = L[j] if not nt in 'ACGT': continue L[j] = random.choice(D[nt]) return ''.join(L)
def distribute_seqs(ifile,ofile,mut_freq=10): # set up our two samples FH = open(ifile,'r') data = FH.read().strip().split('\n\n') FH.close() A_distr = {0:1,1:3,2:5,3:1} B_distr = {0:0,1:3,2:1,3:5} A = list() B = list() x = y = 0 for i,e in enumerate(data): title,seq = e.split('\n',1) seq = ''.join(seq.split()) for j in range(A_distr[i]): x += 1 copy = mutagenize(seq[:],mut_freq) new_seq = DNA.makeSequence(copy,'A' + str(x)) A.append(new_seq) for j in range(B_distr[i]): y += 1 copy = mutagenize(seq[:],mut_freq) new_seq = DNA.makeSequence(copy,'B' + str(y)) B.append(new_seq) FH = open(ofile,'w') L = [seq.toFasta() for seq in A + B] FH.write('\n\n'.join(L)) FH.close()
def align_seqs(ifile,ofile): seqs = LoadSeqs(ifile, moltype=DNA, aligned=False) aln = align_unaligned_seqs(seqs, DNA) aln.writeToFile(ofile) return aln
def get_tree(ifile): aln = LoadSeqs(ifile, moltype=DNA, aligned=True) tr = build_tree_from_alignment(aln,moltype=DNA) return tr
#=============================================== s = ''' AY005045.1 Streptococcus_mitis_bv2 AB302401.1 Pseudomonas_cinnamophila L14639.1 Capnocytophaga_gingivalis AF411020.1 Achromobacter_xylosoxidans_AU1011 '''
fn1 = 'rRNA_gb.fasta' fn2 = 'samples.fasta' fn3 = 'samples.aln.fasta' fn4 = 'samples.tree' if not os.path.exists(fn1): fetch_ncbi_data(fn1,s) if not os.path.exists(fn2): distribute_seqs(fn1,fn2) if not os.path.exists(fn3): aln = align_seqs(fn2,fn3) tr = get_tree(fn3) print tr.asciiArt()
tree_str = tr.getNewick(with_distances=True) FH = open(fn4,'w') FH.write(tree_str + '\n') FH.close() |