#! /usr/bin/env python #from __future__ import division # / operator performs float rather than int division import os, sys, string, cp, copy import SeqIO import pre as re findNodes = re.compile(r"@.*?@", re.I) find_innermost_nodes = re.compile(r"""\({1}?[-.,:\w@]+?\)[0-9]*""", re.I) taxaReo = re.compile(r"[(,]{0,1}(?P[-:.\w@]+)[,)]",re.I) nameTableReo = re.compile(r"translate.*?;", re.I | re.S) def main(): """ %(filename)s Modifies Newick format trees from PAUP (NEXUS format) and phylip phylogenetics programs. Can be used to change taxon names, node labels, and branch lengths. The default input format is from PAUP 4.0b10. {{out name of outfile. Prints to stdout by default.}} {{intree input tree}} {{fmt tree format nex nexus format none file contains nothing but the parenthesized expression for the tree: "( ... );"}} {{brlen add branch labels to intree from this tree in nexus format}} {{nodelab add node labels to intree. -nodelab with no argument \ assigns (fairly arbitrary) integer values to internal nodes. -nodelab=treefile \ adds node labels (eg bootstrap values) specified in a Newick \ format tree contained in treefile. Labels are transferred only \ if a node has an identical set of terminal taxa in both intree and the \ tree specified by -nodelab.}} {{rmbrlen remove branch lengths from intree}} {{rmnode remove node labels from intree}} {{namerpl file directing replacement of names in tree. Format is \ [old name] [new name] \ Sequence names not found in the tree are ignored.}} {{h Prints documentation.}} {{v verbosity 1}} {{version print version info and exit}} %(version)s """ debug = 0 docstringdict = {'filename':os.path.split(sys.argv[0])[-1], 'version':'$Id: treeparser.py,v 1.1 2004/10/10 02:53:10 nghoffma Exp $'} optlist, formattedDocString, formattedSummary = cp.optStringParser(main.__doc__ % docstringdict, optWidth=15, lineWidth=60, valOffset = 4, putVals=1) dict = cp.commandparser(options=optlist, usage='Options:\n' + formattedSummary, debug = debug, exitWithUsage=1) if dict.status('version'): sys.exit( 'Version: %(version)s\n' % docstringdict ) if dict.status('h'): print formattedDocString sys.exit( 0 ) v = dict.value('v') ### determine selected options remove_branchlengths = dict.status('rmbrlen') remove_nodelabels = dict.status('rmnode') add_nodelabels = dict.status('nodelab') and dict.value('nodelab') == '' transfer_node_labels = dict.status('nodelab') and dict.value('nodelab') != '' if transfer_node_labels: nodeLabTreeStrRaw = dict.value('nodelab','in','file').read() treeFormat = dict.value('fmt') intreeStringRaw = dict.value('intree', 'in','file').read() nameReplace = 0 if dict.status('namerpl'): nameReplace = 1 rplFile = dict.value('namerpl','in','file') rplDict = {} nameIndex = 1 for line in rplFile.readlines(): if line.strip() == '' or line.strip()[0] == '#': continue splitline = line.lower().split() if len(splitline) > 1: rplDict[splitline[0]] = splitline[1] else: rplDict[splitline[0]] = `nameIndex` nameIndex += 1 if treeFormat == 'nex': intreeString = readNexusFile(intreeStringRaw)['treestring'] elif treeFormat == 'none': intreeString = intreeStringRaw.replace('\n','') intreeDict = treeparse(intreeString, v=v) if add_nodelabels: intreeDict = addNodeCounts(intreeDict) elif transfer_node_labels: if treeFormat == 'nex': nodeLabTreeStr = readNexusFile(nodeLabTreeStrRaw)['treestring'] elif treeFormat == 'none': nodeLabTreeStr = nodeLabTreeStrRaw.replace('\n','') nodeLabTreeDict = treeparse(nodeLabTreeStr, v=v) findChildren(intreeDict, v=0) findChildren(nodeLabTreeDict, v=0) transferNodeLabels(intreeDict, nodeLabTreeDict, v=v) if remove_branchlengths: intreeDict = removeBranchLenghts(intreeDict) if remove_nodelabels: intreeDict = removeNodeLabels(intreeDict) newTree = reconstructTree(intreeDict) #write to stdout by default writeToFile = 0 if dict.status('out'): outfile = dict.value('out','out','file') writeToFile = 1 else: outfile = sys.stdout v = 0 # replace names specified in -namerlp if this option is active if nameReplace and treeFormat == 'nex': translationTableList = nameTableReo.findall(intreeStringRaw) if len(translationTableList) == 0: if v > 0: sys.stderr.write('Treefile did not contain a translation table - names will not be replaced') else: translationTableStr = translationTableList[0] newTranslationTableStr = translationTableStr[:].lower() for key in rplDict.keys(): # find exact matches for name thisNameReo = re.compile(r'\s' + key + '[,\n]', re.MULTILINE) found = thisNameReo.findall(newTranslationTableStr) if found == []: continue if v > 2: print key, rplDict[key], found, found[0].replace(key, rplDict[key]) newTranslationTableStr = thisNameReo.sub(found[0].replace(key, rplDict[key]),newTranslationTableStr) intreeStringRaw = intreeStringRaw.replace(translationTableStr, newTranslationTableStr) elif nameReplace and treeFormat == 'none': for key in rplDict.keys(): newTree = newTree.replace(key, rplDict[key]) if v>1: sys.stderr.write( '********* original tree ****************\n' + intreeString + '\n') sys.stderr.write( '********* reconstructed tree ***********\n' + newTree + '\n') sys.stderr.write( '****************************************' + '\n') #write a file with original nexus file with reformatted # tree replacing the old tree (if fmt==nex), or just the treestring # if fmt==none if treeFormat == 'nex': outfile.write(intreeStringRaw.replace(intreeString,newTree)) elif treeFormat == 'none': outfile.write(newTree) if writeToFile: sys.stderr.write( 'Writing %s ...\n' % outfile.name ) outfile.close() def treeparse(treestring, v=1): """Parse a Newick format tree, break it down into nodes, and return a dictionary.""" treestring = treestring.replace('\n','') nodeIndex = 1 masterNodeDict = {} while 1: innodes = find_innermost_nodes.findall(treestring) if innodes == []: break for node in innodes: thisDict = nodeToDict(node,v=v) nodeName = '@%04i@' % nodeIndex treestring = treestring.replace(node,nodeName) masterNodeDict[nodeName] = thisDict # error check: can node be parsed then reconstructed to generate an identical string? recon = reconstructNode(thisDict) if node != recon: str = '%s\nError - original and reconstructed nodes differ\noriginal: %s\nrecon: %s \n%s' sys.exit(str % ("*"*30,node,recon,"*"*30) ) ####################### nodeIndex += 1 # if v>1: # print "*"*30 + ' round %i ' % round + "*"*30 # print treestring # print "*"*30 return masterNodeDict def nodeToDict(node, v=1): """Parses a single node and returns a dict Typical node structure: (3:0.008906,4:0.001323)100""" try: bscore = node.split(')')[1] except KeyError: bscore = '' taxonList = taxaReo.findall(node) taxa = [] children = [] for taxon in taxonList: spl = taxon.split(':') if len(spl) == 1: taxa.append((spl[0],'')) else: taxa.append((spl[0],':' + spl[1])) # add to list of children children.append(spl[0]) dictOut = {} dictOut['nodelab'] = bscore dictOut['taxa'] = taxa dictOut['children'] = children return dictOut def reconstructNode(nodeDict): """ Reconstructs a node from a nodeDict having typical structure (3:0.008906,4:0.001323)100 {'taxa': [('3', ':0.008906'), ('4', ':0.001323')], 'nodelab': '100'} """ taxList = [] for tup in nodeDict['taxa']: taxList.append('%s%s' % tup) return '(%s)%s' % (string.join(taxList,','), nodeDict['nodelab']) def findChildren(treeDict, v=1): """Modifies the named treeDict so that that 'children' field reflects the actual list of names and not the node tags. Also changes all children lists to tuples""" nodeList = treeDict.keys() nodeList.sort() for node in nodeList: children = treeDict[node]['children'] childString = string.join(children,':') if v>2: print 'node: %s **********' % node print children print childString tagList = findNodes.findall(childString) if tagList != []: for tag in tagList: childString = childString.replace(tag, string.join(treeDict[tag]['children'],':')) if v>2: print childString resolvedChildList = childString.split(':') resolvedChildList.sort() treeDict[node]['children'] = tuple(resolvedChildList) if v>2: print treeDict[node]['children'] else: treeDict[node]['children'] = tuple(treeDict[node]['children']) def reconstructTree(treeDict): """Reconstructs a dictionary of nodeDict dictionaries (ie, output from treeparse()) into a tree. """ lastNode = max(treeDict.keys()) # print 'lastNode', lastNode #reconstruct nodeDict corresponding to last element newTreeStr = reconstructNode(treeDict[lastNode]) while 1: nodeList = findNodes.findall(newTreeStr) if nodeList == []: break #all nodes have been replaced #replace reconstructed nodes for node tags for node in nodeList: newTreeStr = newTreeStr.replace(node, reconstructNode(treeDict[node])) return newTreeStr + ';' def removeBranchLenghts(origDict): """Remove branch lengths from a treedict. Returns a NEW treeDict - does not modify the original """ treeDict = copy.deepcopy(origDict) for key in treeDict: thisNodeDictTaxonList = treeDict[key]['taxa'] newNodeDictTaxonList = [] for name, brlen in thisNodeDictTaxonList: newNodeDictTaxonList.append((name,'')) treeDict[key]['taxa'] = newNodeDictTaxonList return treeDict def removeNodeLabels(origDict): """Remove node labels (eg, bootstrap values) from a treedict. Returns a NEW treeDict - does not modify the original """ treeDict = copy.deepcopy(origDict) for node in treeDict: treeDict[node]['bscore'] = '' return treeDict def addNodeCounts(origDict): """Add labels at the nodes corresponding to the numbers of the node keys in origDict. Returns a NEW treeDict - does not modify the original """ treeDict = copy.deepcopy(origDict) for node in treeDict: treeDict[node]['bscore'] = `int(node.replace('@',''))` return treeDict nexusTreeRe = re.compile(r"tree\s*(?P[\w]+)\s*="+ r"\s*(?P[\[\&\w\]]+)?\s*" + r"(?P\(.+?;)", re.I) def readNexusFile(instring): """Reads one tree from a nexus format file. A dictionary with the keys treename, tag, and treestring.""" reobj = nexusTreeRe.search(instring) if reobj == None: sys.exit("No trees were found in this string:\n%s" % instring) treedict = reobj.groupdict() return treedict def transferNodeLabels(toTreeDict, fromTreeDict, v=0): """Transfer node labels at all nodes having all terminal taxa (children) in common""" # make a dictionary keyed by lists of children in toTreeDict toTreeChildDict = {} for node in toTreeDict.keys(): toTreeChildDict[toTreeDict[node]['children']] = node if v>2: sys.stderr.write('********** transferNodeLabels() *************\n') # loop through nodes of fromTreeDict and check each childlist against toTreeChildDict for node in fromTreeDict.keys(): nodeTagInToTree = toTreeChildDict.get(fromTreeDict[node]['children'],None) if nodeTagInToTree: toTreeDict[nodeTagInToTree]['nodelab'] = fromTreeDict[node]['nodelab'] if v>2: str = 'ftn: %s ttn: %s ftc: %s ttc: %s label: %s\n' % (node, nodeTagInToTree, fromTreeDict[node]['children'], toTreeDict[nodeTagInToTree]['children'], fromTreeDict[node]['nodelab']) sys.stderr.write(str) if v>2: sys.stderr.write('********** transferNodeLabels() end *************\n') if __name__ == '__main__': main()