#! /usr/bin/env python import os, sys, string, cp, re import SeqIO, Seq def main(): """ %(filename)s Aligns nucleotide sequences based on an aligned peptide sequence. Neither alignment need be sorted or in any particular order. {{nuc unaligned nucleotide sequence in fasta format}} {{pep aligned peptide sequence in fasta format}} {{out name of outfile. Writes to stdout if unspecified.}} {{v verbosity 1}} {{h Prints documentation.}} {{version print version info and exit}} %(version)s """ debug = 0 docstringdict = {'filename':os.path.split(sys.argv[0])[-1], 'version':'$Id: codonalign.py,v 1.1 2004/10/10 02:53:09 nghoffma Exp $'} optlist, formattedDocString, formattedSummary = cp.optStringParser(main.__doc__ % docstringdict, optWidth=15, lineWidth=60, valOffset = 4, putVals=1) dict = cp.commandparser(options=optlist, usage='Options:\n' + formattedSummary, debug = debug, exitWithUsage=1) if dict.status('version'): sys.exit( 'Version: %(version)s\n' % docstringdict ) if dict.status('h'): print formattedDocString sys.exit( 0 ) v = dict.value('v') #write to stdout by default writeToFile = 0 if dict.status('out'): outfile = dict.value('out','out','file') else: outfile = sys.stdout v = 0 #read fasta files nucFile = dict.value('nuc', 'in','file', ask=1) nucFasta = SeqIO.readFasta(nucFile, degap=1, output='dict') pepFile = dict.value('pep', 'in','file',ask=1) pepFasta = SeqIO.readFasta(pepFile, output='dict') # length of longest peptide? maxPepLen = 0 for name in pepFasta.keys(): maxPepLen = max([maxPepLen, len(pepFasta[name])]) pepNames = pepFasta.keys() nucNames = nucFasta.keys() allNames = [] for name in pepNames + nucNames: if name not in allNames: allNames.append(name) allNames.sort() if v>0: print 'Writing %s ...' % outfile.name for n in allNames: try: dnaSeq = nucFasta[n] except KeyError: if v>0: print '# %s not found in %s' % (n, nucFile.name) continue try: pepSeq = pepFasta[n] except KeyError: if v>0: print '# %s not found in %s' % (n, pepFile.name) continue alignedNuc = codonalign(dnaSeq, pepSeq, maxPepLen, v = 0) if v > 1: print "# Original peptide sequence:" print SeqIO.seqToFasta(pepSeq) print "# Translation of aligned nucleotide sequence:" print SeqIO.seqToFasta(Seq.translate(alignedNuc)) outfile.write(SeqIO.seqToFasta(alignedNuc)) if writeToFile: outfile.close() codexp = re.compile('[a-z]{1,3}', re.I) def codonalign(dnaSeq, pepSeq, pepLen, v = 0): """Align an ungaped nucleotide sequence based on an aligned peptide sequence; Returns a new sequence object with gaps corresponding to peptide sequence.""" #degap, remove ws, and codonize the dna string dnaString = dnaSeq[:] dnaString = SeqIO.removeWhitespace(dnaString) codons = codexp.findall(dnaString) alignedCodons = [] codonIndex = 0 # all aligned nucleotide sequences will be the same length for i in range(pepLen): try: aa = pepSeq[i] except IndexError: aa = '-' if aa not in ['.','-','~'] and codonIndex < len(codons): #pad end of nucleotide sequence if it's shorter than the pep cod = codons[codonIndex] if len(cod) < 3: cod += '-'*(3 - len(cod)) #all codons must be 3 chars alignedCodons.append(cod) codonIndex += 1 else: alignedCodons.append('---') newSeqStr = string.join(alignedCodons, '') dnaSeq.setSeq(newSeqStr) return dnaSeq if __name__ == '__main__': main()