#! /usr/bin/env python import sys, os, string, SeqIO, cp, Seq ### updated to use Seq, SeqIO 011212 NH def main(): """ %(filename)s Translates a fasta format file containing nucleotide sequences. Can filter translated sequences according to length, presence of unwanted characters (like X,*). {{degap Gap characters (-.~) are removed as file is read.}} {{filter looks for a filterfile (-filter=filterfile) with instructions \ for rejection of sequences. Filterfile is of format: \ [character to find][max number of occurrences to allow]}} Example: if the filterfile looks like: * 0 X 2 Sequences with > 0 *'s or > 2 X's will be rejected. {{maxlen Reject translated sequences less than this length. Will apply \ to the length of the degapped sequence if -degap \ is selected. 100000}} {{minlen Set minimum length of translated sequence. 0}} {{f frame; specify reading frame 1, 2, or 3 1}} {{out Name of outfile.}} {{amb Ambiguity handling ambi codons containing IUPAC ambiguity characters will be translated if a unique amino acid is encoded. unambi causes any codon containing an ambiguity to be translated as an 'X'.}} {{h Print documentation.}} {{v verbosity; 0 for no output. 1}} {{version print version info and exit}} %(version)s""" debug = 0 docstringdict = {'filename':os.path.split(sys.argv[0])[-1], 'version':'$Id: translate.py,v 1.1 2004/10/10 02:53:10 nghoffma Exp $'} optlist, formattedDocString, formattedSummary = cp.optStringParser(main.__doc__ % docstringdict, optWidth=15, lineWidth=60, valOffset = 4, putVals=1) p = cp.commandparser(options=optlist, usage='Options:\n' + formattedSummary, debug = debug, exitWithUsage=1) if p.status('version'): sys.exit( 'Version: %(version)s\n' % docstringdict ) if p.status('h'): print formattedDocString sys.exit( 0 ) if p.status('out'): writefile = 1 outfilename = p.value('out') else: writefile = 0 v = 0 v = p.value('v') degap = p.status('degap') filter = p.status('filter') amb = p.value('amb') check_length = p.status('maxlen') or p.status('minlen') if check_length: maxlen = p.value('maxlen') minlen = p.value('minlen') startAt = p.value('f') - 1 #read from a list of fasta files fasta_list = p.value('infile_list', 'in','file') fasta = SeqIO.readFastaList(fasta_list, degap=degap, output='list') if filter: filterfilename = p.value('filter', type='in', ask = 1) filterfile = open(filterfilename, 'r') filterinstructions = [] while 1: line = filterfile.readline() if line == '': break splitline = string.split(line) filterinstructions.append([ splitline[0], int(splitline[1]) ]) #translate sequences and determine if they should be discarded filterRejects = 0 lengthRejects = 0 goodSeqCount = 0 if v > 0 and (filter or check_length): print 'Filtering sequences.' if v > 1: print 'Rejects are listed below...' if writefile: outfile = open(outfilename, 'w') if v > 0: print "Writing %s ..." % outfilename else: outfile = sys.stdout for seq in fasta: #toPep(self, startPosition=1, table=tableDefault): pep = Seq.translate( seq, table=amb, start=startAt) keep = 1 #count (s, sub[, start[, end]]) #continue if conditions in filterfile are met if filter: for condition in filterinstructions: if string.count(pep.getSeq(), condition[0]) > condition[1]: if v>1: print SeqIO.seqToFasta( pep ), filterRejects += 1 keep = 0 if check_length: if len(pep) < minlen or len(pep) > maxlen: keep = 0 lengthRejects += 1 if v>1: print SeqIO.seqToFasta( pep ), if keep: outfile.write( SeqIO.seqToFasta( pep ) ) goodSeqCount += 1 if writefile: outfile.close() if v > 0: print 'Wrote %i sequences.' % goodSeqCount if filter: print 'Rejected %i out of %i sequences due to composition filter' % (filterRejects, len(fasta)) if check_length: print 'Rejected %i out of %i sequences due to length (max=%i, min=%i)' % (lengthRejects, len(fasta), maxlen, minlen) main()