#! /usr/bin/env python import os, sys, string, cp import SeqIO, Seq def main(): """ %(filename)s Exclude a specified set of columns from a sequence alignment. {{r range: a string in the format a-b,c-d,... Coordinates are inclusive. A \ single position can be specified as p1-p1. For example, positions 1 to 10,\ 20 to 30 and position 40 is specified as 1-10,20-30,40-40 If the final \ value is greater than the length of the alignment, the alignment is \ returned to the end. 1-10000}} {{degap Strips gap characters from the sequences AFTER sequences have been trimmed}} {{out name of outfile}} {{h Prints documentation.}} {{v verbosity 1}} {{version print version info and exit}} %(version)s """ debug = 0 docstringdict = {'filename':os.path.split(sys.argv[0])[-1], 'version':'$Id: trim.py,v 1.1 2004/10/10 02:53:10 nghoffma Exp $'} optlist, formattedDocString, formattedSummary = cp.optStringParser(main.__doc__ % docstringdict, optWidth=15, lineWidth=60, valOffset = 4, putVals=1) dict = cp.commandparser(options=optlist, usage='Options:\n' + formattedSummary, debug = debug, exitWithUsage=1) if dict.status('version'): sys.exit( 'Version: %(version)s\n' % docstringdict ) if dict.status('h'): print formattedDocString sys.exit( 0 ) v = dict.value('v') degap = dict.status('degap') #read from a list of fasta files fasta_list = dict.value('infile_list', 'in','file') fasta = SeqIO.readFastaList(fasta_list, output='list') #write to stdout by default writeToFile = 0 if dict.status('out'): outfile = dict.value('out','out','file') writeToFile = 1 else: outfile = sys.stdout v = 0 rangeString = dict.value('r') for seq in fasta: seq.setSeq( getSeqRange(seq, rangeString) ) outfile.write(SeqIO.seqToFasta( seq, degap=degap )) def getSeqRange(seq, rangeString): """rangeString in format 'a-b,c-d,...' returns a string""" length = len(seq) coordList = rangeString.split(',') newSeqStr = '' #object of same type with length 0 for coords in coordList: c1,c2 = coords.split('-') c1, c2 = eval(c1)-1, eval(c2) #convert to 0-index if c1 < 0: c1 = 0 if c2 > length: c2 = length newSeqStr = newSeqStr + seq[c1:c2] return newSeqStr if __name__ == '__main__': main()