#! /usr/bin/env python # from __future__ import division import os, sys, string, cp import SeqIO, Seq def main(): """ %(filename)s Removes columns from a sequence alignment in which a specified percent of sequences contain gap characters. Useful for preparing alignments for phylogenetic analysis. {{out name of outfile. Prints to stdout by default.}} {{h Prints documentation.}} {{v verbosity 1}} {{c Maximum percent of sequences permitted to have a gap character.\ for example cutoff=5 results in the removal of columns containing >5%% gaps 100}} {{version print version info and exit}} %(version)s """ debug = 0 docstringdict = {'filename':os.path.split(sys.argv[0])[-1], 'version':'$Id: gapstrip.py,v 1.1 2004/10/10 02:53:10 nghoffma Exp $'} optlist, formattedDocString, formattedSummary = cp.optStringParser(main.__doc__ % docstringdict, optWidth=15, lineWidth=60, valOffset = 4, putVals=1) dict = cp.commandparser(options=optlist, usage='Options:\n' + formattedSummary, debug = debug, exitWithUsage=1) if dict.status('version'): sys.exit( 'Version: %(version)s\n' % docstringdict ) if dict.status('h'): print formattedDocString sys.exit( 0 ) v = dict.value('v') cutoff = dict.value('c') #read from a list of fasta files fasta_list = dict.value('infile_list', 'in','file') fasta = SeqIO.readFastaList(fasta_list, output='list') #write to stdout by default writeToFile = 0 if dict.status('out'): outfile = dict.value('out','out','file') else: outfile = sys.stdout v = 0 #tabulate alignment tab = SeqIO.tabulate(fasta) #determine which positions to keep total = len(fasta) keepList = [] for i in range(len(tab)): if 100 * tab[i].get('-',0)/total <= cutoff: keepList.append(i) ## compile this list for faster slicing of sequences sliceList = compile(keepList) ## output the sequences if v> 0: print 'Writing %s ...' % outfile.name for seq in fasta: seqString = seq[:] newString = '' #use the compiled sliceList to iterate through sequences with fewer loops for i in sliceList: if len( i ) == 1: newString += seqString[i[0]] elif len( i ) == 2: newString += seqString[i[0]:i[1]] seq.setSeq(newString) outfile.write(SeqIO.seqToFasta(seq)) if writeToFile: outfile.close() def compile( keepList, v=0 ): """take a list of integers that are indices into a list or string and denote elemenst to be extracted and transform it into a list containing one- and two-tuples to achieve extraction with fewer steps using slicing (with two-tuples) and direct indices (with one-tuples)""" sliceList = [] templist = [] while len( keepList ) > 0: if len( templist ) == 0: templist.append( keepList.pop(0) ) #make a list of consecutive numbers elif templist[-1] + 1 == keepList[0]: templist.append( keepList.pop(0) ) #if not consecutive, purge templist and make slice/single index else: if len( templist ) == 1: sliceList.append( (templist.pop(0),) ) elif len( templist ) > 1: sliceList.append( (templist[0], templist[-1]+1) ) templist=[] if len( templist ) == 1: sliceList.append( (templist[-1],)) elif len( templist ) > 1: sliceList.append( ( templist[0], templist[-1]+1 ) ) return sliceList if __name__ == '__main__': main()