#! /usr/bin/env python import sys, os, string, SeqIO, cp def main(): """ %(filename)s Create a new fasta file containing the subset of sequences defined in list. Neither file need be in any particular order. Sequences in list but not in fasta are ignored. {{l Sequences to include in the new file. Names must match \ those in fasta (not case-sensitive).}} {{inv Invert - exclude (rather than include) specified sequences.}} {{out Name of outfile; writes to stdout if omitted}} {{strip Remove gap characters from sequences.}} {{sort Sort the sequences in the output file alphabetically.}} {{h Print documentation}} {{d Turn debugging on}} {{version print version info and exit}} %(version)s""" debug = 0 docstringdict = {'filename':os.path.split(sys.argv[0])[-1], 'version':'$Id: extract.py,v 1.1 2004/10/10 02:53:10 nghoffma Exp $'} optlist, formattedDocString, formattedSummary = cp.optStringParser(main.__doc__ % docstringdict, optWidth=15, lineWidth=60, valOffset = 4, putVals=1) od = cp.commandparser(options=optlist, usage='Options:\n' + formattedSummary, debug = debug, exitWithUsage=1) if od.status('version'): sys.exit( 'Version: %(version)s' % docstringdict ) if od.status('h'): print formattedDocString sys.exit(0) debug = od.status('d') fasta_list = od.value('infile_list', 'in','file') seqlistfile = od.value('l', 'in', ret='file') stripgaps = od.status('strip') sort = od.status('sort') writefile = od.status('out') outname = od.value('out') invert = od.status('inv') #read seqlistfile seqlist=seqlistfile.read().lower().split() # seqlist=[] # while 1: # line = seqlistfile.readline() # if line == '': break # if line.strip() == '': continue # seqlist.append( line.split()[0].strip().lower() ) # #print seqlist[-1] #sort seqlist if sort: seqlist.sort() if debug: print seqlist #def readFasta( filename='', type='nucl', gapstrip=0, v=debugLevel, output='dict' ) #read sequences from a list of fasta files seqdict = SeqIO.readFastaList(fasta_list, output='dict') if writefile: outfile = open(outname, 'w') print "Writing", outname else: outfile = sys.stdout if invert: newlist = [] for key in seqdict.keys(): if key not in seqlist: newlist.append(key) seqlist = newlist seqlist.sort() #def seqToFasta( seq, linelength = 60, degap = 0, acc=0, hea=0, begEnd=None, triplets=0, v=debugLevel, replace = {} ) for name in seqlist: try: seq = seqdict[name] except KeyError: if writefile: print name, "not found in", fasta_list continue outfile.write( SeqIO.seqToFasta( seq, linelength = 60, hea=1, degap=stripgaps) ) outfile.close() if __name__ == '__main__': main()