#! /usr/bin/env python # from __future__ import division # / operator performs float rather than int division import os, sys, string, cp import SeqIO, Seq def assemble_by_name(fastaDict1,fastaDict2): """Concatenate all pairs of sequences with the same name; return a list of seqs""" listOut = [] for key in fastaDict1.keys(): if fastaDict2.has_key(key): listOut.append( Seq.Seq(key, fastaDict1[key].getSeq() + fastaDict2[key].getSeq()) ) return listOut #def processNamelist(infile): # """Return a list of tuples from an infile. Ignores blank # lines or those beginning with #""" # # namelist = [] # for line in infile.lines(): # if line.strip() == '' or line.strip()[0] == '#': continue # namelist.append(tuple(line.split())) # # return namelist # # #def assemble_by_list(fastaDict, instructionList): # """instructionList is a list of tuples containing names; # values after a colon are used to rename""" # # listOut = [] # for tup in instructionList: # strList = [] # for name in tup: # strList.append(fastaDict.get(name,'')) # listOut.append def main(): """ %(filename)s Concatenates sequences contained in fasta files according to a list file. Sequences may be provided in one or more fasta files. {{lis list file containing sequence names to be concatenated. \ Names are not case sensitive. Lines beginning with a pound symbol are ignored.\ A string appearing after a colon is the name of the concatenated sequence.\ In the absence of a specified name, the new name is a concatenation of the \ names of the sequences.}} For example, s1 s2 : a s3 s4 ... concatenates s1 and s2 and names the result a, s3 and s4 --> s3s4, etc {{byname concatenates sequences in two fasta files by name in the \ order supplied on the command line.}} {{degap remove gap characters}} {{out name of outfile. Prints to stdout by default.}} {{h Prints documentation.}} {{v verbosity 1}} {{version print version info and exit}} %(version)s """ debug = 0 docstringdict = {'filename':os.path.split(sys.argv[0])[-1], 'version':'$Id: assemble.py,v 1.1 2004/10/10 02:53:09 nghoffma Exp $'} optlist, formattedDocString, formattedSummary = cp.optStringParser(main.__doc__ % docstringdict, optWidth=15, lineWidth=60, valOffset = 4, putVals=1) dict = cp.commandparser(options=optlist, usage='Options:\n' + formattedSummary, debug = debug, exitWithUsage=1) if dict.status('version'): sys.exit( 'Version: %(version)s\n' % docstringdict ) if dict.status('h'): print formattedDocString sys.exit( 0 ) v = dict.value('v') #write to stdout by default writeToFile = 0 if dict.status('out'): outfile = dict.value('out','out','file') writeToFile = 1 else: outfile = sys.stdout v = 0 if v>0: print "Writing %s" % outfile.name #read from a list of fasta files fasta_list = dict.value('infile_list', 'in','file') if dict.status('byname'): try: file1, file2 = fasta_list except ValueError: sys.exit('Error: the "-byname" option requires exactly two input files.') fasta1, fasta2 = SeqIO.readFasta(file1, output='dict'), SeqIO.readFasta(file2, output='dict') assembledFastaList = assemble_by_name(fasta1,fasta2) for seq in assembledFastaList: outfile.write(SeqIO.seqToFasta(seq)) sys.exit() fasta = SeqIO.readFastaList(fasta_list, output='dict') # read in list file listFile = dict.value('lis', 'in','file') for line in listFile.readlines(): if line.strip() == '' or line.strip()[0] == '#': continue colonsplit = line.split(':') if len(colonsplit) > 1: name = colonsplit[-1].strip().lower() seqlist = colonsplit[0].split() else: seqlist = line.split() name = string.join(seqlist,'').lower() seqstr = '' if v > 1: sys.stderr.write('assembling %s newname = %s\n' % (string.join(seqlist,', '), name) ) for seqname in seqlist: try: seqstr += fasta[seqname.lower()].getSeq() except KeyError: sys.stderr.write('The sequence %s was not found\n' % seqname) #__init__( self, name, seq, type='', acc='',hea='' ) newSeq = Seq.Seq(name, seqstr, hea='Assembled from ' + string.join(seqlist)) outfile.write(SeqIO.seqToFasta(newSeq, hea=1)) if writeToFile: outfile.close() if __name__ == '__main__': main()