#! /usr/bin/env python import SeqIO, string, cp, sys, os from Dictionaries import * # global variable to determine verbosity v = 0 def main(): """ %(filename)s Numerically encode sequences in a fasta file. {{out name of outfile}} {{enc Encoding type aa default neunet aa encoding none just print unmodified tab-delimited sequence characters nucl nucleotides without ambiguties anucl nucleotides with ambiguties cvg coverage: gaps get 0's oz consensus characters are 0, other are 1 \ (CANNOT be used with -degap option) desc numbers most common character 1 next \ most common 2, etc (CANNOT be used with -degap option)}} {{degap remove gap characters from sequences}} {{v verbosity 0}} {{h help}} {{delim delimiter character for output table tab}} {{debug provides some debugging options interleave}} {{version print version info and exit}} %(version)s """ global v debug = 0 docstringdict = {'filename':os.path.split(sys.argv[0])[-1], 'version':'$Id: encode.py,v 1.1 2004/10/10 02:53:10 nghoffma Exp $'} optlist, formattedDocString, formattedSummary = cp.optStringParser(main.__doc__ % docstringdict, optWidth=15, lineWidth=60, valOffset = 4, putVals=1) dict = cp.commandparser(options=optlist, usage='Options:\n' + formattedSummary, debug = debug, exitWithUsage=1) if dict.status('version'): sys.exit( 'Version: %(version)s\n' % docstringdict ) if dict.status('h'): print formattedDocString sys.exit( 0 ) v = dict.value('v') #asigning command line arguments infile = dict.value('infile') degap = dict.status('degap') encodingType = dict.value('enc',restrict = 1) #write to stdout by default writeToFile = 0 if dict.status('out'): outfile = dict.value('out','out','file') writeToFile = 1 else: outfile = sys.stdout v = 0 if dict.value('delim') == 'tab': delim = '\t' else: delim = dict.value('delim') interleave = '' if dict.value('debug') == 'interleave': interleave = 'interleave' if encodingType not in numDict.keys() and degap: sys.exit("""The -degap option is not compatible with an encoding type of '%s'""" % encodingType) unknownChar=-1 #load the fasta file into list of objects #readFasta( filename='', type='', gapstrip=0, v=debugLevel, output='list' ) seqList = SeqIO.readFasta( filename=infile, degap=degap, v = 0, output='list' ) encodedSeqs = [] # get a dictionary for encoding if encodingType == 'none': for seq in seqList: encodedSeqs.append( list(seq) ) elif encodingType in numDict.keys(): dict = numDict[encodingType] # make a list of lists representing the encoded sequences for seq in seqList: encodedSeqs.append( SeqIO.encode( seq, dict, unknownChar, v=v ) ) else: # get a list of dictionaries representing aa compos # def tabulate( seqList, v=debugLevel ) if v>0: print 'Tabulating aa composition at each position for all seqs' aaCompTable = SeqIO.tabulate( seqList, v=v ) for seq in seqList: if v>1:print '.', if encodingType == 'oz': encodedSeqs.append( ozEncode( seq, aaCompTable, 0, 1 ) ) elif encodingType == 'desc': encodedSeqs.append( descEncode( seq, aaCompTable, startat=0 ) ) if v>1:print ' ' # write the output file # writeTable( NestedList, filename='writeTableOut.txt', presicion=2, digits=10, delim ='\t', v=debugLevel): if v>0:print 'Writing', outfile.name SeqIO.writeTable( encodedSeqs, outfile, presicion=0, digits=0, delim =delim, pad=1, padchar=unknownChar ,v=v) if writeToFile: outfile.close() def ozEncode( seq, table, cons=0, noncons=1 ): """ Encodes consensus and noncons chars as specified seq sequence object table list of dictionaries from SeqIO.tabulate returns a list of ints""" encodedSeq = [] for i in range(len(seq)): #def consensus( dict, countGaps=0, plu=2, v=debugLevel ) consChar = SeqIO.consensus( table[i], countGaps=1, plu=2, v=v ) if seq[i].upper() == consChar.upper(): encodedSeq.append( cons ) else: encodedSeq.append( noncons ) return encodedSeq def makeDescDict( dictIn, start ): """dictIn is dictionary defining aa composition at a single position as defined by SeqIO.tabulate Encodes most abundant char as startat, next most abundant as startat + 1 , etc ties are broken by assigning char lower in alphabet the higher value returns a new dict""" itemlist = dictIn.items() #reverse the order revlist = [] for key, value in itemlist: revlist.append((value, key)) revlist.sort() revlist.reverse() dictout={} count = start if v > 3: print '-'*30 for value, key in revlist: dictout[key] = count if v > 3: print key, dictIn[key], count count = count + 1 if v > 3: print '-'*30 return dictout def testMakeDescDict(filename, position=0, start=0 ): seqList = SeqIO.readFasta(filename) tab = SeqIO.tabulate(seqList) print tab[position] print makeDescDict( tab[position], start ) def descEncode( seq, table, startat=0 ): """ seq sequence object table list of dictionaries from SeqIO.tabulate returns a list of ints""" encodedSeq = [] for i in range(len(seq)): thisDict = makeDescDict( table[i], startat ) encodedSeq.append( thisDict[seq[i]] ) return encodedSeq if __name__ == '__main__': main()