#! /usr/bin/env python import os, sys, string, cp, re, SeqIO def main(): """ %(filename)s Classifies sequences in an alignment according to one or two "rules" specifying amino acid composition at defined positions. Rules are written as nested, parenthesized pairs of expressions; an expression consists of a position followed by a character (e.g., 3R). Build the rules by joining pairs of expressions with parentheses. Input file should contain aligned sequences in fasta format; fasta format sequences can also be supplied via stdin, eg 'cat file.fasta | classify.py ...') If only one rule is given, sequences satisfying rule 1 are scored as false for rule 2. If two rules are given, sequences are scored for both rules independently; in this case a sequence may satisfy neither, one, or both rules. Use extract.py to recover lists of sequences from input aligments. For example: classify.py protease.fasta -r1="(63L)" -pick=a | extract.py protease.fasta -l=- -out=pro_63L.fasta 1A 2B 3C 4D 5E 6F 7G (1A 2B) (3C 4D) (5E 6F) 7G ((1A 2B) (3C 4D)) ((5E 6F) 7G) (((1A 2B) (3C 4D)) ((5E 6F) 7G)) Use '&' and '|' (logical AND, OR) to join pairs. (((1A & 2B) | (3C & 4D)) | ((5E | 6F) & 7G)) Negate expressions with a leading ^ (((1A & ^2B) | (3C & 4D)) | ((5E | ^6F) & 7G)) Examples (enclose all rules on the command line with single or double quotes): 1) classify.py protease.fasta -out=pro_63p.classified -r1="(63P)" 2) ... -r1="(63P)" -r2="(63L)" 3) ... -r1="(63P | 63S)" -r2="(63L | 63S)" In this case, both expressions may be true. 4) ... -r1="(((63L | 63P) | (93R & ^97K)) & 72L)" 5) ... -r1="((63L | 63P) | ((93R & ^97K) & 72L))" Note that rules 4 and 5 above give different results. {{r1 Rule 1}} {{r2 Rule 2 (optional)}} {{n Output format for name. s print the name of the sequence i print the index of the sequence 0 print nothing}} {{r Output format of result. num if r2 is not supplied, prints a 1 or 0 after the name according to the truth of r1; if r2 is supplied, prints a 1 or 0 for the result of each of the rules. ab prints 'A' in the first column after the name if r1 is true and 'B' in the next column if 1) r1 is false and no r2 is supplied, or 2) r2 is true. 0 print nothing}} {{pick Selects which lines (each line corresponding to a sequence) to print. ab Print all lines regardless of result. a Print line if rule 1 is satisfied. b Print line if rule 2 is satisfied.}} {{neg Character used to negate an expression. ^ ! Requires protection with a backslash when used on the command line.}} {{out Name of output file. Prints to stdout if unspecified.}} {{h Prints documentation.}} {{v Verbosity; use for debugging. 1}} {{version print version info and exit}} %(version)s """ debug = 0 docstringdict = {'filename':os.path.split(sys.argv[0])[-1], 'version':'$Id: classify.py,v 1.1 2004/10/10 02:53:09 nghoffma Exp $'} optlist, formattedDocString, formattedSummary = cp.optStringParser(main.__doc__ % docstringdict, optWidth=15, lineWidth=60, valOffset = 4, putVals=1) dict = cp.commandparser(options=optlist, usage='Options:\n' + formattedSummary, debug = debug, exitWithUsage=1) if dict.status('version'): sys.exit( 'Version: %(version)s\n' % docstringdict ) if dict.status('h'): print formattedDocString sys.exit( 0 ) v = dict.value('v') le1 = dict.value('r1') le2 = dict.value('r2') nameFormat = dict.value('n') resultFormat = dict.value('r') toFile = dict.status('out') if toFile: outfilename = dict.value('out') outfile = open(outfilename,'w') else: v = 0 printVal = dict.value('pick', restrict=1) printa, printb = 0,0 if printVal in ['ab','a']: printa = 1 if printVal in ['ab','b']: printb = 1 neg = dict.value('neg') ############## make classifier exit = 0 try: tester = Classifier(le1, le2, neg, v) except BadRuleFormatError, msg: exit = 1 except NoRuleSuppliedError, msg: exit = 1 if v > 0: print 'Rule 1: %s' % le1 print 'Rule 2: %s' % le2 if exit: print msg if toFile: outfile.close() sys.exit() #read from a list of fasta files fasta_list = dict.value('infile_list', 'in','file') fasta = SeqIO.readFastaList(fasta_list, output='list') # key is (result1, result2) letterDict = {(1,1):['A','B'], (1,0):['A',' '], (0,1):[' ','B'], (0,0):[' ',' ']} index = 1 truefor1 = 0 truefor2 = 0 for seq in fasta: result = tester.classify(seq.getSeq()) truefor1 = truefor1 + result[0] truefor2 = truefor2 + result[1] # decide which lines to print printLine = (printa and printb) or (printa and result[0]) or (printb and result[1]) output = [] if nameFormat == 's': output.append(seq.getName()) elif nameFormat == 'i': output.append(`index`) if resultFormat == 'num': if tester.ruleCount() == 1: output.append(`result[0]`) else: output.append(`result[0]`) output.append(`result[1]`) elif resultFormat == 'ab': output = output + letterDict[result] strout = string.join(output,'\t') + '\n' if printLine: if toFile: outfile.write(strout) else: sys.stdout.write( strout ) index = index + 1 if toFile: outfile.close() if v > 0: print "Number of sequences satisfying rule 1:", truefor1 print "Number of sequences satisfying rule 2:", truefor2 class BadRuleFormatError(Exception): pass class NoRuleSuppliedError(Exception): pass class Classifier: def __init__(self, le1, le2='', neg='^', v=1): """le1, le2 are strings containing a parenthesized logical expression""" if le1 == '': raise NoRuleSuppliedError, "Error: at least one rule must be supplied." self.__negChar = neg self.__v = v ws = re.compile(r"\s") # parse the logical expression self.__le1 = parseLogicalExpression(le1, self.__negChar).split() # make sure the expression was constructed properly recon1 = reconstructExpression(self.__le1) if ws.sub('',le1) != ws.sub('',recon1): raise BadRuleFormatError, 'Error: this expression was read as "%s"\nbut reconstructed as "%s"\nCheck placement of parentheses.' % (le1, recon1) if le2 != '': self.__le2 = parseLogicalExpression(le2, self.__negChar ).split() recon2 = reconstructExpression(self.__le2) if ws.sub('',le2) != ws.sub('',reconstructExpression(self.__le2)): raise BadRuleFormatError, 'Error: this expression was read as "%s"\nbut reconstructed as "%s"\nCheck placement of parentheses.' % (le2, recon2) else: self.__le2 = '' if v > 2: print '*'*30,'Classifier.__init__' print 'self.__le1', self.__le1 print 'self.__le2', self.__le2 print 'self.__negChar', self.__negChar print 'self.__v', self.__v print '*'*30 def classify(self, strIn, v=1): trueForLe1 = computeTruth(strIn, self.__le1, self.__negChar, v = self.__v) if self.__le2 == '': trueForLe2 = not trueForLe1 else: trueForLe2 = computeTruth(strIn, self.__le2, self.__negChar, v = self.__v) if v > 2 or self.__v > 2: print strIn print 'le1: "%s" is %s' % (self.__le1, trueForLe1) print 'le2: "%s" is %s' % (self.__le2, trueForLe2) return (trueForLe1,trueForLe2) def ruleCount(self): """Returns number of le's passed to init (ie 1 or 2) """ if self.__le2 == '': return 1 else: return 2 def classifySeqList(self, seqList, key='name'): """Returns a dictionary from a list of sequences. Test results are keyed by sequence name (key='name') or index in the list (key='index')""" dictOut = {} for i in range(len(seqList)): if key == 'name': dictOut[seqList[i].getName()] = self.classify(seqList[i].getSeq()) elif key == 'index': dictOut[i] = self.classify(seqList[i].getSeq()) return dictOut def computeTruth(seqString, le, neg='^', v=0): """Determine if seqString (a string) fulfills requirements of le (list)""" # "!;1;A 2;B | 3;C 4;D & & !;5;E 6;F & 7;G 8;H & | & " truthHolder = [] for val in le: # evaluate previous two expressions if val in ['&','|']: truth1 = truthHolder.pop() truth2 = truthHolder.pop() if val == '&': newTruth = truth1 and truth2 elif val == '|': newTruth = truth1 or truth2 truthHolder.append(newTruth) else: # determine truth of statement statement = val.split(';') aa = statement.pop() pos = int(statement.pop()) - 1 try: thisTruth = aa.upper() == seqString[pos].upper() except IndexError: print pos + 1, 'You cannot access that far into the sequence' # raise an exception here if statement == [neg]: truthHolder.append(not thisTruth) else: truthHolder.append(thisTruth) if v > 2: print 'val:', val print truthHolder return truthHolder.pop() def parseLogicalExpression(input, neg='^'): isDigit = re.compile(r"\d") isLetterOrGap = re.compile(r"[-.~a-z]", re.I) stack = list(input) newString = '' counter = 0 previous = ';' for current in input: if current == '&': stack[counter] = current counter = counter + 1 newString = newString + ' ' elif current == '|': stack[counter] = current counter = counter + 1 newString = newString + ' ' elif current == neg: newString = newString + current + ';' elif current in [' ','(']: continue elif current != ')': # if last was # and this one is a character # insert a semicolon if isDigit.findall(previous) and isLetterOrGap.findall(current): newString = newString + ';' newString = newString + current else: newString = newString + ' ' if counter != 0: counter = counter - 1 newString = newString + stack[counter] + ' ' previous = current return newString def reconstructExpression( tokenHolder ): """Reconstruct parenthesized-expression from string array; this can then be compared to original string--if it matches the expression was entered by user correctly.""" # special case for only one if len(tokenHolder) == 1: return '(%s)' % tokenHolder[0].replace(';','') # go through input and reassemble expression # note: java stack.push() method is same as appending an item to end of a list componentHolder = [] for tok in tokenHolder: if tok in ['|','&']: # never happens until at least two token are added to componentHolder atomicExpr1 = componentHolder.pop() atomicExpr2 = componentHolder.pop() returnString = '(%s %s %s)' % (atomicExpr2, tok, atomicExpr1) componentHolder.append(returnString) else: componentHolder.append(tok) return componentHolder.pop().replace(';','') if __name__ == "__main__": main()