#! /usr/bin/env python
"""Parse, relabel, and rewrite a reference database in EndNote export format.

Each record starts with a ``%0`` line and consists of ``%X value`` lines.
For every record the script builds a label of the form
``authoryear-firstpage`` (falling back to a title word when no page is
available), stores it under ``%F``, records the outcome as a note under
``%O``, and writes the record back out.

Runs on Python 2.6+ and Python 3.
"""

# from __future__ import division # / operator performs float rather than int division

import optparse
import os
import re
import string
import subprocess
import sys


def getAllRecords(instr):
    """Split raw export text into a list of record strings.

    The text is split on every occurrence of the literal ``%0``; each
    non-blank piece is stripped and gets the ``'%0 '`` prefix put back.
    Blank pieces are dropped, so the result never contains empty records.
    """
    # An earlier variant split on blank lines instead:
    # records = instr.split('\n\n\n')
    pieces = (piece.strip() for piece in instr.split('%0'))
    return ['%0 ' + piece for piece in pieces if piece]


# One "%X value" line: a %-tag, a single whitespace character, then the value.
# Only the two groups matter (the pattern is used with findall); the group
# names are descriptive.
recordRexp = re.compile(r'(?P<key>%\S+)\s(?P<val>.+)')


class EmptyRecordError(Exception):
    """Raised by parseRecord when the record string is blank."""
    pass


def parseRecord(recordString):
    """Parse a single record into ``(keys, dictOut)``.

    ``keys`` lists the tags in order of first appearance. ``dictOut`` maps
    each tag to a tuple of its values, one element per ``%X value`` line.
    A line without a tag is a continuation: it is appended (after
    ``' \\n'``) to the most recent value of the most recent tag.

    Raises EmptyRecordError if the record is blank. If a continuation line
    appears before any tagged line, diagnostics are written to stderr and
    the process exits with status 1.
    """
    lines = recordString.strip().split('\n')
    if lines == ['']:
        raise EmptyRecordError

    keys = []
    dictOut = {}
    last_key = ''
    for line in lines:
        found = recordRexp.findall(line)
        if found:
            key, val = found[0]
            if key not in dictOut:
                keys.append(key)
            dictOut[key] = dictOut.get(key, ()) + (val,)
            last_key = key
            continue

        # No key/value pair on this line: treat it as a continuation.
        try:
            values = dictOut[last_key]
        except KeyError:
            sys.stderr.write('Error in parseRecord\n')
            sys.stderr.write('%s\n' % recordString)
            sys.stderr.write('%s\n' % ('*' * 30))
            sys.stderr.write('%s\n' % (lines,))
            sys.stderr.write('%s %s\n' % (last_key, line))
            sys.exit(1)
        # Extend the latest value only, so earlier values of a repeated
        # tag (e.g. several %A authors) are preserved.
        extended = values[-1].strip() + ' \n' + line.strip()
        dictOut[last_key] = values[:-1] + (extended,)

    return keys, dictOut


authorRexp = re.compile(r"""[a-z' ]+""", re.I)
firstPageRexp = re.compile(r'\d+')
goodLabelRexp = re.compile(r'[a-z]+\d+-\w+', re.I)
nonAlphaRexp = re.compile(r'\W')


class BadLabelError(Exception):
    """Label-related error (not raised anywhere in this module)."""
    pass


def getTitleWord(instr, badwords='a an the in of hiv', minlen=4):
    """Return the first acceptable word of a title, lower-cased.

    The title is lower-cased and split on non-word characters. A word is
    acceptable when it is not one of the space-separated ``badwords`` and
    is at least ``minlen`` characters long.

    Raises IndexError if the title contains no acceptable word.
    """
    rejected = set(badwords.split())
    for word in nonAlphaRexp.split(instr.lower()):
        if word not in rejected and len(word) >= minlen:
            return word
    raise IndexError('no acceptable title word in %r' % (instr,))


class Record(object):
    """One bibliographic record: ordered tags plus a tag -> values dict."""

    # Number of malformed labels generated so far, shared by all records.
    _malformed_count = 0

    authKey = '%A'
    yearKey = '%D'
    pageKey = '%P'
    labKey = '%F'
    titleKey = '%T'
    volumeKey = '%V'
    numberKey = '%N'

    def malformed_count(self):
        """Increment the shared malformed-label counter and return it."""
        Record._malformed_count += 1
        return Record._malformed_count

    def __init__(self, recordString):
        """Parse ``recordString`` (a single record as a string)."""
        self.keys, self.dict = parseRecord(recordString)
        self.label_ok = 0
        self.note = ''

    def removeTrailingDotFromPage(self, pageKey='%P'):
        """Strip dots from the first page value and store it as the only one.

        Note that every ``.`` in the value is removed, not just a trailing
        one. Does nothing when the record has no ``pageKey`` entry.
        """
        try:
            pages = self.dict[pageKey][0]
        except (KeyError, IndexError):
            # nothing to be done
            return
        self.dict[pageKey] = (pages.replace('.', ''),)

    def fixLabel(self):
        """Build the label ``authoryear-firstpage`` and store it under %F.

        Missing parts become ``'???'`` (author) or ``'?'`` (year, page).
        When no page number is available, the first acceptable title word
        is used instead. A label that does not match ``goodLabelRexp``
        gets a ``-badN`` suffix with a running number. The previous label
        is kept in ``self.oldlabel``; the outcome is recorded in
        ``self.label_ok``, ``self.note`` and as a note on the record.
        """
        # get author
        try:
            firstauthor = self.dict[Record.authKey][0]
            a = authorRexp.findall(firstauthor)[0].lower()
            a = nonAlphaRexp.sub('', a)
        except (KeyError, IndexError):
            a = '???'

        # get year
        try:
            y = self.dict[Record.yearKey][0]
        except (KeyError, IndexError):
            y = '?'

        # get pages
        try:
            pages = self.dict[Record.pageKey][0]
            p = firstPageRexp.findall(pages)[0]
        except (KeyError, IndexError):
            p = '?'

        # if pages aren't available, try the first acceptable word of the title
        if p == '?':
            try:
                p = getTitleWord(self.dict[Record.titleKey][0])
            except (KeyError, IndexError):
                # no title, or no usable word in it
                p = '?'

        newLabel = '%s%s-%s' % (a, y, p)

        # check label format
        if goodLabelRexp.findall(newLabel):
            self.label_ok = 1
            note = 'label_ok'
        else:
            newLabel = '%s-bad%i' % (newLabel, self.malformed_count())
            note = 'label_not_ok'

        self.oldlabel = self.dict.get(Record.labKey, ('',))
        self.dict[Record.labKey] = (newLabel,)
        if Record.labKey not in self.keys:
            self.keys.append(Record.labKey)

        self.addNote(note)
        self.note = note

    def getWolfLabel(self):
        """Return year + volume + number + pages (first values) concatenated."""
        parts = [Record.yearKey, Record.volumeKey,
                 Record.numberKey, Record.pageKey]
        return ''.join(self.dict.get(key, ('',))[0] for key in parts)

    def toString(self):
        """Serialize the record as ``'%X value\\n'`` lines in key order."""
        return ''.join('%s %s\n' % (key, val)
                       for key in self.keys
                       for val in self.dict[key])

    def addNote(self, instr, noteKey='%O'):
        """Append ``instr`` (stripped) to the first note under ``noteKey``.

        Nothing is changed if an existing note already contains the text.
        A new ``noteKey`` is also registered in ``self.keys`` so that
        ``toString`` emits it.
        """
        note = instr.strip()
        existing = self.dict.get(noteKey, ())
        if existing:
            # don't add note again
            if any(note in value for value in existing):
                return
            updated = (existing[0] + ' ' + note,) + tuple(existing[1:])
        else:
            updated = (note,)
        self.dict[noteKey] = updated
        if noteKey not in self.keys:
            self.keys.append(noteKey)


class Option(optparse.Option):
    """optparse option that may be declared mandatory via ``required=1``.

    Must be used with the OptionParser defined below, which provides the
    ``option_seen`` bookkeeping that ``process`` writes to.
    """

    ATTRS = optparse.Option.ATTRS + ['required']

    def _check_required(self):
        if self.required and not self.takes_value():
            raise optparse.OptionError(
                "required flag set for option that doesn't take a value",
                self)

    # Make sure _check_required() is called from the constructor!
    CHECK_METHODS = optparse.Option.CHECK_METHODS + [_check_required]

    def process(self, opt, value, values, parser):
        """Process the option normally, then mark it as seen on the parser."""
        optparse.Option.process(self, opt, value, values, parser)
        parser.option_seen[self] = 1


class OptionParser(optparse.OptionParser):
    """Parser that reports an error when a required Option is missing."""

    def _init_parsing_state(self):
        optparse.OptionParser._init_parsing_state(self)
        self.option_seen = {}

    def check_values(self, values, args):
        """Fail via ``self.error`` for each required option not supplied."""
        for option in self.option_list:
            if (isinstance(option, Option)
                    and option.required
                    and option not in self.option_seen):
                self.error("%s not supplied" % option)
        return (values, args)


def _setMacFileType(filename):
    """Best effort: mark ``filename`` as a TEXT/ttxt file using SetFile."""
    command = ['/Developer/Tools/SetFile', '-t', 'TEXT', '-c', 'ttxt',
               filename]
    try:
        # Argument list, no shell: the filename is never shell-interpreted.
        proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        proc.communicate()
    except OSError:
        # SetFile is not installed; the output file is fine without it.
        pass


def main():
    """Command-line entry point: read -f, relabel every record, write -o.

    Output goes to stdout when no output file is given; verbose progress
    lines are then suppressed so they do not mix with the records.
    """
    usage = """usage: %prog [options]
Parse, modify, and write a reference database in endnote export format.
Example: %prog -f infile.txt -o outfile.txt"""
    version = '$Id: endnlib.py,v 1.1 2004/10/10 02:53:10 nghoffma Exp $'

    # -h,--help and --version options are built-in
    option_list = [
        Option('-f', '--infile', dest='infile',
               help='input file', required=1),
        Option('-o', '--outfile', dest='outfile',
               help='name of file for output; prints to stdout by default',
               default=None),
        Option('-v', '--verbose', action='store_true', dest='verbose',
               help='be verbose'),
    ]
    parser = OptionParser(usage=usage, version=version,
                          option_list=option_list)
    (options, args) = parser.parse_args()

    v = options.verbose
    infilename = options.infile
    outfilename = options.outfile  # if None, use stdout

    with open(infilename) as infile:
        rawstr = infile.read()

    # convert any input format to unix
    anyendingRexp = re.compile(r'\r\n|\r|\n')
    rawstr = anyendingRexp.sub('\n', rawstr)

    # write to stdout by default
    writeToFile = bool(outfilename)
    if writeToFile:
        outfile = open(outfilename, 'w')
    else:
        outfile = sys.stdout
        v = False

    try:
        counter = 1
        for r in getAllRecords(rawstr):
            if r.strip() == '':
                continue
            rObj = Record(r)
            rObj.fixLabel()
            rObj.removeTrailingDotFromPage()
            if v:
                print('%5i |%25s| |%25s| %s' % (
                    counter,
                    rObj.dict[Record.labKey][0],
                    rObj.dict.get(Record.authKey, ('',))[0],
                    rObj.note))

            # hack to get conversion list for wolfgang's labels
            # outStr = '%s\t%s\n' % (rObj.getWolfLabel(), rObj.dict[Record.labKey][0])
            # outfile.write(outStr)

            # The stream is in text mode, which translates '\n' itself;
            # writing os.linesep here would double the '\r' on Windows.
            outfile.write(rObj.toString() + '\n')
            counter += 1
    finally:
        if writeToFile:
            outfile.close()

    if writeToFile and sys.platform == 'darwin':
        _setMacFileType(outfilename)


if __name__ == '__main__':
    main()