#!/usr/bin/env python # -*- coding: iso-8859-15 -*- # KMB 2008-12-28 import re import cPickle from tkinter_app_00 import App,win # ./sawyer.py 'tur\s+æt\s' 'tur\s+ae?t\s' # ./sawyer.py 'tur\s+on\s' # ./sawyer.py 'dicitur\s+[aeio]n?\s' 'nominatur\s+[aeio]n?\s' 'nuncupan?tur\s+[aeio]n?\s' 'appellatur\s+[aeio]n?\s' ' cognominatur\s+[aeio]n?\s' html_to_ascii_table={ 'á':'á', 'à':'à', 'ä' :'ä', 'â' :'â', 'å' :'å', 'Å' :'Å', 'Ä' :'Ä', 'Á':'Á', 'æ' :'æ', 'Æ' :'Æ', 'Æ' :'Æ', 'ç':'ç', 'ð' :'ð', 'Ð' :'Ð', 'è':'è', 'È':'È', 'é':'é', 'í':'í', 'ø':'ø', 'ó':'ó', 'Ó':'Ó', 'Ö' :'Ö', 'ö' :'ö', 'þ' :'þ', 'Þ' :'Þ', 'ü' :'ü', 'ý':'ý', } def html_to_ascii(t): for x in html_to_ascii_table: t=t.replace(x,html_to_ascii_table[x]) return t def to_ascii(t): t=t.replace('{ae}','æ') t=t.replace('{AE}','Æ') t=t.replace('{dh}','ð') t=t.replace('{DH}','Ð') t=t.replace('{th}','þ') t=t.replace('{TH}','Þ') t=t.replace('{&}','&') return t def load_pkl(): global sawyer try: pkl=open('sawyer.pkl','rb') sawyer=cPickle.load(pkl) pkl.close() except: if win: raw_input('Could not load sawyer.pkl - press any key to quit.') return 'sawyer.pkl load failure' def Scmp(a,b): # chop 'html/S' and '.html' return cmp(int(a[6:-5]),int(b[6:-5])) def re_find(p,z): r=[] for i,w in enumerate(z): m=p.match(w) if m: r.append(i) return tuple(r) def find(word,cb=False): if not word: return '' re_clean_html_sup=re.compile(r'.*?') re_date=re.compile(r'A\.D\.\s+(?P\d{3,4})') rgx=re.compile(('(?i)','')[cb]+word) k=0 r='' for s in sawyer: fn='html/S%d.html'%s txt=sawyer[s] txt=re_clean_html_sup.sub('',txt) txt=txt.replace('

','').replace('

','').replace('','').replace('','').replace('\n',' ') date='' m=re_date.search(txt) if m: date=m.group('AD') if fn=='html/S1539.html': date='x-xi' ms=rgx.finditer(txt) for m in ms: try: q='S%s'%(fn[6:-5],) if date: q+=' (%s)'%date except: q='S%s'%(fn[6:-5],) if date: q+=' (%s)'%date s,e=m.start(),m.end() a,b=s-30,e+15 while txt[a]!=' ': a-=1 while txt[b]!=' ': b+=1 q+=txt[a:b] q=q.strip(' .,7') r+=to_ascii(q)+'\n' k+=1 if r: return r,to_ascii(word) return 'No match.','' if __name__=='__main__': app=App(title='Sawyer - OE charter corpus search by Keith Briggs',init=load_pkl,action=find,checkbutton='case sensitive',help='help is available at http://keithbriggs.info/search_gui.html.\n') app.mainloop()