#!/usr/bin/env python
# -*- coding: iso-8859-15 -*-
# KMB 2008-12-28
import re
import cPickle
from tkinter_app_00 import App,win
# ./sawyer.py 'tur\s+æt\s' 'tur\s+ae?t\s'
# ./sawyer.py 'tur\s+on\s'
# ./sawyer.py 'dicitur\s+[aeio]n?\s' 'nominatur\s+[aeio]n?\s' 'nuncupan?tur\s+[aeio]n?\s' 'appellatur\s+[aeio]n?\s' ' cognominatur\s+[aeio]n?\s'
# Substitution table used by html_to_ascii(): every key found in the input
# text is replaced by the value it maps to.
# NOTE(review): in this copy each key is identical to its value (and the 'Æ'
# entry appears twice), so applying the table changes nothing.  The keys were
# presumably HTML entity spellings of these characters that were decoded
# somewhere upstream -- confirm against a pristine copy and restore them.
html_to_ascii_table={
'á':'á', 'à':'à', 'ä' :'ä', 'â' :'â',
'å' :'å', 'Å' :'Å', 'Ä' :'Ä', 'Á':'Á',
'æ' :'æ', 'Æ' :'Æ', 'Æ' :'Æ',
'ç':'ç',
'ð' :'ð', 'Ð' :'Ð',
'è':'è', 'È':'È', 'é':'é',
'í':'í',
'ø':'ø', 'ó':'ó', 'Ó':'Ó', 'Ö' :'Ö',
'ö' :'ö',
'þ' :'þ', 'Þ' :'Þ',
'ü' :'ü',
'ý':'ý',
}
def html_to_ascii(t):
    """Return *t* with every key of ``html_to_ascii_table`` replaced by its value.

    The module-level table is walked in dict iteration order and each
    substitution is applied to the result of the previous one.
    """
    for needle, replacement in html_to_ascii_table.items():
        t = t.replace(needle, replacement)
    return t
def to_ascii(t):
    """Return *t* with its brace-delimited letter codes expanded.

    '{ae}', '{AE}', '{dh}', '{DH}', '{th}', '{TH}' and '{&}' are replaced,
    in that order, by the characters listed beside them below.
    """
    substitutions = (
        ('{ae}', 'æ'),
        ('{AE}', 'Æ'),
        ('{dh}', 'ð'),
        ('{DH}', 'Ð'),
        ('{th}', 'þ'),
        ('{TH}', 'Þ'),
        ('{&}', '&'),
    )
    for code, glyph in substitutions:
        t = t.replace(code, glyph)
    return t
def load_pkl():
    """Load the pickled corpus from ``sawyer.pkl`` into the global ``sawyer``.

    Returns:
        None when the file was read and unpickled, otherwise the string
        'sawyer.pkl load failure'.  When the imported ``win`` flag is truthy
        the user is prompted (and must press Enter) before the failure
        string is returned.
    """
    global sawyer
    try:
        # ``with`` closes the file even if unpickling raises, which the
        # explicit close() after the load did not.
        with open('sawyer.pkl', 'rb') as pkl:
            # NOTE(review): unpickling can execute code embedded in the
            # file; sawyer.pkl must come from a trusted source.
            sawyer = cPickle.load(pkl)
    except Exception:
        # Exception (rather than a bare except) still covers every I/O and
        # unpickling error but lets KeyboardInterrupt/SystemExit through.
        if win:
            raw_input('Could not load sawyer.pkl - press any key to quit.')
        # NOTE(review): the source's indentation was lost; the failure string
        # is assumed to be returned whether or not ``win`` is set -- confirm.
        return 'sawyer.pkl load failure'
def Scmp(a, b):  # chop 'html/S' and '.html'
    """Three-way compare two 'html/S<number>.html' names by their number.

    The first six and last five characters of each argument are dropped and
    the remainder is parsed with int().

    Returns:
        -1, 0 or 1 as the number in *a* is less than, equal to or greater
        than the number in *b*.

    Raises:
        ValueError: if the remaining text of either argument is not an
            integer.
    """
    x = int(a[6:-5])
    y = int(b[6:-5])
    # Same result as cmp(x, y) for ints, without depending on the cmp()
    # builtin, which no longer exists in Python 3.
    return (x > y) - (x < y)
def re_find(p, z):
    """Return a tuple of the positions in *z* whose item *p* matches.

    ``p.match`` is used, so the pattern has to match from the beginning of
    each item.  The positions are in increasing order.
    """
    return tuple(index for index, item in enumerate(z) if p.match(item))
# NOTE(review): the pattern text was unreadable in the reviewed copy (it read
# r'.*?'); it is reconstructed here from the name re_clean_html_sup -- confirm.
_RE_CLEAN_HTML_SUP = re.compile(r'<sup>.*?</sup>')
# The group name was missing in the reviewed copy (which made the pattern
# invalid); 'AD' is the name find() reads back with m.group('AD').
_RE_DATE = re.compile(r'A\.D\.\s+(?P<AD>\d{3,4})')


def _context_window(txt, start, end, before=30, after=15):
    """Return txt[start-before:end+after] widened outwards to the nearest spaces.

    The left edge is the last space at or before ``start - before`` and the
    right edge is the first space at or after ``end + after``; when there is
    no such space (the match is close to either end of *txt*) the window is
    clamped to the start or end of the text instead of indexing out of range.
    """
    left = txt.rfind(' ', 0, max(start - before, 0) + 1)
    if left < 0:
        left = 0
    right = txt.find(' ', end + after)
    if right < 0:
        right = len(txt)
    return txt[left:right]


def find(word, cb=False):
    """Search every text in the global ``sawyer`` for the regex *word*.

    Args:
        word: regular-expression source to search for.
        cb: when true the search is case sensitive; otherwise '(?i)' is
            prepended to the pattern.

    Returns:
        '' if *word* is empty.  Otherwise a 2-tuple: ``(report, shown_word)``
        where ``report`` holds one newline-terminated line per match, of the
        form 'S<number> (<date>) <context>', and ``shown_word`` is
        ``to_ascii(word)``; or ``('No match.', '')`` when nothing matched.

    Raises:
        re.error: if *word* is not a valid regular expression.
    """
    if not word:
        return ''
    rgx = re.compile(('' if cb else '(?i)') + word)
    lines = []
    for number in sawyer:
        number_text = '%d' % number
        txt = _RE_CLEAN_HTML_SUP.sub('', sawyer[number])
        # NOTE(review): four further ``txt.replace(<pattern>, '')`` calls sat
        # here, but their pattern literals are unreadable in the reviewed
        # copy: three appear empty (a no-op) and one appears as a bare line
        # break split over two source lines, which does not parse.  They
        # presumably removed markup; restore them from a pristine copy.  The
        # line-break one is deliberately not reproduced as '\n', because
        # deleting newlines would glue words together and turn the
        # newline-to-space step below into dead code.
        txt = txt.replace('\n', ' ')
        if number_text == '1539':
            # Hard-coded date label for S1539, overriding any A.D. year.
            date = 'x-xi'
        else:
            dated = _RE_DATE.search(txt)
            date = dated.group('AD') if dated else ''
        label = 'S' + number_text
        if date:
            label += ' (%s)' % date
        for hit in rgx.finditer(txt):
            context = _context_window(txt, hit.start(), hit.end())
            # A window normally begins on the space it was widened to; when
            # it was clamped to the start of the text, add the separator.
            if not context.startswith(' '):
                context = ' ' + context
            # '7' is stripped along with spaces and punctuation --
            # presumably it stands for the Tironian "et" sign; confirm.
            lines.append(to_ascii((label + context).strip(' .,7')))
    if not lines:
        return 'No match.', ''
    return '\n'.join(lines) + '\n', to_ascii(word)
if __name__ == '__main__':
    # Script entry point: construct the App, handing it load_pkl as ``init``
    # and find as ``action``, then enter its main loop.
    app = App(
        title='Sawyer - OE charter corpus search by Keith Briggs',
        init=load_pkl,
        action=find,
        checkbutton='case sensitive',
        help='help is available at http://keithbriggs.info/search_gui.html.\n'
    )
    app.mainloop()