https://github.com/annotation/text-fabric
Tip revision: 3bbada95e9216904cc8b12c2fd4ecf77cbf5fdf1 authored by Dirk Roorda on 09 October 2018, 15:57:26 UTC
optimal incantation
optimal incantation
Tip revision: 3bbada9
bhsa.py
import os
import re
import types
from IPython.display import display, HTML
from tf.fabric import Fabric
from tf.apphelpers import (
search,
table, plainTuple,
show, prettyPre, pretty, prettyTuple, prettySetup,
hasData, getData,
getBoundary, getFeatures,
htmlEsc, mdEsc,
dm, header, outLink,
URL_GH, URL_NB,
makeAvailableIn,
)
from tf.server.common import getConfig
from tf.notebook import location
# GitHub organisation and repository name of the main corpus; both end up in
# the release download URL that getTf() builds.
ORG = 'etcbc'
CORPUS = 'bhsa'

# Release identifiers handed to getTf() for the main corpus.
RELEASE = '1.4'
RELEASE_FIRST = '1.3'

# Companion data module "phono" and its release identifiers.
PHONO = 'phono'
PHONO_RL = '1.1'
PHONO_RL_FIRST = '1.0.1'

# Companion data module "parallels" and its release identifiers.
PARA = 'parallels'
PARA_RL = '1.1'
PARA_RL_FIRST = '1.0.1'

# SHEBANQ link templates; the {placeholders} are filled in with str.format()
# by Bhsa.shbLink().
SHEBANQ_URL = 'https://shebanq.ancient-data.org/hebrew'
SHEBANQ = ''.join((
    SHEBANQ_URL,
    '/text',
    '?book={book}&chapter={chapter}&verse={verse}&version={version}',
    '&mr=m&qw=q&tp=txt_p&tr=hb&wget=v&qget=v&nget=vt',
))
SHEBANQ_LEX = SHEBANQ_URL + '/word?version={version}&id={lid}'

# Default value of Bhsa.condenseType.
CONDENSE_TYPE = 'verse'
# Style sheet for the HTML fragments this module generates.
# Bhsa.loadCSS() prepends one of the font snippets below and either returns
# the combination (asApi mode) or displays it in the notebook.
CSS = '''
<style type="text/css">
.verse {
display: flex;
flex-flow: row wrap;
direction: rtl;
}
.vl {
display: flex;
flex-flow: column nowrap;
justify-content: flex-end;
align-items: flex-end;
direction: ltr;
width: 100%;
}
.outeritem {
display: flex;
flex-flow: row wrap;
direction: rtl;
}
.sentence,.clause,.phrase {
margin-top: -1.2em;
margin-left: 1em;
background: #ffffff none repeat scroll 0 0;
padding: 0 0.3em;
border-style: solid;
border-radius: 0.2em;
font-size: small;
display: block;
width: fit-content;
max-width: fit-content;
direction: ltr;
}
.atoms {
display: flex;
flex-flow: row wrap;
margin: 0.3em;
padding: 0.3em;
direction: rtl;
background-color: #ffffff;
}
.satom,.catom,.patom {
margin: 0.3em;
padding: 0.3em;
border-radius: 0.3em;
border-style: solid;
display: flex;
flex-flow: column nowrap;
direction: rtl;
background-color: #ffffff;
}
.sentence {
border-color: #aa3333;
border-width: 1px;
}
.clause {
border-color: #aaaa33;
border-width: 1px;
}
.phrase {
border-color: #33aaaa;
border-width: 1px;
}
.satom {
border-color: #aa3333;
border-width: 4px;
}
.catom {
border-color: #aaaa33;
border-width: 3px;
}
.patom {
border-color: #33aaaa;
border-width: 3px;
}
.word {
padding: 0.1em;
margin: 0.1em;
border-radius: 0.1em;
border: 1px solid #cccccc;
display: flex;
flex-flow: column nowrap;
direction: rtl;
background-color: #ffffff;
}
.lextp {
padding: 0.1em;
margin: 0.1em;
border-radius: 0.1em;
border: 2px solid #888888;
width: fit-content;
display: flex;
flex-flow: column nowrap;
direction: rtl;
background-color: #ffffff;
}
.occs {
font-size: x-small;
}
.satom.l,.catom.l,.patom.l {
border-left-style: dotted
}
.satom.r,.catom.r,.patom.r {
border-right-style: dotted
}
.satom.L,.catom.L,.patom.L {
border-left-style: none
}
.satom.R,.catom.R,.patom.R {
border-right-style: none
}
.h,.h a:visited,.h a:link {
font-family: "Ezra SIL", "SBL Hebrew", sans-serif;
font-size: large;
color: #000044;
direction: rtl;
text-decoration: none;
}
.hb,.hb a:visited,.hb a:link {
font-family: "Ezra SIL", "SBL Hebrew", sans-serif;
font-size: large;
direction: rtl;
text-decoration: none;
}
.rela,.function,.typ {
font-family: monospace;
font-size: small;
color: #0000bb;
}
.pdp,.pdp a:visited,.pdp a:link {
font-family: monospace;
font-size: medium;
color: #0000bb;
text-decoration: none;
}
.voc_lex {
font-family: monospace;
font-size: medium;
color: #0000bb;
}
.vs {
font-family: monospace;
font-size: medium;
font-weight: bold;
color: #0000bb;
}
.vt {
font-family: monospace;
font-size: medium;
font-weight: bold;
color: #0000bb;
}
.gloss {
font-family: sans-serif;
font-size: small;
font-weight: normal;
color: #444444;
}
.vrs {
font-family: sans-serif;
font-size: small;
font-weight: bold;
color: #444444;
}
.nd {
font-family: monospace;
font-size: x-small;
color: #999999;
}
.features {
font-family: monospace;
font-size: medium;
font-weight: bold;
color: #0a6611;
display: flex;
flex-flow: column nowrap;
padding: 0.1em;
margin: 0.1em;
direction: ltr;
}
.features .f {
font-family: sans-serif;
font-size: x-small;
font-weight: normal;
color: #5555bb;
}
.word .features div,.word .features span {
padding: 0;
margin: -0.1rem 0;
}
.hl {
background-color: #ffee66;
}
</style>
'''
# Font style sheet link used by loadCSS() in asApi mode (a site-relative URL).
CSS_FONT = '''
<link rel="stylesheet" href="/data/static/fonts.css"/>
'''
# Font style sheet link used by loadCSS() when displaying in a notebook
# (an absolute URL on fontlibrary.org).
CSS_FONT_API = '''
<link rel="stylesheet" href="https://fontlibrary.org/face/ezra" type="text/css"/>
'''
# Node type -> CSS class; exposed on Bhsa instances as `classNames`.
# The three non-atom container types share the 'atoms' class.
CLASS_NAMES = {
    'verse': 'verse',
    'sentence': 'atoms',
    'sentence_atom': 'satom',
    'clause': 'atoms',
    'clause_atom': 'catom',
    'phrase': 'atoms',
    'phrase_atom': 'patom',
    'subphrase': 'subphrase',
    'word': 'word',
    'lex': 'lextp',
}

# Atom type -> the type it is an atom of, and the inverse mapping.
ATOMS = {
    'sentence_atom': 'sentence',
    'clause_atom': 'clause',
    'phrase_atom': 'phrase',
}
SUPER = {whole: atom for (atom, whole) in ATOMS.items()}

# Node types treated as sections, and the subset treated as verses.
SECTION = {'book', 'chapter', 'verse', 'half_verse'}
VERSE = {'verse', 'half_verse'}

# Exposed on Bhsa instances as `noneValues`.
NONE_VALUES = {None, 'NA', 'none', 'unknown'}

# Whitespace-separated feature names; Bhsa.__init__ splits this into a set.
# For data versions '4' and '4b' the constructor substitutes 'g_' for 'voc_',
# i.e. voc_lex => g_lex and voc_lex_utf8 => g_lex_utf8.
STANDARD_FEATURES = (
    '\n'
    'pdp vs vt\n'
    'lex language gloss voc_lex voc_lex_utf8\n'
    'function typ rela\n'
    'number label book\n'
)

# Features that Bhsa.__init__ leaves out when it loads all explorable features.
EXCLUDED_FEATURES = {
    'crossrefLCS',
    'crossrefSET',
    'g_cons',
    'g_cons_utf8',
    'g_lex',
    'g_lex_utf8',
    'g_nme',
    'g_nme_utf8',
    'g_pfm',
    'g_pfm_utf8',
    'g_prs',
    'g_prs_utf8',
    'g_uvf',
    'g_uvf_utf8',
    'g_vbe',
    'g_vbe_utf8',
    'g_vbs',
    'g_vbs_utf8',
    'kq_hybrid',
    'kq_hybrid_utf8',
    'languageISO',
    'lex0',
    'lexeme_count',
    'mother_object_type',
    'suffix_gender',
    'suffix_number',
    'suffix_person',
}
# Shape of a passage reference "book chapter:verse", e.g. "Genesis 1:1".
# Groups: (book, chapter, verse), all captured as strings.
# A raw string is used so that `\s` reaches the regex engine as written
# instead of being an invalid string escape sequence.
PASSAGE_RE = re.compile(r'^([A-Za-z0-9_ -]+)\s+([0-9]+)\s*:\s*([0-9]+)$')

# Base URL of the corpus documentation and the name of its landing page.
DOC_URL = 'https://etcbc.github.io/bhsa'
DOC_INTRO = '0_home'
def FEATURE_URL(version, feature):
    """Return the documentation URL of `feature` for data version `version`.

    The result has the shape
    ``{DOC_URL}/features/hebrew/{version}/{feature}.html``.
    """
    return '{}/features/hebrew/{}/{}.html'.format(DOC_URL, version, feature)
def getTf(
    lgc,
    source='bhsa',
    release=RELEASE,
    firstRelease=RELEASE_FIRST,
    version='c',
    relative='{}/tf',
):
    """Ask `getData` to make the data of one module available.

    Builds the GitHub release download URL
    (``.../{ORG}/{source}/releases/download/{release}/{version}.zip``) and the
    relative data location (``{ORG}/`` followed by `relative` with `source`
    filled in), then delegates to `getData`.

    `lgc` and `firstRelease` are passed through untouched; their meaning is
    defined by `getData`. Nothing is returned.
    """
    dataUrl = 'https://github.com/{}/{}/releases/download/{}/{}.zip'.format(
        ORG, source, release, version,
    )
    repoPath = relative.format(source)
    dataRel = f'{ORG}/{repoPath}'
    getData(source, release, firstRelease, dataUrl, dataRel, version, lgc)
def hasTf(lgc, source='bhsa', version='c', relative='{}/tf'):
    """Return what `hasData` reports for one module and data version.

    The relative data location is ``{ORG}/`` followed by `relative` with
    `source` filled in; `lgc` is passed through to `hasData` untouched.
    """
    repoPath = relative.format(source)
    return hasData(lgc, f'{ORG}/{repoPath}', version)
class Bhsa(object):
def __init__(
    self,
    api=None,
    name=None,
    version='c',
    locations=None,
    modules=None,
    asApi=False,
    lgc=False,
    hoist=False,
):
    """Obtain and load the data, build documentation links and bind helpers.

    Parameters:
        api: an existing Text-Fabric api object. When it is given and
            `asApi` is false, no new Fabric is created; only the standard
            features are loaded on top of it.
        name: notebook name used for the "this notebook online" links; when
            None (and not `asApi`) it is taken from `location()`.
        version: data version. For '4' and '4b' the 'voc_' features in
            STANDARD_FEATURES are replaced by their 'g_' counterparts.
        locations, modules: passed to `Fabric`; when `asApi` is false they
            are overwritten with the values from the 'bhsa' configuration.
        asApi: when true nothing is displayed; the link snippets are stored
            on the instance instead and no CSS is loaded.
        lgc: passed through to `getTf` and `config.configure`.
        hoist: when truthy (and not `asApi`) it is handed to
            `makeAvailableIn(self, hoist)`.

    If loading fails, `self.api` is set to False and the constructor returns
    early, before the helper methods are bound.
    """
    # NOTE(review): the statement nesting below was reconstructed from the
    # control flow — confirm against the upstream file.
    self.asApi = asApi
    self.version = version
    self.condenseType = CONDENSE_TYPE
    self.exampleSection = (
        '<code>Genesis 1:1</code> (use'
        ' <a href="https://github.com/ETCBC/bhsa/blob/master/tf/c/book%40en.tf" target="_blank">'
        'English book names</a>)'
    )
    self.exampleSectionText = 'Genesis 1:1'
    standardFeatures = (
        STANDARD_FEATURES.replace('voc_', 'g_') if version in {'4', '4b'} else STANDARD_FEATURES
    )
    self.standardFeatures = set(standardFeatures.strip().split())
    if asApi or not api:
        # No usable api was handed in: fetch the three data modules and load them.
        getTf(lgc, source=CORPUS, release=RELEASE, firstRelease=RELEASE_FIRST, version=version)
        getTf(lgc, source=PHONO, release=PHONO_RL, firstRelease=PHONO_RL_FIRST, version=version)
        getTf(lgc, source=PARA, release=PARA_RL, firstRelease=PARA_RL_FIRST, version=version)
        if not asApi:
            # Outside api mode the data locations come from the 'bhsa' configuration.
            config = getConfig('bhsa')
            cfg = config.configure(lgc, version=version)
            locations = cfg['locations']
            modules = cfg['modules']
        TF = Fabric(locations=locations, modules=modules, silent=True)
        api = TF.load('', silent=True)
        self.api = api
        if api is False:
            return
        # Load every explorable node and edge feature except the excluded ones.
        allFeatures = TF.explore(silent=True, show=True)
        loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
        useFeatures = [f for f in loadableFeatures if f not in EXCLUDED_FEATURES]
        result = TF.load(useFeatures, add=True, silent=True)
        if result is False:
            self.api = False
            return
    else:
        # An api was supplied by the caller: only add the standard features.
        api.TF.load(self.standardFeatures, add=True, silent=True)
    self.prettyFeaturesLoaded = self.standardFeatures
    self.prettyFeatures = ()
    self.api = api
    self.cwd = os.getcwd()
    # Captures (org, repo, path-within-repo) from a working directory that
    # contains a ".../github/<org>/<repo>[/<path>]" part.
    cwdPat = re.compile(f'^.*/github/([^/]+)/([^/]+)((?:/.+)?)$', re.I)
    cwdRel = cwdPat.findall(self.cwd)
    if not asApi:
        if name is None:
            (nbDir, nbName, nbExt) = location()
            name = nbName
        if cwdRel:
            (thisOrg, thisRepo, thisPath) = cwdRel[0]
            onlineTail = (f'{thisOrg}/{thisRepo}' f'/blob/master{thisPath}/{name}.ipynb')
        else:
            cwdRel = None
        # onlineTail only exists when cwdRel matched; the conditions below
        # short-circuit before it is referenced otherwise.
        nbUrl = (None if name is None or cwdRel is None else f'{URL_NB}/{onlineTail}')
        ghUrl = (None if name is None or cwdRel is None else f'{URL_GH}/{onlineTail}')
    tutUrl = f'{URL_NB}/{ORG}/{CORPUS}/blob/master/tutorial/search.ipynb'
    extraUrl = f'https://dans-labs.github.io/text-fabric/Api/Bhsa/'
    dataLink = outLink(CORPUS.upper(), DOC_URL, '{provenance of this corpus}')
    featureLink = outLink(
        'Feature docs', FEATURE_URL(self.version, DOC_INTRO),
        f'{CORPUS.upper()} feature documentation'
    )
    bhsaLink = outLink('BHSA API', extraUrl, 'BHSA API documentation')
    tfLink = outLink(
        f'Text-Fabric API {api.TF.version}', 'https://dans-labs.github.io/text-fabric/Api/General/',
        'text-fabric-api'
    )
    tfsLink = outLink(
        'Search Reference',
        'https://dans-labs.github.io/text-fabric/Api/General/#search-templates',
        'Search Templates Introduction and Reference'
    )
    tutLink = outLink(
        'Search tutorial', tutUrl,
        'Search tutorial in Jupyter Notebook'
    )
    if asApi:
        # In api mode the links are kept on the instance instead of displayed.
        self.dataLink = dataLink
        self.featureLink = featureLink
        self.tfsLink = tfsLink
        self.tutLink = tutLink
    else:
        dm('**Documentation:**' f' {dataLink} {featureLink} {bhsaLink} {tfLink} {tfsLink}')
        dm('**Loaded features** (click them for info):')
        # 'book@ll' is listed first; all other names containing '@' are left out.
        lf = ['book@ll'] + [f for f in api.Fall() if '@' not in f]
        dm(' '.join(
            outLink(feature, FEATURE_URL(self.version, feature), title='info')
            for feature in lf
        ))
        if nbUrl:
            dm(
                f'''
This notebook online:
{outLink('NBViewer', nbUrl)}
{outLink('GitHub', ghUrl)}
'''
            )
    self.classNames = CLASS_NAMES
    self.noneValues = NONE_VALUES
    if not asApi:
        self.loadCSS()
        if hoist:
            makeAvailableIn(self, hoist)
    # Bind the generic app helper functions to this instance as methods.
    self.table = types.MethodType(table, self)
    self.plainTuple = types.MethodType(plainTuple, self)
    self.show = types.MethodType(show, self)
    self.prettyTuple = types.MethodType(prettyTuple, self)
    self.pretty = types.MethodType(pretty, self)
    self.prettySetup = types.MethodType(prettySetup, self)
    self.search = types.MethodType(search, self)
    self.header = types.MethodType(header, self)
def loadCSS(self):
    """Provide the style sheet for the generated HTML.

    In api mode (`self.asApi` true) the font link `CSS_FONT` plus `CSS` is
    returned as a string. Otherwise `CSS_FONT_API` plus `CSS` is displayed
    as HTML and nothing is returned.
    """
    if not self.asApi:
        display(HTML(CSS_FONT_API + CSS))
        return None
    return CSS_FONT + CSS
def shbLink(self, n, text=None, className=None, asString=False, noUrl=False):
    """Build a SHEBANQ hyperlink for node `n`.

    For a 'lex' node the link goes to the SHEBANQ word page; for any other
    node it goes to the passage that `T.sectionFromNode(n)` reports.

    Parameters:
        n: the node to link to.
        text: link text; defaults to the lexeme (lex nodes) or to the
            passage reference (other nodes).
        className: passed on to `outLink`.
        asString: when true the HTML is returned, otherwise it is displayed
            and None is returned.
        noUrl: for non-lex nodes only: use '#' as href, no title and an
            empty target. Lex nodes ignore this flag.
    """
    api = self.api
    L = api.L
    T = api.T
    F = api.F
    version = self.version
    nType = F.otype.v(n)

    if nType == 'lex':
        lexeme = F.lex.v(n)
        language = F.language.v(n)
        languageDigit = '1' if language == 'Hebrew' else '2'
        # SHEBANQ lexeme ids spell the special characters of the lexeme as letters.
        lexId = languageDigit + lexeme.translate(str.maketrans('><[/=', 'AOvni'))
        href = SHEBANQ_LEX.format(version=version, lid=lexId)
        if text is None:
            text = htmlEsc(F.voc_lex_utf8.v(n))
        result = outLink(
            text, href, title='show this lexeme in SHEBANQ', className=className,
        )
    else:
        (bookE, chapter, verse) = T.sectionFromNode(n)
        if nType == 'book':
            bookNode = n
        else:
            bookNode = L.u(n, otype='book')[0]
        book = F.book.v(bookNode)

        # Human readable reference, as detailed as the node type allows.
        if nType == 'book':
            passageText = bookE
        elif nType == 'chapter':
            passageText = f'{bookE} {chapter}'
        elif nType == 'half_verse':
            passageText = f'{bookE} {chapter}:{verse}{F.label.v(n)}'
        else:
            passageText = f'{bookE} {chapter}:{verse}'

        if noUrl:
            href = '#'
        else:
            href = SHEBANQ.format(
                version=version, book=book, chapter=chapter, verse=verse,
            )

        # Without explicit text the reference is the text; with explicit
        # text the reference becomes the title.
        if text is None:
            text = passageText
            title = 'show this passage in SHEBANQ'
        else:
            title = passageText
        if noUrl:
            title = None
        target = '' if noUrl else None
        result = outLink(text, href, title=title, className=className, target=target)

    if asString:
        return result
    display(HTML(result))
def webLink(self, n):
    """Return, as a string, the url-less link for node `n`.

    Delegates to `shbLink` with CSS class 'rwh', `asString=True` and
    `noUrl=True`.
    """
    linkOptions = dict(className='rwh', asString=True, noUrl=True)
    return self.shbLink(n, **linkOptions)
def nodeFromDefaultSection(self, sectionStr):
    """Resolve a passage reference such as "Genesis 1:1" to a verse node.

    Leading and trailing whitespace of `sectionStr`, and whitespace that the
    pattern captured as part of the book name, is ignored, so that
    " Genesis  1:1 " resolves like "Genesis 1:1".

    Returns:
        A pair (message, node). On success message is '' and node is the
        value returned by `T.nodeFromSection`; on failure message explains
        the problem (quoting `sectionStr` as given) and node is None.
    """
    api = self.api
    T = api.T
    match = PASSAGE_RE.match(sectionStr.strip())
    if not match:
        return (f'Wrong shape: "{sectionStr}". Must be "book chapter:verse"', None)
    (book, chapter, verse) = match.groups()
    # The book group may swallow blanks that precede the chapter number.
    verseNode = T.nodeFromSection((book.strip(), int(chapter), int(verse)))
    if verseNode is None:
        return (f'Not a valid verse: "{sectionStr}"', None)
    return ('', verseNode)
def plain(
    self,
    n,
    linked=True,
    withNodes=False,
    asString=False,
):
    """Render node `n` as a compact piece of HTML/markdown.

    Words, lexemes and other text-bearing nodes are shown as (escaped) text
    in a span of class 'hb'; section nodes are shown as a passage reference,
    and for verses and half verses the verse text is appended.

    Parameters:
        n: the node to render.
        linked: when true, wrap the representation in a SHEBANQ link (for
            verses only the reference is linked, not the verse text).
        withNodes: when true, append the node number.
        asString: when true, return the result instead of displaying it.

    Returns:
        The representation if `asString` or `self.asApi` is true; otherwise
        it is displayed through `dm` and None is returned.
    """
    asApi = self.asApi
    api = self.api
    L = api.L
    T = api.T
    F = api.F
    nType = F.otype.v(n)

    if not withNodes:
        nodeRep = ''
    elif asApi:
        nodeRep = f' <a href="#" class="nd">{n}</a> '
    else:
        nodeRep = f' *{n}* '

    hebrew = True
    if nType == 'word':
        rep = mdEsc(htmlEsc(T.text([n])))
    elif nType in SECTION:
        fmt = ('{}' if nType == 'book' else '{} {}' if nType == 'chapter' else '{} {}:{}')
        rep = fmt.format(*T.sectionFromNode(n))
        hebrew = False
        if nType == 'half_verse':
            rep += F.label.v(n)
        rep = mdEsc(htmlEsc(rep))
        if nType in VERSE:
            if linked:
                rep = self.shbLink(n, text=rep, asString=True)
            # Escape the verse text exactly like every other text that is
            # placed into the output.
            verseText = mdEsc(htmlEsc(T.text(L.d(n, otype='word'))))
            rep += f' <span class="hb">{verseText}</span>'
    elif nType == 'lex':
        rep = mdEsc(htmlEsc(F.voc_lex_utf8.v(n)))
    else:
        rep = mdEsc(htmlEsc(T.text(L.d(n, otype='word'))))

    # Verses were linked above, before their text was appended.
    if linked and nType not in VERSE:
        rep = self.shbLink(n, text=rep, asString=True)
    if hebrew:
        rep = f'<span class="hb">{rep}</span>'

    result = f'{rep}{nodeRep}'
    if asString or asApi:
        return result
    dm(result)
def _pretty(
    self,
    n,
    outer,
    html,
    firstSlot,
    lastSlot,
    condenseType=None,
    withNodes=True,
    suppress=None,
    highlights=None,
):
    """Append the pretty HTML rendering of node `n` to the list `html`.

    Book and chapter nodes become a single link. Other nodes become a
    `<div>` holding a header part, followed by the recursively rendered
    children. Atom nodes (sentence_atom, clause_atom, phrase_atom) get the
    header of the sentence/clause/phrase they belong to.

    Parameters:
        n: the node to render.
        outer: true for the outermost call; word and lex nodes are then
            wrapped in an extra 'outeritem' div.
        html: list of HTML fragments; it is extended in place.
        firstSlot, lastSlot: passed to `prettyPre`.
        condenseType: nodes whose type ranks above this type (according to
            `api.otypeRank`) are rendered without children.
        withNodes: when true, node numbers are included.
        suppress: passed to `getFeatures`; defaults to an empty set.
        highlights: mapping node -> colour; the empty string selects the
            'hl' CSS class instead of an inline colour. Defaults to an
            empty dict.

    Returns:
        None; the result is the mutation of `html`.
    """
    # Fresh containers per call instead of shared mutable default arguments.
    if suppress is None:
        suppress = set()
    if highlights is None:
        highlights = {}

    goOn = prettyPre(
        self,
        n,
        firstSlot,
        lastSlot,
        withNodes,
        highlights,
    )
    if not goOn:
        return
    (
        slotType, nType,
        className, boundaryClass, hlClass, hlStyle,
        nodePart,
        myStart, myEnd,
    ) = goOn

    api = self.api
    F = api.F
    L = api.L
    T = api.T
    sortNodes = api.sortNodes
    otypeRank = api.otypeRank

    bigType = condenseType is not None and otypeRank[nType] > otypeRank[condenseType]

    if nType in {'book', 'chapter'}:
        html.append(self.shbLink(n, asString=True))
        return

    # Determine which nodes are rendered inside this one.
    if bigType:
        children = ()
    elif nType in {'verse', 'half_verse'}:
        # Besides the sentence atoms below the verse, include the atoms that
        # contain its first and last slot.
        (thisFirstSlot, thisLastSlot) = getBoundary(api, n)
        children = sortNodes(
            set(L.d(n, otype='sentence_atom')) | {
                L.u(thisFirstSlot, otype='sentence_atom')[0],
                L.u(thisLastSlot, otype='sentence_atom')[0],
            }
        )
    elif nType == 'sentence':
        children = L.d(n, otype='sentence_atom')
    elif nType == 'sentence_atom' or nType == 'clause':
        children = L.d(n, otype='clause_atom')
    elif nType == 'clause_atom' or nType == 'phrase':
        children = L.d(n, otype='phrase_atom')
    elif nType == 'phrase_atom' or nType == 'subphrase':
        children = L.d(n, otype=slotType)
    elif nType == 'lex':
        children = ()
    elif nType == slotType:
        children = ()

    superType = ATOMS.get(nType, None)
    if superType:
        (superNode, superStart, superEnd) = self._getSuper(n, superType)
        # Mark the sides on which the containing node extends beyond this atom.
        if superStart < myStart:
            boundaryClass += ' r'
        if superEnd > myEnd:
            boundaryClass += ' l'
        nodePart = (f'<a href="#" class="nd">{superNode}</a>' if withNodes else '')
        shl = highlights.get(superNode, None)
        shlClass = ''
        shlStyle = ''
        if shl is not None:
            if shl == '':
                shlClass = ' hl'
            else:
                shlStyle = f' style="background-color: {shl};"'
            # NOTE(review): nesting reconstructed — the atom takes over the
            # highlight of its containing node only when that node is
            # highlighted and the atom has no highlight class of its own.
            if not hlClass:
                hlClass = shlClass
                hlStyle = shlStyle

    doOuter = outer and nType in {slotType, 'lex'}
    if doOuter:
        html.append('<div class="outeritem">')

    html.append(f'<div class="{className} {boundaryClass}{hlClass}"{hlStyle}>')

    if nType in {'verse', 'half_verse'}:
        passage = self.shbLink(n, asString=True)
        html.append(
            '\n'
            '<div class="vl">\n'
            f'<div class="vrs">{passage}</div>\n'
            f'{nodePart}\n'
            '</div>\n'
        )
    elif superType:
        typePart = self.shbLink(superNode, text=superType, asString=True)
        # Features shown in the header of each containing node type.
        superFeatures = {
            'sentence': ('number',),
            'clause': ('rela', 'typ'),
            'phrase': ('function', 'typ'),
        }
        featurePart = ''
        if superType in superFeatures:
            featurePart = getFeatures(
                self,
                superNode,
                suppress,
                superFeatures[superType],
                plain=True,
            )
        html.append(
            '\n'
            f'<div class="{superType}{shlClass}"{shlStyle}>\n'
            f'{typePart} {nodePart} {featurePart}\n'
            '</div>\n'
            '<div class="atoms">\n'
        )
    else:
        if nodePart:
            html.append(nodePart)
        heading = ''
        featurePart = ''
        occs = ''
        if nType == slotType:
            lx = L.u(n, otype='lex')[0]
            lexLink = self.shbLink(lx, text=htmlEsc(T.text([n])), asString=True)
            heading = f'<div class="h">{lexLink}</div>'
            featurePart = getFeatures(
                self,
                n,
                suppress,
                ('pdp', 'gloss', 'vs', 'vt'),
                givenValue=dict(
                    pdp=self.shbLink(n, text=htmlEsc(F.pdp.v(n)), asString=True),
                    gloss=htmlEsc(F.gloss.v(lx)),
                ),
            )
        elif nType == 'lex':
            # Link to the first and last occurrence (once if they coincide).
            wordOccs = L.d(n, otype='word')
            extremeOccs = sorted({wordOccs[0], wordOccs[-1]})
            linkOccs = ' - '.join(self.shbLink(lo, asString=True) for lo in extremeOccs)
            heading = f'<div class="h">{htmlEsc(F.voc_lex_utf8.v(n))}</div>'
            occs = f'<div class="occs">{linkOccs}</div>'
            featurePart = getFeatures(
                self,
                n,
                suppress,
                ('voc_lex', 'gloss'),
                givenValue=dict(
                    voc_lex=self.shbLink(n, text=htmlEsc(F.voc_lex.v(n)), asString=True)
                ),
            )
        html.append(heading)
        html.append(featurePart)
        html.append(occs)

    for ch in children:
        self._pretty(
            ch,
            False,
            html,
            firstSlot,
            lastSlot,
            condenseType=condenseType,
            withNodes=withNodes,
            suppress=suppress,
            highlights=highlights,
        )

    if superType:
        # Closes the 'atoms' div opened with the header of the containing node.
        html.append('\n</div>\n')
    html.append('\n</div>\n')
    if doOuter:
        html.append('</div>')
def _getSuper(self, n, tp):
    """Return the first node of type `tp` above `n`, followed by its boundary.

    The result is a tuple starting with that node and continued with
    whatever `getBoundary` yields for it.
    """
    api = self.api
    containers = api.L.u(n, otype=tp)
    superNode = containers[0]
    return (superNode,) + tuple(getBoundary(api, superNode))