{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img align=\"right\" src=\"tf-small.png\"/>\n",
    "\n",
    "# From LAF-Fabric to Text-Fabric\n",
    "\n",
    "This notebook turns the ETCBC4C dataset from LAF-Fabric into Text-Fabric.\n",
    "It also includes additional data\n",
    "\n",
    "* from the ETCBC itself:\n",
    "  * ketiv qere data\n",
    "  * lexicon data\n",
    "  * paragraph data\n",
    "* from me\n",
    "  * book names in multiple languages\n",
    "  * phonetic transcription: will be done in a separate notebook, completely inside Text-Fabric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import collections,sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0.00s This is LAF-Fabric 4.8.3\n",
      "API reference: http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html\n",
      "Feature doc: https://shebanq.ancient-data.org/static/docs/featuredoc/texts/welcome.html\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from laf.fabric import LafFabric\n",
    "from etcbc.preprocess import prep\n",
    "from etcbc.lib import Transcription\n",
    "from etcbc.blang import booklangs, booknames\n",
    "fabric = LafFabric()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Core etcbc data\n",
    "\n",
    "This is what came out of the live Emdros database running on the jakob server at the ETCBC, on 2016-11-04."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "nodeFeaturesStr = '''\n",
    "otype\n",
    "code\n",
    "det\n",
    "dist\n",
    "dist_unit\n",
    "domain\n",
    "function\n",
    "g_cons\n",
    "g_cons_utf8\n",
    "g_lex\n",
    "g_lex_utf8\n",
    "g_nme\n",
    "g_nme_utf8\n",
    "g_pfm\n",
    "g_pfm_utf8\n",
    "g_prs\n",
    "g_prs_utf8\n",
    "g_uvf\n",
    "g_uvf_utf8\n",
    "g_vbe\n",
    "g_vbe_utf8\n",
    "g_vbs\n",
    "g_vbs_utf8\n",
    "g_word\n",
    "g_word_utf8\n",
    "gn\n",
    "is_root\n",
    "kind\n",
    "lex\n",
    "ls\n",
    "mother_object_type\n",
    "nme\n",
    "nu\n",
    "number\n",
    "pdp\n",
    "pfm\n",
    "prs\n",
    "prs_gn\n",
    "prs_nu\n",
    "prs_ps\n",
    "ps\n",
    "rela\n",
    "sp\n",
    "st\n",
    "tab\n",
    "trailer\n",
    "trailer_utf8\n",
    "txt\n",
    "typ\n",
    "uvf\n",
    "vbe\n",
    "vbs\n",
    "vs\n",
    "vt\n",
    "book\n",
    "chapter\n",
    "label\n",
    "verse\n",
    "'''\n",
    "\n",
    "edgeFeaturesStr = '''\n",
    "oslots\n",
    "mother\n",
    "functional_parent\n",
    "distributional_parent\n",
    "'''\n",
    "\n",
    "intFeatures = set('''\n",
    "dist\n",
    "number\n",
    "tab\n",
    "chapter\n",
    "verse\n",
    "'''.strip().split())\n",
    "\n",
    "nodeFeatureList = nodeFeaturesStr.strip().split()\n",
    "edgeFeatureList = edgeFeaturesStr.strip().split()\n",
    "\n",
    "lfNodeFeatures = ' '.join(nodeFeatureList+('monads g_voc_lex g_voc_lex_utf8 language').strip().split())\n",
    "lfEdgeFeatures = ' '.join(set(edgeFeatureList) - {'oslots'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "source = 'etcbc'\n",
    "version = '4c'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0.00s LOADING API: please wait ... \n",
      "  0.00s DETAIL: COMPILING m: etcbc4c: UP TO DATE\n",
      "  0.00s USING main: etcbc4c DATA COMPILED AT: 2016-11-09T19-16-37\n",
      "  0.01s DETAIL: load main: G.node_anchor_min\n",
      "  0.07s DETAIL: load main: G.node_anchor_max\n",
      "  0.12s DETAIL: load main: G.node_sort\n",
      "  0.17s DETAIL: load main: G.node_sort_inv\n",
      "  0.59s DETAIL: load main: G.edges_from\n",
      "  0.64s DETAIL: load main: G.edges_to\n",
      "  0.70s DETAIL: load main: F.etcbc4_db_monads [node] \n",
      "  1.35s DETAIL: load main: F.etcbc4_db_otype [node] \n",
      "  1.97s DETAIL: load main: F.etcbc4_ft_code [node] \n",
      "  2.01s DETAIL: load main: F.etcbc4_ft_det [node] \n",
      "  2.20s DETAIL: load main: F.etcbc4_ft_dist [node] \n",
      "  2.38s DETAIL: load main: F.etcbc4_ft_dist_unit [node] \n",
      "  2.61s DETAIL: load main: F.etcbc4_ft_domain [node] \n",
      "  2.63s DETAIL: load main: F.etcbc4_ft_function [node] \n",
      "  2.74s DETAIL: load main: F.etcbc4_ft_g_cons [node] \n",
      "  2.90s DETAIL: load main: F.etcbc4_ft_g_cons_utf8 [node] \n",
      "  3.14s DETAIL: load main: F.etcbc4_ft_g_lex [node] \n",
      "  3.48s DETAIL: load main: F.etcbc4_ft_g_lex_utf8 [node] \n",
      "  3.73s DETAIL: load main: F.etcbc4_ft_g_nme [node] \n",
      "  3.83s DETAIL: load main: F.etcbc4_ft_g_nme_utf8 [node] \n",
      "  3.98s DETAIL: load main: F.etcbc4_ft_g_pfm [node] \n",
      "  4.08s DETAIL: load main: F.etcbc4_ft_g_pfm_utf8 [node] \n",
      "  4.17s DETAIL: load main: F.etcbc4_ft_g_prs [node] \n",
      "  4.27s DETAIL: load main: F.etcbc4_ft_g_prs_utf8 [node] \n",
      "  4.38s DETAIL: load main: F.etcbc4_ft_g_uvf [node] \n",
      "  4.46s DETAIL: load main: F.etcbc4_ft_g_uvf_utf8 [node] \n",
      "  4.55s DETAIL: load main: F.etcbc4_ft_g_vbe [node] \n",
      "  4.64s DETAIL: load main: F.etcbc4_ft_g_vbe_utf8 [node] \n",
      "  4.73s DETAIL: load main: F.etcbc4_ft_g_vbs [node] \n",
      "  4.82s DETAIL: load main: F.etcbc4_ft_g_vbs_utf8 [node] \n",
      "  4.90s DETAIL: load main: F.etcbc4_ft_g_voc_lex [node] \n",
      "  5.09s DETAIL: load main: F.etcbc4_ft_g_voc_lex_utf8 [node] \n",
      "  5.34s DETAIL: load main: F.etcbc4_ft_g_word [node] \n",
      "  5.54s DETAIL: load main: F.etcbc4_ft_g_word_utf8 [node] \n",
      "  5.81s DETAIL: load main: F.etcbc4_ft_gn [node] \n",
      "  5.96s DETAIL: load main: F.etcbc4_ft_is_root [node] \n",
      "  6.00s DETAIL: load main: F.etcbc4_ft_kind [node] \n",
      "  6.04s DETAIL: load main: F.etcbc4_ft_language [node] \n",
      "  6.21s DETAIL: load main: F.etcbc4_ft_lex [node] \n",
      "  6.38s DETAIL: load main: F.etcbc4_ft_ls [node] \n",
      "  6.55s DETAIL: load main: F.etcbc4_ft_mother_object_type [node] \n",
      "  6.64s DETAIL: load main: F.etcbc4_ft_nme [node] \n",
      "  6.79s DETAIL: load main: F.etcbc4_ft_nu [node] \n",
      "  6.98s DETAIL: load main: F.etcbc4_ft_number [node] \n",
      "  7.40s DETAIL: load main: F.etcbc4_ft_pdp [node] \n",
      "  7.57s DETAIL: load main: F.etcbc4_ft_pfm [node] \n",
      "  7.83s DETAIL: load main: F.etcbc4_ft_prs [node] \n",
      "  8.05s DETAIL: load main: F.etcbc4_ft_prs_gn [node] \n",
      "  8.27s DETAIL: load main: F.etcbc4_ft_prs_nu [node] \n",
      "  8.51s DETAIL: load main: F.etcbc4_ft_prs_ps [node] \n",
      "  8.74s DETAIL: load main: F.etcbc4_ft_ps [node] \n",
      "  8.91s DETAIL: load main: F.etcbc4_ft_rela [node] \n",
      "  9.33s DETAIL: load main: F.etcbc4_ft_sp [node] \n",
      "  9.65s DETAIL: load main: F.etcbc4_ft_st [node] \n",
      "  9.86s DETAIL: load main: F.etcbc4_ft_tab [node] \n",
      "  9.90s DETAIL: load main: F.etcbc4_ft_trailer [node] \n",
      "  9.99s DETAIL: load main: F.etcbc4_ft_trailer_utf8 [node] \n",
      "    10s DETAIL: load main: F.etcbc4_ft_txt [node] \n",
      "    10s DETAIL: load main: F.etcbc4_ft_typ [node] \n",
      "    10s DETAIL: load main: F.etcbc4_ft_uvf [node] \n",
      "    11s DETAIL: load main: F.etcbc4_ft_vbe [node] \n",
      "    11s DETAIL: load main: F.etcbc4_ft_vbs [node] \n",
      "    11s DETAIL: load main: F.etcbc4_ft_vs [node] \n",
      "    11s DETAIL: load main: F.etcbc4_ft_vt [node] \n",
      "    11s DETAIL: load main: F.etcbc4_sft_book [node] \n",
      "    11s DETAIL: load main: F.etcbc4_sft_chapter [node] \n",
      "    11s DETAIL: load main: F.etcbc4_sft_label [node] \n",
      "    11s DETAIL: load main: F.etcbc4_sft_verse [node] \n",
      "    11s DETAIL: load main: F.etcbc4_ft_distributional_parent [e] \n",
      "    11s DETAIL: load main: F.etcbc4_ft_functional_parent [e] \n",
      "    12s DETAIL: load main: F.etcbc4_ft_mother [e] \n",
      "    12s DETAIL: load main: C.etcbc4_ft_distributional_parent -> \n",
      "    12s DETAIL: load main: C.etcbc4_ft_functional_parent -> \n",
      "    13s DETAIL: load main: C.etcbc4_ft_mother -> \n",
      "    13s DETAIL: load main: C.etcbc4_ft_distributional_parent <- \n",
      "    13s DETAIL: load main: C.etcbc4_ft_functional_parent <- \n",
      "    14s DETAIL: load main: C.etcbc4_ft_mother <- \n",
      "    14s LOGFILE=/Users/dirk/laf/laf-fabric-output/etcbc4c/TF/__log__TF.txt\n",
      "    14s INFO: LOADING PREPARED data: please wait ... \n",
      "    14s prep prep: G.node_sort\n",
      "    14s prep prep: G.node_sort_inv\n",
      "    14s prep prep: L.node_up\n",
      "    18s prep prep: L.node_down\n",
      "    25s prep prep: V.verses\n",
      "    25s prep prep: V.books_la\n",
      "    25s ETCBC reference: http://laf-fabric.readthedocs.org/en/latest/texts/ETCBC-reference.html\n",
      "    25s INFO: LOADED PREPARED data\n",
      "    25s INFO: DATA LOADED FROM SOURCE etcbc4c AND ANNOX  FOR TASK TF AT 2016-12-17T06-45-22\n"
     ]
    }
   ],
   "source": [
    "API = fabric.load(source+version, '--', 'TF', {\n",
    "    \"xmlids\": {\"node\": False, \"edge\": False},\n",
    "    \"features\": (lfNodeFeatures, lfEdgeFeatures),\n",
    "    \"primary\": False,\n",
    "    \"prepare\": prep(select='L')\n",
    "}, verbose='DETAIL')\n",
    "exec(fabric.localnames.format(var='fabric'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "source = 'etcbc'\n",
    "version = '4c'\n",
    "ETCBC = 'hebrew/{}{}'.format(source, version)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preparations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "nodeFeatures = {}\n",
    "edgeFeatures = {}\n",
    "metaData = {}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Map slot numbers\n",
    "In TF we make sure that the slots go from 1-maxSlot consecutively.\n",
    "Maybe we have to map the original LAF-Fabric monad numbers to the node numbers of the words in TF.\n",
    "Maybe there are holes in the original monad sequence.\n",
    "\n",
    "In TF the slots start at 1, and there are no holes.\n",
    "\n",
    "We create the following mappings:\n",
    "\n",
    "* `lfFromMonad`: original monad number => LAF-Fabric node number\n",
    "* `tfFromMonad`: original monad number => Text-Fabric slot number\n",
    "* `tfFromLf   `: LAF-Fabric node number => Text-Fabric node number (only for monads/slots) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "lfFromMonad = {}\n",
    "tfFromMonad = {}\n",
    "tfFromLf = {}\n",
    "\n",
    "for w in F.otype.s('word'):\n",
    "    m = int(F.monads.v(w))\n",
    "    lfFromMonad[m] = w\n",
    "\n",
    "for (i, (m, w)) in enumerate(sorted(lfFromMonad.items())):\n",
    "    tfFromLf[w] = i+1\n",
    "    tfFromMonad[m] = i+1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "maxLfNode = max(lfn for lfn in NN())\n",
    "maxMonadNode = max(lfFromMonad.values())\n",
    "maxMonad = max(lfn for lfn in tfFromMonad.keys())\n",
    "maxSlot = max(tfn for tfn in tfFromMonad.values())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Map all nodes\n",
    "\n",
    "After the last slot, the other nodes start. We map the remaining LAF-Fabric nodes to Text-Fabric nodes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tfn = maxSlot\n",
    "for lfn in range(maxMonadNode+1, maxLfNode+1):\n",
    "    tfn += 1\n",
    "    tfFromLf[lfn] = tfn\n",
    "maxNode = tfn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "I suspect that the LF monads correspond 1-1-identical to the TF slots,\n",
    "and the TF nodes are all 1 higher than the LF nodes.\n",
    "Let us check that."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n"
     ]
    }
   ],
   "source": [
    "good = True\n",
    "for ln in tfFromLf:\n",
    "    if ln+1 != tfFromLf[ln]:\n",
    "        good = False\n",
    "for m in tfFromMonad:\n",
    "    if m != tfFromMonad[m]:\n",
    "        good = False\n",
    "print(good)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Final correspondence between LF and TF\n",
    "With this out of the way, we can make simple mappings between monads, LF nodes, slots and TF nodes.\n",
    "We free up some memory and define some simple mapping functions.\n",
    "These mapping functions must be used to translate LF nodes/monads into TF nodes/slots.\n",
    "\n",
    "Remember that the following are already defined:\n",
    "\n",
    "* `maxMonad` : maximum monad in LF\n",
    "* `maxSlot`  : maximum slot in TF\n",
    "* `maxLfNode`: maximum node in LF\n",
    "* `maxNode`  : maximum node in TF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "LAF-Fabric:\n",
      "  maxMonad     =  426581\n",
      "  maxMonadNode =  426580\n",
      "  maxLfNode    = 1436893\n",
      "Text-Fabric\n",
      "  maxSlot      =  426581\n",
      "  maxNode      = 1436894\n",
      "\n"
     ]
    }
   ],
   "source": [
    "lfFromMonad = None\n",
    "tfFromMonad = None\n",
    "tfFromLf = None\n",
    "\n",
    "tFm = lambda x: x\n",
    "tFn = lambda x: x+1\n",
    "\n",
    "def tFms(monadList): return set(monadList)\n",
    "def tFns(lfNodeList): return {n+1 for n in lfNodeList}\n",
    "\n",
    "print('''\n",
    "LAF-Fabric:\n",
    "  {:<12} = {:>7}\n",
    "  {:<12} = {:>7}\n",
    "  {:<12} = {:>7}\n",
    "Text-Fabric\n",
    "  {:<12} = {:>7}\n",
    "  {:<12} = {:>7}\n",
    "'''.format(\n",
    "    'maxMonad', maxMonad,\n",
    "    'maxMonadNode', maxMonadNode,\n",
    "    'maxLfNode', maxLfNode, \n",
    "    'maxSlot', maxSlot, \n",
    "    'maxNode', maxNode,\n",
    "))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## oslots\n",
    "\n",
    "Here is code to write the oslots edge information in a compact text file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from tf.helpers import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "oslotsData = {}\n",
    "\n",
    "for n in range(maxMonadNode+1, maxLfNode+1):\n",
    "    oslotsData[tFn(n)] = tFms(setFromSpec(F.monads.v(n)))"
   ]
  },
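  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`setFromSpec` comes from `tf.helpers` and expands a monad specification string into a set of integers.\n",
    "A minimal sketch of the behavior we rely on here (illustrative, not the actual `tf.helpers` code):\n",
    "\n",
    "```python\n",
    "def setFromSpecSketch(spec):\n",
    "    # '4,7-9' -> {4, 7, 8, 9}\n",
    "    result = set()\n",
    "    for part in spec.split(','):\n",
    "        bounds = part.split('-')\n",
    "        if len(bounds) == 1:\n",
    "            result.add(int(bounds[0]))\n",
    "        else:\n",
    "            result |= set(range(int(bounds[0]), int(bounds[1]) + 1))\n",
    "    return result\n",
    "```"
   ]
  },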
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Additional data\n",
    "\n",
    "## Book names international\n",
    "\n",
    "For each language in which book names have been translated, we add a feature `book_ll` where \n",
    "`ll` is the two letter language code.\n",
    "The feature gives for each book node the name of that book in that language."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "26 book name languages\n"
     ]
    }
   ],
   "source": [
    "langs = booklangs['Hebrew']\n",
    "names = booknames['Hebrew']\n",
    "books = [b for b in F.otype.s('book')]\n",
    "bookIndex = dict(((b,i) for (i,b) in enumerate(books)))\n",
    "\n",
    "textConfig = {\n",
    "    'sectionTypes':              'book,chapter,verse',\n",
    "    'sectionFeatures':           'book,chapter,verse',\n",
    "    'fmt:text-orig-full':        '{qere_utf8/g_word_utf8}{qere_trailer_utf8/trailer_utf8}',\n",
    "    'fmt:text-orig-full-ketiv':  '{g_word_utf8}{trailer_utf8}',\n",
    "    'fmt:text-orig-plain':       '{g_cons_utf8}{trailer_utf8}',\n",
    "    'fmt:text-trans-full':       '{qere/g_word}{qere_trailer/trailer}',\n",
    "    'fmt:text-trans-full-ketiv': '{g_word}{trailer}',\n",
    "    'fmt:text-trans-plain':      '{g_cons}{trailer}',\n",
    "    'fmt:lex-orig-full':         '{g_lex_utf8} ',\n",
    "    'fmt:lex-orig-plain':        '{lex_utf8} ',\n",
    "    'fmt:lex-trans-full':        '{g_lex} ',\n",
    "    'fmt:lex-trans-plain':       '{lex0} ',\n",
    "}\n",
    "\n",
    "metaData.update(dict(otext=textConfig))\n",
    "\n",
    "for (code, (langEng, langOwn)) in sorted(booklangs['Hebrew'].items()):\n",
    "    fName = 'book@{}'.format(code)\n",
    "    metaData[fName] = dict(\n",
    "        valueType = 'str',\n",
    "        source='blang.py in LAF-Fabric',\n",
    "        languageCode=code,\n",
    "        languageEnglish=langEng,\n",
    "        language=langOwn\n",
    "    )\n",
    "    nodeFeatures[fName] = dict(((tFn(b), names[code][bookIndex[b]]) for b in books))\n",
    "print('{} book name languages'.format(len(nodeFeatures)))"
   ]
  },
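  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In the `fmt:` templates above, a `/` between feature names means: try the features from left to right and use the first one that has a value.\n",
    "So `{qere_utf8/g_word_utf8}` renders the qere reading where there is one, and the ordinary surface form otherwise.\n",
    "A minimal sketch of that fallback, with features as plain dicts (illustrative only):\n",
    "\n",
    "```python\n",
    "def renderWord(w, qere_utf8, g_word_utf8):\n",
    "    # prefer the qere reading; fall back to the ketiv surface form\n",
    "    return qere_utf8.get(w, g_word_utf8.get(w, ''))\n",
    "```"
   ]
  },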
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Paragraphs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    10s Making mappings between clause atoms in PX and nodes in LAF\n",
      "    12s End making mappings: 90562=90562 clauses\n"
     ]
    }
   ],
   "source": [
    "inf(\"Making mappings between clause atoms in PX and nodes in LAF\")\n",
    "ca_labn2id = {}\n",
    "ca_id2labn = {}\n",
    "for n in NN():\n",
    "    otype = F.otype.v(n)\n",
    "    if otype == 'verse':\n",
    "        cur_label = F.label.v(n)\n",
    "    elif otype == 'chapter':\n",
    "        cur_subtract += cur_chapter_cas\n",
    "        cur_chapter_cas = 0\n",
    "    elif otype == 'book':\n",
    "        cur_subtract = 0\n",
    "        cur_chapter_cas = 0\n",
    "    elif otype == 'clause_atom':\n",
    "        cur_chapter_cas += 1\n",
    "        nm = int(F.number.v(n)) - cur_subtract\n",
    "        ca_labn2id[(cur_label, nm)] = n\n",
    "        ca_id2labn[n] = (cur_label, nm)\n",
    "inf(\"End making mappings: {}={} clauses\".format(len(ca_labn2id), len(ca_id2labn)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def read_px(px_file):\n",
    "    data = []\n",
    "    not_found = set()\n",
    "    px_handle = open(px_file)\n",
    "    ln = 0\n",
    "    can = 0\n",
    "    featurescan = re.compile(r'0 0 (..) [0-9]+ LineNr\\s*([0-9]+).*?Pargr:\\s*([0-9.]+)')\n",
    "    cur_label = None\n",
    "    data = []\n",
    "    for line in px_handle:\n",
    "        ln += 1\n",
    "        if line.strip()[0] != '*':\n",
    "            cur_label = line[0:10]\n",
    "            continue\n",
    "        can += 1\n",
    "        features = featurescan.findall(line)\n",
    "        if len(features) == 0:\n",
    "            msg(\"Warning: line {}: no instruction, LineNr, Pargr found\".format(ln))\n",
    "        elif len(features) > 1:\n",
    "            msg(\"Warning: line {}: multiple instruction, LineNr, Pargr found\".format(ln))\n",
    "        else:\n",
    "            feature = features[0]\n",
    "            the_ins = feature[0]\n",
    "            the_n = feature[1]\n",
    "            the_para = feature[2]\n",
    "            labn = (cur_label, int(the_n))\n",
    "            if labn not in ca_labn2id:\n",
    "                not_found.add(labn)\n",
    "                continue\n",
    "            data.append((ca_labn2id[labn], the_ins, the_n, the_para))\n",
    "    px_handle.close()\n",
    "    inf(\"Read {} paragraph annotations\".format(len(data)))\n",
    "    if not_found:\n",
    "        msg(\"Could not find {} label/line entries in index: {}\".format(len(not_found), sorted({lab for lab in not_found})))\n",
    "    else:\n",
    "        inf(\"All label/line entries found in index\")\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    12s Read 90562 paragraph annotations\n",
      "    12s All label/line entries found in index\n"
     ]
    }
   ],
   "source": [
    "base_dir = '/Users/dirk/laf/laf-fabric-data'\n",
    "px_base = '{}/{}/{}_data.{}{}'.format(base_dir, 'px', 'px', source, version)\n",
    "px = read_px(px_base)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "nodeFeatures.update(dict(\n",
    "    instruction=dict(((x[0], x[1]) for x in px)),\n",
    "    pargr=dict(((x[0], x[3]) for x in px)),\n",
    "))\n",
    "for ft in 'instruction pargr'.strip().split():\n",
    "    metaData.setdefault(ft, {})['valueType'] = 'int' if ft == 'number_in_ch' else 'str'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "514581: instruction=        .N pargr=         1\n",
      "514582: instruction=        .. pargr=         1\n",
      "514583: instruction=        .. pargr=         1\n",
      "514584: instruction=        .. pargr=         1\n",
      "514585: instruction=        .# pargr=       1.1\n",
      "514586: instruction=        .q pargr=     1.1.1\n",
      "514587: instruction=        .# pargr=     1.1.2\n",
      "514588: instruction=        .# pargr=     1.1.3\n",
      "514589: instruction=        .. pargr=     1.1.3\n",
      "514590: instruction=        .# pargr=     1.1.4\n"
     ]
    }
   ],
   "source": [
    "caStart = 514581\n",
    "for w in range(caStart, caStart+10):\n",
    "    print('{}: instruction={:>10} pargr={:>10}'.format(\n",
    "        w,\n",
    "        nodeFeatures['instruction'][w],\n",
    "        nodeFeatures['pargr'][w],\n",
    "    ))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Ketiv/qere"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "19m 47s Making mappings between verse labels in KQ and verse nodes in LAF\n",
      "19m 48s 23213 verses\n"
     ]
    }
   ],
   "source": [
    "inf(\"Making mappings between verse labels in KQ and verse nodes in LAF\")\n",
    "vlab2vnode = {}\n",
    "for vs in F.otype.s('verse'):\n",
    "    lab = F.label.v(vs)\n",
    "    vlab2vnode[lab] = vs\n",
    "inf(\"{} verses\".format(len(vlab2vnode)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_kq(kq_file):\n",
    "    inf(\"Reading Ketiv-Qere data\")\n",
    "\n",
    "    info = collections.defaultdict(lambda: [])\n",
    "    not_found = set()\n",
    "    missing = collections.defaultdict(lambda: [])\n",
    "    missed = collections.defaultdict(lambda: [])\n",
    "\n",
    "    error_limit = 10\n",
    "\n",
    "    kq_handle = open(kq_file)\n",
    "\n",
    "    ln = 0\n",
    "    can = 0\n",
    "    cur_label = None\n",
    "    for line in kq_handle:\n",
    "        ln += 1\n",
    "        can += 1\n",
    "        vlab = line[0:10]\n",
    "        fields = line.rstrip('\\n')[10:].split()\n",
    "        (ketiv, qere) = fields[0:2]\n",
    "        (qtrim, qtrailer) = Transcription.suffix_and_finales(qere)\n",
    "        vnode = vlab2vnode.get(vlab, None)\n",
    "        if vnode == None:\n",
    "            not_found.add(vlab)\n",
    "            continue\n",
    "        info[vnode].append((ketiv, qtrim, qtrailer))        \n",
    "    kq_handle.close()\n",
    "    inf(\"Read {} ketiv-qere annotations\".format(ln))\n",
    "\n",
    "    data = []\n",
    "    for vnode in info:\n",
    "        wlookup = collections.defaultdict(lambda: [])\n",
    "        wvisited = collections.defaultdict(lambda: -1)\n",
    "        wnodes = L.d('word', vnode)\n",
    "        for w in wnodes:\n",
    "            gw = F.g_word.v(w)\n",
    "            if '*' in gw:\n",
    "                gw = F.g_cons.v(w)\n",
    "                if gw == '': gw = '.'\n",
    "                if F.trailer_utf8.v(w) == '': gw += '-'\n",
    "                wlookup[gw].append(w)\n",
    "        for (ketiv, qere, qtrailer) in info[vnode]:\n",
    "            wvisited[ketiv] += 1\n",
    "            windex = wvisited[ketiv]\n",
    "            ws = wlookup.get(ketiv, None)\n",
    "            if ws == None or windex > len(ws) - 1:\n",
    "                missing[vnode].append((windex, ketiv, qere))\n",
    "                continue\n",
    "            w = ws[windex]\n",
    "            qere_u = Transcription.to_hebrew(qere)\n",
    "            qtrailer_u = Transcription.to_hebrew(qtrailer)\n",
    "            data.append((\n",
    "                w,\n",
    "                ketiv,\n",
    "                qere,\n",
    "                qtrailer.replace('\\n', ''),\n",
    "                qere_u,\n",
    "                qtrailer_u.replace('\\n', ''),\n",
    "            ))\n",
    "        for ketiv in wlookup:\n",
    "            if ketiv not in wvisited or len(wlookup[ketiv]) - 1 > wvisited[ketiv]:\n",
    "                missed[vnode].append((len(wlookup[ketiv]) - (wvisited.get(ketiv, -1) + 1), ketiv))\n",
    "    inf(\"Parsed {} ketiv-qere annotations\".format(len(data)))\n",
    "\n",
    "    if not_found:\n",
    "        msg(\"Could not find {} verses: {}\".format(len(not_found), sorted(not_found)))\n",
    "    else:\n",
    "        inf(\"All verses entries found in index\")\n",
    "    if missing:\n",
    "        msg(\"Could not locate ketivs in the text: {} verses\".format(len(missing)))\n",
    "        e = 0\n",
    "        for vnode in sorted(missing):\n",
    "            if e > error_limit: break\n",
    "            vlab = F.label.v(vnode)\n",
    "            for (windex, ketiv, qere) in missing[vnode]:\n",
    "                e += 1\n",
    "                if e > error_limit: break\n",
    "                print('NOT IN TEXT: {:<10} {:<20} #{} {}'.format(vlab, ketiv, windex, qere))\n",
    "    else:\n",
    "        inf(\"All ketivs found in the text\")\n",
    "    if missed:\n",
    "        msg(\"Could not lookup qeres in the data: {} verses\".format(len(missed)))\n",
    "        e = 0\n",
    "        for vnode in sorted(missed):\n",
    "            if e > error_limit: break\n",
    "            vlab = F.label.v(vnode)\n",
    "            for (windex, ketiv) in missed[vnode]:\n",
    "                e += 1\n",
    "                if e > error_limit: break\n",
    "                print('NOT IN DATA: {:<10} {:<20} #{}'.format(vlab, ketiv, windex))\n",
    "    else:\n",
    "        inf(\"All ketivs found in the data\")\n",
    "    return [(tFn(x[0]), x[2], x[3], x[4], x[5]) for x in data]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "19m 48s Reading Ketiv-Qere data\n",
      "19m 48s Read 1892 ketiv-qere annotations\n",
      "19m 48s Parsed 1892 ketiv-qere annotations\n",
      "19m 48s All verses entries found in index\n",
      "19m 48s All ketivs found in the text\n",
      "19m 48s All ketivs found in the data\n"
     ]
    }
   ],
   "source": [
    "base_dir = '/Users/dirk/laf/laf-fabric-data'\n",
    "kq_base = '{}/{}/{}.{}{}'.format(base_dir, 'kq', 'kq', source, version)\n",
    "kq = get_kq(kq_base)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "nodeFeatures.update(dict(\n",
    "    qere=dict(((x[0], x[1]) for x in kq)),\n",
    "    qere_trailer=dict(((x[0], x[2]) for x in kq)),\n",
    "    qere_utf8=dict(((x[0], x[3]) for x in kq)),\n",
    "    qere_trailer_utf8=dict(((x[0], x[4]) for x in kq))\n",
    "))\n",
    "for ft in 'qere qere_trailer qere_utf8 qere_trailer_utf8'.strip().split():\n",
    "    metaData.setdefault(ft, {})['valueType'] = 'str'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"*N<R\" \"N<R\" \"HA45\"\n"
     ]
    }
   ],
   "source": [
    "w1 = 105846\n",
    "print('\"{}\" \"{}\" \"{}\"'.format(F.g_word.v(w1), F.g_cons.v(w1), nodeFeatures['qere'][w1]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lexicon\n",
    "We add lexical data.\n",
    "The lexical data will not be added as features of words, but as features of lexemes.\n",
    "The lexemes will be added as fresh nodes, of a new type `lex`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "added 9236 lexemes\n",
      "maxNode is now 1446130\n"
     ]
    }
   ],
   "source": [
    "lang_map = {\n",
    "    'Hebrew': 'hbo',\n",
    "    'Aramaic': 'arc',\n",
    "}\n",
    "\n",
    "lexNode = maxNode\n",
    "lexOccs = {}\n",
    "nodeFromLex = {}\n",
    "lexFromNode = {}\n",
    "otypeData = {}\n",
    "for n in F.otype.s('word'):\n",
    "    lex = F.lex.v(n)\n",
    "    lan = lang_map[F.language.v(n)]\n",
    "    lex_id = (lan, lex)\n",
    "    lexOccs.setdefault(lex_id, []).append(tFn(n))\n",
    "    if lex_id not in nodeFromLex:\n",
    "        lexNode += 1\n",
    "        nodeFromLex[lex_id] = lexNode\n",
    "        lexFromNode[lexNode] = lex_id\n",
    "print('added {} lexemes\\nmaxNode is now {}'.format(len(nodeFromLex), lexNode)) \n",
    "for n in range(maxNode+1, lexNode+1):\n",
    "    oslotsData[n] = lexOccs[lexFromNode[n]]\n",
    "    otypeData[n] = 'lex'"
   ]
  },
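  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is the pattern by which we add a node type: allocate node numbers after the current maximum, and give each new node an `oslots` entry listing the slots of its occurrences. In miniature (hypothetical numbers):\n",
    "\n",
    "```python\n",
    "maxNodeToy = 5\n",
    "oslotsToy = {}\n",
    "for (i, occs) in enumerate([[1, 2, 5], [3, 4]]):\n",
    "    oslotsToy[maxNodeToy + 1 + i] = set(occs)\n",
    "# oslotsToy == {6: {1, 2, 5}, 7: {3, 4}}\n",
    "```"
   ]
  },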
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Lexical features\n",
    "We add extra features, based on the lexicon.\n",
    "We will add an extra otype: `lexeme`, the nodes of which will hold the lexemes.\n",
    "They will be linked to all the word nodes that contain occurrences of them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "19m 51s Reading lexicon ...\n",
      "Lexicon arc: there were 0 errors\n",
      "Lexicon hbo: there were 0 errors\n",
      "Lexicon arc has   708 entries\n",
      "Lexicon hbo has  8528 entries\n",
      "19m 51s Done\n"
     ]
    }
   ],
   "source": [
    "langs = {'hbo', 'arc'}\n",
    "lex_base = dict((lan, '{}/{}/{}.{}{}'.format(base_dir, 'lexicon', lan, source, version)) for lan in langs)\n",
    "\n",
    "def read_lex(lan):\n",
    "    lex_infile = open(lex_base[lan], encoding='utf-8')\n",
    "    lex_outfile = outfile('{}.txt'.format(lan))\n",
    "    lex_errfile = outfile('{}.err.txt'.format(lan))\n",
    "\n",
    "    lex_items = {}\n",
    "    ln = 0\n",
    "    e = 0\n",
    "    for line in lex_infile:\n",
    "        ln += 1\n",
    "        line = line.rstrip()\n",
    "        line = line.split('#')[0]\n",
    "        if line == '': continue\n",
    "        (entry, featurestr) = line.split(sep=None, maxsplit=1)\n",
    "        entry = entry.strip('\"')\n",
    "        if entry in lex_items:\n",
    "            lex_errfile.write('duplicate lexical entry {} in line {}.\\n'.format(entry, ln))\n",
    "            e += 1\n",
    "            continue\n",
    "        featurestr = featurestr.strip(':')\n",
    "        featurestr = featurestr.replace('\\\\:', chr(254))\n",
    "        featurelst = featurestr.split(':')\n",
    "        features = {}\n",
    "        for feature in featurelst:\n",
    "            comps = feature.split('=', maxsplit=1)\n",
    "            if len(comps) == 1:\n",
    "                if feature.strip().isnumeric():\n",
    "                    comps = ('_n', feature.strip())\n",
    "                else:\n",
    "                    lex_errfile.write('feature without value for lexical entry {} in line {}: {}\\n'.format(\n",
    "                            entry, ln, feature,\n",
    "                    ))\n",
    "                    e += 1\n",
    "                    continue\n",
    "            (key, value) = comps\n",
    "            value = value.replace(chr(254), ':')\n",
    "            if key in features:\n",
    "                lex_errfile.write('duplicate feature for lexical entry {} in line {}: {}={}\\n'.format(\n",
    "                        entry, ln, key, value,\n",
    "                ))\n",
    "                e += 1\n",
    "                continue\n",
    "            features[key] = value.replace('\\\\', '/')\n",
    "        if 'sp' in features and features['sp'] == 'verb':\n",
    "            if 'gl' in features:\n",
    "                gloss = features['gl']\n",
    "                if gloss.startswith('to '):\n",
    "                    features['gl'] = gloss[3:]\n",
    "        lex_items[entry] = features\n",
    "        lex_outfile.write('{}\\t{}\\n'.format(entry, features))\n",
    "        \n",
    "    lex_infile.close()\n",
    "    lex_outfile.close()\n",
    "    lex_errfile.close()\n",
    "    msgstr = \"Lexicon {}: there w\".format(lan) + ('ere {} errors'.format(e) if e != 1 else 'as 1 error')\n",
    "    print(msgstr)\n",
    "    return lex_items\n",
    "\n",
    "inf(\"Reading lexicon ...\")\n",
    "lex_entries = dict((lan, read_lex(lan)) for lan in sorted(langs))\n",
    "for lan in sorted(lex_entries):\n",
    "    print('Lexicon {} has {:>5} entries'.format(lan, len(lex_entries[lan])))\n",
    "inf(\"Done\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We inspect all word occurrences of the etcbc4 database, inspect their language and lexeme values, and construct sets of lexemes that belong to each of the two languages, ``hbo`` and ``arc``."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "19m 51s Reading ETCBC database etcbc4c ...\n",
      "19m 58s Done\n",
      "Language arc has   708 lexemes in the etcbc4c text\n",
      "Language hbo has  8528 lexemes in the etcbc4c text\n"
     ]
    }
   ],
   "source": [
    "lex_text = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(lambda: set())))\n",
    "do_value_compare = {'sp', 'ls', 'gn', 'ps', 'nu', 'st'}\n",
    "text_value_set = collections.defaultdict(lambda: set())\n",
    "node_lex = {}\n",
    "\n",
    "inf(\"Reading ETCBC database {}{} ...\".format(source, version))\n",
    "text_langs = set()\n",
    "for n in F.otype.s('word'):\n",
    "    lan = lang_map[F.language.v(n)]\n",
    "    text_langs.add(lan)\n",
    "    lex = F.lex.v(n)\n",
    "    node_lex[n] = (lan,lex)\n",
    "    lex_text[lan][lex]['sp'].add(F.sp.v(n))\n",
    "    lex_text[lan][lex]['ls'].add(F.ls.v(n))\n",
    "    lex_text[lan][lex]['gn'].add(F.gn.v(n))\n",
    "    lex_text[lan][lex]['nu'].add(F.nu.v(n))\n",
    "    lex_text[lan][lex]['ps'].add(F.ps.v(n))\n",
    "    lex_text[lan][lex]['vc'].add(F.g_voc_lex.v(n))\n",
    "    for p in do_value_compare:\n",
    "        text_value_set[p].add(F.item[p].v(n))        \n",
    "\n",
    "tf = outfile('text_lexemes.txt')\n",
    "for lan in sorted(lex_text):\n",
    "    for lex in sorted(lex_text[lan]):\n",
    "        tf.write('{} \"{}\"\\n'.format(lan, lex))\n",
    "tf.close()\n",
    "inf(\"Done\")\n",
    "for lan in sorted(lex_text):\n",
    "    print('Language {} has {:>5} lexemes in the {}{} text'.format(lan, len(lex_text[lan]), source, version))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let us now check whether all lexemes in the text occur in the lexicon and vice versa."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The intersection of hbo and arc in the etcbc4 text contains 460 lexemes\n",
      "The intersection of hbo and arc in the lexicon     contains 460 lexemes\n",
      "Lexemes in the lexical intersection of hbo and arc but not in the textual intersection: 0x: set()\n",
      "Lexemes in the textual intersection of hbo and arc but not in the lexical intersection: 0x: set()\n"
     ]
    }
   ],
   "source": [
    "mql_lan = dict(hbo='Hebrew', arc='Aramaic')\n",
    "\n",
    "arc_lex = set(lex_entries['arc'])\n",
    "hbo_lex = set(lex_entries['hbo'])\n",
    "\n",
    "arc_text = set(lex_text['arc'])\n",
    "hbo_text = set(lex_text['hbo'])\n",
    "\n",
    "hbo_and_arc_text = arc_text & hbo_text\n",
    "hbo_and_arc_lex = arc_lex & hbo_lex\n",
    "\n",
    "lex_min_text = hbo_and_arc_lex - hbo_and_arc_text\n",
    "text_min_lex = hbo_and_arc_text - hbo_and_arc_lex\n",
    "\n",
    "\n",
    "print('The intersection of hbo and arc in the etcbc4 text contains {} lexemes'.format(len(hbo_and_arc_text)))\n",
    "print('The intersection of hbo and arc in the lexicon     contains {} lexemes'.format(len(hbo_and_arc_lex)))\n",
    "print(\"Lexemes in the lexical intersection of hbo and arc but not in the textual intersection: {}x: {}\".format(\n",
    "    len(lex_min_text), lex_min_text)\n",
    ")\n",
    "print(\"Lexemes in the textual intersection of hbo and arc but not in the lexical intersection: {}x: {}\".format(\n",
    "    len(text_min_lex), text_min_lex)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "arc: lexemes in text but not in lexicon: 0x\n",
      "arc: lexemes in lexicon but not in text: 0x\n",
      "hbo: lexemes in text but not in lexicon: 0x\n",
      "hbo: lexemes in lexicon but not in text: 0x\n"
     ]
    }
   ],
   "source": [
    "arc_text_min_lex = arc_text - arc_lex\n",
    "arc_lex_min_text = arc_lex - arc_text\n",
    "\n",
    "hbo_text_min_lex = hbo_text - hbo_lex\n",
    "hbo_lex_min_text = hbo_lex - hbo_text\n",
    "\n",
    "for (myset, mymsg) in (\n",
    "    (arc_text_min_lex, 'arc: lexemes in text but not in lexicon'),\n",
    "    (arc_lex_min_text, 'arc: lexemes in lexicon but not in text'),\n",
    "    (hbo_text_min_lex, 'hbo: lexemes in text but not in lexicon'),\n",
    "    (hbo_lex_min_text, 'hbo: lexemes in lexicon but not in text'),\n",
    "):\n",
    "    print('{}: {}x{}'.format(mymsg, len(myset), '' if not myset else '\\n\\t{}'.format(', '.join(sorted(myset)))))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Vocalized lexeme\n",
    "\n",
    "The lexicon file provides an attribute `vc` for each lexeme, which is the vocalized lexeme.\n",
    "The ETCBC core data also has features `g_voc_lex` and `g_voc_lex_utf8` for each occurrence.\n",
    "\n",
    "We investigate whether the latter features are *consistent*, i.e. a property of the lexeme and lexeme only.\n",
    "If they are somehow dependent on the word occurrence, they are not consistent."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "arc\n",
      "    vc      :    0 inconsistent lexemes\n",
      "hbo\n",
      "    vc      :    0 inconsistent lexemes\n"
     ]
    }
   ],
   "source": [
    "consistent_props = {'vc'}\n",
    "\n",
    "exceptions = collections.defaultdict(lambda: collections.defaultdict(lambda: set()))\n",
    "\n",
    "incons = outfile('inconsistent.csv')\n",
    "for lan in sorted(lex_text):\n",
    "    lexemes = lex_text[lan]\n",
    "    for lexeme in sorted(lexemes):\n",
    "        properties = lexemes[lexeme]\n",
    "        for prop in consistent_props:\n",
    "            if prop in properties:\n",
    "                values = properties[prop]\n",
    "                if len(values) > 1:\n",
    "                    exceptions[lan][prop].add(lexeme)\n",
    "                    incons.write('\"{}\";\"{}\";\"{}\";{};\"{}\"\\n'.format(lan, prop, lexeme, len(values), '\";\"'.join(values)))\n",
    "incons.close()\n",
    "for lan in sorted(text_langs):\n",
    "    print(lan)\n",
    "    for prop in sorted(consistent_props):\n",
    "        extra = ''\n",
    "        print(\"{}{:<8}: {:>4} inconsistent lexemes{}\".format(\n",
    "            ' ' * 4, prop, len(exceptions.get(lan, {}).get(prop, set())), extra,\n",
    "        ))        "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So we can omit the occurrence based features `g_voc_lex` and `g_voc_lex_utf8`\n",
    "and replace them by lexeme-based features.\n",
    "We will the lexeme-based features `voc_lex` and `voc_lex_utf8`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "g_voc_lex = {}\n",
    "g_voc_lex_utf8 = {}\n",
    "\n",
    "for w in F.otype.s('word'):\n",
    "    lan = lang_map[F.language.v(w)]\n",
    "    lex = F.lex.v(w)\n",
    "    voc = F.g_voc_lex.v(w)\n",
    "    voc_utf8 = F.g_voc_lex_utf8.v(w)\n",
    "    g_voc_lex[(lan, lex)] = voc\n",
    "    g_voc_lex_utf8[(lan, lex)] = voc_utf8"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We are going to compute the features\n",
    "``freq_lex``, ``rank_lex``, ``freq_occ``, ``rank_occ``."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20m 01s Computing statistics\n",
      "20m 03s Done\n"
     ]
    }
   ],
   "source": [
    "inf('Computing statistics')\n",
    "wstats = {\n",
    "    'freqs': {\n",
    "        'lex': collections.defaultdict(lambda: collections.Counter()),\n",
    "        'occ': collections.defaultdict(lambda: collections.Counter()),\n",
    "    },\n",
    "    'ranks': {\n",
    "        'lex': collections.defaultdict(lambda: {}),\n",
    "        'occ': collections.defaultdict(lambda: {}),\n",
    "    },\n",
    "}\n",
    "langs = set()\n",
    "\n",
    "for w in F.otype.s('word'):\n",
    "    lan = lang_map[F.language.v(w)]\n",
    "    occ = F.g_cons.v(w)\n",
    "    lex = F.lex.v(w)\n",
    "    wstats['freqs']['lex'][lan][lex] += 1\n",
    "    wstats['freqs']['occ'][lan][occ] += 1\n",
    "    langs.add(lan)\n",
    "for lan in langs:\n",
    "    for tp in ['lex', 'occ']:\n",
    "        rank = -1\n",
    "        prev_n = -1\n",
    "        amount = 1\n",
    "        for (x, n) in sorted(wstats['freqs'][tp][lan].items(), key=lambda y: (-y[1], y[0])):\n",
    "            if n == prev_n:\n",
    "                amount += 1\n",
    "            else:\n",
    "                rank += amount\n",
    "                amount = 1\n",
    "            prev_n = n\n",
    "            wstats['ranks'][tp][lan][x] = rank\n",
    "inf('Done')"
   ]
  },
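  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The rank computation above uses *competition* ranking: items with equal frequency share a rank, and the next distinct frequency skips past the tied positions. The same idiom on a toy counter:\n",
    "\n",
    "```python\n",
    "import collections\n",
    "\n",
    "toyFreqs = collections.Counter(dict(a=10, b=10, c=5, d=1))\n",
    "toyRanks = {}\n",
    "rank = -1\n",
    "prev_n = -1\n",
    "amount = 1\n",
    "for (x, n) in sorted(toyFreqs.items(), key=lambda y: (-y[1], y[0])):\n",
    "    if n == prev_n:\n",
    "        amount += 1\n",
    "    else:\n",
    "        rank += amount\n",
    "        amount = 1\n",
    "    prev_n = n\n",
    "    toyRanks[x] = rank\n",
    "# toyRanks == {'a': 0, 'b': 0, 'c': 2, 'd': 3}\n",
    "```"
   ]
  },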
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Composing lexical data features\n",
    "\n",
    "The specification in ``lex_fields`` below specifies the lexicon fields in the intended order.\n",
    "It contains instructions how to construct the field values from the lexical information obtained from the lexicon files.\n",
    "\n",
    "    (source, method, name, transformation table, data type, data size, data options)\n",
    "\n",
    "## source \n",
    "May contain one of the following:\n",
    "\n",
    "* the name of a lexical feature as shown in the lexicon files, such as ``sp``, ``vc``.\n",
    "* None. \n",
    "  In this case, **method** is a code that triggers special actions, such as getting an id or something that is available to the   program that fills the lexicon table\n",
    "* the name of an other field as shown in the **name** part of the specification. \n",
    "  In this case, **method** must be a function, defined else where, that takes the value of that other field as argument. \n",
    "  The function is typically a transliteration, or a stripping action.\n",
    "\n",
    "## method\n",
    "May contain one of the following:\n",
    "\n",
    "* a code (string), indicating:\n",
    "    * ``lex``: take the value of a feature (indicated in **source**) for this entry from the lexicon file\n",
    "    * ``entry``: take the value of the entry itself as found in the lexicon file\n",
    "    * ``id``: take the id for this entry as generated by the program\n",
    "    * ``lan``: take the language of this entry\n",
    "* a function taking one argument\n",
    "    * *get_voc*: get the earlier compiled `g_voc_lex` value\n",
    "    * *get_voc_utf8*: get the earlier compiled `g_voc_lex_utf8` value\n",
    "\n",
    "## name\n",
    "The name of the field in the to be constructed annotation file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_voc(lan, lex): return g_voc_lex[(lan, lex)]\n",
    "def get_voc_utf8(lan, lex): return g_voc_lex_utf8[(lan, lex)]\n",
    "\n",
    "lex_fields = (\n",
    "    (None, 'lan', 'language', None),\n",
    "    (None, 'entry', 'lex', None),\n",
    "    (None, get_voc, 'voc', None),\n",
    "    (None, get_voc_utf8, 'voc_utf8', None),\n",
    "    ('rt', 'lex', 'root', None),\n",
    "    ('sp', 'lex', 'sp', None),\n",
    "    ('sm', 'lex', 'nametype', None),\n",
    "    ('ls', 'lex', 'ls', None),\n",
    "    ('gl', 'lex', 'gloss', None),\n",
    ")\n",
    "\n",
    "cur_lex_values = {}\n",
    "\n",
    "def compute_fields(lan, entry, lexfeats):\n",
    "    cur_lex_values.clear()\n",
    "    return tuple(compute_field(lan, entry, lexfeats, f) for f in lex_fields)\n",
    "\n",
    "def compute_field(lan, entry, lexfeats, f):\n",
    "    (source, method, name, transform) = f\n",
    "    val = None\n",
    "    if method == 'lan': val = lan\n",
    "    elif method == 'entry': val = entry\n",
    "    elif method =='lex':\n",
    "        val = lexfeats.get(f[0], '')\n",
    "    else: val = method(lan, entry)\n",
    "    cur_lex_values[f[2]] = val\n",
    "    return val\n",
    "\n",
    "lex_index = {}\n",
    "for lan in sorted(lex_entries):\n",
    "    for entry in sorted(lex_entries[lan]):\n",
    "        entry_info = compute_fields(lan, entry, lex_entries[lan][entry])\n",
    "        lex_index[(lan, entry)] = entry_info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "lexFeatures = dict(((f[2], {}) for f in lex_fields))\n",
    "for ft in lexFeatures:\n",
    "    metaData.setdefault(ft, {})['valueType'] = 'str'\n",
    "for ft in ('freq_lex', 'rank_lex'):\n",
    "    lexFeatures[ft] = {}\n",
    "    metaData.setdefault(ft, {})['valueType'] = 'int'\n",
    "\n",
    "for (lan, lex) in lex_index:\n",
    "    tfln = nodeFromLex[(lan, lex)]\n",
    "    lexInfo = lex_index[(lan, lex)]\n",
    "    for (i, f) in enumerate(lex_fields):\n",
    "        lexFeatures[f[2]][tfln] = lexInfo[i]\n",
    "    lexFeatures['freq_lex'][tfln] = str(wstats['freqs']['lex'][lan][lex])\n",
    "    lexFeatures['rank_lex'][tfln] = str(wstats['ranks']['lex'][lan][lex])\n",
    "\n",
    "occFeatures = {}\n",
    "for ft in ('freq_occ', 'rank_occ'):\n",
    "    occFeatures[ft] = {}\n",
    "    metaData.setdefault(ft, {})['valueType'] = 'int'\n",
    "\n",
    "for w in F.otype.s('word'):\n",
    "    lan = lang_map[F.language.v(w)]\n",
    "    occ = F.g_cons.v(w)\n",
    "    occFeatures['freq_occ'][tFn(w)] = str(wstats['freqs']['occ'][lan][occ])\n",
    "    occFeatures['rank_occ'][tFn(w)] = str(wstats['ranks']['occ'][lan][occ])\n",
    "\n",
    "nodeFeatures.update(lexFeatures)\n",
    "nodeFeatures.update(occFeatures)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Core node and edge features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "maxNode=1446130\n"
     ]
    }
   ],
   "source": [
    "metaData.update({'': dict(source='ETCBC4c via LAF-Fabric')})\n",
    "for nf in nodeFeatureList:\n",
    "    metaData.setdefault(nf, {})['valueType'] = 'int' if nf in intFeatures else 'str'\n",
    "for ef in edgeFeatureList:\n",
    "    metaData.setdefault(ef, {})['valueType'] = 'str'\n",
    "    \n",
    "print('maxNode={}'.format(lexNode))\n",
    "\n",
    "for nf in nodeFeatureList:\n",
    "    if nf == 'otype':\n",
    "        data = dict(((\n",
    "            tFn(n),\n",
    "            F.otype.lookup[n],\n",
    "        ) for n in range(maxLfNode+1)))\n",
    "        data.update(dict(((\n",
    "            n,\n",
    "            otypeData[n],\n",
    "        ) for n in range(maxNode+1, lexNode+1))))\n",
    "    else:\n",
    "        data = dict(((tFn(n), v) for (n,v) in F.item[nf].lookup.items()))\n",
    "    nodeFeatures.setdefault(nf, {}).update(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Last minute changes\n",
    "Some features that come from the ETCBC core have obvious defects, or we need a feature next to it that is just a little bit different. This is what we do\n",
    "\n",
    "1. `lex` contains the lexeme (in transcription) with disambiguation marks (`[/=`) appended.\n",
    "   For text transformations we prefer the bare lexeme\n",
    "1. `lex_utf` has frills at the end of many values. Probably they have arisen by transforming the lexeme plus\n",
    "   disambiguation marks into unicode. We overwrite this feature with the transform of the bare lexeme.\n",
    "1. `language` has values `Hebrew` and `Aramaic`. We prefer ISO language codes: `hbo` and `arc` instead.\n",
    "   By adding `language` for lexeme nodes we already have switched to ISO codes. Here we do the rest."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "nodeFeatures['lex0'] = {}\n",
    "nodeFeatures['lex_utf8'] = {}\n",
    "# node feature language alread exists: for lexemes\n",
    "\n",
    "nfl = nodeFeatures['lex0']\n",
    "nfu = nodeFeatures['lex_utf8']\n",
    "lnf = nodeFeatures['language']\n",
    "\n",
    "for (n, v) in nodeFeatures['lex'].items():\n",
    "    vv = v.rstrip('[/=')\n",
    "    nfl[n] = vv\n",
    "    nfu[n] = Transcription.to_hebrew(vv)\n",
    "\n",
    "for (n, v) in F.language.lookup.items():\n",
    "    lnf[tFn(n)] = lang_map[v]\n",
    "\n",
    "metaData.setdefault('lex0', {})['valueType'] = 'str'\n",
    "metaData.setdefault('lex_utf8', {})['valueType'] = 'str'\n",
    "metaData.setdefault('language', {})['valueType'] = 'str'\n",
    "\n",
    "for ef in edgeFeatureList:    \n",
    "    if ef == 'oslots':\n",
    "        data = oslotsData\n",
    "    else:\n",
    "        data = dict(((tFn(n), tFns(set(nDict.keys()))) for (n, nDict) in C.item[ef].lookup.items()))\n",
    "    edgeFeatures.setdefault(ef, {}).update(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save everything"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from tf.fabric import Fabric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "This is Text-Fabric 1.2.7\n",
      "Api reference : https://github.com/ETCBC/text-fabric/wiki/Api\n",
      "Tutorial      : https://github.com/ETCBC/text-fabric/blob/master/docs/tutorial.ipynb\n",
      "Data sources  : https://github.com/ETCBC/text-fabric-data\n",
      "Data docs     : https://etcbc.github.io/text-fabric-data/features/hebrew/etcbc4c/0_overview.html\n",
      "Shebanq docs  : https://shebanq.ancient-data.org/text\n",
      "Slack team    : https://shebanq.slack.com/signup\n",
      "Questions? Ask shebanq@ancient-data.org for an invite to Slack\n",
      "107 features found and 0 ignored\n"
     ]
    }
   ],
   "source": [
    "TF = Fabric(modules=ETCBC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  0.00s Exporting 102 node and 4 edge and 1 config features to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c:\n",
      "   |     0.04s T book                 to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@am              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@ar              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@bn              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@da              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@de              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@el              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@en              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@es              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@fa              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@fr              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@he              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@hi              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@id              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@ja              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@ko              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@la              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@nl              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@pa              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@pt              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@ru              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@sw              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@syc             to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@tr              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@ur              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@yo              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T book@zh              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.06s T chapter              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.17s T code                 to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.84s T det                  to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     1.11s T dist                 to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     1.10s T dist_unit            to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.14s T domain               to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.02s T freq_lex             to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     1.35s T freq_occ             to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.66s T function             to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.85s T g_cons               to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.87s T g_cons_utf8          to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.83s T g_lex                to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     1.05s T g_lex_utf8           to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.78s T g_nme                to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.93s T g_nme_utf8           to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.79s T g_pfm                to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.75s T g_pfm_utf8           to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.73s T g_prs                to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.65s T g_prs_utf8           to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.66s T g_uvf                to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.68s T g_uvf_utf8           to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.66s T g_vbe                to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.66s T g_vbe_utf8           to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.73s T g_vbs                to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.67s T g_vbs_utf8           to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.76s T g_word               to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     1.15s T g_word_utf8          to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.03s T gloss                to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.85s T gn                   to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.19s T instruction          to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.17s T is_root              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.19s T kind                 to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.10s T label                to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.68s T language             to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.74s T lex                  to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.75s T lex0                 to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.82s T lex_utf8             to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.68s T ls                   to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.37s T mother_object_type   to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.03s T nametype             to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.80s T nme                  to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.64s T nu                   to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     2.03s T number               to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.84s T otype                to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.15s T pargr                to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.66s T pdp                  to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.70s T pfm                  to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.66s T prs                  to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.66s T prs_gn               to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.70s T prs_nu               to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.67s T prs_ps               to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.66s T ps                   to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T qere                 to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s T qere_trailer         to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.01s T qere_trailer_utf8    to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.01s T qere_utf8            to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.02s T rank_lex             to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.74s T rank_occ             to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     1.20s T rela                 to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.02s T root                 to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.85s T sp                   to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.67s T st                   to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.18s T tab                  to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.76s T trailer              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     1.39s T trailer_utf8         to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.18s T txt                  to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     1.54s T typ                  to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.80s T uvf                  to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.81s T vbe                  to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     1.05s T vbs                  to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.08s T verse                to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.04s T voc                  to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.04s T voc_utf8             to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.92s T vs                   to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.67s T vt                   to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     3.01s T distributional_parent to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     4.51s T functional_parent    to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.70s T mother               to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     5.34s T oslots               to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      "   |     0.00s M otext                to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n",
      " 1m 00s Exported 102 node features and 4 edge features and 1 config features to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c\n"
     ]
    }
   ],
   "source": [
    "TF.save(\n",
    "    nodeFeatures=nodeFeatures,\n",
    "    edgeFeatures=edgeFeatures,\n",
    "    metaData=metaData,\n",
    ")"
   ]
  },
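  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a sanity check, one could reload a few of the freshly exported features and inspect a value or two. This is a minimal sketch, not part of the original pipeline; the features `sp` and `g_word_utf8` are just two examples picked from the export log above.\n",
    "\n",
    "```python\n",
    "# Reload a handful of the exported features and spot-check the first slot.\n",
    "TFcheck = Fabric(modules=ETCBC)\n",
    "api = TFcheck.load('sp g_word_utf8')\n",
    "api.makeAvailableIn(globals())\n",
    "\n",
    "# Part of speech and the fully pointed Hebrew word at slot 1.\n",
    "print(F.sp.v(1), F.g_word_utf8.v(1))\n",
    "```"
   ]
  },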
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}