{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"import collections\n",
"import json\n",
"from functools import reduce"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"BASE = os.path.expanduser('~/github')\n",
"ORG = 'Nino-cunei'\n",
"REPO = 'oldbabylonian'\n",
"VERSION = '0.2'\n",
"\n",
"REPO_PATH = f'{BASE}/{ORG}/{REPO}'\n",
"\n",
"# location of the CDLI transcription source files for this corpus version\n",
"TRANS_DIR = f'{REPO_PATH}/sources/cdli/transcriptions/{VERSION}'\n",
"\n",
"SOURCES = (\n",
"    'AbB-primary',\n",
"    'AbB-secondary',\n",
")\n",
"\n",
"SRC_EXT = '.txt'\n",
"\n",
"REPORT_DIR = f'{REPO_PATH}/reports'\n",
"\n",
"# exist_ok=True already makes this a no-op when the directory exists,\n",
"# so no separate os.path.exists() guard is needed\n",
"os.makedirs(REPORT_DIR, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"# a transcription line: a line/label token, whitespace, then the material\n",
"transRe = re.compile(r'''^[0-9a-zA-Z'.]+\\s+(.*)$''')\n",
"# material between paired underscores ('scored' alternatives)\n",
"scoreRe = re.compile(r'''_([^_]*)_''')\n",
"# a numeral such as 2(disz): the repeated sign reading is in group 1\n",
"numeralRe = re.compile(r'''[0-9]+\\(([^)]+)\\)''')\n",
"# two or more numerals glued together without separation\n",
"stickyNumeralRe = re.compile(r'''([0-9]+\\([^)]+\\)){2,}''')\n",
"# word separators: whitespace, determinative/bracket markup, dots\n",
"splitRe = re.compile(r'''[ \\t{}<>\\[\\].]+''')\n",
"wrongBeforeRe = re.compile(r'''[/|]''')\n",
"wrongAfterRe = re.compile(r'''[0-9/|][^0-9]''')\n",
"\n",
"def numeralRepl(m):\n",
"    \"\"\"Replace a matched numeral by its sign reading, padded with spaces.\"\"\"\n",
"    return ' ' + m.group(1) + ' '\n",
"\n",
"# length of the longest reading seen so far (updated by getReadings)\n",
"maxReadingLength = 0\n",
"\n",
"def readSourceFile(src, readings, errors):\n",
"    \"\"\"Harvest sign readings from one transcription file.\n",
"\n",
"    Readings are collected into readings['alternative'|'default'][kind]\n",
"    (kind is 'lower' or 'upper'); anomalies are recorded in errors,\n",
"    keyed by error label and source file, as (lineno, line, detail) tuples.\n",
"    \"\"\"\n",
"    with open(f'{TRANS_DIR}/{src}{SRC_EXT}') as fh:\n",
"        # NOTE(review): inTrans is written but never consulted below;\n",
"        # every non-skipped line is processed regardless — confirm intended\n",
"        inTrans = False\n",
"        for (l, line) in enumerate(fh, start=1):\n",
"            if line.startswith('Transcription:'):\n",
"                inTrans = True\n",
"                continue\n",
"            elif line[0].isupper():\n",
"                inTrans = False\n",
"                continue\n",
"            line = line.strip()\n",
"            match = transRe.match(line)\n",
"            if not match:\n",
"                continue\n",
"            trans = match.group(1)\n",
"            # underscores must pair up: they bracket scored material\n",
"            if trans.count('_') % 2:\n",
"                errors['unbalanced underscores'][src].add((l, line, None))\n",
"                continue\n",
"            for score in scoreRe.findall(trans):\n",
"                if score == '':\n",
"                    errors['empty score __'][src].add((l, line, None))\n",
"                    continue\n",
"                for rd in getReadings(score, src, l, line, errors):\n",
"                    kind = 'lower' if rd.islower() else 'upper' if rd.isupper() else 'mixed'\n",
"                    if kind == 'mixed':\n",
"                        errors['mixed case in alternative'][src].add((l, line, rd))\n",
"                        continue\n",
"                    readings['alternative'][kind].add(rd)\n",
"            # the default reading is the line with all scored parts removed\n",
"            primary = scoreRe.sub('', trans)\n",
"            for rd in getReadings(primary, src, l, line, errors):\n",
"                kind = 'lower' if rd.islower() else 'upper' if rd.isupper() else 'mixed'\n",
"                if kind == 'mixed':\n",
"                    errors['mixed case in default'][src].add((l, line, rd))\n",
"                    continue\n",
"                readings['default'][kind].add(rd)\n",
" \n",
" \n",
"def getReadings(material, src, l, line, errors):\n",
"    \"\"\"Split a stretch of transliteration into candidate readings.\n",
"\n",
"    Words are separated on whitespace/markup characters, then split on '-'.\n",
"    Updates the global maxReadingLength as a side effect and returns the\n",
"    set of readings that survive filterReadings().\n",
"    \"\"\"\n",
"    global maxReadingLength\n",
"\n",
"    readings = set()\n",
"    for word in splitRe.split(material):\n",
"        readings.update(word.split('-'))\n",
"\n",
"    # non-alphanumeric tokens count as length 1 for the running maximum\n",
"    longest = max(len(rd) if rd.isalnum() else 1 for rd in readings)\n",
"    maxReadingLength = max(maxReadingLength, longest)\n",
"    return filterReadings(readings, src, l, line, errors)\n",
"\n",
"def filterReadings(rds, src, l, line, errors):\n",
"    \"\"\"Weed candidate readings: strip markup, expand numerals, flag anomalies.\n",
"\n",
"    Returns the set of cleaned sub-readings; problems are recorded in errors\n",
"    as (lineno, line, detail) tuples.\n",
"    \"\"\"\n",
"    newRds = set()\n",
"    for rd in rds:\n",
"        if wrongBeforeRe.search(rd):\n",
"            errors['A malformed reading before weeding'][src].add((l, line, rd))\n",
"            continue\n",
"        # strip editorial flags and damage markers\n",
"        for ch in '#.?!$*+':\n",
"            rd = rd.replace(ch, '')\n",
"        if stickyNumeralRe.search(rd):\n",
"            errors['adjacent numerals'][src].add((l, line, rd))\n",
"        # replace numerals such as 2(disz) by their sign reading\n",
"        rd = numeralRe.sub(numeralRepl, rd)\n",
"        rd = rd.replace('(', ' ').replace(')', ' ')\n",
"        subrds = rd.strip().split()\n",
"\n",
"        for srd in subrds:\n",
"            if wrongAfterRe.search(srd):\n",
"                # BUG FIX: the else branch lacked its f-prefix and produced\n",
"                # the literal text {srd}. NOTE(review): len(rd) > 1 counts\n",
"                # characters — confirm whether len(subrds) > 1 was intended\n",
"                rdRep = f'\"{srd}\" in \"{rd}\"' if len(rd) > 1 else f'\"{srd}\"'\n",
"                errors['Z malformed reading after weeding'][src].add((l, line, rdRep))\n",
"                continue\n",
"            if len(srd) > 7:\n",
"                rdRep = f'\"{srd}\" in \"{rd}\"' if len(rd) > 1 else f'\"{srd}\"'\n",
"                errors['long reading'][src].add((l, line, rdRep))\n",
"                continue\n",
"            if srd != '':\n",
"                newRds.add(srd)\n",
"    return newRds\n",
"\n",
"def showErrors(errors, batch=10):\n",
"    \"\"\"Pretty-print collected errors, showing at most `batch` examples each.\"\"\"\n",
"    if not errors:\n",
"        print('No errors')\n",
"        return\n",
"    for (error, srcs) in sorted(errors.items()):\n",
"        print(f'ERROR {error}')\n",
"        for (src, data) in sorted(srcs.items()):\n",
"            print(f'\\t{src} ({len(data)}x)')\n",
"            for (l, line, sore) in sorted(data)[:batch]:\n",
"                soreRep = '' if sore is None else f'\"{sore}\" in '\n",
"                print(f'\\t\\t{l}: {soreRep}{line}')\n",
"            if len(data) > batch:\n",
"                print(f'\\t\\t + more')\n",
"\n",
"def printErrors(errors):\n",
"    \"\"\"Write all collected errors as a tab-separated file in REPORT_DIR.\"\"\"\n",
"    outFile = f'{REPORT_DIR}/errors.tsv'\n",
"    if os.path.exists(outFile):\n",
"        os.unlink(outFile)\n",
"    with open(outFile, 'w') as fh:\n",
"        fh.write('\\t'.join(('error', 'sourcefile', 'lineno', 'wrong', 'line')))\n",
"        fh.write('\\n')\n",
"        for (error, srcs) in sorted(errors.items()):\n",
"            for (src, data) in sorted(srcs.items()):\n",
"                for (l, line, sore) in sorted(data):\n",
"                    soreRep = '' if sore is None else sore\n",
"                    fh.write('\\t'.join((error, src, str(l), soreRep, line)))\n",
"                    fh.write('\\n')\n",
"\n",
"def showReadings(msg, readings, batch=20):\n",
"    \"\"\"Show per-class and per-kind reading counts plus sample readings.\n",
"\n",
"    At most `batch` sample readings are listed per kind; a falsy batch\n",
"    suppresses the samples altogether.\n",
"    \"\"\"\n",
"    print(f'''\n",
"================================================\n",
"= {msg} max reading length is {maxReadingLength}\n",
"================================================\n",
"''')\n",
"    totals = {\n",
"        cls: sum(len(kindItems) for kindItems in clsItems.values())\n",
"        for (cls, clsItems) in readings.items()\n",
"    }\n",
"    for (cls, clsItems) in readings.items():\n",
"        print(f'{cls:<15}: {totals[cls]:>4} readings')\n",
"        for (kind, kindItems) in clsItems.items():\n",
"            print(f'\\t{kind:<15}: {len(kindItems):>4} readings')\n",
"            if batch:\n",
"                for it in sorted(kindItems)[:batch]:\n",
"                    print(f'\\t\\t{it}')\n",
"                if len(kindItems) > batch:\n",
"                    print(f'\\t\\t + more')\n",
" \n",
"def printReadings(readings):\n",
"    \"\"\"Write all readings, tagged by case kind, as a tab-separated file.\"\"\"\n",
"    # flatten the nested class -> kind -> items structure to (kind, item) pairs\n",
"    xReadings = {\n",
"        (kind, it)\n",
"        for clsItems in readings.values()\n",
"        for (kind, kindItems) in clsItems.items()\n",
"        for it in kindItems\n",
"    }\n",
"\n",
"    outFile = f'{REPORT_DIR}/readings.tsv'\n",
"    if os.path.exists(outFile):\n",
"        os.unlink(outFile)\n",
"    with open(outFile, 'w') as fh:\n",
"        fh.write('\\t'.join(('kind', 'reading')))\n",
"        fh.write('\\n')\n",
"        for (kind, rd) in sorted(xReadings):\n",
"            fh.write('\\t'.join((kind, rd)))\n",
"            fh.write('\\n')"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# nested mappings: class -> kind -> set of readings, and\n",
"# error label -> source file -> set of (lineno, line, detail)\n",
"readings = collections.defaultdict(lambda: collections.defaultdict(set))\n",
"errors = collections.defaultdict(lambda: collections.defaultdict(set))\n",
"\n",
"# reset the running maximum before a fresh harvest\n",
"maxReadingLength = 0\n",
"\n",
"for src in SOURCES:\n",
"    readSourceFile(src, readings, errors)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"# write the error and reading reports as TSV files in REPORT_DIR\n",
"printErrors(errors)\n",
"printReadings(readings)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ERROR A malformed reading before weeding\n",
"\tAbB-primary (49x)\n",
"\t\t7549: \"ta/sza\" in 12. la ki a ta/sza li# ki x\n",
"\t\t7618: \"5/6(disz)\" in 11. _1(disz) 5/6(disz) gin2 ku3-babbar_ sza _igi-6(disz)-gal2 ku3-babbar zu2-lum dilmun-na_\n",
"\t\t8531: \"KU/MA\" in 5. x x ni-su2-uq KU/MA x [...]\n",
"\t\t8936: \"1/3(disz)\" in 8. _1/3(disz) ma-na ku3-babbar_\n",
"\t\t9198: \"1/3(disz)\" in 9. _1/3(disz) ma-na 1(disz) gin2 ku3-babbar na4_ {d}utu\n",
"\t\t9499: \"2/3(disz)\" in 4. _3(u) 2/3(disz) ma-na 8(disz) gin2 an-na-kam_\n",
"\t\t9555: \"1/3(disz)\" in 1. _1/3(disz) ma-na 5(disz) gin2 ku3-babbar_\n",
"\t\t9882: \"ir/ni\" in 25. u3 _a-sza3_ DISZ sza ta mi ir/ni ia a-di [u2-ul]\n",
"\t\t10085: \"1/2(disz)\" in 6. _1/2(disz) gin2 1(u) sze ku3-babbar na4_ {d}utu usz-ta-bi-la#-[kum]\n",
"\t\t10579: \"1/2(disz)\" in 3. _2(disz) 1/2(disz) ma-na 1(disz) 1/2(disz) gin2 ku3-babbar_\n",
"\t\t + more\n",
"\tAbB-secondary (72x)\n",
"\t\t2204: \"1/2(disz)\" in 7. _3(disz) 1/2(disz) ma-na ku3-babbar_\n",
"\t\t2206: \"1/2(disz)\" in 9. _3(disz) 1/2(disz) ma-na ku3-babbar_\n",
"\t\t2905: \"2/3(disz)\" in 5. _2/3(disz) ma-na 2(disz) gin2 ku3-babbar_ a-na _sa10 1(disz) sag-ARAD_\n",
"\t\t2911: \"1/3(disz)\" in 11. i-na li-ib-bu _1/3(disz) ma-na 2(disz) x ku3#-babbar_\n",
"\t\t3990: \"1/2(disz)\" in 9. _u4 1(disz)-e 2(disz) 1/2(disz) sila3 ninda-ta_ qa2-du ni-is-hi-szu\n",
"\t\t4125: \"1/2(disz)\" in 5. szum-ma _1/2(disz) gin2 ku3-babbar_ szum-ma _2(disz) ma-na siki_\n",
"\t\t4272: \"1/3(disz)\" in 11. sza _3(disz) 1/3(disz) gin2 ku3-babbar_ sza ma-ah-ri#-[ka]\n",
"\t\t4456: \"1/3(disz)\" in 14. da-qa-at _1/3(disz) ma-na ku3-babbar_\n",
"\t\t4470: \"1/2(disz)\" in 12. da-qa-at _1/2(disz) ma-na ku3-babbar_ szu-bi-lam-ma\n",
"\t\t5029: \"1/2(disz)\" in 10. sza-ad-da-aq-da _4(disz) 1/2(disz) gin2 ku3-[babbar_]\n",
"\t\t + more\n",
"ERROR Z malformed reading after weeding\n",
"\tAbB-secondary (15x)\n",
"\t\t14900: \"\"0,1\" in \"0,1\"\" in 7. 0,2.4 _zu2-lum 0,1 ziz2-an-na_\n",
"\t\t14900: \"\"0,2\" in \"0,2\"\" in 7. 0,2.4 _zu2-lum 0,1 ziz2-an-na_\n",
"\t\t16644: \"\"0,2\" in \"0,2\"\" in 15. u3 0,2.4 sze ma-ah-ri-ia\n",
"\t\t68155: \"\"1,1\" in \"1,1\"\" in 7. szum-ma 1(asz) _sze-gur_ szum-ma 1,1 _sze-gur_\n",
"\t\t68723: \"\"3,3\" in \"3,3\"\" in 9. 3,3 [(x)] _gur#_ im-du-ud\n",
"\t\t68724: \"\"3,3\" in \"3,3\"\" in 10. 3,3 _gur ba-zi_ na-szi\n",
"\t\t68733: \"\"46,3\" in \"46,3\"\" in 5. i-na 46,3.2.0 _gur zu2-lum_\n",
"\t\t68734: \"\"31,2\" in \"31,2\"\" in 6. 31,2.3.0 _gur_ im-du-ud\n",
"\t\t68735: \"\"5,2\" in \"5,2\"\" in 7. 5,2.0.0 _gur ba-zi_ na-szi\n",
"\t\t68736: \"\"9,4\" in \"9,4\"\" in 8. a-na si-it-ti 9,4.0.0 _gur_\n",
"\t\t + more\n",
"ERROR adjacent numerals\n",
"\tAbB-secondary (11x)\n",
"\t\t14203: \"2(u)5(disz)\" in 9. i-na _iti ziz2-a u4 2(u)5(disz) kam_\n",
"\t\t14315: \"1(u)8(disz)\" in 6. asz-szum 1(u)8(disz) _gin2 ku3-babbar_\n",
"\t\t14526: \"1(u)4(disz)\" in 7. u3 _dingir_-szu-i-bi a-na 1(u)4(disz) _gin2 ku3-babbar_\n",
"\t\t14590: \"2(gesz2)1(u)2(asz)\" in 6. 2(gesz2)1(u)2(asz) _gur a-sza3 igi uru{ki}_\n",
"\t\t14591: \"2(gesz2)1(u)\" in 7. 2(gesz2)1(u) _gur a-sza3 hi-isz sar_\n",
"\t\t14592: \"1(gesz2)4(asz)\" in 8. 1(gesz2)4(asz) _gur a-sza3 a-gar3 gu-la_\n",
"\t\t14593: \"1(geszu)6(gesz2)2(u)6(asz)\" in 9. 1(geszu)6(gesz2)2(u)6(asz) _gur_\n",
"\t\t14901: \"2(u)8(disz)\" in 8. 2(u)8(disz) _ku6 hi-a_\n",
"\t\t68722: \"1(u)2(asz)\" in 8. asz-ku-nu i-na 1(u)2(asz) _gur zu2-lum_-szu\n",
"\t\t70585: \"1(u)2(disz)\" in 12. u3 ta-asz-pi2-it 1(u)2(disz) _ab2-ga_\n",
"\t\t + more\n",
"ERROR long reading\n",
"\tAbB-primary (17x)\n",
"\t\t486: \"\"geszimmar\" in \"geszimmar\"\" in 7'. _5(disz) ma-na {gisz}zu2 geszimmar_\n",
"\t\t9148: \"\"szunigin\" in \"szunigin\"\" in 8. _szunigin 6(gesz2) 2(u) 1(asz) 5(ban2) sze gur {gesz}ban2#_ [{gesz}me]-sze#-qum\n",
"\t\t16751: \"\"szakkan2\" in \"szakkan2\"\" in 4. {d}szakkan2 u3 {d}dumu-zi\n",
"\t\t18009: \"\"isztaran\" in \"isztaran\"\" in 6. {disz}{d}isztaran-ki-nam-i-de\n",
"\t\t20132: \"\"szakkan2\" in \"szakkan2\"\" in 5'. a-na mu-ha-du-um u3 a-pil-{d}szakkan2\n",
"\t\t20146: \"\"szakkan2\" in \"szakkan2\"\" in 4. {disz}mu-ha-du-um u3 a-pil#-[{d}]szakkan2\n",
"\t\t22639: \"\"isztaran\" in \"isztaran\"\" in 3. u3 ge6-{d}isztaran\n",
"\t\t23026: \"\"taskarin\" in \"taskarin\"\" in 6. asz-szum _{gesz}taskarin# hi-a_ sza# ta#-asz-pu#-ra-am\n",
"\t\t23033: \"\"taskarin\" in \"taskarin\"\" in 13. 3(disz) szu-szi _{gesz}taskarin hi-a_\n",
"\t\t23037: \"\"taskarin\" in \"taskarin\"\" in 3. i-nu-ma 1(disz) szu-szi _{gesz}taskarin hi-a_\n",
"\t\t + more\n",
"\tAbB-secondary (29x)\n",
"\t\t3804: \"\"kuruszda\" in \"kuruszda\"\" in 5. _1(u) udu-nita2 hi-a_ sza _e2 {lu2}kuruszda_\n",
"\t\t3808: \"\"kuruszda\" in \"kuruszda\"\" in 9. _2(disz) {lu2}kuruszda_ t,u2-ur-dam\n",
"\t\t5141: \"\"geszimmar\" in \"geszimmar\"\" in 6. u3 i-na-an-na a-na _gesz-ur3 geszimmar_ na-sze-e-em\n",
"\t\t5145: \"\"geszimmar\" in \"geszimmar\"\" in 10. u3 i-na-an-na _gesz-ur3 geszimmar hi-a_\n",
"\t\t6649: \"\"muhaldim\" in \"muhaldim\"\" in 5. {disz}{d}suen-i-di2-nam _muhaldim_\n",
"\t\t7631: \"\"szandana\" in \"szandana\"\" in 4. {disz}ki-ib-ra-ab-ba _szandana_-ka\n",
"\t\t9033: \"\"isztaran\" in \"isztaran\"\" in 3. um-ma {d}isztaran-szi-it-ma-ar-ma a-hu-ka\n",
"\t\t9034: \"\"isztaran\" in \"isztaran\"\" in 4. {d}utu u3 {d}isztaran\n",
"\t\t14650: \"\"kaskaltur\" in \"kaskaltur\"\" in 6. {disz}a-li2-_kaskal+tur_-ti\n",
"\t\t22290: \"\"muhaldim\" in \"muhaldim\"\" in 4. asz-szum 1(disz) ib#-ni-{d}mar-tu _ugula muhaldim-mesz_\n",
"\t\t + more\n",
"ERROR mixed case in alternative\n",
"\tAbB-secondary (1x)\n",
"\t\t68255: \"2\" in 9. {disz}i-din-_e2-mah 2 1/2(disz) gin2 ku3-babbar_ le-qu2\n",
"ERROR mixed case in default\n",
"\tAbB-primary (1x)\n",
"\t\t103: \"0\" in 4. _{d}utu_ u3 _{d}amar-utu_ da-ri-[isz] _u4_-[mi 0]\n",
"\tAbB-secondary (49x)\n",
"\t\t11498: \"1\" in 3. s,u2-ha-ra-am 1 x ih-ta-ia-at,\n",
"\t\t11513: \"0\" in 18. a-na szu-ti-im le-qe2-em [0 0]\n",
"\t\t11810: \"0\" in 12. [x x x 0]-ta-tim\n",
"\t\t11912: \"0\" in 4. [_{d}utu_] u3 _{d}amar-utu_ asz-szu-mi-ia da-ri-isz _u4_-mi [0]\n",
"\t\t12036: \"0\" in 4. _{d}utu_ u3 _{d}amar-utu_ da-ri-[isz _u4_-mi 0]\n",
"\t\t12142: \"0\" in 10. _szuku_ u3 ma-asz-ti-[tum] x ma it ti sza bu x [0]\n",
"\t\t12783: \"0\" in 4. [x x 0] tu-sza-ba-lim\n",
"\t\t12936: \"0\" in 6. [x 0] x la i-he-su-u2\n",
"\t\t13190: \"0\" in 8. a-na _u4 5(disz) kam_ a-na _sze_-szu ba-ba-li [0]\n",
"\t\t13653: \"1\" in 6. 1 {gi}qu2-up-pa-am x [...]\n",
"\t\t + more\n",
"ERROR unbalanced underscores\n",
"\tAbB-primary (21x)\n",
"\t\t54: 6'. x _[a-sza3 s,i]-bi-it ku-un-zu-lum\n",
"\t\t203: 4. _u4 4(disz)-kam a-di i-na-an-na\n",
"\t\t7787: 17. usz#-ta-bi-lam asz-szum sze-e _szuku#-ka sza _mu 4(disz)-kam_\n",
"\t\t8591: 10. _szuku# e2_ sza a-di _iti# szu-numun#-a LI#-x [(x)]\n",
"\t\t9024: 6. _1(u) 5(asz) sze gur_ sza _lu2-kurun2-na\n",
"\t\t11813: 25. _a2 {gesz}ma2_ a-na _u4 2(u)-kam li-li-[kam]\n",
"\t\t12934: 2. u3 4(asz) 2(barig) 3(ban2) sze gur_ la-bi-ir-tim#\n",
"\t\t13119: 9. _1(ban2) ze2!(SZE)-ra-am sza# sza#-ma-asz-ki-il#-[li ...]\n",
"\t\t13281: 3. i-na _iti kin {d}inanna u4 3(u)-kam\n",
"\t\t15088: 9. _u4 <<kam iti>> 1(u) 6(disz)-kam_ sza _iti udru{duru5}\n",
"\t\t + more\n",
"\tAbB-secondary (26x)\n",
"\t\t9313: 7. [2(disz) _gin2] ku3#-babbar u3 e-em x\n",
"\t\t12409: 1. [a-na ra]-bi-a-an _an-[za-gar3]-ku-na-nu-um{ki}\n",
"\t\t12711: 2. _dumu-mesz _AZ/UG-ni-i{ki}_\n",
"\t\t14533: 14. a-na _mu 2(disz) kam# [0-0]\n",
"\t\t16503: 4. [_{d}utu_ u3 {d}]amar-utu_ li-ba#-al#-li-t,u2-ka\n",
"\t\t23809: 19. _nig2-ka9#-szu-nu li-pu-szu\n",
"\t\t23992: 7. u3 _7(disz) ma-na ku3-babbar la-bi-ir#-ti#-szu\n",
"\t\t25602: 12. _e2 dumu-munus szul-gi ki te-ep-ti-[a]\n",
"\t\t25644: 9. _gal-ukken#-[na] x x [(x) (x)]\n",
"\t\t25654: 19. _sze_-am sza# _a-sza3 gu2-un#-[szu]\n",
"\t\t + more\n",
"\n",
"================================================\n",
"= Readings max reading length is 9\n",
"================================================\n",
"\n",
"alternative : 465 readings\n",
"\tlower : 430 readings\n",
"\t\ta\n",
"\t\ta2\n",
"\t\tab\n",
"\t\tab2\n",
"\t\tabul\n",
"\t\tabzu\n",
"\t\tad\n",
"\t\taga\n",
"\t\tagrig\n",
"\t\tah\n",
"\t\tal\n",
"\t\talan\n",
"\t\tam\n",
"\t\tam3\n",
"\t\tama\n",
"\t\tamar\n",
"\t\tan\n",
"\t\tansze\n",
"\t\tapin\n",
"\t\tar\n",
"\t\tar3\n",
"\t\tasz\n",
"\t\tasz2\n",
"\t\tasza5\n",
"\t\taszgab\n",
"\t\tazlag2\n",
"\t\tba\n",
"\t\tba4\n",
"\t\tba6\n",
"\t\tbabbar\n",
"\t\tbad3\n",
"\t\tbal\n",
"\t\tbala\n",
"\t\tban\n",
"\t\tban2\n",
"\t\tbanda3\n",
"\t\tbanesz\n",
"\t\tbanszur\n",
"\t\tbappir\n",
"\t\tbar\n",
"\t\tbara2\n",
"\t\tbarig\n",
"\t\tbi\n",
"\t\tbu\n",
"\t\tbun2\n",
"\t\tbur\n",
"\t\tbur'u\n",
"\t\tbur3\n",
"\t\tburanun\n",
"\t\tburu14\n",
"\t\td\n",
"\t\tda\n",
"\t\tdab\n",
"\t\tdab5\n",
"\t\tdabin\n",
"\t\tdagal\n",
"\t\tdah\n",
"\t\tdam\n",
"\t\tdar\n",
"\t\tde3\n",
"\t\tdi\n",
"\t\tdib\n",
"\t\tdida\n",
"\t\tdidli\n",
"\t\tdilmun\n",
"\t\tdim2\n",
"\t\tdim4\n",
"\t\tdin\n",
"\t\tdingir\n",
"\t\tdiri\n",
"\t\tdirig\n",
"\t\tdisz\n",
"\t\tdu\n",
"\t\tdu10\n",
"\t\tdu11\n",
"\t\tdu3\n",
"\t\tdu5\n",
"\t\tdu6\n",
"\t\tdu7\n",
"\t\tdu8\n",
"\t\tdub\n",
"\t\tdug\n",
"\t\tdug3\n",
"\t\tduh\n",
"\t\tdumu\n",
"\t\tduru5\n",
"\t\tdusu\n",
"\t\tdusu2\n",
"\t\te\n",
"\t\te2\n",
"\t\te3\n",
"\t\tedin\n",
"\t\tegir\n",
"\t\tel\n",
"\t\telam\n",
"\t\teme\n",
"\t\ten\n",
"\t\tengar\n",
"\t\tenku\n",
"\t\tensi2\n",
"\t\t + more\n",
"\tupper : 35 readings\n",
"\t\tARAD\n",
"\t\tARAD2\n",
"\t\tBA\n",
"\t\tBAD\n",
"\t\tBUR\n",
"\t\tDU\n",
"\t\tGA\n",
"\t\tGAB\n",
"\t\tGAG\n",
"\t\tGAN2\n",
"\t\tGAZ\n",
"\t\tGESZ\n",
"\t\tIB\n",
"\t\tID\n",
"\t\tIL2\n",
"\t\tKA\n",
"\t\tKI\n",
"\t\tKIB\n",
"\t\tKU\n",
"\t\tKUM\n",
"\t\tLA\n",
"\t\tLU\n",
"\t\tNE\n",
"\t\tNIG2\n",
"\t\tNUN\n",
"\t\tPA\n",
"\t\tSAR\n",
"\t\tSZESZ\n",
"\t\tSZU\n",
"\t\tTAR\n",
"\t\tTE\n",
"\t\tTU\n",
"\t\tTUL2\n",
"\t\tTUR\n",
"\t\tUD\n",
"default : 662 readings\n",
"\tlower : 532 readings\n",
"\t\ta\n",
"\t\ta2\n",
"\t\tab\n",
"\t\tad\n",
"\t\tag\n",
"\t\tag2\n",
"\t\tah\n",
"\t\tak\n",
"\t\takszak\n",
"\t\tal\n",
"\t\talamusz\n",
"\t\tam\n",
"\t\tam3\n",
"\t\tan\n",
"\t\tap\n",
"\t\tapin\n",
"\t\taq\n",
"\t\tar\n",
"\t\tas\n",
"\t\tas,\n",
"\t\tas2\n",
"\t\tasal\n",
"\t\tasar\n",
"\t\tasz\n",
"\t\tasz2\n",
"\t\tasznan\n",
"\t\tat\n",
"\t\tat,\n",
"\t\taz\n",
"\t\taz2\n",
"\t\taz3\n",
"\t\tba\n",
"\t\tba4\n",
"\t\tba6\n",
"\t\tbabbar\n",
"\t\tbabila2\n",
"\t\tbad3\n",
"\t\tbal\n",
"\t\tbala\n",
"\t\tban2\n",
"\t\tban3\n",
"\t\tbar\n",
"\t\tbarig\n",
"\t\tbat\n",
"\t\tbe\n",
"\t\tbe2\n",
"\t\tbi\n",
"\t\tbi2\n",
"\t\tbil\n",
"\t\tbil2\n",
"\t\tbir2\n",
"\t\tbir4\n",
"\t\tbisz\n",
"\t\tblank\n",
"\t\tbu\n",
"\t\tbur\n",
"\t\tbur'u\n",
"\t\tbur3\n",
"\t\tburanun\n",
"\t\td\n",
"\t\tda\n",
"\t\tdag\n",
"\t\tdah\n",
"\t\tdam\n",
"\t\tdan\n",
"\t\tdar\n",
"\t\tde\n",
"\t\tde4\n",
"\t\tdi\n",
"\t\tdi2\n",
"\t\tdi3\n",
"\t\tdib\n",
"\t\tdidli\n",
"\t\tdil\n",
"\t\tdim\n",
"\t\tdin\n",
"\t\tdingir\n",
"\t\tdisz\n",
"\t\tdu\n",
"\t\tdu10\n",
"\t\tdu3\n",
"\t\tdu6\n",
"\t\tdu8\n",
"\t\tdub\n",
"\t\tdug\n",
"\t\tdul3\n",
"\t\tdumu\n",
"\t\tduru5\n",
"\t\te\n",
"\t\te2\n",
"\t\te3\n",
"\t\tea\n",
"\t\teb\n",
"\t\ted\n",
"\t\teg\n",
"\t\teh\n",
"\t\tek\n",
"\t\tel\n",
"\t\tel2\n",
"\t\tel3\n",
"\t\t + more\n",
"\tupper : 130 readings\n",
"\t\tA\n",
"\t\tAB\n",
"\t\tAD\n",
"\t\tAG\n",
"\t\tAH\n",
"\t\tAK\n",
"\t\tAL\n",
"\t\tAM\n",
"\t\tAN\n",
"\t\tAR\n",
"\t\tARAD\n",
"\t\tARAD2\n",
"\t\tAS,\n",
"\t\tAS2\n",
"\t\tASZ\n",
"\t\tAZ\n",
"\t\tBA\n",
"\t\tBAR\n",
"\t\tBE\n",
"\t\tBI\n",
"\t\tBU\n",
"\t\tBUR\n",
"\t\tDA\n",
"\t\tDAM\n",
"\t\tDI\n",
"\t\tDIM\n",
"\t\tDIN\n",
"\t\tDISZ\n",
"\t\tDU\n",
"\t\tE\n",
"\t\tEDIN\n",
"\t\tEK\n",
"\t\tEL\n",
"\t\tER\n",
"\t\tGA\n",
"\t\tGAL\n",
"\t\tGAN2\n",
"\t\tGESZ\n",
"\t\tGI\n",
"\t\tGIR\n",
"\t\tGIR2\n",
"\t\tGISZ\n",
"\t\tGU\n",
"\t\tHA\n",
"\t\tHI\n",
"\t\tHU\n",
"\t\tI\n",
"\t\tIA\n",
"\t\tIB\n",
"\t\tID\n",
"\t\tIG\n",
"\t\tIH\n",
"\t\tIK\n",
"\t\tIL\n",
"\t\tIM\n",
"\t\tIR\n",
"\t\tIRI\n",
"\t\tISZ\n",
"\t\tIZ\n",
"\t\tKA\n",
"\t\tKAB\n",
"\t\tKAM\n",
"\t\tKI\n",
"\t\tKIB\n",
"\t\tKU\n",
"\t\tKUR\n",
"\t\tLA\n",
"\t\tLAM\n",
"\t\tLI\n",
"\t\tLU\n",
"\t\tLU2\n",
"\t\tLUM\n",
"\t\tMA\n",
"\t\tME\n",
"\t\tMI\n",
"\t\tNA\n",
"\t\tNAM\n",
"\t\tNE\n",
"\t\tNI\n",
"\t\tNIM\n",
"\t\tNIN\n",
"\t\tNU\n",
"\t\tPA\n",
"\t\tPI\n",
"\t\tRA\n",
"\t\tRI\n",
"\t\tRU\n",
"\t\tS,I\n",
"\t\tSA\n",
"\t\tSAG\n",
"\t\tSAR\n",
"\t\tSIG\n",
"\t\tSU\n",
"\t\tSZA\n",
"\t\tSZE\n",
"\t\tSZE3\n",
"\t\tSZI\n",
"\t\tSZIM\n",
"\t\tSZU\n",
"\t\tTA\n",
"\t\t + more\n"
]
}
],
"source": [
"# display collected errors and up to 100 sample readings per kind\n",
"showErrors(errors)\n",
"showReadings('Readings', readings, batch=100)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1(a)2(b)3(c)\n",
"['1(a)', '2(b)', '3(c)']\n",
"['1(a)', '2(b)', '3(c)']\n"
]
},
{
"data": {
"text/plain": [
"'gur-1(a) 2(b) 3(c)-ki'"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# scratch check: splitting glued numerals like 1(a)2(b)3(c) apart.\n",
"# NOTE: this redefines numeralRe and stickyNumeralRe from the main cell\n",
"# (with different groups) — rerun the main cell before harvesting again.\n",
"numeralRe = re.compile(r'''([0-9]+\\([^)]+\\))''')\n",
"stickyNumeralRe = re.compile(r'''((?:[0-9]+\\([^)]+\\)){2,})''')\n",
"\n",
"def stickyNumeralRepl(match):\n",
"    \"\"\"Re-join the individual numerals of a sticky run with spaces.\"\"\"\n",
"    parts = numeralRe.findall(match.group(1))\n",
"    # debug output (the duplicated findall print has been removed)\n",
"    print(match.group(1))\n",
"    print(parts)\n",
"    return ' '.join(parts)\n",
"\n",
"x = 'gur-1(a)2(b)3(c)-ki'\n",
"stickyNumeralRe.sub(stickyNumeralRepl, x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}