glyphs.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import collections\n",
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"# Directory layout: ~/github/<ORG>/<REPO>\n",
"BASE = os.path.expanduser('~/github')\n",
"ORG = 'Nino-cunei'\n",
"REPO = 'oldbabylonian'\n",
"VERSION = '0.2'\n",
"\n",
"REPO_PATH = '/'.join((BASE, ORG, REPO))\n",
"\n",
"# The sign mapping in two forms: tab-separated text (written by this notebook)\n",
"# and a pickle (read by this notebook).\n",
"MAP_FILE_T = REPO_PATH + '/sources/writing/signs.txt'\n",
"MAP_FILE_P = REPO_PATH + '/sources/writing/signs.p'"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"columns 0 and 1 are equal\n",
"columns 0 and 2 are equal\n",
"Written data to /Users/dirk/github/Nino-cunei/oldbabylonian/sources/writing/signs.txt\n",
"headers = ['value', 'form', 'character', 'language']\n",
"Data has 9907 rows\n",
"Data has languages {'sux'}\n",
"found 446 duplicate values:\n",
"\t...Ac ...ingara /cumun/ 1(car)u) 1/3(dic@c) 1/4 2/3(dic@c) 4(dic@v) 4(dic@v@c) KWU127~a and more\n",
"found 1311 duplicate characters:\n",
"\tββ π ππ ππ ππΊ ππ πππ² ππΌ ππ ππ© and more\n",
"found 75 duplicate value,forms:\n",
"\t4(dic@v),LIMMU 4(dic@v@c),LIMMU KWU127~a,|ZI&ZI| LAK469~a,|ZI&ZI| MZL101~a,|ZI&ZI| arata,|LAMΓ(KUR.RU)| at,AD at2,GIRβ@g bat,BAD bit,Eβ and more\n",
"found 75 duplicate value,form,languages:\n",
"\t4(dic@v),LIMMU,sux 4(dic@v@c),LIMMU,sux KWU127~a,|ZI&ZI|,sux LAK469~a,|ZI&ZI|,sux MZL101~a,|ZI&ZI|,sux arata,|LAMΓ(KUR.RU)|,sux at,AD,sux at2,GIRβ@g,sux bat,BAD,sux bit,Eβ,sux and more\n",
"rows with language = \"sux\"\n",
"\tfound 446 duplicate values:\n",
"\t\t...Ac ...ingara /cumun/ 1(car)u) 1/3(dic@c) 1/4 2/3(dic@c) 4(dic@v) 4(dic@v@c) KWU127~a and more\n",
"['LAK797', 'A', 'π', 'sux']\n",
"['MZL839', 'A', 'π', 'sux']\n",
"[')u4', 'A', 'π', 'sux']\n",
"['a', 'A', 'π', 'sux']\n",
"['aia2', 'A', 'π', 'sux']\n",
"['aya2', 'A', 'π', 'sux']\n",
"['barx', 'A', 'π', 'sux']\n",
"['bunijx', 'A', 'π', 'sux']\n",
"['burx', 'A', 'π', 'sux']\n",
"['dur5', 'A', 'π', 'sux']\n",
"\n",
"... 9887 rows ...\n",
"\n",
"['1/8', 'Fβ', 'π', 'sux']\n",
"['1/4', 'Fβ', 'π ', 'sux']\n",
"['1/4(iku)', 'Fβ', 'π ', 'sux']\n",
"['1/6', 'Fβ
', 'π‘', 'sux']\n",
"['1/4', 'Fβ', 'π’', 'sux']\n",
"['/', 'Pβ', 'π°', 'sux']\n",
"[':', 'Pβ', 'π±', 'sux']\n",
"[':\"', 'Pβ', 'π²', 'sux']\n",
"[':.', 'Pβ', 'π³', 'sux']\n",
"['::', 'Pβ
', '\\ue100', 'sux']\n"
]
}
],
"source": [
"def checkSignMapData(path):\n",
"    '''Validate the pickled sign mapping and export it as a text file.\n",
"\n",
"    The pickle at `path` is loaded and treated as a sequence of tables, each\n",
"    table being a list of rows (the messages call the tables 'columns').\n",
"    Every table is compared with the first one. Only if all are equal, the\n",
"    first table is written to MAP_FILE_T as tab-separated text; its first row\n",
"    is taken as the headers, and uniqueness checks plus a sample of the\n",
"    remaining rows are printed.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    path: string\n",
"        Location of the pickle file to check.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list | None\n",
"        The rows without the header row, or None if the tables differ or if\n",
"        there is nothing beyond the headers.\n",
"    '''\n",
"\n",
"    # NOTE(review): pickle.load can execute arbitrary code,\n",
"    # so only feed this function files from a trusted source.\n",
"    with open(path, 'rb') as fh:\n",
"        pData = pickle.load(fh)\n",
"\n",
"    def compare(col1, col2):\n",
"        '''Tell whether tables col1 and col2 of pData are equal, reporting the outcome.'''\n",
"        p1Data = pData[col1]\n",
"        p2Data = pData[col2]\n",
"        for (i, (row1, row2)) in enumerate(zip(p1Data, p2Data)):\n",
"            if row1 != row2:\n",
"                print(f'row {i} is different')\n",
"                print(row1)\n",
"                print(row2)\n",
"                return False\n",
"        # zip stops at the shortest table, so a length difference needs its own check\n",
"        if len(p1Data) != len(p2Data):\n",
"            print(\n",
"                f'columns {col1} and {col2} have different lengths: '\n",
"                f'{len(p1Data)} versus {len(p2Data)}'\n",
"            )\n",
"            return False\n",
"        print(f'columns {col1} and {col2} are equal')\n",
"        return True\n",
"\n",
"    # compare every table with the first one; do not stop at the first\n",
"    # difference, so that all differences get reported\n",
"    good = True\n",
"    for c in range(1, len(pData)):\n",
"        if not compare(0, c):\n",
"            good = False\n",
"\n",
"    if not good or not len(pData):\n",
"        print('No data delivered')\n",
"        return None\n",
"\n",
"    data = pData[0]\n",
"\n",
"    if not len(data):\n",
"        print('Data is empty')\n",
"        return None\n",
"\n",
"    # the text export includes the header row;\n",
"    # explicit encoding because the data contains non-ASCII characters\n",
"    with open(MAP_FILE_T, 'w', encoding='utf8') as tfh:\n",
"        for row in data:\n",
"            rowStr = '\\t'.join(row)\n",
"            tfh.write(f'{rowStr}\\n')\n",
"    print(f'Written data to {MAP_FILE_T}')\n",
"\n",
"    headers = data[0]\n",
"    data = data[1:]\n",
"    lData = len(data)\n",
"    print(f'headers = {headers}')\n",
"\n",
"    if not lData:\n",
"        print('No rows')\n",
"        return None\n",
"\n",
"    # maximum number of items shown in each report\n",
"    batch = 10\n",
"\n",
"    langs = {row[3] for row in data}\n",
"    print(f'Data has {lData} rows')\n",
"    print(f'Data has languages {langs}')\n",
"\n",
"    def checkUnique(cols, per=None):\n",
"        '''Report the values that occur more than once in the given columns.\n",
"\n",
"        cols is a column index or an iterable of column indices; with several\n",
"        columns their values are combined per row. If per is a column index,\n",
"        the rows are grouped by the value in that column and every group is\n",
"        checked and reported separately.\n",
"        '''\n",
"        if isinstance(cols, int):\n",
"            cols = (cols,)\n",
"        colNames = ','.join(headers[col] for col in cols)\n",
"\n",
"        if per is None:\n",
"            indent = ''\n",
"            chunks = {None: data}\n",
"        else:\n",
"            indent = '\\t'\n",
"            chunks = collections.defaultdict(list)\n",
"            for row in data:\n",
"                chunks[row[per]].append(row)\n",
"\n",
"        for (perVal, rows) in sorted(chunks.items()):\n",
"            if perVal is not None:\n",
"                print(f'rows with {headers[per]} = \"{perVal}\"')\n",
"\n",
"            # fresh sets for every group: a value is only a duplicate\n",
"            # if it is repeated within the same group\n",
"            values = set()\n",
"            duplicates = set()\n",
"            for row in rows:\n",
"                value = ','.join(row[col] for col in cols)\n",
"                dest = duplicates if value in values else values\n",
"                dest.add(value)\n",
"\n",
"            if duplicates:\n",
"                lDups = len(duplicates)\n",
"                print(f'{indent}found {lDups} duplicate {colNames}s:')\n",
"                rest = '' if lDups <= batch else ' and more'\n",
"                dupStr = ' '.join(sorted(duplicates)[0:batch])\n",
"                print(f'{indent}\\t{dupStr}{rest}')\n",
"            else:\n",
"                print(f'{indent}no duplicate {colNames}s')\n",
"\n",
"    checkUnique(0)\n",
"    checkUnique(2)\n",
"    checkUnique((0, 1))\n",
"    checkUnique((0, 1, 3))\n",
"    checkUnique(0, per=3)\n",
"\n",
"    # show the first and the last rows; if there are too few rows to have\n",
"    # a middle part, show all of them exactly once\n",
"    if lData <= 2 * batch:\n",
"        for row in data:\n",
"            print(row)\n",
"    else:\n",
"        for row in data[0:batch]:\n",
"            print(row)\n",
"        print(f'\\n... {lData - 2 * batch} rows ...\\n')\n",
"        for row in data[-batch:]:\n",
"            print(row)\n",
"\n",
"    return data\n",
"\n",
"\n",
"data = checkSignMapData(MAP_FILE_P)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"def makeMap(data):\n",
"    '''Placeholder without implementation: accepts `data` and returns None.'''\n",
"    return None\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import collections\n",
"from unicodedata import name as uname"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Unicode blocks to scan: block name => (first, last) code point as hex strings\n",
"cuneiBlocks = dict(\n",
"    (blockName, (first, last))\n",
"    for (blockName, first, last) in (\n",
"        ('Cuneiform', '12000', '123FF'),\n",
"        ('Cuneiform Numbers and Punctuation', '12400', '1247F'),\n",
"        ('Early Dynastic Cuneiform', '12480', '1254F'),\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# English count words => their numeric value (one .. nine)\n",
"cNumber = {\n",
"    word: value\n",
"    for (value, word) in enumerate(\n",
"        'one two three four five six seven eight nine'.split(),\n",
"        start=1,\n",
"    )\n",
"}\n",
"\n",
"# Lowercased glyph names that are accepted as the base of a plain numeric sign\n",
"numericGlyphs = {\n",
"    'ash',\n",
"    'ash9',\n",
"    'ban2',\n",
"    'buru',\n",
"    'dish',\n",
"    'eshe3',\n",
"    'esh16',\n",
"    'esh21',\n",
"    'gesh2',\n",
"    'geshu',\n",
"    'ilimmu',\n",
"    'ilimmu3',\n",
"    'ilimmu4',\n",
"    'imin',\n",
"    'imin3',\n",
"    'limmu',\n",
"    'limmu4',\n",
"    'shar2',\n",
"    'sharu',\n",
"    'u',\n",
"    'ussu',\n",
"    'ussu3',\n",
"}\n",
"\n",
"# Fraction words => their denominator\n",
"fractions = {\n",
"    'half': 2,\n",
"    'third': 3,\n",
"    'thirds': 3,\n",
"    'quarter': 4,\n",
"    'sixths': 6,\n",
"    'eighth': 8,\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1360 positions; 1234 cuneiform characters\n",
"126 skipped positions\n",
"number: 65\n",
"12400 = π = ash <= CUNEIFORM NUMERIC SIGN TWO ASH\n",
"12401 = π = ash <= CUNEIFORM NUMERIC SIGN THREE ASH\n",
"12402 = π = ash <= CUNEIFORM NUMERIC SIGN FOUR ASH\n",
"12403 = π = ash <= CUNEIFORM NUMERIC SIGN FIVE ASH\n",
"12404 = π = ash <= CUNEIFORM NUMERIC SIGN SIX ASH\n",
"12405 = π
= ash <= CUNEIFORM NUMERIC SIGN SEVEN ASH\n",
"12406 = π = ash <= CUNEIFORM NUMERIC SIGN EIGHT ASH\n",
"12407 = π = ash <= CUNEIFORM NUMERIC SIGN NINE ASH\n",
"12408 = π = dish <= CUNEIFORM NUMERIC SIGN THREE DISH\n",
"12409 = π = dish <= CUNEIFORM NUMERIC SIGN FOUR DISH\n",
"1240a = π = dish <= CUNEIFORM NUMERIC SIGN FIVE DISH\n",
"1240b = π = dish <= CUNEIFORM NUMERIC SIGN SIX DISH\n",
"1240c = π = dish <= CUNEIFORM NUMERIC SIGN SEVEN DISH\n",
"1240d = π = dish <= CUNEIFORM NUMERIC SIGN EIGHT DISH\n",
"1240e = π = dish <= CUNEIFORM NUMERIC SIGN NINE DISH\n",
"1240f = π = u <= CUNEIFORM NUMERIC SIGN FOUR U\n",
"12410 = π = u <= CUNEIFORM NUMERIC SIGN FIVE U\n",
"12411 = π = u <= CUNEIFORM NUMERIC SIGN SIX U\n",
"12412 = π = u <= CUNEIFORM NUMERIC SIGN SEVEN U\n",
"12413 = π = u <= CUNEIFORM NUMERIC SIGN EIGHT U\n",
"numberSpecial: 10\n",
"12432 = π² = shar2 times gal plus dish <= CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH\n",
"12433 = π³ = shar2 times gal plus min <= CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN\n",
"12456 = π = nigidamin <= CUNEIFORM NUMERIC SIGN NIGIDAMIN\n",
"12457 = π = nigidaesh <= CUNEIFORM NUMERIC SIGN NIGIDAESH\n",
"12461 = π‘ = old assyrian one sixth <= CUNEIFORM NUMERIC SIGN OLD ASSYRIAN ONE SIXTH\n",
"12462 = π’ = old assyrian one quarter <= CUNEIFORM NUMERIC SIGN OLD ASSYRIAN ONE QUARTER\n",
"12465 = π₯ = elamite one third <= CUNEIFORM NUMERIC SIGN ELAMITE ONE THIRD\n",
"12466 = π¦ = elamite two thirds <= CUNEIFORM NUMERIC SIGN ELAMITE TWO THIRDS\n",
"12467 = π§ = elamite forty <= CUNEIFORM NUMERIC SIGN ELAMITE FORTY\n",
"12468 = π¨ = elamite fifty <= CUNEIFORM NUMERIC SIGN ELAMITE FIFTY\n",
"numberVar: 29\n",
"12425 = π₯ = shar2 ~ <= CUNEIFORM NUMERIC SIGN THREE SHAR2 VARIANT FORM\n",
"1242f = π― = sharu ~ <= CUNEIFORM NUMERIC SIGN THREE SHARU VARIANT FORM\n",
"12437 = π· = buru ~ <= CUNEIFORM NUMERIC SIGN THREE BURU VARIANT FORM\n",
"1243a = πΊ = esh16 ~ <= CUNEIFORM NUMERIC SIGN THREE VARIANT FORM ESH16\n",
"1243b = π» = esh21 ~ <= CUNEIFORM NUMERIC SIGN THREE VARIANT FORM ESH21\n",
"1243c = πΌ = limmu ~ <= CUNEIFORM NUMERIC SIGN FOUR VARIANT FORM LIMMU\n",
"1243d = π½ = limmu4 ~ <= CUNEIFORM NUMERIC SIGN FOUR VARIANT FORM LIMMU4\n",
"1243e = πΎ = limmu ~a <= CUNEIFORM NUMERIC SIGN FOUR VARIANT FORM LIMMU A\n",
"1243f = πΏ = limmu ~b <= CUNEIFORM NUMERIC SIGN FOUR VARIANT FORM LIMMU B\n",
"12440 = π = ash9 ~ <= CUNEIFORM NUMERIC SIGN SIX VARIANT FORM ASH9\n",
"12441 = π = imin3 ~ <= CUNEIFORM NUMERIC SIGN SEVEN VARIANT FORM IMIN3\n",
"12442 = π = imin ~a <= CUNEIFORM NUMERIC SIGN SEVEN VARIANT FORM IMIN A\n",
"12443 = π = imin ~b <= CUNEIFORM NUMERIC SIGN SEVEN VARIANT FORM IMIN B\n",
"12444 = π = ussu ~ <= CUNEIFORM NUMERIC SIGN EIGHT VARIANT FORM USSU\n",
"12445 = π
= ussu3 ~ <= CUNEIFORM NUMERIC SIGN EIGHT VARIANT FORM USSU3\n",
"12446 = π = ilimmu ~ <= CUNEIFORM NUMERIC SIGN NINE VARIANT FORM ILIMMU\n",
"12447 = π = ilimmu3 ~ <= CUNEIFORM NUMERIC SIGN NINE VARIANT FORM ILIMMU3\n",
"12448 = π = ilimmu4 ~ <= CUNEIFORM NUMERIC SIGN NINE VARIANT FORM ILIMMU4\n",
"12449 = π = ilimmu ~a <= CUNEIFORM NUMERIC SIGN NINE VARIANT FORM ILIMMU A\n",
"12453 = π = ban2 ~ <= CUNEIFORM NUMERIC SIGN FOUR BAN2 VARIANT FORM\n",
"odd: 7\n",
"1245a = π = third dish <= CUNEIFORM NUMERIC SIGN ONE THIRD DISH\n",
"1245b = π = thirds dish <= CUNEIFORM NUMERIC SIGN TWO THIRDS DISH\n",
"1245c = π = sixths dish <= CUNEIFORM NUMERIC SIGN FIVE SIXTHS DISH\n",
"1245f = π = eighth ash <= CUNEIFORM NUMERIC SIGN ONE EIGHTH ASH\n",
"12460 = π = quarter ash <= CUNEIFORM NUMERIC SIGN ONE QUARTER ASH\n",
"12463 = π£ = quarter gur <= CUNEIFORM NUMERIC SIGN ONE QUARTER GUR\n",
"12464 = π€ = half gur <= CUNEIFORM NUMERIC SIGN ONE HALF GUR\n",
"punct: 5\n",
"12470 = π° = old assyrian word divider <= CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER\n",
"12471 = π± = vertical colon <= CUNEIFORM PUNCTUATION SIGN VERTICAL COLON\n",
"12472 = π² = diagonal colon <= CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON\n",
"12473 = π³ = diagonal tricolon <= CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON\n",
"12474 = π΄ = diagonal quadcolon <= CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON\n",
"sign: 1118\n",
"12000 = π = a <= CUNEIFORM SIGN A\n",
"12001 = π = a times a <= CUNEIFORM SIGN A TIMES A\n",
"12002 = π = a times bad <= CUNEIFORM SIGN A TIMES BAD\n",
"12003 = π = a times gan2 tenu <= CUNEIFORM SIGN A TIMES GAN2 TENU\n",
"12004 = π = a times ha <= CUNEIFORM SIGN A TIMES HA\n",
"12005 = π
= a times igi <= CUNEIFORM SIGN A TIMES IGI\n",
"12006 = π = a times lagar gunu <= CUNEIFORM SIGN A TIMES LAGAR GUNU\n",
"12007 = π = a times mush <= CUNEIFORM SIGN A TIMES MUSH\n",
"12008 = π = a times sag <= CUNEIFORM SIGN A TIMES SAG\n",
"12009 = π = a2 <= CUNEIFORM SIGN A2\n",
"1200a = π = ab <= CUNEIFORM SIGN AB\n",
"1200b = π = ab times ash2 <= CUNEIFORM SIGN AB TIMES ASH2\n",
"1200c = π = ab times dun3 gunu <= CUNEIFORM SIGN AB TIMES DUN3 GUNU\n",
"1200d = π = ab times gal <= CUNEIFORM SIGN AB TIMES GAL\n",
"1200e = π = ab times gan2 tenu <= CUNEIFORM SIGN AB TIMES GAN2 TENU\n",
"1200f = π = ab times ha <= CUNEIFORM SIGN AB TIMES HA\n",
"12010 = π = ab times igi gunu <= CUNEIFORM SIGN AB TIMES IGI GUNU\n",
"12011 = π = ab times imin <= CUNEIFORM SIGN AB TIMES IMIN\n",
"12012 = π = ab times lagab <= CUNEIFORM SIGN AB TIMES LAGAB\n",
"12013 = π = ab times shesh <= CUNEIFORM SIGN AB TIMES SHESH\n"
]
}
],
"source": [
"def _classifyCuneiName(name):\n",
"    '''Derive a kind and a short name from a Unicode cuneiform character name.\n",
"\n",
"    The name is lowercased and its first word is dropped. The kind is one of\n",
"    sign, number, numberVar, numberSpecial, punct, odd, other. The short name\n",
"    is what is left of the name after removing the words that determined the\n",
"    kind; variants get ~, ~a or ~b appended as a separate word.\n",
"\n",
"    Uses the notebook-level names cNumber and numericGlyphs.\n",
"\n",
"    Returns the tuple (kind, shortName).\n",
"    '''\n",
"    parts = [p.lower() for p in name.split()][1:]\n",
"    category = parts[0] if parts else ''\n",
"\n",
"    if category == 'sign':\n",
"        return ('sign', ' '.join(parts[1:]))\n",
"\n",
"    if category not in {'numeric', 'punctuation'}:\n",
"        return ('other', ' '.join(parts))\n",
"\n",
"    parts = parts[1:]\n",
"    if parts[0:1] != ['sign']:\n",
"        # numeric or punctuation without the word sign after it\n",
"        return ('odd', ' '.join(parts))\n",
"    parts = parts[1:]\n",
"\n",
"    if category == 'punctuation':\n",
"        return ('punct', ' '.join(parts))\n",
"\n",
"    # numeric signs from here on\n",
"    if not parts or parts[0] not in cNumber:\n",
"        return ('numberSpecial', ' '.join(parts))\n",
"\n",
"    # the count word (one .. nine) does not become part of the short name\n",
"    parts = parts[1:]\n",
"    kind = 'number'\n",
"    variant = ''\n",
"    if 'variant' in parts and 'form' in parts:\n",
"        variant = '~'\n",
"        kind = 'numberVar'\n",
"        parts.remove('variant')\n",
"        parts.remove('form')\n",
"    if len(parts) == 2:\n",
"        if parts[1] in {'a', 'b'}:\n",
"            # a trailing a or b is a variant letter\n",
"            variant = f'~{parts[1]}'\n",
"            kind = 'numberVar'\n",
"            parts = parts[0:1]\n",
"        elif parts[1] != 'tenu':\n",
"            # a trailing tenu is accepted as is, anything else is odd\n",
"            kind = 'odd'\n",
"    elif len(parts) != 1 or parts[0] not in numericGlyphs:\n",
"        kind = 'odd'\n",
"    if variant:\n",
"        parts.append(variant)\n",
"    return (kind, ' '.join(parts))\n",
"\n",
"\n",
"pos = 0\n",
"nChars = 0\n",
"\n",
"# code points in the blocks that have no Unicode name\n",
"noUni = []\n",
"\n",
"# kind => list of (code point, short name)\n",
"glyphs = collections.defaultdict(list)\n",
"\n",
"for (cuneiBlock, (start, end)) in cuneiBlocks.items():\n",
"    for u in range(int(start, 16), int(end, 16) + 1):\n",
"        pos += 1\n",
"        name = uname(chr(u), None)\n",
"        if name is None:\n",
"            noUni.append(u)\n",
"            continue\n",
"        nChars += 1\n",
"        if not name.startswith('CUNEIFORM '):\n",
"            # keep the same (code point, name) shape as the other kinds\n",
"            # and do not classify this character a second time\n",
"            glyphs['no'].append((u, name.lower()))\n",
"            continue\n",
"        (kind, shortName) = _classifyCuneiName(name)\n",
"        glyphs[kind].append((u, shortName))\n",
"\n",
"print(f'{pos} positions; {nChars} cuneiform characters')\n",
"if pos - nChars:\n",
"    print(f'{pos - nChars} skipped positions')\n",
"\n",
"# per kind: the total and the first 20 members\n",
"for (kind, unis) in sorted(glyphs.items()):\n",
"    print(f'{kind}: {len(unis)}')\n",
"    for (u, shortName) in unis[0:20]:\n",
"        c = chr(u)\n",
"        print(f'{u:>03x} = {c} = {shortName} <= {uname(c)}')"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'LATIN SMALL LETTER A'"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Quick check of the name lookup: for 'a' this displays 'LATIN SMALL LETTER A'\n",
"uname('a')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}