# transcription.py
# Source: https://github.com/annotation/text-fabric
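"""Mappings between ETCBC-style transcription and Hebrew / Syriac Unicode.

The Transcription class below holds the character tables and the
conversion methods (to_etcbc_*, to_hebrew_*, to_syriac, from_hebrew,
from_syriac) used to move between transcription strings and Unicode text.
"""
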
import re


class Transcription(object):
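    # Precomposed presentation forms for pointed shin and sin:
    # keys are the decomposed sequences (shin + shin/sin dot),
    # values are the precomposed characters used internally.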
    decomp = {
        '\u05E9\u05C1': "\uFB2A",
        '\u05E9\u05C2': "\uFB2B",
    }
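    # Transcription-to-Hebrew-Unicode table.
    # Two-digit codes are accents and related masoretic marks;
    # single characters are consonants, vowels and other marks.
    # Codes without a Unicode counterpart map to explicit
    # <UNMAPPED ...> markers.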
    hebrew_mapping = {
        '_': ' ',  # space inside word
        '92': "\u0591",  # etnahta = atnach
        '01': "\u0592",  # segolta
        '65': "\u0593",  # shalshelet
        '80': "\u0594",  # zaqef_qatan
        '85': "\u0595",  # zaqef_gadol
        '73': "\u0596",  # tipeha = tifcha
        '81': "\u0597",  # revia = rebia
        '82': "\u0598",  # zarqa = tsinorit = zinorit = sinnorit
        '03': "\u0599",  # pashta
        '10': "\u059A",  # yetiv = yetib
        '91': "\u059B",  # tevir = tebir
        '61': "\u059C",  # geresh
        '11': "\u059D",  # geresh muqdam = mugrash
        '62': "\u059E",  # gershayim = garshayim
        '84': "\u059F",  # qarney para = pazer_gadol
        '14': "\u05A0",  # telisha_gedola
        '44': "\u05A0",  # telisha_gedola = telisha_gedola_med
        '83': "\u05A1",  # pazer
        '74': "\u05A3",  # munah = munach
        '70': "\u05A4",  # mahapakh = mehuppach
        '71': "\u05A5",  # merkha = merecha
        '72': "\u05A6",  # merkha kefula = merecha_kepula
        '94': "\u05A7",  # darga
        '63': "\u05A8",  # qadma = azla
        '33': "\u05A8",  # pashta_med < qadma
        '04': "\u05A9",  # telisha_qetana
        '24': "\u05A9",  # telisha_qetana = telisha_qetana_med
        '93': "\u05AA",  # yera ben yomo = yerach
        '60': "\u05AB",  # ole = ole_weyored
        '64': "\u05AC",  # iluy = illuy
        '13': "\u05AD",  # dehi = dechi
        '02': "\u05AE",  # zinor = sinnor
        '*': "\u05AF",  # masora circle
        ':': "\u05B0",  # sheva = shewa
        ':E': "\u05B1",  # hataf segol = chataph_segol
        ':A': "\u05B2",  # hataf patah = chataph_patach
        ':@': "\u05B3",  # hataf qamats = chataph_qamats
        'I': "\u05B4",  # hiriq = chiriq
        ';': "\u05B5",  # tsere
        'E': "\u05B6",  # segol
        'A': "\u05B7",  # patach
        '@': "\u05B8",  # qamats
        'O': "\u05B9",  # holam = cholam
        'U': "\u05BB",  # qubuts = qubbuts
        '.': "\u05BC",  # dagesh
        '25': "\u05BD",  # silluq yamin
        '45': "\u05BD",  # meteg
        '35': "\u05BD",  # meteg (tikon)
        '75': "\u05BD",  # siluq = silluq
        '95': "\u05BD",  # meteg = meteg_yamin
        '&': "\u05BE",  # maqaf
        ',': "\u05BF",  # rafe = raphe
        '05': "\u05C0",  # paseq
        '.c': "\u05C1",  # shin dot
        '.f': "\u05C2",  # sin dot
        '00': "\u05C3",  # sof_pasuq
        '52': "\u05C4",  # upper dot = puncta_above
        '53': "\u05C5",  # lower dot = puncta_below
        'ñ': "\u05C6\u0307",  # nun hafukha
        'Ñ': "\u05C6\u0307",  # nun hafukha
        '>': "\u05D0",  # alef
        'B': "\u05D1",  # bet
        'G': "\u05D2",  # gimel
        'D': "\u05D3",  # dalet
        'H': "\u05D4",  # he
        'W': "\u05D5",  # vav
        'Z': "\u05D6",  # zayin
        'X': "\u05D7",  # het
        'V': "\u05D8",  # tet
        'J': "\u05D9",  # yod
        'k': "\u05DA",  # kaf final
        'K': "\u05DB",  # kaf
        'L': "\u05DC",  # lamed
        'm': "\u05DD",  # mem final
        'M': "\u05DE",  # mem
        'n': "\u05DF",  # nun final
        'N': "\u05E0",  # nun
        'S': "\u05E1",  # samekh
        '<': "\u05E2",  # ayin
        'p': "\u05E3",  # pe final
        'P': "\u05E4",  # pe
        'y': "\u05E5",  # tsadi final
        'Y': "\u05E6",  # tsadi
        'Q': "\u05E7",  # qof
        'R': "\u05E8",  # resh
        '#': "\u05E9",  # sin unpointed
        'T': "\u05EA",  # tav
        'C': "\uFB2A",  # shin pointed
        'F': "\uFB2B",  # sin pointed
        '55': "<UNMAPPED 55=large letter>",  # large_letter
        '56': "<UNMAPPED 56=small letter>",  # small_letter
        '57': "<UNMAPPED 57=suspended letter>",  # suspended_letter
        '-': "",  # suppress space afterward
    }
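    # The consonant letters of the transcription, followed by the regular
    # expressions used below: substituting final consonant forms,
    # tokenizing a transcription, moving an accent code behind its
    # consonant and vowels, stripping accents or all pointing, cleaning up
    # sof pasuq / paseq spacing and trailing petuhah/setumah markers,
    # normalizing shin/sin to '#', simplifying phonological transcriptions,
    # and the noorigspace pattern used by suppress_space() below.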
    hebrew_cons = '>BGDHWZXVJKLMNS<PYQRFCT'
    trans_final_pat = re.compile(
        r'([' + hebrew_cons + r'][^_&]*)([KMNPY])([^' + hebrew_cons +
        r'_&]*(:?[_&]|\Z))'
    )
    trans_hebrew_pat = re.compile(r'(:[AE@]|.[cf]|:|[0-9][0-9]|.)')
    swap_accent_pat = re.compile(
        r'(\A|[_&])([0-9][0-9])([' + hebrew_cons + r'])([:;,.EAIOU@]*)'
    )
    remove_accent_pat = re.compile(r'((?:[0-9][0-9])|[,*])')
    remove_point_pat = re.compile(
        r'((?:[0-9][0-9])|(?:\.[cf])|(?::[@AE])|[,.:;@AEIOU*])'
    )
    remove_psn_pat = re.compile(r'00[ _SPNÑñ]*')
    remove_psq_pat = re.compile(r'(?:[ _]+05[ _]*)|(?:05[ _]+)')
    shin_pat = re.compile(r'[CF]')
    ph_simple_pat = re.compile(r'([ˈˌᵊᵃᵒᵉāo*])')
    noorigspace = re.compile(
        r'''
          (?: [&-]\Z)           # space, maqef or nospace
        | (?:
               0[05]            # sof pasuq or paseq
               (?:_[SNP])*      # nun hafukha, setumah, petuhah at end of verse
               \Z
          )
        | (?:_[SPN])+           #  nun hafukha, setumah, petuhah between words
    ''', re.X
    )

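    # Transcription-to-Syriac-Unicode table (consonants only).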
    syriac_mapping = {
        '>': "\u0710",  # alaph
        'B': "\u0712",  # beth
        'G': "\u0713",  # gamal
        'D': "\u0715",  # dalat
        'H': "\u0717",  # he
        'W': "\u0718",  # waw
        'Z': "\u0719",  # zain
        'X': "\u071A",  # heth
        'V': "\u071B",  # teth
        'J': "\u071D",  # yudh
        'K': "\u071F",  # kaph
        'L': "\u0720",  # lamadh
        'M': "\u0721",  # mim
        'N': "\u0722",  # nun
        'S': "\u0723",  # semkath
        '<': "\u0725",  # e
        'P': "\u0726",  # pe
        'Y': "\u0728",  # sadhe
        'Q': "\u0729",  # qaph
        'R': "\u072A",  # rish
        'C': "\u072B",  # shin
        'T': "\u072C",  # taw
        's': "\u0724",  # semkath final
        'p': "\u0727",  # pe reversed
    }

    def __init__(self):
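        # Build the instance-level tables: the set of Hebrew consonant
        # characters and the inverse mappings (Unicode -> transcription).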
        self.hebrew_consonants = {
            Transcription.hebrew_mapping[x]
            for x in Transcription.hebrew_cons
        }
        self.hebrew_consonants.add('\u05E9')
        self.hebrew_mappingi = dict(
            (v, k) for (k, v) in Transcription.hebrew_mapping.items()
            if k != ''
        )
        # special treatment needed for nun hafukha,
        # since it consists of two characters
        self.hebrew_mappingi['\u05C6'] = 'ñ'
        self.hebrew_mappingi['\u0307'] = ''
        # also build the inverse Syriac mapping, needed by from_syriac()
        self.syriac_mappingi = dict(
            (v, k) for (k, v) in Transcription.syriac_mapping.items()
        )

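    # _comp and _decomp switch between the decomposed shin/sin sequences
    # and the precomposed presentation forms listed in decomp above.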
    @staticmethod
    def _comp(s):
        for (d, c) in Transcription.decomp.items():
            s = s.replace(d, c)
        return s

    @staticmethod
    def _decomp(s):
        for (d, c) in Transcription.decomp.items():
            s = s.replace(c, d)
        return s

    @staticmethod
    def suffix_and_finales(word):
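        # Returns (bare_word, suffix): the suffix collects trailing markers
        # (maqef, no-space, petuhah/setumah, nun hafukha, sof pasuq, paseq)
        # plus the appropriate spacing, and the bare word gets its final
        # consonant forms substituted.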
        # first split the word proper from the suffix,
        # and add a space if there is no other suffix
        add_space = ''
        suffix = ''
        new_word = word
        if not word:
            return (new_word, suffix + add_space)
        lastch = new_word[-1]
        if lastch == '-' or lastch == '&':
            new_word = new_word[0:-1]
            suffix = lastch
        else:
            if len(new_word) >= 2:
                lastch = new_word[-1]
                llastch = new_word[-2]
                if llastch == '_' and (lastch == 'P' or lastch == 'S'):
                    new_word = new_word[0:-2]
                    suffix = ' ' + lastch + suffix + ' '
            if len(new_word) >= 2:
                lastch = new_word[-1]
                llastch = new_word[-2]
                if llastch == '_' and (lastch == 'N'):
                    new_word = new_word[0:-2]
                    suffix = ' ñ' + suffix + ' '
            if len(new_word) >= 2:
                lastch = new_word[-1]
                llastch = new_word[-2]
                if llastch == '0' and (lastch == '0' or lastch == '5'):
                    new_word = new_word[0:-2]
                    suffix = (
                        ' ' if lastch == '5' else ''
                    ) + llastch + lastch + suffix
                    add_space = '\n' if lastch == '0' else ' '
        if suffix == '':
            add_space = ' '
        elif suffix == '-':
            add_space = ''
            suffix = ''
        # second: replace consonants by their final forms when needed
        new_word = Transcription.trans_final_pat.sub(
            Transcription._map_final, new_word
        )
        return (new_word, suffix + add_space)

    @staticmethod
    def _map_final(m):
        # lower-case the consonant that must take its final form
        return m.group(1) + m.group(2).lower() + m.group(3)

    @staticmethod
    def _map_hebrew(m):
        return Transcription.hebrew_mapping.get(m.group(1), m.group(1))

    @staticmethod
    def _swap_accent(m):
        # move the accent code behind the consonant and its vowels
        return m.group(1) + m.group(3) + m.group(4) + m.group(2)

    @staticmethod
    def _remove_accent(m):
        # keep sof pasuq (00) and paseq (05), drop everything else matched
        acc = m.group(1)
        return acc if acc in ('00', '05') else ''

    @staticmethod
    def _remove_point(m):
        # keep sof pasuq (00) and paseq (05), drop everything else matched
        point = m.group(1)
        return point if point in ('00', '05') else ''

    @staticmethod
    def _ph_simple(m):
        # ā and o become å; the other matched marks are dropped
        return 'å' if m.group(1) in 'āo' else ''

    # unicode normalization is harmful
    # if there is a combination of dagesh, vowel and accent.

    @staticmethod
    def suppress_space(word):
        return Transcription.noorigspace.search(word)

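    # Conversion methods:
    #   to_etcbc_v  - strip accents, keep vowels (vocalized ETCBC)
    #   to_etcbc_c  - strip accents and vowels (consonantal ETCBC)
    #   to_hebrew   - pointed Hebrew Unicode, with accents swapped into place
    #   to_hebrew_v - Hebrew Unicode without accents
    #   to_hebrew_c - consonantal Hebrew Unicode
    #   to_hebrew_x - Hebrew Unicode without the accent swap
    #   ph_simplify - simplify a phonological transcription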
    @staticmethod
    def to_etcbc_v(word):
        return Transcription.remove_accent_pat.sub(
            Transcription._remove_accent, word
        )

    @staticmethod
    def to_etcbc_c(word):
        word = Transcription.remove_point_pat.sub(
            Transcription._remove_point, word
        )
        word = Transcription.remove_psn_pat.sub(
            '00', word
        )  # remove nun hafukha, setumah, petuhah at the end of a verse
        word = Transcription.remove_psq_pat.sub(
            ' ', word
        )  # replace paseq with attached spaces by single space
        word = word.upper()  # no final forms of consonants
        return Transcription.shin_pat.sub('#', word)

    @staticmethod
    def to_hebrew(word):
        word = Transcription.swap_accent_pat.sub(
            Transcription._swap_accent, word
        )
        return Transcription.trans_hebrew_pat.sub(
            Transcription._map_hebrew, word
        )

    @staticmethod
    def to_hebrew_v(word):
        return Transcription.trans_hebrew_pat.sub(
            Transcription._map_hebrew, Transcription.to_etcbc_v(word)
        )

    @staticmethod
    def to_hebrew_c(word):
        return Transcription.trans_hebrew_pat.sub(
            Transcription._map_hebrew, Transcription.to_etcbc_c(word)
        )

    @staticmethod
    def to_hebrew_x(word):
        return Transcription.trans_hebrew_pat.sub(
            Transcription._map_hebrew, word
        )

    @staticmethod
    def ph_simplify(pword):
        return Transcription.ph_simple_pat.sub(Transcription._ph_simple, pword)

    def from_hebrew(self, word):
        return ''.join(
            self.hebrew_mappingi.get(x, x) for x in Transcription._comp(word)
        )

    def to_syriac(self, word):
        return Transcription._decomp(
            ''.join(
                self.syriac_mapping.get(x, x)
                for x in Transcription._comp(word)
            )
        )

    def from_syriac(self, word):
        return ''.join(
            self.syriac_mappingi.get(x, x) for x in Transcription._comp(word)
        )
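

# Minimal usage sketch (illustrative): round-trip a sample ETCBC-style
# transcription through to_hebrew and from_hebrew. The sample string is an
# assumed, illustrative ETCBC transcription of a pointed Hebrew word.
if __name__ == '__main__':
    tr = Transcription()
    sample = 'B.:R;>CIJT'                      # bet, dagesh, sheva, resh, ...
    hebrew = Transcription.to_hebrew(sample)   # transcription -> Hebrew Unicode
    back = tr.from_hebrew(hebrew)              # Hebrew Unicode -> transcription
    print(hebrew)
    print(back == sample)                      # expected: True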