https://github.com/mozilla/gecko-dev
Raw File
Tip revision: 137e4f612bdcc225bf086f1697ca1e9e75450820 authored by seabld on 24 September 2014, 05:12:24 UTC
Added tag SEAMONKEY_2_29_1_RELEASE for changeset FIREFOX_32_0_3_BUILD1. CLOSED TREE a=release
Tip revision: 137e4f6
make_intl_data.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

""" Usage: make_intl_data.py [language-subtag-registry.txt]

    This script extracts information about mappings between deprecated and
    current BCP 47 language tags from the IANA Language Subtag Registry and
    converts it to JavaScript object definitions in IntlData.js. The definitions
    are used in Intl.js.

    The IANA Language Subtag Registry is imported from
    http://www.iana.org/assignments/language-subtag-registry
    and uses the syntax specified in
    http://tools.ietf.org/html/rfc5646#section-3
"""

def readRegistryRecord(registry):
    """ Yields the records of the IANA Language Subtag Registry as dictionaries.

        registry is an iterable of text lines (for example an open file).
        Records are separated by lines consisting of "%%"; each record is
        yielded as a dictionary mapping field names to field bodies, both
        stripped of surrounding whitespace. Blank lines are ignored. If a
        field name occurs more than once in a record, the last value wins.

        A line that starts with whitespace is a folded continuation of the
        previous field and is appended to that field's value after a single
        space, even if the continuation text itself contains a colon. For
        leniency, an unindented line without a colon is treated the same way.

        Raises ValueError if a continuation line appears before any field of
        the current record.
    """
    record = {}
    key = None
    for rawLine in registry:
        line = rawLine.strip()
        if line == "":
            continue
        if line == "%%":
            yield record
            record = {}
            # A continuation must never attach to a field of the previous
            # record.
            key = None
            continue

        # Folding has to be detected on the raw line: once the leading
        # whitespace is stripped, a continuation such as "  see also: xyz"
        # is indistinguishable from a new "see also" field.
        isContinuation = rawLine[:1].isspace() or ":" not in line
        if isContinuation:
            if key is None:
                raise ValueError("Continuation line without a preceding field: " + line)
            record[key] += " " + line
        else:
            key, value = line.split(":", 1)
            key, value = key.strip(), value.strip()
            record[key] = value
    # The last record is not followed by a "%%" separator.
    if record:
        yield record


def readRegistry(registry):
    """ Reads IANA Language Subtag Registry and extracts information for Intl.js.

        Information extracted:
        - langTagMappings: mappings from complete language tags to preferred
          complete language tags
        - langSubtagMappings: mappings from subtags to preferred subtags
        - extlangMappings: mappings from extlang subtags to preferred subtags,
          with prefix to be removed
        Returns a dictionary with these three mappings under the keys
        "langTagMappings", "langSubtagMappings", and "extlangMappings", along
        with the registry's file date under the key "fileDate".

        We also check that mappings for language subtags don't affect extlang
        subtags and vice versa, so that CanonicalizeLanguageTag doesn't have
        to separate them for processing. Region codes are separated by case,
        and script codes by length, so they're unproblematic.

        Raises Exception if a record has an unrecognized Type, if a subtag
        mapping other than heploc carries a Prefix, if language and extlang
        mappings conflict, or if the registry contains no File-Date record.
    """
    langTagMappings = {}
    langSubtagMappings = {}
    extlangMappings = {}
    languageSubtags = set()
    extlangSubtags = set()
    # Stays None until the File-Date record has been seen, so that a registry
    # without one is reported explicitly below.
    fileDate = None

    for record in readRegistryRecord(registry):
        if "File-Date" in record:
            fileDate = record["File-Date"]
            continue

        recordType = record["Type"]
        if recordType == "grandfathered":
            # Grandfathered tags don't use standard syntax, so
            # CanonicalizeLanguageTag expects the mapping table to provide
            # the final form for all.
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            tag = record["Tag"]
            langTagMappings[tag.lower()] = record.get("Preferred-Value", tag)
        elif recordType == "redundant":
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            if "Preferred-Value" in record:
                langTagMappings[record["Tag"].lower()] = record["Preferred-Value"]
        elif recordType in ("language", "script", "region", "variant"):
            # For langSubtagMappings, keys and values must be in the case used
            # in the registry.
            subtag = record["Subtag"]
            if recordType == "language":
                languageSubtags.add(subtag)
            if "Preferred-Value" in record:
                if subtag == "heploc":
                    # The entry for heploc is unique in its complexity; handle
                    # it as special case below.
                    continue
                if "Prefix" in record:
                    # This might indicate another heploc-like complex case.
                    raise Exception("Please evaluate: subtag mapping with prefix value.")
                langSubtagMappings[subtag] = record["Preferred-Value"]
        elif recordType == "extlang":
            # For extlangMappings, keys must be in the case used in the
            # registry; values are records with the preferred value and the
            # prefix to be removed.
            subtag = record["Subtag"]
            extlangSubtags.add(subtag)
            if "Preferred-Value" in record:
                extlangMappings[subtag] = {"preferred": record["Preferred-Value"],
                                           "prefix": record["Prefix"]}
        else:
            # No other types are allowed by
            # http://tools.ietf.org/html/rfc5646#section-3.1.3
            # Raised explicitly rather than asserted, so the check still runs
            # when Python is started with -O.
            raise Exception("Unrecognized Type: {0}".format(recordType))

    # Check that mappings for language subtags and extlang subtags don't affect
    # each other.
    for lang in languageSubtags:
        if lang in extlangMappings and extlangMappings[lang]["preferred"] != lang:
            raise Exception("Conflict: lang with extlang mapping: " + lang)
    for extlang in extlangSubtags:
        if extlang in langSubtagMappings:
            raise Exception("Conflict: extlang with lang mapping: " + extlang)

    # Special case for heploc.
    langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97"

    if fileDate is None:
        raise Exception("Registry does not contain a File-Date record.")

    return {"fileDate": fileDate,
            "langTagMappings": langTagMappings,
            "langSubtagMappings": langSubtagMappings,
            "extlangMappings": extlangMappings}


def writeMappingsVar(intlData, dict, name, description, fileDate, url):
    """ Writes a variable definition with a mapping table to file intlData.

        Writes the contents of dictionary dict to file intlData with the given
        variable name and a comment with description, fileDate, and URL.

        Entries are written sorted by key. A string value is written as a
        quoted string; any other value is expected to be a record with the
        keys "preferred" and "prefix" and is written as an object literal.
    """
    # basestring only exists in Python 2; fall back to str so that the string
    # check also works when the script is run with Python 3.
    try:
        stringTypes = basestring
    except NameError:
        stringTypes = str

    intlData.write("\n")
    intlData.write("// {0}.\n".format(description))
    intlData.write("// Derived from IANA Language Subtag Registry, file date {0}.\n".format(fileDate))
    intlData.write("// {0}\n".format(url))
    intlData.write("var {0} = {{\n".format(name))
    for key in sorted(dict):
        entry = dict[key]
        if isinstance(entry, stringTypes):
            value = '"{0}"'.format(entry)
        else:
            value = '{{preferred: "{0}", prefix: "{1}"}}'.format(entry["preferred"], entry["prefix"])
        intlData.write('    "{0}": {1},\n'.format(key, value))
    intlData.write("};\n")


def writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings):
    """ Writes the three language tag mapping tables to the Intl data file.

        The tables are emitted in a fixed order (complete tags, non-extlang
        subtags, extlang subtags), each as its own variable definition that
        carries the given fileDate and url in its header comment.
    """
    tables = (
        (langTagMappings, "langTagMappings",
         "Mappings from complete tags to preferred values"),
        (langSubtagMappings, "langSubtagMappings",
         "Mappings from non-extlang subtags to preferred values"),
        (extlangMappings, "extlangMappings",
         "Mappings from extlang subtags to preferred values"),
    )
    for mapping, varName, summary in tables:
        writeMappingsVar(intlData, mapping, varName, summary, fileDate, url)


if __name__ == '__main__':
    # Script entry point: obtain the registry (from the file named on the
    # command line, or by downloading it), extract the mappings, and write
    # them to IntlData.js in the current directory.
    import codecs
    import sys
    try:
        from urllib2 import urlopen
    except ImportError:
        # Python 3 provides urlopen in urllib.request instead of urllib2.
        from urllib.request import urlopen

    url = "http://www.iana.org/assignments/language-subtag-registry"
    # text holds the downloaded registry, or None when reading a local file.
    text = None
    if len(sys.argv) > 1:
        print("Always make sure you have the newest language-subtag-registry.txt!")
        registry = codecs.open(sys.argv[1], "r", encoding="utf-8")
    else:
        print("Downloading IANA Language Subtag Registry...")
        reader = urlopen(url)
        try:
            text = reader.read().decode("utf-8")
        finally:
            reader.close()
        registry = codecs.open("language-subtag-registry.txt", "w+", encoding="utf-8")

    # Close the registry file even if saving or parsing it fails.
    try:
        if text is not None:
            # Keep a local copy of the download, then rewind so the same
            # file object can be parsed from the start.
            registry.write(text)
            registry.seek(0)

        print("Processing IANA Language Subtag Registry...")
        data = readRegistry(registry)
    finally:
        registry.close()
    fileDate = data["fileDate"]
    langTagMappings = data["langTagMappings"]
    langSubtagMappings = data["langSubtagMappings"]
    extlangMappings = data["extlangMappings"]

    print("Writing Intl data...")
    # The with statement closes the output file even if a write fails.
    with codecs.open("IntlData.js", "w", encoding="utf-8") as intlData:
        intlData.write("// Generated by make_intl_data.py. DO NOT EDIT.\n")
        writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings)
back to top