# https://github.com/Dans-labs/text-fabric
# Raw File
# Tip revision: 459f7d35aa64aab2b86cf4d63701892c9214adbb authored by Dirk Roorda on 11 May 2018, 09:42:48 UTC
# New minor release 4.0.1
# Tip revision: 459f7d3
# mql.py
import os
import re
from functools import reduce
from .data import WARP
from .helpers import (
    cleanName,
    isClean,
    specFromRanges,
    rangesFromList,
    setFromSpec,
    nbytes,
)

# If a node feature that is not of type int has at most ENUM_LIMIT
# distinct values, those values are collected for an MQL enumeration,
# provided all values of that feature are a valid name for MQL.

ENUM_LIMIT = 1000

# If True, the collected values of all such features are written as one
# shared enumeration called all_enum; if False, every such feature gets
# its own enumeration called <feature>_enum.
ONE_ENUM_TYPE = True


class MQL(object):
    '''Export a collection of TF features as an MQL dump file.

    On construction the features are checked (see `_check`); the outcome
    is stored in `self.good`.
    A subsequent call to `write()` produces `<mqlDir>/<mqlName>.mql`.

    Parameters:
        mqlDir: directory in which the MQL file is created
        mqlName: name of the database; it is passed through `cleanName`
            and an error is logged if that changes the name
        tfFeatures: dict of feature name => feature object; the feature
            objects must offer `load()`, `method`, `isConfig`, `isEdge`,
            `dataType` and `data`
        tm: logger object with methods `info`, `error` and `indent`
    '''

    def __init__(self, mqlDir, mqlName, tfFeatures, tm):
        self.mqlDir = mqlDir
        # the logger must be in place before the first message is issued
        self.tm = tm
        cleanDb = cleanName(mqlName)
        if cleanDb != mqlName:
            self.tm.error('db name "{}" => "{}"'.format(mqlName, cleanDb))
        self.mqlName = cleanDb
        self.tfFeatures = tfFeatures
        self.enums = {}
        self._check()

    def write(self):
        '''Write the complete MQL dump.

        Does nothing if the feature check has failed.
        If the directory or the file cannot be created, an error is
        logged and `self.good` is set to False.
        The output file is closed in all cases, also when one of the
        writing steps raises an exception.
        '''
        if not self.good:
            return
        if not os.path.exists(self.mqlDir):
            try:
                os.makedirs(self.mqlDir, exist_ok=True)
            except Exception:
                self.tm.error(
                    'Cannot create directory "{}"'.format(self.mqlDir)
                )
                self.good = False
                return
        mqlPath = '{}/{}.mql'.format(self.mqlDir, self.mqlName)
        try:
            fm = open(mqlPath, 'w', encoding='utf8')
        except Exception:
            self.tm.error('Could not write to {}'.format(mqlPath))
            self.good = False
            return

        self.fm = fm
        try:
            self.tm.info('Loading {} features'.format(len(self.featureList)))
            for ft in self.featureList:
                self.features[ft].load()

            self._writeStartDb()
            self._writeEnums()
            self._writeTypes()
            self._writeDataAll()
            self._writeEndDb()
        finally:
            # _writeEndDb closes the file on success;
            # closing an already closed file is harmless
            fm.close()
        self.tm.indent(level=0)
        self.tm.info('Done')

    def _check(self):
        '''Select the features to export and verify the required ones.

        Features that have a `method`, that are in WARP, or that are
        config features are skipped.
        The remaining ones are stored under their cleaned name in
        `self.features` and `self.featureList`.
        The first two WARP features and `__levels__` must be present and
        must load; the result is stored in `self.good`.
        '''
        self.tm.info('Checking features of dataset {}'.format(self.mqlName))
        self.features = {}
        self.featureList = []
        self.tm.indent(level=1)
        for (f, fo) in sorted(self.tfFeatures.items()):
            if fo.method is not None or f in WARP:
                continue
            fo.load(metaOnly=True)
            if fo.isConfig:
                continue
            cleanF = cleanName(f)
            if cleanF != f:
                self.tm.error('feature "{}" => "{}"'.format(f, cleanF))
            self.featureList.append(cleanF)
            self.features[cleanF] = fo
        good = True
        for feat in (WARP[0], WARP[1], '__levels__'):
            if feat not in self.tfFeatures:
                self.tm.error(
                    '{} feature {} is missing from data set'.format(
                        'Warp' if feat in WARP else 'Computed'
                        if feat.startswith('__') else 'Data',
                        feat,
                    )
                )
                good = False
            else:
                fObj = self.tfFeatures[feat]
                if not fObj.load():
                    good = False
        self.tm.indent(level=0)
        if (not good):
            self.tm.error('Export to MQL aborted')
        else:
            self.tm.info(
                '{} features to export to MQL ...'.format(
                    len(self.featureList)
                )
            )
        self.good = good

    def _writeStartDb(self):
        '''Write the statements that create and select the database.'''
        self.fm.write(
            '''
CREATE DATABASE '{name}'
GO
USE DATABASE '{name}'
GO
'''.format(name=self.mqlName)
        )

    def _writeEndDb(self):
        '''Write the closing VACUUM statement and close the file.'''
        self.fm.write('''
VACUUM DATABASE ANALYZE
GO
''')
        self.fm.close()

    def _writeEnums(self):
        '''Collect the enumerable features and write their enumerations.

        A non-edge, non-int feature is enumerable if it has at most
        ENUM_LIMIT distinct values and all of them pass `isClean`.
        The sorted values are stored in `self.enums[feature]`.
        '''
        self.tm.indent(level=0)
        self.tm.info('Writing enumerations')
        self.tm.indent(level=1)
        for ft in self.featureList:
            fObj = self.features[ft]
            if fObj.isEdge or fObj.dataType == 'int':
                continue
            fValues = sorted(set(fObj.data.values()))
            if len(fValues) > ENUM_LIMIT:
                continue
            unclean = [fVal for fVal in fValues if not isClean(fVal)]
            if unclean:
                print(
                    '\t{:<15}: {:>4} values, {} not a name, e.g. «{}»'.format(
                        ft,
                        len(fValues),
                        len(unclean),
                        unclean[0],
                    )
                )
                continue
            self.enums[ft] = fValues

        if ONE_ENUM_TYPE:
            self._writeEnumsAsOne()
        else:
            for ft in sorted(self.enums):
                self._writeEnum(ft)
            self.tm.indent(level=0)
            self.tm.info('Written {} enumerations'.format(len(self.enums)))

    def _writeEnumsAsOne(self):
        '''Write the union of all enum values as the enumeration all_enum.

        The values are numbered in sorted order, so that the output does
        not depend on the iteration order of a set.
        If there are no values at all, nothing is written, because an
        enumeration without members would be produced otherwise.
        '''
        fValues = reduce(
            set.union,
            (set(fV) for fV in self.enums.values()),
            set(),
        )
        if not fValues:
            return
        self.tm.info(
            'Writing an all-in-one enum with {:>4} values'.format(
                len(fValues)
            )
        )
        fValuesEnumerated = ',\n\t'.join(
            '{} = {}'.format(fVal, i)
            for (i, fVal) in enumerate(sorted(fValues))
        )
        self.fm.write(
            '''
CREATE ENUMERATION all_enum = {{
    {}
}}
GO
'''.format(fValuesEnumerated)
        )

    def _writeEnum(self, ft):
        '''Write the enumeration <ft>_enum for the values of feature ft.'''
        fValues = self.enums[ft]
        self.tm.info('enum {:<15} with {:>4} values'.format(ft, len(fValues)))
        fValuesEnumerated = ',\n\t'.join(
            '{} = {}'.format(fVal, i) for (i, fVal) in enumerate(fValues)
        )
        self.fm.write(
            '''
CREATE ENUMERATION {}_enum = {{
    {}
}}
GO
'''.format(ft, fValuesEnumerated)
        )

    def _writeTypes(self):
        '''Determine the MQL type of every feature and write the object types.

        Sets `self.levels`, `self.otypes` (cleaned otype => features that
        have a value for at least one node of that type),
        `self.featureTypes` (feature => MQL type declaration) and
        `self.featureMethods` (feature => function that renders a value).
        '''
        def valInt(n):
            return str(n)

        def valStr(s):
            # values with a single quote go between double quotes
            if "'" in s:
                return '"{}"'.format(s.replace('"', '\\"'))
            else:
                return "'{}'".format(s)

        def valIds(ids):
            return '({})'.format(','.join(str(i) for i in ids))

        self.levels = self.tfFeatures['__levels__'].data[::-1]
        self.tm.indent(level=0)
        self.tm.info(
            'Mapping {} features onto {} object types'.format(
                len(self.featureList),
                len(self.levels),
            )
        )
        otypeSupport = {}
        for (otype, av, start, end) in self.levels:
            cleanOtype = cleanName(otype)
            if cleanOtype != otype:
                self.tm.error('otype "{}" => "{}"'.format(otype, cleanOtype))
            otypeSupport[cleanOtype] = set(range(start, end + 1))

        self.otypes = {}
        self.featureTypes = {}
        self.featureMethods = {}
        for ft in self.featureList:
            fObj = self.features[ft]
            if fObj.isEdge:
                dataType = 'LIST OF id_d'
                method = valIds
            elif fObj.dataType == 'str':
                dataType = 'string DEFAULT ""'
                # enum values are written as bare names
                method = valInt if ft in self.enums else valStr
            elif fObj.dataType == 'int':
                dataType = 'integer DEFAULT 0'
                method = valInt
            else:
                dataType = 'string DEFAULT ""'
                method = valStr
            self.featureTypes[ft] = dataType
            self.featureMethods[ft] = method

            support = set(fObj.data.keys())
            for (otype, nodes) in otypeSupport.items():
                if len(support & nodes):
                    self.otypes.setdefault(otype, []).append(ft)

        for otype in (cleanName(x[0]) for x in self.levels):
            self._writeType(otype)

    def _writeType(self, otype):
        '''Write the CREATE OBJECT TYPE statement for otype.

        An object type for which no feature has values is written
        without feature declarations.
        '''
        self.fm.write('''
CREATE OBJECT TYPE
[{}
'''.format(otype))
        for ft in self.otypes.get(otype, []):
            if ft in self.enums:
                fType = '{}_enum'.format('all' if ONE_ENUM_TYPE else ft)
            else:
                fType = self.featureTypes[ft]
            self.fm.write('  {}:{};\n'.format(ft, fType))
        self.fm.write('''
]
GO
''')

    def _writeDataAll(self):
        '''Write the objects of all object types, one type after another.'''
        self.tm.info(
            'Writing {} features as data in {} object types'.format(
                len(self.featureList),
                len(self.levels),
            )
        )
        self.oslots = self.tfFeatures[WARP[1]].data
        for (otype, av, start, end) in self.levels:
            # use the same cleaned name under which the type was declared
            # and under which its features are registered in self.otypes
            self._writeData(cleanName(otype), start, end)

    def _writeData(self, otype, start, end):
        '''Write the objects for the nodes start .. end (inclusive).

        The objects are written in batches of at most LIMIT objects,
        each batch being one CREATE OBJECTS statement.
        A new batch is only started when there are objects left, so that
        no CREATE OBJECTS statement without objects is written after a
        full batch.
        '''
        tm = self.tm
        fm = self.fm
        tm.indent(level=1, reset=True)
        tm.info('{} data ...'.format(otype))
        oslots = self.oslots
        # the last item of the oslots data is used as the maximum slot
        maxSlot = oslots[-1]
        features = self.features
        featureMethods = self.featureMethods
        # look up method and data of each feature once, not once per node
        featureInfo = [
            (ft, featureMethods[ft], features[ft].data)
            for ft in self.otypes.get(otype, [])
        ]
        batchMsg = 'batch of size {:>20} with {:>7} of {:>7} {}s'
        fm.write(
            '''
DROP INDEXES ON OBJECT TYPE[{o}]
GO
CREATE OBJECTS
WITH OBJECT TYPE[{o}]
'''.format(o=otype)
        )
        curSize = 0
        LIMIT = 50000
        t = 0
        j = 0
        tm.indent(level=2, reset=True)
        for n in range(start, end + 1):
            parts = [
                '''
CREATE OBJECT
FROM MONADS= {{ {m} }}
WITH ID_D={i} [
'''.format(
                    m=n if n <= maxSlot else
                    specFromRanges(rangesFromList(oslots[n - maxSlot - 1])),
                    i=n,
                )
            ]
            for (ft, method, fMap) in featureInfo:
                if n in fMap:
                    parts.append('{}:={};\n'.format(ft, method(fMap[n])))
            parts.append('''
]
''')
            oMql = ''.join(parts)
            fm.write(oMql)
            curSize += len(bytes(oMql, encoding='utf8'))
            t += 1
            j += 1
            if j == LIMIT:
                tm.info(batchMsg.format(nbytes(curSize), j, t, otype))
                j = 0
                curSize = 0
                if n < end:
                    fm.write(
                        '''
GO
CREATE OBJECTS
WITH OBJECT TYPE[{o}]
'''.format(o=otype)
                    )

        if j:
            # report the last, partially filled batch
            tm.info(batchMsg.format(nbytes(curSize), j, t, otype))
        fm.write(
            '''
GO
CREATE INDEXES ON OBJECT TYPE[{o}]
GO
'''.format(o=otype)
        )

        tm.indent(level=1)
        tm.info('{} data: {} objects'.format(otype, t))


# MQL IMPORT

# One or more consecutive byte escapes: a backslash, an x and exactly
# two hexadecimal digits. Anything else that merely starts with
# backslash x is not a byte escape and is left alone.
uniscan = re.compile(r'(?:\\x[0-9a-fA-F]{2})+')


def makeuni(match):
    ''' Make proper unicode of a text that contains byte escape codes
        such as backslash xb6

        The match must consist of byte escapes only, as delivered by
        uniscan.
        The escaped bytes are taken together and decoded as UTF-8.
        The bytes are computed directly from the hexadecimal digits,
        no text from the input is evaluated as Python code.

        Raises a UnicodeDecodeError if the bytes are not valid UTF-8.
    '''
    hexDigits = match.group(0).replace('\\x', '')
    return bytes.fromhex(hexDigits).decode('utf-8')


def uni(line):
    ''' Replace every run of byte escapes in line by the unicode
        characters that those bytes encode in UTF-8
    '''
    return uniscan.sub(makeuni, line)


def tfFromMql(mqlFile, tm, slotType=None, otext=None, meta=None):
    '''Convert an MQL file into TF feature data.

    The file is parsed by `parseMql` and the outcome is handed over to
    `tfFromData`, whose result is returned.

    Without a slotType, or when parsing fails, the result is
    `(False, {}, {}, {})`; in the former case an error is logged
    and the file is not read at all.
    '''
    if slotType is None:
        tm.error('ERROR: no slotType specified')
        return (False, {}, {}, {})

    # parseMql delivers the node features before the edge features,
    # which is also the order in which tfFromData expects them
    (good, objectTypes, tables, nodeF, edgeF) = parseMql(mqlFile, tm)
    if good:
        return tfFromData(
            tm, objectTypes, tables, nodeF, edgeF, slotType, otext, meta
        )
    return (False, {}, {}, {})


def parseMql(mqlFile, tm):
    '''Parse an MQL dump, line by line, into plain Python data.

    The parser recognizes three kinds of sections:
    enumerations, object type declarations and object data.

    Returns a tuple `(good, objectTypes, tables, nodeF, edgeF)`:

    good: False if an unrecognized line has been met inside an object;
        parsing stops at that line
    objectTypes: dict object type => feature => (ftype, default)
    tables: dict object type => id_d => dict(feats=..., monads=...)
    nodeF: dict object type => set of features that are not of
        MQL type id_d
    edgeF: dict object type => set of features of MQL type id_d

    The file is read as UTF-8, the encoding in which MQL.write
    produces its output.
    Blank lines inside enumerations and object type declarations
    are skipped.
    Line numbers in messages are 1-based.

    NOTE(review): inside an object, the continuation of a multi-line
    string value is only recognized after the checks for structural
    lines (closing bracket, FROM MONADS, WITH ID_D, GO, blank line);
    a continuation line that looks like one of those is not added to
    the value. This behaviour is left as it was.
    '''
    tm.info('Parsing mql source ...')

    objectTypes = dict()
    tables = dict()

    edgeF = dict()
    nodeF = dict()

    curId = None
    curEnum = None
    # None: not in an object type declaration;
    # True: declaration seen, name not yet;
    # otherwise: the name of the object type being declared
    curObjectType = None
    curTable = None
    curObject = None
    # the first part of a string value that spans several lines
    curValue = None
    curFeature = None
    seeObjects = False

    inObjectTypeFeatures = False

    STRING_TYPES = {'ascii', 'string'}

    enums = dict()

    chunkSize = 1000000

    good = True

    # stays 0 if the file has no lines at all
    ln = 0

    with open(mqlFile, encoding='utf8') as fh:
        for (ln, line) in enumerate(fh, 1):
            if ln % chunkSize == 0:
                tm.info('\tline {:>9}'.format(ln))
            if line.startswith(
                ('CREATE OBJECTS WITH OBJECT TYPE', 'WITH OBJECT TYPE')
            ):
                comps = line.rstrip().rstrip(']').split('[', 1)
                curTable = comps[1]
                tm.info('\t\tobjects in {}'.format(curTable))
                curObject = None
                if curTable not in tables:
                    tables[curTable] = dict()
                seeObjects = True
            elif line == 'CREATE OBJECT\n':
                curObject = dict(feats=dict(), monads=None)
                curId = None
                seeObjects = True
            elif curEnum is not None:
                if line.startswith('}'):
                    curEnum = None
                    continue
                if not line.strip():
                    continue
                comps = line.strip().rstrip(',').split('=', 1)
                comp = comps[0].strip()
                words = comp.split()
                if words[0] == 'DEFAULT':
                    enums[curEnum]['default'] = uni(words[1])
                    value = words[1]
                else:
                    value = words[0]
                enums[curEnum]['values'].append(value)
            elif curObjectType is not None:
                if line.startswith(']'):
                    curObjectType = None
                    inObjectTypeFeatures = False
                    continue
                if curObjectType is True:
                    if line.startswith('['):
                        curObjectType = line.rstrip()[1:]
                        objectTypes[curObjectType] = dict()
                        tm.info('\t\totype {}'.format(curObjectType))
                        inObjectTypeFeatures = True
                        continue
                if inObjectTypeFeatures:
                    if not line.strip():
                        continue
                    comps = line.strip().rstrip(';').split(':', 1)
                    feature = comps[0].strip()
                    fInfo = comps[1].strip()
                    fCleanInfo = fInfo.replace('FROM SET', '')
                    fInfoComps = fCleanInfo.split(' ', 1)
                    fMQLType = fInfoComps[0]
                    fDefault = None
                    if len(fInfoComps) == 2:
                        fDefaultComps = fInfoComps[1].strip().split(' ', 1)
                        if len(fDefaultComps) > 1:
                            fDefault = fDefaultComps[1]
                    if fDefault is not None and fMQLType in STRING_TYPES:
                        # strip the quotes around the default value
                        fDefault = uni(fDefault[1:-1])
                    default = enums.get(fMQLType, {}).get('default', fDefault)
                    ftype = 'str' if fMQLType in enums else\
                            'int' if fMQLType == 'integer' else\
                            'str' if fMQLType in STRING_TYPES else\
                            'int' if fInfo == 'id_d' else\
                            'str'
                    isEdge = fMQLType == 'id_d'
                    if isEdge:
                        edgeF.setdefault(curObjectType, set()).add(feature)
                    else:
                        nodeF.setdefault(curObjectType, set()).add(feature)

                    objectTypes[curObjectType][feature] = (ftype, default)
                    tm.info(
                        '\t\t\tfeature {} ({}) =def= {} : {}'.format(
                            feature,
                            ftype,
                            default,
                            'edge' if isEdge else 'node',
                        )
                    )
            elif seeObjects:
                if curObject is None:
                    if line.startswith('CREATE OBJECT'):
                        curObject = dict(feats=dict(), monads=None)
                        curId = None
                    continue

                if line.startswith(']'):
                    # end of the object: supply the defaults of the
                    # features that have not been given and store it
                    objectType = objectTypes[curTable]
                    for (feature, (ftype, default)) in objectType.items():
                        if (
                            feature not in curObject['feats']
                            and default is not None
                        ):
                            curObject['feats'][feature] = default
                    tables[curTable][curId] = curObject
                    curObject = None
                elif line.startswith('['):
                    name = line.rstrip()[1:]
                    if len(name):
                        curTable = name
                        if curTable not in tables:
                            tables[curTable] = dict()
                elif line.startswith('FROM MONADS'):
                    monads = line.split('=', 1)[1].replace('{', '').replace(
                        '}', ''
                    ).replace(' ', '').strip()
                    curObject['monads'] = setFromSpec(monads)
                elif line.startswith('WITH ID_D'):
                    comps = line.replace('[', '').rstrip().split('=', 1)
                    curId = int(comps[1])
                elif line.startswith('GO'):
                    pass
                elif line.strip() == '':
                    pass
                elif curValue is not None:
                    # we are inside a string value that spans several lines
                    toBeContinued = not line.rstrip().endswith('";')
                    if toBeContinued:
                        curValue += line
                    else:
                        curValue += line.rstrip().rstrip(';').rstrip('"')
                        curObject['feats'][curFeature] = uni(curValue)
                        curValue = None
                        curFeature = None
                elif ':=' in line:
                    (featurePart, valuePart) = line.split('=', 1)
                    # drop the colon that precedes the equals sign
                    feature = featurePart[0:-1].strip()
                    valuePart = valuePart.lstrip()
                    isText = ':="' in line
                    toBeContinued = (
                        isText and not line.rstrip().endswith('";')
                    )
                    if toBeContinued:
                        # this happens if a feature value
                        # contains a new line
                        # we must continue scanning lines
                        # until we meet the end of the value
                        curFeature = feature
                        curValue = valuePart.lstrip('"')
                    else:
                        value = valuePart.rstrip().rstrip(';').strip('"')
                        curObject['feats'][feature] = (
                            uni(value) if isText else value
                        )
                else:
                    tm.error(
                        'ERROR: line {}: unrecognized line -->{}<--'.
                        format(ln, line)
                    )
                    good = False
                    break
            else:
                if line.startswith('CREATE ENUMERATION'):
                    words = line.split()
                    curEnum = words[2]
                    enums[curEnum] = dict(default=None, values=[])
                    tm.info('\t\tenum {}'.format(curEnum))
                elif line.startswith('CREATE OBJECT TYPE'):
                    curObjectType = True
                    inObjectTypeFeatures = False

    tm.info('{} lines parsed'.format(ln))
    for table in tables:
        tm.info('{} objects of type {}'.format(len(tables[table]), table))

    if len(tables) == 0:
        tm.info('No objects found')
    return (good, objectTypes, tables, nodeF, edgeF)


def tfFromData(tm, objectTypes, tables, nodeF, edgeF, slotType, otext, meta):
    '''Turn parsed MQL tables into TF node features, edge features and metadata.

    The objects of slotType become the slots: they are numbered from 1
    in the order of their first monad, so that gaps in the monad
    sequence disappear.
    The objects of the other types, taken in sorted order of type name
    and of id_d within a type, get the node numbers after the last slot.

    Returns a tuple `(good, nodeFeatures, edgeFeatures, metaData)`,
    where good is always True.
    nodeFeatures contains `otype`, edgeFeatures contains `oslots`;
    the values of edge features are sets of nodes, and the values
    `nil`, `NIL`, `Nil` do not yield an edge.
    '''
    tm.info('Making TF data ...')

    nilValues = {'nil', 'NIL', 'Nil'}

    otherTypes = [t for t in sorted(tables) if t != slotType]
    tableOrder = [slotType] + otherTypes

    metaData = dict()
    # the key '' holds the metadata that ends up in every feature
    metaData[''] = meta if meta is not None else {}
    # the config feature otext
    metaData['otext'] = otext

    tm.info('Monad - idd mapping ...')
    iddFromMonad = dict()
    for (idd, obj) in tables.get(slotType, {}).items():
        firstMonad = list(obj['monads'])[0]
        iddFromMonad[firstMonad] = idd

    tm.info('Removing holes in the monad sequence')
    # consecutive slot numbers for the monads that are in use
    slotFromMonad = dict()
    nodeFromIdd = dict()
    otype = dict()
    maxSlot = 0
    for (slot, monad) in enumerate(sorted(iddFromMonad), 1):
        slotFromMonad[monad] = slot
        nodeFromIdd[iddFromMonad[monad]] = slot
        otype[slot] = slotType
        maxSlot = slot
    tm.info('maxSlot={}'.format(maxSlot))

    tm.info('Node mapping and otype ...')
    node = maxSlot
    for t in otherTypes:
        for idd in sorted(tables[t]):
            node += 1
            nodeFromIdd[idd] = node
            otype[node] = t

    nodeFeatures = dict()
    nodeFeatures['otype'] = otype
    metaData['otype'] = dict(valueType='str')

    tm.info('oslots ...')
    oslots = dict()
    for t in otherTypes:
        for (idd, obj) in tables.get(t, {}).items():
            oslots[nodeFromIdd[idd]] = {
                slotFromMonad[m] for m in obj['monads']
            }
    edgeFeatures = dict()
    edgeFeatures['oslots'] = oslots
    metaData['oslots'] = dict(valueType='str')

    tm.info('metadata ...')
    for (t, feats) in nodeF.items():
        for f in feats:
            metaData.setdefault(f, {})['valueType'] = objectTypes[t][f][0]
    for feats in edgeF.values():
        for f in feats:
            metaData.setdefault(f, {})['valueType'] = 'str'

    tm.info('features ...')
    chunkSize = 100000
    for t in tableOrder:
        tm.info('\tfeatures from {}s'.format(t))
        thisTable = tables.get(t, {})
        edgeNames = edgeF.get(t, set())
        for (seen, (idd, obj)) in enumerate(thisTable.items(), 1):
            if seen % chunkSize == 0:
                tm.info('\t{:>9} {}s'.format(seen, t))
            node = nodeFromIdd[idd]
            for (f, v) in obj['feats'].items():
                if f not in edgeNames:
                    nodeFeatures.setdefault(f, {})[node] = v
                elif v not in nilValues:
                    targets = edgeFeatures.setdefault(f, {}).setdefault(
                        node, set()
                    )
                    targets.add(nodeFromIdd[int(v)])
        tm.info('\t{:>9} {}s'.format(len(thisTable), t))

    return (True, nodeFeatures, edgeFeatures, metaData)
# back to top