# Source: https://github.com/annotation/text-fabric
# Raw File
# Tip revision: 4322d183112906d1a996173143b0bd764713a9b8 authored by Dirk Roorda on 30 April 2018, 17:56:37 UTC
# New minor release 3.4.9
# Tip revision: 4322d18
# data.py
import os
import pickle
import gzip
import collections
import time
from datetime import datetime
from .helpers import (
    setFromSpec, valueFromTf, tfFromValue, specFromRanges, rangesFromSet
)

# Maximum number of offending line numbers that are listed per error kind
# when problems in the data lines of a feature file are reported.
ERROR_CUTOFF = 20
# Compression level passed to gzip when the binary feature file is written.
GZIP_LEVEL = 2
# Pickle protocol used to serialize feature data into the binary file.
PICKLE_PROTOCOL = 4

# Feature names that get special treatment in this module:
# data of WARP[0] and WARP[1] is stored as a compact tuple after reading,
# and for WARP[2] the metadata (not the data) is handed to computed features.
WARP = (
    'otype',
    'oslots',
    'otext',
)

# Accepted values of the @valueType metadata key; the first entry is the
# fallback when the key is missing or carries an unknown value.
DATA_TYPES = ('str', 'int')


class Data(object):
    def __init__(
        self,
        path,
        tm,
        edgeValues=False,
        data=None,
        isEdge=None,
        isConfig=None,
        metaData={},
        method=None,
        dependencies=None
    ):
        (dirName, baseName) = os.path.split(path)
        (fileName, extension) = os.path.splitext(baseName)
        self.path = path
        self.tm = tm
        self.dirName = dirName
        self.fileName = fileName
        self.extension = extension
        self.binDir = '{}/.tf'.format(dirName)
        self.binPath = '{}/{}.tfx'.format(self.binDir, self.fileName)
        self.edgeValues = edgeValues
        self.isEdge = isEdge
        self.isConfig = isConfig
        self.metaData = metaData
        self.method = method
        self.dependencies = dependencies
        self.data = data
        self.dataLoaded = False
        self.dataError = False
        self.dataType = 'str'

    def load(self, metaOnly=False, silent=False):
        self.tm.indent(level=1, reset=True)
        origTime = self._getModified()
        binTime = self._getModified(bin=True)
        sourceRep = ', '.join(
            dep.fileName for dep in self.dependencies
        ) if self.method else self.dirName
        msgFormat = '{:<1} {:<20} from {}'
        actionRep = ''
        good = True

        if self.dataError:
            # there has been an error in an earlier
            # computation/compiling/loading of this feature
            actionRep = 'E'
            good = False
        elif self.dataLoaded and (
            self.isConfig or (not origTime or self.dataLoaded >= origTime) and
            (not binTime or self.dataLoaded >= binTime)
        ):
            actionRep = '='  # loaded and up to date
        elif not origTime and not binTime:
            actionRep = 'X'  # no source and no binary present
            good = False
        else:
            if not origTime:
                actionRep = 'b'
                good = self._readDataBin()
            elif not binTime or origTime > binTime:
                actionRep = 'C' if self.method else 'T'
                good = self._compute() if self.method else self._readTf(
                    metaOnly=metaOnly
                )
                if good:
                    if self.isConfig or metaOnly:
                        actionRep = 'M'
                    else:
                        self._writeDataBin()
            else:
                actionRep = 'B'
                good = True if self.method else self._readTf(metaOnly=True)
                if good:
                    if self.isConfig or metaOnly:
                        actionRep = 'M'
                    else:
                        good = self._readDataBin()
        if self.isConfig:
            self.cleanDataBin()
        if good:
            if actionRep != '=':
                self.tm.info(
                    msgFormat.format(actionRep, self.fileName, sourceRep),
                    cache=1 if (not silent) or (actionRep in 'CT') else -1,
                )
        else:
            self.dataError = True
            self.tm.error(
                msgFormat.format(actionRep, self.fileName, sourceRep)
            )
        return good

    def unload(self):
        self.data = None
        self.dataLoaded = False

    def save(self, overwrite=False, nodeRanges=False):
        return self._writeTf(overwrite=overwrite, nodeRanges=nodeRanges)

    def _setDataType(self):
        if self.isConfig:
            return
        if 'valueType' in self.metaData:
            dataType = self.metaData['valueType']
            if dataType not in DATA_TYPES:
                self.tm.error(
                    'Unknown @valueType: "{}". Should be one of {}'.format(
                        dataType,
                        ','.join(DATA_TYPES),
                    )
                )
                self.dataType = DATA_TYPES[0]
            else:
                self.dataType = dataType
        else:
            self.tm.error(
                'Missing @valueType. Should be one of {}'.format(
                    ','.join(DATA_TYPES),
                )
            )
            self.dataType = DATA_TYPES[0]

    def _readTf(self, metaOnly=False):
        path = self.path
        if not os.path.exists(path):
            self.tm.error(
                'TF reading: feature file "{}" does not exist'.format(path)
            )
            return False
        fh = open(path, encoding='utf8')
        i = 0
        self.metaData = {}
        self.isConfig = False
        for line in fh:
            i += 1
            if i == 1:
                text = line.rstrip()
                if text == '@edge':
                    self.isEdge = True
                elif text == '@node':
                    self.isEdge = False
                elif text == '@config':
                    self.isConfig = True
                else:
                    self.tm.error(
                        'Line {}: missing @node/@edge/@config'.format(i)
                    )
                    fh.close()
                    return False
                continue
            text = line.rstrip('\n')
            if len(text) and text[0] == '@':
                if text == '@edgeValues':
                    self.edgeValues = True
                    continue
                fields = text[1:].split('=', 1)
                self.metaData[fields[0]
                              ] = fields[1] if len(fields) == 2 else None
                continue
            else:
                if text != '':
                    self.tm.error(
                        'Line {}: missing blank line after metadata'.format(i)
                    )
                    fh.close()
                    return False
                else:
                    break
        self._setDataType()
        good = True
        if not metaOnly and not self.isConfig:
            good = self._readDataTf(fh, i)
        fh.close()
        return good

    def _readDataTf(self, fh, firstI):
        """Parse the data lines of a feature file into ``self.data``.

        Lines are split on tabs. The expected number of fields is 2 for
        node features and valueless edges, 3 for edges with values. A
        line with fewer fields leaves out the leading node spec; the node
        is then taken to be one more than the highest node of the
        previous accepted line (starting at 1).

        Resulting shape of ``self.data``:

        - node feature: ``{node: value}`` (entries with value None are
          not stored);
        - edge feature without values: ``{node: set of nodes}``;
        - edge feature with values: ``{node: {node: value}}``.

        When no errors occurred, the data of the features named WARP[0]
        and WARP[1] is repacked into a tuple (see the end of the method).

        Parameters:
            fh: open file, positioned at the first data line.
            firstI: number of lines already consumed from ``fh``; used
                to report file-wide line numbers.

        Returns:
            True when no faulty lines were found, False otherwise.
        """
        # error kind -> line numbers where it occurred
        errors = collections.defaultdict(list)
        i = firstI
        implicit_node = 1
        data = {}
        isEdge = self.isEdge
        edgeValues = self.edgeValues
        normFields = 3 if isEdge and edgeValues else 2
        isNum = self.dataType == 'int'
        for line in fh:
            i += 1
            fields = line.rstrip('\n').split('\t')
            lfields = len(fields)
            if lfields > normFields:
                errors['wrongFields'].append(i)
                continue
            if lfields == normFields:
                # fully specified line: node spec first
                nodes = setFromSpec(fields[0])
                if isEdge:
                    if fields[1] == '':
                        errors['emptyNode2Spec'].append(i)
                        continue
                    nodes2 = setFromSpec(fields[1])
                if not isEdge or edgeValues:
                    valTf = fields[-1]
            else:
                # abbreviated line: the source node is implicit
                if isEdge:
                    if edgeValues:
                        if lfields == normFields - 1:
                            # target spec and value
                            nodes = {implicit_node}
                            nodes2 = setFromSpec(fields[0])
                            valTf = fields[-1]
                        elif lfields == normFields - 2:
                            # target spec only, value is empty
                            nodes = {implicit_node}
                            if fields[0] == '':
                                errors['emptyNode2Spec'].append(i)
                                continue
                            nodes2 = setFromSpec(fields[0])
                            valTf = ''
                        else:
                            nodes = {implicit_node}
                            valTf = ''
                            errors['emptyNode2Spec'].append(i)
                            continue
                    else:
                        if lfields == normFields - 1:
                            nodes = {implicit_node}
                            if fields[0] == '':
                                errors['emptyNode2Spec'].append(i)
                                continue
                            nodes2 = setFromSpec(fields[0])
                        else:
                            nodes = {implicit_node}
                            errors['emptyNode2Spec'].append(i)
                            continue
                else:
                    nodes = {implicit_node}
                    if lfields == 1:
                        valTf = fields[0]
                    else:
                        valTf = ''
            # the next abbreviated line continues after the highest node
            # of this line
            implicit_node = max(nodes) + 1
            if not isEdge or edgeValues:
                # int features: empty string becomes None;
                # str features: empty string stays '', the rest is
                # converted by valueFromTf
                value = (
                    int(valTf) if isNum and valTf != '' else None
                    if isNum else '' if valTf == '' else valueFromTf(valTf)
                )
            if isEdge:
                for n in nodes:
                    for m in nodes2:
                        if not edgeValues:
                            data.setdefault(n, set()).add(m)
                        else:
                            data.setdefault(
                                n, {}
                            )[m] = value  # even if the value is None
            else:
                for n in nodes:
                    if value is not None:
                        data[n] = value
        # report each error kind with at most ERROR_CUTOFF line numbers
        for kind in errors:
            lnk = len(errors[kind])
            self.tm.error(
                '{} in lines {}'.format(
                    kind,
                    ','.join(str(ln) for ln in errors[kind][0:ERROR_CUTOFF])
                )
            )
            if lnk > ERROR_CUTOFF:
                self.tm.error(
                    '\t and {} more cases'.format(lnk - ERROR_CUTOFF),
                    tm=False
                )
        self.data = data
        if not errors:
            if self.fileName == WARP[0]:
                # Compact form: the values of all nodes whose value differs
                # from that of node 1, in node order, followed by the value
                # of node 1 and the highest node that has that value.
                slotType = data[1]
                otype = []
                maxSlot = 1
                for n in sorted(data):
                    if data[n] == slotType:
                        maxSlot = n
                        continue
                    otype.append(data[n])
                otype.append(slotType)
                otype.append(maxSlot)
                self.data = tuple(otype)
            elif self.fileName == WARP[1]:
                # Compact form: per node (in node order) the sorted tuple
                # of linked nodes, followed by the lowest node minus one.
                slotsList = sorted(data)
                maxSlot = min(data.keys()) - 1
                oslots = []
                for n in slotsList:
                    oslots.append(tuple(sorted(data[n])))
                oslots.append(maxSlot)
                self.data = tuple(oslots)
        return not errors

    def _compute(self):
        good = True
        for feature in self.dependencies:
            if not feature.load():
                good = False
        if not good:
            return False

        def info(msg, tm=True):
            self.tm.info(cmpFormat.format(msg), tm=tm, cache=-1)

        cmpFormat = 'c {:<20} {{}}'.format(self.fileName)
        self.tm.indent(level=2, reset=True)

        def error(msg, tm=True):
            self.tm.error(cmpFormat.format(msg), tm=tm)

        self.data = self.method(
            info, error, *[
                dep.metaData if dep.fileName == WARP[2] else dep.data
                for dep in self.dependencies
            ]
        )
        good = self.data is not None
        if good:
            self.dataLoaded = time.time()
        return good

    def _writeTf(
        self,
        dirName=None,
        fileName=None,
        overwrite=True,
        extension=None,
        metaOnly=False,
        nodeRanges=False
    ):
        self.tm.indent(level=1, reset=True)
        metaOnly = metaOnly or self.isConfig

        dirName = dirName or self.dirName
        fileName = fileName or self.fileName
        extension = extension or self.extension
        if not os.path.exists(dirName):
            try:
                os.makedirs(dirName, exist_ok=True)
            except Exception:
                self.tm.error('Cannot create directory "{}"'.format(dirName))
                return False
        fpath = '{}/{}{}'.format(dirName, fileName, extension)
        if fpath == self.path:
            if os.path.exists(fpath):
                if not overwrite:
                    self.tm.error((
                        'Feature file "{}" already exists,'
                        ' feature will not be written'
                    ).format(fpath))
                    return False
        try:
            fh = open(fpath, 'w', encoding='utf8')
        except Exception:
            self.tm.error('Cannot write to feature file "{}"'.format(fpath))
            return False
        fh.write(
            '@{}\n'.format(
                'config' if self.isConfig else 'edge'
                if self.isEdge else 'node'
            )
        )
        if self.edgeValues:
            fh.write('@edgeValues\n')
        for meta in sorted(self.metaData):
            fh.write('@{}={}\n'.format(meta, self.metaData[meta]))
        fh.write('@writtenBy=Text-Fabric\n')
        fh.write(
            '@dateWritten={}\n'.
            format(datetime.utcnow().replace(microsecond=0).isoformat() + 'Z')
        )
        fh.write('\n')
        self._setDataType()
        good = True
        if not metaOnly:
            good = self._writeDataTf(fh, nodeRanges=nodeRanges)
        fh.close()
        msgFormat = '{:<1} {:<20} to {}'
        if good:
            self.tm.info(
                msgFormat.format('M' if metaOnly else 'T', fileName, dirName)
            )
        else:
            self.tm.error(
                msgFormat.format('M' if metaOnly else 'T', fileName, dirName)
            )
        return good

    def _writeDataTf(self, fh, nodeRanges=False):
        data = self.data
        if type(data) is tuple:
            maxSlot = data[-1]
            if self.fileName == WARP[0]:
                data = dict(((k + 1 + maxSlot, data[k])
                             for k in range(0,
                                            len(data) - 2)))
            elif self.fileName == WARP[1]:
                data = dict(((k + 1 + maxSlot, data[k])
                             for k in range(0,
                                            len(data) - 1)))
        edgeValues = self.edgeValues
        if self.isEdge:
            implicitNode = 1
            for n in sorted(data):
                thisData = data[n]
                sets = {}
                if edgeValues:
                    for m in thisData:
                        sets.setdefault(thisData[m], set()).add(m)
                    for (value, mset) in sorted(sets.items()):
                        nodeSpec2 = specFromRanges(rangesFromSet(mset))
                        nodeSpec = '' if n == implicitNode else n
                        implicitNode = n + 1
                        if value is None:
                            fh.write(
                                '{}{}{}\n'.format(
                                    nodeSpec,
                                    '\t' if nodeSpec else '',
                                    nodeSpec2,
                                )
                            )
                        else:
                            fh.write(
                                '{}{}{}\t{}\n'.format(
                                    nodeSpec,
                                    '\t' if nodeSpec else '',
                                    nodeSpec2,
                                    tfFromValue(value),
                                )
                            )
                else:
                    nodeSpec2 = specFromRanges(rangesFromSet(thisData))
                    nodeSpec = '' if n == implicitNode else n
                    implicitNode = n + 1
                    fh.write(
                        '{}{}{}\n'.format(
                            nodeSpec, '\t' if nodeSpec else '', nodeSpec2
                        )
                    )
        else:
            sets = {}
            if nodeRanges:
                for n in sorted(data):
                    sets.setdefault(data[n], []).append(n)
                implicitNode = 1
                for (value, nset) in sorted(
                    sets.items(), key=lambda x: (x[1][0], x[1][-1])
                ):
                    if len(nset) == 1 and nset[0] == implicitNode:
                        nodeSpec = ''
                    else:
                        nodeSpec = specFromRanges(rangesFromSet(nset))
                    implicitNode = nset[-1]
                    fh.write(
                        '{}{}{}\n'.format(
                            nodeSpec,
                            '\t' if nodeSpec else '',
                            tfFromValue(value),
                        )
                    )
            else:
                implicitNode = 1
                for n in sorted(data):
                    nodeSpec = '' if n == implicitNode else n
                    implicitNode = n + 1
                    fh.write(
                        '{}{}{}\n'.format(
                            nodeSpec,
                            '\t' if nodeSpec else '',
                            tfFromValue(data[n]),
                        )
                    )
        return True

    def _readDataBin(self):
        if not os.path.exists(self.binPath):
            self.tm.error(
                'TF reading: feature file "{}" does not exist'.format(
                    self.binPath
                )
            )
            return False
        with gzip.open(self.binPath, "rb") as f:
            self.data = pickle.load(f)
        self.dataLoaded = time.time()
        return True

    def cleanDataBin(self):
        if os.path.exists(self.binPath):
            os.unlink(self.binPath)

    def _writeDataBin(self):
        good = True
        if not os.path.exists(self.binDir):
            try:
                os.makedirs(self.binDir, exist_ok=True)
            except Exception:
                self.tm.error(
                    'Cannot create directory "{}"'.format(self.binDir)
                )
                good = False
        if not good:
            return False
        try:
            with gzip.open(self.binPath, "wb", compresslevel=GZIP_LEVEL) as f:
                pickle.dump(self.data, f, protocol=PICKLE_PROTOCOL)
        except Exception:
            self.tm.error('Cannot write to file "{}"'.format(self.binPath))
            good = False
        self.dataLoaded = time.time()
        return True

    def _getModified(self, bin=False):
        if bin:
            return os.path.getmtime(self.binPath) if os.path.exists(
                self.binPath
            ) else None
        else:
            if self.method:
                depsInfo = [dep._getModified() for dep in self.dependencies]
                depsModifieds = [d for d in depsInfo if d is not None]
                depsModified = None if len(depsModifieds
                                           ) == 0 else max(depsModifieds)
                if depsModified is not None:
                    return depsModified
                elif os.path.exists(self.binPath):
                    return os.path.getmtime(self.binPath)
                else:
                    return None
            else:
                if os.path.exists(self.path):
                    return os.path.getmtime(self.path)
                elif os.path.exists(self.binPath):
                    return os.path.getmtime(self.binPath)
                else:
                    return None
# back to top