// swh:1:snp:f50ab94432af916b5fb8b4ad831e8dddded77084
// Raw File
// Tip revision: 617964b4eb19df1e978ec1e474f941147a39efcd authored by Alexey Kamenev on 16 May 2016, 23:55:43 UTC
// Added ReduceElementsNode to NDL.
// Tip revision: 617964b
// CorpusDescriptor.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma once

#include "StringToIdMap.h"
#include <set>

namespace Microsoft { namespace MSR { namespace CNTK {

// Represents a full corpus.
// Defines which sequences should participate in the reading.
// TODO: Extract an interface.
class CorpusDescriptor
{
    bool m_includeAll;
    std::set<size_t> m_sequenceIds;

public:
    CorpusDescriptor(const std::wstring& file) : m_includeAll(false)
    {
        // Add all sequence ids.
        for (msra::files::textreader r(file); r;)
        {
            m_sequenceIds.insert(m_stringRegistry[r.wgetline()]);
        }
    }

    // By default include all sequences.
    CorpusDescriptor() : m_includeAll(true)
    {}

    // Checks if the specified sequence should be used for reading.
    bool IsIncluded(const std::wstring& sequenceKey)
    {
        if (m_includeAll)
        {
            return true;
        }

        size_t id;
        if(!m_stringRegistry.TryGet(sequenceKey, id))
        {
            return false;
        }

        return m_sequenceIds.find(id) != m_sequenceIds.end();
    }

    // Gets the string registry
    WStringToIdMap& GetStringRegistry()
    {
        return m_stringRegistry;
    }

    // Gets the string registry
    const WStringToIdMap& GetStringRegistry() const
    {
        return m_stringRegistry;
    }

private:
    DISABLE_COPY_AND_MOVE(CorpusDescriptor);

    WStringToIdMap m_stringRegistry;
};

// Shared-ownership pointer to a CorpusDescriptor.
using CorpusDescriptorPtr = std::shared_ptr<CorpusDescriptor>;

}}}
// back to top