// // Copyright (c) Microsoft. All rights reserved. // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. // // BinaryReader.cpp : Defines the exported functions for the DLL application. // #include "stdafx.h" #define DATAREADER_EXPORTS // creating the exports here #include "DataReader.h" #include "BinaryReader.h" #ifdef LEAKDETECT #include // leak detection #endif namespace Microsoft { namespace MSR { namespace CNTK { // Destroy - cleanup and remove this class // NOTE: this destroys the object, and it can't be used past this point template void BinaryReader::Destroy() { delete this; } // LoadSections - Load in all the sections in the file // sectionRoot - section whose children we will enumrate // NOTE: called recursively to obtain full section heirarchy template void BinaryReader::LoadSections(Section* sectionRoot, MappingType mapping, size_t windowSize) { int sectionCount = sectionRoot->GetSectionCount(); for (int i = 0; i < sectionCount; ++i) { Section* newSection = sectionRoot->ReadSection(i, mapping, windowSize); auto found = m_sections.find(newSection->GetName()); if (found != m_sections.end()) { RuntimeError("LoadSections, duplicate section name %ls already defined previously", newSection->GetName().c_str()); } m_sections[newSection->GetName()] = newSection; // load in any subsections LoadSections(newSection, mapping, windowSize); } } // Init - Reader Initialize for multiple data sets // config - [in] configuration parameters for the datareader // Sample format below: //# Parameter values for the reader //reader=[ // # reader to use // readerType=BinaryReader // miniBatchMode=Partial // file={, // c:\speech\mnist\mnist_features.bin // c:\speech\mnist\mnist_labels.bin // } //] static vector GetCommaSeparatedItems(const ConfigParameters& config, wstring key) { ConfigArray files(config(key), ','); return (stringargvector) files; } static vector GetCommaSeparatedItems(const ScriptableObjects::IConfigRecord& config, wstring key) { return config(key, vector()); } template template void BinaryReader::InitFromConfig(const ConfigRecordType& readerConfig) { // load in all the file config info for the root level std::vector fileConfigs; vector files = GetCommaSeparatedItems(readerConfig, L"file"); mOneLinePerFile = false; mOneLinePerFile = readerConfig(L"onelineperfile", false); m_dim = readerConfig(L"dim", (size_t) 20); m_totalSamples = 0; size_t windowSize = readerConfig(L"windowSize", (size_t) 0); MappingType mapping = windowSize ? mappingElementWindow : mappingSection; if (mOneLinePerFile) { for (int i = 0; i < files.size(); ++i) { FILE* secFile = _wfopen(files[i].c_str(), L"rt"); if (secFile != nullptr) m_fStream.push_back(secFile); else LogicError("BinaryReader::init cannot find %ls to read", files[i].c_str()); } } else { for (int i = 0; i < files.size(); ++i) { SectionFile* secFile = new SectionFile(files[i], fileOptionsRead, 0); size_t records = secFile->FileSection()->GetRecordCount(); // if we haven't set the total records yet, set it SectionFlags flags = secFile->FileSection()->GetFlags(); if (m_totalSamples == 0 && !(flags & flagAuxilarySection)) { m_totalSamples = records; } else // otherwise we want to check to make sure it's the same { if (records != m_totalSamples && !(flags & flagAuxilarySection)) RuntimeError("multiple files have different record counts, cannot be used together!"); } m_secFiles.push_back(secFile); // now get all the sections out of the file Section* section = secFile->FileSection(); LoadSections(section, mapping, windowSize); } } // initialize all the variables m_mbStartSample = m_epoch = m_epochStartSample = 0; m_partialMinibatch = false; m_traceLevel = readerConfig(L"traceLevel", 0); // determine if partial minibatches are desired std::string minibatchMode(readerConfig(L"minibatchMode", "Partial")); m_partialMinibatch = !_stricmp(minibatchMode.c_str(), "Partial"); // Initial load is complete DisplayProperties(); } template void BinaryReader::DisplayProperties() { if (m_traceLevel == 0) return; fprintf(stderr, "BinaryReader in use...\n"); // enumerate the files for (SectionFile* file : m_secFiles) { Section* section = file->FileSection(); fprintf(stderr, "File: %ls, Records: %lld\n", file->GetName().c_str(), section->GetElementCount()); } for (auto pair : m_sections) { Section* section = pair.second; fprintf(stderr, "Section: %ls, Elements: %zd, ElementsPerRecord: %zd, ElementSize: %zd\n", pair.first.c_str(), section->GetElementCount(), section->GetElementsPerRecord(), section->GetElementSize()); if (section->GetSectionType() == sectionTypeStats) { vector stats; stats.resize(section->GetElementCount()); size_t size = sizeof(NumericStatistics) * stats.size(); GetData(pair.first, stats.size(), &stats[0], size); fprintf(stderr, " *Stats*: "); for (NumericStatistics stat : stats) { fprintf(stderr, "%s:%lf, ", stat.statistic, stat.value); } fprintf(stderr, "\n"); } } } // destructor - virtual so it gets called properly template BinaryReader::~BinaryReader() { // clear the section references, they will be deleted by the sectionFile destructors m_sections.clear(); for (SectionFile* secFile : m_secFiles) { delete secFile; } m_secFiles.clear(); for (size_t i = 0; i < m_fStream.size(); i++) { fclose(m_fStream[i]); } } //StartMinibatchLoop - Startup a minibatch loop // mbSize - [in] size of the minibatch (number of Samples, etc.) // epoch - [in] epoch number for this loop, if > 0 the requestedEpochSamples must be specified (unless epoch zero was completed this run) // requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset // this value must be a multiple of mbSize, if it is not, it will be rounded up to one. template void BinaryReader::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples) { m_mbSize = mbSize; if (requestedEpochSamples == requestDataSize) { // if they want the dataset size, get the first file, and return the dataset size if (m_totalSamples == 0) RuntimeError("no section files contain total samples, can't determine dataset size"); requestedEpochSamples = m_totalSamples; } m_epochSize = requestedEpochSamples; m_epoch = epoch; SetupEpoch(); } // CheckEndDataset - Check to see if we have arrived at the end of the dataset // actualmbsize - [in] the actual size of the dataset we are requesting, // returns - true if there we hit dataset end, false otherwise template bool BinaryReader::CheckEndDataset(size_t actualmbsize) { size_t epochEnd = m_epochSize; size_t epochSample = m_mbStartSample % m_epochSize; // check for an odd sized last minibatch if (epochSample + actualmbsize > epochEnd) { actualmbsize = epochEnd - epochSample; } // make sure we take into account hitting the end of the dataset (not wrapping around) actualmbsize = min(m_totalSamples - (m_mbStartSample % m_totalSamples), actualmbsize); if (actualmbsize == 0) { return true; } // if they don't want partial minibatches, skip and return if (actualmbsize < m_mbSize && !m_partialMinibatch) { m_mbStartSample += actualmbsize; return true; } return false; } // GetMinibatch - Get the next minibatch (features and labels) // matrices - [in] a map with named matrix types (i.e. 'features', 'labels') mapped to the corresponding matrix, // [out] each matrix resized if necessary containing data. // returns - true if there are more minibatches, false if no more minibatchs remain template bool BinaryReader::GetMinibatch(std::map*>& matrices) { // get out if they didn't call StartMinibatchLoop() first if (m_mbSize == 0) return false; // check to see if we have changed epochs, if so we are done with this one. if (m_mbStartSample / m_epochSize != m_epoch) { return false; } // actual size is either what requested, or total number of samples read so far size_t actualmbsize = min(m_totalSamples, m_mbSize); // it may still return less if at end of sweep size_t epochStartSample = m_mbStartSample % m_totalSamples; bool endOfDataset = CheckEndDataset(actualmbsize); if (endOfDataset) return false; for (auto value : matrices) { wstring matrixName = value.first; Section* section = m_sections[matrixName]; Matrix* gpuData = value.second; size_t rows = section->GetElementsPerRecord(); SectionData dataType; size_t dataSize; section->GetDataTypeSize(dataType, dataSize); // if the data types are not as expected if (dataType != sectionDataFloat || dataSize != sizeof(ElemType)) { // if it's a label type, may need to get a subsection if (section->GetSectionType() == sectionTypeLabel) { bool categoryLabelFound = false; SectionLabel* sectionLabel = (SectionLabel*) section; // category type, check to see if we saved it if (sectionLabel->GetLabelKind() == labelCategory) { Section* sectionCategory = NULL; for (int i = 0; i < section->GetSectionCount(); ++i) { sectionCategory = section->ReadSection(i); // found the category labels, so pass them on if (sectionCategory->GetSectionType() == sectionTypeCategoryLabel) { section = sectionCategory; rows = section->GetElementsPerRecord(); section->GetDataTypeSize(dataType, dataSize); categoryLabelFound = true; break; } } if (!categoryLabelFound) { RuntimeError("Category Labels not saved in file, either save, or support creation in BinaryReader"); } } } // make sure we are good now if (dataType != sectionDataFloat || dataSize != sizeof(ElemType)) { RuntimeError("Category Labels not saved in file, either save, or support creation in BinaryReader"); } } size_t size = rows * dataSize * actualmbsize; size_t index = epochStartSample * section->GetElementsPerRecord(); ElemType* data = (ElemType*) section->EnsureElements(index, size); // ElemType* data = section->GetElement(epochStartSample*section->GetElementsPerRecord()); // data = (ElemType*)section->EnsureMapped(data, size); // make sure that the data is as expected if (!!(section->GetFlags() & flagAuxilarySection) || section->GetElementSize() != sizeof(ElemType)) { RuntimeError("GetMinibatch: Section %ls Auxilary section specified, and/or element size %lld mismatch", section->GetName().c_str(), section->GetElementSize()); } gpuData->SetValue(rows, actualmbsize, gpuData->GetDeviceId(), data); } // advance to the next minibatch m_mbStartSample += actualmbsize; // we read some records, so process them return true; } //SetupEpoch - Setup the proper position in the file, and other variable settings to start a particular epoch template void BinaryReader::SetupEpoch() { // if we are starting fresh (epoch zero and no data read), init everything assert(m_totalSamples != 0); // make sure we are in the correct location for mid-dataset epochs size_t mbStartSample = m_epoch * m_epochSize; size_t fileRecord = mbStartSample % m_totalSamples; fprintf(stderr, "starting epoch %lld at record count %lld, and file position %lld\n", m_epoch, mbStartSample, fileRecord); // reset the original read sample m_epochStartSample = m_mbStartSample = mbStartSample; } // GetLabelMapping - Gets the label mapping from integer index to label type // returns - a map from numeric datatype to native label type template const std::map::LabelIdType, typename BinaryReader::LabelType>& BinaryReader::GetLabelMapping(const std::wstring& sectionName) { auto iter = m_sections.find(sectionName); if (iter == m_sections.end()) { RuntimeError("GetLabelMapping: requested section name not found\n"); } Section* section = iter->second; if (section->GetSectionType() != sectionTypeLabelMapping) { RuntimeError("section specified is not label mapping section.\n"); } // get it from the correct section SectionString* sectionLabelMapping = (SectionString*) section; return sectionLabelMapping->GetLabelMapping(); } // SetLabelMapping - Sets the label mapping from integer index to label // labelMapping - mapping table from label values to IDs (must be 0-n) // note: for tasks with labels, the mapping table must be the same between a training run and a testing run template void BinaryReader::SetLabelMapping(const std::wstring& /*sectionName*/, const std::map::LabelIdType, typename BinaryReader::LabelType>& /*labelMapping*/) { RuntimeError("Binary reader does not support setting the mapping table.\n"); } // GetData - Gets metadata from the specified section (into CPU memory) // sectionName - section name to retrieve data from // numRecords - number of records to read // data - pointer to data buffer, if NULL, dataBufferSize will be set to size of required buffer to accomidate request // dataBufferSize - [in] size of the databuffer in bytes // [out] size of buffer filled with data // recordStart - record to start reading from, defaults to zero (start of data) // returns: true if data remains to be read, false if the end of data was reached, or buffer is insufficent template bool BinaryReader::GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart) { auto iter = m_sections.find(sectionName); if (iter == m_sections.end()) { RuntimeError("GetData: requested section name not found\n"); } Section* section = iter->second; size_t sizeData = section->GetElementsPerRecord() * section->GetElementSize(); if ((numRecords + recordStart) * section->GetElementsPerRecord() > section->GetElementCount()) { RuntimeError("GetData: requested invalid record number\n"); } sizeData *= numRecords; // of buffer isn't large enough, or they didn't pass one if (sizeData > dataBufferSize || data == NULL) { dataBufferSize = sizeData; return false; } char* copyFrom = section->EnsureElements(section->GetElementsPerRecord() * recordStart, sizeData); memcpy_s((char*) data, dataBufferSize, copyFrom, sizeData); return true; } template bool BinaryReader::DataEnd(EndDataType endDataType) { bool ret = false; switch (endDataType) { case endDataNull: assert(false); break; case endDataEpoch: ret = (m_mbStartSample / m_epochSize != m_epoch); break; case endDataSet: { // actual size is either what requested, or total number of samples read so far size_t actualmbsize = min(m_totalSamples, m_mbSize); // it may still return less if at end of sweep ret = CheckEndDataset(actualmbsize); break; } case endDataSentence: // for fast reader each minibatch is considered a "sentence", so always true ret = true; break; } return ret; } // instantiate all the combinations we expect to be used template class BinaryReader; template class BinaryReader; } } }