#pragma once #include #include "simplesenonehmm.h" #include "latticearchive.h" #include "latticesource.h" #include "ssematrix.h" #include "Matrix.h" #include "CUDAPageLockedMemAllocator.h" #include #include #pragma warning(disable : 4127) // conditional expression is constant namespace msra { namespace lattices { struct SeqGammarCalParam { double amf; double lmf; double wp; double bMMIfactor; bool sMBRmode; SeqGammarCalParam() { amf = 14.0; lmf = 14.0; wp = 0.0; bMMIfactor = 0.0; sMBRmode = false; } }; template class GammaCalculation { bool cpumode; public: GammaCalculation() : cpumode(false) { initialmark = false; lmf = 7.0f; // Note that 9 was best for Fisher --these should best be configurable wp = 0.0f; amf = 7.0f; boostmmifactor = 0.0f; seqsMBRmode = false; } ~GammaCalculation() { } // ======================================== // Sec. 1 init functions // ======================================== void init(msra::asr::simplesenonehmm hset, int DeviceId) { m_deviceid = DeviceId; if (!initialmark) { m_hset = hset; m_maxframenum = 0; // prep for parallel implementation (CUDA) parallellattice.setdevice(DeviceId); if (parallellattice.enabled()) // send hmm set to GPU if GPU computation enabled parallellattice.entercomputation(m_hset, mbrclassdef); // cache senone2classmap if mpemode initialmark = true; } } // ======================================== // Sec. 2 set functions // ======================================== void SetGammarCalculationParams(const SeqGammarCalParam& gammarParam) { lmf = (float) gammarParam.lmf; amf = (float) gammarParam.amf; wp = (float) gammarParam.wp; seqsMBRmode = gammarParam.sMBRmode; boostmmifactor = (float) gammarParam.bMMIfactor; } // ======================================== // Sec. 
// Sec. 3 calculation functions
    // ========================================
    // Computes lattice-based (denominator) gammas for one minibatch.
    // For each utterance: copies its log-likelihood slice into an SSE-matrix
    // stripe (and/or uploads it to the GPU), runs lattice forward/backward,
    // accumulates the MMI/sMBR objective into functionValues, and writes the
    // resulting gammas back into gammafromlattice. Optionally also builds the
    // 1-hot reference-alignment labels.
    //
    // NOTE(review): template arguments (e.g. Matrix<ElemType>, the element types
    // of the std::vector/std::shared_ptr parameters) appear stripped by
    // extraction; tokens are kept exactly as found. Presumably lattices holds
    // one lattice pair per utterance and uids holds per-frame senone ids --
    // verify against callers.
    //
    // functionValues (out): scalar objective value for the minibatch
    // lattices (in): one lattice per utterance, in minibatch order
    // loglikelihood (in): acoustic logLLs, [senone x (T * samplesInRecurrentStep)]
    // labels (out, if doreferencealign): 1-hot reference alignment
    // gammafromlattice (out): denominator gammas, same layout as loglikelihood
    // uids / boundaries (in): per-frame state ids / boundary info
    // samplesInRecurrentStep (in): number of parallel sequences (numParallelUtterance ?)
    // pMBLayout (in): minibatch layout, used to find utterance ends
    // extrauttmap (in): utterance -> parallel-sequence index map
    // doreferencealign (in): also emit 1-hot labels when true
    void calgammaformb(Microsoft::MSR::CNTK::Matrix& functionValues,
                       std::vector>& lattices,
                       const Microsoft::MSR::CNTK::Matrix& loglikelihood,
                       Microsoft::MSR::CNTK::Matrix& labels,
                       Microsoft::MSR::CNTK::Matrix& gammafromlattice,
                       std::vector& uids,
                       std::vector& boundaries,
                       size_t samplesInRecurrentStep, /* numParallelUtterance ? */
                       std::shared_ptr pMBLayout,
                       std::vector& extrauttmap,
                       bool doreferencealign)
    {
        // check total frame number to be added ?
        // int deviceid = loglikelihood.GetDeviceId();
        size_t boundaryframenum;
        std::vector validframes; // [s] cursor pointing to next utterance begin within a single parallel sequence [s]
        validframes.assign(samplesInRecurrentStep, 0);
        ElemType objectValue = 0.0;
        // convert from Microsoft::MSR::CNTK::Matrix to msra::math::ssematrixbase
        size_t numrows = loglikelihood.GetNumRows();
        size_t numcols = loglikelihood.GetNumCols();
        Microsoft::MSR::CNTK::Matrix tempmatrix(m_deviceid);

        // copy loglikelihood to pred
        // (pred/dengammas are member buffers, grown but never shrunk across minibatches)
        if (numcols > pred.cols())
        {
            pred.resize(numrows, numcols);
            dengammas.resize(numrows, numcols);
        }

        if (doreferencealign)
            labels.SetValue((ElemType)(0.0f)); // clear before writing 1-hot entries below

        size_t T = numcols / samplesInRecurrentStep; // number of time steps in minibatch
        if (samplesInRecurrentStep > 1)
        {
            assert(extrauttmap.size() == lattices.size());
            assert(T == pMBLayout->GetNumTimeSteps());
        }

        size_t mapi = 0; // parallel-sequence index for utterance [i]
        // cal gamma for each utterance
        size_t ts = 0; // running column offset of the current utterance within the minibatch
        for (size_t i = 0; i < lattices.size(); i++)
        {
            const size_t numframes = lattices[i]->getnumframes();

            msra::dbn::matrixstripe predstripe(pred, ts, numframes);           // logLLs for this utterance
            msra::dbn::matrixstripe dengammasstripe(dengammas, ts, numframes); // denominator gammas

            if (samplesInRecurrentStep == 1) // no sequence parallelism
            {
                tempmatrix = loglikelihood.ColumnSlice(ts, numframes);
                // if (m_deviceid == CPUDEVICE)
                {
                    CopyFromCNTKMatrixToSSEMatrix(tempmatrix, numframes, predstripe);
                }
                if (m_deviceid != CPUDEVICE)
                    parallellattice.setloglls(tempmatrix);
            }
            else // multiple parallel sequences
            {
                // get number of frames for the utterance
                mapi = extrauttmap[i]; // parallel-sequence index; in case of >1 utterance within this parallel sequence, this is in order of concatenation

                // scan MBLayout for end of utterance
                size_t mapframenum = SIZE_MAX; // duration of utterance [i] as determined from MBLayout
                for (size_t t = validframes[mapi]; t < T; t++)
                {
                    // TODO: Adapt this to new MBLayout, m_sequences would be easier to work off.
                    if (pMBLayout->IsEnd(mapi, t))
                    {
                        mapframenum = t - validframes[mapi] + 1;
                        break;
                    }
                }

                // must match the explicit information we get from the reader
                if (numframes != mapframenum)
                    LogicError("gammacalculation: IsEnd() not working, numframes (%d) vs. mapframenum (%d)", (int) numframes, (int) mapframenum);
                assert(numframes == mapframenum);

                if (numframes > tempmatrix.GetNumCols())
                    tempmatrix.Resize(numrows, numframes);

                // de-interleave this utterance's frames out of the parallel layout
                // (stride = samplesInRecurrentStep) into a contiguous slice
                Microsoft::MSR::CNTK::Matrix loglikelihoodForCurrentParallelUtterance = loglikelihood.ColumnSlice(mapi + (validframes[mapi] * samplesInRecurrentStep), ((numframes - 1) * samplesInRecurrentStep) + 1);
                tempmatrix.CopyColumnsStrided(loglikelihoodForCurrentParallelUtterance, numframes, samplesInRecurrentStep, 1);

                // if (doreferencealign || m_deviceid == CPUDEVICE)
                {
                    CopyFromCNTKMatrixToSSEMatrix(tempmatrix, numframes, predstripe);
                }

                if (m_deviceid != CPUDEVICE)
                {
                    parallellattice.setloglls(tempmatrix);
                }
            }

            array_ref uidsstripe(&uids[ts], numframes);

            if (doreferencealign)
            {
                boundaryframenum = numframes;
            }
            else
                boundaryframenum = 0; // empty stripe when no reference alignment requested
            array_ref boundariesstripe(&boundaries[ts], boundaryframenum);

            // numerator average log-prob: score of the reference state path, scaled by amf
            double numavlogp = 0;
            foreach_column (t, dengammasstripe) // we do not allocate memory for numgamma now, should be the same as numgammasstripe
            {
                const size_t s = uidsstripe[t];
                numavlogp += predstripe(s, t) / amf;
            }
            numavlogp /= numframes;

            // denominator: full lattice forward/backward; fills dengammasstripe
            // auto_timer dengammatimer;
            double denavlogp = lattices[i]->second.forwardbackward(parallellattice,
                                                                   (const msra::math::ssematrixbase&) predstripe, (const msra::asr::simplesenonehmm&) m_hset,
                                                                   (msra::math::ssematrixbase&) dengammasstripe, (msra::math::ssematrixbase&) gammasbuffer /*empty, not used*/,
                                                                   lmf, wp, amf, boostmmifactor, seqsMBRmode, uidsstripe, boundariesstripe);
            objectValue += (ElemType)((numavlogp - denavlogp) * numframes);

            if (samplesInRecurrentStep == 1)
            {
                tempmatrix = gammafromlattice.ColumnSlice(ts, numframes);
            }

            // copy gamma to tempmatrix
            if (m_deviceid == CPUDEVICE)
            {
                CopyFromSSEMatrixToCNTKMatrix(dengammas, numrows, numframes, tempmatrix, gammafromlattice.GetDeviceId());
            }
            else
                parallellattice.getgamma(tempmatrix);

            // set gamma for multi channel: re-interleave back into the parallel layout
            if (samplesInRecurrentStep > 1)
            {
                Microsoft::MSR::CNTK::Matrix gammaFromLatticeForCurrentParallelUtterance = gammafromlattice.ColumnSlice(mapi + (validframes[mapi] * samplesInRecurrentStep), ((numframes - 1) * samplesInRecurrentStep) + 1);
                gammaFromLatticeForCurrentParallelUtterance.CopyColumnsStrided(tempmatrix, numframes, 1, samplesInRecurrentStep);
            }

            if (doreferencealign)
            {
                // write the 1-hot reference alignment for this utterance
                for (size_t nframe = 0; nframe < numframes; nframe++)
                {
                    size_t uid = uidsstripe[nframe];
                    if (samplesInRecurrentStep > 1)
                        labels(uid, (nframe + validframes[mapi]) * samplesInRecurrentStep + mapi) = 1.0;
                    else
                        labels(uid, ts + nframe) = 1.0;
                }
            }
            if (samplesInRecurrentStep > 1)
                validframes[mapi] += numframes; // advance the cursor within the parallel sequence
            fprintf(stderr, "dengamma value %f\n", denavlogp);
            ts += numframes;
        }
        functionValues.SetValue(objectValue);
    }

    // Calculate CTC score
    // totalScore (output): total CTC score at element (0,0)
    // prob (input): the posterior output from the network (log softmax of right)
    // maxIndexes (input): indexes of max elements in label input vectors
    // maxValues (input): values of max elements in label input vectors
    // labels (input): 1-hot vector with frame-level phone labels
    // CTCPosterior (output): CTC posterior
    // blankTokenId (input): id of the blank token.
// If specified as SIZE_MAX, will be replaced with (numberOfLabels - 1)
    // delayConstraint -- label output delay constraint introduced during training that allows to have shorter delay during inference.
    //                    This using the original time information to enforce that CTC tokens only get aligned within a time margin.
    //                    Setting this parameter smaller will result in shorted delay between label output during decoding, yet may hurt accuracy.
    // delayConstraint=-1 means no constraint
    //
    // NOTE(review): template arguments on Matrix/shared_ptr/vector were stripped
    // by extraction; tokens kept exactly as found.
    void doCTC(Microsoft::MSR::CNTK::Matrix& totalScore,
               const Microsoft::MSR::CNTK::Matrix& prob,
               const Microsoft::MSR::CNTK::Matrix& maxIndexes,
               const Microsoft::MSR::CNTK::Matrix& maxValues,
               Microsoft::MSR::CNTK::Matrix& CTCPosterior,
               const std::shared_ptr pMBLayout,
               size_t blankTokenId,
               int delayConstraint = -1)
    {
        const auto numParallelSequences = pMBLayout->GetNumParallelSequences();
        const auto numSequences = pMBLayout->GetNumSequences();
        const size_t numRows = prob.GetNumRows();
        const size_t numCols = prob.GetNumCols();
        m_deviceid = prob.GetDeviceId();

        // Phone sequences/bounds are assembled on the CPU, then transferred below.
        Microsoft::MSR::CNTK::Matrix matrixPhoneSeqs(CPUDEVICE);
        Microsoft::MSR::CNTK::Matrix matrixPhoneBounds(CPUDEVICE);
        std::vector> allUttPhoneSeqs;
        std::vector> allUttPhoneBounds;
        int maxPhoneNum = 0; // NOTE(review): int compared against size_t sizes below -- signed/unsigned mix
        std::vector phoneSeq;
        std::vector phoneBound;

        if (blankTokenId == SIZE_MAX)
            blankTokenId = numRows - 1; // default: last row is the blank token

        size_t mbsize = numCols / numParallelSequences; // time steps per parallel sequence

        // Prepare data structures from the reader
        // the position of the first frame of each utterance in the minibatch channel. We need this because each channel may contain more than one utterance.
        std::vector uttBeginFrame;
        // the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch
        std::vector uttFrameNum;
        // the phone number of each utterance. The size of this vector = the number of all utterances in this minibatch
        std::vector uttPhoneNum;
        // map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance.
        std::vector uttToChanInd;
        uttBeginFrame.reserve(numSequences);
        uttFrameNum.reserve(numSequences);
        uttPhoneNum.reserve(numSequences);
        uttToChanInd.reserve(numSequences);

        size_t seqId = 0; // sanity check: sequences are expected in consecutive id order
        for (const auto& seq : pMBLayout->GetAllSequences())
        {
            if (seq.seqId == GAP_SEQUENCE_ID)
                continue;
            assert(seq.seqId == seqId);
            seqId++;
            uttToChanInd.push_back(seq.s);
            size_t numFrames = seq.GetNumTimeSteps();
            uttBeginFrame.push_back(seq.tBegin);
            uttFrameNum.push_back(numFrames);

            // Get the phone list and boundaries
            phoneSeq.clear();
            phoneSeq.push_back(SIZE_MAX); // sentinel marking sequence start
            phoneBound.clear();
            phoneBound.push_back(0);
            int prevPhoneId = -1; // NOTE(review): assigned from a (size_t) cast below -- int/size_t mix
            size_t startFrameInd = seq.tBegin * numParallelSequences + seq.s;
            size_t endFrameInd = seq.tEnd * numParallelSequences + seq.s;
            size_t frameCounter = 0;
            for (auto frameInd = startFrameInd; frameInd < endFrameInd; frameInd += numParallelSequences, frameCounter++)
            {
                // Labels are represented as 1-hot vectors for each frame
                // If the 1-hot vectors may have either value 1 or 2 at the position of the phone corresponding to the frame:
                // 1 means the frame is within phone boundary
                // 2 means the frame is the phone boundary
                if (maxValues(0, frameInd) == 2)
                {
                    prevPhoneId = (size_t) maxIndexes(0, frameInd);

                    // interleave a blank before each phone, per the standard CTC topology
                    phoneSeq.push_back(blankTokenId);
                    phoneBound.push_back(frameCounter);
                    phoneSeq.push_back(prevPhoneId);
                    phoneBound.push_back(frameCounter);
                }
            }
            // trailing blank + end sentinel
            phoneSeq.push_back(blankTokenId);
            phoneBound.push_back(numFrames);
            phoneSeq.push_back(SIZE_MAX);
            phoneBound.push_back(numFrames);

            allUttPhoneSeqs.push_back(phoneSeq);
            allUttPhoneBounds.push_back(phoneBound);

            uttPhoneNum.push_back(phoneSeq.size());
            if (phoneSeq.size() > maxPhoneNum)
                maxPhoneNum = phoneSeq.size();
        }

        // pack the per-utterance vectors into [maxPhoneNum x numSequences] matrices
        matrixPhoneSeqs.Resize(maxPhoneNum, numSequences);
        matrixPhoneBounds.Resize(maxPhoneNum, numSequences);
        for (size_t i = 0; i < numSequences; i++)
        {
            for (size_t j = 0; j < allUttPhoneSeqs[i].size(); j++)
            {
                matrixPhoneSeqs(j, i) = (ElemType) allUttPhoneSeqs[i][j];
                matrixPhoneBounds(j, i) = (ElemType) allUttPhoneBounds[i][j];
            }
        }
        // Once these matrices populated, move them to the active device
        matrixPhoneSeqs.TransferFromDeviceToDevice(CPUDEVICE, m_deviceid);
        matrixPhoneBounds.TransferFromDeviceToDevice(CPUDEVICE, m_deviceid);

        // compute alpha, beta and CTC scores
        Microsoft::MSR::CNTK::Matrix alpha(m_deviceid);
        Microsoft::MSR::CNTK::Matrix beta(m_deviceid);
        CTCPosterior.AssignCTCScore(prob, alpha, beta, matrixPhoneSeqs, matrixPhoneBounds, totalScore, uttToChanInd,
                                    uttBeginFrame, uttFrameNum, uttPhoneNum, numParallelSequences, mbsize, blankTokenId, delayConstraint, /*isColWise=*/true);

        Microsoft::MSR::CNTK::Matrix rowSum(m_deviceid);
        rowSum.Resize(1, numCols);

        // Normalize the CTC scores
        CTCPosterior.VectorSum(CTCPosterior, rowSum, /*isColWise=*/true);
        CTCPosterior.RowElementDivideBy(rowSum);
    }

    // NOTE(review): the comment block below appears copy-pasted from doCTC above;
    // it actually precedes the RNN-T two-dimensional forward/backward routine.
    // Calculate CTC score
    // totalScore (output): total CTC score at element (0,0)
    // prob (input): the posterior output from the network (log softmax of right)
    // maxIndexes (input): indexes of max elements in label input vectors
    // maxValues (input): values of max elements in label input vectors
    // labels (input): 1-hot vector with frame-level phone labels
    // CTCPosterior (output): CTC posterior
    // blankTokenId (input): id of the blank token. If specified as SIZE_MAX, will be replaced with (numberOfLabels - 1)
    // delayConstraint -- label output delay constraint introduced during training that allows to have shorter delay during inference. This using the original time information to enforce that CTC tokens only get aligned within a time margin.
    // Setting this parameter smaller will result in shorted delay between label output during decoding, yet may hurt accuracy.
// delayConstraint=-1 means no constraint
    //
    // Two-dimensional (frame x phone) forward/backward for RNN-T style training:
    // gathers per-utterance frame/phone layout info from the two MBLayouts,
    // packs the phone sequences into a matrix, and calls AssignRNNTScore to
    // compute alpha/beta and the total score in-place on mergedinput.
    // NOTE(review): template arguments were stripped by extraction; tokens kept
    // exactly as found. F/G are presumably the transcription (encoder) and
    // prediction network outputs -- verify against callers.
    void twodimForwardBackward(Microsoft::MSR::CNTK::Matrix& totalScore,
                               Microsoft::MSR::CNTK::Matrix& F,
                               Microsoft::MSR::CNTK::Matrix& G,
                               Microsoft::MSR::CNTK::Matrix& mergedinput,
                               const Microsoft::MSR::CNTK::Matrix& maxIndexes,
                               Microsoft::MSR::CNTK::Matrix& m_derivative,
                               const std::shared_ptr pMBLayout,
                               const std::shared_ptr phoneMBLayout,
                               size_t blankTokenId)
    {
        const auto numParallelSequences = pMBLayout->GetNumParallelSequences();
        const auto numPhoneParallelSequences = phoneMBLayout->GetNumParallelSequences();
        const auto numSequences = pMBLayout->GetNumSequences();
        //assert(numParallelSequences==phoneMBLayout->GetNumParallelSequences());
        assert(numSequences == phoneMBLayout->GetNumSequences());
        const size_t numRows = F.GetNumRows();
        const size_t numCols = F.GetNumCols();
        const size_t numPhoneCols = G.GetNumCols();
        size_t maxFrameNum = numCols / numParallelSequences;
        size_t maxPhoneNum = numPhoneCols / numPhoneParallelSequences;

        // Prepare data structures from the reader
        // the position of the first frame of each utterance in the minibatch channel. We need this because each channel may contain more than one utterance.
        std::vector uttFrameBeginIdx, uttPhoneBeginIdx;
        // the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch
        std::vector uttFrameNum;
        // the phone number of each utterance. The size of this vector = the number of all utterances in this minibatch
        std::vector uttPhoneNum;
        // map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance.
        std::vector uttFrameToChanInd, uttPhoneToChanInd;
        std::vector phoneSeq;
        std::vector> allUttPhoneSeqs;
        uttFrameNum.reserve(numSequences);
        uttPhoneNum.reserve(numSequences);
        uttFrameToChanInd.reserve(numSequences);
        uttPhoneToChanInd.reserve(numSequences);
        uttFrameBeginIdx.reserve(numSequences);
        uttPhoneBeginIdx.reserve(numSequences);

        //get utt information, such as channel map id and utt begin frame, utt frame num, utt phone num for frame and phone respectively....
        size_t seqId = 0; //frame
        size_t totalframenum = 0, totalphonenum = 0;
        for (const auto& seq : pMBLayout->GetAllSequences())
        {
            if (seq.seqId == GAP_SEQUENCE_ID)
            {
                continue;
            }
            assert(seq.seqId == seqId);
            seqId++;
            uttFrameToChanInd.push_back(seq.s);
            size_t numFrames = seq.GetNumTimeSteps();
            uttFrameBeginIdx.push_back(seq.tBegin);
            uttFrameNum.push_back(numFrames);
            totalframenum += numFrames;
        }
        seqId = 0; //phone
        for (const auto& seq : phoneMBLayout->GetAllSequences())
        {
            if (seq.seqId == GAP_SEQUENCE_ID)
            {
                continue;
            }
            assert(seq.seqId == seqId);
            seqId++;
            uttPhoneToChanInd.push_back(seq.s);
            size_t numFrames = seq.GetNumTimeSteps();
            uttPhoneBeginIdx.push_back(seq.tBegin);
            uttPhoneNum.push_back(numFrames);
            totalphonenum += numFrames;
            // collect this utterance's phone ids from the (strided) phone layout
            size_t startFrameInd = seq.tBegin * numPhoneParallelSequences + seq.s;
            size_t endFrameInd = seq.tEnd * numPhoneParallelSequences + seq.s;
            size_t frameCounter = 0;
            phoneSeq.clear();
            for (auto frameInd = startFrameInd; frameInd < endFrameInd; frameInd += numPhoneParallelSequences, frameCounter++)
            {
                phoneSeq.push_back((size_t) maxIndexes(0, frameInd));
            }
            allUttPhoneSeqs.push_back(phoneSeq);
        }

        // for cpu
        m_deviceid_gpu = maxIndexes.GetDeviceId();
        m_deviceid = m_deviceid_gpu;
        Microsoft::MSR::CNTK::Matrix matrixPhoneSeqs(CPUDEVICE);
        //Microsoft::MSR::CNTK::Matrix matrixPhoneBounds(CPUDEVICE);
        // copy phone seq to matrix
        matrixPhoneSeqs.Resize(maxPhoneNum, numSequences);
        //matrixPhoneBounds.Resize(maxPhoneNum, numSequences);
        for (size_t i = 0; i < numSequences; i++)
        {
            for (size_t j = 0; j < allUttPhoneSeqs[i].size(); j++)
            {
                matrixPhoneSeqs(j, i) = (ElemType) allUttPhoneSeqs[i][j];
                // matrixPhoneBounds(j, i) = (ElemType)allUttPhoneBounds[i][j];
            }
        }
        // Once these matrices populated, move them to the active device
        matrixPhoneSeqs.TransferFromDeviceToDevice(CPUDEVICE, m_deviceid);
        //matrixPhoneBounds.TransferFromDeviceToDevice(CPUDEVICE, m_deviceid);

        //calculate the memory need for f*g
        // (each utterance needs a frames x phones block in the merged output)
        std::vector uttBeginForOutputditribution;
        uttBeginForOutputditribution.reserve(numSequences);
        size_t totalcol = 0;
        for (size_t s = 0; s < numSequences; s++)
        {
            uttBeginForOutputditribution.push_back(totalcol);
            totalcol += uttFrameNum[s] * uttPhoneNum[s];
            /*if (uttFrameNum[s] > 10)
                uttFrameNum[s] -= 3;*/
        }

        //compute f+g
        Microsoft::MSR::CNTK::Matrix matrixOutputDistribution(m_deviceid_gpu);
        /*G.TransferFromDeviceToDevice(m_deviceid_gpu, CPUDEVICE);
        for (size_t uttId = 0; uttId < numSequences; uttId++)
        {
            for (size_t u = 0; u < uttPhoneNum[uttId]; u++)
            {
                size_t phonePosInMB = (u + uttPhoneBeginIdx[uttId])*numPhoneParallelSequences + uttPhoneToChanInd[uttId];
                size_t phoneId = allUttPhoneSeqs[uttId][u];
                for (size_t k = 0; k < G.GetNumRows(); k++)
                {
                    if (k == phoneId)
                        G.SetValue(k, phonePosInMB, 0.0);
                    else
                        G.SetValue(k, phonePosInMB, (float)LZERO);
                }
            }
        }
        G.TransferFromDeviceToDevice(CPUDEVICE, m_deviceid_gpu);*/
        //G.SetValue(0.0);
        //F.Print("H");
        //G.Print("G");
        //matrixOutputDistribution.Resize(numRows, totalcol);
        //matrixOutputDistribution.AssignUserOp1(F, G, uttFrameToChanInd, uttPhoneToChanInd, uttFrameBeginIdx, uttPhoneBeginIdx, uttBeginForOutputditribution, uttFrameNum, uttPhoneNum,
        //                                       totalcol, numParallelSequences, numPhoneParallelSequences);
        //matrixOutputDistribution.Print("h");
        //log softmax of f+g
        //mergedinput.InplaceLogSoftmax(true);
        /*Microsoft::MSR::CNTK::Matrix logsoftmax(m_deviceid_gpu);
        logsoftmax.SetValue(mergedinput);
        logsoftmax.InplaceLogSoftmax(true);*/
        //matrixOutputDistribution.Print("prob");
        // forward backward to compute alpha, beta derivaitves
        Microsoft::MSR::CNTK::Matrix alpha(m_deviceid_gpu);
        Microsoft::MSR::CNTK::Matrix beta(m_deviceid_gpu);
        m_derivative.TransferToDeviceIfNotThere(m_deviceid_gpu);
        // computes score and derivatives in-place on mergedinput
        mergedinput.AssignRNNTScore(mergedinput, alpha, beta, matrixPhoneSeqs, matrixPhoneSeqs, uttFrameToChanInd, uttFrameBeginIdx, uttBeginForOutputditribution,
                                    uttPhoneToChanInd, uttPhoneBeginIdx, uttFrameNum, uttPhoneNum, numParallelSequences, numPhoneParallelSequences, maxPhoneNum, maxFrameNum, totalScore, blankTokenId, -1, true);
        //mergedinput.InplaceExp();
        //m_derivative.AssignElementProductOf(m_derivative, mergedinput);
        //mergedinput.ReleaseMemory();
        ElemType finalscore = 0;
        //m_derivative.Print("RNNT");
        finalscore = totalScore.Get00Element();
        //fprintf(stderr, "finalscore:%f\n", finalscore);
        // debug dump when the score looks implausible (out of [0, 50])
        if (finalscore > 50 || finalscore < 0)
        {
            for (size_t i = 0; i < uttFrameNum.size(); i++)
            {
                fprintf(stderr, "framenum:%d\n", (int) (uttFrameNum[i]));
            }
            matrixPhoneSeqs.Print("phone seq");
            //matrixPhoneBounds.Print("phone bound");
        }
        /*alpha.Print("alpha");
        beta.Print("beta");
        prob.Print("prob");*/
        //m_derivative.TransferFromDeviceToDevice(CPUDEVICE, m_deviceid_gpu);
        //matrixOutputDistribution.ReleaseMemory();
        //compute derivatives for F and G
        /*if (m_derivativeForF.GetDeviceId() != CPUDEVICE)
            printf("m_derivativeForF before is in GPU");
        if (m_derivativeForG.GetDeviceId() != CPUDEVICE)
            printf("m_derivativeForG before is in GPU");*/
        /*m_derivativeForF.TransferFromDeviceToDevice(CPUDEVICE, m_deviceid_gpu);
        m_derivativeForG.TransferFromDeviceToDevice(CPUDEVICE, m_deviceid_gpu);
        m_derivativeForF.SetValue(0.0);
        m_derivativeForG.SetValue(0.0);
        m_derivativeForF.AssignUserOp2(RNNTPosterior, uttFrameToChanInd, uttPhoneToChanInd, uttFrameBeginIdx, uttPhoneBeginIdx, uttBeginForOutputditribution, uttFrameNum, uttPhoneNum,
                                       numParallelSequences, numPhoneParallelSequences, maxFrameNum, maxPhoneNum, 0);
        m_derivativeForG.AssignUserOp2(RNNTPosterior, uttFrameToChanInd, uttPhoneToChanInd, uttFrameBeginIdx, uttPhoneBeginIdx, uttBeginForOutputditribution, uttFrameNum, uttPhoneNum,
                                       numParallelSequences, numPhoneParallelSequences, maxFrameNum, maxPhoneNum, 1);*/
        //do derivative norm
        /*Microsoft::MSR::CNTK::Matrix tempMatrix1(m_deviceid_gpu), tempMatrix2(m_deviceid_gpu);
        Microsoft::MSR::CNTK::Matrix::VectorSum(m_derivativeForF, tempMatrix1, false);
        tempMatrix2.AssignVectorNorm2Of(tempMatrix1,true);
        //tempMatrix1.Print("sum of F");
        //tempMatrix2.Print("norm of F");
        //fprintf(stderr, "framenum %d phonenum %d\n", (int) totalframenum, (int) totalphonenum);
        ElemType norm_coef = (ElemType) 10.0/ tempMatrix2.Get00Element();
        Microsoft::MSR::CNTK::Matrix::Scale(norm_coef, m_derivativeForF);
        Microsoft::MSR::CNTK::Matrix::Scale(norm_coef * (ElemType) totalphonenum / (ElemType)totalframenum, m_derivativeForG);*/
        /*//tempMatrix2.Print("norm of F");
        Microsoft::MSR::CNTK::Matrix::VectorSum(m_derivativeForG, tempMatrix1, false);
        //tempMatrix1.Print("sum of G");
        tempMatrix2.AssignVectorNorm2Of(tempMatrix1, true);
        tempMatrix2.Print("norm of G");
        if (m_derivativeForF.GetDeviceId() != CPUDEVICE)
            printf("m_derivativeForF after is in GPU");
        if (m_derivativeForG.GetDeviceId() != CPUDEVICE)
            printf("m_derivativeForG after is in GPU");
        m_derivativeForF.Print("derivative for F");
        m_derivativeForG.Print("derivative for G");
        m_derivativeForF.TransferFromDeviceToDevice(CPUDEVICE, m_deviceid_gpu);
        m_derivativeForG.TransferFromDeviceToDevice(CPUDEVICE, m_deviceid_gpu);
        printf("finish gamma");*/
    }

private:
    // Helper methods for copying between ssematrix objects and CNTK matrices

    // Copies the first numCols columns of a CNTK Matrix (possibly GPU-resident)
    // into a float ssematrix, staging through m_intermediateCUDACopyBuffer.
    // Only valid when ElemType is float (enforced at runtime below).
    void CopyFromCNTKMatrixToSSEMatrix(const Microsoft::MSR::CNTK::Matrix& src, size_t numCols, msra::math::ssematrixbase& dest)
    {
        if (!std::is_same::value)
        {
            LogicError("Cannot copy between a SSE matrix and a non-float type CNTK Matrix object!");
        }

        size_t numRows = src.GetNumRows();
        const Microsoft::MSR::CNTK::Matrix srcSlice = src.ColumnSlice(0, numCols);
        // grow-only staging buffer, reused across calls
        if ((m_intermediateCUDACopyBuffer == nullptr) || (m_intermediateCUDACopyBufferSize < srcSlice.GetNumElements()))
        {
            m_intermediateCUDACopyBuffer = AllocateIntermediateBuffer(srcSlice.GetDeviceId(), srcSlice.GetNumElements());
            m_intermediateCUDACopyBufferSize = srcSlice.GetNumElements();
        }

        ElemType* pBuf = m_intermediateCUDACopyBuffer.get();
        srcSlice.CopyToArray(pBuf, m_intermediateCUDACopyBufferSize);
        // CopyToArray may reallocate the destination; that must not happen since we pre-sized it
        if (pBuf != m_intermediateCUDACopyBuffer.get())
        {
            LogicError("Unexpected re-allocation of destination CPU buffer in Matrix::CopyToArray!");
        }

        // fast path: dest columns are contiguous, so one memcpy suffices
        if ((dest.getcolstride() == dest.rows()) && (numRows == dest.rows()))
        {
            memcpy(&dest(0, 0), (float*) pBuf, sizeof(ElemType) * numRows * numCols);
        }
        else
        {
            // We need to copy columnwise
            for (size_t i = 0; i < numCols; ++i)
            {
                memcpy(&dest(0, i), (float*) (pBuf + (i * numRows)), sizeof(ElemType) * numRows);
            }
        }
    }

    // Copies a numRows x numCols block of a float ssematrix into a CNTK Matrix
    // on the given device, staging through m_intermediateCUDACopyBuffer.
    // Only valid when ElemType is float (enforced at runtime below).
    void CopyFromSSEMatrixToCNTKMatrix(const msra::math::ssematrixbase& src, size_t numRows, size_t numCols, Microsoft::MSR::CNTK::Matrix& dest, int deviceId)
    {
        if (!std::is_same::value)
        {
            LogicError("Cannot copy between a SSE matrix and a non-float type CNTK Matrix object!");
        }

        size_t numElements = numRows * numCols;
        // grow-only staging buffer, reused across calls
        if ((m_intermediateCUDACopyBuffer == nullptr) || (m_intermediateCUDACopyBufferSize < numElements))
        {
            m_intermediateCUDACopyBuffer = AllocateIntermediateBuffer(deviceId, numElements);
            m_intermediateCUDACopyBufferSize = numElements;
        }

        // fast path: src columns are contiguous, so one memcpy suffices
        if ((src.getcolstride() == src.rows()) && (numRows == src.rows()))
        {
            memcpy((float*) m_intermediateCUDACopyBuffer.get(), &src(0, 0), sizeof(float) * numRows * numCols);
        }
        else
        {
            // We need to copy columnwise
            for (size_t i = 0; i < numCols; ++i)
            {
                memcpy((float*) (m_intermediateCUDACopyBuffer.get() + (i * numRows)), &src(0, i), sizeof(float) * numRows);
            }
        }

        dest.SetValue(numRows, numCols, deviceId, m_intermediateCUDACopyBuffer.get(), 0);
    }

    // TODO: This function is duplicate of the one in HTLMLFReader.
// This should be moved to a common utils library and removed from here as well as HTLMLFReader std::unique_ptr& GetCUDAAllocator(int deviceID) { if (m_cudaAllocator != nullptr) { if (m_cudaAllocator->GetDeviceId() != deviceID) { m_cudaAllocator.reset(nullptr); } } if (m_cudaAllocator == nullptr) { m_cudaAllocator.reset(new Microsoft::MSR::CNTK::CUDAPageLockedMemAllocator(deviceID)); } return m_cudaAllocator; } // TODO: This function is duplicate of the one in HTLMLFReader. // This should be moved to a common utils library and removed from here as well as HTLMLFReader std::shared_ptr AllocateIntermediateBuffer(int deviceID, size_t numElements) { if (deviceID >= 0) { // Use pinned memory for GPU devices for better copy performance size_t totalSize = sizeof(ElemType) * numElements; return std::shared_ptr((ElemType*) GetCUDAAllocator(deviceID)->Malloc(totalSize), [this, deviceID](ElemType* p) { this->GetCUDAAllocator(deviceID)->Free((char*) p); }); } else { return std::shared_ptr(new ElemType[numElements], [](ElemType* p) { delete[] p; }); } } protected: msra::asr::simplesenonehmm m_hset; msra::lattices::lattice::parallelstate parallellattice; msra::lattices::mbrclassdefinition mbrclassdef = msra::lattices::senone; // defines the unit for minimum bayesian risk bool initialmark; msra::dbn::matrix dengammas; msra::dbn::matrix pred; int m_deviceid; // -1: cpu int m_deviceid_gpu; size_t m_maxframenum; float lmf; // Note that 9 was best for Fisher --these should best be configurable float wp; float amf; msra::dbn::matrix gammasbuffer; std::vector boundary; float boostmmifactor; bool seqsMBRmode; private: std::unique_ptr m_cudaAllocator; std::shared_ptr m_intermediateCUDACopyBuffer; size_t m_intermediateCUDACopyBufferSize; }; } // namespace lattices } // namespace msra