// Source repository: https://github.com/wilkeraziz/mosesdecoder
// Raw File: Main.cpp
// Tip revision: 3f5dfbf9f27ab2f26e58be8e9d496e1e6214d5c3 authored by Lane Schwartz on 15 October 2013, 20:50:25 UTC
//   Plug in actual calls to neuralLM
// $Id$
/***********************************************************************
 Moses - factored phrase-based, hierarchical and syntactic language decoder
 Copyright (C) 2009 Hieu Hoang

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 ***********************************************************************/

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
#include <iterator>
#include <cassert>
#include "moses/InputFileStream.h"
#include "moses/Util.h"
#include "moses/UserMessage.h"
#include "OnDiskWrapper.h"
#include "SourcePhrase.h"
#include "TargetPhrase.h"
#include "TargetPhraseCollection.h"
#include "Word.h"
#include "Vocab.h"
#include "Main.h"

using namespace std;
using namespace OnDiskPt;

/**
 * Command-line entry point: reads a text phrase table line by line and
 * writes it out through an OnDiskWrapper.
 *
 * Arguments (exactly seven, see the usage message):
 *   numSourceFactors numTargetFactors numScores tableLimit sortScoreIndex
 *   inputPath outputPath
 *
 * Returns 0 on success and 1 when the arguments are wrong, when
 * sortScoreIndex is outside [0, numScores), or when BeginSave reports failure.
 * These conditions are checked at runtime rather than with assert so that
 * they are still caught in builds compiled with NDEBUG.
 */
int main (int argc, char * const argv[])
{
  Moses::ResetUserTime();
  Moses::PrintUserTime("Starting");

  if (argc != 8) {
    std::cerr << "Usage: " << argv[0] << " numSourceFactors numTargetFactors numScores tableLimit sortScoreIndex inputPath outputPath" << std::endl;
    return 1;
  }

  int numSourceFactors = Moses::Scan<int>(argv[1]);
  int numTargetFactors = Moses::Scan<int>(argv[2]);
  int numScores        = Moses::Scan<int>(argv[3]);
  int tableLimit       = Moses::Scan<int>(argv[4]);
  int sortScoreInd     = Moses::Scan<int>(argv[5]);

  // Validate before publishing the index: a negative or too-large value
  // would otherwise select a score that does not exist.
  if (sortScoreInd < 0 || sortScoreInd >= numScores) {
    std::cerr << "sortScoreIndex (" << sortScoreInd
              << ") must be >= 0 and < numScores (" << numScores << ")" << std::endl;
    return 1;
  }
  TargetPhraseCollection::s_sortScoreInd = sortScoreInd;

  const std::string filePath = argv[6];
  const std::string destPath = argv[7];

  Moses::InputFileStream inStream(filePath);

  OnDiskWrapper onDiskWrapper;
  if (!onDiskWrapper.BeginSave(destPath, numSourceFactors, numTargetFactors, numScores)) {
    std::cerr << "BeginSave failed for output path " << destPath << std::endl;
    return 1;
  }

  PhraseNode &rootNode = onDiskWrapper.GetRootSourceNode();
  size_t lineNum = 0;

  // Lines are read into a std::string so there is no fixed length limit; a
  // fixed-size buffer would put the stream into a failed state on an
  // over-long line and silently end the loop with a truncated table.
  std::string line;
  std::vector<char> lineBuf;

  while (std::getline(inStream, line)) {
    ++lineNum;
    // progress marks on stderr
    if (lineNum%1000 == 0) std::cerr << "." << std::flush;
    if (lineNum%10000 == 0) std::cerr << ":" << std::flush;
    if (lineNum%100000 == 0) std::cerr << lineNum << std::flush;

    // Tokenize() splits its input in place, so hand it a writable,
    // NUL-terminated copy of the line.
    lineBuf.assign(line.begin(), line.end());
    lineBuf.push_back('\0');

    std::vector<float> misc(1);
    SourcePhrase sourcePhrase;
    // NOTE(review): the raw pointer is handed to AddTargetPhrase, which
    // presumably takes ownership -- confirm against PhraseNode.
    TargetPhrase *targetPhrase = new TargetPhrase(numScores);
    OnDiskPt::PhrasePtr spShort = Tokenize(sourcePhrase, *targetPhrase, &lineBuf[0], onDiskWrapper, numScores, misc);
    assert(misc.size() == onDiskWrapper.GetNumCounts());

    rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc, spShort);
  }

  rootNode.Save(onDiskWrapper, 0, tableLimit);
  onDiskWrapper.EndSave();

  Moses::PrintUserTime("Finished");

  return 0;

} // main()

/**
 * Reports whether the current source phrase compares greater than the
 * previous one.
 *
 * Returns false when there is no previous phrase (prevSourcePhrase is NULL).
 * Otherwise currSourcePhrase must be non-NULL (asserted) and the result is
 * that of `*currSourcePhrase > *prevSourcePhrase`.
 */
bool Flush(const OnDiskPt::SourcePhrase *prevSourcePhrase, const OnDiskPt::SourcePhrase *currSourcePhrase)
{
  if (!prevSourcePhrase) {
    // nothing to compare against yet
    return false;
  }

  assert(currSourcePhrase);
  return *currSourcePhrase > *prevSourcePhrase;
}

/**
 * Parses one text phrase-table line and fills in the source and target phrases.
 *
 * The line is split in place on single spaces with strtok, so `line` is
 * modified. The token "|||" advances the parsing stage; every other token is
 * handled according to the current stage:
 *   0     source-phrase token, forwarded to the per-token Tokenize overload;
 *         a non-null returned word is also appended to the returned phrase
 *   1     target-phrase token, forwarded to the per-token Tokenize overload
 *   2     score, stored in targetPhrase at the next score index
 *   3     alignment token, passed to targetPhrase.CreateAlignFromString
 *   4, 5  token is skipped, but the stage still advances by one
 *   6     token is parsed as a float and stored in misc[0]; stage advances
 *   other assert(false)
 * Stages 4-6 therefore consume the first three tokens after the fourth "|||":
 * the first two are ignored and the third is kept.
 *
 * After the loop the number of scores read is asserted to equal numScores and
 * targetPhrase.SortAlign() is called.
 *
 * Returns the newly allocated phrase collected while in stage 0.
 */
OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc)
{
  size_t stage = 0;
  size_t numScoresRead = 0;

  OnDiskPt::PhrasePtr shortSource(new Phrase());

  for (char *tok = strtok(line, " "); tok != NULL; tok = strtok(NULL, " ")) {
    if (strcmp(tok, "|||") == 0) {
      // field separator
      ++stage;
      continue;
    }

    if (stage == 0) {
      WordPtr srcWord = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
      if (srcWord != NULL)
        shortSource->AddWord(srcWord);
    } else if (stage == 1) {
      Tokenize(targetPhrase, tok, false, true, onDiskWrapper);
    } else if (stage == 2) {
      float value = Moses::Scan<float>(tok);
      targetPhrase.SetScore(value, numScoresRead);
      ++numScoresRead;
    } else if (stage == 3) {
      targetPhrase.CreateAlignFromString(tok);
    } else if (stage == 4 || stage == 5) {
      // token not stored; just move on to the next one
      ++stage;
    } else if (stage == 6) {
      // the third token after the alignments is the one that is kept
      misc[0] = Moses::Scan<float>(tok);
      ++stage;
    } else {
      assert(false);
    }
  }

  assert(numScoresRead == numScores);
  targetPhrase.SortAlign();
  return shortSource;
} // Tokenize()

/**
 * Converts a single token into one or two words and appends them to `phrase`.
 *
 * A token that starts with '[' and ends with ']' is a non-terminal; anything
 * else is a terminal.
 *   - terminal: one word is created from the whole token, appended, and returned.
 *   - non-terminal with no further '[' from index 2 onwards (e.g. "[X]"):
 *     one word is created from the whole token and appended; a null WordPtr
 *     is returned.
 *   - non-terminal with a second '[' (e.g. "[X][Y]"): the part before the
 *     second '[' is appended when addSourceNonTerm is true, the part from the
 *     second '[' onwards is appended when addTargetNonTerm is true. Only the
 *     latter word is returned; otherwise the result is a null WordPtr.
 *
 * Words are created with Word::CreateFromString using onDiskWrapper's vocab.
 */
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
              , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
              , OnDiskPt::OnDiskWrapper &onDiskWrapper)
{
  const size_t length = token.size();
  const bool isNonTerm = length > 0 && token[0] == '[' && token[length - 1] == ']';

  if (!isNonTerm) {
    // terminal
    WordPtr terminal(new Word());
    terminal->CreateFromString(token, onDiskWrapper.GetVocab());
    phrase.AddWord(terminal);
    return terminal;
  }

  const size_t secondBracket = token.find_first_of("[", 2);

  if (secondBracket == string::npos) {
    // lhs - a single non-terminal label
    WordPtr lhs(new Word());
    lhs->CreateFromString(token, onDiskWrapper.GetVocab());
    phrase.AddWord(lhs);
    return OnDiskPt::WordPtr();
  }

  // paired source & target non-terminals
  OnDiskPt::WordPtr result;

  if (addSourceNonTerm) {
    const string sourceLabel = token.substr(0, secondBracket);
    WordPtr sourceWord(new Word());
    sourceWord->CreateFromString(sourceLabel, onDiskWrapper.GetVocab());
    phrase.AddWord(sourceWord);
  }

  if (addTargetNonTerm) {
    const string targetLabel = token.substr(secondBracket, length - secondBracket);
    WordPtr targetWord(new Word());
    targetWord->CreateFromString(targetLabel, onDiskWrapper.GetVocab());
    phrase.AddWord(targetWord);
    result = targetWord;
  }

  return result;
}

void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const ::AlignType &alignments)
{
  for (int ind = alignments.size() - 1; ind >= 0; --ind) {
    const ::AlignPair &alignPair = alignments[ind];
    size_t sourcePos = alignPair.first
                       ,targetPos = alignPair.second;

    const string &target = targetToks[targetPos];
    sourceToks.insert(sourceToks.begin() + sourcePos + 1, target);

  }
}

class AlignOrderer
{
public:
  bool operator()(const ::AlignPair &a, const ::AlignPair &b) const {
    return a.first < b.first;
  }
};

/**
 * Sorts the alignment pairs in place with AlignOrderer, i.e. ascending by
 * their `first` member. std::sort is not stable, so the relative order of
 * pairs sharing the same `first` is unspecified.
 */
void SortAlign(::AlignType &alignments)
{
  AlignOrderer byFirst;
  std::sort(alignments.begin(), alignments.end(), byFirst);
}
// back to top