Revision 2c7a1608709e510c64b47c9c7c2c53451fa0d445 authored by abarun on 11 April 2010, 21:42:19 UTC, committed by abarun on 11 April 2010, 21:42:19 UTC
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/josiah@3114 1f5c12ca-751b-0410-a591-d2e778427230
1 parent 8fa4e58
Raw File
Model1.cpp
#include <algorithm>
#include <functional>
#include <fstream>
#include <numeric>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/archive/binary_iarchive.hpp>
#include <boost/bind.hpp>
#include <boost/foreach.hpp>
#include <boost/serialization/map.hpp>
#include <boost/serialization/vector.hpp>
#include <boost/iterator/transform_iterator.hpp>
#include "File.h" 
#include "Model1.h"
#include "Gibbler.h"

#define foreach BOOST_FOREACH 

namespace Josiah {

// Builds the lookup table from Moses factor ids to vocabulary ids.
//
// For every entry of `v`, the entry's `right` member is registered with the
// factor collection `c` (direction `d`, factor type `t`). The id of the
// returned factor is used as an index into _vocab_id, and the entry's `left`
// member is stored there. Slots created while growing the table that are
// never assigned keep the filler value -1.
moses_factor_to_vocab_id::moses_factor_to_vocab_id(const vocabulary& v,
  const Moses::FactorDirection d, const Moses::FactorType t,
  Moses::FactorCollection& c){

  const vocabulary::const_iterator last = v.end();
  for (vocabulary::const_iterator entry = v.begin(); entry != last; ++entry){
    const size_t slot = c.AddFactor(d, t, entry->right)->GetId();
    // grow on demand; new slots are pre-filled with -1
    if (slot >= _vocab_id.size())
      _vocab_id.resize(slot+1, -1);
    _vocab_id[slot] = entry->left;
  }
}

// Default constructor: neither reads nor copies anything; the members are
// default-constructed.
external_m1_node::external_m1_node()
{
}

// Constructs a node by loading its contents from `f` through read().
external_m1_node::external_m1_node(FILE* f)
{
  read(f);
}

// Converts an in-memory node: walks `node` in iteration order and appends each
// entry's `first` to keys and its `second` to data, so that both vectors stay
// index-aligned.
external_m1_node::external_m1_node(const internal_m1_node& node){
  internal_m1_node::const_iterator entry = node.begin();
  while (entry != node.end()){
    keys.push_back(entry->first);
    data.push_back(entry->second);
    ++entry;
  }
}

// Loads this node from `f`: keys first, then data, each through
// Moses::fReadVector.
void external_m1_node::read(FILE* f)
{
  Moses::fReadVector(f, keys);
  Moses::fReadVector(f, data);
}

// Stores this node to `f`: keys first, then data, each through
// Moses::fWriteVector.
// Only the result of the second call (the one for data) is returned; the
// result of writing keys is discarded.
size_t external_m1_node::write(FILE* f) const
{
  Moses::fWriteVector(f, keys);
  const size_t data_result = Moses::fWriteVector(f, data);
  return data_result;
}

// Looks up `key` with a binary search over keys and returns column `col` of
// the matching data row, or 0.0 when the key is not present.
// NOTE(review): std::lower_bound needs keys to be sorted ascending;
// presumably guaranteed by how nodes are built - confirm.
float external_m1_node::score(int key, int col) const {
  const std::vector<int>::const_iterator pos =
    std::lower_bound(keys.begin(), keys.end(), key);

  const bool found = (pos != keys.end()) && (*pos == key);
  if (!found)
    return 0.0;

  // keys and data are index-aligned, so the key's position selects the row
  return data[pos - keys.begin()][col];
}

// Builds an on-disk table from the in-memory table `origin` and keeps copies
// of both vocabularies.
//
// Two files are produced:
//   * `filename`        - the nodes, written with the Moses file helpers;
//   * `filename` + ".i" - both vocabularies and the per-entry file offsets,
//                         serialized with a boost binary archive.
//
// An empty node is written first; its offset is the initial value of all
// f_vocab.size() offset slots, so slots without an entry in `origin` refer to
// that empty node.
// NOTE(review): offsets[...] is indexed by the keys of `origin` without a
// bounds check - presumably every key is < f_vocab.size(); confirm.
external_model1_table::external_model1_table(const internal_model1_table& origin, const vocabulary& f_vocab,
  const vocabulary& e_vocab, const std::string& filename):
  _f_vocab(f_vocab), _e_vocab(e_vocab) {

  _f = Moses::fOpen(filename.c_str(), "wb");

  // node file: empty placeholder node first, then one node per entry of origin
  const OFF_T empty_node_offset = Moses::fTell(_f);
  external_m1_node().write(_f);
  std::vector<OFF_T> offsets(f_vocab.size(), empty_node_offset);

  for (internal_model1_table::const_iterator row = origin.begin(); row != origin.end(); ++row){
    offsets[row->first] = Moses::fTell(_f);
    external_m1_node(row->second).write(_f);
  }
  _init_table(offsets.begin(), offsets.end());

  // index file: vocabularies and offsets via boost::serialization
  const std::string index_path(filename + ".i");
  std::ofstream index_stream(index_path.c_str(), std::ios::binary);
  boost::archive::binary_oarchive archive(index_stream);
  archive << _f_vocab;
  archive << _e_vocab;
  archive << offsets;
}

// Opens a table previously stored under `filename`.
// The index file (`filename` + ".i") is deserialized first - _f_vocab, then
// _e_vocab, then the offsets. Only afterwards is `filename` itself opened in
// "rb" mode and the offsets handed to _init_table().
external_model1_table::external_model1_table(std::string filename){
  const std::string index_path(filename + ".i");
  std::ifstream index_stream(index_path.c_str(), std::ios::binary);
  boost::archive::binary_iarchive archive(index_stream);

  std::vector<OFF_T> offsets;
  archive >> _f_vocab;
  archive >> _e_vocab;
  archive >> offsets;

  _f = Moses::fOpen(filename.c_str(), "rb");
  _init_table(offsets.begin(), offsets.end());
}

void external_model1_table::gc(){
  for(cache::iterator i=_cache.begin(); i!=_cache.end(); ++i){
    _table[*i].free();
  }
  _cache.clear();
} 

// Returns score(e, col) of table entry `f`.
// `f` is recorded in _cache first, which is the set gc() later walks to free
// entries.
float external_model1_table::score(const int f, const int e, const int col)
{
  _cache.insert(f);
  return _table[f]->score(e, col);
}

// Read-only access to the _f_vocab member.
const vocabulary& external_model1_table::f_vocab() const
{
  return _f_vocab;
}
// Read-only access to the _e_vocab member.
const vocabulary& external_model1_table::e_vocab() const
{
  return _e_vocab;
}

// Creates the feature under the name "Model1", storing the table handle and
// the two vocabulary mappers. No sample is attached yet (_sample is NULL
// until init() is called).
model1::model1(model1_table_handle table, vocab_mapper_handle fmap, vocab_mapper_handle emap)
  : SingleValuedFeatureFunction("Model1"),
    _ptable(table),
    _pfmap(fmap),
    _pemap(emap),
    _sample(NULL)
{
}

template <typename ForwardRange, typename BackInsertIterator>
void _moses_words_to_ids(const moses_factor_to_vocab_id& func, 
  const ForwardRange& origin, const BackInsertIterator& dest){
  
  std::vector<int> unfiltered_ids;
  std::transform(origin.begin(), origin.end(), 
    std::back_inserter(unfiltered_ids), func);
  std::remove_copy_if(unfiltered_ids.begin(), unfiltered_ids.end(),
    dest, is_unknown());  
}

// Maps every element of `origin` through `func` and writes all results to
// `dest` in order; nothing is filtered out, so whatever `func` yields for a
// word is passed on as-is.
template <typename ForwardRange, typename BackInsertIterator>
void _moses_words_to_ids_unfiltered(const moses_factor_to_vocab_id& func,
  const ForwardRange& origin, const BackInsertIterator& dest){

  // translate into a temporary first, then hand the ids over
  std::vector<int> all_ids;
  std::transform(origin.begin(), origin.end(),
    std::back_inserter(all_ids), func);

  BackInsertIterator out = dest;
  for (std::vector<int>::iterator id = all_ids.begin(); id != all_ids.end(); ++id){
    *out = *id;
    ++out;
  }
}


// Attaches this feature to `sample` and rebuilds the per-sample state through
// clear_cache(). Only the address of `sample` is stored.
void model1::init(const Sample& sample)
{
  _sample = &sample;
  clear_cache(sample);
}

// Rebuilds all state derived from the source side of `s`:
//   * _source_words    - copy of the sample's source words;
//   * _source_word_ids - their ids via *_pfmap, unknown words dropped;
//   * _sums            - one zero per retained source id;
// and finally asks the table to release its cached entries (gc()).
void model1::clear_cache(const Sample& s){
  _source_words = s.GetSourceWords();

  _source_word_ids.clear();
  _moses_words_to_ids(*_pfmap, s.GetSourceWords(),
    std::back_inserter(_source_word_ids));

  // _sums covers only the known source words, all reset to zero
  _sums.assign(_source_word_ids.size(), 0.0);

  _ptable->gc();
}

// Unary functor returning the logarithm of its argument after clamping the
// argument from below at MODEL1_SUM_FLOOR.
// The result_type / argument_type typedefs publish the signature for generic
// code; here the functor is plugged into boost::transform_iterator (log_iter)
// so that a range of sums can be accumulated in log space.
struct to_log{
  typedef float result_type;
  typedef float argument_type;

  float operator()(float x) const {
    const float clamped = std::max(x,MODEL1_SUM_FLOOR);
    return log(clamped);
  }
};
typedef boost::transform_iterator<to_log,std::vector<float>::iterator> log_iter;

// Scores the attached sample's current target words against the cached
// source ids.
//
// Steps:
//   1. map the target words to ids with *_pemap (unknown words dropped);
//   2. call _compute_inner_sums with the source ids, the target ids and
//      _sums.begin() (presumably the output range - see that helper);
//   3. return  sum_i log(max(_sums[i], MODEL1_SUM_FLOOR))
//              - |source ids| * log(|target ids| + 1).
float model1::computeScore(){
  std::vector<int> target_ids;
  _moses_words_to_ids(*_pemap, _sample->GetTargetWords(),
    std::back_inserter(target_ids));

  _compute_inner_sums(_source_word_ids.begin(), _source_word_ids.end(),
    target_ids.begin(), target_ids.end(),
    _sums.begin());

  // starting value of the log-space sum: the length normalisation term
  const double normaliser = -(_source_word_ids.size()*log(target_ids.size()+1));
  return std::accumulate(log_iter(_sums.begin()), log_iter(_sums.end()), normaliser);
}

// Incremental scoring is not supported by model1: the assertion fires when
// assertions are enabled; otherwise 0.0 is returned.
float model1::getSingleUpdateScore(const TranslationOption* option, const TargetGap& gap)
{
  assert(!"Do not call model1::getSingleUpdateScore");
  return 0.0;
}


// Incremental scoring of a contiguous option pair is not supported by model1:
// the assertion fires when assertions are enabled; otherwise 0.0 is returned.
float model1::getContiguousPairedUpdateScore(const TranslationOption* leftOption, const TranslationOption* rightOption, 
                                             const TargetGap& gap){
  // The message must name THIS function so the assertion output points at the
  // right call (it used to name the discontiguous variant).
  assert(!"Do not call model1::getContiguousPairedUpdateScore");
  return 0.0;
}

// Incremental scoring of a discontiguous option pair is not supported by
// model1: the assertion fires when assertions are enabled; otherwise 0.0 is
// returned.
float model1::getDiscontiguousPairedUpdateScore(const TranslationOption* leftOption, const TranslationOption* rightOption, 
                                             const TargetGap& leftGap, const TargetGap& rightGap){
  // The message must name THIS function so the assertion output points at the
  // right call (it used to name the contiguous variant).
  assert(!"Do not call model1::getDiscontiguousPairedUpdateScore");
  return 0.0;
}
  



// Creates the feature under the name "Model1Inverse", storing the table handle
// and the two vocabulary mappers. No sample is attached yet (_sample is NULL
// until init() is called).
model1_inverse::model1_inverse(model1_table_handle table, vocab_mapper_handle fmap, vocab_mapper_handle emap)
  : SingleValuedFeatureFunction("Model1Inverse"),
    _ptable(table),
    _pfmap(fmap),
    _pemap(emap),
    _sample(NULL)
{
}

 // Attaches this feature to `sample` and rebuilds the per-sample caches through
 // clear_cache(). Only the address of `sample` is stored.
 void model1_inverse::init(const Sample& sample)
 {
   _sample = &sample;
   clear_cache(sample);
 }
 
// Resets all per-sample state: copies the source words of `s`, empties the
// word, option and sentence caches, refills _sentence_cache with the ids of
// the source words (via *_pfmap, unknown words dropped) and asks the table to
// release its cached entries (gc()).
void model1_inverse::clear_cache(const Sample& s){
  _sourceWords = s.GetSourceWords();

  // drop everything derived from the previously attached sample
  _word_cache.clear();
  _option_cache.clear();
  _sentence_cache.clear();

  _moses_words_to_ids(*_pfmap, s.GetSourceWords(),
    std::back_inserter(_sentence_cache));

  _ptable->gc();
}

// Scores the attached sample's current target words: they are mapped to ids
// with *_pemap (unknown words dropped) and passed, together with the cached
// source ids in _sentence_cache, to score().
float model1_inverse::computeScore(){
  std::vector<int> target_ids;
  _moses_words_to_ids(*_pemap, _sample->GetTargetWords(),
    std::back_inserter(target_ids));

  return score(_sentence_cache.begin(), _sentence_cache.end(),
    target_ids.begin(), target_ids.end());
}

// Returns the score of `option`'s target phrase against the cached source
// ids, memoised per option in _option_cache. `gap` is not used.
float model1_inverse::getSingleUpdateScore(const TranslationOption* option, const TargetGap& gap) {
  // already scored for the current sample?
  if (_option_cache.find(option) != _option_cache.end())
    return _option_cache[option];

  std::vector<int> target_ids;
  _moses_words_to_ids(*_pemap, option->GetTargetPhrase(),
    std::back_inserter(target_ids));

  _option_cache[option] = score(_sentence_cache.begin(), _sentence_cache.end(),
    target_ids.begin(), target_ids.end());
  return _option_cache[option];
}


// Score of a contiguous pair of options: the sum of the two single-option
// scores, both requested with the same `gap`.
float model1_inverse::getContiguousPairedUpdateScore(const TranslationOption* leftOption, const TranslationOption* rightOption,
    const TargetGap& gap) {

  const float left_score = getSingleUpdateScore(leftOption, gap);
  const float right_score = getSingleUpdateScore(rightOption, gap);
  return left_score + right_score;
}

// Score of a discontiguous pair of options: the sum of the two single-option
// scores, each requested with its own gap.
float model1_inverse::getDiscontiguousPairedUpdateScore(const TranslationOption* leftOption, const TranslationOption* rightOption,
    const TargetGap& leftGap, const TargetGap& rightGap) {

  const float left_score = getSingleUpdateScore(leftOption, leftGap);
  const float right_score = getSingleUpdateScore(rightOption, rightGap);
  return left_score + right_score;
}
  
// Forwards the table handle and both vocabulary mappers to the model1 base.
ApproximateModel1::ApproximateModel1(model1_table_handle table, vocab_mapper_handle fmap, vocab_mapper_handle emap)
  : model1(table, fmap, emap)
{
}

// Attaches the feature to `sample` through the base-class init() and then
// runs this class's clear_cache().
// NOTE(review): model1::init() already calls clear_cache(sample); whether that
// call reaches this class's version depends on clear_cache being virtual
// (not visible here), so the explicit call below may be a second pass.
void ApproximateModel1::init(const Sample& sample)
{
  model1::init(sample);
  clear_cache(sample);
}

   
// Rebuilds all state derived from the source side of `s`:
//   * _source_words    - copy of the sample's source words;
//   * _source_word_ids - their ids via *_pfmap, UNFILTERED, so the vector has
//                        one entry per source word;
//   * _option_cache    - emptied;
//   * _sums            - one zero per source word id;
// and finally asks the table to release its cached entries (gc()).
void ApproximateModel1::clear_cache(const Sample& s) {
  _source_words = s.GetSourceWords();

  _source_word_ids.clear();
  _moses_words_to_ids_unfiltered(*_pfmap, s.GetSourceWords(),
    std::back_inserter(_source_word_ids));

  _option_cache.clear();

  // ids are unfiltered here, so _sums gets a slot for every source word
  _sums.assign(_source_word_ids.size(), 0.0);

  _ptable->gc();
}
  
// Returns the difference between the base-class score (model1::computeScore)
// and this class's own computeScore() for the currently attached sample.
float ApproximateModel1::getImportanceWeight() {
  // base-class score minus the per-option score computed by this class
  return model1::computeScore() - computeScore();
}
  
  
float ApproximateModel1::computeScore() {
  float score = 0.0;
  //cerr << "AM1, In compute score" << endl;
  for (Hypothesis* h =  const_cast<Hypothesis*>(const_cast<Sample*>(_sample)->GetTargetTail()->GetNextHypo()); h; h = const_cast<Hypothesis*>(h->GetNextHypo())) {
    score += getSingleUpdateScore(&(h->GetTranslationOption()), h->GetCurrTargetWordsRange());
  }
  return score;
}

// Gap-based entry point: delegates to the WordsRange overload using the gap's
// `segment` member.
float ApproximateModel1::getSingleUpdateScore(const TranslationOption* option, const TargetGap& gap)
{
  return getSingleUpdateScore(option, gap.segment);
}
  
// Returns the score of a single translation option, memoised per option in
// _option_cache. `segment` is not used.
//
// On a cache miss:
//   * the option's target phrase is mapped to ids with *_pemap (unfiltered);
//   * if the source id at the option's start position is -1, the score is
//     MODEL1_LOG_FLOOR;
//   * otherwise _compute_inner_sums is run over the option's source span
//     [GetStartPos(), GetEndPos()] and the target ids, and the score is
//     sum_i log(max(sums[i], MODEL1_SUM_FLOOR)).
// NOTE(review): only the first source position is tested against -1; later
// positions of the span are passed on unchecked - confirm this is intended.
float ApproximateModel1::getSingleUpdateScore(const TranslationOption* option, const WordsRange& segment){
  // already scored for the current sample?
  if (_option_cache.find(option) != _option_cache.end())
    return _option_cache[option];

  std::vector<int> target_ids;
  _moses_words_to_ids_unfiltered(*_pemap, option->GetTargetPhrase(), std::back_inserter(target_ids));

  if (_source_word_ids[option->GetStartPos()] == -1) {
    _option_cache[option] = MODEL1_LOG_FLOOR;
    return _option_cache[option];
  }

  // one slot per source position covered by the option
  std::vector<float> sums(option->GetEndPos() - option->GetStartPos() + 1);
  _compute_inner_sums(_source_word_ids.begin() + option->GetStartPos(),
                      _source_word_ids.begin() + option->GetEndPos() + 1,
                      target_ids.begin(), target_ids.end(),
                      sums.begin());

  // product of the sums, computed as a sum in log space
  _option_cache[option] = std::accumulate(log_iter(sums.begin()),
                                          log_iter(sums.end()), 0.0);
  return _option_cache[option];
}
  
  
// Score of a contiguous pair of options: the sum of the two single-option
// scores, both requested with the same `gap`.
float ApproximateModel1::getContiguousPairedUpdateScore(const TranslationOption* leftOption, const TranslationOption* rightOption,
                                               const TargetGap& gap){
  const float left_score = getSingleUpdateScore(leftOption, gap);
  const float right_score = getSingleUpdateScore(rightOption, gap);
  return left_score + right_score;
}
  
// Score of a discontiguous pair of options: the sum of the two single-option
// scores, each requested with its own gap.
float ApproximateModel1::getDiscontiguousPairedUpdateScore(const TranslationOption* leftOption, const TranslationOption* rightOption,
                                                  const TargetGap& leftGap, const TargetGap& rightGap){
  const float left_score = getSingleUpdateScore(leftOption, leftGap);
  const float right_score = getSingleUpdateScore(rightOption, rightGap);
  return left_score + right_score;
}
  

// Forwards the table handle and both vocabulary mappers to the model1_inverse
// base.
ApproximateModel1Inverse::ApproximateModel1Inverse(model1_table_handle table, vocab_mapper_handle fmap, vocab_mapper_handle emap)
  : model1_inverse(table, fmap, emap)
{
}
    
// Returns the base-class score (model1_inverse::computeScore) for the
// currently attached sample.
// Per the original author's note, the "approximation" used by this class is
// to return 0, which would make the importance weight equal to the true
// score (the approximate score itself is not visible in this file).
float ApproximateModel1Inverse::getImportanceWeight()
{
  return model1_inverse::computeScore();
}

  
  
  
  
} // namespace Josiah
back to top