https://github.com/wilkeraziz/mosesdecoder
Raw File
Tip revision: 5d9d691ad6c322c95d6eb0fb2c762f5616b70eee authored by Hieu Hoang on 15 October 2015, 15:58:51 UTC
merge
Tip revision: 5d9d691
SuffixArray.h
#pragma once

#include "Vocabulary.h"

class SuffixArray
{
public:
  typedef unsigned int INDEX;

private:
  WORD_ID *m_array;
  INDEX *m_index;
  INDEX *m_buffer;
  char *m_wordInSentence;
  INDEX *m_sentence;
  char *m_sentenceLength;
  WORD_ID m_endOfSentence;
  INDEX *m_document;
  INDEX *m_documentName;
  char *m_documentNameBuffer;
  size_t m_documentNameLength;
  size_t m_documentCount;
  bool m_useDocument;
  Vocabulary m_vcb;
  INDEX m_size;
  INDEX m_sentenceCount;

  // No copying allowed.
  SuffixArray(const SuffixArray&);
  void operator=(const SuffixArray&);

public:
  SuffixArray();
  ~SuffixArray();

  void Create(const std::string& fileName );
  bool ProcessDocumentLine( const char* const, const size_t );
  void Sort(INDEX start, INDEX end);
  int CompareIndex( INDEX a, INDEX b ) const;
  inline int CompareWord( WORD_ID a, WORD_ID b ) const;
  int Count( const std::vector< WORD > &phrase );
  bool MinCount( const std::vector< WORD > &phrase, INDEX min );
  bool Exists( const std::vector< WORD > &phrase );
  int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
  int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 );
  INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end );
  INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
  int Match( const std::vector< WORD > &phrase, INDEX index );
  void List( INDEX start, INDEX end );
  void PrintSentenceMatches( const std::vector< WORD > &phrase );
  inline INDEX GetPosition( INDEX index ) const {
    return m_index[ index ];
  }
  inline INDEX GetSentence( INDEX position ) const {
    return m_sentence[position];
  }
  inline char GetWordInSentence( INDEX position ) const {
    return m_wordInSentence[position];
  }
  inline char GetSentenceLength( INDEX sentenceId ) const {
    return m_sentenceLength[sentenceId];
  }
  inline INDEX GetSize() const {
    return m_size;
  }
  inline WORD GetWord( INDEX position ) const {
    return m_vcb.GetWord( m_array[position] );
  }
  void UseDocument() {
    m_useDocument = true;
  }
  INDEX GetDocument( INDEX sentence ) const;
  void PrintDocumentName( INDEX document ) {
    for(INDEX i=m_documentName[ document ]; m_documentNameBuffer[i] != 0; i++) {
      std::cout << m_documentNameBuffer[ i ];
    }
  }
  void Save(const std::string& fileName ) const;
  void Load(const std::string& fileName );
  void CheckAllocation(bool, const char *dataStructure) const;
  bool Error( const char* message, const std::string& fileName) const;
};
back to top