Joshua
open source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/kenlm/lm/virtual_interface.hh
00001 #ifndef LM_VIRTUAL_INTERFACE_H
00002 #define LM_VIRTUAL_INTERFACE_H
00003 
00004 #include "lm/return.hh"
00005 #include "lm/word_index.hh"
00006 #include "util/string_piece.hh"
00007 
00008 #include <string>
00009 #include <cstring>
00010 
00011 namespace lm {
00012 namespace base {
00013 
00014 template <class T, class U, class V> class ModelFacade;
00015 
00016 /* Vocabulary interface.  Call Index(string) and get a word index for use in
00017  * calling Model.  It provides faster convenience functions for <s>, </s>, and
00018  * <unk> although you can also find these using Index.
00019  *
00020  * Some models do not load the mapping from index to string.  If you need this,
00021  * check if the model Vocabulary class implements such a function and access it
00022  * directly.
00023  *
00024  * The Vocabulary object is always owned by the Model and can be retrieved from
00025  * the Model using BaseVocabulary() for this abstract interface or
00026  * GetVocabulary() for the actual implementation (in which case you'll need the
00027  * actual implementation of the Model too).
00028  */
00029 class Vocabulary {
00030   public:
00031     virtual ~Vocabulary();
00032 
00033     WordIndex BeginSentence() const { return begin_sentence_; }
00034     WordIndex EndSentence() const { return end_sentence_; }
00035     WordIndex NotFound() const { return not_found_; }
00036 
00037     /* Most implementations allow StringPiece lookups and need only override
00038      * Index(StringPiece).  SRI requires null termination and overrides all
00039      * three methods.
00040      */
00041     virtual WordIndex Index(const StringPiece &str) const = 0;
00042     virtual WordIndex Index(const std::string &str) const {
00043       return Index(StringPiece(str));
00044     }
00045     virtual WordIndex Index(const char *str) const {
00046       return Index(StringPiece(str));
00047     }
00048 
00049   protected:
00050     // Call SetSpecial afterward.
00051     Vocabulary() {}
00052 
00053     Vocabulary(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) {
00054       SetSpecial(begin_sentence, end_sentence, not_found);
00055     }
00056 
00057     void SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found);
00058 
00059     WordIndex begin_sentence_, end_sentence_, not_found_;
00060 
00061   private:
00062     // Disable copy constructors.  They're private and undefined.
00063     // Ersatz boost::noncopyable.
00064     Vocabulary(const Vocabulary &);
00065     Vocabulary &operator=(const Vocabulary &);
00066 };
00067 
00068 /* There are two ways to access a Model.
00069  *
00070  *
00071  * OPTION 1: Access the Model directly (e.g. lm::ngram::Model in model.hh).
00072  *
00073  * Every Model implements the scoring function:
00074  * float Score(
00075  *   const Model::State &in_state,
00076  *   const WordIndex new_word,
00077  *   Model::State &out_state) const;
00078  *
00079  * It can also return the length of n-gram matched by the model:
00080  * FullScoreReturn FullScore(
00081  *   const Model::State &in_state,
00082  *   const WordIndex new_word,
00083  *   Model::State &out_state) const;
00084  *
00085  *
00086  * There are also accessor functions:
00087  * const State &BeginSentenceState() const;
00088  * const State &NullContextState() const;
00089  * const Vocabulary &GetVocabulary() const;
00090  * unsigned int Order() const;
00091  *
00092  * NB: In case you're wondering why the model implementation looks like it's
00093  * missing these methods, see facade.hh.
00094  *
00095  * This is the fastest way to use a model and presents a normal State class to
00096  * be included in a hypothesis state structure.
00097  *
00098  *
00099  * OPTION 2: Use the virtual interface below.
00100  *
00101  * The virtual interface allows you to decide which Model to use at runtime
00102  * without templatizing everything on the Model type.  However, each Model has
00103  * its own State class, so a single State cannot be efficiently provided (it
00104  * would require using the maximum memory of any Model's State or memory
00105  * allocation with each lookup).  This means you become responsible for
00106  * allocating memory with size StateSize() and passing it to the Score or
00107  * FullScore functions provided here.
00108  *
00109  * For example, cdec has a std::string containing the entire state of a
00110  * hypothesis.  It can reserve StateSize bytes in this string for the model
00111  * state.
00112  *
00113  * All the State objects are POD, so it's ok to use raw memory for storing
00114  * State.
00115  * in_state and out_state must not have the same address.
00116  */
00117 class Model {
00118   public:
00119     virtual ~Model();
00120 
00121     size_t StateSize() const { return state_size_; }
00122     const void *BeginSentenceMemory() const { return begin_sentence_memory_; }
00123     void BeginSentenceWrite(void *to) const { memcpy(to, begin_sentence_memory_, StateSize()); }
00124     const void *NullContextMemory() const { return null_context_memory_; }
00125     void NullContextWrite(void *to) const { memcpy(to, null_context_memory_, StateSize()); }
00126 
00127     // Requires in_state != out_state
00128     virtual float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
00129 
00130     // Requires in_state != out_state
00131     virtual FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
00132 
00133     // Prefer to use FullScore.  The context words should be provided in reverse order.
00134     virtual FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const = 0;
00135 
00136     unsigned char Order() const { return order_; }
00137 
00138     const Vocabulary &BaseVocabulary() const { return *base_vocab_; }
00139 
00140   private:
00141     template <class T, class U, class V> friend class ModelFacade;
00142     explicit Model(size_t state_size) : state_size_(state_size) {}
00143 
00144     const size_t state_size_;
00145     const void *begin_sentence_memory_, *null_context_memory_;
00146 
00147     const Vocabulary *base_vocab_;
00148 
00149     unsigned char order_;
00150 
00151     // Disable copy constructors.  They're private and undefined.
00152     // Ersatz boost::noncopyable.
00153     Model(const Model &);
00154     Model &operator=(const Model &);
00155 };
00156 
00157 } // namespace base
00158 } // namespace lm
00159 
00160 #endif // LM_VIRTUAL_INTERFACE_H