Joshua
open source statistical hierarchical phrase-based machine translation system
|
00001 #ifndef LM_VIRTUAL_INTERFACE_H 00002 #define LM_VIRTUAL_INTERFACE_H 00003 00004 #include "lm/return.hh" 00005 #include "lm/word_index.hh" 00006 #include "util/string_piece.hh" 00007 00008 #include <string> 00009 #include <cstring> 00010 00011 namespace lm { 00012 namespace base { 00013 00014 template <class T, class U, class V> class ModelFacade; 00015 00016 /* Vocabulary interface. Call Index(string) and get a word index for use in 00017 * calling Model. It provides faster convenience functions for <s>, </s>, and 00018 * <unk> although you can also find these using Index. 00019 * 00020 * Some models do not load the mapping from index to string. If you need this, 00021 * check if the model Vocabulary class implements such a function and access it 00022 * directly. 00023 * 00024 * The Vocabulary object is always owned by the Model and can be retrieved from 00025 * the Model using BaseVocabulary() for this abstract interface or 00026 * GetVocabulary() for the actual implementation (in which case you'll need the 00027 * actual implementation of the Model too). 00028 */ 00029 class Vocabulary { 00030 public: 00031 virtual ~Vocabulary(); 00032 00033 WordIndex BeginSentence() const { return begin_sentence_; } 00034 WordIndex EndSentence() const { return end_sentence_; } 00035 WordIndex NotFound() const { return not_found_; } 00036 00037 /* Most implementations allow StringPiece lookups and need only override 00038 * Index(StringPiece). SRI requires null termination and overrides all 00039 * three methods. 00040 */ 00041 virtual WordIndex Index(const StringPiece &str) const = 0; 00042 virtual WordIndex Index(const std::string &str) const { 00043 return Index(StringPiece(str)); 00044 } 00045 virtual WordIndex Index(const char *str) const { 00046 return Index(StringPiece(str)); 00047 } 00048 00049 protected: 00050 // Call SetSpecial afterward. 00051 Vocabulary() {} 00052 00053 Vocabulary(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) { 00054 SetSpecial(begin_sentence, end_sentence, not_found); 00055 } 00056 00057 void SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found); 00058 00059 WordIndex begin_sentence_, end_sentence_, not_found_; 00060 00061 private: 00062 // Disable copy constructors. They're private and undefined. 00063 // Ersatz boost::noncopyable. 00064 Vocabulary(const Vocabulary &); 00065 Vocabulary &operator=(const Vocabulary &); 00066 }; 00067 00068 /* There are two ways to access a Model. 00069 * 00070 * 00071 * OPTION 1: Access the Model directly (e.g. lm::ngram::Model in model.hh). 00072 * 00073 * Every Model implements the scoring function: 00074 * float Score( 00075 * const Model::State &in_state, 00076 * const WordIndex new_word, 00077 * Model::State &out_state) const; 00078 * 00079 * It can also return the length of n-gram matched by the model: 00080 * FullScoreReturn FullScore( 00081 * const Model::State &in_state, 00082 * const WordIndex new_word, 00083 * Model::State &out_state) const; 00084 * 00085 * 00086 * There are also accessor functions: 00087 * const State &BeginSentenceState() const; 00088 * const State &NullContextState() const; 00089 * const Vocabulary &GetVocabulary() const; 00090 * unsigned int Order() const; 00091 * 00092 * NB: In case you're wondering why the model implementation looks like it's 00093 * missing these methods, see facade.hh. 00094 * 00095 * This is the fastest way to use a model and presents a normal State class to 00096 * be included in a hypothesis state structure. 00097 * 00098 * 00099 * OPTION 2: Use the virtual interface below. 00100 * 00101 * The virtual interface allow you to decide which Model to use at runtime 00102 * without templatizing everything on the Model type. However, each Model has 00103 * its own State class, so a single State cannot be efficiently provided (it 00104 * would require using the maximum memory of any Model's State or memory 00105 * allocation with each lookup). This means you become responsible for 00106 * allocating memory with size StateSize() and passing it to the Score or 00107 * FullScore functions provided here. 00108 * 00109 * For example, cdec has a std::string containing the entire state of a 00110 * hypothesis. It can reserve StateSize bytes in this string for the model 00111 * state. 00112 * 00113 * All the State objects are POD, so it's ok to use raw memory for storing 00114 * State. 00115 * in_state and out_state must not have the same address. 00116 */ 00117 class Model { 00118 public: 00119 virtual ~Model(); 00120 00121 size_t StateSize() const { return state_size_; } 00122 const void *BeginSentenceMemory() const { return begin_sentence_memory_; } 00123 void BeginSentenceWrite(void *to) const { memcpy(to, begin_sentence_memory_, StateSize()); } 00124 const void *NullContextMemory() const { return null_context_memory_; } 00125 void NullContextWrite(void *to) const { memcpy(to, null_context_memory_, StateSize()); } 00126 00127 // Requires in_state != out_state 00128 virtual float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0; 00129 00130 // Requires in_state != out_state 00131 virtual FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0; 00132 00133 // Prefer to use FullScore. The context words should be provided in reverse order. 00134 virtual FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const = 0; 00135 00136 unsigned char Order() const { return order_; } 00137 00138 const Vocabulary &BaseVocabulary() const { return *base_vocab_; } 00139 00140 private: 00141 template <class T, class U, class V> friend class ModelFacade; 00142 explicit Model(size_t state_size) : state_size_(state_size) {} 00143 00144 const size_t state_size_; 00145 const void *begin_sentence_memory_, *null_context_memory_; 00146 00147 const Vocabulary *base_vocab_; 00148 00149 unsigned char order_; 00150 00151 // Disable copy constructors. They're private and undefined. 00152 // Ersatz boost::noncopyable. 00153 Model(const Model &); 00154 Model &operator=(const Model &); 00155 }; 00156 00157 } // mamespace base 00158 } // namespace lm 00159 00160 #endif // LM_VIRTUAL_INTERFACE_H