Joshua
open source statistical hierarchical phrase-based machine translation system
|
00001 #ifndef LM_BINARY_FORMAT_H 00002 #define LM_BINARY_FORMAT_H 00003 00004 #include "lm/config.hh" 00005 #include "lm/model_type.hh" 00006 #include "lm/read_arpa.hh" 00007 00008 #include "util/file_piece.hh" 00009 #include "util/mmap.hh" 00010 #include "util/scoped.hh" 00011 00012 #include <cstddef> 00013 #include <vector> 00014 00015 #include <stdint.h> 00016 00017 namespace lm { 00018 namespace ngram { 00019 00020 extern const char *kModelNames[6]; 00021 00022 /*Inspect a file to determine if it is a binary lm. If not, return false. 00023 * If so, return true and set recognized to the type. This is the only API in 00024 * this header designed for use by decoder authors. 00025 */ 00026 bool RecognizeBinary(const char *file, ModelType &recognized); 00027 00028 struct FixedWidthParameters { 00029 unsigned char order; 00030 float probing_multiplier; 00031 // What type of model is this? 00032 ModelType model_type; 00033 // Does the end of the file have the actual strings in the vocabulary? 00034 bool has_vocabulary; 00035 unsigned int search_version; 00036 }; 00037 00038 // This is a macro instead of an inline function so constants can be assigned using it. 00039 #define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8) 00040 00041 // Parameters stored in the header of a binary file. 00042 struct Parameters { 00043 FixedWidthParameters fixed; 00044 std::vector<uint64_t> counts; 00045 }; 00046 00047 class BinaryFormat { 00048 public: 00049 explicit BinaryFormat(const Config &config); 00050 00051 // Reading a binary file: 00052 // Takes ownership of fd 00053 void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters ¶ms); 00054 // Used to read parts of the file to update the config object before figuring out full size. 00055 void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const; 00056 // Actually load the binary file and return a pointer to the beginning of the search area. 00057 void *LoadBinary(std::size_t size); 00058 00059 uint64_t VocabStringReadingOffset() const { 00060 assert(vocab_string_offset_ != kInvalidOffset); 00061 return vocab_string_offset_; 00062 } 00063 00064 // Writing a binary file or initializing in RAM from ARPA: 00065 // Size for vocabulary. 00066 void *SetupJustVocab(std::size_t memory_size, uint8_t order); 00067 // Warning: can change the vocaulary base pointer. 00068 void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base); 00069 // Warning: can change vocabulary and search base addresses. 00070 void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base); 00071 // Write the header at the beginning of the file. 00072 void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts); 00073 00074 private: 00075 void MapFile(void *&vocab_base, void *&search_base); 00076 00077 // Copied from configuration. 00078 const Config::WriteMethod write_method_; 00079 const char *write_mmap_; 00080 util::LoadMethod load_method_; 00081 00082 // File behind memory, if any. 00083 util::scoped_fd file_; 00084 00085 // If there is a file involved, a single mapping. 00086 util::scoped_memory mapping_; 00087 00088 // If the data is only in memory, separately allocate each because the trie 00089 // knows vocab's size before it knows search's size (because SRILM might 00090 // have pruned). 00091 util::scoped_memory memory_vocab_, memory_search_; 00092 00093 // Memory ranges. Note that these may not be contiguous and may not all 00094 // exist. 00095 std::size_t header_size_, vocab_size_, vocab_pad_; 00096 // aka end of search. 00097 uint64_t vocab_string_offset_; 00098 00099 static const uint64_t kInvalidOffset = (uint64_t)-1; 00100 }; 00101 00102 bool IsBinaryFormat(int fd); 00103 00104 } // namespace ngram 00105 } // namespace lm 00106 #endif // LM_BINARY_FORMAT_H