Joshua
open source statistical hierarchical phrase-based machine translation system
All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/kenlm/lm/binary_format.hh
00001 #ifndef LM_BINARY_FORMAT_H
00002 #define LM_BINARY_FORMAT_H
00003 
#include "lm/config.hh"
#include "lm/model_type.hh"
#include "lm/read_arpa.hh"

#include "util/file_piece.hh"
#include "util/mmap.hh"
#include "util/scoped.hh"

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

#include <stdint.h>
00016 
00017 namespace lm {
00018 namespace ngram {
00019 
// Table of six C strings; only declared here, the definition lives in another
// translation unit.
// NOTE(review): presumably one printable name per ModelType value, indexed by
// the enum -- confirm against the definition.
extern const char *kModelNames[6];
00021 
/* Inspect a file to determine if it is a binary lm.  If not, return false.
 * If so, return true and set recognized to the type.  This is the only API in
 * this header designed for use by decoder authors.
 *
 * file:       the file to inspect (presumably a path name -- confirm against
 *             the implementation).
 * recognized: output; set to the model type when the function returns true.
 *             NOTE(review): whether it is left untouched on a false return is
 *             not visible from this header.
 */
bool RecognizeBinary(const char *file, ModelType &recognized);
00027 
// Fixed-size group of model parameters.  It is embedded as the `fixed` member
// of Parameters, which describes what is stored in the header of a binary
// file.
// NOTE(review): field order and types presumably define the on-disk layout;
// do not reorder or resize members without checking the reader/writer.
struct FixedWidthParameters {
  // NOTE(review): presumably the n-gram order of the model -- confirm.
  unsigned char order;
  // NOTE(review): presumably the space multiplier used by the probing hash
  // table (see Config) -- confirm.
  float probing_multiplier;
  // What type of model is this?
  ModelType model_type;
  // Does the end of the file have the actual strings in the vocabulary?
  bool has_vocabulary;
  // Version number of the search data structure.
  // NOTE(review): presumably compared at load time to reject incompatible
  // files -- confirm.
  unsigned int search_version;
};
00037 
// Round a up to the nearest multiple of 8 (for a >= 1); the result has type
// std::ptrdiff_t.  Deliberately a macro and not an inline function, so that it
// can be used when assigning constants.
#define ALIGN8(a) (8 * (std::ptrdiff_t(((a) - 1) / 8) + 1))
00040 
// Parameters stored in the header of a binary file.
struct Parameters {
  // The fixed-size fields (order, model type, vocabulary flag, ...).
  FixedWidthParameters fixed;
  // Variable-length list of 64-bit counts.
  // NOTE(review): presumably the number of n-grams of each order, so its
  // length would match fixed.order -- confirm.
  std::vector<uint64_t> counts;
};
00046 
00047 class BinaryFormat {
00048   public:
00049     explicit BinaryFormat(const Config &config);
00050 
00051     // Reading a binary file:
00052     // Takes ownership of fd
00053     void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params);
00054     // Used to read parts of the file to update the config object before figuring out full size.
00055     void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const;
00056     // Actually load the binary file and return a pointer to the beginning of the search area.
00057     void *LoadBinary(std::size_t size);
00058 
00059     uint64_t VocabStringReadingOffset() const {
00060       assert(vocab_string_offset_ != kInvalidOffset);
00061       return vocab_string_offset_;
00062     }
00063 
00064     // Writing a binary file or initializing in RAM from ARPA:
00065     // Size for vocabulary.
00066     void *SetupJustVocab(std::size_t memory_size, uint8_t order);
00067     // Warning: can change the vocaulary base pointer.
00068     void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base);
00069     // Warning: can change vocabulary and search base addresses.
00070     void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base);
00071     // Write the header at the beginning of the file.
00072     void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts);
00073 
00074   private:
00075     void MapFile(void *&vocab_base, void *&search_base);
00076 
00077     // Copied from configuration.
00078     const Config::WriteMethod write_method_;
00079     const char *write_mmap_;
00080     util::LoadMethod load_method_;
00081 
00082     // File behind memory, if any.
00083     util::scoped_fd file_;
00084 
00085     // If there is a file involved, a single mapping.
00086     util::scoped_memory mapping_;
00087 
00088     // If the data is only in memory, separately allocate each because the trie
00089     // knows vocab's size before it knows search's size (because SRILM might
00090     // have pruned).
00091     util::scoped_memory memory_vocab_, memory_search_;
00092 
00093     // Memory ranges.  Note that these may not be contiguous and may not all
00094     // exist.
00095     std::size_t header_size_, vocab_size_, vocab_pad_;
00096     // aka end of search.
00097     uint64_t vocab_string_offset_;
00098 
00099     static const uint64_t kInvalidOffset = (uint64_t)-1;
00100 };
00101 
// NOTE(review): presumably reports whether the file open on fd is in this
// binary format; whether fd's position or ownership is affected is not visible
// from this header -- confirm against the implementation.
bool IsBinaryFormat(int fd);
00103 
00104 } // namespace ngram
00105 } // namespace lm
00106 #endif // LM_BINARY_FORMAT_H