Joshua
open source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/kenlm/lm/bhiksha.hh
00001 /* Simple implementation of
00002  * @inproceedings{bhikshacompression,
00003  *  author={Bhiksha Raj and Ed Whittaker},
00004  *  year={2003},
00005  *  title={Lossless Compression of Language Model Structure and Word Identifiers},
00006  *  booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing},
00007  *  pages={388--391},
00008  *  }
00009  *
00010  *  Currently only used for next pointers.
00011  */
00012 
00013 #ifndef LM_BHIKSHA_H
00014 #define LM_BHIKSHA_H
00015 
00016 #include "lm/model_type.hh"
00017 #include "lm/trie.hh"
00018 #include "util/bit_packing.hh"
00019 #include "util/sorted_uniform.hh"
00020 
00021 #include <algorithm>
00022 #include <stdint.h>
00023 #include <cassert>
00024 
00025 namespace lm {
00026 namespace ngram {
00027 struct Config;
00028 class BinaryFormat;
00029 
00030 namespace trie {
00031 
00032 class DontBhiksha {
00033   public:
00034     static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
00035 
00036     static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &/*config*/) {}
00037 
00038     static uint64_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; }
00039 
00040     static uint8_t InlineBits(uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) {
00041       return util::RequiredBits(max_next);
00042     }
00043 
00044     DontBhiksha(const void *base, uint64_t max_offset, uint64_t max_next, const Config &config);
00045 
00046     void ReadNext(const void *base, uint64_t bit_offset, uint64_t /*index*/, uint8_t total_bits, NodeRange &out) const {
00047       out.begin = util::ReadInt57(base, bit_offset, next_.bits, next_.mask);
00048       out.end = util::ReadInt57(base, bit_offset + total_bits, next_.bits, next_.mask);
00049       //assert(out.end >= out.begin);
00050     }
00051 
00052     void WriteNext(void *base, uint64_t bit_offset, uint64_t /*index*/, uint64_t value) {
00053       util::WriteInt57(base, bit_offset, next_.bits, value);
00054     }
00055 
00056     void FinishedLoading(const Config &/*config*/) {}
00057 
00058     uint8_t InlineBits() const { return next_.bits; }
00059 
00060   private:
00061     util::BitsMask next_;
00062 };
00063 
00064 class ArrayBhiksha {
00065   public:
00066     static const ModelType kModelTypeAdd = kArrayAdd;
00067 
00068     static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config);
00069 
00070     static uint64_t Size(uint64_t max_offset, uint64_t max_next, const Config &config);
00071 
00072     static uint8_t InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config);
00073 
00074     ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_value, const Config &config);
00075 
00076     void ReadNext(const void *base, uint64_t bit_offset, uint64_t index, uint8_t total_bits, NodeRange &out) const {
00077       // Some assertions are commented out because they are expensive.
00078       // assert(*offset_begin_ == 0);
00079       // std::upper_bound returns the first element that is greater.  Want the
00080       // last element that is <= to the index.
00081       const uint64_t *begin_it = std::upper_bound(offset_begin_, offset_end_, index) - 1;
00082       // Since *offset_begin_ == 0, the position should be in range.
00083       // assert(begin_it >= offset_begin_);
00084       const uint64_t *end_it;
00085       for (end_it = begin_it + 1; (end_it < offset_end_) && (*end_it <= index + 1); ++end_it) {}
00086       // assert(end_it == std::upper_bound(offset_begin_, offset_end_, index + 1));
00087       --end_it;
00088       // assert(end_it >= begin_it);
00089       out.begin = ((begin_it - offset_begin_) << next_inline_.bits) |
00090         util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask);
00091       out.end = ((end_it - offset_begin_) << next_inline_.bits) |
00092         util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask);
00093       // If this fails, consider rebuilding your model using KenLM after 1e333d786b748555e8f368d2bbba29a016c98052
00094       assert(out.end >= out.begin);
00095     }
00096 
00097     void WriteNext(void *base, uint64_t bit_offset, uint64_t index, uint64_t value) {
00098       uint64_t encode = value >> next_inline_.bits;
00099       for (; write_to_ <= offset_begin_ + encode; ++write_to_) *write_to_ = index;
00100       util::WriteInt57(base, bit_offset, next_inline_.bits, value & next_inline_.mask);
00101     }
00102 
00103     void FinishedLoading(const Config &config);
00104 
00105     uint8_t InlineBits() const { return next_inline_.bits; }
00106 
00107   private:
00108     const util::BitsMask next_inline_;
00109 
00110     const uint64_t *const offset_begin_;
00111     const uint64_t *const offset_end_;
00112 
00113     uint64_t *write_to_;
00114 
00115     void *original_base_;
00116 };
00117 
00118 } // namespace trie
00119 } // namespace ngram
00120 } // namespace lm
00121 
00122 #endif // LM_BHIKSHA_H