Joshua
open source statistical hierarchical phrase-based machine translation system
|
00001 /* Simple implementation of 00002 * @inproceedings{bhikshacompression, 00003 * author={Bhiksha Raj and Ed Whittaker}, 00004 * year={2003}, 00005 * title={Lossless Compression of Language Model Structure and Word Identifiers}, 00006 * booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing}, 00007 * pages={388--391}, 00008 * } 00009 * 00010 * Currently only used for next pointers. 00011 */ 00012 00013 #ifndef LM_BHIKSHA_H 00014 #define LM_BHIKSHA_H 00015 00016 #include "lm/model_type.hh" 00017 #include "lm/trie.hh" 00018 #include "util/bit_packing.hh" 00019 #include "util/sorted_uniform.hh" 00020 00021 #include <algorithm> 00022 #include <stdint.h> 00023 #include <cassert> 00024 00025 namespace lm { 00026 namespace ngram { 00027 struct Config; 00028 class BinaryFormat; 00029 00030 namespace trie { 00031 00032 class DontBhiksha { 00033 public: 00034 static const ModelType kModelTypeAdd = static_cast<ModelType>(0); 00035 00036 static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &/*config*/) {} 00037 00038 static uint64_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; } 00039 00040 static uint8_t InlineBits(uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) { 00041 return util::RequiredBits(max_next); 00042 } 00043 00044 DontBhiksha(const void *base, uint64_t max_offset, uint64_t max_next, const Config &config); 00045 00046 void ReadNext(const void *base, uint64_t bit_offset, uint64_t /*index*/, uint8_t total_bits, NodeRange &out) const { 00047 out.begin = util::ReadInt57(base, bit_offset, next_.bits, next_.mask); 00048 out.end = util::ReadInt57(base, bit_offset + total_bits, next_.bits, next_.mask); 00049 //assert(out.end >= out.begin); 00050 } 00051 00052 void WriteNext(void *base, uint64_t bit_offset, uint64_t /*index*/, uint64_t value) { 00053 util::WriteInt57(base, bit_offset, next_.bits, value); 00054 } 00055 00056 void FinishedLoading(const Config &/*config*/) {} 00057 00058 uint8_t InlineBits() const { return next_.bits; } 00059 00060 private: 00061 util::BitsMask next_; 00062 }; 00063 00064 class ArrayBhiksha { 00065 public: 00066 static const ModelType kModelTypeAdd = kArrayAdd; 00067 00068 static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config); 00069 00070 static uint64_t Size(uint64_t max_offset, uint64_t max_next, const Config &config); 00071 00072 static uint8_t InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config); 00073 00074 ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_value, const Config &config); 00075 00076 void ReadNext(const void *base, uint64_t bit_offset, uint64_t index, uint8_t total_bits, NodeRange &out) const { 00077 // Some assertions are commented out because they are expensive. 00078 // assert(*offset_begin_ == 0); 00079 // std::upper_bound returns the first element that is greater. Want the 00080 // last element that is <= to the index. 00081 const uint64_t *begin_it = std::upper_bound(offset_begin_, offset_end_, index) - 1; 00082 // Since *offset_begin_ == 0, the position should be in range. 00083 // assert(begin_it >= offset_begin_); 00084 const uint64_t *end_it; 00085 for (end_it = begin_it + 1; (end_it < offset_end_) && (*end_it <= index + 1); ++end_it) {} 00086 // assert(end_it == std::upper_bound(offset_begin_, offset_end_, index + 1)); 00087 --end_it; 00088 // assert(end_it >= begin_it); 00089 out.begin = ((begin_it - offset_begin_) << next_inline_.bits) | 00090 util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask); 00091 out.end = ((end_it - offset_begin_) << next_inline_.bits) | 00092 util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask); 00093 // If this fails, consider rebuilding your model using KenLM after 1e333d786b748555e8f368d2bbba29a016c98052 00094 assert(out.end >= out.begin); 00095 } 00096 00097 void WriteNext(void *base, uint64_t bit_offset, uint64_t index, uint64_t value) { 00098 uint64_t encode = value >> next_inline_.bits; 00099 for (; write_to_ <= offset_begin_ + encode; ++write_to_) *write_to_ = index; 00100 util::WriteInt57(base, bit_offset, next_inline_.bits, value & next_inline_.mask); 00101 } 00102 00103 void FinishedLoading(const Config &config); 00104 00105 uint8_t InlineBits() const { return next_inline_.bits; } 00106 00107 private: 00108 const util::BitsMask next_inline_; 00109 00110 const uint64_t *const offset_begin_; 00111 const uint64_t *const offset_end_; 00112 00113 uint64_t *write_to_; 00114 00115 void *original_base_; 00116 }; 00117 00118 } // namespace trie 00119 } // namespace ngram 00120 } // namespace lm 00121 00122 #endif // LM_BHIKSHA_H