Joshua
open source statistical hierarchical phrase-based machine translation system
|
00001 #ifndef LM_COMMON_PRINT_H 00002 #define LM_COMMON_PRINT_H 00003 00004 #include "lm/word_index.hh" 00005 #include "util/mmap.hh" 00006 #include "util/string_piece.hh" 00007 00008 #include <cassert> 00009 #include <vector> 00010 00011 namespace util { namespace stream { class ChainPositions; }} 00012 00013 // Warning: PrintARPA routines read all unigrams before all bigrams before all 00014 // trigrams etc. So if other parts of the chain move jointly, you'll have to 00015 // buffer. 00016 00017 namespace lm { 00018 00019 class VocabReconstitute { 00020 public: 00021 // fd must be alive for life of this object; does not take ownership. 00022 explicit VocabReconstitute(int fd); 00023 00024 const char *Lookup(WordIndex index) const { 00025 assert(index < map_.size() - 1); 00026 return map_[index]; 00027 } 00028 00029 StringPiece LookupPiece(WordIndex index) const { 00030 return StringPiece(map_[index], map_[index + 1] - 1 - map_[index]); 00031 } 00032 00033 std::size_t Size() const { 00034 // There's an extra entry to support StringPiece lengths. 00035 return map_.size() - 1; 00036 } 00037 00038 private: 00039 util::scoped_memory memory_; 00040 std::vector<const char*> map_; 00041 }; 00042 00043 class PrintARPA { 00044 public: 00045 // Does not take ownership of vocab_fd or out_fd. 00046 explicit PrintARPA(int vocab_fd, int out_fd, const std::vector<uint64_t> &counts) 00047 : vocab_fd_(vocab_fd), out_fd_(out_fd), counts_(counts) {} 00048 00049 void Run(const util::stream::ChainPositions &positions); 00050 00051 private: 00052 int vocab_fd_; 00053 int out_fd_; 00054 std::vector<uint64_t> counts_; 00055 }; 00056 00057 } // namespace lm 00058 #endif // LM_COMMON_PRINT_H