Joshua
open source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/kenlm/lm/common/print.hh
00001 #ifndef LM_COMMON_PRINT_H
00002 #define LM_COMMON_PRINT_H
00003 
00004 #include "lm/word_index.hh"
00005 #include "util/mmap.hh"
00006 #include "util/string_piece.hh"
00007 
00008 #include <cassert>
00009 #include <vector>
00010 
00011 namespace util { namespace stream { class ChainPositions; }}
00012 
00013 // Warning: PrintARPA routines read all unigrams before all bigrams before all
00014 // trigrams etc.  So if other parts of the chain move jointly, you'll have to
00015 // buffer.
00016 
00017 namespace lm {
00018 
00019 class VocabReconstitute {
00020   public:
00021     // fd must be alive for life of this object; does not take ownership.
00022     explicit VocabReconstitute(int fd);
00023 
00024     const char *Lookup(WordIndex index) const {
00025       assert(index < map_.size() - 1);
00026       return map_[index];
00027     }
00028 
00029     StringPiece LookupPiece(WordIndex index) const {
00030       return StringPiece(map_[index], map_[index + 1] - 1 - map_[index]);
00031     }
00032 
00033     std::size_t Size() const {
00034       // There's an extra entry to support StringPiece lengths.
00035       return map_.size() - 1;
00036     }
00037 
00038   private:
00039     util::scoped_memory memory_;
00040     std::vector<const char*> map_;
00041 };
00042 
00043 class PrintARPA {
00044   public:
00045     // Does not take ownership of vocab_fd or out_fd.
00046     explicit PrintARPA(int vocab_fd, int out_fd, const std::vector<uint64_t> &counts)
00047       : vocab_fd_(vocab_fd), out_fd_(out_fd), counts_(counts) {}
00048 
00049     void Run(const util::stream::ChainPositions &positions);
00050 
00051   private:
00052     int vocab_fd_;
00053     int out_fd_;
00054     std::vector<uint64_t> counts_;
00055 };
00056 
00057 } // namespace lm
00058 #endif // LM_COMMON_PRINT_H