Joshua
open source statistical hierarchical phrase-based machine translation system
lm/builder/corpus_count.hh

#ifndef LM_BUILDER_CORPUS_COUNT_H
#define LM_BUILDER_CORPUS_COUNT_H

#include "lm/lm_exception.hh"
#include "lm/word_index.hh"
#include "util/scoped.hh"

#include <cstddef>
#include <string>
#include <stdint.h>
#include <vector>

namespace util {
class FilePiece;
namespace stream {
class ChainPosition;
} // namespace stream
} // namespace util

namespace lm {
namespace builder {

class CorpusCount {
  public:
    // Memory usage will be DedupeMultiplier(order) * block_size + total_chain_size + unknown vocab_hash_size.
    static float DedupeMultiplier(std::size_t order);

    // How much memory the vocabulary will use, based on the estimated size of the vocab.
    static std::size_t VocabUsage(std::size_t vocab_estimate);

    // token_count: out.
    // type_count aka vocabulary size: initialize to an estimate; it is set to the exact value.
    CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count,
                WordIndex &type_count, std::vector<bool> &prune_words,
                const std::string &prune_vocab_filename,
                std::size_t entries_per_block, WarningAction disallowed_symbol);

    // Read the corpus, write the vocabulary to vocab_write, and stream n-gram counts
    // to the chain position.
    void Run(const util::stream::ChainPosition &position);

  private:
    util::FilePiece &from_;
    int vocab_write_;
    uint64_t &token_count_;
    WordIndex &type_count_;
    std::vector<bool> &prune_words_;
    const std::string &prune_vocab_filename_;

    // Backing memory for the hash table used to deduplicate n-grams within a block.
    std::size_t dedupe_mem_size_;
    util::scoped_malloc dedupe_mem_;

    WarningAction disallowed_symbol_action_;
};

} // namespace builder
} // namespace lm

#endif // LM_BUILDER_CORPUS_COUNT_H
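The header only declares the interface, so the sketch below shows how the pieces are intended to fit together: the two static estimators size the memory budget (per the comment on DedupeMultiplier, total usage is roughly DedupeMultiplier(order) * block_size + block_count * block_size + VocabUsage(vocab_estimate), which can be solved for the chain's share of a fixed budget), and the constructor plus Run() plug into KenLM's util::stream chain, whose worker thread calls Run() with a ChainPosition. This is a minimal, hypothetical driver, not code from the Joshua/KenLM sources: the file names (corpus.txt, vocab.bin), the order, block count, and memory budget, and the per-entry size formula (order word indices plus a 64-bit count) are all assumptions, and the util::stream and util calls (ChainConfig, Chain, kRecycle, scoped_fd, CreateOrThrow) are used as I understand them from KenLM's util library.

// count_demo.cc -- hypothetical driver, not part of the Joshua/KenLM sources.
#include "lm/builder/corpus_count.hh"
#include "lm/lm_exception.hh"
#include "lm/word_index.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
#include "util/stream/chain.hh"

#include <boost/ref.hpp>

#include <iostream>
#include <string>
#include <vector>
#include <stdint.h>

int main() {
  const std::size_t order = 3;                  // n-gram order (assumed)
  const std::size_t block_count = 2;            // number of chain blocks (assumed)
  const std::size_t total_memory = 64 << 20;    // 64 MiB overall budget (assumed)
  const lm::WordIndex vocab_estimate = 1000000; // rough guess at vocabulary size

  // Split the budget as the DedupeMultiplier comment describes:
  // usage = DedupeMultiplier(order) * block_size + block_count * block_size + vocab_hash_size.
  // (A real pipeline would first check that total_memory exceeds the vocab usage.)
  const std::size_t vocab_usage = lm::builder::CorpusCount::VocabUsage(vocab_estimate);
  const std::size_t chain_memory = static_cast<std::size_t>(
      static_cast<float>(total_memory - vocab_usage)
      / (static_cast<float>(block_count) + lm::builder::CorpusCount::DedupeMultiplier(order))
      * static_cast<float>(block_count));

  // Each chain entry is one n-gram record: order word indices plus a 64-bit count (assumed layout).
  util::stream::ChainConfig chain_config;
  chain_config.entry_size = order * sizeof(lm::WordIndex) + sizeof(uint64_t);
  chain_config.block_count = block_count;
  chain_config.total_memory = chain_memory;
  util::stream::Chain chain(chain_config);

  util::FilePiece text("corpus.txt");                       // hypothetical input corpus
  util::scoped_fd vocab(util::CreateOrThrow("vocab.bin"));  // vocabulary written to this fd

  uint64_t token_count = 0;
  lm::WordIndex type_count = vocab_estimate;  // in: estimate, out: exact vocabulary size
  std::vector<bool> prune_words;              // left empty: no word-level pruning
  lm::builder::CorpusCount counter(
      text, vocab.get(), token_count, type_count, prune_words,
      std::string() /* no prune-vocab file */,
      chain.BlockSize() / chain.EntrySize(), lm::COMPLAIN);

  // The chain runs counter.Run(position) on its own thread.  A real pipeline would
  // attach a downstream consumer (e.g. a sorter); here the blocks are simply recycled.
  chain >> boost::ref(counter) >> util::stream::kRecycle;
  chain.Wait();

  std::cerr << token_count << " tokens, " << type_count << " types" << std::endl;
  return 0;
}

Two design points worth noting, under the same assumptions: the counter is passed through boost::ref so it is not copied (it owns a util::scoped_malloc buffer and the caller needs the token/type counts written back), and entries_per_block is derived from the chain itself as chain.BlockSize() / chain.EntrySize(), presumably so the dedupe hash table is sized to exactly one block's worth of n-grams.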