Joshua
open source statistical hierarchical phrase-based machine translation system
lm/builder/corpus_count.hh

#ifndef LM_BUILDER_CORPUS_COUNT_H
#define LM_BUILDER_CORPUS_COUNT_H

#include "lm/lm_exception.hh"
#include "lm/word_index.hh"
#include "util/scoped.hh"

#include <cstddef>
#include <string>
#include <stdint.h>
#include <vector>

namespace util {
class FilePiece;
namespace stream {
class ChainPosition;
} // namespace stream
} // namespace util

namespace lm {
namespace builder {

class CorpusCount {
  public:
    // Memory usage will be DedupeMultiplier(order) * block_size + total_chain_size + unknown vocab_hash_size.
    static float DedupeMultiplier(std::size_t order);

    // How much memory the vocabulary will use, based on the estimated size of the vocab.
    static std::size_t VocabUsage(std::size_t vocab_estimate);

    // token_count: out.
    // type_count aka vocabulary size: initialize to an estimate; it is set to the exact value.
    CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count,
                WordIndex &type_count, std::vector<bool> &prune_words,
                const std::string &prune_vocab_filename,
                std::size_t entries_per_block, WarningAction disallowed_symbol);

    // Read the corpus, write the vocabulary to vocab_write, and stream n-gram counts
    // to the chain position.
    void Run(const util::stream::ChainPosition &position);

  private:
    util::FilePiece &from_;
    int vocab_write_;
    uint64_t &token_count_;
    WordIndex &type_count_;
    std::vector<bool> &prune_words_;
    const std::string &prune_vocab_filename_;

    // Backing memory for the hash table used to deduplicate n-grams within a block.
    std::size_t dedupe_mem_size_;
    util::scoped_malloc dedupe_mem_;

    WarningAction disallowed_symbol_action_;
};

} // namespace builder
} // namespace lm

#endif // LM_BUILDER_CORPUS_COUNT_H
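The header only declares the interface, so the sketch below shows how the pieces are intended to fit together: the two static estimators size the memory budget (per the comment on DedupeMultiplier, total usage is roughly DedupeMultiplier(order) * block_size + block_count * block_size + VocabUsage(vocab_estimate), which can be solved for the chain's share of a fixed budget), and the constructor plus Run() plug into KenLM's util::stream chain, whose worker thread calls Run() with a ChainPosition. This is a minimal, hypothetical driver, not code from the Joshua/KenLM sources: the file names (corpus.txt, vocab.bin), the order, block count, and memory budget, and the per-entry size formula (order word indices plus a 64-bit count) are all assumptions, and the util::stream and util calls (ChainConfig, Chain, kRecycle, scoped_fd, CreateOrThrow) are used as I understand them from KenLM's util library.

// count_demo.cc -- hypothetical driver, not part of the Joshua/KenLM sources.
#include "lm/builder/corpus_count.hh"
#include "lm/lm_exception.hh"
#include "lm/word_index.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
#include "util/stream/chain.hh"

#include <boost/ref.hpp>

#include <iostream>
#include <string>
#include <vector>
#include <stdint.h>

int main() {
  const std::size_t order = 3;                  // n-gram order (assumed)
  const std::size_t block_count = 2;            // number of chain blocks (assumed)
  const std::size_t total_memory = 64 << 20;    // 64 MiB overall budget (assumed)
  const lm::WordIndex vocab_estimate = 1000000; // rough guess at vocabulary size

  // Split the budget as the DedupeMultiplier comment describes:
  // usage = DedupeMultiplier(order) * block_size + block_count * block_size + vocab_hash_size.
  // (A real pipeline would first check that total_memory exceeds the vocab usage.)
  const std::size_t vocab_usage = lm::builder::CorpusCount::VocabUsage(vocab_estimate);
  const std::size_t chain_memory = static_cast<std::size_t>(
      static_cast<float>(total_memory - vocab_usage)
      / (static_cast<float>(block_count) + lm::builder::CorpusCount::DedupeMultiplier(order))
      * static_cast<float>(block_count));

  // Each chain entry is one n-gram record: order word indices plus a 64-bit count (assumed layout).
  util::stream::ChainConfig chain_config;
  chain_config.entry_size = order * sizeof(lm::WordIndex) + sizeof(uint64_t);
  chain_config.block_count = block_count;
  chain_config.total_memory = chain_memory;
  util::stream::Chain chain(chain_config);

  util::FilePiece text("corpus.txt");                       // hypothetical input corpus
  util::scoped_fd vocab(util::CreateOrThrow("vocab.bin"));  // vocabulary written to this fd

  uint64_t token_count = 0;
  lm::WordIndex type_count = vocab_estimate;  // in: estimate, out: exact vocabulary size
  std::vector<bool> prune_words;              // left empty: no word-level pruning
  lm::builder::CorpusCount counter(
      text, vocab.get(), token_count, type_count, prune_words,
      std::string() /* no prune-vocab file */,
      chain.BlockSize() / chain.EntrySize(), lm::COMPLAIN);

  // The chain runs counter.Run(position) on its own thread.  A real pipeline would
  // attach a downstream consumer (e.g. a sorter); here the blocks are simply recycled.
  chain >> boost::ref(counter) >> util::stream::kRecycle;
  chain.Wait();

  std::cerr << token_count << " tokens, " << type_count << " types" << std::endl;
  return 0;
}

Two design points worth noting, under the same assumptions: the counter is passed through boost::ref so it is not copied (it owns a util::scoped_malloc buffer and the caller needs the token/type counts written back), and entries_per_block is derived from the chain itself as chain.BlockSize() / chain.EntrySize(), presumably so the dedupe hash table is sized to exactly one block's worth of n-grams.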