Joshua
open source statistical hierarchical phrase-based machine translation system
|
00001 #ifndef LM_CONFIG_H 00002 #define LM_CONFIG_H 00003 00004 #include "lm/lm_exception.hh" 00005 #include "util/mmap.hh" 00006 00007 #include <iosfwd> 00008 #include <string> 00009 #include <vector> 00010 00011 /* Configuration for ngram model. Separate header to reduce pollution. */ 00012 00013 namespace lm { 00014 00015 class EnumerateVocab; 00016 00017 namespace ngram { 00018 00019 struct Config { 00020 // EFFECTIVE FOR BOTH ARPA AND BINARY READS 00021 00022 // (default true) print progress bar to messages 00023 bool show_progress; 00024 00025 // Where to log messages including the progress bar. Set to NULL for 00026 // silence. 00027 std::ostream *messages; 00028 00029 std::ostream *ProgressMessages() const { 00030 return show_progress ? messages : 0; 00031 } 00032 00033 // This will be called with every string in the vocabulary by the 00034 // constructor; it need only exist for the lifetime of the constructor. 00035 // See enumerate_vocab.hh for more detail. Config does not take ownership; 00036 // just delete/let it go out of scope after the constructor exits. 00037 EnumerateVocab *enumerate_vocab; 00038 00039 00040 // ONLY EFFECTIVE WHEN READING ARPA 00041 00042 // What to do when <unk> isn't in the provided model. 00043 WarningAction unknown_missing; 00044 // What to do when <s> or </s> is missing from the model. 00045 // If THROW_UP, the exception will be of type util::SpecialWordMissingException. 00046 WarningAction sentence_marker_missing; 00047 00048 // What to do with a positive log probability. For COMPLAIN and SILENT, map 00049 // to 0. 00050 WarningAction positive_log_probability; 00051 00052 // The probability to substitute for <unk> if it's missing from the model. 00053 // No effect if the model has <unk> or unknown_missing == THROW_UP. 00054 float unknown_missing_logprob; 00055 00056 // Size multiplier for probing hash table. Must be > 1. Space is linear in 00057 // this. Time is probing_multiplier / (probing_multiplier - 1). No effect 00058 // for sorted variant. 00059 // If you find yourself setting this to a low number, consider using the 00060 // TrieModel which has lower memory consumption. 00061 float probing_multiplier; 00062 00063 // Amount of memory to use for building. The actual memory usage will be 00064 // higher since this just sets sort buffer size. Only applies to trie 00065 // models. 00066 std::size_t building_memory; 00067 00068 // Template for temporary directory appropriate for passing to mkdtemp. 00069 // The characters XXXXXX are appended before passing to mkdtemp. Only 00070 // applies to trie. If empty, defaults to write_mmap. If that's NULL, 00071 // defaults to input file name. 00072 std::string temporary_directory_prefix; 00073 00074 // Level of complaining to do when loading from ARPA instead of binary format. 00075 enum ARPALoadComplain {ALL, EXPENSIVE, NONE}; 00076 ARPALoadComplain arpa_complain; 00077 00078 // While loading an ARPA file, also write out this binary format file. Set 00079 // to NULL to disable. 00080 const char *write_mmap; 00081 00082 enum WriteMethod { 00083 WRITE_MMAP, // Map the file directly. 00084 WRITE_AFTER // Write after we're done. 00085 }; 00086 WriteMethod write_method; 00087 00088 // Include the vocab in the binary file? Only effective if write_mmap != NULL. 00089 bool include_vocab; 00090 00091 00092 // Left rest options. Only used when the model includes rest costs. 00093 enum RestFunction { 00094 REST_MAX, // Maximum of any score to the left 00095 REST_LOWER, // Use lower-order files given below. 00096 }; 00097 RestFunction rest_function; 00098 // Only used for REST_LOWER. 00099 std::vector<std::string> rest_lower_files; 00100 00101 00102 // Quantization options. Only effective for QuantTrieModel. One value is 00103 // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used 00104 // to quantize (and one of the remaining backoffs will be 0). 00105 uint8_t prob_bits, backoff_bits; 00106 00107 // Bhiksha compression (simple form). Only works with trie. 00108 uint8_t pointer_bhiksha_bits; 00109 00110 00111 // ONLY EFFECTIVE WHEN READING BINARY 00112 00113 // How to get the giant array into memory: lazy mmap, populate, read etc. 00114 // See util/mmap.hh for details of MapMethod. 00115 util::LoadMethod load_method; 00116 00117 00118 // Set defaults. 00119 Config(); 00120 }; 00121 00122 } /* namespace ngram */ } /* namespace lm */ 00123 00124 #endif // LM_CONFIG_H