Joshua
open-source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/kenlm/lm/config.hh
00001 #ifndef LM_CONFIG_H
00002 #define LM_CONFIG_H
00003 
00004 #include "lm/lm_exception.hh"
00005 #include "util/mmap.hh"
00006 
00007 #include <iosfwd>
00008 #include <string>
00009 #include <vector>
00010 
00011 /* Configuration for ngram model.  Separate header to reduce pollution. */
00012 
00013 namespace lm {
00014 
00015 class EnumerateVocab;
00016 
00017 namespace ngram {
00018 
00019 struct Config {  // Options for loading an ngram model from ARPA or binary input.
00020   // EFFECTIVE FOR BOTH ARPA AND BINARY READS
00021 
00022   // (default true) print progress bar to messages
00023   bool show_progress;
00024 
00025   // Where to log messages including the progress bar.  Set to NULL for
00026   // silence.
00027   std::ostream *messages;
00028   // Progress-bar stream: messages if show_progress is set, otherwise null (0).
00029   std::ostream *ProgressMessages() const {
00030     return show_progress ? messages : 0;
00031   }
00032 
00033   // This will be called with every string in the vocabulary by the
00034   // constructor; it need only exist for the lifetime of the constructor.
00035   // See enumerate_vocab.hh for more detail.  Config does not take ownership;
00036   // just delete/let it go out of scope after the constructor exits.
00037   EnumerateVocab *enumerate_vocab;
00038 
00039 
00040   // ONLY EFFECTIVE WHEN READING ARPA
00041 
00042   // What to do when <unk> isn't in the provided model.
00043   WarningAction unknown_missing;
00044   // What to do when <s> or </s> is missing from the model.
00045   // If THROW_UP, the exception will be of type util::SpecialWordMissingException.
00046   WarningAction sentence_marker_missing;
00047 
00048   // What to do with a positive log probability.  For COMPLAIN and SILENT, map
00049   // to 0.
00050   WarningAction positive_log_probability;
00051 
00052   // The probability to substitute for <unk> if it's missing from the model.
00053   // No effect if the model has <unk> or unknown_missing == THROW_UP.
00054   float unknown_missing_logprob;
00055 
00056   // Size multiplier for probing hash table.  Must be > 1.  Space is linear in
00057   // this.  Time is probing_multiplier / (probing_multiplier - 1).  No effect
00058   // for sorted variant.
00059   // If you find yourself setting this to a low number, consider using the
00060   // TrieModel which has lower memory consumption.
00061   float probing_multiplier;
00062 
00063   // Amount of memory to use for building.  The actual memory usage will be
00064   // higher since this just sets sort buffer size.  Only applies to trie
00065   // models.
00066   std::size_t building_memory;
00067 
00068   // Template for temporary directory appropriate for passing to mkdtemp.
00069   // The characters XXXXXX are appended before passing to mkdtemp.  Only
00070   // applies to trie.  If empty, defaults to write_mmap.  If that's NULL,
00071   // defaults to input file name.
00072   std::string temporary_directory_prefix;
00073 
00074   // Level of complaining to do when loading from ARPA instead of binary format.
00075   enum ARPALoadComplain {ALL, EXPENSIVE, NONE};
00076   ARPALoadComplain arpa_complain;
00077 
00078   // While loading an ARPA file, also write out this binary format file.  Set
00079   // to NULL to disable.
00080   const char *write_mmap;
00081   // How to produce the binary file; presumably only relevant when write_mmap != NULL.
00082   enum WriteMethod {
00083     WRITE_MMAP, // Map the file directly.
00084     WRITE_AFTER // Write after we're done.
00085   };
00086   WriteMethod write_method;
00087 
00088   // Include the vocab in the binary file?  Only effective if write_mmap != NULL.
00089   bool include_vocab;
00090 
00091 
00092   // Left rest options.  Only used when the model includes rest costs.
00093   enum RestFunction {
00094     REST_MAX,   // Maximum of any score to the left
00095     REST_LOWER, // Use lower-order files given below.
00096   };
00097   RestFunction rest_function;
00098   // Only used for REST_LOWER.
00099   std::vector<std::string> rest_lower_files;
00100 
00101 
00102   // Quantization options.  Only effective for QuantTrieModel.  One value is
00103   // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
00104   // to quantize (and one of the remaining backoffs will be 0).
00105   uint8_t prob_bits, backoff_bits;
00106 
00107   // Bhiksha compression (simple form).  Only works with trie.
00108   uint8_t pointer_bhiksha_bits;
00109 
00110 
00111   // ONLY EFFECTIVE WHEN READING BINARY
00112 
00113   // How to get the giant array into memory: lazy mmap, populate, read etc.
00114   // See util/mmap.hh for details of LoadMethod.
00115   util::LoadMethod load_method;
00116 
00117 
00118   // Set defaults.  Declared only; the definition is not in this header.
00119   Config();
00120 };
00121 
00122 } /* namespace ngram */ } /* namespace lm */
00123 
00124 #endif // LM_CONFIG_H