Joshua — an open-source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/kenlm/lm/read_arpa.hh
00001 #ifndef LM_READ_ARPA_H
00002 #define LM_READ_ARPA_H
00003 
00004 #include "lm/lm_exception.hh"
00005 #include "lm/word_index.hh"
00006 #include "lm/weights.hh"
00007 #include "util/file_piece.hh"
00008 
00009 #include <cstddef>
00010 #include <iosfwd>
00011 #include <vector>
00012 
00013 namespace lm {
00014 
00015 void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number);
00016 void ReadNGramHeader(util::FilePiece &in, unsigned int length);
00017 
00018 void ReadBackoff(util::FilePiece &in, Prob &weights);
00019 void ReadBackoff(util::FilePiece &in, float &backoff);
00020 inline void ReadBackoff(util::FilePiece &in, ProbBackoff &weights) {
00021   ReadBackoff(in, weights.backoff);
00022 }
00023 inline void ReadBackoff(util::FilePiece &in, RestWeights &weights) {
00024   ReadBackoff(in, weights.backoff);
00025 }
00026 
00027 void ReadEnd(util::FilePiece &in);
00028 
00029 extern const bool kARPASpaces[256];
00030 
00031 // Positive log probability warning.
00032 class PositiveProbWarn {
00033   public:
00034     PositiveProbWarn() : action_(THROW_UP) {}
00035 
00036     explicit PositiveProbWarn(WarningAction action) : action_(action) {}
00037 
00038     void Warn(float prob);
00039 
00040   private:
00041     WarningAction action_;
00042 };
00043 
00044 template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
00045   try {
00046     float prob = f.ReadFloat();
00047     if (prob > 0.0) {
00048       warn.Warn(prob);
00049       prob = 0.0;
00050     }
00051     UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability");
00052     WordIndex word = vocab.Insert(f.ReadDelimited(kARPASpaces));
00053     Weights &w = unigrams[word];
00054     w.prob = prob;
00055     ReadBackoff(f, w);
00056   } catch(util::Exception &e) {
00057     e << " in the 1-gram at byte " << f.Offset();
00058     throw;
00059   }
00060 }
00061 
00062 template <class Voc, class Weights> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
00063   ReadNGramHeader(f, 1);
00064   for (std::size_t i = 0; i < count; ++i) {
00065     Read1Gram(f, vocab, unigrams, warn);
00066   }
00067   vocab.FinishedLoading(unigrams);
00068 }
00069 
00070 // Read ngram, write vocab ids to indices_out.
00071 template <class Voc, class Weights, class Iterator> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, Iterator indices_out, Weights &weights, PositiveProbWarn &warn) {
00072   try {
00073     weights.prob = f.ReadFloat();
00074     if (weights.prob > 0.0) {
00075       warn.Warn(weights.prob);
00076       weights.prob = 0.0;
00077     }
00078     for (unsigned char i = 0; i < n; ++i, ++indices_out) {
00079       StringPiece word(f.ReadDelimited(kARPASpaces));
00080       WordIndex index = vocab.Index(word);
00081       *indices_out = index;
00082       // Check for words mapped to <unk> that are not the string <unk>.
00083       UTIL_THROW_IF(index == 0 /* mapped to <unk> */ && (word != StringPiece("<unk>", 5)) && (word != StringPiece("<UNK>", 5)),
00084           FormatLoadException, "Word " << word << " was not seen in the unigrams (which are supposed to list the entire vocabulary) but appears");
00085     }
00086     ReadBackoff(f, weights);
00087   } catch(util::Exception &e) {
00088     e << " in the " << static_cast<unsigned int>(n) << "-gram at byte " << f.Offset();
00089     throw;
00090   }
00091 }
00092 
00093 } // namespace lm
00094 
00095 #endif // LM_READ_ARPA_H