Joshua
open source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/kenlm/util/tokenize_piece.hh
00001 #ifndef UTIL_TOKENIZE_PIECE_H
00002 #define UTIL_TOKENIZE_PIECE_H
00003 
00004 #include "util/exception.hh"
00005 #include "util/string_piece.hh"
00006 
00007 #include <boost/iterator/iterator_facade.hpp>
00008 
00009 #include <algorithm>
00010 #include <cstring>
00011 
00012 namespace util {
00013 
00014 // Thrown on dereference when out of tokens to parse
00015 class OutOfTokens : public Exception {
00016   public:
00017     OutOfTokens() throw() {}
00018     ~OutOfTokens() throw() {}
00019 };
00020 
00021 class SingleCharacter {
00022   public:
00023     SingleCharacter() {}
00024     explicit SingleCharacter(char delim) : delim_(delim) {}
00025 
00026     StringPiece Find(const StringPiece &in) const {
00027       return StringPiece(std::find(in.data(), in.data() + in.size(), delim_), 1);
00028     }
00029 
00030   private:
00031     char delim_;
00032 };
00033 
00034 class MultiCharacter {
00035   public:
00036     MultiCharacter() {}
00037 
00038     explicit MultiCharacter(const StringPiece &delimiter) : delimiter_(delimiter) {}
00039 
00040     StringPiece Find(const StringPiece &in) const {
00041       return StringPiece(std::search(in.data(), in.data() + in.size(), delimiter_.data(), delimiter_.data() + delimiter_.size()), delimiter_.size());
00042     }
00043 
00044   private:
00045     StringPiece delimiter_;
00046 };
00047 
00048 class AnyCharacter {
00049   public:
00050     AnyCharacter() {}
00051     explicit AnyCharacter(const StringPiece &chars) : chars_(chars) {}
00052 
00053     StringPiece Find(const StringPiece &in) const {
00054       return StringPiece(std::find_first_of(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1);
00055     }
00056 
00057   private:
00058     StringPiece chars_;
00059 };
00060 
00061 class BoolCharacter {
00062   public:
00063     BoolCharacter() {}
00064 
00065     explicit BoolCharacter(const bool *delimiter) { delimiter_ = delimiter; }
00066 
00067     StringPiece Find(const StringPiece &in) const {
00068       for (const char *i = in.data(); i != in.data() + in.size(); ++i) {
00069         if (delimiter_[static_cast<unsigned char>(*i)]) return StringPiece(i, 1);
00070       }
00071       return StringPiece(in.data() + in.size(), 0);
00072     }
00073 
00074     template <unsigned Length> static void Build(const char (&characters)[Length], bool (&out)[256]) {
00075       memset(out, 0, sizeof(out));
00076       for (const char *i = characters; i != characters + Length; ++i) {
00077         out[static_cast<unsigned char>(*i)] = true;
00078       }
00079     }
00080 
00081   private:
00082     const bool *delimiter_;
00083 };
00084 
00085 class AnyCharacterLast {
00086   public:
00087     AnyCharacterLast() {}
00088 
00089     explicit AnyCharacterLast(const StringPiece &chars) : chars_(chars) {}
00090 
00091     StringPiece Find(const StringPiece &in) const {
00092       return StringPiece(std::find_end(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1);
00093     }
00094 
00095   private:
00096     StringPiece chars_;
00097 };
00098 
00099 template <class Find, bool SkipEmpty = false> class TokenIter : public boost::iterator_facade<TokenIter<Find, SkipEmpty>, const StringPiece, boost::forward_traversal_tag> {
00100   public:
00101     TokenIter() {}
00102 
00103     template <class Construct> TokenIter(const StringPiece &str, const Construct &construct) : after_(str), finder_(construct) {
00104       increment();
00105     }
00106 
00107     bool operator!() const {
00108       return current_.data() == 0;
00109     }
00110     operator bool() const {
00111       return current_.data() != 0;
00112     }
00113 
00114     static TokenIter<Find, SkipEmpty> end() {
00115       return TokenIter<Find, SkipEmpty>();
00116     }
00117 
00118   private:
00119     friend class boost::iterator_core_access;
00120 
00121     void increment() {
00122       do {
00123         StringPiece found(finder_.Find(after_));
00124         current_ = StringPiece(after_.data(), found.data() - after_.data());
00125         if (found.data() == after_.data() + after_.size()) {
00126           after_ = StringPiece(NULL, 0);
00127         } else {
00128           after_ = StringPiece(found.data() + found.size(), after_.data() - found.data() + after_.size() - found.size());
00129         }
00130       } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false.
00131     }
00132 
00133     bool equal(const TokenIter<Find, SkipEmpty> &other) const {
00134       return current_.data() == other.current_.data();
00135     }
00136 
00137     const StringPiece &dereference() const {
00138       UTIL_THROW_IF(!current_.data(), OutOfTokens, "Ran out of tokens");
00139       return current_;
00140     }
00141 
00142     StringPiece current_;
00143     StringPiece after_;
00144 
00145     Find finder_;
00146 };
00147 
00148 } // namespace util
00149 
00150 #endif // UTIL_TOKENIZE_PIECE_H