Joshua
open source statistical hierarchical phrase-based machine translation system
|
00001 #ifndef UTIL_TOKENIZE_PIECE_H 00002 #define UTIL_TOKENIZE_PIECE_H 00003 00004 #include "util/exception.hh" 00005 #include "util/string_piece.hh" 00006 00007 #include <boost/iterator/iterator_facade.hpp> 00008 00009 #include <algorithm> 00010 #include <cstring> 00011 00012 namespace util { 00013 00014 // Thrown on dereference when out of tokens to parse 00015 class OutOfTokens : public Exception { 00016 public: 00017 OutOfTokens() throw() {} 00018 ~OutOfTokens() throw() {} 00019 }; 00020 00021 class SingleCharacter { 00022 public: 00023 SingleCharacter() {} 00024 explicit SingleCharacter(char delim) : delim_(delim) {} 00025 00026 StringPiece Find(const StringPiece &in) const { 00027 return StringPiece(std::find(in.data(), in.data() + in.size(), delim_), 1); 00028 } 00029 00030 private: 00031 char delim_; 00032 }; 00033 00034 class MultiCharacter { 00035 public: 00036 MultiCharacter() {} 00037 00038 explicit MultiCharacter(const StringPiece &delimiter) : delimiter_(delimiter) {} 00039 00040 StringPiece Find(const StringPiece &in) const { 00041 return StringPiece(std::search(in.data(), in.data() + in.size(), delimiter_.data(), delimiter_.data() + delimiter_.size()), delimiter_.size()); 00042 } 00043 00044 private: 00045 StringPiece delimiter_; 00046 }; 00047 00048 class AnyCharacter { 00049 public: 00050 AnyCharacter() {} 00051 explicit AnyCharacter(const StringPiece &chars) : chars_(chars) {} 00052 00053 StringPiece Find(const StringPiece &in) const { 00054 return StringPiece(std::find_first_of(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1); 00055 } 00056 00057 private: 00058 StringPiece chars_; 00059 }; 00060 00061 class BoolCharacter { 00062 public: 00063 BoolCharacter() {} 00064 00065 explicit BoolCharacter(const bool *delimiter) { delimiter_ = delimiter; } 00066 00067 StringPiece Find(const StringPiece &in) const { 00068 for (const char *i = in.data(); i != in.data() + in.size(); ++i) { 00069 if (delimiter_[static_cast<unsigned char>(*i)]) return StringPiece(i, 1); 00070 } 00071 return StringPiece(in.data() + in.size(), 0); 00072 } 00073 00074 template <unsigned Length> static void Build(const char (&characters)[Length], bool (&out)[256]) { 00075 memset(out, 0, sizeof(out)); 00076 for (const char *i = characters; i != characters + Length; ++i) { 00077 out[static_cast<unsigned char>(*i)] = true; 00078 } 00079 } 00080 00081 private: 00082 const bool *delimiter_; 00083 }; 00084 00085 class AnyCharacterLast { 00086 public: 00087 AnyCharacterLast() {} 00088 00089 explicit AnyCharacterLast(const StringPiece &chars) : chars_(chars) {} 00090 00091 StringPiece Find(const StringPiece &in) const { 00092 return StringPiece(std::find_end(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1); 00093 } 00094 00095 private: 00096 StringPiece chars_; 00097 }; 00098 00099 template <class Find, bool SkipEmpty = false> class TokenIter : public boost::iterator_facade<TokenIter<Find, SkipEmpty>, const StringPiece, boost::forward_traversal_tag> { 00100 public: 00101 TokenIter() {} 00102 00103 template <class Construct> TokenIter(const StringPiece &str, const Construct &construct) : after_(str), finder_(construct) { 00104 increment(); 00105 } 00106 00107 bool operator!() const { 00108 return current_.data() == 0; 00109 } 00110 operator bool() const { 00111 return current_.data() != 0; 00112 } 00113 00114 static TokenIter<Find, SkipEmpty> end() { 00115 return TokenIter<Find, SkipEmpty>(); 00116 } 00117 00118 private: 00119 friend class boost::iterator_core_access; 00120 00121 void increment() { 00122 do { 00123 StringPiece found(finder_.Find(after_)); 00124 current_ = StringPiece(after_.data(), found.data() - after_.data()); 00125 if (found.data() == after_.data() + after_.size()) { 00126 after_ = StringPiece(NULL, 0); 00127 } else { 00128 after_ = StringPiece(found.data() + found.size(), after_.data() - found.data() + after_.size() - found.size()); 00129 } 00130 } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false. 00131 } 00132 00133 bool equal(const TokenIter<Find, SkipEmpty> &other) const { 00134 return current_.data() == other.current_.data(); 00135 } 00136 00137 const StringPiece &dereference() const { 00138 UTIL_THROW_IF(!current_.data(), OutOfTokens, "Ran out of tokens"); 00139 return current_; 00140 } 00141 00142 StringPiece current_; 00143 StringPiece after_; 00144 00145 Find finder_; 00146 }; 00147 00148 } // namespace util 00149 00150 #endif // UTIL_TOKENIZE_PIECE_H