Joshua
open source statistical hierarchical phrase-based machine translation system
|
/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n.  If
 * you don't use ICU, then this will use the Google implementation from Chrome.
 * This has been modified from the original version to let you choose.
 */

// Copyright 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Copied from strings/stringpiece.h with modifications
//
// A string-like object that points to a sized piece of memory.
//
// Functions or methods may use const StringPiece& parameters to accept either
// a "const char*" or a "string" value that will be implicitly converted to
// a StringPiece.  The implicit conversion means that it is often appropriate
// to include this .h file in other files rather than forward-declaring
// StringPiece as would be appropriate for most other Google classes.
//
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
// conversions from "const char*" to "string" and back again.
//

#ifndef UTIL_STRING_PIECE_H
#define UTIL_STRING_PIECE_H

#include "util/have.hh"

#include <algorithm>
#include <cstring>
#include <iosfwd>
#include <iterator>
#include <ostream>

#ifdef HAVE_ICU
#include <unicode/stringpiece.h>
#include <unicode/uversion.h>

// Old versions of ICU don't define operator== and operator!=.
#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
#warning You are using an old version of ICU.  Consider upgrading to ICU >= 4.6.
// Equality for ICU's StringPiece, compiled only when the ICU version check
// above finds a release that lacks these operators.
inline bool operator==(const StringPiece& x, const StringPiece& y) {
  if (x.size() != y.size())
    return false;

  // Two empty pieces are equal without looking at data(): data() may be NULL,
  // and passing a null pointer to memcmp is undefined even for a zero length.
  return x.size() == 0 || std::memcmp(x.data(), y.data(), x.size()) == 0;
}

// Negation of operator== above.
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
  return !(x == y);
}
#endif // old version of ICU

U_NAMESPACE_BEGIN

// Returns true when the first prefix.size() bytes of "longer" equal "prefix".
// An empty prefix always matches.
inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) {
  int longersize = longer.size(), prefixsize = prefix.size();
  if (longersize < prefixsize)
    return false;
  // Skip memcmp for an empty prefix so a NULL data() is never passed to it.
  return prefixsize == 0 || std::memcmp(longer.data(), prefix.data(), prefixsize) == 0;
}

#else

#include <algorithm>
#include <cstddef>
#include <string>
#include <cstring>

#ifdef WIN32
#undef max
#undef min
#endif

// A pointer plus a length describing a run of bytes.  The class stores only
// the pointer it is given: it never copies the bytes and has no destructor,
// so the caller must keep the underlying buffer alive while the piece is used.
class StringPiece {
 public:
  typedef std::size_t size_type;

 private:
  const char* ptr_;   // First byte of the viewed range; NULL when default-constructed or cleared.
  size_type length_;  // Number of bytes in the viewed range.

 public:
  // We provide non-explicit singleton constructors so users can pass
  // in a "const char*" or a "string" wherever a "StringPiece" is
  // expected.
  StringPiece() : ptr_(NULL), length_(0) { }
  // A NULL str yields an empty piece; otherwise the length is strlen(str).
  StringPiece(const char* str)
    : ptr_(str), length_((str == NULL) ? 0 : std::strlen(str)) { }
  StringPiece(const std::string& str)
    : ptr_(str.data()), length_(str.size()) { }
  StringPiece(const char* offset, size_type len)
    : ptr_(offset), length_(len) { }

  // data() may return a pointer to a buffer with embedded NULs, and the
  // returned buffer may or may not be null terminated.  Therefore it is
  // typically a mistake to pass data() to a routine that expects a NUL
  // terminated string.
  const char* data() const { return ptr_; }
  size_type size() const { return length_; }
  size_type length() const { return length_; }
  bool empty() const { return length_ == 0; }

  // Resets to the default-constructed state (NULL pointer, zero length).
  void clear() { ptr_ = NULL; length_ = 0; }
  // Re-points this piece at len bytes starting at data.
  void set(const char* data, size_type len) { ptr_ = data; length_ = len; }
  // Re-points this piece at a NUL-terminated string; NULL gives length 0.
  void set(const char* str) {
    ptr_ = str;
    length_ = str ? std::strlen(str) : 0;
  }
  // Same as set(const char*, size_type) for untyped memory.
  void set(const void* data, size_type len) {
    ptr_ = reinterpret_cast<const char*>(data);
    length_ = len;
  }

  // Unchecked byte access: the caller must ensure i < size().
  char operator[](size_type i) const { return ptr_[i]; }

  // Drops the first n bytes.  No bounds check: the caller must ensure
  // n <= size(), otherwise the unsigned length wraps around.
  void remove_prefix(size_type n) {
    ptr_ += n;
    length_ -= n;
  }

  // Drops the last n bytes.  No bounds check: the caller must ensure
  // n <= size().
  void remove_suffix(size_type n) {
    length_ -= n;
  }

  // Three-way comparison: bytes are compared over the common length first;
  // if those are equal the shorter piece orders first.  Returns a negative
  // value, zero, or a positive value.
  int compare(const StringPiece& x) const {
    int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_));
    if (r == 0) {
      if (length_ < x.length_) r = -1;
      else if (length_ > x.length_) r = +1;
    }
    return r;
  }

  // Returns an owning copy of the viewed bytes.
  std::string as_string() const {
    // std::string doesn't like to take a NULL pointer even with a 0 size.
    return std::string(!empty() ? data() : "", size());
  }

  // Declared here, defined out of line.
  void CopyToString(std::string* target) const;
  void AppendToString(std::string* target) const;

  // Does "this" start with "x"
  bool starts_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (wordmemcmp(ptr_, x.ptr_, x.length_) == 0));
  }

  // Does "this" end with "x"
  bool ends_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
  }

  // standard STL container boilerplate
  typedef char value_type;
  typedef const char* pointer;
  typedef const char& reference;
  typedef const char& const_reference;
  typedef std::ptrdiff_t difference_type;
  static const size_type npos;  // Declared here, defined out of line.
  typedef const char* const_iterator;
  typedef const char* iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
  typedef std::reverse_iterator<iterator> reverse_iterator;
  iterator begin() const { return ptr_; }
  iterator end() const { return ptr_ + length_; }
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(ptr_ + length_);
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(ptr_);
  }

  // Both report the current length: the piece owns no spare storage.
  size_type max_size() const { return length_; }
  size_type capacity() const { return length_; }

  // The members below are declared here and defined out of line.
  size_type copy(char* buf, size_type n, size_type pos = 0) const;

  size_type find(const StringPiece& s, size_type pos = 0) const;
  size_type find(char c, size_type pos = 0) const;
  size_type rfind(const StringPiece& s, size_type pos = npos) const;
  size_type rfind(char c, size_type pos = npos) const;

  size_type find_first_of(const StringPiece& s, size_type pos = 0) const;
  // For a single character this is the same search as find(c, pos).
  size_type find_first_of(char c, size_type pos = 0) const {
    return find(c, pos);
  }
  size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const;
  size_type find_first_not_of(char c, size_type pos = 0) const;
  size_type find_last_of(const StringPiece& s, size_type pos = npos) const;
  // For a single character this is the same search as rfind(c, pos).
  size_type find_last_of(char c, size_type pos = npos) const {
    return rfind(c, pos);
  }
  size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const;
  size_type find_last_not_of(char c, size_type pos = npos) const;

  StringPiece substr(size_type pos, size_type n = npos) const;

  // memcmp over N bytes.  A zero-length comparison returns 0 without calling
  // memcmp, because an empty piece may hold a NULL pointer and memcmp has
  // undefined behaviour for null arguments even when N is 0.
  static int wordmemcmp(const char* p, const char* p2, size_type N) {
    return N == 0 ? 0 : std::memcmp(p, p2, N);
  }
};

// Two pieces are equal when they have the same length and the same bytes.
inline bool operator==(const StringPiece& x, const StringPiece& y) {
  if (x.size() != y.size())
    return false;

  // Two empty pieces are equal without looking at data(), which may be NULL.
  return x.size() == 0 || std::memcmp(x.data(), y.data(), x.size()) == 0;
}

inline bool operator!=(const StringPiece& x, const StringPiece& y) {
  return !(x == y);
}

// Free-function spelling of StringPiece::starts_with, matching the signature
// provided in the ICU branch.
inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) {
  return longer.starts_with(prefix);
}

#endif // HAVE_ICU undefined

// Ordering by memcmp over the common length, with the shorter piece first on
// a tie.  Shared by both the ICU and the built-in StringPiece.
inline bool operator<(const StringPiece& x, const StringPiece& y) {
  const std::size_t common = std::min(x.size(), y.size());
  // With no bytes in common there is nothing to compare, and data() may be
  // NULL, so memcmp is skipped.
  const int r = (common == 0) ? 0 : std::memcmp(x.data(), y.data(), common);
  return ((r < 0) || ((r == 0) && (x.size() < y.size())));
}

inline bool operator>(const StringPiece& x, const StringPiece& y) {
  return y < x;
}

inline bool operator<=(const StringPiece& x, const StringPiece& y) {
  return !(x > y);
}

inline bool operator>=(const StringPiece& x, const StringPiece& y) {
  return !(x < y);
}

// allow StringPiece to be logged (needed for unit testing).
// Writes exactly size() bytes, so embedded NULs are written as-is.
inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
  return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
}

#ifdef HAVE_ICU
U_NAMESPACE_END
using U_NAMESPACE_QUALIFIER StringPiece;
#endif

#endif // UTIL_STRING_PIECE_H