Joshua
open source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/kenlm/util/string_piece.hh
00001 /* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n.  If
00002  * you don't use ICU, then this will use the Google implementation from Chrome.
00003  * This has been modified from the original version to let you choose.
00004  */
00005 
00006 // Copyright 2008, Google Inc.
00007 // All rights reserved.
00008 //
00009 // Redistribution and use in source and binary forms, with or without
00010 // modification, are permitted provided that the following conditions are
00011 // met:
00012 //
00013 //    * Redistributions of source code must retain the above copyright
00014 // notice, this list of conditions and the following disclaimer.
00015 //    * Redistributions in binary form must reproduce the above
00016 // copyright notice, this list of conditions and the following disclaimer
00017 // in the documentation and/or other materials provided with the
00018 // distribution.
00019 //    * Neither the name of Google Inc. nor the names of its
00020 // contributors may be used to endorse or promote products derived from
00021 // this software without specific prior written permission.
00022 //
00023 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00024 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00025 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00026 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
00027 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00028 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
00029 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00030 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00031 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00032 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00033 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00034 // Copied from strings/stringpiece.h with modifications
00035 //
00036 // A string-like object that points to a sized piece of memory.
00037 //
00038 // Functions or methods may use const StringPiece& parameters to accept either
00039 // a "const char*" or a "string" value that will be implicitly converted to
00040 // a StringPiece.  The implicit conversion means that it is often appropriate
00041 // to include this .h file in other files rather than forward-declaring
00042 // StringPiece as would be appropriate for most other Google classes.
00043 //
00044 // Systematic usage of StringPiece is encouraged as it will reduce unnecessary
00045 // conversions from "const char*" to "string" and back again.
00046 //
00047 
00048 #ifndef UTIL_STRING_PIECE_H
00049 #define UTIL_STRING_PIECE_H
00050 
00051 #include "util/have.hh"
00052 
00053 #include <cstring>
00054 #include <iosfwd>
00055 #include <ostream>
00056 
00057 #ifdef HAVE_ICU
00058 #include <unicode/stringpiece.h>
00059 #include <unicode/uversion.h>
00060 
00061 // Old versions of ICU don't define operator== and operator!=.
00062 #if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
00063 #warning You are using an old version of ICU.  Consider upgrading to ICU >= 4.6.
00064 inline bool operator==(const StringPiece& x, const StringPiece& y) {
00065   if (x.size() != y.size())
00066     return false;
00067 
00068   return std::memcmp(x.data(), y.data(), x.size()) == 0;
00069 }
00070 
00071 inline bool operator!=(const StringPiece& x, const StringPiece& y) {
00072   return !(x == y);
00073 }
00074 #endif // old version of ICU
00075 
00076 U_NAMESPACE_BEGIN
00077 
00078 inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) {
00079   int longersize = longer.size(), prefixsize = prefix.size();
00080   return longersize >= prefixsize && std::memcmp(longer.data(), prefix.data(), prefixsize) == 0;
00081 }
00082 
00083 #else
00084 
00085 #include <algorithm>
00086 #include <cstddef>
00087 #include <string>
00088 #include <cstring>
00089 
00090 #ifdef WIN32
00091 #undef max
00092 #undef min
00093 #endif
00094 
// A view over a sized piece of memory: a pointer to the first byte plus a
// byte count.  The class only stores the pointer; it never copies or frees
// the bytes, so the memory must stay valid for as long as the piece is used.
class StringPiece {
 public:
  typedef std::size_t size_type;

 private:
  const char*   ptr_;     // First viewed byte; NULL for a default/cleared piece.
  size_type     length_;  // Number of viewed bytes.

 public:
  // We provide non-explicit singleton constructors so users can pass
  // in a "const char*" or a "string" wherever a "StringPiece" is
  // expected.

  // Empty piece: NULL pointer, zero length.
  StringPiece() : ptr_(NULL), length_(0) { }
  // Views a NUL-terminated C string; a NULL pointer yields length 0.
  StringPiece(const char* str)
    : ptr_(str), length_((str == NULL) ? 0 : std::strlen(str)) { }
  // Views the bytes currently held by a std::string.
  StringPiece(const std::string& str)
    : ptr_(str.data()), length_(str.size()) { }
  // Views exactly len bytes starting at offset.
  StringPiece(const char* offset, size_type len)
    : ptr_(offset), length_(len) { }

  // data() may return a pointer to a buffer with embedded NULs, and the
  // returned buffer may or may not be null terminated.  Therefore it is
  // typically a mistake to pass data() to a routine that expects a NUL
  // terminated string.
  const char* data() const { return ptr_; }
  size_type size() const { return length_; }
  size_type length() const { return length_; }
  bool empty() const { return length_ == 0; }

  // Resets to the same state as a default-constructed piece.
  void clear() { ptr_ = NULL; length_ = 0; }
  // Re-points the piece at len bytes starting at data.
  void set(const char* data, size_type len) { ptr_ = data; length_ = len; }
  // Re-points the piece at a NUL-terminated C string; NULL yields length 0.
  void set(const char* str) {
    ptr_ = str;
    length_ = str ? std::strlen(str) : 0;
  }
  // Re-points the piece at len bytes of untyped memory.
  void set(const void* data, size_type len) {
    ptr_ = reinterpret_cast<const char*>(data);
    length_ = len;
  }

  // Unchecked element access: i is not compared against size().
  char operator[](size_type i) const { return ptr_[i]; }

  // Drops the first n bytes from the view.  No bounds check is performed and
  // length_ is unsigned, so the caller must ensure n <= size().
  void remove_prefix(size_type n) {
    ptr_ += n;
    length_ -= n;
  }

  // Drops the last n bytes from the view.  No bounds check is performed, so
  // the caller must ensure n <= size().
  void remove_suffix(size_type n) {
    length_ -= n;
  }

  // Three-way comparison.  The bytes shared by both pieces are compared with
  // wordmemcmp(); when they are all equal the shorter piece orders first.
  // Returns a negative value, zero, or a positive value.
  int compare(const StringPiece& x) const {
    int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_));
    if (r == 0) {
      if (length_ < x.length_) r = -1;
      else if (length_ > x.length_) r = +1;
    }
    return r;
  }

  // Returns an owning std::string copy of the viewed bytes.
  std::string as_string() const {
    // std::string doesn't like to take a NULL pointer even with a 0 size.
    return std::string(!empty() ? data() : "", size());
  }

  // Declared here, defined out of line.
  void CopyToString(std::string* target) const;
  void AppendToString(std::string* target) const;

  // Does "this" start with "x"
  bool starts_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (wordmemcmp(ptr_, x.ptr_, x.length_) == 0));
  }

  // Does "this" end with "x"
  bool ends_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
  }

  // standard STL container boilerplate
  typedef char value_type;
  typedef const char* pointer;
  typedef const char& reference;
  typedef const char& const_reference;
  typedef std::ptrdiff_t difference_type;
  static const size_type npos;  // Declared here, defined out of line.
  typedef const char* const_iterator;
  typedef const char* iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
  typedef std::reverse_iterator<iterator> reverse_iterator;
  iterator begin() const { return ptr_; }
  iterator end() const { return ptr_ + length_; }
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(ptr_ + length_);
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(ptr_);
  }

  // A piece cannot grow, so both of these simply report the current length.
  size_type max_size() const { return length_; }
  size_type capacity() const { return length_; }

  // The search/copy members below are only declared in this class; their
  // definitions live out of line.
  // NOTE(review): presumably they mirror the std::string members of the same
  // names -- confirm against the implementation file.
  size_type copy(char* buf, size_type n, size_type pos = 0) const;

  size_type find(const StringPiece& s, size_type pos = 0) const;
  size_type find(char c, size_type pos = 0) const;
  size_type rfind(const StringPiece& s, size_type pos = npos) const;
  size_type rfind(char c, size_type pos = npos) const;

  size_type find_first_of(const StringPiece& s, size_type pos = 0) const;
  // Searching for a single character is the same as find(c, pos).
  size_type find_first_of(char c, size_type pos = 0) const {
    return find(c, pos);
  }
  size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const;
  size_type find_first_not_of(char c, size_type pos = 0) const;
  size_type find_last_of(const StringPiece& s, size_type pos = npos) const;
  // Searching backwards for a single character is the same as rfind(c, pos).
  size_type find_last_of(char c, size_type pos = npos) const {
    return rfind(c, pos);
  }
  size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const;
  size_type find_last_not_of(char c, size_type pos = npos) const;

  StringPiece substr(size_type pos, size_type n = npos) const;

  // Compares the first N bytes at p and p2 with memcmp semantics.  A zero
  // length compares equal without calling memcmp: empty pieces may hold NULL
  // pointers, and a null pointer argument to memcmp is undefined even when
  // the count is zero.
  static int wordmemcmp(const char* p, const char* p2, size_type N) {
    if (N == 0)
      return 0;
    return std::memcmp(p, p2, N);
  }
};
00224 
00225 inline bool operator==(const StringPiece& x, const StringPiece& y) {
00226   if (x.size() != y.size())
00227     return false;
00228 
00229   return std::memcmp(x.data(), y.data(), x.size()) == 0;
00230 }
00231 
00232 inline bool operator!=(const StringPiece& x, const StringPiece& y) {
00233   return !(x == y);
00234 }
00235 
00236 inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) {
00237   return longer.starts_with(prefix);
00238 }
00239 
00240 #endif // HAVE_ICU undefined
00241 
00242 inline bool operator<(const StringPiece& x, const StringPiece& y) {
00243   const int r = std::memcmp(x.data(), y.data(),
00244                                        std::min(x.size(), y.size()));
00245   return ((r < 0) || ((r == 0) && (x.size() < y.size())));
00246 }
00247 
00248 inline bool operator>(const StringPiece& x, const StringPiece& y) {
00249   return y < x;
00250 }
00251 
00252 inline bool operator<=(const StringPiece& x, const StringPiece& y) {
00253   return !(x > y);
00254 }
00255 
00256 inline bool operator>=(const StringPiece& x, const StringPiece& y) {
00257   return !(x < y);
00258 }
00259 
00260 // allow StringPiece to be logged (needed for unit testing).
00261 inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
00262   return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
00263 }
00264 
00265 #ifdef HAVE_ICU
00266 U_NAMESPACE_END
00267 using U_NAMESPACE_QUALIFIER StringPiece;
00268 #endif
00269 
00270 #endif  // UTIL_STRING_PIECE_H