Joshua
open source statistical hierarchical phrase-based machine translation system
|
00001 #ifndef UTIL_FILE_PIECE_H 00002 #define UTIL_FILE_PIECE_H 00003 00004 #include "util/ersatz_progress.hh" 00005 #include "util/exception.hh" 00006 #include "util/file.hh" 00007 #include "util/mmap.hh" 00008 #include "util/read_compressed.hh" 00009 #include "util/string_piece.hh" 00010 00011 #include <cstddef> 00012 #include <iosfwd> 00013 #include <string> 00014 #include <cassert> 00015 #include <stdint.h> 00016 00017 namespace util { 00018 00019 class ParseNumberException : public Exception { 00020 public: 00021 explicit ParseNumberException(StringPiece value) throw(); 00022 ~ParseNumberException() throw() {} 00023 }; 00024 00025 extern const bool kSpaces[256]; 00026 00027 // Memory backing the returned StringPiece may vanish on the next call. 00028 class FilePiece { 00029 public: 00030 // 1 MB default. 00031 explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); 00032 // Takes ownership of fd. name is used for messages. 00033 explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); 00034 00035 /* Read from an istream. Don't use this if you can avoid it. Raw fd IO is 00036 * much faster. But sometimes you just have an istream like Boost's HTTP 00037 * server and want to parse it the same way. 00038 * name is just used for messages and FileName(). 00039 */ 00040 explicit FilePiece(std::istream &stream, const char *name = NULL, std::size_t min_buffer = 1048576); 00041 00042 ~FilePiece(); 00043 00044 char get() { 00045 if (position_ == position_end_) { 00046 Shift(); 00047 if (at_end_) throw EndOfFileException(); 00048 } 00049 return *(position_++); 00050 } 00051 00052 // Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace(). 00053 StringPiece ReadDelimited(const bool *delim = kSpaces) { 00054 SkipSpaces(delim); 00055 return Consume(FindDelimiterOrEOF(delim)); 00056 } 00057 00059 bool ReadWordSameLine(StringPiece &to, const bool *delim = kSpaces) { 00060 assert(delim[static_cast<unsigned char>('\n')]); 00061 // Skip non-enter spaces. 00062 for (; ; ++position_) { 00063 if (position_ == position_end_) { 00064 try { 00065 Shift(); 00066 } catch (const util::EndOfFileException &e) { return false; } 00067 // And break out at end of file. 00068 if (position_ == position_end_) return false; 00069 } 00070 if (!delim[static_cast<unsigned char>(*position_)]) break; 00071 if (*position_ == '\n') return false; 00072 } 00073 // We can't be at the end of file because there's at least one character open. 00074 to = Consume(FindDelimiterOrEOF(delim)); 00075 return true; 00076 } 00077 00090 StringPiece ReadLine(char delim = '\n', bool strip_cr = true); 00091 00101 bool ReadLineOrEOF(StringPiece &to, char delim = '\n', bool strip_cr = true); 00102 00103 float ReadFloat(); 00104 double ReadDouble(); 00105 long int ReadLong(); 00106 unsigned long int ReadULong(); 00107 00108 // Skip spaces defined by isspace. 00109 void SkipSpaces(const bool *delim = kSpaces) { 00110 assert(position_ <= position_end_); 00111 for (; ; ++position_) { 00112 if (position_ == position_end_) { 00113 Shift(); 00114 // And break out at end of file. 00115 if (position_ == position_end_) return; 00116 } 00117 assert(position_ < position_end_); 00118 if (!delim[static_cast<unsigned char>(*position_)]) return; 00119 } 00120 } 00121 00122 uint64_t Offset() const { 00123 return position_ - data_.begin() + mapped_offset_; 00124 } 00125 00126 const std::string &FileName() const { return file_name_; } 00127 00128 private: 00129 void InitializeNoRead(const char *name, std::size_t min_buffer); 00130 // Calls InitializeNoRead, so don't call both. 00131 void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer); 00132 00133 template <class T> T ReadNumber(); 00134 00135 StringPiece Consume(const char *to) { 00136 assert(to >= position_); 00137 StringPiece ret(position_, to - position_); 00138 position_ = to; 00139 return ret; 00140 } 00141 00142 const char *FindDelimiterOrEOF(const bool *delim = kSpaces); 00143 00144 void Shift(); 00145 // Backends to Shift(). 00146 void MMapShift(uint64_t desired_begin); 00147 00148 void TransitionToRead(); 00149 void ReadShift(); 00150 00151 const char *position_, *last_space_, *position_end_; 00152 00153 scoped_fd file_; 00154 const uint64_t total_size_; 00155 const uint64_t page_; 00156 00157 std::size_t default_map_size_; 00158 uint64_t mapped_offset_; 00159 00160 // Order matters: file_ should always be destroyed after this. 00161 scoped_memory data_; 00162 00163 bool at_end_; 00164 bool fallback_to_read_; 00165 00166 ErsatzProgress progress_; 00167 00168 std::string file_name_; 00169 00170 ReadCompressed fell_back_; 00171 }; 00172 00173 } // namespace util 00174 00175 #endif // UTIL_FILE_PIECE_H