Joshua
open source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/kenlm/util/file_piece.hh
00001 #ifndef UTIL_FILE_PIECE_H
00002 #define UTIL_FILE_PIECE_H
00003 
00004 #include "util/ersatz_progress.hh"
00005 #include "util/exception.hh"
00006 #include "util/file.hh"
00007 #include "util/mmap.hh"
00008 #include "util/read_compressed.hh"
00009 #include "util/string_piece.hh"
00010 
00011 #include <cstddef>
00012 #include <iosfwd>
00013 #include <string>
00014 #include <cassert>
00015 #include <stdint.h>
00016 
00017 namespace util {
00018 
00019 class ParseNumberException : public Exception {
00020   public:
00021     explicit ParseNumberException(StringPiece value) throw();
00022     ~ParseNumberException() throw() {}
00023 };
00024 
00025 extern const bool kSpaces[256];
00026 
00027 // Memory backing the returned StringPiece may vanish on the next call.
00028 class FilePiece {
00029   public:
00030     // 1 MB default.
00031     explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
00032     // Takes ownership of fd.  name is used for messages.
00033     explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
00034 
00035     /* Read from an istream.  Don't use this if you can avoid it.  Raw fd IO is
00036      * much faster.  But sometimes you just have an istream like Boost's HTTP
00037      * server and want to parse it the same way.
00038      * name is just used for messages and FileName().
00039      */
00040     explicit FilePiece(std::istream &stream, const char *name = NULL, std::size_t min_buffer = 1048576);
00041 
00042     ~FilePiece();
00043 
00044     char get() {
00045       if (position_ == position_end_) {
00046         Shift();
00047         if (at_end_) throw EndOfFileException();
00048       }
00049       return *(position_++);
00050     }
00051 
00052     // Leaves the delimiter, if any, to be returned by get().  Delimiters defined by isspace().
00053     StringPiece ReadDelimited(const bool *delim = kSpaces) {
00054       SkipSpaces(delim);
00055       return Consume(FindDelimiterOrEOF(delim));
00056     }
00057 
00059     bool ReadWordSameLine(StringPiece &to, const bool *delim = kSpaces) {
00060       assert(delim[static_cast<unsigned char>('\n')]);
00061       // Skip non-enter spaces.
00062       for (; ; ++position_) {
00063         if (position_ == position_end_) {
00064           try {
00065             Shift();
00066           } catch (const util::EndOfFileException &e) { return false; }
00067           // And break out at end of file.
00068           if (position_ == position_end_) return false;
00069         }
00070         if (!delim[static_cast<unsigned char>(*position_)]) break;
00071         if (*position_ == '\n') return false;
00072       }
00073       // We can't be at the end of file because there's at least one character open.
00074       to = Consume(FindDelimiterOrEOF(delim));
00075       return true;
00076     }
00077 
00090     StringPiece ReadLine(char delim = '\n', bool strip_cr = true);
00091 
00101     bool ReadLineOrEOF(StringPiece &to, char delim = '\n', bool strip_cr = true);
00102 
00103     float ReadFloat();
00104     double ReadDouble();
00105     long int ReadLong();
00106     unsigned long int ReadULong();
00107 
00108     // Skip spaces defined by isspace.
00109     void SkipSpaces(const bool *delim = kSpaces) {
00110       assert(position_ <= position_end_);
00111       for (; ; ++position_) {
00112         if (position_ == position_end_) {
00113           Shift();
00114           // And break out at end of file.
00115           if (position_ == position_end_) return;
00116         }
00117         assert(position_ < position_end_);
00118         if (!delim[static_cast<unsigned char>(*position_)]) return;
00119       }
00120     }
00121 
00122     uint64_t Offset() const {
00123       return position_ - data_.begin() + mapped_offset_;
00124     }
00125 
00126     const std::string &FileName() const { return file_name_; }
00127 
00128   private:
00129     void InitializeNoRead(const char *name, std::size_t min_buffer);
00130     // Calls InitializeNoRead, so don't call both.
00131     void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);
00132 
00133     template <class T> T ReadNumber();
00134 
00135     StringPiece Consume(const char *to) {
00136       assert(to >= position_);
00137       StringPiece ret(position_, to - position_);
00138       position_ = to;
00139       return ret;
00140     }
00141 
00142     const char *FindDelimiterOrEOF(const bool *delim = kSpaces);
00143 
00144     void Shift();
00145     // Backends to Shift().
00146     void MMapShift(uint64_t desired_begin);
00147 
00148     void TransitionToRead();
00149     void ReadShift();
00150 
00151     const char *position_, *last_space_, *position_end_;
00152 
00153     scoped_fd file_;
00154     const uint64_t total_size_;
00155     const uint64_t page_;
00156 
00157     std::size_t default_map_size_;
00158     uint64_t mapped_offset_;
00159 
00160     // Order matters: file_ should always be destroyed after this.
00161     scoped_memory data_;
00162 
00163     bool at_end_;
00164     bool fallback_to_read_;
00165 
00166     ErsatzProgress progress_;
00167 
00168     std::string file_name_;
00169 
00170     ReadCompressed fell_back_;
00171 };
00172 
00173 } // namespace util
00174 
00175 #endif // UTIL_FILE_PIECE_H