414 lines
16 KiB
C++
414 lines
16 KiB
C++
// Protocol Buffers - Google's data interchange format
|
|
// Copyright 2008 Google Inc. All rights reserved.
|
|
// https://developers.google.com/protocol-buffers/
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are
|
|
// met:
|
|
//
|
|
// * Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
// * Redistributions in binary form must reproduce the above
|
|
// copyright notice, this list of conditions and the following disclaimer
|
|
// in the documentation and/or other materials provided with the
|
|
// distribution.
|
|
// * Neither the name of Google Inc. nor the names of its
|
|
// contributors may be used to endorse or promote products derived from
|
|
// this software without specific prior written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
// Author: kenton@google.com (Kenton Varda)
|
|
// Based on original Protocol Buffers design by
|
|
// Sanjay Ghemawat, Jeff Dean, and others.
|
|
//
|
|
// Class for parsing tokenized text from a ZeroCopyInputStream.
|
|
|
|
#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
|
|
#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
|
|
|
|
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include <google/protobuf/stubs/common.h>
|
|
#include <google/protobuf/stubs/logging.h>
|
|
#include <google/protobuf/port_def.inc>
|
|
|
|
namespace google {
|
|
namespace protobuf {
|
|
namespace io {
|
|
|
|
class ZeroCopyInputStream; // zero_copy_stream.h
|
|
|
|
// Defined in this file.
|
|
class ErrorCollector;
|
|
class Tokenizer;
|
|
|
|
// By "column number", the proto compiler refers to a count of the number
|
|
// of bytes before a given byte, except that a tab character advances to
|
|
// the next multiple of 8 bytes. Note in particular that column numbers
|
|
// are zero-based, while many user interfaces use one-based column numbers.
|
|
typedef int ColumnNumber;
|
|
|
|
// Abstract interface for an object which collects the errors that occur
|
|
// during parsing. A typical implementation might simply print the errors
|
|
// to stdout.
|
|
class PROTOBUF_EXPORT ErrorCollector {
|
|
public:
|
|
inline ErrorCollector() {}
|
|
virtual ~ErrorCollector();
|
|
|
|
// Indicates that there was an error in the input at the given line and
|
|
// column numbers. The numbers are zero-based, so you may want to add
|
|
// 1 to each before printing them.
|
|
virtual void AddError(int line, ColumnNumber column,
|
|
const std::string& message) = 0;
|
|
|
|
// Indicates that there was a warning in the input at the given line and
|
|
// column numbers. The numbers are zero-based, so you may want to add
|
|
// 1 to each before printing them.
|
|
virtual void AddWarning(int line, ColumnNumber column,
|
|
const std::string& message) {}
|
|
|
|
private:
|
|
GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
|
|
};
|
|
|
|
// This class converts a stream of raw text into a stream of tokens for
|
|
// the protocol definition parser to parse. The tokens recognized are
|
|
// similar to those that make up the C language; see the TokenType enum for
|
|
// precise descriptions. Whitespace and comments are skipped. By default,
|
|
// C- and C++-style comments are recognized, but other styles can be used by
|
|
// calling set_comment_style().
|
|
class PROTOBUF_EXPORT Tokenizer {
|
|
public:
|
|
// Construct a Tokenizer that reads and tokenizes text from the given
|
|
// input stream and writes errors to the given error_collector.
|
|
// The caller keeps ownership of input and error_collector.
|
|
Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
|
|
~Tokenizer();
|
|
|
|
enum TokenType {
|
|
TYPE_START, // Next() has not yet been called.
|
|
TYPE_END, // End of input reached. "text" is empty.
|
|
|
|
TYPE_IDENTIFIER, // A sequence of letters, digits, and underscores, not
|
|
// starting with a digit. It is an error for a number
|
|
// to be followed by an identifier with no space in
|
|
// between.
|
|
TYPE_INTEGER, // A sequence of digits representing an integer. Normally
|
|
// the digits are decimal, but a prefix of "0x" indicates
|
|
// a hex number and a leading zero indicates octal, just
|
|
// like with C numeric literals. A leading negative sign
|
|
// is NOT included in the token; it's up to the parser to
|
|
// interpret the unary minus operator on its own.
|
|
TYPE_FLOAT, // A floating point literal, with a fractional part and/or
|
|
// an exponent. Always in decimal. Again, never
|
|
// negative.
|
|
TYPE_STRING, // A quoted sequence of escaped characters. Either single
|
|
// or double quotes can be used, but they must match.
|
|
// A string literal cannot cross a line break.
|
|
TYPE_SYMBOL, // Any other printable character, like '!' or '+'.
|
|
// Symbols are always a single character, so "!+$%" is
|
|
// four tokens.
|
|
};
|
|
|
|
// Structure representing a token read from the token stream.
|
|
struct Token {
|
|
TokenType type;
|
|
std::string text; // The exact text of the token as it appeared in
|
|
// the input. e.g. tokens of TYPE_STRING will still
|
|
// be escaped and in quotes.
|
|
|
|
// "line" and "column" specify the position of the first character of
|
|
// the token within the input stream. They are zero-based.
|
|
int line;
|
|
ColumnNumber column;
|
|
ColumnNumber end_column;
|
|
};
|
|
|
|
// Get the current token. This is updated when Next() is called. Before
|
|
// the first call to Next(), current() has type TYPE_START and no contents.
|
|
const Token& current();
|
|
|
|
// Return the previous token -- i.e. what current() returned before the
|
|
// previous call to Next().
|
|
const Token& previous();
|
|
|
|
// Advance to the next token. Returns false if the end of the input is
|
|
// reached.
|
|
bool Next();
|
|
|
|
// Like Next(), but also collects comments which appear between the previous
|
|
// and next tokens.
|
|
//
|
|
// Comments which appear to be attached to the previous token are stored
|
|
// in *prev_tailing_comments. Comments which appear to be attached to the
|
|
// next token are stored in *next_leading_comments. Comments appearing in
|
|
// between which do not appear to be attached to either will be added to
|
|
// detached_comments. Any of these parameters can be NULL to simply discard
|
|
// the comments.
|
|
//
|
|
// A series of line comments appearing on consecutive lines, with no other
|
|
// tokens appearing on those lines, will be treated as a single comment.
|
|
//
|
|
// Only the comment content is returned; comment markers (e.g. //) are
|
|
// stripped out. For block comments, leading whitespace and an asterisk will
|
|
// be stripped from the beginning of each line other than the first. Newlines
|
|
// are included in the output.
|
|
//
|
|
// Examples:
|
|
//
|
|
// optional int32 foo = 1; // Comment attached to foo.
|
|
// // Comment attached to bar.
|
|
// optional int32 bar = 2;
|
|
//
|
|
// optional string baz = 3;
|
|
// // Comment attached to baz.
|
|
// // Another line attached to baz.
|
|
//
|
|
// // Comment attached to qux.
|
|
// //
|
|
// // Another line attached to qux.
|
|
// optional double qux = 4;
|
|
//
|
|
// // Detached comment. This is not attached to qux or corge
|
|
// // because there are blank lines separating it from both.
|
|
//
|
|
// optional string corge = 5;
|
|
// /* Block comment attached
|
|
// * to corge. Leading asterisks
|
|
// * will be removed. */
|
|
// /* Block comment attached to
|
|
// * grault. */
|
|
// optional int32 grault = 6;
|
|
bool NextWithComments(std::string* prev_trailing_comments,
|
|
std::vector<std::string>* detached_comments,
|
|
std::string* next_leading_comments);
|
|
|
|
// Parse helpers ---------------------------------------------------
|
|
|
|
// Parses a TYPE_FLOAT token. This never fails, so long as the text actually
|
|
// comes from a TYPE_FLOAT token parsed by Tokenizer. If it doesn't, the
|
|
// result is undefined (possibly an assert failure).
|
|
static double ParseFloat(const std::string& text);
|
|
|
|
// Parses a TYPE_STRING token. This never fails, so long as the text actually
|
|
// comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the
|
|
// result is undefined (possibly an assert failure).
|
|
static void ParseString(const std::string& text, std::string* output);
|
|
|
|
// Identical to ParseString, but appends to output.
|
|
static void ParseStringAppend(const std::string& text, std::string* output);
|
|
|
|
// Parses a TYPE_INTEGER token. Returns false if the result would be
|
|
// greater than max_value. Otherwise, returns true and sets *output to the
|
|
// result. If the text is not from a Token of type TYPE_INTEGER originally
|
|
// parsed by a Tokenizer, the result is undefined (possibly an assert
|
|
// failure).
|
|
static bool ParseInteger(const std::string& text, uint64 max_value,
|
|
uint64* output);
|
|
|
|
// Options ---------------------------------------------------------
|
|
|
|
// Set true to allow floats to be suffixed with the letter 'f'. Tokens
|
|
// which would otherwise be integers but which have the 'f' suffix will be
|
|
// forced to be interpreted as floats. For all other purposes, the 'f' is
|
|
// ignored.
|
|
void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
|
|
|
|
// Valid values for set_comment_style().
|
|
enum CommentStyle {
|
|
// Line comments begin with "//", block comments are delimited by "/*" and
|
|
// "*/".
|
|
CPP_COMMENT_STYLE,
|
|
// Line comments begin with "#". No way to write block comments.
|
|
SH_COMMENT_STYLE
|
|
};
|
|
|
|
// Sets the comment style.
|
|
void set_comment_style(CommentStyle style) { comment_style_ = style; }
|
|
|
|
// Whether to require whitespace between a number and a field name.
|
|
// Default is true. Do not use this; for Google-internal cleanup only.
|
|
void set_require_space_after_number(bool require) {
|
|
require_space_after_number_ = require;
|
|
}
|
|
|
|
// Whether to allow string literals to span multiple lines. Default is false.
|
|
// Do not use this; for Google-internal cleanup only.
|
|
void set_allow_multiline_strings(bool allow) {
|
|
allow_multiline_strings_ = allow;
|
|
}
|
|
|
|
// External helper: validate an identifier.
|
|
static bool IsIdentifier(const std::string& text);
|
|
|
|
// -----------------------------------------------------------------
|
|
private:
|
|
GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);
|
|
|
|
Token current_; // Returned by current().
|
|
Token previous_; // Returned by previous().
|
|
|
|
ZeroCopyInputStream* input_;
|
|
ErrorCollector* error_collector_;
|
|
|
|
char current_char_; // == buffer_[buffer_pos_], updated by NextChar().
|
|
const char* buffer_; // Current buffer returned from input_.
|
|
int buffer_size_; // Size of buffer_.
|
|
int buffer_pos_; // Current position within the buffer.
|
|
bool read_error_; // Did we previously encounter a read error?
|
|
|
|
// Line and column number of current_char_ within the whole input stream.
|
|
int line_;
|
|
ColumnNumber column_;
|
|
|
|
// String to which text should be appended as we advance through it.
|
|
// Call RecordTo(&str) to start recording and StopRecording() to stop.
|
|
// E.g. StartToken() calls RecordTo(¤t_.text). record_start_ is the
|
|
// position within the current buffer where recording started.
|
|
std::string* record_target_;
|
|
int record_start_;
|
|
|
|
// Options.
|
|
bool allow_f_after_float_;
|
|
CommentStyle comment_style_;
|
|
bool require_space_after_number_;
|
|
bool allow_multiline_strings_;
|
|
|
|
// Since we count columns we need to interpret tabs somehow. We'll take
|
|
// the standard 8-character definition for lack of any way to do better.
|
|
// This must match the documentation of ColumnNumber.
|
|
static const int kTabWidth = 8;
|
|
|
|
// -----------------------------------------------------------------
|
|
// Helper methods.
|
|
|
|
// Consume this character and advance to the next one.
|
|
void NextChar();
|
|
|
|
// Read a new buffer from the input.
|
|
void Refresh();
|
|
|
|
inline void RecordTo(std::string* target);
|
|
inline void StopRecording();
|
|
|
|
// Called when the current character is the first character of a new
|
|
// token (not including whitespace or comments).
|
|
inline void StartToken();
|
|
// Called when the current character is the first character after the
|
|
// end of the last token. After this returns, current_.text will
|
|
// contain all text consumed since StartToken() was called.
|
|
inline void EndToken();
|
|
|
|
// Convenience method to add an error at the current line and column.
|
|
void AddError(const std::string& message) {
|
|
error_collector_->AddError(line_, column_, message);
|
|
}
|
|
|
|
// -----------------------------------------------------------------
|
|
// The following four methods are used to consume tokens of specific
|
|
// types. They are actually used to consume all characters *after*
|
|
// the first, since the calling function consumes the first character
|
|
// in order to decide what kind of token is being read.
|
|
|
|
// Read and consume a string, ending when the given delimiter is
|
|
// consumed.
|
|
void ConsumeString(char delimiter);
|
|
|
|
// Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
|
|
// depending on what was read. This needs to know if the first
|
|
// character was a zero in order to correctly recognize hex and octal
|
|
// numbers.
|
|
// It also needs to know if the first character was a . to parse floating
|
|
// point correctly.
|
|
TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
|
|
|
|
// Consume the rest of a line.
|
|
void ConsumeLineComment(std::string* content);
|
|
// Consume until "*/".
|
|
void ConsumeBlockComment(std::string* content);
|
|
|
|
enum NextCommentStatus {
|
|
// Started a line comment.
|
|
LINE_COMMENT,
|
|
|
|
// Started a block comment.
|
|
BLOCK_COMMENT,
|
|
|
|
// Consumed a slash, then realized it wasn't a comment. current_ has
|
|
// been filled in with a slash token. The caller should return it.
|
|
SLASH_NOT_COMMENT,
|
|
|
|
// We do not appear to be starting a comment here.
|
|
NO_COMMENT
|
|
};
|
|
|
|
// If we're at the start of a new comment, consume it and return what kind
|
|
// of comment it is.
|
|
NextCommentStatus TryConsumeCommentStart();
|
|
|
|
// -----------------------------------------------------------------
|
|
// These helper methods make the parsing code more readable. The
|
|
// "character classes" referred to are defined at the top of the .cc file.
|
|
// Basically it is a C++ class with one method:
|
|
// static bool InClass(char c);
|
|
// The method returns true if c is a member of this "class", like "Letter"
|
|
// or "Digit".
|
|
|
|
// Returns true if the current character is of the given character
|
|
// class, but does not consume anything.
|
|
template <typename CharacterClass>
|
|
inline bool LookingAt();
|
|
|
|
// If the current character is in the given class, consume it and return
|
|
// true. Otherwise return false.
|
|
// e.g. TryConsumeOne<Letter>()
|
|
template <typename CharacterClass>
|
|
inline bool TryConsumeOne();
|
|
|
|
// Like above, but try to consume the specific character indicated.
|
|
inline bool TryConsume(char c);
|
|
|
|
// Consume zero or more of the given character class.
|
|
template <typename CharacterClass>
|
|
inline void ConsumeZeroOrMore();
|
|
|
|
// Consume one or more of the given character class or log the given
|
|
// error message.
|
|
// e.g. ConsumeOneOrMore<Digit>("Expected digits.");
|
|
template <typename CharacterClass>
|
|
inline void ConsumeOneOrMore(const char* error);
|
|
};
|
|
|
|
// inline methods ====================================================
|
|
inline const Tokenizer::Token& Tokenizer::current() { return current_; }
|
|
|
|
inline const Tokenizer::Token& Tokenizer::previous() { return previous_; }
|
|
|
|
inline void Tokenizer::ParseString(const std::string& text,
|
|
std::string* output) {
|
|
output->clear();
|
|
ParseStringAppend(text, output);
|
|
}
|
|
|
|
} // namespace io
|
|
} // namespace protobuf
|
|
} // namespace google
|
|
|
|
#include <google/protobuf/port_undef.inc>
|
|
|
|
#endif // GOOGLE_PROTOBUF_IO_TOKENIZER_H__
|