00001 #ifndef _XML_XMLUTF8Lexer_H_ // -*-c++-*-
00002
00003 #define _XML_XMLUTF8Lexer_H_
00004
00005
00006
00007
00008
00009
00010
00011 #ifdef __GNUG__
00012 # pragma interface
00013 #endif
00014
00015 #include <xml/utf8/Builder.h>
00016 #include <cstddef>
00017 #include <string>
00018
00019 namespace strmod {
00020 namespace xml {
00021 namespace utf8 {
00022
00023
00024
00025
00026 class Lexer
00027 {
00028 public:
00029
00030 Lexer()
00031 : nonwsok_(false)
00032 {
00033 }
00034
00035
00036 bool getNonWSInElements() const { return nonwsok_; }
00037
00038 void setNonWSInElements(bool nonwsok) { nonwsok_ = nonwsok; }
00039
00040
00041
00042
00043
00044
00045
00046
00047 bool lex(const char *buf, unsigned int len,
00048 Builder::BufHandle &lastbuf, Builder &builder);
00049
00050 private:
00051 typedef Builder::Position Position;
00052 typedef Builder::BufHandle BufHandle;
00053
00054 enum XState { XBad, XStart, XLess,
00055 XCommentExcl, XCommentExclDash, XInComment, XDashInComment,
00056 XDashDashInComment,
00057 XOpenElement, XInOpenElement, XEmptyElementEnd,
00058 XCloseElement, XInCloseElement,
00059 XAttr, XAttrAfterEq, XAttrSQ, XAttrDQ };
00060
00061 enum XSubState { XSNone, XSBad,
00062 XSStartName, XSInName,
00063 XSEntity, XSNamedEntity, XSCharEntity, XSDecEntity,
00064 XSHexEntityStart, XSHexEntity, XSEndEntity };
00065
00066 static const char exclamation = '\x21';
00067 static const char doublequote = '\x22';
00068 static const char poundsign = '\x23';
00069 static const char singlequote = '\x27';
00070 static const char ampersand = '\x26';
00071 static const char dash = '\x2D';
00072 static const char dot = '\x2E';
00073 static const char forwslash = '\x2F';
00074 static const char colon = '\x3A';
00075 static const char semicolon = '\x3B';
00076 static const char lessthan = '\x3C';
00077 static const char equals = '\x3D';
00078 static const char greaterthan = '\x3E';
00079 static const char char_x = '\x78';
00080 static const ::std::string out_of_range_message;
00081 static const ::std::string bad_case_message;
00082
00083 char name_[256];
00084 bool nonwsok_;
00085 struct LocalState {
00086 XState state_;
00087 XSubState substate_;
00088 size_t namepos_;
00089 bool used_elbegin_, used_attr_;
00090 Position elbegin_, attrbegin_, attrvalbegin_;
00091
00092 LocalState()
00093 : state_(XStart), substate_(XSNone), namepos_(0),
00094 used_elbegin_(false), used_attr_(false)
00095 {
00096 }
00097 } localstate_;
00098
00099 static inline bool iswhite(const char c);
00100 static inline bool isnamestart(const char c);
00101 static inline bool isnamebody(const char c);
00102 static inline bool isdigit(const char c);
00103 static inline bool isxdigit(const char c);
00104
00105 inline void advanceState(const char c, const size_t i, const BufHandle &bh,
00106 LocalState &ss, Builder &parser);
00107
00108 static void throw_out_of_range();
00109 static void throw_bad_case();
00110 void call_startElementTag(const Position &begin, size_t namepos,
00111 Builder &parser);
00112 void call_addAttribute(const Position &attrbegin, const Position &attrend,
00113 const Position &valbegin, const Position &valend,
00114 size_t namepos, Builder &parser);
00115 void call_closeElementTag(const Position &begin, const Position &end,
00116 size_t namepos, Builder &parser);
00117 };
00118
00119 }
00120 }
00121 }
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165 #endif