Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members  

Lexer.h

00001 #ifndef _XML_XMLUTF8Lexer_H_  // -*-c++-*-
00002 
00003 #define _XML_XMLUTF8Lexer_H_
00004 
00005 /* $URL: svn://svn.omnifarious.org/home/hopper/src/svn-gp/trunk/C++/libNet/xml/xml/utf8/Lexer.h $
00006  * $Author: hopper $
00007  * $Date: 2003-01-12 21:49:40 -0600 (Sun, 12 Jan 2003) $
00008  * $Rev: 38 $
00009  */
00010 
00011 #ifdef __GNUG__
00012 #  pragma interface
00013 #endif
00014 
00015 #include <xml/utf8/Builder.h>
00016 #include <cstddef>
00017 #include <string>
00018 
00019 namespace strmod {
00020 namespace xml {
00021 namespace utf8 {
00022 
00023 /** \class Lexer Lexer.h xml/utf8/Lexer.h
00024  * Finds tokens and reports them, along with their positions, to a Builder.
00025  */
00026 class Lexer
00027 {
00028  public:
00029    //! Constructs a Lexer in the between-tags, not-in-a-comment state (XStart), with the non-whitespace flag set to false.
00030    Lexer()
00031         : nonwsok_(false)
00032    {
00033    }
00034 
00035    //! Does the lexer consider non-whitespace inside of elements (between tags) to be an error?  NOTE(review): this returns nonwsok_, whose name suggests true means "allowed" -- confirm the polarity in the implementation.
00036    bool getNonWSInElements() const                     { return nonwsok_; }
00037    //! Tell the lexer whether or not non-whitespace inside of elements (between tags) is an error.  The argument is stored in nonwsok_ unchanged; same polarity caveat as the getter -- TODO confirm.
00038    void setNonWSInElements(bool nonwsok)               { nonwsok_ = nonwsok; }
00039 
00040    /** Process the UTF-8 encoded characters in buf, calling the builder at the appropriate points.
00041     * @param buf A pointer to an array of characters to process.
00042     * @param len The number of char elements in the array pointed to by buf.
00043     * @param lastbuf IN: A buffer handle to identify which call of lex a particular token started in / OUT: The earliest buffer handle still being used internally by the Lexer.
00044     * @param builder The builder to call when tokens are encountered.
00045     * @return true if Lexer is storing no BufHandles, and lastbuf contains a valid BufHandle, false if it isn't.
00046     */
00047    bool lex(const char *buf, unsigned int len,
00048             Builder::BufHandle &lastbuf, Builder &builder);
00049 
00050  private:
00051    typedef Builder::Position Position;     // shorthand for the Builder's position type
00052    typedef Builder::BufHandle BufHandle;   // shorthand for the Builder's buffer handle type
00053    // Top-level lexer states.  A default-constructed LocalState starts in XStart.
00054    enum XState { XBad, XStart, XLess,
00055                  XCommentExcl, XCommentExclDash, XInComment, XDashInComment,
00056                  XDashDashInComment,
00057                  XOpenElement, XInOpenElement, XEmptyElementEnd,
00058                  XCloseElement, XInCloseElement,
00059                  XAttr, XAttrAfterEq, XAttrSQ, XAttrDQ };
00060    // Sub-states kept alongside XState.  A default-constructed LocalState starts in XSNone.
00061    enum XSubState { XSNone, XSBad,
00062                     XSStartName, XSInName,
00063                     XSEntity, XSNamedEntity, XSCharEntity, XSDecEntity,
00064                     XSHexEntityStart, XSHexEntity, XSEndEntity };
00065    // Punctuation constants written as hex escapes; the trailing comments give the ASCII character for each value.
00066    static const char exclamation = '\x21';   // '!'
00067    static const char doublequote = '\x22';   // '"'
00068    static const char poundsign = '\x23';     // '#'
00069    static const char singlequote = '\x27';   // '\''
00070    static const char ampersand = '\x26';     // '&'
00071    static const char dash = '\x2D';          // '-'
00072    static const char dot = '\x2E';           // '.'
00073    static const char forwslash = '\x2F';     // '/'
00074    static const char colon = '\x3A';         // ':'
00075    static const char semicolon = '\x3B';     // ';'
00076    static const char lessthan = '\x3C';      // '<'
00077    static const char equals = '\x3D';        // '='
00078    static const char greaterthan = '\x3E';   // '>'
00079    static const char char_x = '\x78';        // 'x'
00080    static const ::std::string out_of_range_message;   // declared only; defined outside this header (presumably used by throw_out_of_range -- verify)
00081    static const ::std::string bad_case_message;       // declared only; defined outside this header (presumably used by throw_bad_case -- verify)
00082 
00083    char name_[256];   // fixed-size name buffer; not initialized by the constructor (presumably indexed via namepos_ -- confirm in the implementation)
00084    bool nonwsok_;     // flag behind get/setNonWSInElements(); false after construction
00085    struct LocalState {   // lexer state bundled together; advanceState() receives one of these by reference
00086       XState state_;
00087       XSubState substate_;
00088       size_t namepos_;
00089       bool used_elbegin_, used_attr_;
00090       Position elbegin_, attrbegin_, attrvalbegin_;
00091       // Starts in XStart/XSNone with namepos_ 0 and both used_ flags false; the Position members are not in the initializer list, so they are default-initialized.
00092       LocalState()
00093            : state_(XStart), substate_(XSNone), namepos_(0),
00094              used_elbegin_(false), used_attr_(false)
00095       {
00096       }
00097    } localstate_;
00098    // Character classification helpers: declared static inline here, with no definitions in this header.
00099    static inline bool iswhite(const char c);
00100    static inline bool isnamestart(const char c);
00101    static inline bool isnamebody(const char c);
00102    static inline bool isdigit(const char c);
00103    static inline bool isxdigit(const char c);
00104    // NOTE(review): judging by the signature (a char, an index, a BufHandle, a LocalState and the Builder) this looks like the per-character state transition -- confirm in the implementation.
00105    inline void advanceState(const char c, const size_t i, const BufHandle &bh,
00106                             LocalState &ss, Builder &parser);
00107    // Helpers declared here and defined elsewhere; the call_* functions presumably forward to the like-named Builder callbacks -- verify against the implementation.
00108    static void throw_out_of_range();
00109    static void throw_bad_case();
00110    void call_startElementTag(const Position &begin, size_t namepos,
00111                              Builder &parser);
00112    void call_addAttribute(const Position &attrbegin, const Position &attrend,
00113                           const Position &valbegin, const Position &valend,
00114                           size_t namepos, Builder &parser);
00115    void call_closeElementTag(const Position &begin, const Position &end,
00116                              size_t namepos, Builder &parser);
00117 };
00118 
00119 } // namespace utf8
00120 } // namespace xml
00121 } // namespace strmod
00122 
00123 // $Log: XMLUTF8Lexer.h,v $
00124 // Revision 1.12  2003/01/10 02:28:37  hopper
00125 // Moved stuff over to XMLUTF8Lexer.cpp from XMLUTF8Lexer.h because it was
00126 // cluttering up the header file.  Added a function to throw an exception
00127 // when the main lexer routine encounters 'impossible' conditions.
00128 //
00129 // Revision 1.11  2003/01/09 22:48:32  hopper
00130 // Much farther along multiple buffer parsing.
00131 //
00132 // Revision 1.10  2003/01/09 03:43:52  hopper
00133 // Farther along the path to a decent XML parser.
00134 //
00135 // Revision 1.9  2003/01/08 18:00:07  hopper
00136 // More scrabblings along the path to a decent XML parser.
00137 //
00138 // Revision 1.8  2003/01/06 15:18:35  hopper
00139 // Towards further attempts to record positions in previously parsed buffers.
00140 //
00141 // Revision 1.7  2002/12/11 21:55:41  hopper
00142 // It parses attributes now.  There's even a decent test for it.  :-)
00143 //
00144 // Revision 1.6  2002/12/11 18:52:02  hopper
00145 // More steps towards parsing attributes.
00146 //
00147 // Revision 1.5  2002/12/11 13:42:36  hopper
00148 // Moving towards handling attributes, and multi-buffer parsing.
00149 //
00150 // Revision 1.4  2002/12/10 22:46:02  hopper
00151 // Renamed the XMLParserStrategy to the more appropriate XMLBuilder from
00152 // Design Patterns.
00153 //
00154 // Revision 1.3  2002/12/10 16:08:42  hopper
00155 // Preliminary changes to allow elements to have #PCDATA.
00156 //
00157 // Revision 1.2  2002/12/10 13:21:13  hopper
00158 // Moved Header line to better place.
00159 //
00160 // Revision 1.1  2002/12/10 13:19:16  hopper
00161 // Some XML parsing classes I'm playing with until I get something that's
00162 // suitable for public consumption.
00163 //
00164 
00165 #endif

Generated on Wed Jan 29 00:32:44 2003 for libNet by doxygen 1.3-rc1