1 /*
   2  * Summary: interface for an HTML 4.0 non-verifying parser
   3  * Description: this module implements an HTML 4.0 non-verifying parser
   4  *              with API compatible with the XML parser ones. It should
   5  *              be able to parse "real world" HTML, even if severely
   6  *              broken from a specification point of view.
   7  *
   8  * Copy: See Copyright for the status of this software.
   9  *
  10  * Author: Daniel Veillard
  11  */
  12 
  13 #ifndef __HTML_PARSER_H__
  14 #define __HTML_PARSER_H__
  15 #include <libxml/xmlversion.h>
  16 #include <libxml/parser.h>
  17 
  18 #ifdef LIBXML_HTML_ENABLED
  19 
  20 #ifdef __cplusplus
  21 extern "C" {
  22 #endif
  23 
  24 /*
  25  * Most of the back-end structures from XML and HTML are shared.
  26  */
  27 typedef xmlParserCtxt htmlParserCtxt;
  28 typedef xmlParserCtxtPtr htmlParserCtxtPtr;
  29 typedef xmlParserNodeInfo htmlParserNodeInfo;
  30 typedef xmlSAXHandler htmlSAXHandler;
  31 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
  32 typedef xmlParserInput htmlParserInput;
  33 typedef xmlParserInputPtr htmlParserInputPtr;
  34 typedef xmlDocPtr htmlDocPtr;
  35 typedef xmlNodePtr htmlNodePtr;
  36 
  37 /*
  38  * Internal description of an HTML element, representing HTML 4.01
  39  * and XHTML 1.0 (which share the same structure).
  40  */
  41 typedef struct _htmlElemDesc htmlElemDesc;
  42 typedef htmlElemDesc *htmlElemDescPtr;
  43 struct _htmlElemDesc {
  44     const char *name;   /* The tag name */
  45     char startTag;      /* Whether the start tag can be implied */
  46     char endTag;        /* Whether the end tag can be implied */
  47     char saveEndTag;    /* Whether the end tag should be saved */
  48     char empty;         /* Is this an empty element ? */
  49     char depr;          /* Is this a deprecated element ? */
  50     char dtd;           /* 1: only in Loose DTD, 2: only Frameset one */
  51     char isinline;      /* is this a block 0 or inline 1 element */
  52     const char *desc;   /* the description */
  53 
  54 /* NRK Jan.2003
  55  * New fields encapsulating HTML structure
  56  *
  57  * Bugs:
  58  *  This is a very limited representation.  It fails to tell us when
  59  *  an element *requires* subelements (we only have whether they're
  60  *  allowed or not), and it doesn't tell us where CDATA and PCDATA
  61  *  are allowed.  Some element relationships are not fully represented:
  62  *  these are flagged with the word MODIFIER
  63  */
  64     const char** subelts;       /* allowed sub-elements of this element */
  65     const char* defaultsubelt;  /* subelement for suggested auto-repair
  66                        if necessary or NULL */
  67     const char** attrs_opt;     /* Optional Attributes */
  68     const char** attrs_depr;        /* Additional deprecated attributes */
  69     const char** attrs_req;     /* Required attributes */
  70 };
  71 
  72 /*
  73  * Internal description of an HTML entity.
  74  */
  75 typedef struct _htmlEntityDesc htmlEntityDesc;
  76 typedef htmlEntityDesc *htmlEntityDescPtr;
  77 struct _htmlEntityDesc {
  78     unsigned int value; /* the UNICODE value for the character */
  79     const char *name;   /* The entity name */
  80     const char *desc;   /* the description */
  81 };
  82 
/*
 * There are only a few public functions.
 */
  86 XMLPUBFUN const htmlElemDesc * XMLCALL
  87             htmlTagLookup   (const xmlChar *tag);
  88 XMLPUBFUN const htmlEntityDesc * XMLCALL
  89             htmlEntityLookup(const xmlChar *name);
  90 XMLPUBFUN const htmlEntityDesc * XMLCALL
  91             htmlEntityValueLookup(unsigned int value);
  92 
  93 XMLPUBFUN int XMLCALL
  94             htmlIsAutoClosed(htmlDocPtr doc,
  95                      htmlNodePtr elem);
  96 XMLPUBFUN int XMLCALL
  97             htmlAutoCloseTag(htmlDocPtr doc,
  98                      const xmlChar *name,
  99                      htmlNodePtr elem);
 100 XMLPUBFUN const htmlEntityDesc * XMLCALL
 101             htmlParseEntityRef(htmlParserCtxtPtr ctxt,
 102                      const xmlChar **str);
 103 XMLPUBFUN int XMLCALL
 104             htmlParseCharRef(htmlParserCtxtPtr ctxt);
 105 XMLPUBFUN void XMLCALL
 106             htmlParseElement(htmlParserCtxtPtr ctxt);
 107 
 108 XMLPUBFUN htmlParserCtxtPtr XMLCALL
 109             htmlNewParserCtxt(void);
 110 
 111 XMLPUBFUN htmlParserCtxtPtr XMLCALL
 112             htmlCreateMemoryParserCtxt(const char *buffer,
 113                            int size);
 114 
 115 XMLPUBFUN int XMLCALL
 116             htmlParseDocument(htmlParserCtxtPtr ctxt);
 117 XMLPUBFUN htmlDocPtr XMLCALL
 118             htmlSAXParseDoc (xmlChar *cur,
 119                      const char *encoding,
 120                      htmlSAXHandlerPtr sax,
 121                      void *userData);
 122 XMLPUBFUN htmlDocPtr XMLCALL
 123             htmlParseDoc    (xmlChar *cur,
 124                      const char *encoding);
 125 XMLPUBFUN htmlDocPtr XMLCALL
 126             htmlSAXParseFile(const char *filename,
 127                      const char *encoding,
 128                      htmlSAXHandlerPtr sax,
 129                      void *userData);
 130 XMLPUBFUN htmlDocPtr XMLCALL
 131             htmlParseFile   (const char *filename,
 132                      const char *encoding);
 133 XMLPUBFUN int XMLCALL
 134             UTF8ToHtml  (unsigned char *out,
 135                      int *outlen,
 136                      const unsigned char *in,
 137                      int *inlen);
 138 XMLPUBFUN int XMLCALL
 139             htmlEncodeEntities(unsigned char *out,
 140                      int *outlen,
 141                      const unsigned char *in,
 142                      int *inlen, int quoteChar);
 143 XMLPUBFUN int XMLCALL
 144             htmlIsScriptAttribute(const xmlChar *name);
 145 XMLPUBFUN int XMLCALL
 146             htmlHandleOmittedElem(int val);
 147 
 148 #ifdef LIBXML_PUSH_ENABLED
 149 /**
 150  * Interfaces for the Push mode.
 151  */
 152 XMLPUBFUN htmlParserCtxtPtr XMLCALL
 153             htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
 154                          void *user_data,
 155                          const char *chunk,
 156                          int size,
 157                          const char *filename,
 158                          xmlCharEncoding enc);
 159 XMLPUBFUN int XMLCALL
 160             htmlParseChunk      (htmlParserCtxtPtr ctxt,
 161                          const char *chunk,
 162                          int size,
 163                          int terminate);
 164 #endif /* LIBXML_PUSH_ENABLED */
 165 
 166 XMLPUBFUN void XMLCALL
 167             htmlFreeParserCtxt  (htmlParserCtxtPtr ctxt);
 168 
 169 /*
 170  * New set of simpler/more flexible APIs
 171  */
/**
 * htmlParserOption:
 *
 * This is the set of HTML parser options that can be passed down
 * to the htmlReadDoc() and similar calls.
 */
 178 typedef enum {
 179     HTML_PARSE_RECOVER  = 1<<0, /* Relaxed parsing */
 180     HTML_PARSE_NOERROR  = 1<<5, /* suppress error reports */
 181     HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */
 182     HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */
 183     HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */
 184     HTML_PARSE_NONET    = 1<<11,/* Forbid network access */
 185     HTML_PARSE_COMPACT  = 1<<16 /* compact small text nodes */
 186 } htmlParserOption;
 187 
 188 XMLPUBFUN void XMLCALL
 189         htmlCtxtReset       (htmlParserCtxtPtr ctxt);
 190 XMLPUBFUN int XMLCALL
 191         htmlCtxtUseOptions  (htmlParserCtxtPtr ctxt,
 192                      int options);
 193 XMLPUBFUN htmlDocPtr XMLCALL
 194         htmlReadDoc     (const xmlChar *cur,
 195                      const char *URL,
 196                      const char *encoding,
 197                      int options);
 198 XMLPUBFUN htmlDocPtr XMLCALL
 199         htmlReadFile        (const char *URL,
 200                      const char *encoding,
 201                      int options);
 202 XMLPUBFUN htmlDocPtr XMLCALL
 203         htmlReadMemory      (const char *buffer,
 204                      int size,
 205                      const char *URL,
 206                      const char *encoding,
 207                      int options);
 208 XMLPUBFUN htmlDocPtr XMLCALL
 209         htmlReadFd      (int fd,
 210                      const char *URL,
 211                      const char *encoding,
 212                      int options);
 213 XMLPUBFUN htmlDocPtr XMLCALL
 214         htmlReadIO      (xmlInputReadCallback ioread,
 215                      xmlInputCloseCallback ioclose,
 216                      void *ioctx,
 217                      const char *URL,
 218                      const char *encoding,
 219                      int options);
 220 XMLPUBFUN htmlDocPtr XMLCALL
 221         htmlCtxtReadDoc     (xmlParserCtxtPtr ctxt,
 222                      const xmlChar *cur,
 223                      const char *URL,
 224                      const char *encoding,
 225                      int options);
 226 XMLPUBFUN htmlDocPtr XMLCALL
 227         htmlCtxtReadFile        (xmlParserCtxtPtr ctxt,
 228                      const char *filename,
 229                      const char *encoding,
 230                      int options);
 231 XMLPUBFUN htmlDocPtr XMLCALL
 232         htmlCtxtReadMemory      (xmlParserCtxtPtr ctxt,
 233                      const char *buffer,
 234                      int size,
 235                      const char *URL,
 236                      const char *encoding,
 237                      int options);
 238 XMLPUBFUN htmlDocPtr XMLCALL
 239         htmlCtxtReadFd      (xmlParserCtxtPtr ctxt,
 240                      int fd,
 241                      const char *URL,
 242                      const char *encoding,
 243                      int options);
 244 XMLPUBFUN htmlDocPtr XMLCALL
 245         htmlCtxtReadIO      (xmlParserCtxtPtr ctxt,
 246                      xmlInputReadCallback ioread,
 247                      xmlInputCloseCallback ioclose,
 248                      void *ioctx,
 249                      const char *URL,
 250                      const char *encoding,
 251                      int options);
 252 
 253 /* NRK/Jan2003: further knowledge of HTML structure
 254  */
 255 typedef enum {
 256   HTML_NA = 0 ,     /* something we don't check at all */
 257   HTML_INVALID = 0x1 ,
 258   HTML_DEPRECATED = 0x2 ,
 259   HTML_VALID = 0x4 ,
 260   HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */
 261 } htmlStatus ;
 262 
 263 /* Using htmlElemDesc rather than name here, to emphasise the fact
 264    that otherwise there's a lookup overhead
 265 */
 266 XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
 267 XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
 268 XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
 269 XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ;
 270 /**
 271  * htmlDefaultSubelement:
 272  * @elt: HTML element
 273  *
 274  * Returns the default subelement for this element
 275  */
 276 #define htmlDefaultSubelement(elt) elt->defaultsubelt
 277 /**
 278  * htmlElementAllowedHereDesc:
 279  * @parent: HTML parent element
 280  * @elt: HTML element
 281  *
 282  * Checks whether an HTML element description may be a
 283  * direct child of the specified element.
 284  *
 285  * Returns 1 if allowed; 0 otherwise.
 286  */
 287 #define htmlElementAllowedHereDesc(parent,elt) \
 288     htmlElementAllowedHere((parent), (elt)->name)
 289 /**
 290  * htmlRequiredAttrs:
 291  * @elt: HTML element
 292  *
 293  * Returns the attributes required for the specified element.
 294  */
 295 #define htmlRequiredAttrs(elt) (elt)->attrs_req
 296 
 297 
 298 #ifdef __cplusplus
 299 }
 300 #endif
 301 
 302 #endif /* LIBXML_HTML_ENABLED */
 303 #endif /* __HTML_PARSER_H__ */