ECCE @ EIC Software
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
xmltok.cc
Go to the documentation of this file. Or view the newest version in sPHENIX GitHub for file xmltok.cc
1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2  See the file COPYING for copying permission.
3 */
4 
5 #if defined(__clang__) || defined(__GNUC__)
6 #pragma GCC diagnostic ignored "-Wunused-parameter"
7 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
8 #endif
9 
10 #include <stddef.h>
11 
12 #ifdef COMPILED_FROM_DSP
13 #include "winconfig.h"
14 #elif defined(MACOS_CLASSIC)
15 #include "macconfig.h"
16 #elif defined(__amigaos__)
17 #include "amigaconfig.h"
18 #elif defined(__WATCOMC__)
19 #include "watcomconfig.h"
20 #else
21 #ifdef HAVE_EXPAT_CONFIG_H
22 #include <expat_config.h>
23 #endif
24 #endif /* ndef COMPILED_FROM_DSP */
25 
26 #include "expat_external.h"
27 #include "internal.h"
28 #include "xmltok.h"
29 #include "nametab.h"
30 
31 #ifdef XML_DTD
32 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
33 #else
34 #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
35 #endif
36 
37 #define VTABLE1 \
38  { PREFIX(prologTok), PREFIX(contentTok), \
39  PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
40  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
41  PREFIX(sameName), \
42  PREFIX(nameMatchesAscii), \
43  PREFIX(nameLength), \
44  PREFIX(skipS), \
45  PREFIX(getAtts), \
46  PREFIX(charRefNumber), \
47  PREFIX(predefinedEntityName), \
48  PREFIX(updatePosition), \
49  PREFIX(isPublicId)
50 
51 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
52 
53 #define UCS2_GET_NAMING(pages, hi, lo) \
54  (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
55 
56 /* A 2 byte UTF-8 representation splits the character's 11 bits between
57  the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
58  pages, 3 bits to add to that index and 5 bits to generate the mask.
59 */
60 #define UTF8_GET_NAMING2(pages, byte) \
61  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
62  + ((((byte)[0]) & 3) << 1) \
63  + ((((byte)[1]) >> 5) & 1)] \
64  & (1 << (((byte)[1]) & 0x1F)))
65 
66 /* A 3 byte UTF-8 representation splits the character's 16 bits between
67  the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
68  into pages, 3 bits to add to that index and 5 bits to generate the
69  mask.
70 */
71 #define UTF8_GET_NAMING3(pages, byte) \
72  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
73  + ((((byte)[1]) >> 2) & 0xF)] \
74  << 3) \
75  + ((((byte)[1]) & 3) << 1) \
76  + ((((byte)[2]) >> 5) & 1)] \
77  & (1 << (((byte)[2]) & 0x1F)))
78 
79 #define UTF8_GET_NAMING(pages, p, n) \
80  ((n) == 2 \
81  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
82  : ((n) == 3 \
83  ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
84  : 0))
85 
86 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
87  of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
88  with the additional restriction of not allowing the Unicode
89  code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
90  Implementation details:
91  (A & 0x80) == 0 means A < 0x80
92  and
93  (A & 0xC0) == 0xC0 means A > 0xBF
94 */
95 
96 #define UTF8_INVALID2(p) \
97  ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
98 
99 #define UTF8_INVALID3(p) \
100  (((p)[2] & 0x80) == 0 \
101  || \
102  ((*p) == 0xEF && (p)[1] == 0xBF \
103  ? \
104  (p)[2] > 0xBD \
105  : \
106  ((p)[2] & 0xC0) == 0xC0) \
107  || \
108  ((*p) == 0xE0 \
109  ? \
110  (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
111  : \
112  ((p)[1] & 0x80) == 0 \
113  || \
114  ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
115 
116 #define UTF8_INVALID4(p) \
117  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
118  || \
119  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
120  || \
121  ((*p) == 0xF0 \
122  ? \
123  (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
124  : \
125  ((p)[1] & 0x80) == 0 \
126  || \
127  ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
128 
129 static int PTRFASTCALL
130 isNever(const ENCODING *enc, const char *p)
131 {
132  return 0;
133 }
134 
135 static int PTRFASTCALL
136 utf8_isName2(const ENCODING *enc, const char *p)
137 {
138  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
139 }
140 
141 static int PTRFASTCALL
142 utf8_isName3(const ENCODING *enc, const char *p)
143 {
144  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
145 }
146 
147 #define utf8_isName4 isNever
148 
149 static int PTRFASTCALL
150 utf8_isNmstrt2(const ENCODING *enc, const char *p)
151 {
152  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
153 }
154 
155 static int PTRFASTCALL
156 utf8_isNmstrt3(const ENCODING *enc, const char *p)
157 {
158  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
159 }
160 
161 #define utf8_isNmstrt4 isNever
162 
163 static int PTRFASTCALL
164 utf8_isInvalid2(const ENCODING *enc, const char *p)
165 {
166  return UTF8_INVALID2((const unsigned char *)p);
167 }
168 
169 static int PTRFASTCALL
170 utf8_isInvalid3(const ENCODING *enc, const char *p)
171 {
172  return UTF8_INVALID3((const unsigned char *)p);
173 }
174 
175 static int PTRFASTCALL
176 utf8_isInvalid4(const ENCODING *enc, const char *p)
177 {
178  return UTF8_INVALID4((const unsigned char *)p);
179 }
180 
183  unsigned char type[256];
184 #ifdef XML_MIN_SIZE
185  int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
186  int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
187  int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
188  int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
189  int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
190 #endif /* XML_MIN_SIZE */
191  int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
192  int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
193  int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
194  int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
195  int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
196  int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
197  int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
198  int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
199  int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
200 };
201 
202 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
203 
204 #ifdef XML_MIN_SIZE
205 
206 #define STANDARD_VTABLE(E) \
207  E ## byteType, \
208  E ## isNameMin, \
209  E ## isNmstrtMin, \
210  E ## byteToAscii, \
211  E ## charMatches,
212 
213 #else
214 
215 #define STANDARD_VTABLE(E) /* as nothing */
216 
217 #endif
218 
219 #define NORMAL_VTABLE(E) \
220  E ## isName2, \
221  E ## isName3, \
222  E ## isName4, \
223  E ## isNmstrt2, \
224  E ## isNmstrt3, \
225  E ## isNmstrt4, \
226  E ## isInvalid2, \
227  E ## isInvalid3, \
228  E ## isInvalid4
229 
230 static int FASTCALL checkCharRefNumber(int);
231 
232 #include "xmltok_impl.h"
233 #include "ascii.h"
234 
235 #ifdef XML_MIN_SIZE
236 #define sb_isNameMin isNever
237 #define sb_isNmstrtMin isNever
238 #endif
239 
240 #ifdef XML_MIN_SIZE
241 #define MINBPC(enc) ((enc)->minBytesPerChar)
242 #else
243 /* minimum bytes per character */
244 #define MINBPC(enc) 1
245 #endif
246 
247 #define SB_BYTE_TYPE(enc, p) \
248  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
249 
250 #ifdef XML_MIN_SIZE
251 static int PTRFASTCALL
252 sb_byteType(const ENCODING *enc, const char *p)
253 {
254  return SB_BYTE_TYPE(enc, p);
255 }
256 #define BYTE_TYPE(enc, p) \
257  (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
258 #else
259 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
260 #endif
261 
262 #ifdef XML_MIN_SIZE
263 #define BYTE_TO_ASCII(enc, p) \
264  (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
265 static int PTRFASTCALL
266 sb_byteToAscii(const ENCODING *enc, const char *p)
267 {
268  return *p;
269 }
270 #else
271 #define BYTE_TO_ASCII(enc, p) (*(p))
272 #endif
273 
274 #define IS_NAME_CHAR(enc, p, n) \
275  (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
276 #define IS_NMSTRT_CHAR(enc, p, n) \
277  (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
278 #define IS_INVALID_CHAR(enc, p, n) \
279  (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
280 
281 #ifdef XML_MIN_SIZE
282 #define IS_NAME_CHAR_MINBPC(enc, p) \
283  (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
284 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
285  (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
286 #else
287 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
288 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
289 #endif
290 
291 #ifdef XML_MIN_SIZE
292 #define CHAR_MATCHES(enc, p, c) \
293  (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
294 static int PTRCALL
295 sb_charMatches(const ENCODING *enc, const char *p, int c)
296 {
297  return *p == c;
298 }
299 #else
300 /* c is an ASCII character */
301 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
302 #endif
303 
304 #define PREFIX(ident) normal_ ## ident
305 #define XML_TOK_IMPL_C
306 #include "xmltok_impl.cc"
307 #undef XML_TOK_IMPL_C
308 
309 #undef MINBPC
310 #undef BYTE_TYPE
311 #undef BYTE_TO_ASCII
312 #undef CHAR_MATCHES
313 #undef IS_NAME_CHAR
314 #undef IS_NAME_CHAR_MINBPC
315 #undef IS_NMSTRT_CHAR
316 #undef IS_NMSTRT_CHAR_MINBPC
317 #undef IS_INVALID_CHAR
318 
/* UTF8_cvalN is the value of the masked first byte of an N byte
   UTF-8 sequence.
*/
enum {
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xC0,
  UTF8_cval3 = 0xE0,
  UTF8_cval4 = 0xF0
};
325 
326 static void PTRCALL
328  const char **fromP, const char *fromLim,
329  char **toP, const char *toLim)
330 {
331  char *to;
332  const char *from;
333  if (fromLim - *fromP > toLim - *toP) {
334  /* Avoid copying partial characters. */
335  for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
336  if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
337  break;
338  }
339  for (to = *toP, from = *fromP; from != fromLim; from++, to++)
340  *to = *from;
341  *fromP = from;
342  *toP = to;
343 }
344 
345 static void PTRCALL
347  const char **fromP, const char *fromLim,
348  unsigned short **toP, const unsigned short *toLim)
349 {
350  unsigned short *to = *toP;
351  const char *from = *fromP;
352  while (from != fromLim && to != toLim) {
353  switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
354  case BT_LEAD2:
355  *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
356  from += 2;
357  break;
358  case BT_LEAD3:
359  *to++ = (unsigned short)(((from[0] & 0xf) << 12)
360  | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
361  from += 3;
362  break;
363  case BT_LEAD4:
364  {
365  unsigned long n;
366  if (to + 1 == toLim)
367  goto after;
368  n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
369  | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
370  n -= 0x10000;
371  to[0] = (unsigned short)((n >> 10) | 0xD800);
372  to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
373  to += 2;
374  from += 4;
375  }
376  break;
377  default:
378  *to++ = *from++;
379  break;
380  }
381  }
382 after:
383  *fromP = from;
384  *toP = to;
385 }
386 
387 #ifdef XML_NS
388 static const struct normal_encoding utf8_encoding_ns = {
389  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
390  {
391 #include "asciitab.h"
392 #include "utf8tab.h"
393  },
394  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
395 };
396 #endif
397 
398 static const struct normal_encoding utf8_encoding = {
399  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
400  {
401 #define BT_COLON BT_NMSTRT
402 #include "asciitab.h"
403 #undef BT_COLON
404 #include "utf8tab.h"
405  },
406  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
407 };
408 
409 #ifdef XML_NS
410 
411 static const struct normal_encoding internal_utf8_encoding_ns = {
412  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
413  {
414 #include "iasciitab.h"
415 #include "utf8tab.h"
416  },
417  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
418 };
419 
420 #endif
421 
423  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
424  {
425 #define BT_COLON BT_NMSTRT
426 #include "iasciitab.h"
427 #undef BT_COLON
428 #include "utf8tab.h"
429  },
430  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
431 };
432 
433 static void PTRCALL
435  const char **fromP, const char *fromLim,
436  char **toP, const char *toLim)
437 {
438  for (;;) {
439  unsigned char c;
440  if (*fromP == fromLim)
441  break;
442  c = (unsigned char)**fromP;
443  if (c & 0x80) {
444  if (toLim - *toP < 2)
445  break;
446  *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
447  *(*toP)++ = (char)((c & 0x3f) | 0x80);
448  (*fromP)++;
449  }
450  else {
451  if (*toP == toLim)
452  break;
453  *(*toP)++ = *(*fromP)++;
454  }
455  }
456 }
457 
458 static void PTRCALL
460  const char **fromP, const char *fromLim,
461  unsigned short **toP, const unsigned short *toLim)
462 {
463  while (*fromP != fromLim && *toP != toLim)
464  *(*toP)++ = (unsigned char)*(*fromP)++;
465 }
466 
467 #ifdef XML_NS
468 
469 static const struct normal_encoding latin1_encoding_ns = {
470  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
471  {
472 #include "asciitab.h"
473 #include "latin1tab.h"
474  },
475  STANDARD_VTABLE(sb_)
476 };
477 
478 #endif
479 
480 static const struct normal_encoding latin1_encoding = {
481  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
482  {
483 #define BT_COLON BT_NMSTRT
484 #include "asciitab.h"
485 #undef BT_COLON
486 #include "latin1tab.h"
487  },
488  STANDARD_VTABLE(sb_)
489 };
490 
491 static void PTRCALL
493  const char **fromP, const char *fromLim,
494  char **toP, const char *toLim)
495 {
496  while (*fromP != fromLim && *toP != toLim)
497  *(*toP)++ = *(*fromP)++;
498 }
499 
500 #ifdef XML_NS
501 
502 static const struct normal_encoding ascii_encoding_ns = {
503  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
504  {
505 #include "asciitab.h"
506 /* BT_NONXML == 0 */
507  },
508  STANDARD_VTABLE(sb_)
509 };
510 
511 #endif
512 
513 static const struct normal_encoding ascii_encoding = {
514  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
515  {
516 #define BT_COLON BT_NMSTRT
517 #include "asciitab.h"
518 #undef BT_COLON
519 /* BT_NONXML == 0 */
520  },
521  STANDARD_VTABLE(sb_)
522 };
523 
524 static int PTRFASTCALL
525 unicode_byte_type(char hi, char lo)
526 {
527  switch ((unsigned char)hi) {
528  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
529  return BT_LEAD4;
530  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
531  return BT_TRAIL;
532  case 0xFF:
533  switch ((unsigned char)lo) {
534  case 0xFF:
535  case 0xFE:
536  return BT_NONXML;
537  }
538  break;
539  }
540  return BT_NONASCII;
541 }
542 
/* Expands to the definition of E##toUtf8, which converts 16-bit code
   units read through GET_LO/GET_HI into UTF-8.  Input is consumed two
   bytes at a time from *fromP up to fromLim, and output is written
   through *toP.  Conversion stops as soon as the next character would
   not fit before toLim; *fromP is then left at the first unconverted
   unit.  GET_LO and GET_HI must be defined where the macro is
   instantiated.
*/
#define DEFINE_UTF16_TO_UTF8(E) \
static void PTRCALL \
E ## toUtf8(const ENCODING *enc, \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *src; \
  for (src = *fromP; src != fromLim; src += 2) { \
    const unsigned char lo = GET_LO(src); \
    const unsigned char hi = GET_HI(src); \
    if (hi == 0 && lo < 0x80) { \
      /* below 0x80: copied as a single byte */ \
      if (*toP == toLim) \
        break; \
      *(*toP)++ = lo; \
    } \
    else if (hi < 0x8) { \
      /* 11 bits divided 5, 6 amongst 2 bytes */ \
      if (toLim - *toP < 2) \
        break; \
      *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
    } \
    else if (hi >= 0xD8 && hi <= 0xDB) { \
      /* first half of a surrogate pair: the following unit is */ \
      /* consumed as well and 4 bytes are written */ \
      int plane; \
      unsigned char lo2; \
      if (toLim - *toP < 4) \
        break; \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      src += 2; \
      lo2 = GET_LO(src); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(src) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
    } \
    else { \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      if (toLim - *toP < 3) \
        break; \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
    } \
  } \
  *fromP = src; \
}
605 
/* Expands to the definition of E##toUtf16, which copies 16-bit code
   units read through GET_HI/GET_LO into an unsigned short buffer.
   Both *fromP and *toP are advanced as units are copied; copying stops
   when either the input limit or the output limit is reached.
*/
#define DEFINE_UTF16_TO_UTF16(E) \
static void PTRCALL \
E ## toUtf16(const ENCODING *enc, \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate: when the input */ \
  /* holds more bytes than the output has room for and the last */ \
  /* input unit has a high byte in 0xD8..0xDF, drop that unit. */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  while (*fromP != fromLim && *toP != toLim) { \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
    *fromP += 2; \
  } \
}
619 
620 #define SET2(ptr, ch) \
621  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
622 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
623 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
624 
625 DEFINE_UTF16_TO_UTF8(little2_)
626 DEFINE_UTF16_TO_UTF16(little2_)
627 
628 #undef SET2
629 #undef GET_LO
630 #undef GET_HI
631 
632 #define SET2(ptr, ch) \
633  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
634 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
635 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
636 
639 
640 #undef SET2
641 #undef GET_LO
642 #undef GET_HI
643 
644 #define LITTLE2_BYTE_TYPE(enc, p) \
645  ((p)[1] == 0 \
646  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
647  : unicode_byte_type((p)[1], (p)[0]))
648 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
649 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
650 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
651  UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
652 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
653  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
654 
655 #ifdef XML_MIN_SIZE
656 
657 static int PTRFASTCALL
658 little2_byteType(const ENCODING *enc, const char *p)
659 {
660  return LITTLE2_BYTE_TYPE(enc, p);
661 }
662 
663 static int PTRFASTCALL
664 little2_byteToAscii(const ENCODING *enc, const char *p)
665 {
666  return LITTLE2_BYTE_TO_ASCII(enc, p);
667 }
668 
669 static int PTRCALL
670 little2_charMatches(const ENCODING *enc, const char *p, int c)
671 {
672  return LITTLE2_CHAR_MATCHES(enc, p, c);
673 }
674 
675 static int PTRFASTCALL
676 little2_isNameMin(const ENCODING *enc, const char *p)
677 {
678  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
679 }
680 
681 static int PTRFASTCALL
682 little2_isNmstrtMin(const ENCODING *enc, const char *p)
683 {
684  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
685 }
686 
687 #undef VTABLE
688 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
689 
690 #else /* not XML_MIN_SIZE */
691 
692 #undef PREFIX
693 #define PREFIX(ident) little2_ ## ident
694 #define MINBPC(enc) 2
695 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
696 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
697 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
698 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
699 #define IS_NAME_CHAR(enc, p, n) 0
700 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
701 #define IS_NMSTRT_CHAR(enc, p, n) (0)
702 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
703 
704 #define XML_TOK_IMPL_C
705 #include "xmltok_impl.cc"
706 #undef XML_TOK_IMPL_C
707 
708 #undef MINBPC
709 #undef BYTE_TYPE
710 #undef BYTE_TO_ASCII
711 #undef CHAR_MATCHES
712 #undef IS_NAME_CHAR
713 #undef IS_NAME_CHAR_MINBPC
714 #undef IS_NMSTRT_CHAR
715 #undef IS_NMSTRT_CHAR_MINBPC
716 #undef IS_INVALID_CHAR
717 
718 #endif /* not XML_MIN_SIZE */
719 
720 #ifdef XML_NS
721 
722 static const struct normal_encoding little2_encoding_ns = {
723  { VTABLE, 2, 0,
724 #if BYTEORDER == 1234
725  1
726 #else
727  0
728 #endif
729  },
730  {
731 #include "asciitab.h"
732 #include "latin1tab.h"
733  },
734  STANDARD_VTABLE(little2_)
735 };
736 
737 #endif
738 
739 static const struct normal_encoding little2_encoding = {
740  { VTABLE, 2, 0,
741 #if BYTEORDER == 1234
742  1
743 #else
744  0
745 #endif
746  },
747  {
748 #define BT_COLON BT_NMSTRT
749 #include "asciitab.h"
750 #undef BT_COLON
751 #include "latin1tab.h"
752  },
753  STANDARD_VTABLE(little2_)
754 };
755 
756 #if BYTEORDER != 4321
757 
758 #ifdef XML_NS
759 
760 static const struct normal_encoding internal_little2_encoding_ns = {
761  { VTABLE, 2, 0, 1 },
762  {
763 #include "iasciitab.h"
764 #include "latin1tab.h"
765  },
766  STANDARD_VTABLE(little2_)
767 };
768 
769 #endif
770 
772  { VTABLE, 2, 0, 1 },
773  {
774 #define BT_COLON BT_NMSTRT
775 #include "iasciitab.h"
776 #undef BT_COLON
777 #include "latin1tab.h"
778  },
779  STANDARD_VTABLE(little2_)
780 };
781 
782 #endif
783 
784 
785 #define BIG2_BYTE_TYPE(enc, p) \
786  ((p)[0] == 0 \
787  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
788  : unicode_byte_type((p)[0], (p)[1]))
789 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
790 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
791 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
792  UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
793 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
794  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
795 
796 #ifdef XML_MIN_SIZE
797 
798 static int PTRFASTCALL
799 big2_byteType(const ENCODING *enc, const char *p)
800 {
801  return BIG2_BYTE_TYPE(enc, p);
802 }
803 
804 static int PTRFASTCALL
805 big2_byteToAscii(const ENCODING *enc, const char *p)
806 {
807  return BIG2_BYTE_TO_ASCII(enc, p);
808 }
809 
810 static int PTRCALL
811 big2_charMatches(const ENCODING *enc, const char *p, int c)
812 {
813  return BIG2_CHAR_MATCHES(enc, p, c);
814 }
815 
816 static int PTRFASTCALL
817 big2_isNameMin(const ENCODING *enc, const char *p)
818 {
819  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
820 }
821 
822 static int PTRFASTCALL
823 big2_isNmstrtMin(const ENCODING *enc, const char *p)
824 {
825  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
826 }
827 
828 #undef VTABLE
829 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
830 
831 #else /* not XML_MIN_SIZE */
832 
833 #undef PREFIX
834 #define PREFIX(ident) big2_ ## ident
835 #define MINBPC(enc) 2
836 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
837 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
838 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
839 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
840 #define IS_NAME_CHAR(enc, p, n) 0
841 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
842 #define IS_NMSTRT_CHAR(enc, p, n) (0)
843 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
844 
845 #define XML_TOK_IMPL_C
846 #include "xmltok_impl.cc"
847 #undef XML_TOK_IMPL_C
848 
849 #undef MINBPC
850 #undef BYTE_TYPE
851 #undef BYTE_TO_ASCII
852 #undef CHAR_MATCHES
853 #undef IS_NAME_CHAR
854 #undef IS_NAME_CHAR_MINBPC
855 #undef IS_NMSTRT_CHAR
856 #undef IS_NMSTRT_CHAR_MINBPC
857 #undef IS_INVALID_CHAR
858 
859 #endif /* not XML_MIN_SIZE */
860 
861 #ifdef XML_NS
862 
863 static const struct normal_encoding big2_encoding_ns = {
864  { VTABLE, 2, 0,
865 #if BYTEORDER == 4321
866  1
867 #else
868  0
869 #endif
870  },
871  {
872 #include "asciitab.h"
873 #include "latin1tab.h"
874  },
875  STANDARD_VTABLE(big2_)
876 };
877 
878 #endif
879 
880 static const struct normal_encoding big2_encoding = {
881  { VTABLE, 2, 0,
882 #if BYTEORDER == 4321
883  1
884 #else
885  0
886 #endif
887  },
888  {
889 #define BT_COLON BT_NMSTRT
890 #include "asciitab.h"
891 #undef BT_COLON
892 #include "latin1tab.h"
893  },
894  STANDARD_VTABLE(big2_)
895 };
896 
897 #if BYTEORDER != 1234
898 
899 #ifdef XML_NS
900 
901 static const struct normal_encoding internal_big2_encoding_ns = {
902  { VTABLE, 2, 0, 1 },
903  {
904 #include "iasciitab.h"
905 #include "latin1tab.h"
906  },
907  STANDARD_VTABLE(big2_)
908 };
909 
910 #endif
911 
913  { VTABLE, 2, 0, 1 },
914  {
915 #define BT_COLON BT_NMSTRT
916 #include "iasciitab.h"
917 #undef BT_COLON
918 #include "latin1tab.h"
919  },
920  STANDARD_VTABLE(big2_)
921 };
922 
923 #endif
924 
925 #undef PREFIX
926 
927 static int FASTCALL
928 streqci(const char *s1, const char *s2)
929 {
930  for (;;) {
931  char c1 = *s1++;
932  char c2 = *s2++;
933  if (ASCII_a <= c1 && c1 <= ASCII_z)
934  c1 += ASCII_A - ASCII_a;
935  if (ASCII_a <= c2 && c2 <= ASCII_z)
936  c2 += ASCII_A - ASCII_a;
937  if (c1 != c2)
938  return 0;
939  if (!c1)
940  break;
941  }
942  return 1;
943 }
944 
945 static void PTRCALL
946 initUpdatePosition(const ENCODING *enc, const char *ptr,
947  const char *end, POSITION *pos)
948 {
949  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
950 }
951 
952 static int
953 toAscii(const ENCODING *enc, const char *ptr, const char *end)
954 {
955  char buf[1];
956  char *p = buf;
957  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
958  if (p == buf)
959  return -1;
960  else
961  return buf[0];
962 }
963 
/* Returns 1 when c is one of 0x20, 0xD, 0xA or 0x9, otherwise 0. */
static int FASTCALL
isSpace(int c)
{
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
976 
977 /* Return 1 if there's just optional white space or there's an S
978  followed by name=val.
979 */
980 static int
982  const char *ptr,
983  const char *end,
984  const char **namePtr,
985  const char **nameEndPtr,
986  const char **valPtr,
987  const char **nextTokPtr)
988 {
989  int c;
990  char open;
991  if (ptr == end) {
992  *namePtr = NULL;
993  return 1;
994  }
995  if (!isSpace(toAscii(enc, ptr, end))) {
996  *nextTokPtr = ptr;
997  return 0;
998  }
999  do {
1000  ptr += enc->minBytesPerChar;
1001  } while (isSpace(toAscii(enc, ptr, end)));
1002  if (ptr == end) {
1003  *namePtr = NULL;
1004  return 1;
1005  }
1006  *namePtr = ptr;
1007  for (;;) {
1008  c = toAscii(enc, ptr, end);
1009  if (c == -1) {
1010  *nextTokPtr = ptr;
1011  return 0;
1012  }
1013  if (c == ASCII_EQUALS) {
1014  *nameEndPtr = ptr;
1015  break;
1016  }
1017  if (isSpace(c)) {
1018  *nameEndPtr = ptr;
1019  do {
1020  ptr += enc->minBytesPerChar;
1021  } while (isSpace(c = toAscii(enc, ptr, end)));
1022  if (c != ASCII_EQUALS) {
1023  *nextTokPtr = ptr;
1024  return 0;
1025  }
1026  break;
1027  }
1028  ptr += enc->minBytesPerChar;
1029  }
1030  if (ptr == *namePtr) {
1031  *nextTokPtr = ptr;
1032  return 0;
1033  }
1034  ptr += enc->minBytesPerChar;
1035  c = toAscii(enc, ptr, end);
1036  while (isSpace(c)) {
1037  ptr += enc->minBytesPerChar;
1038  c = toAscii(enc, ptr, end);
1039  }
1040  if (c != ASCII_QUOT && c != ASCII_APOS) {
1041  *nextTokPtr = ptr;
1042  return 0;
1043  }
1044  open = (char)c;
1045  ptr += enc->minBytesPerChar;
1046  *valPtr = ptr;
1047  for (;; ptr += enc->minBytesPerChar) {
1048  c = toAscii(enc, ptr, end);
1049  if (c == open)
1050  break;
1051  if (!(ASCII_a <= c && c <= ASCII_z)
1052  && !(ASCII_A <= c && c <= ASCII_Z)
1053  && !(ASCII_0 <= c && c <= ASCII_9)
1054  && c != ASCII_PERIOD
1055  && c != ASCII_MINUS
1056  && c != ASCII_UNDERSCORE) {
1057  *nextTokPtr = ptr;
1058  return 0;
1059  }
1060  }
1061  *nextTokPtr = ptr + enc->minBytesPerChar;
1062  return 1;
1063 }
1064 
1065 static const char KW_version[] = {
1067 };
1068 
1069 static const char KW_encoding[] = {
1071 };
1072 
1073 static const char KW_standalone[] = {
1075  ASCII_n, ASCII_e, '\0'
1076 };
1077 
1078 static const char KW_yes[] = {
1079  ASCII_y, ASCII_e, ASCII_s, '\0'
1080 };
1081 
1082 static const char KW_no[] = {
1083  ASCII_n, ASCII_o, '\0'
1084 };
1085 
1086 static int
1087 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1088  const char *,
1089  const char *),
1090  int isGeneralTextEntity,
1091  const ENCODING *enc,
1092  const char *ptr,
1093  const char *end,
1094  const char **badPtr,
1095  const char **versionPtr,
1096  const char **versionEndPtr,
1097  const char **encodingName,
1098  const ENCODING **encoding,
1099  int *standalone)
1100 {
1101  const char *val = NULL;
1102  const char *name = NULL;
1103  const char *nameEnd = NULL;
1104  ptr += 5 * enc->minBytesPerChar;
1105  end -= 2 * enc->minBytesPerChar;
1106  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1107  || !name) {
1108  *badPtr = ptr;
1109  return 0;
1110  }
1111  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1112  if (!isGeneralTextEntity) {
1113  *badPtr = name;
1114  return 0;
1115  }
1116  }
1117  else {
1118  if (versionPtr)
1119  *versionPtr = val;
1120  if (versionEndPtr)
1121  *versionEndPtr = ptr;
1122  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1123  *badPtr = ptr;
1124  return 0;
1125  }
1126  if (!name) {
1127  if (isGeneralTextEntity) {
1128  /* a TextDecl must have an EncodingDecl */
1129  *badPtr = ptr;
1130  return 0;
1131  }
1132  return 1;
1133  }
1134  }
1135  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1136  int c = toAscii(enc, val, end);
1137  if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1138  *badPtr = val;
1139  return 0;
1140  }
1141  if (encodingName)
1142  *encodingName = val;
1143  if (encoding)
1144  *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1145  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1146  *badPtr = ptr;
1147  return 0;
1148  }
1149  if (!name)
1150  return 1;
1151  }
1152  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1153  || isGeneralTextEntity) {
1154  *badPtr = name;
1155  return 0;
1156  }
1157  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1158  if (standalone)
1159  *standalone = 1;
1160  }
1161  else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1162  if (standalone)
1163  *standalone = 0;
1164  }
1165  else {
1166  *badPtr = val;
1167  return 0;
1168  }
1169  while (isSpace(toAscii(enc, ptr, end)))
1170  ptr += enc->minBytesPerChar;
1171  if (ptr != end) {
1172  *badPtr = ptr;
1173  return 0;
1174  }
1175  return 1;
1176 }
1177 
1178 static int FASTCALL
1180 {
1181  switch (result >> 8) {
1182  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1183  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1184  return -1;
1185  case 0:
1186  if (latin1_encoding.type[result] == BT_NONXML)
1187  return -1;
1188  break;
1189  case 0xFF:
1190  if (result == 0xFFFE || result == 0xFFFF)
1191  return -1;
1192  break;
1193  }
1194  return result;
1195 }
1196 
1197 int FASTCALL
1198 XmlUtf8Encode(int c, char *buf)
1199 {
1200  enum {
1201  /* minN is minimum legal resulting value for N byte sequence */
1202  min2 = 0x80,
1203  min3 = 0x800,
1204  min4 = 0x10000
1205  };
1206 
1207  if (c < 0)
1208  return 0;
1209  if (c < min2) {
1210  buf[0] = (char)(c | UTF8_cval1);
1211  return 1;
1212  }
1213  if (c < min3) {
1214  buf[0] = (char)((c >> 6) | UTF8_cval2);
1215  buf[1] = (char)((c & 0x3f) | 0x80);
1216  return 2;
1217  }
1218  if (c < min4) {
1219  buf[0] = (char)((c >> 12) | UTF8_cval3);
1220  buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1221  buf[2] = (char)((c & 0x3f) | 0x80);
1222  return 3;
1223  }
1224  if (c < 0x110000) {
1225  buf[0] = (char)((c >> 18) | UTF8_cval4);
1226  buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1227  buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1228  buf[3] = (char)((c & 0x3f) | 0x80);
1229  return 4;
1230  }
1231  return 0;
1232 }
1233 
/* Encode charNum as UTF-16 into buf.
   Returns 1 when a single unit was written, 2 when a surrogate pair
   was written, and 0 (writing nothing) when charNum is negative or not
   below 0x110000.
*/
int FASTCALL
XmlUtf16Encode(int charNum, unsigned short *buf)
{
  if (charNum < 0 || charNum >= 0x110000)
    return 0;

  if (charNum < 0x10000) {
    buf[0] = (unsigned short)charNum;
    return 1;
  }

  {
    /* split the offset above 0x10000 into two 10 bit halves */
    const int offset = charNum - 0x10000;
    buf[0] = (unsigned short)((offset >> 10) + 0xD800);
    buf[1] = (unsigned short)((offset & 0x3FF) + 0xDC00);
  }
  return 2;
}
1251 
1254  CONVERTER convert;
1255  void *userData;
1256  unsigned short utf16[256];
1257  char utf8[256][4];
1258 };
1259 
1260 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
1261 
/* Returns the size in bytes of one struct unknown_encoding.
   NOTE(review): the declarator line between the return type and the
   opening brace is absent from this listing (the embedded numbering
   jumps from 1262 to 1264). Upstream Expat presumably declares this as
   XmlSizeOfUnknownEncoding(void) -- confirm against xmltok.h before
   restoring it. */
1262 int
1264 {
1265  return sizeof(struct unknown_encoding);
1266 }
1267 
1268 static int PTRFASTCALL
1269 unknown_isName(const ENCODING *enc, const char *p)
1270 {
1271  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1272  int c = uenc->convert(uenc->userData, p);
1273  if (c & ~0xFFFF)
1274  return 0;
1275  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1276 }
1277 
1278 static int PTRFASTCALL
1279 unknown_isNmstrt(const ENCODING *enc, const char *p)
1280 {
1281  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1282  int c = uenc->convert(uenc->userData, p);
1283  if (c & ~0xFFFF)
1284  return 0;
1285  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1286 }
1287 
1288 static int PTRFASTCALL
1289 unknown_isInvalid(const ENCODING *enc, const char *p)
1290 {
1291  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1292  int c = uenc->convert(uenc->userData, p);
1293  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1294 }
1295 
1296 static void PTRCALL
1298  const char **fromP, const char *fromLim,
1299  char **toP, const char *toLim)
1300 {
1301  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1302  char buf[XML_UTF8_ENCODE_MAX];
1303  for (;;) {
1304  const char *utf8;
1305  int n;
1306  if (*fromP == fromLim)
1307  break;
1308  utf8 = uenc->utf8[(unsigned char)**fromP];
1309  n = *utf8++;
1310  if (n == 0) {
1311  int c = uenc->convert(uenc->userData, *fromP);
1312  n = XmlUtf8Encode(c, buf);
1313  if (n > toLim - *toP)
1314  break;
1315  utf8 = buf;
1316  *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1317  - (BT_LEAD2 - 2));
1318  }
1319  else {
1320  if (n > toLim - *toP)
1321  break;
1322  (*fromP)++;
1323  }
1324  do {
1325  *(*toP)++ = *utf8++;
1326  } while (--n != 0);
1327  }
1328 }
1329 
1330 static void PTRCALL
1332  const char **fromP, const char *fromLim,
1333  unsigned short **toP, const unsigned short *toLim)
1334 {
1335  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1336  while (*fromP != fromLim && *toP != toLim) {
1337  unsigned short c = uenc->utf16[(unsigned char)**fromP];
1338  if (c == 0) {
1339  c = (unsigned short)
1340  uenc->convert(uenc->userData, *fromP);
1341  *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1342  - (BT_LEAD2 - 2));
1343  }
1344  else
1345  (*fromP)++;
1346  *(*toP)++ = c;
1347  }
1348 }
1349 
1350 ENCODING *
1352  int *table,
1353  CONVERTER convert,
1354  void *userData)
1355 {
1356  int i;
1357  struct unknown_encoding *e = (struct unknown_encoding *)mem;
1358  for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1359  ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1360  for (i = 0; i < 128; i++)
1361  if (latin1_encoding.type[i] != BT_OTHER
1362  && latin1_encoding.type[i] != BT_NONXML
1363  && table[i] != i)
1364  return 0;
1365  for (i = 0; i < 256; i++) {
1366  int c = table[i];
1367  if (c == -1) {
1368  e->normal.type[i] = BT_MALFORM;
1369  /* This shouldn't really get used. */
1370  e->utf16[i] = 0xFFFF;
1371  e->utf8[i][0] = 1;
1372  e->utf8[i][1] = 0;
1373  }
1374  else if (c < 0) {
1375  if (c < -4)
1376  return 0;
1377  e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1378  e->utf8[i][0] = 0;
1379  e->utf16[i] = 0;
1380  }
1381  else if (c < 0x80) {
1382  if (latin1_encoding.type[c] != BT_OTHER
1383  && latin1_encoding.type[c] != BT_NONXML
1384  && c != i)
1385  return 0;
1386  e->normal.type[i] = latin1_encoding.type[c];
1387  e->utf8[i][0] = 1;
1388  e->utf8[i][1] = (char)c;
1389  e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1390  }
1391  else if (checkCharRefNumber(c) < 0) {
1392  e->normal.type[i] = BT_NONXML;
1393  /* This shouldn't really get used. */
1394  e->utf16[i] = 0xFFFF;
1395  e->utf8[i][0] = 1;
1396  e->utf8[i][1] = 0;
1397  }
1398  else {
1399  if (c > 0xFFFF)
1400  return 0;
1401  if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1402  e->normal.type[i] = BT_NMSTRT;
1403  else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1404  e->normal.type[i] = BT_NAME;
1405  else
1406  e->normal.type[i] = BT_OTHER;
1407  e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1408  e->utf16[i] = (unsigned short)c;
1409  }
1410  }
1411  e->userData = userData;
1412  e->convert = convert;
1413  if (convert) {
1414  e->normal.isName2 = unknown_isName;
1415  e->normal.isName3 = unknown_isName;
1416  e->normal.isName4 = unknown_isName;
1417  e->normal.isNmstrt2 = unknown_isNmstrt;
1418  e->normal.isNmstrt3 = unknown_isNmstrt;
1419  e->normal.isNmstrt4 = unknown_isNmstrt;
1420  e->normal.isInvalid2 = unknown_isInvalid;
1421  e->normal.isInvalid3 = unknown_isInvalid;
1422  e->normal.isInvalid4 = unknown_isInvalid;
1423  }
1424  e->normal.enc.utf8Convert = unknown_toUtf8;
1425  e->normal.enc.utf16Convert = unknown_toUtf16;
1426  return &(e->normal.enc);
1427 }
1428 
/* Indices of the built-in encodings.
   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed.  The values from ISO_8859_1_ENC through
   UTF_16LE_ENC double as indices into getEncodingIndex's name table,
   so they must start at 0 and stay in the same order. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC
};
1442 
1443 static const char KW_ISO_8859_1[] = {
1445  ASCII_MINUS, ASCII_1, '\0'
1446 };
1447 static const char KW_US_ASCII[] = {
1449  '\0'
1450 };
1451 static const char KW_UTF_8[] = {
1453 };
1454 static const char KW_UTF_16[] = {
1456 };
1457 static const char KW_UTF_16BE[] = {
1459  '\0'
1460 };
1461 static const char KW_UTF_16LE[] = {
1463  '\0'
1464 };
1465 
1466 static int FASTCALL
1468 {
1469  static const char * const encodingNames[] = {
1470  KW_ISO_8859_1,
1471  KW_US_ASCII,
1472  KW_UTF_8,
1473  KW_UTF_16,
1474  KW_UTF_16BE,
1475  KW_UTF_16LE,
1476  };
1477  int i;
1478  if (name == NULL)
1479  return NO_ENC;
1480  for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1481  if (streqci(name, encodingNames[i]))
1482  return i;
1483  return UNKNOWN_ENC;
1484 }
1485 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX
   stores it.  The index argument is parenthesized so that a compound
   expression is converted as a whole rather than only its first
   operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1492 
1493 /* This is what detects the encoding. encodingTable maps from
1494  encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1495  the external (protocol) specified encoding; state is
1496  XML_CONTENT_STATE if we're parsing an external text entity, and
1497  XML_PROLOG_STATE otherwise.
1498 */
1499 
1500 
/* Choose the encoding for the entity starting at ptr and tokenize with it.

   encodingTable - encodings indexed by the encoding enumeration
   enc           - initial-encoding object; the chosen encoding is stored
                   through enc->encPtr
   state         - XML_PROLOG_STATE or XML_CONTENT_STATE
   ptr, end      - the bytes available so far
   nextTokPtr    - receives the position following the returned token

   Returns XML_TOK_NONE when no bytes are available, XML_TOK_PARTIAL when
   more bytes are needed before a decision can be made, XML_TOK_BOM when
   a byte order mark was recognized and skipped, and otherwise whatever
   XmlTok returns for the selected encoding.
*/
1501 static int
1502 initScan(const ENCODING * const *encodingTable,
1503  const INIT_ENCODING *enc,
1504  int state,
1505  const char *ptr,
1506  const char *end,
1507  const char **nextTokPtr)
1508 {
1509  const ENCODING **encPtr;
1510 
1511  if (ptr == end)
1512  return XML_TOK_NONE;
1513  encPtr = enc->encPtr;
1514  if (ptr + 1 == end) {
1515  /* only a single byte available for auto-detection */
1516 #ifndef XML_DTD /* FIXME */
1517  /* a well-formed document entity must have more than one byte */
1518  if (state != XML_CONTENT_STATE)
1519  return XML_TOK_PARTIAL;
1520 #endif
1521  /* so we're parsing an external text entity... */
1522  /* if UTF-16 was externally specified, then we need at least 2 bytes */
1523  switch (INIT_ENC_INDEX(enc)) {
1524  case UTF_16_ENC:
1525  case UTF_16LE_ENC:
1526  case UTF_16BE_ENC:
1527  return XML_TOK_PARTIAL;
1528  }
1529  switch ((unsigned char)*ptr) {
1530  case 0xFE:
1531  case 0xFF:
1532  case 0xEF: /* possibly first byte of UTF-8 BOM */
1533  if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1534  && state == XML_CONTENT_STATE)
1535  break;
1536  /* fall through */
1537  case 0x00:
1538  case 0x3C:
1539  return XML_TOK_PARTIAL;
1540  }
1541  }
1542  else {
 /* at least two bytes: dispatch on the first two taken as a big-endian pair */
1543  switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1544  case 0xFEFF:
 /* byte order mark as seen in big-endian UTF-16 */
1545  if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1546  && state == XML_CONTENT_STATE)
1547  break;
1548  *nextTokPtr = ptr + 2;
1549  *encPtr = encodingTable[UTF_16BE_ENC];
1550  return XML_TOK_BOM;
1551  /* 00 3C is handled in the default case */
1552  case 0x3C00:
 /* 3C 00: '<' as a little-endian UTF-16 unit */
1553  if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1554  || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1555  && state == XML_CONTENT_STATE)
1556  break;
1557  *encPtr = encodingTable[UTF_16LE_ENC];
1558  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1559  case 0xFFFE:
 /* byte order mark as seen in little-endian UTF-16 */
1560  if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1561  && state == XML_CONTENT_STATE)
1562  break;
1563  *nextTokPtr = ptr + 2;
1564  *encPtr = encodingTable[UTF_16LE_ENC];
1565  return XML_TOK_BOM;
1566  case 0xEFBB:
1567  /* Maybe a UTF-8 BOM (EF BB BF) */
1568  /* If there's an explicitly specified (external) encoding
1569  of ISO-8859-1 or some flavour of UTF-16
1570  and this is an external text entity,
1571  don't look for the BOM,
1572  because it might be legal data.
1573  */
1574  if (state == XML_CONTENT_STATE) {
1575  int e = INIT_ENC_INDEX(enc);
1576  if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
1577  || e == UTF_16LE_ENC || e == UTF_16_ENC)
1578  break;
1579  }
 /* the third BOM byte has not arrived yet */
1580  if (ptr + 2 == end)
1581  return XML_TOK_PARTIAL;
1582  if ((unsigned char)ptr[2] == 0xBF) {
1583  *nextTokPtr = ptr + 3;
1584  *encPtr = encodingTable[UTF_8_ENC];
1585  return XML_TOK_BOM;
1586  }
1587  break;
1588  default:
1589  if (ptr[0] == '\0') {
1590  /* 0 isn't a legal data character. Furthermore a document
1591  entity can only start with ASCII characters. So the only
1592  way this can fail to be big-endian UTF-16 is if it's an
1593  external parsed general entity that's labelled as
1594  UTF-16LE.
1595  */
1596  if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1597  break;
1598  *encPtr = encodingTable[UTF_16BE_ENC];
1599  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1600  }
1601  else if (ptr[1] == '\0') {
1602  /* We could recover here in the case:
1603  - parsing an external entity
1604  - second byte is 0
1605  - no externally specified encoding
1606  - no encoding declaration
1607  by assuming UTF-16LE. But we don't, because this would mean when
1608  presented just with a single byte, we couldn't reliably determine
1609  whether we needed further bytes.
1610  */
1611  if (state == XML_CONTENT_STATE)
1612  break;
1613  *encPtr = encodingTable[UTF_16LE_ENC];
1614  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1615  }
1616  break;
1617  }
1618  }
 /* nothing detected: use the encoding recorded at initialization */
1619  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1620  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1621 }
1622 
1623 
1624 #define NS(x) x
1625 #define ns(x) x
1626 #define XML_TOK_NS_C
1627 #include "xmltok_ns.cc"
1628 #undef XML_TOK_NS_C
1629 #undef NS
1630 #undef ns
1631 
1632 #ifdef XML_NS
1633 
1634 #define NS(x) x ## NS
1635 #define ns(x) x ## _ns
1636 
1637 #define XML_TOK_NS_C
1638 #include "xmltok_ns.cc"
1639 #undef XML_TOK_NS_C
1640 
1641 #undef NS
1642 #undef ns
1643 
1644 ENCODING *
1645 XmlInitUnknownEncodingNS(void *mem,
1646  int *table,
1647  CONVERTER convert,
1648  void *userData)
1649 {
1650  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1651  if (enc)
1652  ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1653  return enc;
1654 }
1655 
1656 #endif /* XML_NS */