OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libjava/] [classpath/] [gnu/] [xml/] [aelfred2/] [XmlParser.java] - Blame information for rev 769

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 769 jeremybenn
/* XmlParser.java --
2
   Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
3
 
4
This file is part of GNU Classpath.
5
 
6
GNU Classpath is free software; you can redistribute it and/or modify
7
it under the terms of the GNU General Public License as published by
8
the Free Software Foundation; either version 2, or (at your option)
9
any later version.
10
 
11
GNU Classpath is distributed in the hope that it will be useful, but
12
WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
General Public License for more details.
15
 
16
You should have received a copy of the GNU General Public License
17
along with GNU Classpath; see the file COPYING.  If not, write to the
18
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
02110-1301 USA.
20
 
21
Linking this library statically or dynamically with other modules is
22
making a combined work based on this library.  Thus, the terms and
23
conditions of the GNU General Public License cover the whole
24
combination.
25
 
26
As a special exception, the copyright holders of this library give you
27
permission to link this library with independent modules to produce an
28
executable, regardless of the license terms of these independent
29
modules, and to copy and distribute the resulting executable under
30
terms of your choice, provided that you also meet, for each linked
31
independent module, the terms and conditions of the license of that
32
module.  An independent module is a module which is not derived from
33
or based on this library.  If you modify this library, you may extend
34
this exception to your version of the library, but you are not
35
obligated to do so.  If you do not wish to do so, delete this
36
exception statement from your version.
37
 
38
Partly derived from code which carried the following notice:
39
 
40
  Copyright (c) 1997, 1998 by Microstar Software Ltd.
41
 
42
  AElfred is free for both commercial and non-commercial use and
43
  redistribution, provided that Microstar's copyright and disclaimer are
44
  retained intact.  You are free to modify AElfred for your own use and
45
  to redistribute AElfred with your modifications, provided that the
46
  modifications are clearly documented.
47
 
48
  This program is distributed in the hope that it will be useful, but
49
  WITHOUT ANY WARRANTY; without even the implied warranty of
50
  merchantability or fitness for a particular purpose.  Please use it AT
51
  YOUR OWN RISK.
52
*/
53
 
54
package gnu.xml.aelfred2;
55
 
56
import gnu.java.security.action.GetPropertyAction;
57
 
58
import java.io.BufferedInputStream;
59
import java.io.CharConversionException;
60
import java.io.EOFException;
61
import java.io.InputStream;
62
import java.io.InputStreamReader;
63
import java.io.IOException;
64
import java.io.Reader;
65
import java.io.UnsupportedEncodingException;
66
import java.net.URL;
67
import java.net.URLConnection;
68
import java.security.AccessController;
69
 
70
import java.util.Iterator;
71
import java.util.HashMap;
72
import java.util.LinkedList;
73
 
74
import org.xml.sax.InputSource;
75
import org.xml.sax.SAXException;
76
 
77
 
78
/**
79
 * Parse XML documents and return parse events through call-backs.
80
 * Use the <code>SAXDriver</code> class as your entry point, as all
81
 * internal parser interfaces are subject to change.
82
 *
83
 * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
84
 *      (version 1.2a with bugfixes)
85
 * @author Updated by David Brownell &lt;dbrownell@users.sourceforge.net&gt;
86
 * @see SAXDriver
87
 */
88
final class XmlParser
89
{
90
 
91
  // avoid slow per-character readCh()
92
  private final static boolean USE_CHEATS = true;
93
 
94
  ////////////////////////////////////////////////////////////////////////
95
  // Constants.
96
  ////////////////////////////////////////////////////////////////////////
97
 
98
  //
99
  // Constants for element content type.
100
  //
101
 
102
  /**
103
   * Constant: an element has not been declared.
104
   * @see #getElementContentType
105
   */
106
  public final static int CONTENT_UNDECLARED = 0;
107
 
108
  /**
109
   * Constant: the element has a content model of ANY.
110
   * @see #getElementContentType
111
   */
112
  public final static int CONTENT_ANY = 1;
113
 
114
  /**
115
   * Constant: the element has declared content of EMPTY.
116
   * @see #getElementContentType
117
   */
118
  public final static int CONTENT_EMPTY = 2;
119
 
120
  /**
121
   * Constant: the element has mixed content.
122
   * @see #getElementContentType
123
   */
124
  public final static int CONTENT_MIXED = 3;
125
 
126
  /**
127
   * Constant: the element has element content.
128
   * @see #getElementContentType
129
   */
130
  public final static int CONTENT_ELEMENTS = 4;
131
 
132
 
133
  //
134
  // Constants for the entity type.
135
  //
136
 
137
  /**
138
   * Constant: the entity has not been declared.
139
   * @see #getEntityType
140
   */
141
  public final static int ENTITY_UNDECLARED = 0;
142
 
143
  /**
144
   * Constant: the entity is internal.
145
   * @see #getEntityType
146
   */
147
  public final static int ENTITY_INTERNAL = 1;
148
 
149
  /**
150
   * Constant: the entity is external, non-parsable data.
151
   * @see #getEntityType
152
   */
153
  public final static int ENTITY_NDATA = 2;
154
 
155
  /**
156
   * Constant: the entity is external XML data.
157
   * @see #getEntityType
158
   */
159
  public final static int ENTITY_TEXT = 3;
160
 
161
  //
162
  // Attribute type constants are interned literal strings.
163
  //
164
 
165
  //
166
  // Constants for supported encodings.  "external" is just a flag.
167
  //
168
  private final static int ENCODING_EXTERNAL = 0;
169
  private final static int ENCODING_UTF_8 = 1;
170
  private final static int ENCODING_ISO_8859_1 = 2;
171
  private final static int ENCODING_UCS_2_12 = 3;
172
  private final static int ENCODING_UCS_2_21 = 4;
173
  private final static int ENCODING_UCS_4_1234 = 5;
174
  private final static int ENCODING_UCS_4_4321 = 6;
175
  private final static int ENCODING_UCS_4_2143 = 7;
176
  private final static int ENCODING_UCS_4_3412 = 8;
177
  private final static int ENCODING_ASCII = 9;
178
 
179
  //
180
  // Constants for attribute default value.
181
  //
182
 
183
  /**
184
   * Constant: the attribute is not declared.
185
   * @see #getAttributeDefaultValueType
186
   */
187
  public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
188
 
189
  /**
190
   * Constant: the attribute has a literal default value specified.
191
   * @see #getAttributeDefaultValueType
192
   * @see #getAttributeDefaultValue
193
   */
194
  public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
195
 
196
  /**
197
   * Constant: the attribute was declared #IMPLIED.
198
   * @see #getAttributeDefaultValueType
199
   */
200
  public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
201
 
202
  /**
203
   * Constant: the attribute was declared #REQUIRED.
204
   * @see #getAttributeDefaultValueType
205
   */
206
  public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
207
 
208
  /**
209
   * Constant: the attribute was declared #FIXED.
210
   * @see #getAttributeDefaultValueType
211
   * @see #getAttributeDefaultValue
212
   */
213
  public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
214
 
215
  //
216
  // Constants for input.
217
  //
218
  private final static int INPUT_NONE = 0;
219
  private final static int INPUT_INTERNAL = 1;
220
  private final static int INPUT_STREAM = 3;
221
  private final static int INPUT_READER = 5;
222
 
223
  //
224
  // Flags for reading literals.
225
  //
226
  // expand general entity refs (attribute values in dtd and content)
227
  private final static int LIT_ENTITY_REF = 2;
228
  // normalize this value (space chars) (attributes, public ids)
229
  private final static int LIT_NORMALIZE = 4;
230
  // literal is an attribute value
231
  private final static int LIT_ATTRIBUTE = 8;
232
  // don't expand parameter entities
233
  private final static int LIT_DISABLE_PE = 16;
234
  // don't expand [or parse] character refs
235
  private final static int LIT_DISABLE_CREF = 32;
236
  // don't parse general entity refs
237
  private final static int LIT_DISABLE_EREF = 64;
238
  // literal is a public ID value
239
  private final static int LIT_PUBID = 256;
240
 
241
  //
242
  // Flags affecting PE handling in DTDs (if expandPE is true).
243
  // PEs expand with space padding, except inside literals.
244
  //
245
  private final static int CONTEXT_NORMAL = 0;
246
  private final static int CONTEXT_LITERAL = 1;
247
 
248
  // Emit warnings for relative URIs with no base URI.
249
  static boolean uriWarnings;
250
  static
251
  {
252
    String key = "gnu.xml.aelfred2.XmlParser.uriWarnings";
253
    GetPropertyAction a = new GetPropertyAction(key);
254
    uriWarnings = "true".equals(AccessController.doPrivileged(a));
255
  }
256
 
257
  //
258
  // The current XML handler interface.
259
  //
260
  private SAXDriver handler;
261
 
262
  //
263
  // I/O information.
264
  //
265
  private Reader reader;   // current reader
266
  private InputStream is;     // current input stream
267
  private int line;     // current line number
268
  private int column;   // current column number
269
  private int sourceType;   // type of input source
270
  private LinkedList inputStack;   // stack of input soruces
271
  private URLConnection externalEntity; // current external entity
272
  private int encoding;   // current character encoding
273
  private int currentByteCount; // bytes read from current source
274
  private InputSource scratch;  // temporary
275
 
276
  //
277
  // Buffers for decoded but unparsed character input.
278
  //
279
  private char[] readBuffer;
280
  private int readBufferPos;
281
  private int readBufferLength;
282
  private int readBufferOverflow;  // overflow from last data chunk.
283
 
284
  //
285
  // Buffer for undecoded raw byte input.
286
  //
287
  private final static int READ_BUFFER_MAX = 16384;
288
  private byte[] rawReadBuffer;
289
 
290
 
291
  //
292
  // Buffer for attribute values, char refs, DTD stuff.
293
  //
294
  private static int DATA_BUFFER_INITIAL = 4096;
295
  private char[] dataBuffer;
296
  private int dataBufferPos;
297
 
298
  //
299
  // Buffer for parsed names.
300
  //
301
  private static int NAME_BUFFER_INITIAL = 1024;
302
  private char[] nameBuffer;
303
  private int nameBufferPos;
304
 
305
  //
306
  // Save any standalone flag
307
  //
308
  private boolean docIsStandalone;
309
 
310
  //
311
  // Hashtables for DTD information on elements, entities, and notations.
312
  // Populated until we start ignoring decls (because of skipping a PE)
313
  //
314
  private HashMap elementInfo;
315
  private HashMap entityInfo;
316
  private HashMap notationInfo;
317
  private boolean skippedPE;
318
 
319
  //
320
  // Element type currently in force.
321
  //
322
  private String currentElement;
323
  private int currentElementContent;
324
 
325
  //
326
  // Stack of entity names, to detect recursion.
327
  //
328
  private LinkedList entityStack;
329
 
330
  //
331
  // PE expansion is enabled in most chunks of the DTD, not all.
332
  // When it's enabled, literals are treated differently.
333
  //
334
  private boolean inLiteral;
335
  private boolean expandPE;
336
  private boolean peIsError;
337
 
338
  //
339
  // can't report entity expansion inside two constructs:
340
  // - attribute expansions (internal entities only)
341
  // - markup declarations (parameter entities only)
342
  //
343
  private boolean doReport;
344
 
345
  //
346
  // Symbol table, for caching interned names.
347
  //
348
  // These show up wherever XML names or nmtokens are used:  naming elements,
349
  // attributes, PIs, notations, entities, and enumerated attribute values.
350
  //
351
  // NOTE:  This hashtable doesn't grow.  The default size is intended to be
352
  // rather large for most documents.  Example:  one snapshot of the DocBook
353
  // XML 4.1 DTD used only about 350 such names.  As a rule, only pathological
354
  // documents (ones that don't reuse names) should ever see much collision.
355
  //
356
  // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing.
357
  // "2039" keeps the hash table size at about two memory pages on typical
358
  // 32 bit hardware.
359
  //
360
  private final static int SYMBOL_TABLE_LENGTH = 2039;
361
 
362
  private Object[][] symbolTable;
363
 
364
  //
365
  // Hash table of attributes found in current start tag.
366
  //
367
  private String[] tagAttributes;
368
  private int tagAttributePos;
369
 
370
  //
371
  // Utility flag: have we noticed a CR while reading the last
372
  // data chunk?  If so, we will have to go back and normalise
373
  // CR or CR/LF line ends.
374
  //
375
  private boolean sawCR;
376
 
377
  //
378
  // Utility flag: are we in CDATA?  If so, whitespace isn't ignorable.
379
  //
380
  private boolean inCDATA;
381
 
382
  //
383
  // Xml version.
384
  //
385
  private static final int XML_10 = 0;
386
  private static final int XML_11 = 1;
387
  private int xmlVersion = XML_10;
388
 
389
  //////////////////////////////////////////////////////////////////////
390
  // Constructors.
391
  ////////////////////////////////////////////////////////////////////////
392
 
393
  /**
   * Create a parser that has no handler attached yet.  A handler must
   * be supplied with <code>setHandler</code> before <code>doParse</code>
   * is called.
   * @see #setHandler
   * @see #doParse
   */
  // package private
  XmlParser()
  {
    // Nothing to do here: doParse() calls initializeVariables() itself.
  }
402
 
403
  /**
404
   * Set the handler that will receive parsing events.
405
   * @param handler The handler to receive callback events.
406
   * @see #parse
407
   */
408
  // package private
409
  void setHandler(SAXDriver handler)
410
  {
411
    this.handler = handler;
412
  }
413
 
414
  /**
415
   * Parse an XML document from the character stream, byte stream, or URI
416
   * that you provide (in that order of preference).  Any URI that you
417
   * supply will become the base URI for resolving relative URI, and may
418
   * be used to acquire a reader or byte stream.
419
   *
420
   * <p> Only one thread at a time may use this parser; since it is
421
   * private to this package, post-parse cleanup is done by the caller,
422
   * which MUST NOT REUSE the parser (just null it).
423
   *
424
   * @param systemId Absolute URI of the document; should never be null,
425
   *    but may be so iff a reader <em>or</em> a stream is provided.
426
   * @param publicId The public identifier of the document, or null.
427
   * @param reader A character stream; must be null if stream isn't.
428
   * @param stream A byte input stream; must be null if reader isn't.
429
   * @param encoding The suggested encoding, or null if unknown.
430
   * @exception java.lang.Exception Basically SAXException or IOException
431
   */
432
  // package private
433
  void doParse(String systemId, String publicId, Reader reader,
434
               InputStream stream, String encoding)
435
    throws Exception
436
  {
437
    if (handler == null)
438
      {
439
        throw new IllegalStateException("no callback handler");
440
      }
441
 
442
    initializeVariables();
443
 
444
    // predeclare the built-in entities here (replacement texts)
445
    // we don't need to intern(), since we're guaranteed literals
446
    // are always (globally) interned.
447
    setInternalEntity("amp", "&#38;");
448
    setInternalEntity("lt", "&#60;");
449
    setInternalEntity("gt", "&#62;");
450
    setInternalEntity("apos", "&#39;");
451
    setInternalEntity("quot", "&#34;");
452
 
453
    try
454
      {
455
        // pushURL first to ensure locator is correct in startDocument
456
        // ... it might report an IO or encoding exception.
457
        handler.startDocument();
458
        pushURL(false, "[document]",
459
                // default baseURI: null
460
                new ExternalIdentifiers(publicId, systemId, null),
461
                reader, stream, encoding, false);
462
 
463
        parseDocument();
464
      }
465
    catch (EOFException e)
466
      {
467
        //empty input
468
        error("empty document, with no root element.");
469
      }
470
    finally
471
      {
472
        if (reader != null)
473
          {
474
            try
475
              {
476
                reader.close();
477
              }
478
            catch (IOException e)
479
              {
480
                /* ignore */
481
              }
482
          }
483
        if (stream != null)
484
          {
485
            try
486
              {
487
                stream.close();
488
              }
489
            catch (IOException e)
490
              {
491
                /* ignore */
492
              }
493
          }
494
        if (is != null)
495
          {
496
            try
497
              {
498
                is.close();
499
              }
500
            catch (IOException e)
501
              {
502
                /* ignore */
503
              }
504
          }
505
        scratch = null;
506
      }
507
  }
508
 
509
  //////////////////////////////////////////////////////////////////////
510
  // Error reporting.
511
  //////////////////////////////////////////////////////////////////////
512
 
513
  /**
514
   * Report an error.
515
   * @param message The error message.
516
   * @param textFound The text that caused the error (or null).
517
   * @see SAXDriver#error
518
   * @see #line
519
   */
520
  private void error(String message, String textFound, String textExpected)
521
    throws SAXException
522
  {
523
    if (textFound != null)
524
      {
525
        message = message + " (found \"" + textFound + "\")";
526
      }
527
    if (textExpected != null)
528
      {
529
        message = message + " (expected \"" + textExpected + "\")";
530
      }
531
    handler.fatal(message);
532
 
533
    // "can't happen"
534
    throw new SAXException(message);
535
  }
536
 
537
  /**
538
   * Report a serious error.
539
   * @param message The error message.
540
   * @param textFound The text that caused the error (or null).
541
   */
542
  private void error(String message, char textFound, String textExpected)
543
    throws SAXException
544
  {
545
    error(message, Character.toString(textFound), textExpected);
546
  }
547
 
548
  /**
549
   * Report typical case fatal errors.
550
   */
551
  private void error(String message)
552
    throws SAXException
553
  {
554
    handler.fatal(message);
555
  }
556
 
557
  //////////////////////////////////////////////////////////////////////
558
  // Major syntactic productions.
559
  //////////////////////////////////////////////////////////////////////
560
 
561
  /**
562
   * Parse an XML document.
563
   * <pre>
564
   * [1] document ::= prolog element Misc*
565
   * </pre>
566
   * <p>This is the top-level parsing function for a single XML
567
   * document.  As a minimum, a well-formed document must have
568
   * a document element, and a valid document must have a prolog
569
   * (one with doctype) as well.
570
   */
571
  private void parseDocument()
572
    throws Exception
573
  {
574
    try
575
      {                                       // added by MHK
576
        boolean sawDTD = parseProlog();
577
        require('<');
578
        parseElement(!sawDTD);
579
      }
580
    catch (EOFException ee)
581
      {                 // added by MHK
582
        error("premature end of file", "[EOF]", null);
583
      }
584
 
585
    try
586
      {
587
        parseMisc();   //skip all white, PIs, and comments
588
        char c = readCh();    //if this doesn't throw an exception...
589
        error("unexpected characters after document end", c, null);
590
      }
591
    catch (EOFException e)
592
      {
593
        return;
594
      }
595
  }
596
 
597
  static final char[] startDelimComment = { '<', '!', '-', '-' };
598
  static final char[] endDelimComment = { '-', '-' };
599
 
600
  /**
601
   * Skip a comment.
602
   * <pre>
603
   * [15] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
604
   * </pre>
605
   * <p> (The <code>&lt;!--</code> has already been read.)
606
   */
607
  private void parseComment()
608
    throws Exception
609
  {
610
    char c;
611
    boolean saved = expandPE;
612
 
613
    expandPE = false;
614
    parseUntil(endDelimComment);
615
    require('>');
616
    expandPE = saved;
617
    handler.comment(dataBuffer, 0, dataBufferPos);
618
    dataBufferPos = 0;
619
  }
620
 
621
  static final char[] startDelimPI = { '<', '?' };
622
  static final char[] endDelimPI = { '?', '>' };
623
 
624
  /**
625
   * Parse a processing instruction and do a call-back.
626
   * <pre>
627
   * [16] PI ::= '&lt;?' PITarget
628
   *    (S (Char* - (Char* '?&gt;' Char*)))?
629
   *    '?&gt;'
630
   * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
631
   * </pre>
632
   * <p> (The <code>&lt;?</code> has already been read.)
633
   */
634
  private void parsePI()
635
    throws SAXException, IOException
636
  {
637
    String name;
638
    boolean saved = expandPE;
639
 
640
    expandPE = false;
641
    name = readNmtoken(true);
642
    //NE08
643
    if (name.indexOf(':') >= 0)
644
      {
645
        error("Illegal character(':') in processing instruction name ",
646
              name, null);
647
      }
648
    if ("xml".equalsIgnoreCase(name))
649
      {
650
        error("Illegal processing instruction target", name, null);
651
      }
652
    if (!tryRead(endDelimPI))
653
      {
654
        requireWhitespace();
655
        parseUntil(endDelimPI);
656
      }
657
    expandPE = saved;
658
    handler.processingInstruction(name, dataBufferToString());
659
  }
660
 
661
  static final char[] endDelimCDATA = { ']', ']', '>' };
662
 
663
  private boolean isDirtyCurrentElement;
664
 
665
  /**
666
   * Parse a CDATA section.
667
   * <pre>
668
   * [18] CDSect ::= CDStart CData CDEnd
669
   * [19] CDStart ::= '&lt;![CDATA['
670
   * [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
671
   * [21] CDEnd ::= ']]&gt;'
672
   * </pre>
673
   * <p> (The '&lt;![CDATA[' has already been read.)
674
   */
675
  private void parseCDSect()
676
    throws Exception
677
  {
678
    parseUntil(endDelimCDATA);
679
    dataBufferFlush();
680
  }
681
 
682
  /**
683
   * Parse the prolog of an XML document.
684
   * <pre>
685
   * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
686
   * </pre>
687
   * <p>We do not look for the XML declaration here, because it was
688
   * handled by pushURL ().
689
   * @see pushURL
690
   * @return true if a DTD was read.
691
   */
692
  private boolean parseProlog()
693
    throws Exception
694
  {
695
    parseMisc();
696
 
697
    if (tryRead("<!DOCTYPE"))
698
      {
699
        parseDoctypedecl();
700
        parseMisc();
701
        return true;
702
      }
703
    return false;
704
  }
705
 
706
  private void checkLegalVersion(String version)
707
    throws SAXException
708
  {
709
    int len = version.length();
710
    for (int i = 0; i < len; i++)
711
      {
712
        char c = version.charAt(i);
713
        if ('0' <= c && c <= '9')
714
          {
715
            continue;
716
          }
717
        if (c == '_' || c == '.' || c == ':' || c == '-')
718
          {
719
            continue;
720
          }
721
        if ('a' <= c && c <= 'z')
722
          {
723
            continue;
724
          }
725
        if ('A' <= c && c <= 'Z')
726
          {
727
            continue;
728
          }
729
        error ("illegal character in version", version, "1.0");
730
      }
731
  }
732
 
733
  /**
734
   * Parse the XML declaration.
735
   * <pre>
736
   * [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
737
   * [24] VersionInfo ::= S 'version' Eq
738
   *    ("'" VersionNum "'" | '"' VersionNum '"' )
739
   * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
740
   * [32] SDDecl ::= S 'standalone' Eq
741
   *    ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' )
742
   * [80] EncodingDecl ::= S 'encoding' Eq
743
   *    ( "'" EncName "'" | "'" EncName "'" )
744
   * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
745
   * </pre>
746
   * <p> (The <code>&lt;?xml</code> and whitespace have already been read.)
747
   * @return the encoding in the declaration, uppercased; or null
748
   * @see #parseTextDecl
749
   * @see #setupDecoding
750
   */
751
  private String parseXMLDecl(boolean ignoreEncoding)
752
    throws SAXException, IOException
753
  {
754
    String version;
755
    String encodingName = null;
756
    String standalone = null;
757
    int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
758
    String inputEncoding = null;
759
 
760
    switch (this.encoding)
761
      {
762
      case ENCODING_EXTERNAL:
763
      case ENCODING_UTF_8:
764
        inputEncoding = "UTF-8";
765
        break;
766
      case ENCODING_ISO_8859_1:
767
        inputEncoding = "ISO-8859-1";
768
        break;
769
      case ENCODING_UCS_2_12:
770
        inputEncoding = "UTF-16BE";
771
        break;
772
      case ENCODING_UCS_2_21:
773
        inputEncoding = "UTF-16LE";
774
        break;
775
      }
776
 
777
    // Read the version.
778
    require("version");
779
    parseEq();
780
    checkLegalVersion(version = readLiteral(flags));
781
    if (!version.equals("1.0"))
782
      {
783
        if (version.equals("1.1"))
784
          {
785
            handler.warn("expected XML version 1.0, not: " + version);
786
            xmlVersion = XML_11;
787
          }
788
        else
789
          {
790
            error("illegal XML version", version, "1.0 or 1.1");
791
          }
792
      }
793
    else
794
      {
795
        xmlVersion = XML_10;
796
      }
797
    // Try reading an encoding declaration.
798
    boolean white = tryWhitespace();
799
 
800
    if (tryRead("encoding"))
801
      {
802
        if (!white)
803
          {
804
            error("whitespace required before 'encoding='");
805
          }
806
        parseEq();
807
        encodingName = readLiteral(flags);
808
        if (!ignoreEncoding)
809
          {
810
            setupDecoding(encodingName);
811
          }
812
      }
813
 
814
    // Try reading a standalone declaration
815
    if (encodingName != null)
816
      {
817
        white = tryWhitespace();
818
      }
819
    if (tryRead("standalone"))
820
      {
821
        if (!white)
822
          {
823
            error("whitespace required before 'standalone='");
824
          }
825
        parseEq();
826
        standalone = readLiteral(flags);
827
        if ("yes".equals(standalone))
828
          {
829
            docIsStandalone = true;
830
          }
831
        else if (!"no".equals(standalone))
832
          {
833
            error("standalone flag must be 'yes' or 'no'");
834
          }
835
      }
836
 
837
    skipWhitespace();
838
    require("?>");
839
 
840
    if (inputEncoding == null)
841
      {
842
        inputEncoding = encodingName;
843
      }
844
    return encodingName;
845
  }
846
 
847
  /**
848
   * Parse a text declaration.
849
   * <pre>
850
   * [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'
851
   * [80] EncodingDecl ::= S 'encoding' Eq
852
   *    ( '"' EncName '"' | "'" EncName "'" )
853
   * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
854
   * </pre>
855
   * <p> (The <code>&lt;?xml</code>' and whitespace have already been read.)
856
   * @return the encoding in the declaration, uppercased; or null
857
   * @see #parseXMLDecl
858
   * @see #setupDecoding
859
   */
860
  private String parseTextDecl(boolean ignoreEncoding)
861
    throws SAXException, IOException
862
  {
863
    String encodingName = null;
864
    int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
865
 
866
    // Read an optional version.
867
    if (tryRead ("version"))
868
      {
869
        String version;
870
        parseEq();
871
        checkLegalVersion(version = readLiteral(flags));
872
 
873
        if (version.equals("1.1"))
874
          {
875
            if (xmlVersion == XML_10)
876
              {
877
                error("external subset has later version number.", "1.0",
878
                      version);
879
              }
880
            handler.warn("expected XML version 1.0, not: " + version);
881
            xmlVersion = XML_11;
882
          }
883
        else if (!version.equals("1.0"))
884
          {
885
            error("illegal XML version", version, "1.0 or 1.1");
886
          }
887
        requireWhitespace();
888
      }
889
 
890
    // Read the encoding.
891
    require("encoding");
892
    parseEq();
893
    encodingName = readLiteral(flags);
894
    if (!ignoreEncoding)
895
      {
896
        setupDecoding(encodingName);
897
      }
898
    skipWhitespace();
899
    require("?>");
900
 
901
    return encodingName;
902
  }
903
 
904
  /**
905
   * Sets up internal state so that we can decode an entity using the
906
   * specified encoding.  This is used when we start to read an entity
907
   * and we have been given knowledge of its encoding before we start to
908
   * read any data (e.g. from a SAX input source or from a MIME type).
909
   *
910
   * <p> It is also used after autodetection, at which point only very
911
   * limited adjustments to the encoding may be used (switching between
912
   * related builtin decoders).
913
   *
914
   * @param encodingName The name of the encoding specified by the user.
915
   * @exception IOException if the encoding isn't supported either
916
   *  internally to this parser, or by the hosting JVM.
917
   * @see #parseXMLDecl
918
   * @see #parseTextDecl
919
     */
920
  private void setupDecoding(String encodingName)
921
    throws SAXException, IOException
922
  {
923
    encodingName = encodingName.toUpperCase();
924
 
925
    // ENCODING_EXTERNAL indicates an encoding that wasn't
926
    // autodetected ... we can use builtin decoders, or
927
    // ones from the JVM (InputStreamReader).
928
 
929
    // Otherwise we can only tweak what was autodetected, and
930
    // only for single byte (ASCII derived) builtin encodings.
931
 
932
    // ASCII-derived encodings
933
    if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL)
934
      {
935
        if (encodingName.equals("ISO-8859-1")
936
            || encodingName.equals("8859_1")
937
            || encodingName.equals("ISO8859_1"))
938
          {
939
            encoding = ENCODING_ISO_8859_1;
940
            return;
941
          }
942
        else if (encodingName.equals("US-ASCII")
943
                 || encodingName.equals("ASCII"))
944
          {
945
            encoding = ENCODING_ASCII;
946
            return;
947
          }
948
        else if (encodingName.equals("UTF-8")
949
                 || encodingName.equals("UTF8"))
950
          {
951
            encoding = ENCODING_UTF_8;
952
            return;
953
          }
954
        else if (encoding != ENCODING_EXTERNAL)
955
          {
956
            // used to start with a new reader ...
957
            throw new UnsupportedEncodingException(encodingName);
958
          }
959
        // else fallthrough ...
960
        // it's ASCII-ish and something other than a builtin
961
      }
962
 
963
    // Unicode and such
964
    if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21)
965
      {
966
        if (!(encodingName.equals("ISO-10646-UCS-2")
967
              || encodingName.equals("UTF-16")
968
              || encodingName.equals("UTF-16BE")
969
              || encodingName.equals("UTF-16LE")))
970
          {
971
            error("unsupported Unicode encoding", encodingName, "UTF-16");
972
          }
973
        return;
974
      }
975
 
976
    // four byte encodings
977
    if (encoding == ENCODING_UCS_4_1234
978
        || encoding == ENCODING_UCS_4_4321
979
        || encoding == ENCODING_UCS_4_2143
980
        || encoding == ENCODING_UCS_4_3412)
981
      {
982
        // Strictly:  "UCS-4" == "UTF-32BE"; also, "UTF-32LE" exists
983
        if (!encodingName.equals("ISO-10646-UCS-4"))
984
          {
985
            error("unsupported 32-bit encoding", encodingName,
986
                  "ISO-10646-UCS-4");
987
          }
988
        return;
989
      }
990
 
991
    // assert encoding == ENCODING_EXTERNAL
992
    // if (encoding != ENCODING_EXTERNAL)
993
    //     throw new RuntimeException ("encoding = " + encoding);
994
 
995
    if (encodingName.equals("UTF-16BE"))
996
      {
997
        encoding = ENCODING_UCS_2_12;
998
        return;
999
      }
1000
    if (encodingName.equals("UTF-16LE"))
1001
      {
1002
        encoding = ENCODING_UCS_2_21;
1003
        return;
1004
      }
1005
 
1006
    // We couldn't use the builtin decoders at all.  But we can try to
1007
    // create a reader, since we haven't messed up buffering.  Tweak
1008
    // the encoding name if necessary.
1009
 
1010
    if (encodingName.equals("UTF-16")
1011
        || encodingName.equals("ISO-10646-UCS-2"))
1012
      {
1013
        encodingName = "Unicode";
1014
      }
1015
    // Ignoring all the EBCDIC aliases here
1016
 
1017
    reader = new InputStreamReader(is, encodingName);
1018
    sourceType = INPUT_READER;
1019
  }
1020
 
1021
  /**
1022
   * Parse miscellaneous markup outside the document element and DOCTYPE
1023
   * declaration.
1024
   * <pre>
1025
   * [27] Misc ::= Comment | PI | S
1026
   * </pre>
1027
   */
1028
  private void parseMisc()
1029
    throws Exception
1030
  {
1031
    while (true)
1032
      {
1033
        skipWhitespace();
1034
        if (tryRead(startDelimPI))
1035
          {
1036
            parsePI();
1037
          }
1038
        else if (tryRead(startDelimComment))
1039
          {
1040
            parseComment();
1041
          }
1042
        else
1043
          {
1044
            return;
1045
          }
1046
      }
1047
  }
1048
 
1049
  /**
1050
   * Parse a document type declaration.
1051
   * <pre>
1052
   * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
1053
   *    ('[' (markupdecl | PEReference | S)* ']' S?)? '&gt;'
1054
   * </pre>
1055
   * <p> (The <code>&lt;!DOCTYPE</code> has already been read.)
1056
   */
1057
  private void parseDoctypedecl()
1058
    throws Exception
1059
  {
1060
    String rootName;
1061
    ExternalIdentifiers ids;
1062
 
1063
    // Read the document type name.
1064
    requireWhitespace();
1065
    rootName = readNmtoken(true);
1066
 
1067
    // Read the External subset's IDs
1068
    skipWhitespace();
1069
    ids = readExternalIds(false, true);
1070
 
1071
    // report (a) declaration of name, (b) lexical info (ids)
1072
    handler.doctypeDecl(rootName, ids.publicId, ids.systemId);
1073
 
1074
    // Internal subset is parsed first, if present
1075
    skipWhitespace();
1076
    if (tryRead('['))
1077
      {
1078
 
1079
        // loop until the subset ends
1080
        while (true)
1081
          {
1082
            doReport = expandPE = true;
1083
            skipWhitespace();
1084
            doReport = expandPE = false;
1085
            if (tryRead(']'))
1086
              {
1087
                break;     // end of subset
1088
              }
1089
            else
1090
              {
1091
                // WFC, PEs in internal subset (only between decls)
1092
                peIsError = expandPE = true;
1093
                parseMarkupdecl();
1094
                peIsError = expandPE = false;
1095
              }
1096
          }
1097
      }
1098
    skipWhitespace();
1099
    require('>');
1100
 
1101
    // Read the external subset, if any
1102
    InputSource subset;
1103
 
1104
    if (ids.systemId == null)
1105
      {
1106
        subset = handler.getExternalSubset(rootName,
1107
                                           handler.getSystemId());
1108
      }
1109
    else
1110
      {
1111
        subset = null;
1112
      }
1113
    if (ids.systemId != null || subset != null)
1114
      {
1115
        pushString(null, ">");
1116
 
1117
        // NOTE:  [dtd] is so we say what SAX2 expects,
1118
        // though it's misleading (subset, not entire dtd)
1119
        if (ids.systemId != null)
1120
          {
1121
            pushURL(true, "[dtd]", ids, null, null, null, true);
1122
          }
1123
        else
1124
          {
1125
            handler.warn("modifying document by adding external subset");
1126
            pushURL(true, "[dtd]",
1127
                    new ExternalIdentifiers(subset.getPublicId(),
1128
                                            subset.getSystemId(),
1129
                                            null),
1130
                    subset.getCharacterStream(),
1131
                    subset.getByteStream(),
1132
                    subset.getEncoding(),
1133
                    false);
1134
          }
1135
 
1136
        // Loop until we end up back at '>'
1137
        while (true)
1138
          {
1139
            doReport = expandPE = true;
1140
            skipWhitespace();
1141
            doReport = expandPE = false;
1142
            if (tryRead('>'))
1143
              {
1144
                break;
1145
              }
1146
            else
1147
              {
1148
                expandPE = true;
1149
                parseMarkupdecl();
1150
                expandPE = false;
1151
              }
1152
          }
1153
 
1154
        // the ">" string isn't popped yet
1155
        if (inputStack.size() != 1)
1156
          {
1157
            error("external subset has unmatched '>'");
1158
          }
1159
      }
1160
 
1161
    // done dtd
1162
    handler.endDoctype();
1163
    expandPE = false;
1164
    doReport = true;
1165
  }
1166
 
1167
  /**
1168
   * Parse a markup declaration in the internal or external DTD subset.
1169
   * <pre>
1170
   * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
1171
   *    | NotationDecl | PI | Comment
1172
   * [30] extSubsetDecl ::= (markupdecl | conditionalSect
1173
   *    | PEReference | S) *
1174
   * </pre>
1175
   * <p> Reading toplevel PE references is handled as a lexical issue
1176
   * by the caller, as is whitespace.
1177
   */
1178
  private void parseMarkupdecl()
1179
    throws Exception
1180
  {
1181
    char[] saved = null;
1182
    boolean savedPE = expandPE;
1183
 
1184
    // prevent "<%foo;" and ensures saved entity is right
1185
    require('<');
1186
    unread('<');
1187
    expandPE = false;
1188
 
1189
    if (tryRead("<!ELEMENT"))
1190
      {
1191
        saved = readBuffer;
1192
        expandPE = savedPE;
1193
        parseElementDecl();
1194
      }
1195
    else if (tryRead("<!ATTLIST"))
1196
      {
1197
        saved = readBuffer;
1198
        expandPE = savedPE;
1199
        parseAttlistDecl();
1200
      }
1201
    else if (tryRead("<!ENTITY"))
1202
      {
1203
        saved = readBuffer;
1204
        expandPE = savedPE;
1205
        parseEntityDecl();
1206
      }
1207
    else if (tryRead("<!NOTATION"))
1208
      {
1209
        saved = readBuffer;
1210
        expandPE = savedPE;
1211
        parseNotationDecl();
1212
      }
1213
    else if (tryRead(startDelimPI))
1214
      {
1215
        saved = readBuffer;
1216
        expandPE = savedPE;
1217
        parsePI();
1218
      }
1219
    else if (tryRead(startDelimComment))
1220
      {
1221
        saved = readBuffer;
1222
        expandPE = savedPE;
1223
        parseComment();
1224
      }
1225
    else if (tryRead("<!["))
1226
      {
1227
        saved = readBuffer;
1228
        expandPE = savedPE;
1229
        if (inputStack.size() > 0)
1230
          {
1231
            parseConditionalSect(saved);
1232
          }
1233
        else
1234
          {
1235
            error("conditional sections illegal in internal subset");
1236
          }
1237
      }
1238
    else
1239
      {
1240
        error("expected markup declaration");
1241
      }
1242
 
1243
    // VC: Proper Decl/PE Nesting
1244
    if (readBuffer != saved)
1245
      {
1246
        handler.verror("Illegal Declaration/PE nesting");
1247
      }
1248
  }
1249
 
1250
  /**
1251
   * Parse an element, with its tags.
1252
   * <pre>
1253
   * [39] element ::= EmptyElementTag | STag content ETag
1254
   * [40] STag ::= '&lt;' Name (S Attribute)* S? '&gt;'
1255
   * [44] EmptyElementTag ::= '&lt;' Name (S Attribute)* S? '/&gt;'
1256
   * </pre>
1257
   * <p> (The '&lt;' has already been read.)
1258
   * <p>NOTE: this method actually chains onto parseContent (), if necessary,
1259
   * and parseContent () will take care of calling parseETag ().
1260
   */
1261
  private void parseElement(boolean maybeGetSubset)
1262
    throws Exception
1263
  {
1264
    String gi;
1265
    char c;
1266
    int oldElementContent = currentElementContent;
1267
    String oldElement = currentElement;
1268
    ElementDecl element;
1269
 
1270
    // This is the (global) counter for the
1271
    // array of specified attributes.
1272
    tagAttributePos = 0;
1273
 
1274
    // Read the element type name.
1275
    gi = readNmtoken(true);
1276
 
1277
    // If we saw no DTD, and this is the document root element,
1278
    // let the application modify the input stream by providing one.
1279
    if (maybeGetSubset)
1280
      {
1281
        InputSource subset = handler.getExternalSubset(gi,
1282
                                                       handler.getSystemId());
1283
        if (subset != null)
1284
          {
1285
            String publicId = subset.getPublicId();
1286
            String systemId = subset.getSystemId();
1287
 
1288
            handler.warn("modifying document by adding DTD");
1289
            handler.doctypeDecl(gi, publicId, systemId);
1290
            pushString(null, ">");
1291
 
1292
            // NOTE:  [dtd] is so we say what SAX2 expects,
1293
            // though it's misleading (subset, not entire dtd)
1294
            pushURL(true, "[dtd]",
1295
                    new ExternalIdentifiers(publicId, systemId, null),
1296
                    subset.getCharacterStream(),
1297
                    subset.getByteStream(),
1298
                    subset.getEncoding(),
1299
                    false);
1300
 
1301
            // Loop until we end up back at '>'
1302
            while (true)
1303
              {
1304
                doReport = expandPE = true;
1305
                skipWhitespace();
1306
                doReport = expandPE = false;
1307
                if (tryRead('>'))
1308
                  {
1309
                    break;
1310
                  }
1311
                else
1312
                  {
1313
                    expandPE = true;
1314
                    parseMarkupdecl();
1315
                    expandPE = false;
1316
                  }
1317
              }
1318
 
1319
            // the ">" string isn't popped yet
1320
            if (inputStack.size() != 1)
1321
              {
1322
                error("external subset has unmatched '>'");
1323
              }
1324
 
1325
            handler.endDoctype();
1326
          }
1327
      }
1328
 
1329
    // Determine the current content type.
1330
    currentElement = gi;
1331
    element = (ElementDecl) elementInfo.get(gi);
1332
    currentElementContent = getContentType(element, CONTENT_ANY);
1333
 
1334
    // Read the attributes, if any.
1335
    // After this loop, "c" is the closing delimiter.
1336
    boolean white = tryWhitespace();
1337
    c = readCh();
1338
    while (c != '/' && c != '>')
1339
      {
1340
        unread(c);
1341
        if (!white)
1342
          {
1343
            error("need whitespace between attributes");
1344
          }
1345
        parseAttribute(gi);
1346
        white = tryWhitespace();
1347
        c = readCh();
1348
      }
1349
 
1350
    // Supply any defaulted attributes.
1351
    Iterator atts = declaredAttributes(element);
1352
    if (atts != null)
1353
      {
1354
        String aname;
1355
loop:
1356
        while (atts.hasNext())
1357
          {
1358
            aname = (String) atts.next();
1359
            // See if it was specified.
1360
            for (int i = 0; i < tagAttributePos; i++)
1361
              {
1362
                if (tagAttributes[i] == aname)
1363
                  {
1364
                    continue loop;
1365
                  }
1366
              }
1367
            // ... or has a default
1368
            String value = getAttributeDefaultValue(gi, aname);
1369
 
1370
            if (value == null)
1371
              {
1372
                continue;
1373
              }
1374
            handler.attribute(aname, value, false);
1375
          }
1376
      }
1377
 
1378
    // Figure out if this is a start tag
1379
    // or an empty element, and dispatch an
1380
    // event accordingly.
1381
    switch (c)
1382
      {
1383
      case '>':
1384
        handler.startElement(gi);
1385
        parseContent();
1386
        break;
1387
      case '/':
1388
        require('>');
1389
        handler.startElement(gi);
1390
        handler.endElement(gi);
1391
        break;
1392
      }
1393
 
1394
    // Restore the previous state.
1395
    currentElement = oldElement;
1396
    currentElementContent = oldElementContent;
1397
  }
1398
 
1399
  /**
1400
   * Parse an attribute assignment.
1401
   * <pre>
1402
   * [41] Attribute ::= Name Eq AttValue
1403
   * </pre>
1404
   * @param name The name of the attribute's element.
1405
   * @see SAXDriver#attribute
1406
   */
1407
  private void parseAttribute(String name)
1408
    throws Exception
1409
  {
1410
    String aname;
1411
    String type;
1412
    String value;
1413
    int flags = LIT_ATTRIBUTE |  LIT_ENTITY_REF;
1414
 
1415
    // Read the attribute name.
1416
    aname = readNmtoken(true);
1417
    type = getAttributeType(name, aname);
1418
 
1419
    // Parse '='
1420
    parseEq();
1421
 
1422
    // Read the value, normalizing whitespace
1423
    // unless it is CDATA.
1424
    if (handler.stringInterning)
1425
      {
1426
        if (type == "CDATA" || type == null)
1427
          {
1428
            value = readLiteral(flags);
1429
          }
1430
        else
1431
          {
1432
            value = readLiteral(flags | LIT_NORMALIZE);
1433
          }
1434
      }
1435
    else
1436
      {
1437
        if (type == null || type.equals("CDATA"))
1438
          {
1439
            value = readLiteral(flags);
1440
          }
1441
        else
1442
          {
1443
            value = readLiteral(flags | LIT_NORMALIZE);
1444
          }
1445
      }
1446
 
1447
    // WFC: no duplicate attributes
1448
    for (int i = 0; i < tagAttributePos; i++)
1449
      {
1450
        if (aname.equals(tagAttributes [i]))
1451
          {
1452
            error("duplicate attribute", aname, null);
1453
          }
1454
      }
1455
 
1456
    // Inform the handler about the
1457
    // attribute.
1458
    handler.attribute(aname, value, true);
1459
    dataBufferPos = 0;
1460
 
1461
    // Note that the attribute has been
1462
    // specified.
1463
    if (tagAttributePos == tagAttributes.length)
1464
      {
1465
        String newAttrib[] = new String[tagAttributes.length * 2];
1466
        System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
1467
        tagAttributes = newAttrib;
1468
      }
1469
    tagAttributes[tagAttributePos++] = aname;
1470
  }
1471
 
1472
  /**
1473
   * Parse an equals sign surrounded by optional whitespace.
1474
   * <pre>
1475
   * [25] Eq ::= S? '=' S?
1476
   * </pre>
1477
   */
1478
  private void parseEq()
1479
    throws SAXException, IOException
1480
  {
1481
    skipWhitespace();
1482
    require('=');
1483
    skipWhitespace();
1484
  }
1485
 
1486
  /**
1487
   * Parse an end tag.
1488
   * <pre>
1489
   * [42] ETag ::= '</' Name S? '>'
1490
   * </pre>
1491
   * <p>NOTE: parseContent () chains to here, we already read the
1492
   * "&lt;/".
1493
   */
1494
  private void parseETag()
1495
    throws Exception
1496
  {
1497
    require(currentElement);
1498
    skipWhitespace();
1499
    require('>');
1500
    handler.endElement(currentElement);
1501
    // not re-reporting any SAXException re bogus end tags,
1502
    // even though that diagnostic might be clearer ...
1503
  }
1504
 
1505
  /**
1506
   * Parse the content of an element.
1507
   * <pre>
1508
   * [43] content ::= (element | CharData | Reference
1509
   *    | CDSect | PI | Comment)*
1510
   * [67] Reference ::= EntityRef | CharRef
1511
   * </pre>
1512
   * <p> NOTE: consumes ETtag.
1513
   */
1514
  private void parseContent()
1515
    throws Exception
1516
  {
1517
    char c;
1518
 
1519
    while (true)
1520
      {
1521
        // consume characters (or ignorable whitspace) until delimiter
1522
        parseCharData();
1523
 
1524
        // Handle delimiters
1525
        c = readCh();
1526
        switch (c)
1527
          {
1528
          case '&':       // Found "&"
1529
            c = readCh();
1530
            if (c == '#')
1531
              {
1532
                parseCharRef();
1533
              }
1534
            else
1535
              {
1536
                unread(c);
1537
                parseEntityRef(true);
1538
              }
1539
            isDirtyCurrentElement = true;
1540
            break;
1541
 
1542
          case '<':       // Found "<"
1543
            dataBufferFlush();
1544
            c = readCh();
1545
            switch (c)
1546
              {
1547
              case '!':       // Found "<!"
1548
                c = readCh();
1549
                switch (c)
1550
                  {
1551
                  case '-':     // Found "<!-"
1552
                    require('-');
1553
                    isDirtyCurrentElement = false;
1554
                    parseComment();
1555
                    break;
1556
                  case '[':     // Found "<!["
1557
                    isDirtyCurrentElement = false;
1558
                    require("CDATA[");
1559
                    handler.startCDATA();
1560
                    inCDATA = true;
1561
                    parseCDSect();
1562
                    inCDATA = false;
1563
                    handler.endCDATA();
1564
                    break;
1565
                  default:
1566
                    error("expected comment or CDATA section", c, null);
1567
                    break;
1568
                  }
1569
                break;
1570
 
1571
              case '?':     // Found "<?"
1572
                isDirtyCurrentElement = false;
1573
                parsePI();
1574
                break;
1575
 
1576
              case '/':     // Found "</"
1577
                isDirtyCurrentElement = false;
1578
                parseETag();
1579
                return;
1580
 
1581
              default:     // Found "<" followed by something else
1582
                isDirtyCurrentElement = false;
1583
                unread(c);
1584
                parseElement(false);
1585
                break;
1586
              }
1587
          }
1588
      }
1589
  }
1590
 
1591
  /**
1592
   * Parse an element type declaration.
1593
   * <pre>
1594
   * [45] elementdecl ::= '&lt;!ELEMENT' S Name S contentspec S? '&gt;'
1595
   * </pre>
1596
   * <p> NOTE: the '&lt;!ELEMENT' has already been read.
1597
   */
1598
  private void parseElementDecl()
1599
    throws Exception
1600
  {
1601
    String name;
1602
 
1603
    requireWhitespace();
1604
    // Read the element type name.
1605
    name = readNmtoken(true);
1606
 
1607
    requireWhitespace();
1608
    // Read the content model.
1609
    parseContentspec(name);
1610
 
1611
    skipWhitespace();
1612
    require('>');
1613
  }
1614
 
1615
  /**
1616
   * Content specification.
1617
   * <pre>
1618
   * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1619
   * </pre>
1620
   */
1621
  private void parseContentspec(String name)
1622
    throws Exception
1623
  {
1624
    // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ...
1625
    if (tryRead("EMPTY"))
1626
      {
1627
        setElement(name, CONTENT_EMPTY, null, null);
1628
        if (!skippedPE)
1629
          {
1630
            handler.getDeclHandler().elementDecl(name, "EMPTY");
1631
          }
1632
        return;
1633
      }
1634
    else if (tryRead("ANY"))
1635
      {
1636
        setElement(name, CONTENT_ANY, null, null);
1637
        if (!skippedPE)
1638
          {
1639
            handler.getDeclHandler().elementDecl(name, "ANY");
1640
          }
1641
        return;
1642
      }
1643
    else
1644
      {
1645
        String model;
1646
        char[] saved;
1647
 
1648
        require('(');
1649
        saved = readBuffer;
1650
        dataBufferAppend('(');
1651
        skipWhitespace();
1652
        if (tryRead("#PCDATA"))
1653
          {
1654
            dataBufferAppend("#PCDATA");
1655
            parseMixed(saved);
1656
            model = dataBufferToString();
1657
            setElement(name, CONTENT_MIXED, model, null);
1658
          }
1659
        else
1660
          {
1661
            parseElements(saved);
1662
            model = dataBufferToString();
1663
            setElement(name, CONTENT_ELEMENTS, model, null);
1664
          }
1665
        if (!skippedPE)
1666
          {
1667
            handler.getDeclHandler().elementDecl(name, model);
1668
          }
1669
      }
1670
  }
1671
 
1672
  /**
1673
   * Parse an element-content model.
1674
   * <pre>
1675
   * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
1676
   * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
1677
   * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
1678
   * </pre>
1679
   *
1680
   * <p> NOTE: the opening '(' and S have already been read.
1681
   *
1682
   * @param saved Buffer for entity that should have the terminal ')'
1683
   */
1684
  private void parseElements(char[] saved)
1685
    throws Exception
1686
  {
1687
    char c;
1688
    char sep;
1689
 
1690
    // Parse the first content particle
1691
    skipWhitespace();
1692
    parseCp();
1693
 
1694
    // Check for end or for a separator.
1695
    skipWhitespace();
1696
    c = readCh();
1697
    switch (c)
1698
      {
1699
      case ')':
1700
        // VC: Proper Group/PE Nesting
1701
        if (readBuffer != saved)
1702
          {
1703
            handler.verror("Illegal Group/PE nesting");
1704
          }
1705
 
1706
        dataBufferAppend(')');
1707
        c = readCh();
1708
        switch (c)
1709
          {
1710
          case '*':
1711
          case '+':
1712
          case '?':
1713
            dataBufferAppend(c);
1714
            break;
1715
          default:
1716
            unread(c);
1717
          }
1718
        return;
1719
      case ',':       // Register the separator.
1720
      case '|':
1721
        sep = c;
1722
        dataBufferAppend(c);
1723
        break;
1724
      default:
1725
        error("bad separator in content model", c, null);
1726
        return;
1727
      }
1728
 
1729
    // Parse the rest of the content model.
1730
    while (true)
1731
      {
1732
        skipWhitespace();
1733
        parseCp();
1734
        skipWhitespace();
1735
        c = readCh();
1736
        if (c == ')')
1737
          {
1738
            // VC: Proper Group/PE Nesting
1739
            if (readBuffer != saved)
1740
              {
1741
                handler.verror("Illegal Group/PE nesting");
1742
              }
1743
 
1744
            dataBufferAppend(')');
1745
            break;
1746
          }
1747
        else if (c != sep)
1748
          {
1749
            error("bad separator in content model", c, null);
1750
            return;
1751
          }
1752
        else
1753
          {
1754
            dataBufferAppend(c);
1755
          }
1756
      }
1757
 
1758
    // Check for the occurrence indicator.
1759
    c = readCh();
1760
    switch (c)
1761
      {
1762
      case '?':
1763
      case '*':
1764
      case '+':
1765
        dataBufferAppend(c);
1766
        return;
1767
      default:
1768
        unread(c);
1769
        return;
1770
      }
1771
  }
1772
 
1773
  /**
1774
   * Parse a content particle.
1775
   * <pre>
1776
   * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1777
   * </pre>
1778
   */
1779
  private void parseCp()
1780
    throws Exception
1781
  {
1782
    if (tryRead('('))
1783
      {
1784
        dataBufferAppend('(');
1785
        parseElements(readBuffer);
1786
      }
1787
    else
1788
      {
1789
        dataBufferAppend(readNmtoken(true));
1790
        char c = readCh();
1791
        switch (c)
1792
          {
1793
          case '?':
1794
          case '*':
1795
          case '+':
1796
            dataBufferAppend(c);
1797
            break;
1798
          default:
1799
            unread(c);
1800
            break;
1801
          }
1802
      }
1803
  }
1804
 
1805
  /**
1806
   * Parse mixed content.
1807
   * <pre>
1808
   * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
1809
   *        | '(' S? ('#PCDATA') S? ')'
1810
   * </pre>
1811
   *
1812
   * @param saved Buffer for entity that should have the terminal ')'
1813
   */
1814
  private void parseMixed(char[] saved)
1815
    throws Exception
1816
  {
1817
    // Check for PCDATA alone.
1818
    skipWhitespace();
1819
    if (tryRead(')'))
1820
      {
1821
        // VC: Proper Group/PE Nesting
1822
        if (readBuffer != saved)
1823
          {
1824
            handler.verror("Illegal Group/PE nesting");
1825
          }
1826
 
1827
        dataBufferAppend(")*");
1828
        tryRead('*');
1829
        return;
1830
      }
1831
 
1832
    // Parse mixed content.
1833
    skipWhitespace();
1834
    while (!tryRead(")"))
1835
      {
1836
        require('|');
1837
        dataBufferAppend('|');
1838
        skipWhitespace();
1839
        dataBufferAppend(readNmtoken(true));
1840
        skipWhitespace();
1841
      }
1842
 
1843
    // VC: Proper Group/PE Nesting
1844
    if (readBuffer != saved)
1845
      {
1846
        handler.verror("Illegal Group/PE nesting");
1847
      }
1848
 
1849
    require('*');
1850
    dataBufferAppend(")*");
1851
  }
1852
 
1853
  /**
1854
   * Parse an attribute list declaration.
1855
   * <pre>
1856
   * [52] AttlistDecl ::= '&lt;!ATTLIST' S Name AttDef* S? '&gt;'
1857
   * </pre>
1858
   * <p>NOTE: the '&lt;!ATTLIST' has already been read.
1859
   */
1860
  private void parseAttlistDecl()
1861
    throws Exception
1862
  {
1863
    String elementName;
1864
 
1865
    requireWhitespace();
1866
    elementName = readNmtoken(true);
1867
    boolean white = tryWhitespace();
1868
    while (!tryRead('>'))
1869
      {
1870
        if (!white)
1871
          {
1872
            error("whitespace required before attribute definition");
1873
          }
1874
        parseAttDef(elementName);
1875
        white = tryWhitespace();
1876
      }
1877
  }
1878
 
1879
  /**
1880
   * Parse a single attribute definition.
1881
   * <pre>
1882
   * [53] AttDef ::= S Name S AttType S DefaultDecl
1883
   * </pre>
1884
   */
1885
  private void parseAttDef(String elementName)
1886
    throws Exception
1887
  {
1888
    String name;
1889
    String type;
1890
    String enumer = null;
1891
 
1892
    // Read the attribute name.
1893
    name = readNmtoken(true);
1894
 
1895
    // Read the attribute type.
1896
    requireWhitespace();
1897
    type = readAttType();
1898
 
1899
    // Get the string of enumerated values if necessary.
1900
    if (handler.stringInterning)
1901
      {
1902
        if ("ENUMERATION" == type || "NOTATION" == type)
1903
          {
1904
            enumer = dataBufferToString();
1905
          }
1906
      }
1907
    else
1908
      {
1909
        if ("ENUMERATION".equals(type) || "NOTATION".equals(type))
1910
          {
1911
            enumer = dataBufferToString();
1912
          }
1913
      }
1914
 
1915
    // Read the default value.
1916
    requireWhitespace();
1917
    parseDefault(elementName, name, type, enumer);
1918
  }
1919
 
1920
  /**
1921
   * Parse the attribute type.
1922
   * <pre>
1923
   * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1924
   * [55] StringType ::= 'CDATA'
1925
   * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
1926
   *    | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
1927
   * [57] EnumeratedType ::= NotationType | Enumeration
1928
   * </pre>
1929
   */
1930
  private String readAttType()
1931
    throws Exception
1932
  {
1933
    if (tryRead('('))
1934
      {
1935
        parseEnumeration(false);
1936
        return "ENUMERATION";
1937
      }
1938
    else
1939
      {
1940
        String typeString = readNmtoken(true);
1941
        if (handler.stringInterning)
1942
          {
1943
            if ("NOTATION" == typeString)
1944
              {
1945
                parseNotationType();
1946
                return typeString;
1947
              }
1948
            else if ("CDATA" == typeString
1949
                     || "ID" == typeString
1950
                     || "IDREF" == typeString
1951
                     || "IDREFS" == typeString
1952
                     || "ENTITY" == typeString
1953
                     || "ENTITIES" == typeString
1954
                     || "NMTOKEN" == typeString
1955
                     || "NMTOKENS" == typeString)
1956
              {
1957
                return typeString;
1958
              }
1959
          }
1960
        else
1961
          {
1962
            if ("NOTATION".equals(typeString))
1963
              {
1964
                parseNotationType();
1965
                return typeString;
1966
              }
1967
            else if ("CDATA".equals(typeString)
1968
                     || "ID".equals(typeString)
1969
                     || "IDREF".equals(typeString)
1970
                     || "IDREFS".equals(typeString)
1971
                     || "ENTITY".equals(typeString)
1972
                     || "ENTITIES".equals(typeString)
1973
                     || "NMTOKEN".equals(typeString)
1974
                     || "NMTOKENS".equals(typeString))
1975
              {
1976
                return typeString;
1977
              }
1978
          }
1979
        error("illegal attribute type", typeString, null);
1980
        return null;
1981
      }
1982
  }
1983
 
1984
  /**
1985
   * Parse an enumeration.
1986
   * <pre>
1987
   * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1988
   * </pre>
1989
   * <p>NOTE: the '(' has already been read.
1990
   */
1991
  private void parseEnumeration(boolean isNames)
1992
    throws Exception
1993
  {
1994
    dataBufferAppend('(');
1995
 
1996
    // Read the first token.
1997
    skipWhitespace();
1998
    dataBufferAppend(readNmtoken(isNames));
1999
    // Read the remaining tokens.
2000
    skipWhitespace();
2001
    while (!tryRead(')'))
2002
      {
2003
        require('|');
2004
        dataBufferAppend('|');
2005
        skipWhitespace();
2006
        dataBufferAppend(readNmtoken (isNames));
2007
        skipWhitespace();
2008
      }
2009
    dataBufferAppend(')');
2010
  }
2011
 
2012
  /**
2013
   * Parse a notation type for an attribute.
2014
   * <pre>
2015
   * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
2016
   *    (S? '|' S? name)* S? ')'
2017
   * </pre>
2018
   * <p>NOTE: the 'NOTATION' has already been read
2019
   */
2020
  private void parseNotationType()
2021
    throws Exception
2022
  {
2023
    requireWhitespace();
2024
    require('(');
2025
 
2026
    parseEnumeration(true);
2027
  }
2028
 
2029
  /**
2030
   * Parse the default value for an attribute.
2031
   * <pre>
2032
   * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
2033
   *    | (('#FIXED' S)? AttValue)
2034
   * </pre>
2035
   */
2036
  private void parseDefault(String elementName, String name,
2037
                            String type, String enumer)
2038
    throws Exception
2039
  {
2040
    int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
2041
    String value = null;
2042
    int flags = LIT_ATTRIBUTE;
2043
    boolean saved = expandPE;
2044
    String defaultType = null;
2045
 
2046
    // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace
2047
    // chars to spaces (doesn't matter when that's done if it doesn't
2048
    // interfere with char refs expanding to whitespace).
2049
 
2050
    if (!skippedPE)
2051
      {
2052
        flags |= LIT_ENTITY_REF;
2053
        if (handler.stringInterning)
2054
          {
2055
            if ("CDATA" != type)
2056
              {
2057
                flags |= LIT_NORMALIZE;
2058
              }
2059
          }
2060
        else
2061
          {
2062
            if (!"CDATA".equals(type))
2063
              {
2064
                flags |= LIT_NORMALIZE;
2065
              }
2066
          }
2067
      }
2068
 
2069
    expandPE = false;
2070
    if (tryRead('#'))
2071
      {
2072
        if (tryRead("FIXED"))
2073
          {
2074
            defaultType = "#FIXED";
2075
            valueType = ATTRIBUTE_DEFAULT_FIXED;
2076
            requireWhitespace();
2077
            value = readLiteral(flags);
2078
          }
2079
        else if (tryRead("REQUIRED"))
2080
          {
2081
            defaultType = "#REQUIRED";
2082
            valueType = ATTRIBUTE_DEFAULT_REQUIRED;
2083
          }
2084
        else if (tryRead("IMPLIED"))
2085
          {
2086
            defaultType = "#IMPLIED";
2087
            valueType = ATTRIBUTE_DEFAULT_IMPLIED;
2088
          }
2089
        else
2090
          {
2091
            error("illegal keyword for attribute default value");
2092
          }
2093
      }
2094
    else
2095
      {
2096
        value = readLiteral(flags);
2097
      }
2098
    expandPE = saved;
2099
    setAttribute(elementName, name, type, enumer, value, valueType);
2100
    if (handler.stringInterning)
2101
      {
2102
        if ("ENUMERATION" == type)
2103
          {
2104
            type = enumer;
2105
          }
2106
        else if ("NOTATION" == type)
2107
          {
2108
            type = "NOTATION " + enumer;
2109
          }
2110
      }
2111
    else
2112
      {
2113
        if ("ENUMERATION".equals(type))
2114
          {
2115
            type = enumer;
2116
          }
2117
        else if ("NOTATION".equals(type))
2118
          {
2119
            type = "NOTATION " + enumer;
2120
          }
2121
      }
2122
    if (!skippedPE)
2123
      {
2124
        handler.getDeclHandler().attributeDecl(elementName, name, type,
2125
                                               defaultType, value);
2126
      }
2127
  }
2128
 
2129
  /**
2130
   * Parse a conditional section.
2131
   * <pre>
2132
   * [61] conditionalSect ::= includeSect || ignoreSect
2133
   * [62] includeSect ::= '&lt;![' S? 'INCLUDE' S? '['
2134
   *    extSubsetDecl ']]&gt;'
2135
   * [63] ignoreSect ::= '&lt;![' S? 'IGNORE' S? '['
2136
   *    ignoreSectContents* ']]&gt;'
2137
   * [64] ignoreSectContents ::= Ignore
2138
   *    ('&lt;![' ignoreSectContents* ']]&gt;' Ignore )*
2139
   * [65] Ignore ::= Char* - (Char* ( '&lt;![' | ']]&gt;') Char* )
2140
   * </pre>
2141
   * <p> NOTE: the '&gt;![' has already been read.
2142
   */
2143
  private void parseConditionalSect(char[] saved)
2144
    throws Exception
2145
  {
2146
    skipWhitespace();
2147
    if (tryRead("INCLUDE"))
2148
      {
2149
        skipWhitespace();
2150
        require('[');
2151
        // VC: Proper Conditional Section/PE Nesting
2152
        if (readBuffer != saved)
2153
          {
2154
            handler.verror("Illegal Conditional Section/PE nesting");
2155
          }
2156
        skipWhitespace();
2157
        while (!tryRead("]]>"))
2158
          {
2159
            parseMarkupdecl();
2160
            skipWhitespace();
2161
          }
2162
      }
2163
    else if (tryRead("IGNORE"))
2164
      {
2165
        skipWhitespace();
2166
        require('[');
2167
        // VC: Proper Conditional Section/PE Nesting
2168
        if (readBuffer != saved)
2169
          {
2170
            handler.verror("Illegal Conditional Section/PE nesting");
2171
          }
2172
        int nesting = 1;
2173
        char c;
2174
        expandPE = false;
2175
        for (int nest = 1; nest > 0; )
2176
          {
2177
            c = readCh();
2178
            switch (c)
2179
              {
2180
              case '<':
2181
                if (tryRead("!["))
2182
                  {
2183
                    nest++;
2184
                  }
2185
                break;
2186
              case ']':
2187
                if (tryRead("]>"))
2188
                  {
2189
                    nest--;
2190
                  }
2191
              }
2192
          }
2193
        expandPE = true;
2194
      }
2195
    else
2196
      {
2197
        error("conditional section must begin with INCLUDE or IGNORE");
2198
      }
2199
  }
2200
 
2201
  private void parseCharRef()
2202
    throws SAXException, IOException
2203
  {
2204
    parseCharRef(true /* do flushDataBuffer by default */);
2205
  }
2206
 
2207
  /**
2208
   * Try to read a character reference without consuming data from buffer.
2209
   * <pre>
2210
   * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
2211
   * </pre>
2212
   * <p>NOTE: the '&#' has already been read.
2213
   */
2214
  private void tryReadCharRef()
2215
    throws SAXException, IOException
2216
  {
2217
    int value = 0;
2218
    char c;
2219
 
2220
    if (tryRead('x'))
2221
      {
2222
loop1:
2223
        while (true)
2224
          {
2225
            c = readCh();
2226
            if (c == ';')
2227
              {
2228
                break loop1;
2229
              }
2230
            else
2231
              {
2232
                int n = Character.digit(c, 16);
2233
                if (n == -1)
2234
                  {
2235
                    error("illegal character in character reference", c, null);
2236
                    break loop1;
2237
                  }
2238
                value *= 16;
2239
                value += n;
2240
              }
2241
          }
2242
      }
2243
    else
2244
      {
2245
loop2:
2246
        while (true)
2247
          {
2248
            c = readCh();
2249
            if (c == ';')
2250
              {
2251
                break loop2;
2252
              }
2253
            else
2254
              {
2255
                int n = Character.digit(c, 10);
2256
                if (n == -1)
2257
                  {
2258
                    error("illegal character in character reference", c, null);
2259
                    break loop2;
2260
                  }
2261
                value *= 10;
2262
                value += n;
2263
              }
2264
          }
2265
      }
2266
 
2267
    // check for character refs being legal XML
2268
    if ((value < 0x0020
2269
         && ! (value == '\n' || value == '\t' || value == '\r'))
2270
        || (value >= 0xD800 && value <= 0xDFFF)
2271
        || value == 0xFFFE || value == 0xFFFF
2272
        || value > 0x0010ffff)
2273
      {
2274
        error("illegal XML character reference U+"
2275
              + Integer.toHexString(value));
2276
      }
2277
 
2278
    // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2279
    //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2280
    if (value > 0x0010ffff)
2281
      {
2282
        // too big for surrogate
2283
        error("character reference " + value + " is too large for UTF-16",
2284
              Integer.toString(value), null);
2285
      }
2286
 
2287
  }
2288
 
2289
  /**
2290
   * Read and interpret a character reference.
2291
   * <pre>
2292
   * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
2293
   * </pre>
2294
   * <p>NOTE: the '&#' has already been read.
2295
   */
2296
  private void parseCharRef(boolean doFlush)
2297
    throws SAXException, IOException
2298
  {
2299
    int value = 0;
2300
    char c;
2301
 
2302
    if (tryRead('x'))
2303
      {
2304
loop1:
2305
        while (true)
2306
          {
2307
            c = readCh();
2308
            if (c == ';')
2309
              {
2310
                break loop1;
2311
              }
2312
            else
2313
              {
2314
                int n = Character.digit(c, 16);
2315
                if (n == -1)
2316
                  {
2317
                    error("illegal character in character reference", c, null);
2318
                    break loop1;
2319
                  }
2320
                value *= 16;
2321
                value += n;
2322
              }
2323
          }
2324
      }
2325
    else
2326
      {
2327
loop2:
2328
        while (true)
2329
          {
2330
            c = readCh();
2331
            if (c == ';')
2332
              {
2333
                break loop2;
2334
              }
2335
            else
2336
              {
2337
                int n = Character.digit(c, 10);
2338
                if (n == -1)
2339
                  {
2340
                    error("illegal character in character reference", c, null);
2341
                    break loop2;
2342
                  }
2343
                value *= 10;
2344
                value += c - '0';
2345
              }
2346
          }
2347
      }
2348
 
2349
    // check for character refs being legal XML
2350
    if ((value < 0x0020
2351
         && ! (value == '\n' || value == '\t' || value == '\r'))
2352
        || (value >= 0xD800 && value <= 0xDFFF)
2353
        || value == 0xFFFE || value == 0xFFFF
2354
        || value > 0x0010ffff)
2355
      {
2356
        error("illegal XML character reference U+"
2357
              + Integer.toHexString(value));
2358
      }
2359
 
2360
    // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2361
    //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2362
    if (value <= 0x0000ffff)
2363
      {
2364
        // no surrogates needed
2365
        dataBufferAppend((char) value);
2366
      }
2367
    else if (value <= 0x0010ffff)
2368
      {
2369
        value -= 0x10000;
2370
        // > 16 bits, surrogate needed
2371
        dataBufferAppend((char) (0xd800 | (value >> 10)));
2372
        dataBufferAppend((char) (0xdc00 | (value & 0x0003ff)));
2373
      }
2374
    else
2375
      {
2376
        // too big for surrogate
2377
        error("character reference " + value + " is too large for UTF-16",
2378
              Integer.toString(value), null);
2379
      }
2380
    if (doFlush)
2381
      {
2382
        dataBufferFlush();
2383
      }
2384
  }
2385
 
2386
  /**
2387
   * Parse and expand an entity reference.
2388
   * <pre>
2389
   * [68] EntityRef ::= '&' Name ';'
2390
   * </pre>
2391
   * <p>NOTE: the '&amp;' has already been read.
2392
   * @param externalAllowed External entities are allowed here.
2393
   */
2394
  private void parseEntityRef(boolean externalAllowed)
2395
    throws SAXException, IOException
2396
  {
2397
    String name;
2398
 
2399
    name = readNmtoken(true);
2400
    require(';');
2401
    switch (getEntityType(name))
2402
      {
2403
      case ENTITY_UNDECLARED:
2404
        // NOTE:  XML REC describes amazingly convoluted handling for
2405
        // this case.  Nothing as meaningful as being a WFness error
2406
        // unless the processor might _legitimately_ not have seen a
2407
        // declaration ... which is what this implements.
2408
        String message;
2409
 
2410
        message = "reference to undeclared general entity " + name;
2411
        if (skippedPE && !docIsStandalone)
2412
          {
2413
            handler.verror(message);
2414
            // we don't know this entity, and it might be external...
2415
            if (externalAllowed)
2416
              {
2417
                handler.skippedEntity(name);
2418
              }
2419
          }
2420
        else
2421
          {
2422
            error(message);
2423
          }
2424
        break;
2425
      case ENTITY_INTERNAL:
2426
          pushString(name, getEntityValue(name));
2427
 
2428
          //workaround for possible input pop before marking
2429
          //the buffer reading position
2430
          char t = readCh();
2431
          unread(t);
2432
          int bufferPosMark = readBufferPos;
2433
 
2434
          int end = readBufferPos + getEntityValue(name).length();
2435
          for (int k = readBufferPos; k < end; k++)
2436
            {
2437
              t = readCh();
2438
              if (t == '&')
2439
                {
2440
                  t = readCh();
2441
                  if (t  == '#')
2442
                    {
2443
                      //try to match a character ref
2444
                      tryReadCharRef();
2445
 
2446
                      //everything has been read
2447
                      if (readBufferPos >= end)
2448
                        {
2449
                          break;
2450
                        }
2451
                      k = readBufferPos;
2452
                      continue;
2453
                    }
2454
                  else if (Character.isLetter(t))
2455
                    {
2456
                      //looks like an entity ref
2457
                      unread(t);
2458
                      readNmtoken(true);
2459
                      require(';');
2460
 
2461
                      //everything has been read
2462
                      if (readBufferPos >= end)
2463
                        {
2464
                          break;
2465
                        }
2466
                      k = readBufferPos;
2467
                      continue;
2468
                    }
2469
                  error(" malformed entity reference");
2470
                }
2471
 
2472
            }
2473
          readBufferPos = bufferPosMark;
2474
          break;
2475
      case ENTITY_TEXT:
2476
          if (externalAllowed)
2477
            {
2478
              pushURL(false, name, getEntityIds(name),
2479
                      null, null, null, true);
2480
            }
2481
          else
2482
            {
2483
              error("reference to external entity in attribute value.",
2484
                    name, null);
2485
            }
2486
          break;
2487
      case ENTITY_NDATA:
2488
          if (externalAllowed)
2489
            {
2490
              error("unparsed entity reference in content", name, null);
2491
            }
2492
          else
2493
            {
2494
              error("reference to external entity in attribute value.",
2495
                    name, null);
2496
            }
2497
          break;
2498
      default:
2499
          throw new RuntimeException();
2500
      }
2501
  }
2502
 
2503
  /**
2504
   * Parse and expand a parameter entity reference.
2505
   * <pre>
2506
   * [69] PEReference ::= '%' Name ';'
2507
   * </pre>
2508
   * <p>NOTE: the '%' has already been read.
2509
   */
2510
  private void parsePEReference()
2511
    throws SAXException, IOException
2512
  {
2513
    String name;
2514
 
2515
    name = "%" + readNmtoken(true);
2516
    require(';');
2517
    switch (getEntityType(name))
2518
      {
2519
      case ENTITY_UNDECLARED:
2520
        // VC: Entity Declared
2521
        handler.verror("reference to undeclared parameter entity " + name);
2522
 
2523
        // we should disable handling of all subsequent declarations
2524
        // unless this is a standalone document (info discarded)
2525
        break;
2526
      case ENTITY_INTERNAL:
2527
        if (inLiteral)
2528
          {
2529
            pushString(name, getEntityValue(name));
2530
          }
2531
        else
2532
          {
2533
            pushString(name, ' ' + getEntityValue(name) + ' ');
2534
          }
2535
        break;
2536
      case ENTITY_TEXT:
2537
        if (!inLiteral)
2538
          {
2539
            pushString(null, " ");
2540
          }
2541
        pushURL(true, name, getEntityIds(name), null, null, null, true);
2542
        if (!inLiteral)
2543
          {
2544
            pushString(null, " ");
2545
          }
2546
        break;
2547
      }
2548
  }
2549
 
2550
  /**
2551
   * Parse an entity declaration.
2552
   * <pre>
2553
   * [70] EntityDecl ::= GEDecl | PEDecl
2554
   * [71] GEDecl ::= '&lt;!ENTITY' S Name S EntityDef S? '&gt;'
2555
   * [72] PEDecl ::= '&lt;!ENTITY' S '%' S Name S PEDef S? '&gt;'
2556
   * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
2557
   * [74] PEDef ::= EntityValue | ExternalID
2558
   * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2559
   *       | 'PUBLIC' S PubidLiteral S SystemLiteral
2560
   * [76] NDataDecl ::= S 'NDATA' S Name
2561
   * </pre>
2562
   * <p>NOTE: the '&lt;!ENTITY' has already been read.
2563
   */
2564
  private void parseEntityDecl()
2565
    throws Exception
2566
  {
2567
    boolean peFlag = false;
2568
    int flags = 0;
2569
 
2570
    // Check for a parameter entity.
2571
    expandPE = false;
2572
    requireWhitespace();
2573
    if (tryRead('%'))
2574
      {
2575
        peFlag = true;
2576
        requireWhitespace();
2577
      }
2578
    expandPE = true;
2579
 
2580
    // Read the entity name, and prepend
2581
    // '%' if necessary.
2582
    String name = readNmtoken(true);
2583
    //NE08
2584
    if (name.indexOf(':') >= 0)
2585
      {
2586
        error("Illegal character(':') in entity name ", name, null);
2587
      }
2588
    if (peFlag)
2589
      {
2590
        name = "%" + name;
2591
      }
2592
 
2593
    // Read the entity value.
2594
    requireWhitespace();
2595
    char c = readCh();
2596
    unread (c);
2597
    if (c == '"' || c == '\'')
2598
      {
2599
        // Internal entity ... replacement text has expanded refs
2600
        // to characters and PEs, but not to general entities
2601
        String value = readLiteral(flags);
2602
        setInternalEntity(name, value);
2603
      }
2604
    else
2605
      {
2606
        // Read the external IDs
2607
        ExternalIdentifiers ids = readExternalIds(false, false);
2608
 
2609
        // Check for NDATA declaration.
2610
        boolean white = tryWhitespace();
2611
        if (!peFlag && tryRead("NDATA"))
2612
          {
2613
            if (!white)
2614
              {
2615
                error("whitespace required before NDATA");
2616
              }
2617
            requireWhitespace();
2618
            String notationName = readNmtoken(true);
2619
            if (!skippedPE)
2620
              {
2621
                setExternalEntity(name, ENTITY_NDATA, ids, notationName);
2622
                handler.unparsedEntityDecl(name, ids.publicId, ids.systemId,
2623
                                           ids.baseUri, notationName);
2624
              }
2625
          }
2626
        else if (!skippedPE)
2627
          {
2628
            setExternalEntity(name, ENTITY_TEXT, ids, null);
2629
            handler.getDeclHandler()
2630
              .externalEntityDecl(name, ids.publicId,
2631
                                   handler.resolveURIs()
2632
                                   // FIXME: ASSUMES not skipped
2633
                                   // "false" forces error on bad URI
2634
                                   ? handler.absolutize(ids.baseUri,
2635
                                                        ids.systemId,
2636
                                                        false)
2637
                                   : ids.systemId);
2638
          }
2639
      }
2640
 
2641
    // Finish the declaration.
2642
    skipWhitespace();
2643
    require('>');
2644
  }
2645
 
2646
  /**
2647
   * Parse a notation declaration.
2648
   * <pre>
2649
   * [82] NotationDecl ::= '&lt;!NOTATION' S Name S
2650
   *    (ExternalID | PublicID) S? '&gt;'
2651
   * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2652
   * </pre>
2653
   * <P>NOTE: the '&lt;!NOTATION' has already been read.
2654
   */
2655
  private void parseNotationDecl()
2656
    throws Exception
2657
  {
2658
    String nname;
2659
    ExternalIdentifiers ids;
2660
 
2661
    requireWhitespace();
2662
    nname = readNmtoken(true);
2663
    //NE08
2664
    if (nname.indexOf(':') >= 0)
2665
      {
2666
        error("Illegal character(':') in notation name ", nname, null);
2667
      }
2668
    requireWhitespace();
2669
 
2670
    // Read the external identifiers.
2671
    ids = readExternalIds(true, false);
2672
 
2673
    // Register the notation.
2674
    setNotation(nname, ids);
2675
 
2676
    skipWhitespace();
2677
    require('>');
2678
  }
2679
 
2680
  /**
2681
   * Parse character data.
2682
   * <pre>
2683
   * [14] CharData ::= [^&lt;&amp;]* - ([^&lt;&amp;]* ']]&gt;' [^&lt;&amp;]*)
2684
   * </pre>
2685
   */
2686
  private void parseCharData()
2687
    throws Exception
2688
  {
2689
    char c;
2690
    int state = 0;
2691
    boolean pureWhite = false;
2692
 
2693
    // assert (dataBufferPos == 0);
2694
 
2695
    // are we expecting pure whitespace?  it might be dirty...
2696
    if ((currentElementContent == CONTENT_ELEMENTS) && !isDirtyCurrentElement)
2697
      {
2698
        pureWhite = true;
2699
      }
2700
 
2701
    // always report right out of readBuffer
2702
    // to minimize (pointless) buffer copies
2703
    while (true)
2704
      {
2705
        int lineAugment = 0;
2706
        int columnAugment = 0;
2707
        int i;
2708
 
2709
loop:
2710
        for (i = readBufferPos; i < readBufferLength; i++)
2711
          {
2712
            switch (c = readBuffer[i])
2713
              {
2714
              case '\n':
2715
                lineAugment++;
2716
                columnAugment = 0;
2717
                // pureWhite unmodified
2718
                break;
2719
              case '\r':  // should not happen!!
2720
              case '\t':
2721
              case ' ':
2722
                // pureWhite unmodified
2723
                columnAugment++;
2724
                break;
2725
              case '&':
2726
              case '<':
2727
                columnAugment++;
2728
                // pureWhite unmodified
2729
                // CLEAN end of text sequence
2730
                state = 1;
2731
                break loop;
2732
              case ']':
2733
                // that's not a whitespace char, and
2734
                // can not terminate pure whitespace either
2735
                pureWhite = false;
2736
                if ((i + 2) < readBufferLength)
2737
                  {
2738
                    if (readBuffer [i + 1] == ']'
2739
                        && readBuffer [i + 2] == '>')
2740
                      {
2741
                        // ERROR end of text sequence
2742
                        state = 2;
2743
                        break loop;
2744
                      }
2745
                  }
2746
                else
2747
                  {
2748
                    // FIXME missing two end-of-buffer cases
2749
                  }
2750
                columnAugment++;
2751
                break;
2752
              default:
2753
                if ((c < 0x0020 || c > 0xFFFD)
2754
                    || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
2755
                        && xmlVersion == XML_11))
2756
                  {
2757
                    error("illegal XML character U+"
2758
                          + Integer.toHexString(c));
2759
                  }
2760
                // that's not a whitespace char
2761
                pureWhite = false;
2762
                columnAugment++;
2763
              }
2764
          }
2765
 
2766
        // report text thus far
2767
        if (lineAugment > 0)
2768
          {
2769
            line += lineAugment;
2770
            column = columnAugment;
2771
          }
2772
        else
2773
          {
2774
            column += columnAugment;
2775
          }
2776
 
2777
        // report characters/whitspace
2778
        int length = i - readBufferPos;
2779
 
2780
        if (length != 0)
2781
          {
2782
            if (pureWhite)
2783
              {
2784
                handler.ignorableWhitespace(readBuffer,
2785
                                            readBufferPos, length);
2786
              }
2787
            else
2788
              {
2789
                handler.charData(readBuffer, readBufferPos, length);
2790
              }
2791
            readBufferPos = i;
2792
          }
2793
 
2794
        if (state != 0)
2795
          {
2796
            break;
2797
          }
2798
 
2799
        // fill next buffer from this entity, or
2800
        // pop stack and continue with previous entity
2801
        unread(readCh());
2802
      }
2803
    if (!pureWhite)
2804
      {
2805
        isDirtyCurrentElement = true;
2806
      }
2807
    // finish, maybe with error
2808
    if (state != 1)  // finish, no error
2809
      {
2810
        error("character data may not contain ']]>'");
2811
      }
2812
  }
2813
 
2814
  //////////////////////////////////////////////////////////////////////
2815
  // High-level reading and scanning methods.
2816
  //////////////////////////////////////////////////////////////////////
2817
 
2818
  /**
2819
   * Require whitespace characters.
2820
   */
2821
  private void requireWhitespace()
2822
    throws SAXException, IOException
2823
  {
2824
    char c = readCh();
2825
    if (isWhitespace(c))
2826
      {
2827
        skipWhitespace();
2828
      }
2829
    else
2830
      {
2831
        error("whitespace required", c, null);
2832
      }
2833
  }
2834
 
2835
  /**
2836
   * Skip whitespace characters.
2837
   * <pre>
2838
   * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2839
   * </pre>
2840
   */
2841
  private void skipWhitespace()
2842
    throws SAXException, IOException
2843
  {
2844
    // Start with a little cheat.  Most of
2845
    // the time, the white space will fall
2846
    // within the current read buffer; if
2847
    // not, then fall through.
2848
    if (USE_CHEATS)
2849
      {
2850
        int lineAugment = 0;
2851
        int columnAugment = 0;
2852
 
2853
loop:
2854
        for (int i = readBufferPos; i < readBufferLength; i++)
2855
          {
2856
            switch (readBuffer[i])
2857
              {
2858
              case ' ':
2859
              case '\t':
2860
              case '\r':
2861
                columnAugment++;
2862
                break;
2863
              case '\n':
2864
                lineAugment++;
2865
                columnAugment = 0;
2866
                break;
2867
              case '%':
2868
                if (expandPE)
2869
                  {
2870
                    break loop;
2871
                  }
2872
                // else fall through...
2873
              default:
2874
                readBufferPos = i;
2875
                if (lineAugment > 0)
2876
                  {
2877
                    line += lineAugment;
2878
                    column = columnAugment;
2879
                  }
2880
                else
2881
                  {
2882
                    column += columnAugment;
2883
                  }
2884
                return;
2885
              }
2886
          }
2887
      }
2888
 
2889
    // OK, do it the slow way.
2890
    char c = readCh ();
2891
    while (isWhitespace(c))
2892
      {
2893
        c = readCh();
2894
      }
2895
    unread(c);
2896
  }
2897
 
2898
  /**
2899
   * Read a name or (when parsing an enumeration) name token.
2900
   * <pre>
2901
   * [5] Name ::= (Letter | '_' | ':') (NameChar)*
2902
   * [7] Nmtoken ::= (NameChar)+
2903
   * </pre>
2904
   */
2905
  private String readNmtoken(boolean isName)
2906
    throws SAXException, IOException
2907
  {
2908
    char c;
2909
 
2910
    if (USE_CHEATS)
2911
      {
2912
loop:
2913
        for (int i = readBufferPos; i < readBufferLength; i++)
2914
          {
2915
            c = readBuffer[i];
2916
            switch (c)
2917
              {
2918
              case '%':
2919
                if (expandPE)
2920
                  {
2921
                    break loop;
2922
                  }
2923
                // else fall through...
2924
 
2925
                // What may legitimately come AFTER a name/nmtoken?
2926
              case '<': case '>': case '&':
2927
              case ',': case '|': case '*': case '+': case '?':
2928
              case ')':
2929
              case '=':
2930
              case '\'': case '"':
2931
              case '[':
2932
              case ' ': case '\t': case '\r': case '\n':
2933
              case ';':
2934
              case '/':
2935
                int start = readBufferPos;
2936
                if (i == start)
2937
                  {
2938
                    error("name expected", readBuffer[i], null);
2939
                  }
2940
                readBufferPos = i;
2941
                return intern(readBuffer, start, i - start);
2942
 
2943
              default:
2944
                // FIXME ... per IBM's OASIS test submission, these:
2945
                //   ?    U+06dd
2946
                //   Combining  U+309B
2947
                //these switches are kind of ugly but at least we won't
2948
                //have to go over the whole lits for each char
2949
                if (isName && i == readBufferPos)
2950
                  {
2951
                    char c2 = (char) (c & 0x00f0);
2952
                    switch (c & 0xff00)
2953
                      {
2954
                        //starting with 01
2955
                      case 0x0100:
2956
                        switch (c2)
2957
                          {
2958
                          case 0x0030:
2959
                            if (c == 0x0132 || c == 0x0133 || c == 0x013f)
2960
                              {
2961
                                error("Not a name start character, U+"
2962
                                      + Integer.toHexString(c));
2963
                              }
2964
                            break;
2965
                          case 0x0040:
2966
                            if (c == 0x0140 || c == 0x0149)
2967
                              {
2968
                                error("Not a name start character, U+"
2969
                                      + Integer.toHexString(c));
2970
                              }
2971
                            break;
2972
                          case 0x00c0:
2973
                            if (c == 0x01c4 || c == 0x01cc)
2974
                              {
2975
                                error("Not a name start character, U+"
2976
                                      + Integer.toHexString(c));
2977
                              }
2978
                            break;
2979
                          case 0x00f0:
2980
                            if (c == 0x01f1 || c == 0x01f3)
2981
                              {
2982
                                error("Not a name start character, U+"
2983
                                      + Integer.toHexString(c));
2984
                              }
2985
                            break;
2986
                          case 0x00b0:
2987
                            if (c == 0x01f1 || c == 0x01f3)
2988
                              {
2989
                                error("Not a name start character, U+"
2990
                                      + Integer.toHexString(c));
2991
                              }
2992
                            break;
2993
                          default:
2994
                            if (c == 0x017f)
2995
                              {
2996
                                error("Not a name start character, U+"
2997
                                      + Integer.toHexString(c));
2998
                              }
2999
                          }
3000
 
3001
                        break;
3002
                        //starting with 11
3003
                      case 0x1100:
3004
                        switch (c2)
3005
                          {
3006
                          case 0x0000:
3007
                            if (c == 0x1104 || c == 0x1108 ||
3008
                                c == 0x110a || c == 0x110d)
3009
                              {
3010
                                error("Not a name start character, U+"
3011
                                      + Integer.toHexString(c));
3012
                              }
3013
                            break;
3014
                          case 0x0030:
3015
                            if (c == 0x113b || c == 0x113f)
3016
                              {
3017
                                error("Not a name start character, U+"
3018
                                      + Integer.toHexString(c));
3019
                              }
3020
                            break;
3021
                          case 0x0040:
3022
                            if (c == 0x1141 || c == 0x114d
3023
                                || c == 0x114f )
3024
                              {
3025
                                error("Not a name start character, U+"
3026
                                      + Integer.toHexString(c));
3027
                              }
3028
                            break;
3029
                          case 0x0050:
3030
                            if (c == 0x1151 || c == 0x1156)
3031
                              {
3032
                                error("Not a name start character, U+"
3033
                                      + Integer.toHexString(c));
3034
                              }
3035
                            break;
3036
                          case 0x0060:
3037
                            if (c == 0x1162 || c == 0x1164
3038
                                || c == 0x1166 || c == 0x116b
3039
                                || c == 0x116f)
3040
                              {
3041
                                error("Not a name start character, U+"
3042
                                      + Integer.toHexString(c));
3043
                              }
3044
                            break;
3045
                          case 0x00b0:
3046
                            if (c == 0x11b6 || c == 0x11b9
3047
                                || c == 0x11bb || c == 0x116f)
3048
                              {
3049
                                error("Not a name start character, U+"
3050
                                      + Integer.toHexString(c));
3051
                              }
3052
                            break;
3053
                          default:
3054
                            if (c == 0x1174 || c == 0x119f
3055
                                || c == 0x11ac || c == 0x11c3
3056
                                || c == 0x11f1)
3057
                              {
3058
                                error("Not a name start character, U+"
3059
                                      + Integer.toHexString(c));
3060
                              }
3061
                          }
3062
                        break;
3063
                      default:
3064
                        if (c == 0x0e46 || c == 0x1011
3065
                            || c == 0x212f || c == 0x0587
3066
                            || c == 0x0230 )
3067
                          {
3068
                            error("Not a name start character, U+"
3069
                                  + Integer.toHexString(c));
3070
                          }
3071
                      }
3072
                  }
3073
                // punt on exact tests from Appendix A; approximate
3074
                // them using the Unicode ID start/part rules
3075
                if (i == readBufferPos && isName)
3076
                  {
3077
                    if (!Character.isUnicodeIdentifierStart(c)
3078
                        && c != ':' && c != '_')
3079
                      {
3080
                        error("Not a name start character, U+"
3081
                              + Integer.toHexString(c));
3082
                      }
3083
                  }
3084
                else if (!Character.isUnicodeIdentifierPart(c)
3085
                         && c != '-' && c != ':' && c != '_' && c != '.'
3086
                         && !isExtender(c))
3087
                  {
3088
                    error("Not a name character, U+"
3089
                          + Integer.toHexString(c));
3090
                  }
3091
              }
3092
          }
3093
      }
3094
 
3095
    nameBufferPos = 0;
3096
 
3097
    // Read the first character.
3098
    while (true)
3099
      {
3100
        c = readCh();
3101
        switch (c)
3102
          {
3103
          case '%':
3104
          case '<': case '>': case '&':
3105
          case ',': case '|': case '*': case '+': case '?':
3106
          case ')':
3107
          case '=':
3108
          case '\'': case '"':
3109
          case '[':
3110
          case ' ': case '\t': case '\n': case '\r':
3111
          case ';':
3112
          case '/':
3113
            unread(c);
3114
            if (nameBufferPos == 0)
3115
              {
3116
                error ("name expected");
3117
              }
3118
            // punt on exact tests from Appendix A, but approximate them
3119
            if (isName
3120
                && !Character.isUnicodeIdentifierStart(nameBuffer[0])
3121
                && ":_".indexOf(nameBuffer[0]) == -1)
3122
              {
3123
                error("Not a name start character, U+"
3124
                      + Integer.toHexString(nameBuffer[0]));
3125
              }
3126
            String s = intern(nameBuffer, 0, nameBufferPos);
3127
            nameBufferPos = 0;
3128
            return s;
3129
          default:
3130
            // punt on exact tests from Appendix A, but approximate them
3131
 
3132
            if ((nameBufferPos != 0 || !isName)
3133
                && !Character.isUnicodeIdentifierPart(c)
3134
                && ":-_.".indexOf(c) == -1
3135
                && !isExtender(c))
3136
              {
3137
                error("Not a name character, U+"
3138
                      + Integer.toHexString(c));
3139
              }
3140
            if (nameBufferPos >= nameBuffer.length)
3141
              {
3142
                nameBuffer =
3143
                  (char[]) extendArray(nameBuffer,
3144
                                       nameBuffer.length, nameBufferPos);
3145
              }
3146
            nameBuffer[nameBufferPos++] = c;
3147
          }
3148
      }
3149
  }
3150
 
3151
  private static boolean isExtender(char c)
3152
  {
3153
    // [88] Extender ::= ...
3154
    return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
3155
      || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
3156
      || (c >= 0x3031 && c <= 0x3035)
3157
      || (c >= 0x309d && c <= 0x309e)
3158
      || (c >= 0x30fc && c <= 0x30fe);
3159
  }
3160
 
3161
  /**
3162
   * Read a literal.  With matching single or double quotes as
3163
   * delimiters (and not embedded!) this is used to parse:
3164
   * <pre>
3165
   *  [9] EntityValue ::= ... ([^%&amp;] | PEReference | Reference)* ...
3166
   *  [10] AttValue ::= ... ([^<&] | Reference)* ...
3167
   *  [11] SystemLiteral ::= ... (URLchar - "'")* ...
3168
   *  [12] PubidLiteral ::= ... (PubidChar - "'")* ...
3169
   * </pre>
3170
   * as well as the quoted strings in XML and text declarations
3171
   * (for version, encoding, and standalone) which have their
3172
   * own constraints.
3173
   */
3174
  private String readLiteral(int flags)
3175
    throws SAXException, IOException
3176
  {
3177
    char delim, c;
3178
    int startLine = line;
3179
    boolean saved = expandPE;
3180
    boolean savedReport = doReport;
3181
 
3182
    // Find the first delimiter.
3183
    delim = readCh();
3184
    if (delim != '"' && delim != '\'')
3185
      {
3186
        error("expected '\"' or \"'\"", delim, null);
3187
        return null;
3188
      }
3189
    inLiteral = true;
3190
    if ((flags & LIT_DISABLE_PE) != 0)
3191
      {
3192
        expandPE = false;
3193
      }
3194
    doReport = false;
3195
 
3196
    // Each level of input source has its own buffer; remember
3197
    // ours, so we won't read the ending delimiter from any
3198
    // other input source, regardless of entity processing.
3199
    char[] ourBuf = readBuffer;
3200
 
3201
    // Read the literal.
3202
    try
3203
      {
3204
        c = readCh();
3205
        boolean ampRead = false;
3206
loop:
3207
        while (! (c == delim && readBuffer == ourBuf))
3208
          {
3209
            switch (c)
3210
              {
3211
                // attributes and public ids are normalized
3212
                // in almost the same ways
3213
              case '\n':
3214
              case '\r':
3215
                if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0)
3216
                  {
3217
                    c = ' ';
3218
                  }
3219
                break;
3220
              case '\t':
3221
                if ((flags & LIT_ATTRIBUTE) != 0)
3222
                  {
3223
                    c = ' ';
3224
                  }
3225
                break;
3226
              case '&':
3227
                c = readCh();
3228
                // Char refs are expanded immediately, except for
3229
                // all the cases where it's deferred.
3230
                if (c == '#')
3231
                  {
3232
                    if ((flags & LIT_DISABLE_CREF) != 0)
3233
                      {
3234
                        dataBufferAppend('&');
3235
                        break;
3236
                      }
3237
                    parseCharRef(false /* Do not do flushDataBuffer */);
3238
 
3239
                    // exotic WFness risk: this is an entity literal,
3240
                    // dataBuffer [dataBufferPos - 1] == '&', and
3241
                    // following chars are a _partial_ entity/char ref
3242
 
3243
                    // It looks like an entity ref ...
3244
                  }
3245
                else
3246
                  {
3247
                    unread(c);
3248
                    // Expand it?
3249
                    if ((flags & LIT_ENTITY_REF) > 0)
3250
                      {
3251
                        parseEntityRef(false);
3252
                        if (String.valueOf(readBuffer).equals("&#38;"))
3253
                          {
3254
                            ampRead = true;
3255
                          }
3256
                        //Is it just data?
3257
                      }
3258
                    else if ((flags & LIT_DISABLE_EREF) != 0)
3259
                      {
3260
                        dataBufferAppend('&');
3261
 
3262
                        // OK, it will be an entity ref -- expanded later.
3263
                      }
3264
                    else
3265
                      {
3266
                        String name = readNmtoken(true);
3267
                        require(';');
3268
                        dataBufferAppend('&');
3269
                        dataBufferAppend(name);
3270
                        dataBufferAppend(';');
3271
                      }
3272
                  }
3273
                c = readCh();
3274
                continue loop;
3275
 
3276
              case '<':
3277
                // and why?  Perhaps so "&foo;" expands the same
3278
                // inside and outside an attribute?
3279
                if ((flags & LIT_ATTRIBUTE) != 0)
3280
                  {
3281
                    error("attribute values may not contain '<'");
3282
                  }
3283
                break;
3284
 
3285
                // We don't worry about case '%' and PE refs, readCh does.
3286
 
3287
              default:
3288
                break;
3289
              }
3290
            dataBufferAppend(c);
3291
            c = readCh();
3292
          }
3293
      }
3294
    catch (EOFException e)
3295
      {
3296
        error("end of input while looking for delimiter (started on line "
3297
              + startLine + ')', null, Character.toString(delim));
3298
      }
3299
    inLiteral = false;
3300
    expandPE = saved;
3301
    doReport = savedReport;
3302
 
3303
    // Normalise whitespace if necessary.
3304
    if ((flags & LIT_NORMALIZE) > 0)
3305
      {
3306
        dataBufferNormalize();
3307
      }
3308
 
3309
    // Return the value.
3310
    return dataBufferToString();
3311
  }
3312
 
3313
  /**
3314
   * Try reading external identifiers.
3315
   * A system identifier is not required for notations.
3316
   * @param inNotation Are we parsing a notation decl?
3317
   * @param isSubset Parsing external subset decl (may be omitted)?
3318
   * @return A three-member String array containing the identifiers,
3319
   *  or nulls. Order: public, system, baseURI.
3320
   */
3321
  private ExternalIdentifiers readExternalIds(boolean inNotation,
3322
                                              boolean isSubset)
3323
    throws Exception
3324
  {
3325
    char c;
3326
    ExternalIdentifiers ids = new ExternalIdentifiers();
3327
    int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
3328
 
3329
    if (tryRead("PUBLIC"))
3330
      {
3331
        requireWhitespace();
3332
        ids.publicId = readLiteral(LIT_NORMALIZE | LIT_PUBID | flags);
3333
        if (inNotation)
3334
          {
3335
            skipWhitespace();
3336
            c = readCh();
3337
            unread(c);
3338
            if (c == '"' || c == '\'')
3339
              {
3340
                ids.systemId = readLiteral(flags);
3341
              }
3342
          }
3343
        else
3344
          {
3345
            requireWhitespace();
3346
            ids.systemId = readLiteral(flags);
3347
          }
3348
 
3349
        for (int i = 0; i < ids.publicId.length(); i++)
3350
          {
3351
            c = ids.publicId.charAt(i);
3352
            if (c >= 'a' && c <= 'z')
3353
              {
3354
                continue;
3355
              }
3356
            if (c >= 'A' && c <= 'Z')
3357
              {
3358
                continue;
3359
              }
3360
            if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf(c) != -1)
3361
              {
3362
                continue;
3363
              }
3364
            error("illegal PUBLIC id character U+"
3365
                  + Integer.toHexString(c));
3366
          }
3367
      }
3368
    else if (tryRead("SYSTEM"))
3369
      {
3370
        requireWhitespace();
3371
        ids.systemId = readLiteral(flags);
3372
      }
3373
    else if (!isSubset)
3374
      {
3375
        error("missing SYSTEM or PUBLIC keyword");
3376
      }
3377
 
3378
    if (ids.systemId != null)
3379
      {
3380
        if (ids.systemId.indexOf('#') != -1)
3381
          {
3382
            handler.verror("SYSTEM id has a URI fragment: " + ids.systemId);
3383
          }
3384
        ids.baseUri = handler.getSystemId();
3385
        if (ids.baseUri == null && uriWarnings)
3386
          {
3387
            handler.warn("No base URI; hope URI is absolute: "
3388
                         + ids.systemId);
3389
          }
3390
      }
3391
 
3392
    return ids;
3393
  }
3394
 
3395
  /**
3396
   * Test if a character is whitespace.
3397
   * <pre>
3398
   * [3] S ::= (#x20 | #x9 | #xd | #xa)+
3399
   * </pre>
3400
   * @param c The character to test.
3401
   * @return true if the character is whitespace.
3402
   */
3403
  private final boolean isWhitespace(char c)
3404
  {
3405
    if (c > 0x20)
3406
      {
3407
        return false;
3408
      }
3409
    if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d)
3410
      {
3411
        return true;
3412
      }
3413
    return false;  // illegal ...
3414
  }
3415
 
3416
  //////////////////////////////////////////////////////////////////////
3417
  // Utility routines.
3418
  //////////////////////////////////////////////////////////////////////
3419
 
3420
  /**
3421
   * Add a character to the data buffer.
3422
   */
3423
  private void dataBufferAppend(char c)
3424
  {
3425
    // Expand buffer if necessary.
3426
    if (dataBufferPos >= dataBuffer.length)
3427
      {
3428
        dataBuffer = (char[]) extendArray(dataBuffer,
3429
                                          dataBuffer.length, dataBufferPos);
3430
      }
3431
    dataBuffer[dataBufferPos++] = c;
3432
  }
3433
 
3434
  /**
3435
   * Add a string to the data buffer.
3436
   */
3437
  private void dataBufferAppend(String s)
3438
  {
3439
    dataBufferAppend(s.toCharArray(), 0, s.length());
3440
  }
3441
 
3442
  /**
3443
   * Append (part of) a character array to the data buffer.
3444
   */
3445
  private void dataBufferAppend(char[] ch, int start, int length)
3446
  {
3447
    dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length,
3448
                                      dataBufferPos + length);
3449
 
3450
    System.arraycopy(ch, start, dataBuffer, dataBufferPos, length);
3451
    dataBufferPos += length;
3452
  }
3453
 
3454
  /**
3455
   * Normalise space characters in the data buffer.
3456
   */
3457
  private void dataBufferNormalize()
3458
  {
3459
    int i = 0;
3460
    int j = 0;
3461
    int end = dataBufferPos;
3462
 
3463
    // Skip spaces at the start.
3464
    while (j < end && dataBuffer[j] == ' ')
3465
      {
3466
        j++;
3467
      }
3468
 
3469
    // Skip whitespace at the end.
3470
    while (end > j && dataBuffer[end - 1] == ' ')
3471
      {
3472
        end --;
3473
      }
3474
 
3475
    // Start copying to the left.
3476
    while (j < end)
3477
      {
3478
 
3479
        char c = dataBuffer[j++];
3480
 
3481
        // Normalise all other spaces to
3482
        // a single space.
3483
        if (c == ' ')
3484
          {
3485
            while (j < end && dataBuffer[j++] == ' ')
3486
              {
3487
                continue;
3488
              }
3489
            dataBuffer[i++] = ' ';
3490
            dataBuffer[i++] = dataBuffer[j - 1];
3491
          }
3492
        else
3493
          {
3494
            dataBuffer[i++] = c;
3495
          }
3496
      }
3497
 
3498
    // The new length is <= the old one.
3499
    dataBufferPos = i;
3500
  }
3501
 
3502
  /**
3503
   * Convert the data buffer to a string.
3504
   */
3505
  private String dataBufferToString()
3506
  {
3507
    String s = new String(dataBuffer, 0, dataBufferPos);
3508
    dataBufferPos = 0;
3509
    return s;
3510
  }
3511
 
3512
  /**
3513
   * Flush the contents of the data buffer to the handler, as
3514
   * appropriate, and reset the buffer for new input.
3515
   */
3516
  private void dataBufferFlush()
3517
    throws SAXException
3518
  {
3519
    if (currentElementContent == CONTENT_ELEMENTS
3520
        && dataBufferPos > 0
3521
        && !inCDATA)
3522
      {
3523
        // We can't just trust the buffer to be whitespace, there
3524
        // are (error) cases when it isn't
3525
        for (int i = 0; i < dataBufferPos; i++)
3526
          {
3527
            if (!isWhitespace(dataBuffer[i]))
3528
              {
3529
                handler.charData(dataBuffer, 0, dataBufferPos);
3530
                dataBufferPos = 0;
3531
              }
3532
          }
3533
        if (dataBufferPos > 0)
3534
          {
3535
            handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos);
3536
            dataBufferPos = 0;
3537
          }
3538
      }
3539
    else if (dataBufferPos > 0)
3540
      {
3541
        handler.charData(dataBuffer, 0, dataBufferPos);
3542
        dataBufferPos = 0;
3543
      }
3544
  }
3545
 
3546
  /**
3547
   * Require a string to appear, or throw an exception.
3548
   * <p><em>Precondition:</em> Entity expansion is not required.
3549
   * <p><em>Precondition:</em> data buffer has no characters that
3550
   * will get sent to the application.
3551
   */
3552
  private void require(String delim)
3553
    throws SAXException, IOException
3554
  {
3555
    int length = delim.length();
3556
    char[] ch;
3557
 
3558
    if (length < dataBuffer.length)
3559
      {
3560
        ch = dataBuffer;
3561
        delim.getChars(0, length, ch, 0);
3562
      }
3563
    else
3564
      {
3565
        ch = delim.toCharArray();
3566
      }
3567
 
3568
    if (USE_CHEATS && length <= (readBufferLength - readBufferPos))
3569
      {
3570
        int offset = readBufferPos;
3571
 
3572
        for (int i = 0; i < length; i++, offset++)
3573
          {
3574
            if (ch[i] != readBuffer[offset])
3575
              {
3576
                error ("required string", null, delim);
3577
              }
3578
          }
3579
        readBufferPos = offset;
3580
 
3581
      }
3582
    else
3583
      {
3584
        for (int i = 0; i < length; i++)
3585
          {
3586
            require(ch[i]);
3587
          }
3588
      }
3589
  }
3590
 
3591
  /**
3592
   * Require a character to appear, or throw an exception.
3593
   */
3594
  private void require(char delim)
3595
    throws SAXException, IOException
3596
  {
3597
    char c = readCh();
3598
 
3599
    if (c != delim)
3600
      {
3601
        error("required character", c, Character.toString(delim));
3602
      }
3603
  }
3604
 
3605
  /**
3606
   * Create an interned string from a character array.
3607
   * &AElig;lfred uses this method to create an interned version
3608
   * of all names and name tokens, so that it can test equality
3609
   * with <code>==</code> instead of <code>String.equals ()</code>.
3610
   *
3611
   * <p>This is much more efficient than constructing a non-interned
3612
   * string first, and then interning it.
3613
   *
3614
   * @param ch an array of characters for building the string.
3615
   * @param start the starting position in the array.
3616
   * @param length the number of characters to place in the string.
3617
   * @return an interned string.
3618
   * @see #intern (String)
3619
   * @see java.lang.String#intern
3620
   */
3621
  public String intern(char[] ch, int start, int length)
3622
  {
3623
    int index = 0;
3624
    int hash = 0;
3625
    Object[] bucket;
3626
 
3627
    // Generate a hash code.  This is a widely used string hash,
3628
    // often attributed to Brian Kernighan.
3629
    for (int i = start; i < start + length; i++)
3630
      {
3631
        hash = 31 * hash + ch[i];
3632
      }
3633
    hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH;
3634
 
3635
    // Get the bucket -- consists of {array,String} pairs
3636
    if ((bucket = symbolTable[hash]) == null)
3637
      {
3638
        // first string in this bucket
3639
        bucket = new Object[8];
3640
 
3641
        // Search for a matching tuple, and
3642
        // return the string if we find one.
3643
      }
3644
    else
3645
      {
3646
        while (index < bucket.length)
3647
          {
3648
            char[] chFound = (char[]) bucket[index];
3649
 
3650
            // Stop when we hit an empty entry.
3651
            if (chFound == null)
3652
              {
3653
                break;
3654
              }
3655
 
3656
            // If they're the same length, check for a match.
3657
            if (chFound.length == length)
3658
              {
3659
                for (int i = 0; i < chFound.length; i++)
3660
                  {
3661
                    // continue search on failure
3662
                    if (ch[start + i] != chFound[i])
3663
                      {
3664
                        break;
3665
                      }
3666
                    else if (i == length - 1)
3667
                      {
3668
                        // That's it, we have a match!
3669
                        return (String) bucket[index + 1];
3670
                      }
3671
                  }
3672
              }
3673
            index += 2;
3674
          }
3675
        // Not found -- we'll have to add it.
3676
 
3677
        // Do we have to grow the bucket?
3678
        bucket = (Object[]) extendArray(bucket, bucket.length, index);
3679
      }
3680
    symbolTable[hash] = bucket;
3681
 
3682
    // OK, add it to the end of the bucket -- "local" interning.
3683
    // Intern "globally" to let applications share interning benefits.
3684
    // That is, "!=" and "==" work on our strings, not just equals().
3685
    String s = new String(ch, start, length).intern();
3686
    bucket[index] = s.toCharArray();
3687
    bucket[index + 1] = s;
3688
    return s;
3689
  }
3690
 
3691
  /**
3692
   * Ensure the capacity of an array, allocating a new one if
3693
   * necessary.  Usually extends only for name hash collisions.
3694
   */
3695
  private Object extendArray(Object array, int currentSize, int requiredSize)
3696
  {
3697
    if (requiredSize < currentSize)
3698
      {
3699
        return array;
3700
      }
3701
    else
3702
      {
3703
        Object newArray = null;
3704
        int newSize = currentSize * 2;
3705
 
3706
        if (newSize <= requiredSize)
3707
          {
3708
            newSize = requiredSize + 1;
3709
          }
3710
 
3711
        if (array instanceof char[])
3712
          {
3713
            newArray = new char[newSize];
3714
          }
3715
        else if (array instanceof Object[])
3716
          {
3717
            newArray = new Object[newSize];
3718
          }
3719
        else
3720
          {
3721
            throw new RuntimeException();
3722
          }
3723
 
3724
        System.arraycopy(array, 0, newArray, 0, currentSize);
3725
        return newArray;
3726
      }
3727
  }
3728
 
3729
  //////////////////////////////////////////////////////////////////////
3730
  // XML query routines.
3731
  //////////////////////////////////////////////////////////////////////
3732
 
3733
  boolean isStandalone()
3734
  {
3735
    return docIsStandalone;
3736
  }
3737
 
3738
  //
3739
  // Elements
3740
  //
3741
 
3742
  private int getContentType(ElementDecl element, int defaultType)
3743
  {
3744
    int retval;
3745
 
3746
    if (element == null)
3747
      {
3748
        return defaultType;
3749
      }
3750
    retval = element.contentType;
3751
    if (retval == CONTENT_UNDECLARED)
3752
      {
3753
        retval = defaultType;
3754
      }
3755
    return retval;
3756
  }
3757
 
3758
  /**
3759
   * Look up the content type of an element.
3760
   * @param name The element type name.
3761
   * @return An integer constant representing the content type.
3762
   * @see #CONTENT_UNDECLARED
3763
   * @see #CONTENT_ANY
3764
   * @see #CONTENT_EMPTY
3765
   * @see #CONTENT_MIXED
3766
   * @see #CONTENT_ELEMENTS
3767
   */
3768
  public int getElementContentType(String name)
3769
  {
3770
    ElementDecl element = (ElementDecl) elementInfo.get(name);
3771
    return getContentType(element, CONTENT_UNDECLARED);
3772
  }
3773
 
3774
  /**
3775
   * Register an element.
3776
   * Array format:
3777
   *  [0] element type name
3778
   *  [1] content model (mixed, elements only)
3779
   *  [2] attribute hash table
3780
   */
3781
  private void setElement(String name, int contentType,
3782
                          String contentModel, HashMap attributes)
3783
    throws SAXException
3784
  {
3785
    if (skippedPE)
3786
      {
3787
        return;
3788
      }
3789
 
3790
    ElementDecl element = (ElementDecl) elementInfo.get(name);
3791
 
3792
    // first <!ELEMENT ...> or <!ATTLIST ...> for this type?
3793
    if (element == null)
3794
      {
3795
        element = new ElementDecl();
3796
        element.contentType = contentType;
3797
        element.contentModel = contentModel;
3798
        element.attributes = attributes;
3799
        elementInfo.put(name, element);
3800
        return;
3801
      }
3802
 
3803
    // <!ELEMENT ...> declaration?
3804
    if (contentType != CONTENT_UNDECLARED)
3805
      {
3806
        // ... following an associated <!ATTLIST ...>
3807
        if (element.contentType == CONTENT_UNDECLARED)
3808
          {
3809
            element.contentType = contentType;
3810
            element.contentModel = contentModel;
3811
          }
3812
        else
3813
          {
3814
            // VC: Unique Element Type Declaration
3815
            handler.verror("multiple declarations for element type: "
3816
                           + name);
3817
          }
3818
      }
3819
 
3820
    // first <!ATTLIST ...>, before <!ELEMENT ...> ?
3821
    else if (attributes != null)
3822
      {
3823
        element.attributes = attributes;
3824
      }
3825
  }
3826
 
3827
  /**
3828
   * Look up the attribute hash table for an element.
3829
   * The hash table is the second item in the element array.
3830
   */
3831
  private HashMap getElementAttributes(String name)
3832
  {
3833
    ElementDecl element = (ElementDecl) elementInfo.get(name);
3834
    return (element == null) ? null : element.attributes;
3835
  }
3836
 
3837
  //
3838
  // Attributes
3839
  //
3840
 
3841
  /**
3842
   * Get the declared attributes for an element type.
3843
   * @param elname The name of the element type.
3844
   * @return An iterator over all the attributes declared for
3845
   *   a specific element type.  The results will be valid only
3846
   *   after the DTD (if any) has been parsed.
3847
   * @see #getAttributeType
3848
   * @see #getAttributeEnumeration
3849
   * @see #getAttributeDefaultValueType
3850
   * @see #getAttributeDefaultValue
3851
   * @see #getAttributeExpandedValue
3852
   */
3853
  private Iterator declaredAttributes(ElementDecl element)
3854
  {
3855
    HashMap attlist;
3856
 
3857
    if (element == null)
3858
      {
3859
        return null;
3860
      }
3861
    if ((attlist = element.attributes) == null)
3862
      {
3863
        return null;
3864
      }
3865
    return attlist.keySet().iterator();
3866
  }
3867
 
3868
  /**
3869
   * Get the declared attributes for an element type.
3870
   * @param elname The name of the element type.
3871
   * @return An iterator over all the attributes declared for
3872
   *   a specific element type.  The results will be valid only
3873
   *   after the DTD (if any) has been parsed.
3874
   * @see #getAttributeType
3875
   * @see #getAttributeEnumeration
3876
   * @see #getAttributeDefaultValueType
3877
   * @see #getAttributeDefaultValue
3878
   * @see #getAttributeExpandedValue
3879
   */
3880
  public Iterator declaredAttributes(String elname)
3881
  {
3882
    return declaredAttributes((ElementDecl) elementInfo.get(elname));
3883
  }
3884
 
3885
  /**
3886
   * Retrieve the declared type of an attribute.
3887
   * @param name The name of the associated element.
3888
   * @param aname The name of the attribute.
3889
   * @return An interend string denoting the type, or null
3890
   *  indicating an undeclared attribute.
3891
   */
3892
  public String getAttributeType(String name, String aname)
3893
  {
3894
    AttributeDecl attribute = getAttribute(name, aname);
3895
    return (attribute == null) ? null : attribute.type;
3896
  }
3897
 
3898
  /**
3899
   * Retrieve the allowed values for an enumerated attribute type.
3900
   * @param name The name of the associated element.
3901
   * @param aname The name of the attribute.
3902
   * @return A string containing the token list.
3903
   */
3904
  public String getAttributeEnumeration(String name, String aname)
3905
  {
3906
    AttributeDecl attribute = getAttribute(name, aname);
3907
    // assert:  attribute.enumeration is "ENUMERATION" or "NOTATION"
3908
    return (attribute == null) ? null : attribute.enumeration;
3909
  }
3910
 
3911
  /**
3912
   * Retrieve the default value of a declared attribute.
3913
   * @param name The name of the associated element.
3914
   * @param aname The name of the attribute.
3915
   * @return The default value, or null if the attribute was
3916
   *   #IMPLIED or simply undeclared and unspecified.
3917
   * @see #getAttributeExpandedValue
3918
   */
3919
  public String getAttributeDefaultValue(String name, String aname)
3920
  {
3921
    AttributeDecl attribute = getAttribute(name, aname);
3922
    return (attribute == null) ? null : attribute.value;
3923
  }
3924
 
3925
    /*
3926
 
3927
// FIXME:  Leaving this in, until W3C finally resolves the confusion
3928
// between parts of the XML 2nd REC about when entity declarations
3929
// are guaranteed to be known.  Current code matches what section 5.1
3930
// (conformance) describes, but some readings of the self-contradicting
3931
// text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that
3932
// attribute expansion/normalization must be deferred in some cases
3933
// (just TRY to identify them!).
3934
 
3935
     * Retrieve the expanded value of a declared attribute.
3936
     * <p>General entities (and char refs) will be expanded (once).
3937
     * @param name The name of the associated element.
3938
     * @param aname The name of the attribute.
3939
     * @return The expanded default value, or null if the attribute was
3940
     *   #IMPLIED or simply undeclared
3941
     * @see #getAttributeDefaultValue
3942
    public String getAttributeExpandedValue (String name, String aname)
3943
    throws Exception
3944
    {
3945
  AttributeDecl attribute = getAttribute (name, aname);
3946
 
3947
  if (attribute == null) {
3948
      return null;
3949
  } else if (attribute.defaultValue == null && attribute.value != null) {
3950
      // we MUST use the same buf for both quotes else the literal
3951
      // can't be properly terminated
3952
      char buf [] = new char [1];
3953
      int  flags = LIT_ENTITY_REF | LIT_ATTRIBUTE;
3954
      String type = getAttributeType (name, aname);
3955
 
3956
      if (type != "CDATA" && type != null)
3957
    flags |= LIT_NORMALIZE;
3958
      buf [0] = '"';
3959
      pushCharArray (null, buf, 0, 1);
3960
      pushString (null, attribute.value);
3961
      pushCharArray (null, buf, 0, 1);
3962
      attribute.defaultValue = readLiteral (flags);
3963
  }
3964
  return attribute.defaultValue;
3965
    }
3966
     */
3967
 
3968
  /**
3969
   * Retrieve the default value mode of a declared attribute.
3970
   * @see #ATTRIBUTE_DEFAULT_SPECIFIED
3971
   * @see #ATTRIBUTE_DEFAULT_IMPLIED
3972
   * @see #ATTRIBUTE_DEFAULT_REQUIRED
3973
   * @see #ATTRIBUTE_DEFAULT_FIXED
3974
   */
3975
  public int getAttributeDefaultValueType(String name, String aname)
3976
  {
3977
    AttributeDecl attribute = getAttribute(name, aname);
3978
    return (attribute == null) ? ATTRIBUTE_DEFAULT_UNDECLARED :
3979
      attribute.valueType;
3980
  }
3981
 
3982
  /**
3983
   * Register an attribute declaration for later retrieval.
3984
   * Format:
3985
   * - String type
3986
   * - String default value
3987
   * - int value type
3988
   * - enumeration
3989
   * - processed default value
3990
   */
3991
  private void setAttribute(String elName, String name, String type,
3992
                            String enumeration, String value, int valueType)
3993
    throws Exception
3994
  {
3995
    HashMap attlist;
3996
 
3997
    if (skippedPE)
3998
      {
3999
        return;
4000
      }
4001
 
4002
    // Create a new hashtable if necessary.
4003
    attlist = getElementAttributes(elName);
4004
    if (attlist == null)
4005
      {
4006
        attlist = new HashMap();
4007
      }
4008
 
4009
    // ignore multiple attribute declarations!
4010
    if (attlist.get(name) != null)
4011
      {
4012
        // warn ...
4013
        return;
4014
      }
4015
    else
4016
      {
4017
        AttributeDecl attribute = new AttributeDecl();
4018
        attribute.type = type;
4019
        attribute.value = value;
4020
        attribute.valueType = valueType;
4021
        attribute.enumeration = enumeration;
4022
        attlist.put(name, attribute);
4023
 
4024
        // save; but don't overwrite any existing <!ELEMENT ...>
4025
        setElement(elName, CONTENT_UNDECLARED, null, attlist);
4026
      }
4027
  }
4028
 
4029
  /**
4030
   * Retrieve the attribute declaration for the given element name and name.
4031
   */
4032
  private AttributeDecl getAttribute(String elName, String name)
4033
  {
4034
    HashMap attlist = getElementAttributes(elName);
4035
    return (attlist == null) ? null : (AttributeDecl) attlist.get(name);
4036
  }
4037
 
4038
  //
4039
  // Entities
4040
  //
4041
 
4042
  /**
4043
   * Find the type of an entity.
4044
   * @returns An integer constant representing the entity type.
4045
   * @see #ENTITY_UNDECLARED
4046
   * @see #ENTITY_INTERNAL
4047
   * @see #ENTITY_NDATA
4048
   * @see #ENTITY_TEXT
4049
   */
4050
  public int getEntityType(String ename)
4051
  {
4052
    EntityInfo entity = (EntityInfo) entityInfo.get(ename);
4053
    return (entity == null) ?  ENTITY_UNDECLARED : entity.type;
4054
  }
4055
 
4056
  /**
4057
   * Return an external entity's identifiers.
4058
   * @param ename The name of the external entity.
4059
   * @return The entity's public identifier, system identifier, and base URI.
4060
   *  Null if the entity was not declared as an external entity.
4061
   * @see #getEntityType
4062
   */
4063
  public ExternalIdentifiers getEntityIds(String ename)
4064
  {
4065
    EntityInfo entity = (EntityInfo) entityInfo.get(ename);
4066
    return (entity == null) ? null : entity.ids;
4067
  }
4068
 
4069
  /**
4070
   * Return an internal entity's replacement text.
4071
   * @param ename The name of the internal entity.
4072
   * @return The entity's replacement text, or null if
4073
   *   the entity was not declared as an internal entity.
4074
   * @see #getEntityType
4075
   */
4076
  public String getEntityValue(String ename)
4077
  {
4078
    EntityInfo entity = (EntityInfo) entityInfo.get(ename);
4079
    return (entity == null) ? null : entity.value;
4080
  }
4081
 
4082
  /**
4083
   * Register an entity declaration for later retrieval.
4084
   */
4085
  private void setInternalEntity(String eName, String value)
4086
    throws SAXException
4087
  {
4088
    if (skippedPE)
4089
      {
4090
        return;
4091
      }
4092
 
4093
    if (entityInfo.get(eName) == null)
4094
      {
4095
        EntityInfo entity = new EntityInfo();
4096
        entity.type = ENTITY_INTERNAL;
4097
        entity.value = value;
4098
        entityInfo.put(eName, entity);
4099
      }
4100
    if (handler.stringInterning)
4101
      {
4102
        if ("lt" == eName || "gt" == eName || "quot" == eName
4103
            || "apos" == eName || "amp" == eName)
4104
          {
4105
            return;
4106
          }
4107
      }
4108
    else
4109
      {
4110
        if ("lt".equals(eName) || "gt".equals(eName) || "quot".equals(eName)
4111
            || "apos".equals(eName) || "amp".equals(eName))
4112
          {
4113
            return;
4114
          }
4115
      }
4116
    handler.getDeclHandler().internalEntityDecl(eName, value);
4117
  }
4118
 
4119
  /**
4120
   * Register an external entity declaration for later retrieval.
4121
   */
4122
  private void setExternalEntity(String eName, int eClass,
4123
                                 ExternalIdentifiers ids, String nName)
4124
  {
4125
    if (entityInfo.get(eName) == null)
4126
      {
4127
        EntityInfo entity = new EntityInfo();
4128
        entity.type = eClass;
4129
        entity.ids = ids;
4130
        entity.notationName = nName;
4131
        entityInfo.put(eName, entity);
4132
      }
4133
  }
4134
 
4135
  //
4136
  // Notations.
4137
  //
4138
 
4139
  /**
4140
   * Report a notation declaration, checking for duplicates.
4141
   */
4142
  private void setNotation(String nname, ExternalIdentifiers ids)
4143
    throws SAXException
4144
  {
4145
    if (skippedPE)
4146
      {
4147
        return;
4148
      }
4149
 
4150
    handler.notationDecl(nname, ids.publicId, ids.systemId, ids.baseUri);
4151
    if (notationInfo.get(nname) == null)
4152
      {
4153
        notationInfo.put(nname, nname);
4154
      }
4155
    else
4156
      {
4157
        // VC: Unique Notation Name
4158
        handler.verror("Duplicate notation name decl: " + nname);
4159
      }
4160
  }
4161
 
4162
  //
4163
  // Location.
4164
  //
4165
 
4166
  /**
4167
   * Return the current line number.
4168
   */
4169
  public int getLineNumber()
4170
  {
4171
    return line;
4172
  }
4173
 
4174
  /**
4175
   * Return the current column number.
4176
   */
4177
  public int getColumnNumber()
4178
  {
4179
    return column;
4180
  }
4181
 
4182
  //////////////////////////////////////////////////////////////////////
4183
  // High-level I/O.
4184
  //////////////////////////////////////////////////////////////////////
4185
 
4186
  /**
4187
   * Read a single character from the readBuffer.
4188
   * <p>The readDataChunk () method maintains the buffer.
4189
   * <p>If we hit the end of an entity, try to pop the stack and
4190
   * keep going.
4191
   * <p> (This approach doesn't really enforce XML's rules about
4192
   * entity boundaries, but this is not currently a validating
4193
   * parser).
4194
   * <p>This routine also attempts to keep track of the current
4195
   * position in external entities, but it's not entirely accurate.
4196
   * @return The next available input character.
4197
   * @see #unread (char)
4198
   * @see #readDataChunk
4199
   * @see #readBuffer
4200
   * @see #line
4201
   * @return The next character from the current input source.
4202
   */
4203
  private char readCh()
4204
    throws SAXException, IOException
4205
  {
4206
    // As long as there's nothing in the
4207
    // read buffer, try reading more data
4208
    // (for an external entity) or popping
4209
    // the entity stack (for either).
4210
    while (readBufferPos >= readBufferLength)
4211
      {
4212
        switch (sourceType)
4213
          {
4214
          case INPUT_READER:
4215
          case INPUT_STREAM:
4216
            readDataChunk();
4217
            while (readBufferLength < 1)
4218
              {
4219
                popInput();
4220
                if (readBufferLength < 1)
4221
                  {
4222
                    readDataChunk();
4223
                  }
4224
              }
4225
            break;
4226
 
4227
          default:
4228
 
4229
            popInput();
4230
            break;
4231
          }
4232
      }
4233
 
4234
    char c = readBuffer[readBufferPos++];
4235
 
4236
    if (c == '\n')
4237
      {
4238
        line++;
4239
        column = 0;
4240
      }
4241
    else
4242
      {
4243
        if (c == '<')
4244
          {
4245
            /* the most common return to parseContent () ... NOP */
4246
          }
4247
        else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
4248
                 || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
4249
                     && xmlVersion == XML_11))
4250
          {
4251
            error("illegal XML character U+" + Integer.toHexString(c));
4252
          }
4253
 
4254
        // If we're in the DTD and in a context where PEs get expanded,
4255
        // do so ... 1/14/2000 errata identify those contexts.  There
4256
        // are also spots in the internal subset where PE refs are fatal
4257
        // errors, hence yet another flag.
4258
        else if (c == '%' && expandPE)
4259
          {
4260
            if (peIsError)
4261
              {
4262
                error("PE reference within decl in internal subset.");
4263
              }
4264
            parsePEReference();
4265
            return readCh();
4266
          }
4267
        column++;
4268
      }
4269
 
4270
    return c;
4271
  }
4272
 
4273
  /**
4274
   * Push a single character back onto the current input stream.
4275
   * <p>This method usually pushes the character back onto
4276
   * the readBuffer.
4277
   * <p>I don't think that this would ever be called with
4278
   * readBufferPos = 0, because the methods always reads a character
4279
   * before unreading it, but just in case, I've added a boundary
4280
   * condition.
4281
   * @param c The character to push back.
4282
   * @see #readCh
4283
   * @see #unread (char[])
4284
   * @see #readBuffer
4285
   */
4286
  private void unread(char c)
4287
    throws SAXException
4288
  {
4289
    // Normal condition.
4290
    if (c == '\n')
4291
      {
4292
        line--;
4293
        column = -1;
4294
      }
4295
    if (readBufferPos > 0)
4296
      {
4297
        readBuffer[--readBufferPos] = c;
4298
      }
4299
    else
4300
      {
4301
        pushString(null, Character.toString(c));
4302
      }
4303
  }
4304
 
4305
  /**
4306
   * Push a char array back onto the current input stream.
4307
   * <p>NOTE: you must <em>never</em> push back characters that you
4308
   * haven't actually read: use pushString () instead.
4309
   * @see #readCh
4310
   * @see #unread (char)
4311
   * @see #readBuffer
4312
   * @see #pushString
4313
   */
4314
  private void unread(char[] ch, int length)
4315
    throws SAXException
4316
  {
4317
    for (int i = 0; i < length; i++)
4318
      {
4319
        if (ch[i] == '\n')
4320
          {
4321
            line--;
4322
            column = -1;
4323
          }
4324
      }
4325
    if (length < readBufferPos)
4326
      {
4327
        readBufferPos -= length;
4328
      }
4329
    else
4330
      {
4331
        pushCharArray(null, ch, 0, length);
4332
      }
4333
  }
4334
 
4335
  /**
4336
   * Push, or skip, a new external input source.
4337
   * The source will be some kind of parsed entity, such as a PE
4338
   * (including the external DTD subset) or content for the body.
4339
   *
4340
   * @param url The java.net.URL object for the entity.
4341
   * @see SAXDriver#resolveEntity
4342
   * @see #pushString
4343
   * @see #sourceType
4344
   * @see #pushInput
4345
   * @see #detectEncoding
4346
   * @see #sourceType
4347
   * @see #readBuffer
4348
   */
4349
  private void pushURL(boolean isPE,
4350
                       String ename,
4351
                       ExternalIdentifiers ids,
4352
                       Reader reader,
4353
                       InputStream stream,
4354
                       String encoding,
4355
                       boolean doResolve)
4356
    throws SAXException, IOException
4357
  {
4358
    boolean ignoreEncoding;
4359
    String systemId;
4360
    InputSource source;
4361
 
4362
    if (!isPE)
4363
      {
4364
        dataBufferFlush();
4365
      }
4366
 
4367
    scratch.setPublicId(ids.publicId);
4368
    scratch.setSystemId(ids.systemId);
4369
 
4370
    // See if we should skip or substitute the entity.
4371
    // If we're not skipping, resolving reports startEntity()
4372
    // and updates the (handler's) stack of URIs.
4373
    if (doResolve)
4374
      {
4375
        // assert (stream == null && reader == null && encoding == null)
4376
        source = handler.resolveEntity(isPE, ename, scratch, ids.baseUri);
4377
        if (source == null)
4378
          {
4379
            handler.warn("skipping entity: " + ename);
4380
            handler.skippedEntity(ename);
4381
            if (isPE)
4382
              {
4383
                skippedPE = true;
4384
              }
4385
            return;
4386
          }
4387
 
4388
        // we might be using alternate IDs/encoding
4389
        systemId = source.getSystemId();
4390
        // The following warning and setting systemId was deleted bcause
4391
        // the application has the option of not setting systemId
4392
        // provided that it has set the characte/byte stream.
4393
        /*
4394
           if (systemId == null) {
4395
           handler.warn ("missing system ID, using " + ids.systemId);
4396
           systemId = ids.systemId;
4397
           }
4398
         */
4399
      }
4400
    else
4401
      {
4402
        // "[document]", or "[dtd]" via getExternalSubset()
4403
        scratch.setCharacterStream(reader);
4404
        scratch.setByteStream(stream);
4405
        scratch.setEncoding(encoding);
4406
        source = scratch;
4407
        systemId = ids.systemId;
4408
        if (handler.stringInterning)
4409
          {
4410
            handler.startExternalEntity(ename, systemId,
4411
                                        "[document]" == ename);
4412
          }
4413
        else
4414
          {
4415
            handler.startExternalEntity(ename, systemId,
4416
                                        "[document]".equals(ename));
4417
          }
4418
      }
4419
 
4420
    // we may have been given I/O streams directly
4421
    if (source.getCharacterStream() != null)
4422
      {
4423
        if (source.getByteStream() != null)
4424
          error("InputSource has two streams!");
4425
        reader = source.getCharacterStream();
4426
      }
4427
    else if (source.getByteStream() != null)
4428
      {
4429
        encoding = source.getEncoding();
4430
        if (encoding == null)
4431
          {
4432
            stream = source.getByteStream();
4433
          }
4434
        else
4435
          {
4436
            try
4437
              {
4438
                reader = new InputStreamReader(source.getByteStream(),
4439
                                               encoding);
4440
              }
4441
            catch (IOException e)
4442
              {
4443
                stream = source.getByteStream();
4444
              }
4445
          }
4446
      }
4447
    else if (systemId == null)
4448
      {
4449
        error("InputSource has no URI!");
4450
      }
4451
    scratch.setCharacterStream(null);
4452
    scratch.setByteStream(null);
4453
    scratch.setEncoding(null);
4454
 
4455
    // Push the existing status.
4456
    pushInput(ename);
4457
 
4458
    // Create a new read buffer.
4459
    // (Note the four-character margin)
4460
    readBuffer = new char[READ_BUFFER_MAX + 4];
4461
    readBufferPos = 0;
4462
    readBufferLength = 0;
4463
    readBufferOverflow = -1;
4464
    is = null;
4465
    line = 1;
4466
    column = 0;
4467
    currentByteCount = 0;
4468
 
4469
    // If there's an explicit character stream, just
4470
    // ignore encoding declarations.
4471
    if (reader != null)
4472
      {
4473
        sourceType = INPUT_READER;
4474
        this.reader = reader;
4475
        tryEncodingDecl(true);
4476
        return;
4477
      }
4478
 
4479
    // Else we handle the conversion, and need to ensure
4480
    // it's done right.
4481
    sourceType = INPUT_STREAM;
4482
    if (stream != null)
4483
      {
4484
        is = stream;
4485
      }
4486
    else
4487
      {
4488
        // We have to open our own stream to the URL.
4489
        URL url = new URL(systemId);
4490
 
4491
        externalEntity = url.openConnection();
4492
        externalEntity.connect();
4493
        is = externalEntity.getInputStream();
4494
      }
4495
 
4496
    // If we get to here, there must be
4497
    // an InputStream available.
4498
    if (!is.markSupported())
4499
      {
4500
        is = new BufferedInputStream(is);
4501
      }
4502
 
4503
    // Get any external encoding label.
4504
    if (encoding == null && externalEntity != null)
4505
      {
4506
        // External labels can be untrustworthy; filesystems in
4507
        // particular often have the wrong default for content
4508
        // that wasn't locally originated.  Those we autodetect.
4509
        if (!"file".equals(externalEntity.getURL().getProtocol()))
4510
          {
4511
            int temp;
4512
 
4513
            // application/xml;charset=something;otherAttr=...
4514
            // ... with many variants on 'something'
4515
            encoding = externalEntity.getContentType();
4516
 
4517
            // MHK code (fix for Saxon 5.5.1/007):
4518
            // protect against encoding==null
4519
            if (encoding == null)
4520
              {
4521
                temp = -1;
4522
              }
4523
            else
4524
              {
4525
                temp = encoding.indexOf("charset");
4526
              }
4527
 
4528
            // RFC 2376 sez MIME text defaults to ASCII, but since the
4529
            // JDK will create a MIME type out of thin air, we always
4530
            // autodetect when there's no explicit charset attribute.
4531
            if (temp < 0)
4532
              {
4533
                encoding = null;  // autodetect
4534
              }
4535
            else
4536
              {
4537
                // only this one attribute
4538
                if ((temp = encoding.indexOf(';')) > 0)
4539
                  {
4540
                    encoding = encoding.substring(0, temp);
4541
                  }
4542
 
4543
                if ((temp = encoding.indexOf('=', temp + 7)) > 0)
4544
                  {
4545
                    encoding = encoding.substring(temp + 1);
4546
 
4547
                    // attributes can have comment fields (RFC 822)
4548
                    if ((temp = encoding.indexOf('(')) > 0)
4549
                      {
4550
                        encoding = encoding.substring(0, temp);
4551
                      }
4552
                    // ... and values may be quoted
4553
                    if ((temp = encoding.indexOf('"')) > 0)
4554
                      {
4555
                        encoding =
4556
                          encoding.substring(temp + 1,
4557
                                             encoding.indexOf('"', temp + 2));
4558
                      }
4559
                    encoding = encoding.trim();
4560
                  }
4561
                else
4562
                  {
4563
                    handler.warn("ignoring illegal MIME attribute: "
4564
                                 + encoding);
4565
                    encoding = null;
4566
                  }
4567
              }
4568
          }
4569
      }
4570
 
4571
    // if we got an external encoding label, use it ...
4572
    if (encoding != null)
4573
      {
4574
        this.encoding = ENCODING_EXTERNAL;
4575
        setupDecoding(encoding);
4576
        ignoreEncoding = true;
4577
 
4578
        // ... else autodetect from first bytes.
4579
      }
4580
    else
4581
      {
4582
        detectEncoding();
4583
        ignoreEncoding = false;
4584
      }
4585
 
4586
    // Read any XML or text declaration.
4587
    // If we autodetected, it may tell us the "real" encoding.
4588
    try
4589
      {
4590
        tryEncodingDecl(ignoreEncoding);
4591
      }
4592
    catch (UnsupportedEncodingException x)
4593
      {
4594
        encoding = x.getMessage();
4595
 
4596
        // if we don't handle the declared encoding,
4597
        // try letting a JVM InputStreamReader do it
4598
        try
4599
          {
4600
            if (sourceType != INPUT_STREAM)
4601
              {
4602
                throw x;
4603
              }
4604
 
4605
            is.reset();
4606
            readBufferPos = 0;
4607
            readBufferLength = 0;
4608
            readBufferOverflow = -1;
4609
            line = 1;
4610
            currentByteCount = column = 0;
4611
 
4612
            sourceType = INPUT_READER;
4613
            this.reader = new InputStreamReader(is, encoding);
4614
            is = null;
4615
 
4616
            tryEncodingDecl(true);
4617
 
4618
          }
4619
        catch (IOException e)
4620
          {
4621
            error("unsupported text encoding",
4622
                  encoding,
4623
                  null);
4624
          }
4625
      }
4626
  }
4627
 
4628
  /**
4629
   * Check for an encoding declaration.  This is the second part of the
4630
   * XML encoding autodetection algorithm, relying on detectEncoding to
4631
   * get to the point that this part can read any encoding declaration
4632
   * in the document (using only US-ASCII characters).
4633
   *
4634
   * <p> Because this part starts to fill parser buffers with this data,
4635
   * it's tricky to setup a reader so that Java's built-in decoders can be
4636
   * used for the character encodings that aren't built in to this parser
4637
   * (such as EUC-JP, KOI8-R, Big5, etc).
4638
   *
4639
   * @return any encoding in the declaration, uppercased; or null
4640
   * @see detectEncoding
4641
   */
4642
  private String tryEncodingDecl(boolean ignoreEncoding)
4643
    throws SAXException, IOException
4644
  {
4645
    // Read the XML/text declaration.
4646
    if (tryRead("<?xml"))
4647
      {
4648
        if (tryWhitespace())
4649
          {
4650
            if (inputStack.size() > 0)
4651
              {
4652
                return parseTextDecl(ignoreEncoding);
4653
              }
4654
            else
4655
              {
4656
                return parseXMLDecl(ignoreEncoding);
4657
              }
4658
          }
4659
        else
4660
          {
4661
            // <?xml-stylesheet ...?> or similar
4662
            unread('l');
4663
            unread('m');
4664
            unread('x');
4665
            unread('?');
4666
            unread('<');
4667
          }
4668
      }
4669
    return null;
4670
  }
4671
 
4672
  /**
4673
   * Attempt to detect the encoding of an entity.
4674
   * <p>The trick here (as suggested in the XML standard) is that
4675
   * any entity not in UTF-8, or in UCS-2 with a byte-order mark,
4676
   * <b>must</b> begin with an XML declaration or an encoding
4677
   * declaration; we simply have to look for "&lt;?xml" in various
4678
   * encodings.
4679
   * <p>This method has no way to distinguish among 8-bit encodings.
4680
   * Instead, it sets up for UTF-8, then (possibly) revises its assumption
4681
   * later in setupDecoding ().  Any ASCII-derived 8-bit encoding
4682
   * should work, but most will be rejected later by setupDecoding ().
4683
   * @see #tryEncoding (byte[], byte, byte, byte, byte)
4684
   * @see #tryEncoding (byte[], byte, byte)
4685
   * @see #setupDecoding
4686
   */
4687
  private void detectEncoding()
4688
    throws SAXException, IOException
4689
  {
4690
    byte[] signature = new byte[4];
4691
 
4692
    // Read the first four bytes for
4693
    // autodetection.
4694
    is.mark(4);
4695
    is.read(signature);
4696
    is.reset();
4697
 
4698
    //
4699
    // FIRST:  four byte encodings (who uses these?)
4700
    //
4701
    if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4702
                    (byte) 0x00, (byte) 0x3c))
4703
      {
4704
        // UCS-4 must begin with "<?xml"
4705
        // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
4706
        // "UTF-32BE"
4707
        encoding = ENCODING_UCS_4_1234;
4708
      }
4709
    else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4710
                         (byte) 0x00, (byte) 0x00))
4711
      {
4712
        // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
4713
        // "UTF-32LE"
4714
        encoding = ENCODING_UCS_4_4321;
4715
      }
4716
    else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4717
                         (byte) 0x3c, (byte) 0x00))
4718
      {
4719
        // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
4720
        encoding = ENCODING_UCS_4_2143;
4721
      }
4722
    else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4723
                         (byte) 0x00, (byte) 0x00))
4724
      {
4725
        // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
4726
        encoding = ENCODING_UCS_4_3412;
4727
 
4728
        // 00 00 fe ff UCS_4_1234 (with BOM)
4729
        // ff fe 00 00 UCS_4_4321 (with BOM)
4730
      }
4731
 
4732
    //
4733
    // SECOND:  two byte encodings
4734
    // note ... with 1/14/2000 errata the XML spec identifies some
4735
    // more "broken UTF-16" autodetection cases, with no XML decl,
4736
    // which we don't handle here (that's legal too).
4737
    //
4738
    else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff))
4739
      {
4740
        // UCS-2 with a byte-order marker. (UTF-16)
4741
        // 0xfe 0xff: UCS-2, big-endian (12)
4742
        encoding = ENCODING_UCS_2_12;
4743
        is.read(); is.read();
4744
      }
4745
    else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe))
4746
      {
4747
        // UCS-2 with a byte-order marker. (UTF-16)
4748
        // 0xff 0xfe: UCS-2, little-endian (21)
4749
        encoding = ENCODING_UCS_2_21;
4750
        is.read(); is.read();
4751
      }
4752
    else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4753
                         (byte) 0x00, (byte) 0x3f))
4754
      {
4755
        // UTF-16BE (otherwise, malformed UTF-16)
4756
        // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
4757
        encoding = ENCODING_UCS_2_12;
4758
        error("no byte-order mark for UCS-2 entity");
4759
      }
4760
    else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4761
                         (byte) 0x3f, (byte) 0x00))
4762
      {
4763
        // UTF-16LE (otherwise, malformed UTF-16)
4764
        // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
4765
        encoding = ENCODING_UCS_2_21;
4766
        error("no byte-order mark for UCS-2 entity");
4767
      }
4768
 
4769
    //
4770
    // THIRD:  ASCII-derived encodings, fixed and variable lengths
4771
    //
4772
    else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f,
4773
                         (byte) 0x78, (byte) 0x6d))
4774
      {
4775
        // ASCII derived
4776
        // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
4777
        encoding = ENCODING_UTF_8;
4778
        prefetchASCIIEncodingDecl();
4779
      }
4780
    else if (signature[0] == (byte) 0xef
4781
             && signature[1] == (byte) 0xbb
4782
             && signature[2] == (byte) 0xbf)
4783
      {
4784
        // 0xef 0xbb 0xbf: UTF-8 BOM (not part of document text)
4785
        // this un-needed notion slipped into XML 2nd ed through a
4786
        // "non-normative" erratum; now required by MSFT and UDDI,
4787
        // and E22 made it normative.
4788
        encoding = ENCODING_UTF_8;
4789
        is.read(); is.read(); is.read();
4790
      }
4791
    else
4792
      {
4793
        // 4c 6f a7 94 ... we don't understand EBCDIC flavors
4794
        // ... but we COULD at least kick in some fixed code page
4795
 
4796
        // (default) UTF-8 without encoding/XML declaration
4797
        encoding = ENCODING_UTF_8;
4798
      }
4799
  }
4800
 
4801
  /**
4802
   * Check for a four-byte signature.
4803
   * <p>Utility routine for detectEncoding ().
4804
   * <p>Always looks for some part of "<?XML" in a specific encoding.
4805
   * @param sig The first four bytes read.
4806
   * @param b1 The first byte of the signature
4807
   * @param b2 The second byte of the signature
4808
   * @param b3 The third byte of the signature
4809
   * @param b4 The fourth byte of the signature
4810
   * @see #detectEncoding
4811
   */
4812
  private static boolean tryEncoding(byte[] sig, byte b1, byte b2,
4813
                                     byte b3, byte b4)
4814
  {
4815
    return (sig[0] == b1 && sig[1] == b2
4816
            && sig[2] == b3 && sig[3] == b4);
4817
  }
4818
 
4819
  /**
4820
   * Check for a two-byte signature.
4821
   * <p>Looks for a UCS-2 byte-order mark.
4822
   * <p>Utility routine for detectEncoding ().
4823
   * @param sig The first four bytes read.
4824
   * @param b1 The first byte of the signature
4825
   * @param b2 The second byte of the signature
4826
   * @see #detectEncoding
4827
   */
4828
  private static boolean tryEncoding(byte[] sig, byte b1, byte b2)
4829
  {
4830
    return ((sig[0] == b1) && (sig[1] == b2));
4831
  }
4832
 
4833
  /**
4834
   * This method pushes a string back onto input.
4835
   * <p>It is useful either as the expansion of an internal entity,
4836
   * or for backtracking during the parse.
4837
   * <p>Call pushCharArray () to do the actual work.
4838
   * @param s The string to push back onto input.
4839
   * @see #pushCharArray
4840
   */
4841
  private void pushString(String ename, String s)
4842
    throws SAXException
4843
  {
4844
    char[] ch = s.toCharArray();
4845
    pushCharArray(ename, ch, 0, ch.length);
4846
  }
4847
 
4848
  /**
4849
   * Push a new internal input source.
4850
   * <p>This method is useful for expanding an internal entity,
4851
   * or for unreading a string of characters.  It creates a new
4852
   * readBuffer containing the characters in the array, instead
4853
   * of characters converted from an input byte stream.
4854
   * @param ch The char array to push.
4855
   * @see #pushString
4856
   * @see #pushURL
4857
   * @see #readBuffer
4858
   * @see #sourceType
4859
   * @see #pushInput
4860
   */
4861
  private void pushCharArray(String ename, char[] ch, int start, int length)
4862
    throws SAXException
4863
  {
4864
    // Push the existing status
4865
    pushInput(ename);
4866
    if (ename != null && doReport)
4867
      {
4868
        dataBufferFlush();
4869
        handler.startInternalEntity(ename);
4870
      }
4871
    sourceType = INPUT_INTERNAL;
4872
    readBuffer = ch;
4873
    readBufferPos = start;
4874
    readBufferLength = length;
4875
    readBufferOverflow = -1;
4876
  }
4877
 
4878
  /**
4879
   * Save the current input source onto the stack.
4880
   * <p>This method saves all of the global variables associated with
4881
   * the current input source, so that they can be restored when a new
4882
   * input source has finished.  It also tests for entity recursion.
4883
   * <p>The method saves the following global variables onto a stack
4884
   * using a fixed-length array:
4885
   * <ol>
4886
   * <li>sourceType
4887
   * <li>externalEntity
4888
   * <li>readBuffer
4889
   * <li>readBufferPos
4890
   * <li>readBufferLength
4891
   * <li>line
4892
   * <li>encoding
4893
   * </ol>
4894
   * @param ename The name of the entity (if any) causing the new input.
4895
   * @see #popInput
4896
   * @see #sourceType
4897
   * @see #externalEntity
4898
   * @see #readBuffer
4899
   * @see #readBufferPos
4900
   * @see #readBufferLength
4901
   * @see #line
4902
   * @see #encoding
4903
   */
4904
  private void pushInput(String ename)
4905
    throws SAXException
4906
  {
4907
    // Check for entity recursion.
4908
    if (ename != null)
4909
      {
4910
        Iterator entities = entityStack.iterator();
4911
        while (entities.hasNext())
4912
          {
4913
            String e = (String) entities.next();
4914
            if (e != null && e == ename)
4915
              {
4916
                error("recursive reference to entity", ename, null);
4917
              }
4918
          }
4919
      }
4920
    entityStack.addLast(ename);
4921
 
4922
    // Don't bother if there is no current input.
4923
    if (sourceType == INPUT_NONE)
4924
      {
4925
        return;
4926
      }
4927
 
4928
    // Set up a snapshot of the current
4929
    // input source.
4930
    Input input = new Input();
4931
 
4932
    input.sourceType = sourceType;
4933
    input.externalEntity = externalEntity;
4934
    input.readBuffer = readBuffer;
4935
    input.readBufferPos = readBufferPos;
4936
    input.readBufferLength = readBufferLength;
4937
    input.line = line;
4938
    input.encoding = encoding;
4939
    input.readBufferOverflow = readBufferOverflow;
4940
    input.is = is;
4941
    input.currentByteCount = currentByteCount;
4942
    input.column = column;
4943
    input.reader = reader;
4944
 
4945
    // Push it onto the stack.
4946
    inputStack.addLast(input);
4947
  }
4948
 
4949
  /**
4950
   * Restore a previous input source.
4951
   * <p>This method restores all of the global variables associated with
4952
   * the current input source.
4953
   * @exception java.io.EOFException
4954
   *    If there are no more entries on the input stack.
4955
   * @see #pushInput
4956
   * @see #sourceType
4957
   * @see #externalEntity
4958
   * @see #readBuffer
4959
   * @see #readBufferPos
4960
   * @see #readBufferLength
4961
   * @see #line
4962
   * @see #encoding
4963
   */
4964
  private void popInput()
4965
    throws SAXException, IOException
4966
  {
4967
    String ename = (String) entityStack.removeLast();
4968
 
4969
    if (ename != null && doReport)
4970
      {
4971
        dataBufferFlush();
4972
      }
4973
    switch (sourceType)
4974
      {
4975
      case INPUT_STREAM:
4976
        handler.endExternalEntity(ename);
4977
        is.close();
4978
        break;
4979
      case INPUT_READER:
4980
        handler.endExternalEntity(ename);
4981
        reader.close();
4982
        break;
4983
      case INPUT_INTERNAL:
4984
        if (ename != null && doReport)
4985
          {
4986
            handler.endInternalEntity(ename);
4987
          }
4988
        break;
4989
      }
4990
 
4991
    // Throw an EOFException if there
4992
    // is nothing else to pop.
4993
    if (inputStack.isEmpty())
4994
      {
4995
        throw new EOFException("no more input");
4996
      }
4997
 
4998
    Input input = (Input) inputStack.removeLast();
4999
 
5000
    sourceType = input.sourceType;
5001
    externalEntity = input.externalEntity;
5002
    readBuffer = input.readBuffer;
5003
    readBufferPos = input.readBufferPos;
5004
    readBufferLength = input.readBufferLength;
5005
    line = input.line;
5006
    encoding = input.encoding;
5007
    readBufferOverflow = input.readBufferOverflow;
5008
    is = input.is;
5009
    currentByteCount = input.currentByteCount;
5010
    column = input.column;
5011
    reader = input.reader;
5012
  }
5013
 
5014
  /**
5015
   * Return true if we can read the expected character.
5016
   * <p>Note that the character will be removed from the input stream
5017
   * on success, but will be put back on failure.  Do not attempt to
5018
   * read the character again if the method succeeds.
5019
   * @param delim The character that should appear next.  For a
5020
   *        insensitive match, you must supply this in upper-case.
5021
   * @return true if the character was successfully read, or false if
5022
   *   it was not.
5023
   * @see #tryRead (String)
5024
   */
5025
  private boolean tryRead(char delim)
5026
    throws SAXException, IOException
5027
  {
5028
    char c;
5029
 
5030
    // Read the character
5031
    c = readCh();
5032
 
5033
    // Test for a match, and push the character
5034
    // back if the match fails.
5035
    if (c == delim)
5036
      {
5037
        return true;
5038
      }
5039
    else
5040
      {
5041
        unread(c);
5042
        return false;
5043
      }
5044
  }
5045
 
5046
  /**
5047
   * Return true if we can read the expected string.
5048
   * <p>This is simply a convenience method.
5049
   * <p>Note that the string will be removed from the input stream
5050
   * on success, but will be put back on failure.  Do not attempt to
5051
   * read the string again if the method succeeds.
5052
   * <p>This method will push back a character rather than an
5053
   * array whenever possible (probably the majority of cases).
5054
   * @param delim The string that should appear next.
5055
   * @return true if the string was successfully read, or false if
5056
   *   it was not.
5057
   * @see #tryRead (char)
5058
   */
5059
  private boolean tryRead(String delim)
5060
    throws SAXException, IOException
5061
  {
5062
    return tryRead(delim.toCharArray());
5063
  }
5064
 
5065
  private boolean tryRead(char[] ch)
5066
    throws SAXException, IOException
5067
  {
5068
    char c;
5069
 
5070
    // Compare the input, character-
5071
    // by character.
5072
 
5073
    for (int i = 0; i < ch.length; i++)
5074
      {
5075
        c = readCh();
5076
        if (c != ch[i])
5077
          {
5078
            unread(c);
5079
            if (i != 0)
5080
              {
5081
                unread(ch, i);
5082
              }
5083
            return false;
5084
          }
5085
      }
5086
    return true;
5087
  }
5088
 
5089
  /**
5090
   * Return true if we can read some whitespace.
5091
   * <p>This is simply a convenience method.
5092
   * <p>This method will push back a character rather than an
5093
   * array whenever possible (probably the majority of cases).
5094
   * @return true if whitespace was found.
5095
   */
5096
  private boolean tryWhitespace()
5097
    throws SAXException, IOException
5098
  {
5099
    char c;
5100
    c = readCh();
5101
    if (isWhitespace(c))
5102
      {
5103
        skipWhitespace();
5104
        return true;
5105
      }
5106
    else
5107
      {
5108
        unread(c);
5109
        return false;
5110
      }
5111
  }
5112
 
5113
  /**
5114
   * Read all data until we find the specified string.
5115
   * This is useful for scanning CDATA sections and PIs.
5116
   * <p>This is inefficient right now, since it calls tryRead ()
5117
   * for every character.
5118
   * @param delim The string delimiter
5119
   * @see #tryRead (String, boolean)
5120
   * @see #readCh
5121
   */
5122
  private void parseUntil(String delim)
5123
    throws SAXException, IOException
5124
  {
5125
    parseUntil(delim.toCharArray());
5126
  }
5127
 
5128
  private void parseUntil(char[] delim)
5129
    throws SAXException, IOException
5130
  {
5131
    char c;
5132
    int startLine = line;
5133
 
5134
    try
5135
      {
5136
        while (!tryRead(delim))
5137
          {
5138
            c = readCh();
5139
            dataBufferAppend(c);
5140
          }
5141
      }
5142
    catch (EOFException e)
5143
      {
5144
        error("end of input while looking for delimiter "
5145
              + "(started on line " + startLine
5146
              + ')', null, new String(delim));
5147
      }
5148
  }
5149
 
5150
  //////////////////////////////////////////////////////////////////////
5151
  // Low-level I/O.
5152
  //////////////////////////////////////////////////////////////////////
5153
 
5154
  /**
5155
   * Prefetch US-ASCII XML/text decl from input stream into read buffer.
5156
   * Doesn't buffer more than absolutely needed, so that when an encoding
5157
   * decl says we need to create an InputStreamReader, we can discard our
5158
   * buffer and reset().  Caller knows the first chars of the decl exist
5159
   * in the input stream.
5160
   */
5161
  private void prefetchASCIIEncodingDecl()
5162
    throws SAXException, IOException
5163
  {
5164
    int ch;
5165
    readBufferPos = readBufferLength = 0;
5166
 
5167
    is.mark(readBuffer.length);
5168
    while (true)
5169
      {
5170
        ch = is.read();
5171
        readBuffer[readBufferLength++] = (char) ch;
5172
        switch (ch)
5173
          {
5174
          case (int) '>':
5175
            return;
5176
          case -1:
5177
            error("file ends before end of XML or encoding declaration.",
5178
                  null, "?>");
5179
          }
5180
        if (readBuffer.length == readBufferLength)
5181
          {
5182
            error("unfinished XML or encoding declaration");
5183
          }
5184
      }
5185
  }
5186
 
5187
  /**
5188
   * Read a chunk of data from an external input source.
5189
   * <p>This is simply a front-end that fills the rawReadBuffer
5190
   * with bytes, then calls the appropriate encoding handler.
5191
   * @see #encoding
5192
   * @see #rawReadBuffer
5193
   * @see #readBuffer
5194
   * @see #filterCR
5195
   * @see #copyUtf8ReadBuffer
5196
   * @see #copyIso8859_1ReadBuffer
5197
   * @see #copyUcs_2ReadBuffer
5198
   * @see #copyUcs_4ReadBuffer
5199
   */
5200
  private void readDataChunk()
5201
    throws SAXException, IOException
5202
  {
5203
    int count;
5204
 
5205
    // See if we have any overflow (filterCR sets for CR at end)
5206
    if (readBufferOverflow > -1)
5207
      {
5208
        readBuffer[0] = (char) readBufferOverflow;
5209
        readBufferOverflow = -1;
5210
        readBufferPos = 1;
5211
        sawCR = true;
5212
      }
5213
    else
5214
      {
5215
        readBufferPos = 0;
5216
        sawCR = false;
5217
      }
5218
 
5219
    // input from a character stream.
5220
    if (sourceType == INPUT_READER)
5221
      {
5222
        count = reader.read(readBuffer,
5223
                            readBufferPos, READ_BUFFER_MAX - readBufferPos);
5224
        if (count < 0)
5225
          {
5226
            readBufferLength = readBufferPos;
5227
          }
5228
        else
5229
          {
5230
            readBufferLength = readBufferPos + count;
5231
          }
5232
        if (readBufferLength > 0)
5233
          {
5234
            filterCR(count >= 0);
5235
          }
5236
        sawCR = false;
5237
        return;
5238
      }
5239
 
5240
    // Read as many bytes as possible into the raw buffer.
5241
    count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX);
5242
 
5243
    // Dispatch to an encoding-specific reader method to populate
5244
    // the readBuffer.  In most parser speed profiles, these routines
5245
    // show up at the top of the CPU usage chart.
5246
    if (count > 0)
5247
      {
5248
        switch (encoding)
5249
          {
5250
            // one byte builtins
5251
          case ENCODING_ASCII:
5252
            copyIso8859_1ReadBuffer(count, (char) 0x0080);
5253
            break;
5254
          case ENCODING_UTF_8:
5255
            copyUtf8ReadBuffer(count);
5256
            break;
5257
          case ENCODING_ISO_8859_1:
5258
            copyIso8859_1ReadBuffer(count, (char) 0);
5259
            break;
5260
 
5261
            // two byte builtins
5262
          case ENCODING_UCS_2_12:
5263
            copyUcs2ReadBuffer(count, 8, 0);
5264
            break;
5265
          case ENCODING_UCS_2_21:
5266
            copyUcs2ReadBuffer(count, 0, 8);
5267
            break;
5268
 
5269
            // four byte builtins
5270
          case ENCODING_UCS_4_1234:
5271
            copyUcs4ReadBuffer(count, 24, 16, 8, 0);
5272
            break;
5273
          case ENCODING_UCS_4_4321:
5274
            copyUcs4ReadBuffer(count, 0, 8, 16, 24);
5275
            break;
5276
          case ENCODING_UCS_4_2143:
5277
            copyUcs4ReadBuffer(count, 16, 24, 0, 8);
5278
            break;
5279
          case ENCODING_UCS_4_3412:
5280
            copyUcs4ReadBuffer(count, 8, 0, 24, 16);
5281
            break;
5282
          }
5283
      }
5284
    else
5285
      {
5286
        readBufferLength = readBufferPos;
5287
      }
5288
 
5289
    readBufferPos = 0;
5290
 
5291
    // Filter out all carriage returns if we've seen any
5292
    // (including any saved from a previous read)
5293
    if (sawCR)
5294
      {
5295
        filterCR(count >= 0);
5296
        sawCR = false;
5297
 
5298
        // must actively report EOF, lest some CRs get lost.
5299
        if (readBufferLength == 0 && count >= 0)
5300
          {
5301
            readDataChunk();
5302
          }
5303
      }
5304
 
5305
    if (count > 0)
5306
      {
5307
        currentByteCount += count;
5308
      }
5309
  }
5310
 
5311
  /**
5312
   * Filter carriage returns in the read buffer.
5313
   * CRLF becomes LF; CR becomes LF.
5314
   * @param moreData true iff more data might come from the same source
5315
   * @see #readDataChunk
5316
   * @see #readBuffer
5317
   * @see #readBufferOverflow
5318
   */
5319
  private void filterCR(boolean moreData)
5320
  {
5321
    int i, j;
5322
 
5323
    readBufferOverflow = -1;
5324
 
5325
loop:
5326
    for (i = j = readBufferPos; j < readBufferLength; i++, j++)
5327
      {
5328
        switch (readBuffer[j])
5329
          {
5330
          case '\r':
5331
            if (j == readBufferLength - 1)
5332
              {
5333
                if (moreData)
5334
                  {
5335
                    readBufferOverflow = '\r';
5336
                    readBufferLength--;
5337
                  }
5338
                else   // CR at end of buffer
5339
                  {
5340
                    readBuffer[i++] = '\n';
5341
                  }
5342
                break loop;
5343
              }
5344
            else if (readBuffer[j + 1] == '\n')
5345
              {
5346
                j++;
5347
              }
5348
            readBuffer[i] = '\n';
5349
            break;
5350
 
5351
          case '\n':
5352
          default:
5353
            readBuffer[i] = readBuffer[j];
5354
            break;
5355
          }
5356
      }
5357
    readBufferLength = i;
5358
  }
5359
 
5360
  /**
5361
   * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
5362
   * <p>When readDataChunk () calls this method, the raw bytes are in
5363
   * rawReadBuffer, and the final characters will appear in
5364
   * readBuffer.
5365
   * <p>Note that as of Unicode 3.1, good practice became a requirement,
5366
   * so that each Unicode character has exactly one UTF-8 representation.
5367
   * @param count The number of bytes to convert.
5368
   * @see #readDataChunk
5369
   * @see #rawReadBuffer
5370
   * @see #readBuffer
5371
   * @see #getNextUtf8Byte
5372
   */
5373
  private void copyUtf8ReadBuffer(int count)
5374
    throws SAXException, IOException
5375
  {
5376
    int i = 0;
5377
    int j = readBufferPos;
5378
    int b1;
5379
    char c = 0;
5380
 
5381
    /*
5382
    // check once, so the runtime won't (if it's smart enough)
5383
    if (count < 0 || count > rawReadBuffer.length)
5384
    throw new ArrayIndexOutOfBoundsException (Integer.toString (count));
5385
     */
5386
 
5387
    while (i < count)
5388
      {
5389
        b1 = rawReadBuffer[i++];
5390
 
5391
        // Determine whether we are dealing
5392
        // with a one-, two-, three-, or four-
5393
        // byte sequence.
5394
        if (b1 < 0)
5395
          {
5396
            if ((b1 & 0xe0) == 0xc0)
5397
              {
5398
                // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
5399
                c = (char) (((b1 & 0x1f) << 6)
5400
                            | getNextUtf8Byte(i++, count));
5401
                if (c < 0x0080)
5402
                  {
5403
                    encodingError("Illegal two byte UTF-8 sequence",
5404
                                  c, 0);
5405
                  }
5406
 
5407
                //Sec 2.11
5408
                // [1] the two-character sequence #xD #xA
5409
                // [2] the two-character sequence #xD #x85
5410
                if ((c == 0x0085 || c == 0x000a) && sawCR)
5411
                  {
5412
                    continue;
5413
                  }
5414
 
5415
                // Sec 2.11
5416
                // [3] the single character #x85
5417
 
5418
                if (c == 0x0085 && xmlVersion == XML_11)
5419
                  {
5420
                    readBuffer[j++] = '\r';
5421
                  }
5422
              }
5423
            else if ((b1 & 0xf0) == 0xe0)
5424
              {
5425
                // 3-byte sequence:
5426
                // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
5427
                // most CJKV characters
5428
                c = (char) (((b1 & 0x0f) << 12) |
5429
                            (getNextUtf8Byte(i++, count) << 6) |
5430
                            getNextUtf8Byte(i++, count));
5431
                //sec 2.11
5432
                //[4] the single character #x2028
5433
                if (c == 0x2028 && xmlVersion == XML_11)
5434
                  {
5435
                    readBuffer[j++] = '\r';
5436
                    sawCR = true;
5437
                    continue;
5438
                  }
5439
                if (c < 0x0800 || (c >= 0xd800 && c <= 0xdfff))
5440
                  {
5441
                    encodingError("Illegal three byte UTF-8 sequence",
5442
                                  c, 0);
5443
                  }
5444
              }
5445
            else if ((b1 & 0xf8) == 0xf0)
5446
              {
5447
                // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
5448
                //     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
5449
                // (uuuuu = wwww + 1)
5450
                // "Surrogate Pairs" ... from the "Astral Planes"
5451
                // Unicode 3.1 assigned the first characters there
5452
                int iso646 = b1 & 07;
5453
                iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
5454
                iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
5455
                iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
5456
 
5457
                if (iso646 <= 0xffff)
5458
                  {
5459
                    encodingError("Illegal four byte UTF-8 sequence",
5460
                                  iso646, 0);
5461
                  }
5462
                else
5463
                  {
5464
                    if (iso646 > 0x0010ffff)
5465
                      {
5466
                        encodingError("UTF-8 value out of range for Unicode",
5467
                                      iso646, 0);
5468
                      }
5469
                    iso646 -= 0x010000;
5470
                    readBuffer[j++] = (char) (0xd800 | (iso646 >> 10));
5471
                    readBuffer[j++] = (char) (0xdc00 | (iso646 & 0x03ff));
5472
                    continue;
5473
                  }
5474
              }
5475
            else
5476
              {
5477
                // The five and six byte encodings aren't supported;
5478
                // they exceed the Unicode (and XML) range.
5479
                encodingError("unsupported five or six byte UTF-8 sequence",
5480
                              0xff & b1, i);
5481
                // NOTREACHED
5482
                c = 0;
5483
              }
5484
          }
5485
        else
5486
          {
5487
            // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
5488
            // (US-ASCII character, "common" case, one branch to here)
5489
            c = (char) b1;
5490
          }
5491
        readBuffer[j++] = c;
5492
        if (c == '\r')
5493
          {
5494
            sawCR = true;
5495
          }
5496
      }
5497
    // How many characters have we read?
5498
    readBufferLength = j;
5499
  }
5500
 
5501
  /**
5502
   * Return the next byte value in a UTF-8 sequence.
5503
   * If it is not possible to get a byte from the current
5504
   * entity, throw an exception.
5505
   * @param pos The current position in the rawReadBuffer.
5506
   * @param count The number of bytes in the rawReadBuffer
5507
   * @return The significant six bits of a non-initial byte in
5508
   *   a UTF-8 sequence.
5509
   * @exception EOFException If the sequence is incomplete.
5510
   */
5511
  private int getNextUtf8Byte(int pos, int count)
5512
    throws SAXException, IOException
5513
  {
5514
    int val;
5515
 
5516
    // Take a character from the buffer
5517
    // or from the actual input stream.
5518
    if (pos < count)
5519
      {
5520
        val = rawReadBuffer[pos];
5521
      }
5522
    else
5523
      {
5524
        val = is.read();
5525
        if (val == -1)
5526
          {
5527
            encodingError("unfinished multi-byte UTF-8 sequence at EOF",
5528
                          -1, pos);
5529
          }
5530
      }
5531
 
5532
    // Check for the correct bits at the start.
5533
    if ((val & 0xc0) != 0x80)
5534
      {
5535
        encodingError("bad continuation of multi-byte UTF-8 sequence",
5536
                      val, pos + 1);
5537
      }
5538
 
5539
    // Return the significant bits.
5540
    return (val & 0x3f);
5541
  }
5542
 
5543
  /**
5544
   * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into
5545
   * UTF-16 characters.
5546
   *
5547
   * <p>When readDataChunk () calls this method, the raw bytes are in
5548
   * rawReadBuffer, and the final characters will appear in
5549
   * readBuffer.
5550
   *
5551
   * @param count The number of bytes to convert.
5552
   * @param mask For ASCII conversion, 0x7f; else, 0xff.
5553
   * @see #readDataChunk
5554
   * @see #rawReadBuffer
5555
   * @see #readBuffer
5556
   */
5557
  private void copyIso8859_1ReadBuffer(int count, char mask)
5558
    throws IOException
5559
  {
5560
    int i, j;
5561
    for (i = 0, j = readBufferPos; i < count; i++, j++)
5562
      {
5563
        char c = (char) (rawReadBuffer[i] & 0xff);
5564
        if ((c & mask) != 0)
5565
          {
5566
            throw new CharConversionException("non-ASCII character U+"
5567
                                              + Integer.toHexString(c));
5568
          }
5569
        if (c == 0x0085 && xmlVersion == XML_11)
5570
          {
5571
            c = '\r';
5572
          }
5573
        readBuffer[j] = c;
5574
        if (c == '\r')
5575
          {
5576
            sawCR = true;
5577
          }
5578
      }
5579
    readBufferLength = j;
5580
  }
5581
 
5582
  /**
5583
   * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters
5584
   * (as used in Java string manipulation).
5585
   *
5586
   * <p>When readDataChunk () calls this method, the raw bytes are in
5587
   * rawReadBuffer, and the final characters will appear in
5588
   * readBuffer.
5589
   * @param count The number of bytes to convert.
5590
   * @param shift1 The number of bits to shift byte 1.
5591
   * @param shift2 The number of bits to shift byte 2
5592
   * @see #readDataChunk
5593
   * @see #rawReadBuffer
5594
   * @see #readBuffer
5595
   */
5596
  private void copyUcs2ReadBuffer(int count, int shift1, int shift2)
5597
    throws SAXException
5598
  {
5599
    int j = readBufferPos;
5600
 
5601
    if (count > 0 && (count % 2) != 0)
5602
      {
5603
        encodingError("odd number of bytes in UCS-2 encoding", -1, count);
5604
      }
5605
    // The loops are faster with less internal brancing; hence two
5606
    if (shift1 == 0)
5607
      {  // "UTF-16-LE"
5608
        for (int i = 0; i < count; i += 2)
5609
          {
5610
            char c = (char) (rawReadBuffer[i + 1] << 8);
5611
            c |= 0xff & rawReadBuffer[i];
5612
            readBuffer[j++] = c;
5613
            if (c == '\r')
5614
              {
5615
                sawCR = true;
5616
              }
5617
          }
5618
      }
5619
    else
5620
      {  // "UTF-16-BE"
5621
        for (int i = 0; i < count; i += 2)
5622
          {
5623
            char c = (char) (rawReadBuffer[i] << 8);
5624
            c |= 0xff & rawReadBuffer[i + 1];
5625
            readBuffer[j++] = c;
5626
            if (c == '\r')
5627
              {
5628
                sawCR = true;
5629
              }
5630
          }
5631
      }
5632
    readBufferLength = j;
5633
  }
5634
 
5635
  /**
5636
   * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
5637
   *
5638
   * <p>When readDataChunk () calls this method, the raw bytes are in
5639
   * rawReadBuffer, and the final characters will appear in
5640
   * readBuffer.
5641
   * <p>Java has Unicode chars, and this routine uses surrogate pairs
5642
   * for ISO-10646 values between 0x00010000 and 0x000fffff.  An
5643
   * exception is thrown if the ISO-10646 character has no Unicode
5644
   * representation.
5645
   *
5646
   * @param count The number of bytes to convert.
5647
   * @param shift1 The number of bits to shift byte 1.
5648
   * @param shift2 The number of bits to shift byte 2
5649
   * @param shift3 The number of bits to shift byte 2
5650
   * @param shift4 The number of bits to shift byte 2
5651
   * @see #readDataChunk
5652
   * @see #rawReadBuffer
5653
   * @see #readBuffer
5654
   */
5655
  private void copyUcs4ReadBuffer(int count, int shift1, int shift2,
5656
                                  int shift3, int shift4)
5657
    throws SAXException
5658
  {
5659
    int j = readBufferPos;
5660
 
5661
    if (count > 0 && (count % 4) != 0)
5662
      {
5663
        encodingError("number of bytes in UCS-4 encoding " +
5664
                      "not divisible by 4",
5665
                      -1, count);
5666
      }
5667
    for (int i = 0; i < count; i += 4)
5668
      {
5669
        int value = (((rawReadBuffer [i] & 0xff) << shift1) |
5670
                     ((rawReadBuffer [i + 1] & 0xff) << shift2) |
5671
                     ((rawReadBuffer [i + 2] & 0xff) << shift3) |
5672
                     ((rawReadBuffer [i + 3] & 0xff) << shift4));
5673
        if (value < 0x0000ffff)
5674
          {
5675
            readBuffer [j++] = (char) value;
5676
            if (value == (int) '\r')
5677
              {
5678
                sawCR = true;
5679
              }
5680
          }
5681
        else if (value < 0x0010ffff)
5682
          {
5683
            value -= 0x010000;
5684
            readBuffer[j++] = (char) (0xd8 | ((value >> 10) & 0x03ff));
5685
            readBuffer[j++] = (char) (0xdc | (value & 0x03ff));
5686
          }
5687
        else
5688
          {
5689
            encodingError("UCS-4 value out of range for Unicode",
5690
                          value, i);
5691
          }
5692
      }
5693
    readBufferLength = j;
5694
  }
5695
 
5696
  /**
5697
   * Report a character encoding error.
5698
   */
5699
  private void encodingError(String message, int value, int offset)
5700
    throws SAXException
5701
  {
5702
    if (value != -1)
5703
      {
5704
        message = message + " (character code: 0x" +
5705
          Integer.toHexString(value) + ')';
5706
        error(message);
5707
      }
5708
  }
5709
 
5710
  //////////////////////////////////////////////////////////////////////
5711
  // Local Variables.
5712
  //////////////////////////////////////////////////////////////////////
5713
 
5714
  /**
5715
   * Re-initialize the variables for each parse.
5716
   */
5717
  private void initializeVariables()
5718
  {
5719
    // First line
5720
    line = 1;
5721
    column = 0;
5722
 
5723
    // Set up the buffers for data and names
5724
    dataBufferPos = 0;
5725
    dataBuffer = new char[DATA_BUFFER_INITIAL];
5726
    nameBufferPos = 0;
5727
    nameBuffer = new char[NAME_BUFFER_INITIAL];
5728
 
5729
    // Set up the DTD hash tables
5730
    elementInfo = new HashMap();
5731
    entityInfo = new HashMap();
5732
    notationInfo = new HashMap();
5733
    skippedPE = false;
5734
 
5735
    // Set up the variables for the current
5736
    // element context.
5737
    currentElement = null;
5738
    currentElementContent = CONTENT_UNDECLARED;
5739
 
5740
    // Set up the input variables
5741
    sourceType = INPUT_NONE;
5742
    inputStack = new LinkedList();
5743
    entityStack = new LinkedList();
5744
    externalEntity = null;
5745
    tagAttributePos = 0;
5746
    tagAttributes = new String[100];
5747
    rawReadBuffer = new byte[READ_BUFFER_MAX];
5748
    readBufferOverflow = -1;
5749
 
5750
    scratch = new InputSource();
5751
 
5752
    inLiteral = false;
5753
    expandPE = false;
5754
    peIsError = false;
5755
 
5756
    doReport = false;
5757
 
5758
    inCDATA = false;
5759
 
5760
    symbolTable = new Object[SYMBOL_TABLE_LENGTH][];
5761
  }
5762
 
5763
  static class ExternalIdentifiers
5764
  {
5765
 
5766
    String publicId;
5767
    String systemId;
5768
    String baseUri;
5769
 
5770
    ExternalIdentifiers()
5771
    {
5772
    }
5773
 
5774
    ExternalIdentifiers(String publicId, String systemId, String baseUri)
5775
    {
5776
      this.publicId = publicId;
5777
      this.systemId = systemId;
5778
      this.baseUri = baseUri;
5779
    }
5780
 
5781
  }
5782
 
5783
  static class EntityInfo
5784
  {
5785
 
5786
    int type;
5787
    ExternalIdentifiers ids;
5788
    String value;
5789
    String notationName;
5790
 
5791
  }
5792
 
5793
  static class AttributeDecl
5794
  {
5795
 
5796
    String type;
5797
    String value;
5798
    int valueType;
5799
    String enumeration;
5800
    String defaultValue;
5801
 
5802
  }
5803
 
5804
  static class ElementDecl
5805
  {
5806
 
5807
    int contentType;
5808
    String contentModel;
5809
    HashMap attributes;
5810
 
5811
  }
5812
 
5813
  static class Input
5814
  {
5815
 
5816
    int sourceType;
5817
    URLConnection externalEntity;
5818
    char[] readBuffer;
5819
    int readBufferPos;
5820
    int readBufferLength;
5821
    int line;
5822
    int encoding;
5823
    int readBufferOverflow;
5824
    InputStream is;
5825
    int currentByteCount;
5826
    int column;
5827
    Reader reader;
5828
 
5829
  }
5830
 
5831
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.