OpenCores
URL https://opencores.org/ocsvn/scarts/scarts/trunk

Subversion Repositories scarts

[/] [scarts/] [trunk/] [toolchain/] [scarts-gcc/] [gcc-4.1.1/] [libjava/] [classpath/] [gnu/] [xml/] [aelfred2/] [XmlParser.java] - Blame information for rev 14

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 14 jlechner
/* XmlParser.java --
2
   Copyright (C) 1999,2000,2001, 2006 Free Software Foundation, Inc.
3
 
4
This file is part of GNU Classpath.
5
 
6
GNU Classpath is free software; you can redistribute it and/or modify
7
it under the terms of the GNU General Public License as published by
8
the Free Software Foundation; either version 2, or (at your option)
9
any later version.
10
 
11
GNU Classpath is distributed in the hope that it will be useful, but
12
WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
General Public License for more details.
15
 
16
You should have received a copy of the GNU General Public License
17
along with GNU Classpath; see the file COPYING.  If not, write to the
18
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
02110-1301 USA.
20
 
21
Linking this library statically or dynamically with other modules is
22
making a combined work based on this library.  Thus, the terms and
23
conditions of the GNU General Public License cover the whole
24
combination.
25
 
26
As a special exception, the copyright holders of this library give you
27
permission to link this library with independent modules to produce an
28
executable, regardless of the license terms of these independent
29
modules, and to copy and distribute the resulting executable under
30
terms of your choice, provided that you also meet, for each linked
31
independent module, the terms and conditions of the license of that
32
module.  An independent module is a module which is not derived from
33
or based on this library.  If you modify this library, you may extend
34
this exception to your version of the library, but you are not
35
obligated to do so.  If you do not wish to do so, delete this
36
exception statement from your version.
37
 
38
Partly derived from code which carried the following notice:
39
 
40
  Copyright (c) 1997, 1998 by Microstar Software Ltd.
41
 
42
  AElfred is free for both commercial and non-commercial use and
43
  redistribution, provided that Microstar's copyright and disclaimer are
44
  retained intact.  You are free to modify AElfred for your own use and
45
  to redistribute AElfred with your modifications, provided that the
46
  modifications are clearly documented.
47
 
48
  This program is distributed in the hope that it will be useful, but
49
  WITHOUT ANY WARRANTY; without even the implied warranty of
50
  merchantability or fitness for a particular purpose.  Please use it AT
51
  YOUR OWN RISK.
52
*/
53
 
54
package gnu.xml.aelfred2;
55
 
56
import gnu.java.security.action.GetPropertyAction;
57
 
58
import java.io.BufferedInputStream;
59
import java.io.CharConversionException;
60
import java.io.EOFException;
61
import java.io.InputStream;
62
import java.io.InputStreamReader;
63
import java.io.IOException;
64
import java.io.Reader;
65
import java.io.UnsupportedEncodingException;
66
import java.net.URL;
67
import java.net.URLConnection;
68
import java.security.AccessController;
69
 
70
import java.util.Iterator;
71
import java.util.HashMap;
72
import java.util.LinkedList;
73
 
74
import org.xml.sax.InputSource;
75
import org.xml.sax.SAXException;
76
 
77
 
78
/**
79
 * Parse XML documents and return parse events through call-backs.
80
 * Use the <code>SAXDriver</code> class as your entry point, as all
81
 * internal parser interfaces are subject to change.
82
 *
83
 * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
84
 *      (version 1.2a with bugfixes)
85
 * @author Updated by David Brownell &lt;dbrownell@users.sourceforge.net&gt;
86
 * @see SAXDriver
87
 */
88
final class XmlParser
89
{
90
 
91
  // avoid slow per-character readCh()
92
  private final static boolean USE_CHEATS = true;
93
 
94
  ////////////////////////////////////////////////////////////////////////
95
  // Constants.
96
  ////////////////////////////////////////////////////////////////////////
97
 
98
  //
99
  // Constants for element content type.
100
  //
101
 
102
  /**
103
   * Constant: an element has not been declared.
104
   * @see #getElementContentType
105
   */
106
  public final static int CONTENT_UNDECLARED = 0;
107
 
108
  /**
109
   * Constant: the element has a content model of ANY.
110
   * @see #getElementContentType
111
   */
112
  public final static int CONTENT_ANY = 1;
113
 
114
  /**
115
   * Constant: the element has declared content of EMPTY.
116
   * @see #getElementContentType
117
   */
118
  public final static int CONTENT_EMPTY = 2;
119
 
120
  /**
121
   * Constant: the element has mixed content.
122
   * @see #getElementContentType
123
   */
124
  public final static int CONTENT_MIXED = 3;
125
 
126
  /**
127
   * Constant: the element has element content.
128
   * @see #getElementContentType
129
   */
130
  public final static int CONTENT_ELEMENTS = 4;
131
 
132
 
133
  //
134
  // Constants for the entity type.
135
  //
136
 
137
  /**
138
   * Constant: the entity has not been declared.
139
   * @see #getEntityType
140
   */
141
  public final static int ENTITY_UNDECLARED = 0;
142
 
143
  /**
144
   * Constant: the entity is internal.
145
   * @see #getEntityType
146
   */
147
  public final static int ENTITY_INTERNAL = 1;
148
 
149
  /**
150
   * Constant: the entity is external, non-parsable data.
151
   * @see #getEntityType
152
   */
153
  public final static int ENTITY_NDATA = 2;
154
 
155
  /**
156
   * Constant: the entity is external XML data.
157
   * @see #getEntityType
158
   */
159
  public final static int ENTITY_TEXT = 3;
160
 
161
  //
162
  // Attribute type constants are interned literal strings.
163
  //
164
 
165
  //
166
  // Constants for supported encodings.  "external" is just a flag.
167
  //
168
  private final static int ENCODING_EXTERNAL = 0;
169
  private final static int ENCODING_UTF_8 = 1;
170
  private final static int ENCODING_ISO_8859_1 = 2;
171
  private final static int ENCODING_UCS_2_12 = 3;
172
  private final static int ENCODING_UCS_2_21 = 4;
173
  private final static int ENCODING_UCS_4_1234 = 5;
174
  private final static int ENCODING_UCS_4_4321 = 6;
175
  private final static int ENCODING_UCS_4_2143 = 7;
176
  private final static int ENCODING_UCS_4_3412 = 8;
177
  private final static int ENCODING_ASCII = 9;
178
 
179
  //
180
  // Constants for attribute default value.
181
  //
182
 
183
  /**
184
   * Constant: the attribute is not declared.
185
   * @see #getAttributeDefaultValueType
186
   */
187
  public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
188
 
189
  /**
190
   * Constant: the attribute has a literal default value specified.
191
   * @see #getAttributeDefaultValueType
192
   * @see #getAttributeDefaultValue
193
   */
194
  public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
195
 
196
  /**
197
   * Constant: the attribute was declared #IMPLIED.
198
   * @see #getAttributeDefaultValueType
199
   */
200
  public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
201
 
202
  /**
203
   * Constant: the attribute was declared #REQUIRED.
204
   * @see #getAttributeDefaultValueType
205
   */
206
  public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
207
 
208
  /**
209
   * Constant: the attribute was declared #FIXED.
210
   * @see #getAttributeDefaultValueType
211
   * @see #getAttributeDefaultValue
212
   */
213
  public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
214
 
215
  //
216
  // Constants for input.
217
  //
218
  private final static int INPUT_NONE = 0;
219
  private final static int INPUT_INTERNAL = 1;
220
  private final static int INPUT_STREAM = 3;
221
  private final static int INPUT_READER = 5;
222
 
223
  //
224
  // Flags for reading literals.
225
  //
226
  // expand general entity refs (attribute values in dtd and content)
227
  private final static int LIT_ENTITY_REF = 2;
228
  // normalize this value (space chars) (attributes, public ids)
229
  private final static int LIT_NORMALIZE = 4;
230
  // literal is an attribute value 
231
  private final static int LIT_ATTRIBUTE = 8;
232
  // don't expand parameter entities
233
  private final static int LIT_DISABLE_PE = 16;
234
  // don't expand [or parse] character refs
235
  private final static int LIT_DISABLE_CREF = 32;
236
  // don't parse general entity refs
237
  private final static int LIT_DISABLE_EREF = 64;
238
  // literal is a public ID value 
239
  private final static int LIT_PUBID = 256;
240
 
241
  //
242
  // Flags affecting PE handling in DTDs (if expandPE is true).
243
  // PEs expand with space padding, except inside literals.
244
  //
245
  private final static int CONTEXT_NORMAL = 0;
246
  private final static int CONTEXT_LITERAL = 1;
247
 
248
  // Emit warnings for relative URIs with no base URI.
249
  static boolean uriWarnings;
250
  static
251
  {
252
    String key = "gnu.xml.aelfred2.XmlParser.uriWarnings";
253
    GetPropertyAction a = new GetPropertyAction(key);
254
    uriWarnings = "true".equals(AccessController.doPrivileged(a));
255
  }
256
 
257
  //
258
  // The current XML handler interface.
259
  //
260
  private SAXDriver handler;
261
 
262
  //
263
  // I/O information.
264
  //
265
  private Reader reader;   // current reader
266
  private InputStream is;     // current input stream
267
  private int line;     // current line number
268
  private int column;   // current column number
269
  private int sourceType;   // type of input source
270
  private LinkedList inputStack;   // stack of input sources
271
  private URLConnection externalEntity; // current external entity
272
  private int encoding;   // current character encoding
273
  private int currentByteCount; // bytes read from current source
274
  private InputSource scratch;  // temporary
275
 
276
  //
277
  // Buffers for decoded but unparsed character input.
278
  //
279
  private char[] readBuffer;
280
  private int readBufferPos;
281
  private int readBufferLength;
282
  private int readBufferOverflow;  // overflow from last data chunk.
283
 
284
  //
285
  // Buffer for undecoded raw byte input.
286
  //
287
  private final static int READ_BUFFER_MAX = 16384;
288
  private byte[] rawReadBuffer;
289
 
290
 
291
  //
292
  // Buffer for attribute values, char refs, DTD stuff.
293
  //
294
  // Constant (presumably the initial size of dataBuffer -- the allocation
  // is not in this part of the file); final, like READ_BUFFER_MAX.
  private final static int DATA_BUFFER_INITIAL = 4096;
295
  private char[] dataBuffer;
296
  private int dataBufferPos;
297
 
298
  //
299
  // Buffer for parsed names.
300
  //
301
  // Constant (presumably the initial size of nameBuffer -- the allocation
  // is not in this part of the file); final, like READ_BUFFER_MAX.
  private final static int NAME_BUFFER_INITIAL = 1024;
302
  private char[] nameBuffer;
303
  private int nameBufferPos;
304
 
305
  //
306
  // Save any standalone flag
307
  //
308
  private boolean docIsStandalone;
309
 
310
  //
311
  // Hashtables for DTD information on elements, entities, and notations.
312
  // Populated until we start ignoring decls (because of skipping a PE)
313
  //
314
  private HashMap elementInfo;
315
  private HashMap entityInfo;
316
  private HashMap notationInfo;
317
  private boolean skippedPE;
318
 
319
  //
320
  // Element type currently in force.
321
  //
322
  private String currentElement;
323
  private int currentElementContent;
324
 
325
  //
326
  // Stack of entity names, to detect recursion.
327
  //
328
  private LinkedList entityStack;
329
 
330
  //
331
  // PE expansion is enabled in most chunks of the DTD, not all.
332
  // When it's enabled, literals are treated differently.
333
  //
334
  private boolean inLiteral;
335
  private boolean expandPE;
336
  private boolean peIsError;
337
 
338
  //
339
  // can't report entity expansion inside two constructs:
340
  // - attribute expansions (internal entities only)
341
  // - markup declarations (parameter entities only)
342
  //
343
  private boolean doReport;
344
 
345
  //
346
  // Symbol table, for caching interned names.
347
  //
348
  // These show up wherever XML names or nmtokens are used:  naming elements,
349
  // attributes, PIs, notations, entities, and enumerated attribute values.
350
  //
351
  // NOTE:  This hashtable doesn't grow.  The default size is intended to be
352
  // rather large for most documents.  Example:  one snapshot of the DocBook
353
  // XML 4.1 DTD used only about 350 such names.  As a rule, only pathological
354
  // documents (ones that don't reuse names) should ever see much collision.
355
  //
356
  // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing.
357
  // "2039" keeps the hash table size at about two memory pages on typical
358
  // 32 bit hardware.
359
  //
360
  private final static int SYMBOL_TABLE_LENGTH = 2039;
361
 
362
  private Object[][] symbolTable;
363
 
364
  //
365
  // Hash table of attributes found in current start tag.
366
  //
367
  private String[] tagAttributes;
368
  private int tagAttributePos;
369
 
370
  //
371
  // Utility flag: have we noticed a CR while reading the last
372
  // data chunk?  If so, we will have to go back and normalise
373
  // CR or CR/LF line ends.
374
  //
375
  private boolean sawCR;
376
 
377
  //
378
  // Utility flag: are we in CDATA?  If so, whitespace isn't ignorable.
379
  // 
380
  private boolean inCDATA;
381
 
382
  //
383
  // Xml version.
384
  //  
385
  private static final int XML_10 = 0;
386
  private static final int XML_11 = 1;
387
  private int xmlVersion = XML_10;
388
 
389
  //////////////////////////////////////////////////////////////////////
390
  // Constructors.
391
  ////////////////////////////////////////////////////////////////////////
392
 
393
  /**
   * Create a parser that has no handler attached yet.  A handler has to
   * be supplied through setHandler before doParse is invoked, since
   * doParse rejects a parser without one.
   * @see #setHandler
   * @see #doParse
   */
  // package private
  XmlParser()
  {
    // nothing to initialise here
  }
402
 
403
  /**
404
   * Set the handler that will receive parsing events.
405
   * @param handler The handler to receive callback events.
406
   * @see #parse
407
   */
408
  // package private
409
  void setHandler(SAXDriver handler)
410
  {
411
    this.handler = handler;
412
  }
413
 
414
  /**
415
   * Parse an XML document from the character stream, byte stream, or URI
416
   * that you provide (in that order of preference).  Any URI that you
417
   * supply will become the base URI for resolving relative URI, and may
418
   * be used to acquire a reader or byte stream.
419
   *
420
   * <p> Only one thread at a time may use this parser; since it is
421
   * private to this package, post-parse cleanup is done by the caller,
422
   * which MUST NOT REUSE the parser (just null it).
423
   *
424
   * @param systemId Absolute URI of the document; should never be null,
425
   *    but may be so iff a reader <em>or</em> a stream is provided.
426
   * @param publicId The public identifier of the document, or null.
427
   * @param reader A character stream; must be null if stream isn't.
428
   * @param stream A byte input stream; must be null if reader isn't.
429
   * @param encoding The suggested encoding, or null if unknown.
430
   * @exception java.lang.Exception Basically SAXException or IOException
431
   */
432
  // package private 
433
  void doParse(String systemId, String publicId, Reader reader,
434
               InputStream stream, String encoding)
435
    throws Exception
436
  {
437
    if (handler == null)
438
      {
439
        throw new IllegalStateException("no callback handler");
440
      }
441
 
442
    initializeVariables();
443
 
444
    // predeclare the built-in entities here (replacement texts)
445
    // we don't need to intern(), since we're guaranteed literals
446
    // are always (globally) interned.
447
    setInternalEntity("amp", "&#38;");
448
    setInternalEntity("lt", "&#60;");
449
    setInternalEntity("gt", "&#62;");
450
    setInternalEntity("apos", "&#39;");
451
    setInternalEntity("quot", "&#34;");
452
 
453
    try
454
      {
455
        // pushURL first to ensure locator is correct in startDocument
456
        // ... it might report an IO or encoding exception.
457
        handler.startDocument();
458
        pushURL(false, "[document]",
459
                // default baseURI: null
460
                new ExternalIdentifiers(publicId, systemId, null),
461
                reader, stream, encoding, false);
462
 
463
        parseDocument();
464
      }
465
    catch (EOFException e)
466
      {
467
        //empty input
468
        error("empty document, with no root element.");
469
      }
470
    finally
471
      {
472
        if (reader != null)
473
          {
474
            try
475
              {
476
                reader.close();
477
              }
478
            catch (IOException e)
479
              {
480
                /* ignore */
481
              }
482
          }
483
        if (stream != null)
484
          {
485
            try
486
              {
487
                stream.close();
488
              }
489
            catch (IOException e)
490
              {
491
                /* ignore */
492
              }
493
          }
494
        if (is != null)
495
          {
496
            try
497
              {
498
                is.close();
499
              }
500
            catch (IOException e)
501
              {
502
                /* ignore */
503
              }
504
          }
505
        scratch = null;
506
      }
507
  }
508
 
509
  //////////////////////////////////////////////////////////////////////
510
  // Error reporting.
511
  //////////////////////////////////////////////////////////////////////
512
 
513
  /**
514
   * Report an error.
515
   * @param message The error message.
516
   * @param textFound The text that caused the error (or null).
517
   * @see SAXDriver#error
518
   * @see #line
519
   */
520
  private void error(String message, String textFound, String textExpected)
521
    throws SAXException
522
  {
523
    if (textFound != null)
524
      {
525
        message = message + " (found \"" + textFound + "\")";
526
      }
527
    if (textExpected != null)
528
      {
529
        message = message + " (expected \"" + textExpected + "\")";
530
      }
531
    handler.fatal(message);
532
 
533
    // "can't happen"
534
    throw new SAXException(message);
535
  }
536
 
537
  /**
538
   * Report a serious error.
539
   * @param message The error message.
540
   * @param textFound The text that caused the error (or null).
541
   */
542
  private void error(String message, char textFound, String textExpected)
543
    throws SAXException
544
  {
545
    error(message, new Character(textFound).toString(), textExpected);
546
  }
547
 
548
  /**
549
   * Report typical case fatal errors.
550
   */
551
  private void error(String message)
552
    throws SAXException
553
  {
554
    handler.fatal(message);
555
  }
556
 
557
  //////////////////////////////////////////////////////////////////////
558
  // Major syntactic productions.
559
  //////////////////////////////////////////////////////////////////////
560
 
561
  /**
562
   * Parse an XML document.
563
   * <pre>
564
   * [1] document ::= prolog element Misc*
565
   * </pre>
566
   * <p>This is the top-level parsing function for a single XML
567
   * document.  As a minimum, a well-formed document must have
568
   * a document element, and a valid document must have a prolog
569
   * (one with doctype) as well.
570
   */
571
  private void parseDocument()
572
    throws Exception
573
  {
574
    try
575
      {                                       // added by MHK
576
        boolean sawDTD = parseProlog();
577
        require('<');
578
        parseElement(!sawDTD);
579
      }
580
    catch (EOFException ee)
581
      {                 // added by MHK
582
        error("premature end of file", "[EOF]", null);
583
      }
584
 
585
    try
586
      {
587
        parseMisc();   //skip all white, PIs, and comments
588
        char c = readCh();    //if this doesn't throw an exception...
589
        error("unexpected characters after document end", c, null);
590
      }
591
    catch (EOFException e)
592
      {
593
        return;
594
      }
595
  }
596
 
597
  static final char[] startDelimComment = { '<', '!', '-', '-' };
598
  static final char[] endDelimComment = { '-', '-' };
599
 
600
  /**
601
   * Skip a comment.
602
   * <pre>
603
   * [15] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
604
   * </pre>
605
   * <p> (The <code>&lt;!--</code> has already been read.)
606
   */
607
  private void parseComment()
608
    throws Exception
609
  {
610
    char c;
611
    boolean saved = expandPE;
612
 
613
    expandPE = false;
614
    parseUntil(endDelimComment);
615
    require('>');
616
    expandPE = saved;
617
    handler.comment(dataBuffer, 0, dataBufferPos);
618
    dataBufferPos = 0;
619
  }
620
 
621
  static final char[] startDelimPI = { '<', '?' };
622
  static final char[] endDelimPI = { '?', '>' };
623
 
624
  /**
625
   * Parse a processing instruction and do a call-back.
626
   * <pre>
627
   * [16] PI ::= '&lt;?' PITarget
628
   *    (S (Char* - (Char* '?&gt;' Char*)))?
629
   *    '?&gt;'
630
   * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
631
   * </pre>
632
   * <p> (The <code>&lt;?</code> has already been read.)
633
   */
634
  private void parsePI()
635
    throws SAXException, IOException
636
  {
637
    String name;
638
    boolean saved = expandPE;
639
 
640
    expandPE = false;
641
    name = readNmtoken(true);
642
    //NE08
643
    if (name.indexOf(':') >= 0)
644
      {
645
        error("Illegal character(':') in processing instruction name ",
646
              name, null);
647
      }
648
    if ("xml".equalsIgnoreCase(name))
649
      {
650
        error("Illegal processing instruction target", name, null);
651
      }
652
    if (!tryRead(endDelimPI))
653
      {
654
        requireWhitespace();
655
        parseUntil(endDelimPI);
656
      }
657
    expandPE = saved;
658
    handler.processingInstruction(name, dataBufferToString());
659
  }
660
 
661
  static final char[] endDelimCDATA = { ']', ']', '>' };
662
 
663
  private boolean isDirtyCurrentElement;
664
 
665
  /**
666
   * Parse a CDATA section.
667
   * <pre>
668
   * [18] CDSect ::= CDStart CData CDEnd
669
   * [19] CDStart ::= '&lt;![CDATA['
670
   * [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
671
   * [21] CDEnd ::= ']]&gt;'
672
   * </pre>
673
   * <p> (The '&lt;![CDATA[' has already been read.)
674
   */
675
  private void parseCDSect()
676
    throws Exception
677
  {
678
    parseUntil(endDelimCDATA);
679
    dataBufferFlush();
680
  }
681
 
682
  /**
683
   * Parse the prolog of an XML document.
684
   * <pre>
685
   * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
686
   * </pre>
687
   * <p>We do not look for the XML declaration here, because it was
688
   * handled by pushURL ().
689
   * @see pushURL
690
   * @return true if a DTD was read.
691
   */
692
  private boolean parseProlog()
693
    throws Exception
694
  {
695
    parseMisc();
696
 
697
    if (tryRead("<!DOCTYPE"))
698
      {
699
        parseDoctypedecl();
700
        parseMisc();
701
        return true;
702
      }
703
    return false;
704
  }
705
 
706
  private void checkLegalVersion(String version)
707
    throws SAXException
708
  {
709
    int len = version.length();
710
    for (int i = 0; i < len; i++)
711
      {
712
        char c = version.charAt(i);
713
        if ('0' <= c && c <= '9')
714
          {
715
            continue;
716
          }
717
        if (c == '_' || c == '.' || c == ':' || c == '-')
718
          {
719
            continue;
720
          }
721
        if ('a' <= c && c <= 'z')
722
          {
723
            continue;
724
          }
725
        if ('A' <= c && c <= 'Z')
726
          {
727
            continue;
728
          }
729
        error ("illegal character in version", version, "1.0");
730
      }
731
  }
732
 
733
  /**
734
   * Parse the XML declaration.
735
   * <pre>
736
   * [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
737
   * [24] VersionInfo ::= S 'version' Eq
738
   *    ("'" VersionNum "'" | '"' VersionNum '"' )
739
   * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
740
   * [32] SDDecl ::= S 'standalone' Eq
741
   *    ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' )
742
   * [80] EncodingDecl ::= S 'encoding' Eq
743
   *    ( "'" EncName "'" | "'" EncName "'" )
744
   * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
745
   * </pre>
746
   * <p> (The <code>&lt;?xml</code> and whitespace have already been read.)
747
   * @return the encoding in the declaration, uppercased; or null
748
   * @see #parseTextDecl
749
   * @see #setupDecoding
750
   */
751
  private String parseXMLDecl(boolean ignoreEncoding)
752
    throws SAXException, IOException
753
  {
754
    String version;
755
    String encodingName = null;
756
    String standalone = null;
757
    int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
758
    String inputEncoding = null;
759
 
760
    switch (this.encoding)
761
      {
762
      case ENCODING_EXTERNAL:
763
      case ENCODING_UTF_8:
764
        inputEncoding = "UTF-8";
765
        break;
766
      case ENCODING_ISO_8859_1:
767
        inputEncoding = "ISO-8859-1";
768
        break;
769
      case ENCODING_UCS_2_12:
770
        inputEncoding = "UTF-16BE";
771
        break;
772
      case ENCODING_UCS_2_21:
773
        inputEncoding = "UTF-16LE";
774
        break;
775
      }
776
 
777
    // Read the version.
778
    require("version");
779
    parseEq();
780
    checkLegalVersion(version = readLiteral(flags));
781
    if (!version.equals("1.0"))
782
      {
783
        if (version.equals("1.1"))
784
          {
785
            handler.warn("expected XML version 1.0, not: " + version);
786
            xmlVersion = XML_11;
787
          }
788
        else
789
          {
790
            error("illegal XML version", version, "1.0 or 1.1");
791
          }
792
      }
793
    else
794
      {
795
        xmlVersion = XML_10;
796
      }
797
    // Try reading an encoding declaration.
798
    boolean white = tryWhitespace();
799
 
800
    if (tryRead("encoding"))
801
      {
802
        if (!white)
803
          {
804
            error("whitespace required before 'encoding='");
805
          }
806
        parseEq();
807
        encodingName = readLiteral(flags);
808
        if (!ignoreEncoding)
809
          {
810
            setupDecoding(encodingName);
811
          }
812
      }
813
 
814
    // Try reading a standalone declaration
815
    if (encodingName != null)
816
      {
817
        white = tryWhitespace();
818
      }
819
    if (tryRead("standalone"))
820
      {
821
        if (!white)
822
          {
823
            error("whitespace required before 'standalone='");
824
          }
825
        parseEq();
826
        standalone = readLiteral(flags);
827
        if ("yes".equals(standalone))
828
          {
829
            docIsStandalone = true;
830
          }
831
        else if (!"no".equals(standalone))
832
          {
833
            error("standalone flag must be 'yes' or 'no'");
834
          }
835
      }
836
 
837
    skipWhitespace();
838
    require("?>");
839
 
840
    if (inputEncoding == null)
841
      {
842
        inputEncoding = encodingName;
843
      }
844
    handler.xmlDecl(version, encodingName, docIsStandalone,
845
                    inputEncoding);
846
 
847
    return encodingName;
848
  }
849
 
850
  /**
851
   * Parse a text declaration.
852
   * <pre>
853
   * [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'
854
   * [80] EncodingDecl ::= S 'encoding' Eq
855
   *    ( '"' EncName '"' | "'" EncName "'" )
856
   * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
857
   * </pre>
858
   * <p> (The <code>&lt;?xml</code>' and whitespace have already been read.)
859
   * @return the encoding in the declaration, uppercased; or null
860
   * @see #parseXMLDecl
861
   * @see #setupDecoding
862
   */
863
  private String parseTextDecl(boolean ignoreEncoding)
864
    throws SAXException, IOException
865
  {
866
    String encodingName = null;
867
    int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
868
 
869
    // Read an optional version.
870
    if (tryRead ("version"))
871
      {
872
        String version;
873
        parseEq();
874
        checkLegalVersion(version = readLiteral(flags));
875
 
876
        if (version.equals("1.1"))
877
          {
878
            if (xmlVersion == XML_10)
879
              {
880
                error("external subset has later version number.", "1.0",
881
                      version);
882
              }
883
            handler.warn("expected XML version 1.0, not: " + version);
884
            xmlVersion = XML_11;
885
          }
886
        else if (!version.equals("1.0"))
887
          {
888
            error("illegal XML version", version, "1.0 or 1.1");
889
          }
890
        requireWhitespace();
891
      }
892
 
893
    // Read the encoding.
894
    require("encoding");
895
    parseEq();
896
    encodingName = readLiteral(flags);
897
    if (!ignoreEncoding)
898
      {
899
        setupDecoding(encodingName);
900
      }
901
    skipWhitespace();
902
    require("?>");
903
 
904
    return encodingName;
905
  }
906
 
907
  /**
908
   * Sets up internal state so that we can decode an entity using the
909
   * specified encoding.  This is used when we start to read an entity
910
   * and we have been given knowledge of its encoding before we start to
911
   * read any data (e.g. from a SAX input source or from a MIME type).
912
   *
913
   * <p> It is also used after autodetection, at which point only very
914
   * limited adjustments to the encoding may be used (switching between
915
   * related builtin decoders).
916
   *
917
   * @param encodingName The name of the encoding specified by the user.
918
   * @exception IOException if the encoding isn't supported either
919
   *  internally to this parser, or by the hosting JVM.
920
   * @see #parseXMLDecl
921
   * @see #parseTextDecl
922
     */
923
  private void setupDecoding(String encodingName)
924
    throws SAXException, IOException
925
  {
926
    encodingName = encodingName.toUpperCase();
927
 
928
    // ENCODING_EXTERNAL indicates an encoding that wasn't
929
    // autodetected ... we can use builtin decoders, or
930
    // ones from the JVM (InputStreamReader).
931
 
932
    // Otherwise we can only tweak what was autodetected, and
933
    // only for single byte (ASCII derived) builtin encodings.
934
 
935
    // ASCII-derived encodings
936
    if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL)
937
      {
938
        if (encodingName.equals("ISO-8859-1")
939
            || encodingName.equals("8859_1")
940
            || encodingName.equals("ISO8859_1"))
941
          {
942
            encoding = ENCODING_ISO_8859_1;
943
            return;
944
          }
945
        else if (encodingName.equals("US-ASCII")
946
                 || encodingName.equals("ASCII"))
947
          {
948
            encoding = ENCODING_ASCII;
949
            return;
950
          }
951
        else if (encodingName.equals("UTF-8")
952
                 || encodingName.equals("UTF8"))
953
          {
954
            encoding = ENCODING_UTF_8;
955
            return;
956
          }
957
        else if (encoding != ENCODING_EXTERNAL)
958
          {
959
            // used to start with a new reader ...
960
            throw new UnsupportedEncodingException(encodingName);
961
          }
962
        // else fallthrough ...
963
        // it's ASCII-ish and something other than a builtin
964
      }
965
 
966
    // Unicode and such
967
    if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21)
968
      {
969
        if (!(encodingName.equals("ISO-10646-UCS-2")
970
              || encodingName.equals("UTF-16")
971
              || encodingName.equals("UTF-16BE")
972
              || encodingName.equals("UTF-16LE")))
973
          {
974
            error("unsupported Unicode encoding", encodingName, "UTF-16");
975
          }
976
        return;
977
      }
978
 
979
    // four byte encodings
980
    if (encoding == ENCODING_UCS_4_1234
981
        || encoding == ENCODING_UCS_4_4321
982
        || encoding == ENCODING_UCS_4_2143
983
        || encoding == ENCODING_UCS_4_3412)
984
      {
985
        // Strictly:  "UCS-4" == "UTF-32BE"; also, "UTF-32LE" exists
986
        if (!encodingName.equals("ISO-10646-UCS-4"))
987
          {
988
            error("unsupported 32-bit encoding", encodingName,
989
                  "ISO-10646-UCS-4");
990
          }
991
        return;
992
      }
993
 
994
    // assert encoding == ENCODING_EXTERNAL
995
    // if (encoding != ENCODING_EXTERNAL)
996
    //     throw new RuntimeException ("encoding = " + encoding);
997
 
998
    if (encodingName.equals("UTF-16BE"))
999
      {
1000
        encoding = ENCODING_UCS_2_12;
1001
        return;
1002
      }
1003
    if (encodingName.equals("UTF-16LE"))
1004
      {
1005
        encoding = ENCODING_UCS_2_21;
1006
        return;
1007
      }
1008
 
1009
    // We couldn't use the builtin decoders at all.  But we can try to
1010
    // create a reader, since we haven't messed up buffering.  Tweak
1011
    // the encoding name if necessary.
1012
 
1013
    if (encodingName.equals("UTF-16")
1014
        || encodingName.equals("ISO-10646-UCS-2"))
1015
      {
1016
        encodingName = "Unicode";
1017
      }
1018
    // Ignoring all the EBCDIC aliases here
1019
 
1020
    reader = new InputStreamReader(is, encodingName);
1021
    sourceType = INPUT_READER;
1022
  }
1023
 
1024
  /**
1025
   * Parse miscellaneous markup outside the document element and DOCTYPE
1026
   * declaration.
1027
   * <pre>
1028
   * [27] Misc ::= Comment | PI | S
1029
   * </pre>
1030
   */
1031
  private void parseMisc()
1032
    throws Exception
1033
  {
1034
    while (true)
1035
      {
1036
        skipWhitespace();
1037
        if (tryRead(startDelimPI))
1038
          {
1039
            parsePI();
1040
          }
1041
        else if (tryRead(startDelimComment))
1042
          {
1043
            parseComment();
1044
          }
1045
        else
1046
          {
1047
            return;
1048
          }
1049
      }
1050
  }
1051
 
1052
  /**
1053
   * Parse a document type declaration.
1054
   * <pre>
1055
   * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
1056
   *    ('[' (markupdecl | PEReference | S)* ']' S?)? '&gt;'
1057
   * </pre>
1058
   * <p> (The <code>&lt;!DOCTYPE</code> has already been read.)
1059
   */
1060
  private void parseDoctypedecl()
1061
    throws Exception
1062
  {
1063
    String rootName;
1064
    ExternalIdentifiers ids;
1065
 
1066
    // Read the document type name.
1067
    requireWhitespace();
1068
    rootName = readNmtoken(true);
1069
 
1070
    // Read the External subset's IDs
1071
    skipWhitespace();
1072
    ids = readExternalIds(false, true);
1073
 
1074
    // report (a) declaration of name, (b) lexical info (ids)
1075
    handler.doctypeDecl(rootName, ids.publicId, ids.systemId);
1076
 
1077
    // Internal subset is parsed first, if present
1078
    skipWhitespace();
1079
    if (tryRead('['))
1080
      {
1081
 
1082
        // loop until the subset ends
1083
        while (true)
1084
          {
1085
            doReport = expandPE = true;
1086
            skipWhitespace();
1087
            doReport = expandPE = false;
1088
            if (tryRead(']'))
1089
              {
1090
                break;     // end of subset
1091
              }
1092
            else
1093
              {
1094
                // WFC, PEs in internal subset (only between decls)
1095
                peIsError = expandPE = true;
1096
                parseMarkupdecl();
1097
                peIsError = expandPE = false;
1098
              }
1099
          }
1100
      }
1101
    skipWhitespace();
1102
    require('>');
1103
 
1104
    // Read the external subset, if any
1105
    InputSource subset;
1106
 
1107
    if (ids.systemId == null)
1108
      {
1109
        subset = handler.getExternalSubset(rootName,
1110
                                           handler.getSystemId());
1111
      }
1112
    else
1113
      {
1114
        subset = null;
1115
      }
1116
    if (ids.systemId != null || subset != null)
1117
      {
1118
        pushString(null, ">");
1119
 
1120
        // NOTE:  [dtd] is so we say what SAX2 expects,
1121
        // though it's misleading (subset, not entire dtd)
1122
        if (ids.systemId != null)
1123
          {
1124
            pushURL(true, "[dtd]", ids, null, null, null, true);
1125
          }
1126
        else
1127
          {
1128
            handler.warn("modifying document by adding external subset");
1129
            pushURL(true, "[dtd]",
1130
                    new ExternalIdentifiers(subset.getPublicId(),
1131
                                            subset.getSystemId(),
1132
                                            null),
1133
                    subset.getCharacterStream(),
1134
                    subset.getByteStream(),
1135
                    subset.getEncoding(),
1136
                    false);
1137
          }
1138
 
1139
        // Loop until we end up back at '>'
1140
        while (true)
1141
          {
1142
            doReport = expandPE = true;
1143
            skipWhitespace();
1144
            doReport = expandPE = false;
1145
            if (tryRead('>'))
1146
              {
1147
                break;
1148
              }
1149
            else
1150
              {
1151
                expandPE = true;
1152
                parseMarkupdecl();
1153
                expandPE = false;
1154
              }
1155
          }
1156
 
1157
        // the ">" string isn't popped yet
1158
        if (inputStack.size() != 1)
1159
          {
1160
            error("external subset has unmatched '>'");
1161
          }
1162
      }
1163
 
1164
    // done dtd
1165
    handler.endDoctype();
1166
    expandPE = false;
1167
    doReport = true;
1168
  }
1169
 
1170
  /**
1171
   * Parse a markup declaration in the internal or external DTD subset.
1172
   * <pre>
1173
   * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
1174
   *    | NotationDecl | PI | Comment
1175
   * [30] extSubsetDecl ::= (markupdecl | conditionalSect
1176
   *    | PEReference | S) *
1177
   * </pre>
1178
   * <p> Reading toplevel PE references is handled as a lexical issue
1179
   * by the caller, as is whitespace.
1180
   */
1181
  private void parseMarkupdecl()
1182
    throws Exception
1183
  {
1184
    char[] saved = null;
1185
    boolean savedPE = expandPE;
1186
 
1187
    // prevent "<%foo;" and ensures saved entity is right
1188
    require('<');
1189
    unread('<');
1190
    expandPE = false;
1191
 
1192
    if (tryRead("<!ELEMENT"))
1193
      {
1194
        saved = readBuffer;
1195
        expandPE = savedPE;
1196
        parseElementDecl();
1197
      }
1198
    else if (tryRead("<!ATTLIST"))
1199
      {
1200
        saved = readBuffer;
1201
        expandPE = savedPE;
1202
        parseAttlistDecl();
1203
      }
1204
    else if (tryRead("<!ENTITY"))
1205
      {
1206
        saved = readBuffer;
1207
        expandPE = savedPE;
1208
        parseEntityDecl();
1209
      }
1210
    else if (tryRead("<!NOTATION"))
1211
      {
1212
        saved = readBuffer;
1213
        expandPE = savedPE;
1214
        parseNotationDecl();
1215
      }
1216
    else if (tryRead(startDelimPI))
1217
      {
1218
        saved = readBuffer;
1219
        expandPE = savedPE;
1220
        parsePI();
1221
      }
1222
    else if (tryRead(startDelimComment))
1223
      {
1224
        saved = readBuffer;
1225
        expandPE = savedPE;
1226
        parseComment();
1227
      }
1228
    else if (tryRead("<!["))
1229
      {
1230
        saved = readBuffer;
1231
        expandPE = savedPE;
1232
        if (inputStack.size() > 0)
1233
          {
1234
            parseConditionalSect(saved);
1235
          }
1236
        else
1237
          {
1238
            error("conditional sections illegal in internal subset");
1239
          }
1240
      }
1241
    else
1242
      {
1243
        error("expected markup declaration");
1244
      }
1245
 
1246
    // VC: Proper Decl/PE Nesting
1247
    if (readBuffer != saved)
1248
      {
1249
        handler.verror("Illegal Declaration/PE nesting");
1250
      }
1251
  }
1252
 
1253
  /**
1254
   * Parse an element, with its tags.
1255
   * <pre>
1256
   * [39] element ::= EmptyElementTag | STag content ETag
1257
   * [40] STag ::= '&lt;' Name (S Attribute)* S? '&gt;'
1258
   * [44] EmptyElementTag ::= '&lt;' Name (S Attribute)* S? '/&gt;'
1259
   * </pre>
1260
   * <p> (The '&lt;' has already been read.)
1261
   * <p>NOTE: this method actually chains onto parseContent (), if necessary,
1262
   * and parseContent () will take care of calling parseETag ().
1263
   */
1264
  private void parseElement(boolean maybeGetSubset)
1265
    throws Exception
1266
  {
1267
    String gi;
1268
    char c;
1269
    int oldElementContent = currentElementContent;
1270
    String oldElement = currentElement;
1271
    ElementDecl element;
1272
 
1273
    // This is the (global) counter for the
1274
    // array of specified attributes.
1275
    tagAttributePos = 0;
1276
 
1277
    // Read the element type name.
1278
    gi = readNmtoken(true);
1279
 
1280
    // If we saw no DTD, and this is the document root element,
1281
    // let the application modify the input stream by providing one.
1282
    if (maybeGetSubset)
1283
      {
1284
        InputSource subset = handler.getExternalSubset(gi,
1285
                                                       handler.getSystemId());
1286
        if (subset != null)
1287
          {
1288
            String publicId = subset.getPublicId();
1289
            String systemId = subset.getSystemId();
1290
 
1291
            handler.warn("modifying document by adding DTD");
1292
            handler.doctypeDecl(gi, publicId, systemId);
1293
            pushString(null, ">");
1294
 
1295
            // NOTE:  [dtd] is so we say what SAX2 expects,
1296
            // though it's misleading (subset, not entire dtd)
1297
            pushURL(true, "[dtd]",
1298
                    new ExternalIdentifiers(publicId, systemId, null),
1299
                    subset.getCharacterStream(),
1300
                    subset.getByteStream(),
1301
                    subset.getEncoding(),
1302
                    false);
1303
 
1304
            // Loop until we end up back at '>'
1305
            while (true)
1306
              {
1307
                doReport = expandPE = true;
1308
                skipWhitespace();
1309
                doReport = expandPE = false;
1310
                if (tryRead('>'))
1311
                  {
1312
                    break;
1313
                  }
1314
                else
1315
                  {
1316
                    expandPE = true;
1317
                    parseMarkupdecl();
1318
                    expandPE = false;
1319
                  }
1320
              }
1321
 
1322
            // the ">" string isn't popped yet
1323
            if (inputStack.size() != 1)
1324
              {
1325
                error("external subset has unmatched '>'");
1326
              }
1327
 
1328
            handler.endDoctype();
1329
          }
1330
      }
1331
 
1332
    // Determine the current content type.
1333
    currentElement = gi;
1334
    element = (ElementDecl) elementInfo.get(gi);
1335
    currentElementContent = getContentType(element, CONTENT_ANY);
1336
 
1337
    // Read the attributes, if any.
1338
    // After this loop, "c" is the closing delimiter.
1339
    boolean white = tryWhitespace();
1340
    c = readCh();
1341
    while (c != '/' && c != '>')
1342
      {
1343
        unread(c);
1344
        if (!white)
1345
          {
1346
            error("need whitespace between attributes");
1347
          }
1348
        parseAttribute(gi);
1349
        white = tryWhitespace();
1350
        c = readCh();
1351
      }
1352
 
1353
    // Supply any defaulted attributes.
1354
    Iterator atts = declaredAttributes(element);
1355
    if (atts != null)
1356
      {
1357
        String aname;
1358
loop:
1359
        while (atts.hasNext())
1360
          {
1361
            aname = (String) atts.next();
1362
            // See if it was specified.
1363
            for (int i = 0; i < tagAttributePos; i++)
1364
              {
1365
                if (tagAttributes[i] == aname)
1366
                  {
1367
                    continue loop;
1368
                  }
1369
              }
1370
            // ... or has a default
1371
            String value = getAttributeDefaultValue(gi, aname);
1372
 
1373
            if (value == null)
1374
              {
1375
                continue;
1376
              }
1377
            handler.attribute(aname, value, false);
1378
          }
1379
      }
1380
 
1381
    // Figure out if this is a start tag
1382
    // or an empty element, and dispatch an
1383
    // event accordingly.
1384
    switch (c)
1385
      {
1386
      case '>':
1387
        handler.startElement(gi);
1388
        parseContent();
1389
        break;
1390
      case '/':
1391
        require('>');
1392
        handler.startElement(gi);
1393
        handler.endElement(gi);
1394
        break;
1395
      }
1396
 
1397
    // Restore the previous state.
1398
    currentElement = oldElement;
1399
    currentElementContent = oldElementContent;
1400
  }
1401
 
1402
  /**
1403
   * Parse an attribute assignment.
1404
   * <pre>
1405
   * [41] Attribute ::= Name Eq AttValue
1406
   * </pre>
1407
   * @param name The name of the attribute's element.
1408
   * @see SAXDriver#attribute
1409
   */
1410
  private void parseAttribute(String name)
1411
    throws Exception
1412
  {
1413
    String aname;
1414
    String type;
1415
    String value;
1416
    int flags = LIT_ATTRIBUTE |  LIT_ENTITY_REF;
1417
 
1418
    // Read the attribute name.
1419
    aname = readNmtoken(true);
1420
    type = getAttributeType(name, aname);
1421
 
1422
    // Parse '='
1423
    parseEq();
1424
 
1425
    // Read the value, normalizing whitespace
1426
    // unless it is CDATA.
1427
    if (handler.stringInterning)
1428
      {
1429
        if (type == "CDATA" || type == null)
1430
          {
1431
            value = readLiteral(flags);
1432
          }
1433
        else
1434
          {
1435
            value = readLiteral(flags | LIT_NORMALIZE);
1436
          }
1437
      }
1438
    else
1439
      {
1440
        if (type == null || type.equals("CDATA"))
1441
          {
1442
            value = readLiteral(flags);
1443
          }
1444
        else
1445
          {
1446
            value = readLiteral(flags | LIT_NORMALIZE);
1447
          }
1448
      }
1449
 
1450
    // WFC: no duplicate attributes
1451
    for (int i = 0; i < tagAttributePos; i++)
1452
      {
1453
        if (aname.equals(tagAttributes [i]))
1454
          {
1455
            error("duplicate attribute", aname, null);
1456
          }
1457
      }
1458
 
1459
    // Inform the handler about the
1460
    // attribute.
1461
    handler.attribute(aname, value, true);
1462
    dataBufferPos = 0;
1463
 
1464
    // Note that the attribute has been
1465
    // specified.
1466
    if (tagAttributePos == tagAttributes.length)
1467
      {
1468
        String newAttrib[] = new String[tagAttributes.length * 2];
1469
        System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
1470
        tagAttributes = newAttrib;
1471
      }
1472
    tagAttributes[tagAttributePos++] = aname;
1473
  }
1474
 
1475
  /**
1476
   * Parse an equals sign surrounded by optional whitespace.
1477
   * <pre>
1478
   * [25] Eq ::= S? '=' S?
1479
   * </pre>
1480
   */
1481
  private void parseEq()
1482
    throws SAXException, IOException
1483
  {
1484
    skipWhitespace();
1485
    require('=');
1486
    skipWhitespace();
1487
  }
1488
 
1489
  /**
1490
   * Parse an end tag.
1491
   * <pre>
1492
   * [42] ETag ::= '</' Name S? '>'
1493
   * </pre>
1494
   * <p>NOTE: parseContent () chains to here, we already read the
1495
   * "&lt;/".
1496
   */
1497
  private void parseETag()
1498
    throws Exception
1499
  {
1500
    require(currentElement);
1501
    skipWhitespace();
1502
    require('>');
1503
    handler.endElement(currentElement);
1504
    // not re-reporting any SAXException re bogus end tags,
1505
    // even though that diagnostic might be clearer ...
1506
  }
1507
 
1508
  /**
1509
   * Parse the content of an element.
1510
   * <pre>
1511
   * [43] content ::= (element | CharData | Reference
1512
   *    | CDSect | PI | Comment)*
1513
   * [67] Reference ::= EntityRef | CharRef
1514
   * </pre>
1515
   * <p> NOTE: consumes ETtag.
1516
   */
1517
  private void parseContent()
1518
    throws Exception
1519
  {
1520
    char c;
1521
 
1522
    while (true)
1523
      {
1524
        // consume characters (or ignorable whitspace) until delimiter
1525
        parseCharData();
1526
 
1527
        // Handle delimiters
1528
        c = readCh();
1529
        switch (c)
1530
          {
1531
          case '&':       // Found "&"
1532
            c = readCh();
1533
            if (c == '#')
1534
              {
1535
                parseCharRef();
1536
              }
1537
            else
1538
              {
1539
                unread(c);
1540
                parseEntityRef(true);
1541
              }
1542
            isDirtyCurrentElement = true;
1543
            break;
1544
 
1545
          case '<':       // Found "<"
1546
            dataBufferFlush();
1547
            c = readCh();
1548
            switch (c)
1549
              {
1550
              case '!':       // Found "<!"
1551
                c = readCh();
1552
                switch (c)
1553
                  {
1554
                  case '-':     // Found "<!-"
1555
                    require('-');
1556
                    isDirtyCurrentElement = false;
1557
                    parseComment();
1558
                    break;
1559
                  case '[':     // Found "<!["
1560
                    isDirtyCurrentElement = false;
1561
                    require("CDATA[");
1562
                    handler.startCDATA();
1563
                    inCDATA = true;
1564
                    parseCDSect();
1565
                    inCDATA = false;
1566
                    handler.endCDATA();
1567
                    break;
1568
                  default:
1569
                    error("expected comment or CDATA section", c, null);
1570
                    break;
1571
                  }
1572
                break;
1573
 
1574
              case '?':     // Found "<?"
1575
                isDirtyCurrentElement = false;
1576
                parsePI();
1577
                break;
1578
 
1579
              case '/':     // Found "</"
1580
                isDirtyCurrentElement = false;
1581
                parseETag();
1582
                return;
1583
 
1584
              default:     // Found "<" followed by something else
1585
                isDirtyCurrentElement = false;
1586
                unread(c);
1587
                parseElement(false);
1588
                break;
1589
              }
1590
          }
1591
      }
1592
  }
1593
 
1594
  /**
1595
   * Parse an element type declaration.
1596
   * <pre>
1597
   * [45] elementdecl ::= '&lt;!ELEMENT' S Name S contentspec S? '&gt;'
1598
   * </pre>
1599
   * <p> NOTE: the '&lt;!ELEMENT' has already been read.
1600
   */
1601
  private void parseElementDecl()
1602
    throws Exception
1603
  {
1604
    String name;
1605
 
1606
    requireWhitespace();
1607
    // Read the element type name.
1608
    name = readNmtoken(true);
1609
 
1610
    requireWhitespace();
1611
    // Read the content model.
1612
    parseContentspec(name);
1613
 
1614
    skipWhitespace();
1615
    require('>');
1616
  }
1617
 
1618
  /**
1619
   * Content specification.
1620
   * <pre>
1621
   * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1622
   * </pre>
1623
   */
1624
  private void parseContentspec(String name)
1625
    throws Exception
1626
  {
1627
    // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ...
1628
    if (tryRead("EMPTY"))
1629
      {
1630
        setElement(name, CONTENT_EMPTY, null, null);
1631
        if (!skippedPE)
1632
          {
1633
            handler.getDeclHandler().elementDecl(name, "EMPTY");
1634
          }
1635
        return;
1636
      }
1637
    else if (tryRead("ANY"))
1638
      {
1639
        setElement(name, CONTENT_ANY, null, null);
1640
        if (!skippedPE)
1641
          {
1642
            handler.getDeclHandler().elementDecl(name, "ANY");
1643
          }
1644
        return;
1645
      }
1646
    else
1647
      {
1648
        String model;
1649
        char[] saved;
1650
 
1651
        require('(');
1652
        saved = readBuffer;
1653
        dataBufferAppend('(');
1654
        skipWhitespace();
1655
        if (tryRead("#PCDATA"))
1656
          {
1657
            dataBufferAppend("#PCDATA");
1658
            parseMixed(saved);
1659
            model = dataBufferToString();
1660
            setElement(name, CONTENT_MIXED, model, null);
1661
          }
1662
        else
1663
          {
1664
            parseElements(saved);
1665
            model = dataBufferToString();
1666
            setElement(name, CONTENT_ELEMENTS, model, null);
1667
          }
1668
        if (!skippedPE)
1669
          {
1670
            handler.getDeclHandler().elementDecl(name, model);
1671
          }
1672
      }
1673
  }
1674
 
1675
  /**
1676
   * Parse an element-content model.
1677
   * <pre>
1678
   * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
1679
   * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
1680
   * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
1681
   * </pre>
1682
   *
1683
   * <p> NOTE: the opening '(' and S have already been read.
1684
   *
1685
   * @param saved Buffer for entity that should have the terminal ')'
1686
   */
1687
  private void parseElements(char[] saved)
1688
    throws Exception
1689
  {
1690
    char c;
1691
    char sep;
1692
 
1693
    // Parse the first content particle
1694
    skipWhitespace();
1695
    parseCp();
1696
 
1697
    // Check for end or for a separator.
1698
    skipWhitespace();
1699
    c = readCh();
1700
    switch (c)
1701
      {
1702
      case ')':
1703
        // VC: Proper Group/PE Nesting
1704
        if (readBuffer != saved)
1705
          {
1706
            handler.verror("Illegal Group/PE nesting");
1707
          }
1708
 
1709
        dataBufferAppend(')');
1710
        c = readCh();
1711
        switch (c)
1712
          {
1713
          case '*':
1714
          case '+':
1715
          case '?':
1716
            dataBufferAppend(c);
1717
            break;
1718
          default:
1719
            unread(c);
1720
          }
1721
        return;
1722
      case ',':       // Register the separator.
1723
      case '|':
1724
        sep = c;
1725
        dataBufferAppend(c);
1726
        break;
1727
      default:
1728
        error("bad separator in content model", c, null);
1729
        return;
1730
      }
1731
 
1732
    // Parse the rest of the content model.
1733
    while (true)
1734
      {
1735
        skipWhitespace();
1736
        parseCp();
1737
        skipWhitespace();
1738
        c = readCh();
1739
        if (c == ')')
1740
          {
1741
            // VC: Proper Group/PE Nesting
1742
            if (readBuffer != saved)
1743
              {
1744
                handler.verror("Illegal Group/PE nesting");
1745
              }
1746
 
1747
            dataBufferAppend(')');
1748
            break;
1749
          }
1750
        else if (c != sep)
1751
          {
1752
            error("bad separator in content model", c, null);
1753
            return;
1754
          }
1755
        else
1756
          {
1757
            dataBufferAppend(c);
1758
          }
1759
      }
1760
 
1761
    // Check for the occurrence indicator.
1762
    c = readCh();
1763
    switch (c)
1764
      {
1765
      case '?':
1766
      case '*':
1767
      case '+':
1768
        dataBufferAppend(c);
1769
        return;
1770
      default:
1771
        unread(c);
1772
        return;
1773
      }
1774
  }
1775
 
1776
  /**
1777
   * Parse a content particle.
1778
   * <pre>
1779
   * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1780
   * </pre>
1781
   */
1782
  private void parseCp()
1783
    throws Exception
1784
  {
1785
    if (tryRead('('))
1786
      {
1787
        dataBufferAppend('(');
1788
        parseElements(readBuffer);
1789
      }
1790
    else
1791
      {
1792
        dataBufferAppend(readNmtoken(true));
1793
        char c = readCh();
1794
        switch (c)
1795
          {
1796
          case '?':
1797
          case '*':
1798
          case '+':
1799
            dataBufferAppend(c);
1800
            break;
1801
          default:
1802
            unread(c);
1803
            break;
1804
          }
1805
      }
1806
  }
1807
 
1808
  /**
1809
   * Parse mixed content.
1810
   * <pre>
1811
   * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
1812
   *        | '(' S? ('#PCDATA') S? ')'
1813
   * </pre>
1814
   *
1815
   * @param saved Buffer for entity that should have the terminal ')'
1816
   */
1817
  private void parseMixed(char[] saved)
1818
    throws Exception
1819
  {
1820
    // Check for PCDATA alone.
1821
    skipWhitespace();
1822
    if (tryRead(')'))
1823
      {
1824
        // VC: Proper Group/PE Nesting
1825
        if (readBuffer != saved)
1826
          {
1827
            handler.verror("Illegal Group/PE nesting");
1828
          }
1829
 
1830
        dataBufferAppend(")*");
1831
        tryRead('*');
1832
        return;
1833
      }
1834
 
1835
    // Parse mixed content.
1836
    skipWhitespace();
1837
    while (!tryRead(")"))
1838
      {
1839
        require('|');
1840
        dataBufferAppend('|');
1841
        skipWhitespace();
1842
        dataBufferAppend(readNmtoken(true));
1843
        skipWhitespace();
1844
      }
1845
 
1846
    // VC: Proper Group/PE Nesting
1847
    if (readBuffer != saved)
1848
      {
1849
        handler.verror("Illegal Group/PE nesting");
1850
      }
1851
 
1852
    require('*');
1853
    dataBufferAppend(")*");
1854
  }
1855
 
1856
  /**
1857
   * Parse an attribute list declaration.
1858
   * <pre>
1859
   * [52] AttlistDecl ::= '&lt;!ATTLIST' S Name AttDef* S? '&gt;'
1860
   * </pre>
1861
   * <p>NOTE: the '&lt;!ATTLIST' has already been read.
1862
   */
1863
  private void parseAttlistDecl()
1864
    throws Exception
1865
  {
1866
    String elementName;
1867
 
1868
    requireWhitespace();
1869
    elementName = readNmtoken(true);
1870
    boolean white = tryWhitespace();
1871
    while (!tryRead('>'))
1872
      {
1873
        if (!white)
1874
          {
1875
            error("whitespace required before attribute definition");
1876
          }
1877
        parseAttDef(elementName);
1878
        white = tryWhitespace();
1879
      }
1880
  }
1881
 
1882
  /**
1883
   * Parse a single attribute definition.
1884
   * <pre>
1885
   * [53] AttDef ::= S Name S AttType S DefaultDecl
1886
   * </pre>
1887
   */
1888
  private void parseAttDef(String elementName)
1889
    throws Exception
1890
  {
1891
    String name;
1892
    String type;
1893
    String enumer = null;
1894
 
1895
    // Read the attribute name.
1896
    name = readNmtoken(true);
1897
 
1898
    // Read the attribute type.
1899
    requireWhitespace();
1900
    type = readAttType();
1901
 
1902
    // Get the string of enumerated values if necessary.
1903
    if (handler.stringInterning)
1904
      {
1905
        if ("ENUMERATION" == type || "NOTATION" == type)
1906
          {
1907
            enumer = dataBufferToString();
1908
          }
1909
      }
1910
    else
1911
      {
1912
        if ("ENUMERATION".equals(type) || "NOTATION".equals(type))
1913
          {
1914
            enumer = dataBufferToString();
1915
          }
1916
      }
1917
 
1918
    // Read the default value.
1919
    requireWhitespace();
1920
    parseDefault(elementName, name, type, enumer);
1921
  }
1922
 
1923
  /**
1924
   * Parse the attribute type.
1925
   * <pre>
1926
   * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1927
   * [55] StringType ::= 'CDATA'
1928
   * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
1929
   *    | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
1930
   * [57] EnumeratedType ::= NotationType | Enumeration
1931
   * </pre>
1932
   */
1933
  private String readAttType()
1934
    throws Exception
1935
  {
1936
    if (tryRead('('))
1937
      {
1938
        parseEnumeration(false);
1939
        return "ENUMERATION";
1940
      }
1941
    else
1942
      {
1943
        String typeString = readNmtoken(true);
1944
        if (handler.stringInterning)
1945
          {
1946
            if ("NOTATION" == typeString)
1947
              {
1948
                parseNotationType();
1949
                return typeString;
1950
              }
1951
            else if ("CDATA" == typeString
1952
                     || "ID" == typeString
1953
                     || "IDREF" == typeString
1954
                     || "IDREFS" == typeString
1955
                     || "ENTITY" == typeString
1956
                     || "ENTITIES" == typeString
1957
                     || "NMTOKEN" == typeString
1958
                     || "NMTOKENS" == typeString)
1959
              {
1960
                return typeString;
1961
              }
1962
          }
1963
        else
1964
          {
1965
            if ("NOTATION".equals(typeString))
1966
              {
1967
                parseNotationType();
1968
                return typeString;
1969
              }
1970
            else if ("CDATA".equals(typeString)
1971
                     || "ID".equals(typeString)
1972
                     || "IDREF".equals(typeString)
1973
                     || "IDREFS".equals(typeString)
1974
                     || "ENTITY".equals(typeString)
1975
                     || "ENTITIES".equals(typeString)
1976
                     || "NMTOKEN".equals(typeString)
1977
                     || "NMTOKENS".equals(typeString))
1978
              {
1979
                return typeString;
1980
              }
1981
          }
1982
        error("illegal attribute type", typeString, null);
1983
        return null;
1984
      }
1985
  }
1986
 
1987
  /**
1988
   * Parse an enumeration.
1989
   * <pre>
1990
   * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1991
   * </pre>
1992
   * <p>NOTE: the '(' has already been read.
1993
   */
1994
  private void parseEnumeration(boolean isNames)
1995
    throws Exception
1996
  {
1997
    dataBufferAppend('(');
1998
 
1999
    // Read the first token.
2000
    skipWhitespace();
2001
    dataBufferAppend(readNmtoken(isNames));
2002
    // Read the remaining tokens.
2003
    skipWhitespace();
2004
    while (!tryRead(')'))
2005
      {
2006
        require('|');
2007
        dataBufferAppend('|');
2008
        skipWhitespace();
2009
        dataBufferAppend(readNmtoken (isNames));
2010
        skipWhitespace();
2011
      }
2012
    dataBufferAppend(')');
2013
  }
2014
 
2015
  /**
2016
   * Parse a notation type for an attribute.
2017
   * <pre>
2018
   * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
2019
   *    (S? '|' S? name)* S? ')'
2020
   * </pre>
2021
   * <p>NOTE: the 'NOTATION' has already been read
2022
   */
2023
  private void parseNotationType()
2024
    throws Exception
2025
  {
2026
    requireWhitespace();
2027
    require('(');
2028
 
2029
    parseEnumeration(true);
2030
  }
2031
 
2032
  /**
2033
   * Parse the default value for an attribute.
2034
   * <pre>
2035
   * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
2036
   *    | (('#FIXED' S)? AttValue)
2037
   * </pre>
2038
   */
2039
  private void parseDefault(String elementName, String name,
2040
                            String type, String enumer)
2041
    throws Exception
2042
  {
2043
    int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
2044
    String value = null;
2045
    int flags = LIT_ATTRIBUTE;
2046
    boolean saved = expandPE;
2047
    String defaultType = null;
2048
 
2049
    // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace
2050
    // chars to spaces (doesn't matter when that's done if it doesn't
2051
    // interfere with char refs expanding to whitespace).
2052
 
2053
    if (!skippedPE)
2054
      {
2055
        flags |= LIT_ENTITY_REF;
2056
        if (handler.stringInterning)
2057
          {
2058
            if ("CDATA" != type)
2059
              {
2060
                flags |= LIT_NORMALIZE;
2061
              }
2062
          }
2063
        else
2064
          {
2065
            if (!"CDATA".equals(type))
2066
              {
2067
                flags |= LIT_NORMALIZE;
2068
              }
2069
          }
2070
      }
2071
 
2072
    expandPE = false;
2073
    if (tryRead('#'))
2074
      {
2075
        if (tryRead("FIXED"))
2076
          {
2077
            defaultType = "#FIXED";
2078
            valueType = ATTRIBUTE_DEFAULT_FIXED;
2079
            requireWhitespace();
2080
            value = readLiteral(flags);
2081
          }
2082
        else if (tryRead("REQUIRED"))
2083
          {
2084
            defaultType = "#REQUIRED";
2085
            valueType = ATTRIBUTE_DEFAULT_REQUIRED;
2086
          }
2087
        else if (tryRead("IMPLIED"))
2088
          {
2089
            defaultType = "#IMPLIED";
2090
            valueType = ATTRIBUTE_DEFAULT_IMPLIED;
2091
          }
2092
        else
2093
          {
2094
            error("illegal keyword for attribute default value");
2095
          }
2096
      }
2097
    else
2098
      {
2099
        value = readLiteral(flags);
2100
      }
2101
    expandPE = saved;
2102
    setAttribute(elementName, name, type, enumer, value, valueType);
2103
    if (handler.stringInterning)
2104
      {
2105
        if ("ENUMERATION" == type)
2106
          {
2107
            type = enumer;
2108
          }
2109
        else if ("NOTATION" == type)
2110
          {
2111
            type = "NOTATION " + enumer;
2112
          }
2113
      }
2114
    else
2115
      {
2116
        if ("ENUMERATION".equals(type))
2117
          {
2118
            type = enumer;
2119
          }
2120
        else if ("NOTATION".equals(type))
2121
          {
2122
            type = "NOTATION " + enumer;
2123
          }
2124
      }
2125
    if (!skippedPE)
2126
      {
2127
        handler.getDeclHandler().attributeDecl(elementName, name, type,
2128
                                               defaultType, value);
2129
      }
2130
  }
2131
 
2132
  /**
2133
   * Parse a conditional section.
2134
   * <pre>
2135
   * [61] conditionalSect ::= includeSect || ignoreSect
2136
   * [62] includeSect ::= '&lt;![' S? 'INCLUDE' S? '['
2137
   *    extSubsetDecl ']]&gt;'
2138
   * [63] ignoreSect ::= '&lt;![' S? 'IGNORE' S? '['
2139
   *    ignoreSectContents* ']]&gt;'
2140
   * [64] ignoreSectContents ::= Ignore
2141
   *    ('&lt;![' ignoreSectContents* ']]&gt;' Ignore )*
2142
   * [65] Ignore ::= Char* - (Char* ( '&lt;![' | ']]&gt;') Char* )
2143
   * </pre>
2144
   * <p> NOTE: the '&gt;![' has already been read.
2145
   */
2146
  private void parseConditionalSect(char[] saved)
2147
    throws Exception
2148
  {
2149
    skipWhitespace();
2150
    if (tryRead("INCLUDE"))
2151
      {
2152
        skipWhitespace();
2153
        require('[');
2154
        // VC: Proper Conditional Section/PE Nesting
2155
        if (readBuffer != saved)
2156
          {
2157
            handler.verror("Illegal Conditional Section/PE nesting");
2158
          }
2159
        skipWhitespace();
2160
        while (!tryRead("]]>"))
2161
          {
2162
            parseMarkupdecl();
2163
            skipWhitespace();
2164
          }
2165
      }
2166
    else if (tryRead("IGNORE"))
2167
      {
2168
        skipWhitespace();
2169
        require('[');
2170
        // VC: Proper Conditional Section/PE Nesting
2171
        if (readBuffer != saved)
2172
          {
2173
            handler.verror("Illegal Conditional Section/PE nesting");
2174
          }
2175
        int nesting = 1;
2176
        char c;
2177
        expandPE = false;
2178
        for (int nest = 1; nest > 0; )
2179
          {
2180
            c = readCh();
2181
            switch (c)
2182
              {
2183
              case '<':
2184
                if (tryRead("!["))
2185
                  {
2186
                    nest++;
2187
                  }
2188
                break;
2189
              case ']':
2190
                if (tryRead("]>"))
2191
                  {
2192
                    nest--;
2193
                  }
2194
              }
2195
          }
2196
        expandPE = true;
2197
      }
2198
    else
2199
      {
2200
        error("conditional section must begin with INCLUDE or IGNORE");
2201
      }
2202
  }
2203
 
2204
  private void parseCharRef()
2205
    throws SAXException, IOException
2206
  {
2207
    parseCharRef(true /* do flushDataBuffer by default */);
2208
  }
2209
 
2210
  /**
2211
   * Try to read a character reference without consuming data from buffer.
2212
   * <pre>
2213
   * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
2214
   * </pre>
2215
   * <p>NOTE: the '&#' has already been read.
2216
   */
2217
  private void tryReadCharRef()
2218
    throws SAXException, IOException
2219
  {
2220
    int value = 0;
2221
    char c;
2222
 
2223
    if (tryRead('x'))
2224
      {
2225
loop1:
2226
        while (true)
2227
          {
2228
            c = readCh();
2229
            if (c == ';')
2230
              {
2231
                break loop1;
2232
              }
2233
            else
2234
              {
2235
                int n = Character.digit(c, 16);
2236
                if (n == -1)
2237
                  {
2238
                    error("illegal character in character reference", c, null);
2239
                    break loop1;
2240
                  }
2241
                value *= 16;
2242
                value += n;
2243
              }
2244
          }
2245
      }
2246
    else
2247
      {
2248
loop2:
2249
        while (true)
2250
          {
2251
            c = readCh();
2252
            if (c == ';')
2253
              {
2254
                break loop2;
2255
              }
2256
            else
2257
              {
2258
                int n = Character.digit(c, 10);
2259
                if (n == -1)
2260
                  {
2261
                    error("illegal character in character reference", c, null);
2262
                    break loop2;
2263
                  }
2264
                value *= 10;
2265
                value += n;
2266
              }
2267
          }
2268
      }
2269
 
2270
    // check for character refs being legal XML
2271
    if ((value < 0x0020
2272
         && ! (value == '\n' || value == '\t' || value == '\r'))
2273
        || (value >= 0xD800 && value <= 0xDFFF)
2274
        || value == 0xFFFE || value == 0xFFFF
2275
        || value > 0x0010ffff)
2276
      {
2277
        error("illegal XML character reference U+"
2278
              + Integer.toHexString(value));
2279
      }
2280
 
2281
    // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2282
    //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2283
    if (value > 0x0010ffff)
2284
      {
2285
        // too big for surrogate
2286
        error("character reference " + value + " is too large for UTF-16",
2287
              new Integer(value).toString(), null);
2288
      }
2289
 
2290
  }
2291
 
2292
  /**
2293
   * Read and interpret a character reference.
2294
   * <pre>
2295
   * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
2296
   * </pre>
2297
   * <p>NOTE: the '&#' has already been read.
2298
   */
2299
  private void parseCharRef(boolean doFlush)
2300
    throws SAXException, IOException
2301
  {
2302
    int value = 0;
2303
    char c;
2304
 
2305
    if (tryRead('x'))
2306
      {
2307
loop1:
2308
        while (true)
2309
          {
2310
            c = readCh();
2311
            if (c == ';')
2312
              {
2313
                break loop1;
2314
              }
2315
            else
2316
              {
2317
                int n = Character.digit(c, 16);
2318
                if (n == -1)
2319
                  {
2320
                    error("illegal character in character reference", c, null);
2321
                    break loop1;
2322
                  }
2323
                value *= 16;
2324
                value += n;
2325
              }
2326
          }
2327
      }
2328
    else
2329
      {
2330
loop2:
2331
        while (true)
2332
          {
2333
            c = readCh();
2334
            if (c == ';')
2335
              {
2336
                break loop2;
2337
              }
2338
            else
2339
              {
2340
                int n = Character.digit(c, 10);
2341
                if (n == -1)
2342
                  {
2343
                    error("illegal character in character reference", c, null);
2344
                    break loop2;
2345
                  }
2346
                value *= 10;
2347
                value += c - '0';
2348
              }
2349
          }
2350
      }
2351
 
2352
    // check for character refs being legal XML
2353
    if ((value < 0x0020
2354
         && ! (value == '\n' || value == '\t' || value == '\r'))
2355
        || (value >= 0xD800 && value <= 0xDFFF)
2356
        || value == 0xFFFE || value == 0xFFFF
2357
        || value > 0x0010ffff)
2358
      {
2359
        error("illegal XML character reference U+"
2360
              + Integer.toHexString(value));
2361
      }
2362
 
2363
    // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2364
    //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2365
    if (value <= 0x0000ffff)
2366
      {
2367
        // no surrogates needed
2368
        dataBufferAppend((char) value);
2369
      }
2370
    else if (value <= 0x0010ffff)
2371
      {
2372
        value -= 0x10000;
2373
        // > 16 bits, surrogate needed
2374
        dataBufferAppend((char) (0xd800 | (value >> 10)));
2375
        dataBufferAppend((char) (0xdc00 | (value & 0x0003ff)));
2376
      }
2377
    else
2378
      {
2379
        // too big for surrogate
2380
        error("character reference " + value + " is too large for UTF-16",
2381
              new Integer(value).toString(), null);
2382
      }
2383
    if (doFlush)
2384
      {
2385
        dataBufferFlush();
2386
      }
2387
  }
2388
 
2389
  /**
2390
   * Parse and expand an entity reference.
2391
   * <pre>
2392
   * [68] EntityRef ::= '&' Name ';'
2393
   * </pre>
2394
   * <p>NOTE: the '&amp;' has already been read.
2395
   * @param externalAllowed External entities are allowed here.
2396
   */
2397
  private void parseEntityRef(boolean externalAllowed)
2398
    throws SAXException, IOException
2399
  {
2400
    String name;
2401
 
2402
    name = readNmtoken(true);
2403
    require(';');
2404
    switch (getEntityType(name))
2405
      {
2406
      case ENTITY_UNDECLARED:
2407
        // NOTE:  XML REC describes amazingly convoluted handling for
2408
        // this case.  Nothing as meaningful as being a WFness error
2409
        // unless the processor might _legitimately_ not have seen a
2410
        // declaration ... which is what this implements.
2411
        String message;
2412
 
2413
        message = "reference to undeclared general entity " + name;
2414
        if (skippedPE && !docIsStandalone)
2415
          {
2416
            handler.verror(message);
2417
            // we don't know this entity, and it might be external...
2418
            if (externalAllowed)
2419
              {
2420
                handler.skippedEntity(name);
2421
              }
2422
          }
2423
        else
2424
          {
2425
            error(message);
2426
          }
2427
        break;
2428
      case ENTITY_INTERNAL:
2429
          pushString(name, getEntityValue(name));
2430
 
2431
          //workaround for possible input pop before marking
2432
          //the buffer reading position  
2433
          char t = readCh();
2434
          unread(t);
2435
          int bufferPosMark = readBufferPos;
2436
 
2437
          int end = readBufferPos + getEntityValue(name).length();
2438
          for (int k = readBufferPos; k < end; k++)
2439
            {
2440
              t = readCh();
2441
              if (t == '&')
2442
                {
2443
                  t = readCh();
2444
                  if (t  == '#')
2445
                    {
2446
                      //try to match a character ref
2447
                      tryReadCharRef();
2448
 
2449
                      //everything has been read
2450
                      if (readBufferPos >= end)
2451
                        {
2452
                          break;
2453
                        }
2454
                      k = readBufferPos;
2455
                      continue;
2456
                    }
2457
                  else if (Character.isLetter(t))
2458
                    {
2459
                      //looks like an entity ref
2460
                      unread(t);
2461
                      readNmtoken(true);
2462
                      require(';');
2463
 
2464
                      //everything has been read
2465
                      if (readBufferPos >= end)
2466
                        {
2467
                          break;
2468
                        }
2469
                      k = readBufferPos;
2470
                      continue;
2471
                    }
2472
                  error(" malformed entity reference");
2473
                }
2474
 
2475
            }
2476
          readBufferPos = bufferPosMark;
2477
          break;
2478
      case ENTITY_TEXT:
2479
          if (externalAllowed)
2480
            {
2481
              pushURL(false, name, getEntityIds(name),
2482
                      null, null, null, true);
2483
            }
2484
          else
2485
            {
2486
              error("reference to external entity in attribute value.",
2487
                    name, null);
2488
            }
2489
          break;
2490
      case ENTITY_NDATA:
2491
          if (externalAllowed)
2492
            {
2493
              error("unparsed entity reference in content", name, null);
2494
            }
2495
          else
2496
            {
2497
              error("reference to external entity in attribute value.",
2498
                    name, null);
2499
            }
2500
          break;
2501
      default:
2502
          throw new RuntimeException();
2503
      }
2504
  }
2505
 
2506
  /**
2507
   * Parse and expand a parameter entity reference.
2508
   * <pre>
2509
   * [69] PEReference ::= '%' Name ';'
2510
   * </pre>
2511
   * <p>NOTE: the '%' has already been read.
2512
   */
2513
  private void parsePEReference()
2514
    throws SAXException, IOException
2515
  {
2516
    String name;
2517
 
2518
    name = "%" + readNmtoken(true);
2519
    require(';');
2520
    switch (getEntityType(name))
2521
      {
2522
      case ENTITY_UNDECLARED:
2523
        // VC: Entity Declared
2524
        handler.verror("reference to undeclared parameter entity " + name);
2525
 
2526
        // we should disable handling of all subsequent declarations
2527
        // unless this is a standalone document (info discarded)
2528
        break;
2529
      case ENTITY_INTERNAL:
2530
        if (inLiteral)
2531
          {
2532
            pushString(name, getEntityValue(name));
2533
          }
2534
        else
2535
          {
2536
            pushString(name, ' ' + getEntityValue(name) + ' ');
2537
          }
2538
        break;
2539
      case ENTITY_TEXT:
2540
        if (!inLiteral)
2541
          {
2542
            pushString(null, " ");
2543
          }
2544
        pushURL(true, name, getEntityIds(name), null, null, null, true);
2545
        if (!inLiteral)
2546
          {
2547
            pushString(null, " ");
2548
          }
2549
        break;
2550
      }
2551
  }
2552
 
2553
  /**
2554
   * Parse an entity declaration.
2555
   * <pre>
2556
   * [70] EntityDecl ::= GEDecl | PEDecl
2557
   * [71] GEDecl ::= '&lt;!ENTITY' S Name S EntityDef S? '&gt;'
2558
   * [72] PEDecl ::= '&lt;!ENTITY' S '%' S Name S PEDef S? '&gt;'
2559
   * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
2560
   * [74] PEDef ::= EntityValue | ExternalID
2561
   * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2562
   *       | 'PUBLIC' S PubidLiteral S SystemLiteral
2563
   * [76] NDataDecl ::= S 'NDATA' S Name
2564
   * </pre>
2565
   * <p>NOTE: the '&lt;!ENTITY' has already been read.
2566
   */
2567
  private void parseEntityDecl()
2568
    throws Exception
2569
  {
2570
    boolean peFlag = false;
2571
    int flags = 0;
2572
 
2573
    // Check for a parameter entity.
2574
    expandPE = false;
2575
    requireWhitespace();
2576
    if (tryRead('%'))
2577
      {
2578
        peFlag = true;
2579
        requireWhitespace();
2580
      }
2581
    expandPE = true;
2582
 
2583
    // Read the entity name, and prepend
2584
    // '%' if necessary.
2585
    String name = readNmtoken(true);
2586
    //NE08
2587
    if (name.indexOf(':') >= 0)
2588
      {
2589
        error("Illegal character(':') in entity name ", name, null);
2590
      }
2591
    if (peFlag)
2592
      {
2593
        name = "%" + name;
2594
      }
2595
 
2596
    // Read the entity value.
2597
    requireWhitespace();
2598
    char c = readCh();
2599
    unread (c);
2600
    if (c == '"' || c == '\'')
2601
      {
2602
        // Internal entity ... replacement text has expanded refs
2603
        // to characters and PEs, but not to general entities
2604
        String value = readLiteral(flags);
2605
        setInternalEntity(name, value);
2606
      }
2607
    else
2608
      {
2609
        // Read the external IDs
2610
        ExternalIdentifiers ids = readExternalIds(false, false);
2611
 
2612
        // Check for NDATA declaration.
2613
        boolean white = tryWhitespace();
2614
        if (!peFlag && tryRead("NDATA"))
2615
          {
2616
            if (!white)
2617
              {
2618
                error("whitespace required before NDATA");
2619
              }
2620
            requireWhitespace();
2621
            String notationName = readNmtoken(true);
2622
            if (!skippedPE)
2623
              {
2624
                setExternalEntity(name, ENTITY_NDATA, ids, notationName);
2625
                handler.unparsedEntityDecl(name, ids.publicId, ids.systemId,
2626
                                           ids.baseUri, notationName);
2627
              }
2628
          }
2629
        else if (!skippedPE)
2630
          {
2631
            setExternalEntity(name, ENTITY_TEXT, ids, null);
2632
            handler.getDeclHandler()
2633
              .externalEntityDecl(name, ids.publicId,
2634
                                   handler.resolveURIs()
2635
                                   // FIXME: ASSUMES not skipped
2636
                                   // "false" forces error on bad URI
2637
                                   ? handler.absolutize(ids.baseUri,
2638
                                                        ids.systemId,
2639
                                                        false)
2640
                                   : ids.systemId);
2641
          }
2642
      }
2643
 
2644
    // Finish the declaration.
2645
    skipWhitespace();
2646
    require('>');
2647
  }
2648
 
2649
  /**
2650
   * Parse a notation declaration.
2651
   * <pre>
2652
   * [82] NotationDecl ::= '&lt;!NOTATION' S Name S
2653
   *    (ExternalID | PublicID) S? '&gt;'
2654
   * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2655
   * </pre>
2656
   * <P>NOTE: the '&lt;!NOTATION' has already been read.
2657
   */
2658
  private void parseNotationDecl()
2659
    throws Exception
2660
  {
2661
    String nname;
2662
    ExternalIdentifiers ids;
2663
 
2664
    requireWhitespace();
2665
    nname = readNmtoken(true);
2666
    //NE08
2667
    if (nname.indexOf(':') >= 0)
2668
      {
2669
        error("Illegal character(':') in notation name ", nname, null);
2670
      }
2671
    requireWhitespace();
2672
 
2673
    // Read the external identifiers.
2674
    ids = readExternalIds(true, false);
2675
 
2676
    // Register the notation.
2677
    setNotation(nname, ids);
2678
 
2679
    skipWhitespace();
2680
    require('>');
2681
  }
2682
 
2683
  /**
2684
   * Parse character data.
2685
   * <pre>
2686
   * [14] CharData ::= [^&lt;&amp;]* - ([^&lt;&amp;]* ']]&gt;' [^&lt;&amp;]*)
2687
   * </pre>
2688
   */
2689
  private void parseCharData()
2690
    throws Exception
2691
  {
2692
    char c;
2693
    int state = 0;
2694
    boolean pureWhite = false;
2695
 
2696
    // assert (dataBufferPos == 0);
2697
 
2698
    // are we expecting pure whitespace?  it might be dirty...
2699
    if ((currentElementContent == CONTENT_ELEMENTS) && !isDirtyCurrentElement)
2700
      {
2701
        pureWhite = true;
2702
      }
2703
 
2704
    // always report right out of readBuffer
2705
    // to minimize (pointless) buffer copies
2706
    while (true)
2707
      {
2708
        int lineAugment = 0;
2709
        int columnAugment = 0;
2710
        int i;
2711
 
2712
loop:
2713
        for (i = readBufferPos; i < readBufferLength; i++)
2714
          {
2715
            switch (c = readBuffer[i])
2716
              {
2717
              case '\n':
2718
                lineAugment++;
2719
                columnAugment = 0;
2720
                // pureWhite unmodified
2721
                break;
2722
              case '\r':  // should not happen!!
2723
              case '\t':
2724
              case ' ':
2725
                // pureWhite unmodified
2726
                columnAugment++;
2727
                break;
2728
              case '&':
2729
              case '<':
2730
                columnAugment++;
2731
                // pureWhite unmodified
2732
                // CLEAN end of text sequence
2733
                state = 1;
2734
                break loop;
2735
              case ']':
2736
                // that's not a whitespace char, and
2737
                // can not terminate pure whitespace either
2738
                pureWhite = false;
2739
                if ((i + 2) < readBufferLength)
2740
                  {
2741
                    if (readBuffer [i + 1] == ']'
2742
                        && readBuffer [i + 2] == '>')
2743
                      {
2744
                        // ERROR end of text sequence
2745
                        state = 2;
2746
                        break loop;
2747
                      }
2748
                  }
2749
                else
2750
                  {
2751
                    // FIXME missing two end-of-buffer cases
2752
                  }
2753
                columnAugment++;
2754
                break;
2755
              default:
2756
                if ((c < 0x0020 || c > 0xFFFD)
2757
                    || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
2758
                        && xmlVersion == XML_11))
2759
                  {
2760
                    error("illegal XML character U+"
2761
                          + Integer.toHexString(c));
2762
                  }
2763
                // that's not a whitespace char
2764
                pureWhite = false;
2765
                columnAugment++;
2766
              }
2767
          }
2768
 
2769
        // report text thus far
2770
        if (lineAugment > 0)
2771
          {
2772
            line += lineAugment;
2773
            column = columnAugment;
2774
          }
2775
        else
2776
          {
2777
            column += columnAugment;
2778
          }
2779
 
2780
        // report characters/whitspace
2781
        int length = i - readBufferPos;
2782
 
2783
        if (length != 0)
2784
          {
2785
            if (pureWhite)
2786
              {
2787
                handler.ignorableWhitespace(readBuffer,
2788
                                            readBufferPos, length);
2789
              }
2790
            else
2791
              {
2792
                handler.charData(readBuffer, readBufferPos, length);
2793
              }
2794
            readBufferPos = i;
2795
          }
2796
 
2797
        if (state != 0)
2798
          {
2799
            break;
2800
          }
2801
 
2802
        // fill next buffer from this entity, or
2803
        // pop stack and continue with previous entity
2804
        unread(readCh());
2805
      }
2806
    if (!pureWhite)
2807
      {
2808
        isDirtyCurrentElement = true;
2809
      }
2810
    // finish, maybe with error
2811
    if (state != 1)  // finish, no error
2812
      {
2813
        error("character data may not contain ']]>'");
2814
      }
2815
  }
2816
 
2817
  //////////////////////////////////////////////////////////////////////
2818
  // High-level reading and scanning methods.
2819
  //////////////////////////////////////////////////////////////////////
2820
 
2821
  /**
2822
   * Require whitespace characters.
2823
   */
2824
  private void requireWhitespace()
2825
    throws SAXException, IOException
2826
  {
2827
    char c = readCh();
2828
    if (isWhitespace(c))
2829
      {
2830
        skipWhitespace();
2831
      }
2832
    else
2833
      {
2834
        error("whitespace required", c, null);
2835
      }
2836
  }
2837
 
2838
  /**
2839
   * Skip whitespace characters.
2840
   * <pre>
2841
   * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2842
   * </pre>
2843
   */
2844
  private void skipWhitespace()
2845
    throws SAXException, IOException
2846
  {
2847
    // Start with a little cheat.  Most of
2848
    // the time, the white space will fall
2849
    // within the current read buffer; if
2850
    // not, then fall through.
2851
    if (USE_CHEATS)
2852
      {
2853
        int lineAugment = 0;
2854
        int columnAugment = 0;
2855
 
2856
loop:
2857
        for (int i = readBufferPos; i < readBufferLength; i++)
2858
          {
2859
            switch (readBuffer[i])
2860
              {
2861
              case ' ':
2862
              case '\t':
2863
              case '\r':
2864
                columnAugment++;
2865
                break;
2866
              case '\n':
2867
                lineAugment++;
2868
                columnAugment = 0;
2869
                break;
2870
              case '%':
2871
                if (expandPE)
2872
                  {
2873
                    break loop;
2874
                  }
2875
                // else fall through...
2876
              default:
2877
                readBufferPos = i;
2878
                if (lineAugment > 0)
2879
                  {
2880
                    line += lineAugment;
2881
                    column = columnAugment;
2882
                  }
2883
                else
2884
                  {
2885
                    column += columnAugment;
2886
                  }
2887
                return;
2888
              }
2889
          }
2890
      }
2891
 
2892
    // OK, do it the slow way.
2893
    char c = readCh ();
2894
    while (isWhitespace(c))
2895
      {
2896
        c = readCh();
2897
      }
2898
    unread(c);
2899
  }
2900
 
2901
  /**
2902
   * Read a name or (when parsing an enumeration) name token.
2903
   * <pre>
2904
   * [5] Name ::= (Letter | '_' | ':') (NameChar)*
2905
   * [7] Nmtoken ::= (NameChar)+
2906
   * </pre>
2907
   */
2908
  private String readNmtoken(boolean isName)
2909
    throws SAXException, IOException
2910
  {
2911
    char c;
2912
 
2913
    if (USE_CHEATS)
2914
      {
2915
loop:
2916
        for (int i = readBufferPos; i < readBufferLength; i++)
2917
          {
2918
            c = readBuffer[i];
2919
            switch (c)
2920
              {
2921
              case '%':
2922
                if (expandPE)
2923
                  {
2924
                    break loop;
2925
                  }
2926
                // else fall through...
2927
 
2928
                // What may legitimately come AFTER a name/nmtoken?
2929
              case '<': case '>': case '&':
2930
              case ',': case '|': case '*': case '+': case '?':
2931
              case ')':
2932
              case '=':
2933
              case '\'': case '"':
2934
              case '[':
2935
              case ' ': case '\t': case '\r': case '\n':
2936
              case ';':
2937
              case '/':
2938
                int start = readBufferPos;
2939
                if (i == start)
2940
                  {
2941
                    error("name expected", readBuffer[i], null);
2942
                  }
2943
                readBufferPos = i;
2944
                return intern(readBuffer, start, i - start);
2945
 
2946
              default:
2947
                // FIXME ... per IBM's OASIS test submission, these:
2948
                //   ?    U+06dd 
2949
                //   Combining  U+309B
2950
                //these switches are kind of ugly but at least we won't
2951
                //have to go over the whole lits for each char
2952
                if (isName && i == readBufferPos)
2953
                  {
2954
                    char c2 = (char) (c & 0x00f0);
2955
                    switch (c & 0xff00)
2956
                      {
2957
                        //starting with 01
2958
                      case 0x0100:
2959
                        switch (c2)
2960
                          {
2961
                          case 0x0030:
2962
                            if (c == 0x0132 || c == 0x0133 || c == 0x013f)
2963
                              {
2964
                                error("Not a name start character, U+"
2965
                                      + Integer.toHexString(c));
2966
                              }
2967
                            break;
2968
                          case 0x0040:
2969
                            if (c == 0x0140 || c == 0x0149)
2970
                              {
2971
                                error("Not a name start character, U+"
2972
                                      + Integer.toHexString(c));
2973
                              }
2974
                            break;
2975
                          case 0x00c0:
2976
                            if (c == 0x01c4 || c == 0x01cc)
2977
                              {
2978
                                error("Not a name start character, U+"
2979
                                      + Integer.toHexString(c));
2980
                              }
2981
                            break;
2982
                          case 0x00f0:
2983
                            if (c == 0x01f1 || c == 0x01f3)
2984
                              {
2985
                                error("Not a name start character, U+"
2986
                                      + Integer.toHexString(c));
2987
                              }
2988
                            break;
2989
                          case 0x00b0:
2990
                            if (c == 0x01f1 || c == 0x01f3)
2991
                              {
2992
                                error("Not a name start character, U+"
2993
                                      + Integer.toHexString(c));
2994
                              }
2995
                            break;
2996
                          default:
2997
                            if (c == 0x017f)
2998
                              {
2999
                                error("Not a name start character, U+"
3000
                                      + Integer.toHexString(c));
3001
                              }
3002
                          }
3003
 
3004
                        break;
3005
                        //starting with 11
3006
                      case 0x1100:
3007
                        switch (c2)
3008
                          {
3009
                          case 0x0000:
3010
                            if (c == 0x1104 || c == 0x1108 ||
3011
                                c == 0x110a || c == 0x110d)
3012
                              {
3013
                                error("Not a name start character, U+"
3014
                                      + Integer.toHexString(c));
3015
                              }
3016
                            break;
3017
                          case 0x0030:
3018
                            if (c == 0x113b || c == 0x113f)
3019
                              {
3020
                                error("Not a name start character, U+"
3021
                                      + Integer.toHexString(c));
3022
                              }
3023
                            break;
3024
                          case 0x0040:
3025
                            if (c == 0x1141 || c == 0x114d
3026
                                || c == 0x114f )
3027
                              {
3028
                                error("Not a name start character, U+"
3029
                                      + Integer.toHexString(c));
3030
                              }
3031
                            break;
3032
                          case 0x0050:
3033
                            if (c == 0x1151 || c == 0x1156)
3034
                              {
3035
                                error("Not a name start character, U+"
3036
                                      + Integer.toHexString(c));
3037
                              }
3038
                            break;
3039
                          case 0x0060:
3040
                            if (c == 0x1162 || c == 0x1164
3041
                                || c == 0x1166 || c == 0x116b
3042
                                || c == 0x116f)
3043
                              {
3044
                                error("Not a name start character, U+"
3045
                                      + Integer.toHexString(c));
3046
                              }
3047
                            break;
3048
                          case 0x00b0:
3049
                            if (c == 0x11b6 || c == 0x11b9
3050
                                || c == 0x11bb || c == 0x116f)
3051
                              {
3052
                                error("Not a name start character, U+"
3053
                                      + Integer.toHexString(c));
3054
                              }
3055
                            break;
3056
                          default:
3057
                            if (c == 0x1174 || c == 0x119f
3058
                                || c == 0x11ac || c == 0x11c3
3059
                                || c == 0x11f1)
3060
                              {
3061
                                error("Not a name start character, U+"
3062
                                      + Integer.toHexString(c));
3063
                              }
3064
                          }
3065
                        break;
3066
                      default:
3067
                        if (c == 0x0e46 || c == 0x1011
3068
                            || c == 0x212f || c == 0x0587
3069
                            || c == 0x0230 )
3070
                          {
3071
                            error("Not a name start character, U+"
3072
                                  + Integer.toHexString(c));
3073
                          }
3074
                      }
3075
                  }
3076
                // punt on exact tests from Appendix A; approximate
3077
                // them using the Unicode ID start/part rules
3078
                if (i == readBufferPos && isName)
3079
                  {
3080
                    if (!Character.isUnicodeIdentifierStart(c)
3081
                        && c != ':' && c != '_')
3082
                      {
3083
                        error("Not a name start character, U+"
3084
                              + Integer.toHexString(c));
3085
                      }
3086
                  }
3087
                else if (!Character.isUnicodeIdentifierPart(c)
3088
                         && c != '-' && c != ':' && c != '_' && c != '.'
3089
                         && !isExtender(c))
3090
                  {
3091
                    error("Not a name character, U+"
3092
                          + Integer.toHexString(c));
3093
                  }
3094
              }
3095
          }
3096
      }
3097
 
3098
    nameBufferPos = 0;
3099
 
3100
    // Read the first character.
3101
loop:
3102
    while (true)
3103
      {
3104
        c = readCh();
3105
        switch (c)
3106
          {
3107
          case '%':
3108
          case '<': case '>': case '&':
3109
          case ',': case '|': case '*': case '+': case '?':
3110
          case ')':
3111
          case '=':
3112
          case '\'': case '"':
3113
          case '[':
3114
          case ' ': case '\t': case '\n': case '\r':
3115
          case ';':
3116
          case '/':
3117
            unread(c);
3118
            if (nameBufferPos == 0)
3119
              {
3120
                error ("name expected");
3121
              }
3122
            // punt on exact tests from Appendix A, but approximate them
3123
            if (isName
3124
                && !Character.isUnicodeIdentifierStart(nameBuffer[0])
3125
                && ":_".indexOf(nameBuffer[0]) == -1)
3126
              {
3127
                error("Not a name start character, U+"
3128
                      + Integer.toHexString(nameBuffer[0]));
3129
              }
3130
            String s = intern(nameBuffer, 0, nameBufferPos);
3131
            nameBufferPos = 0;
3132
            return s;
3133
          default:
3134
            // punt on exact tests from Appendix A, but approximate them
3135
 
3136
            if ((nameBufferPos != 0 || !isName)
3137
                && !Character.isUnicodeIdentifierPart(c)
3138
                && ":-_.".indexOf(c) == -1
3139
                && !isExtender(c))
3140
              {
3141
                error("Not a name character, U+"
3142
                      + Integer.toHexString(c));
3143
              }
3144
            if (nameBufferPos >= nameBuffer.length)
3145
              {
3146
                nameBuffer =
3147
                  (char[]) extendArray(nameBuffer,
3148
                                       nameBuffer.length, nameBufferPos);
3149
              }
3150
            nameBuffer[nameBufferPos++] = c;
3151
          }
3152
      }
3153
  }
3154
 
3155
  private static boolean isExtender(char c)
3156
  {
3157
    // [88] Extender ::= ...
3158
    return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
3159
      || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
3160
      || (c >= 0x3031 && c <= 0x3035)
3161
      || (c >= 0x309d && c <= 0x309e)
3162
      || (c >= 0x30fc && c <= 0x30fe);
3163
  }
3164
 
3165
  /**
3166
   * Read a literal.  With matching single or double quotes as
3167
   * delimiters (and not embedded!) this is used to parse:
3168
   * <pre>
3169
   *  [9] EntityValue ::= ... ([^%&amp;] | PEReference | Reference)* ...
3170
   *  [10] AttValue ::= ... ([^<&] | Reference)* ...
3171
   *  [11] SystemLiteral ::= ... (URLchar - "'")* ...
3172
   *  [12] PubidLiteral ::= ... (PubidChar - "'")* ...
3173
   * </pre>
3174
   * as well as the quoted strings in XML and text declarations
3175
   * (for version, encoding, and standalone) which have their
3176
   * own constraints.
3177
   */
3178
  private String readLiteral(int flags)
3179
    throws SAXException, IOException
3180
  {
3181
    char delim, c;
3182
    int startLine = line;
3183
    boolean saved = expandPE;
3184
    boolean savedReport = doReport;
3185
 
3186
    // Find the first delimiter.
3187
    delim = readCh();
3188
    if (delim != '"' && delim != '\'')
3189
      {
3190
        error("expected '\"' or \"'\"", delim, null);
3191
        return null;
3192
      }
3193
    inLiteral = true;
3194
    if ((flags & LIT_DISABLE_PE) != 0)
3195
      {
3196
        expandPE = false;
3197
      }
3198
    doReport = false;
3199
 
3200
    // Each level of input source has its own buffer; remember
3201
    // ours, so we won't read the ending delimiter from any
3202
    // other input source, regardless of entity processing.
3203
    char[] ourBuf = readBuffer;
3204
 
3205
    // Read the literal.
3206
    try
3207
      {
3208
        c = readCh();
3209
        boolean ampRead = false;
3210
loop:
3211
        while (! (c == delim && readBuffer == ourBuf))
3212
          {
3213
            switch (c)
3214
              {
3215
                // attributes and public ids are normalized
3216
                // in almost the same ways
3217
              case '\n':
3218
              case '\r':
3219
                if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0)
3220
                  {
3221
                    c = ' ';
3222
                  }
3223
                break;
3224
              case '\t':
3225
                if ((flags & LIT_ATTRIBUTE) != 0)
3226
                  {
3227
                    c = ' ';
3228
                  }
3229
                break;
3230
              case '&':
3231
                c = readCh();
3232
                // Char refs are expanded immediately, except for
3233
                // all the cases where it's deferred.
3234
                if (c == '#')
3235
                  {
3236
                    if ((flags & LIT_DISABLE_CREF) != 0)
3237
                      {
3238
                        dataBufferAppend('&');
3239
                        break;
3240
                      }
3241
                    parseCharRef(false /* Do not do flushDataBuffer */);
3242
 
3243
                    // exotic WFness risk: this is an entity literal,
3244
                    // dataBuffer [dataBufferPos - 1] == '&', and
3245
                    // following chars are a _partial_ entity/char ref
3246
 
3247
                    // It looks like an entity ref ...
3248
                  }
3249
                else
3250
                  {
3251
                    unread(c);
3252
                    // Expand it?
3253
                    if ((flags & LIT_ENTITY_REF) > 0)
3254
                      {
3255
                        parseEntityRef(false);
3256
                        if (String.valueOf(readBuffer).equals("&#38;"))
3257
                          {
3258
                            ampRead = true;
3259
                          }
3260
                        //Is it just data?
3261
                      }
3262
                    else if ((flags & LIT_DISABLE_EREF) != 0)
3263
                      {
3264
                        dataBufferAppend('&');
3265
 
3266
                        // OK, it will be an entity ref -- expanded later.
3267
                      }
3268
                    else
3269
                      {
3270
                        String name = readNmtoken(true);
3271
                        require(';');
3272
                        dataBufferAppend('&');
3273
                        dataBufferAppend(name);
3274
                        dataBufferAppend(';');
3275
                      }
3276
                  }
3277
                c = readCh();
3278
                continue loop;
3279
 
3280
              case '<':
3281
                // and why?  Perhaps so "&foo;" expands the same
3282
                // inside and outside an attribute?
3283
                if ((flags & LIT_ATTRIBUTE) != 0)
3284
                  {
3285
                    error("attribute values may not contain '<'");
3286
                  }
3287
                break;
3288
 
3289
                // We don't worry about case '%' and PE refs, readCh does.
3290
 
3291
              default:
3292
                break;
3293
              }
3294
            dataBufferAppend(c);
3295
            c = readCh();
3296
          }
3297
      }
3298
    catch (EOFException e)
3299
      {
3300
        error("end of input while looking for delimiter (started on line "
3301
              + startLine + ')', null, new Character(delim).toString());
3302
      }
3303
    inLiteral = false;
3304
    expandPE = saved;
3305
    doReport = savedReport;
3306
 
3307
    // Normalise whitespace if necessary.
3308
    if ((flags & LIT_NORMALIZE) > 0)
3309
      {
3310
        dataBufferNormalize();
3311
      }
3312
 
3313
    // Return the value.
3314
    return dataBufferToString();
3315
  }
3316
 
3317
  /**
3318
   * Try reading external identifiers.
3319
   * A system identifier is not required for notations.
3320
   * @param inNotation Are we parsing a notation decl?
3321
   * @param isSubset Parsing external subset decl (may be omitted)?
3322
   * @return A three-member String array containing the identifiers,
3323
   *  or nulls. Order: public, system, baseURI.
3324
   */
3325
  private ExternalIdentifiers readExternalIds(boolean inNotation,
3326
                                              boolean isSubset)
3327
    throws Exception
3328
  {
3329
    char c;
3330
    ExternalIdentifiers ids = new ExternalIdentifiers();
3331
    int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
3332
 
3333
    if (tryRead("PUBLIC"))
3334
      {
3335
        requireWhitespace();
3336
        ids.publicId = readLiteral(LIT_NORMALIZE | LIT_PUBID | flags);
3337
        if (inNotation)
3338
          {
3339
            skipWhitespace();
3340
            c = readCh();
3341
            unread(c);
3342
            if (c == '"' || c == '\'')
3343
              {
3344
                ids.systemId = readLiteral(flags);
3345
              }
3346
          }
3347
        else
3348
          {
3349
            requireWhitespace();
3350
            ids.systemId = readLiteral(flags);
3351
          }
3352
 
3353
        for (int i = 0; i < ids.publicId.length(); i++)
3354
          {
3355
            c = ids.publicId.charAt(i);
3356
            if (c >= 'a' && c <= 'z')
3357
              {
3358
                continue;
3359
              }
3360
            if (c >= 'A' && c <= 'Z')
3361
              {
3362
                continue;
3363
              }
3364
            if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf(c) != -1)
3365
              {
3366
                continue;
3367
              }
3368
            error("illegal PUBLIC id character U+"
3369
                  + Integer.toHexString(c));
3370
          }
3371
      }
3372
    else if (tryRead("SYSTEM"))
3373
      {
3374
        requireWhitespace();
3375
        ids.systemId = readLiteral(flags);
3376
      }
3377
    else if (!isSubset)
3378
      {
3379
        error("missing SYSTEM or PUBLIC keyword");
3380
      }
3381
 
3382
    if (ids.systemId != null)
3383
      {
3384
        if (ids.systemId.indexOf('#') != -1)
3385
          {
3386
            handler.verror("SYSTEM id has a URI fragment: " + ids.systemId);
3387
          }
3388
        ids.baseUri = handler.getSystemId();
3389
        if (ids.baseUri == null && uriWarnings)
3390
          {
3391
            handler.warn("No base URI; hope URI is absolute: "
3392
                         + ids.systemId);
3393
          }
3394
      }
3395
 
3396
    return ids;
3397
  }
3398
 
3399
  /**
3400
   * Test if a character is whitespace.
3401
   * <pre>
3402
   * [3] S ::= (#x20 | #x9 | #xd | #xa)+
3403
   * </pre>
3404
   * @param c The character to test.
3405
   * @return true if the character is whitespace.
3406
   */
3407
  private final boolean isWhitespace(char c)
3408
  {
3409
    if (c > 0x20)
3410
      {
3411
        return false;
3412
      }
3413
    if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d)
3414
      {
3415
        return true;
3416
      }
3417
    return false;  // illegal ...
3418
  }
3419
 
3420
  //////////////////////////////////////////////////////////////////////
3421
  // Utility routines.
3422
  //////////////////////////////////////////////////////////////////////
3423
 
3424
  /**
3425
   * Add a character to the data buffer.
3426
   */
3427
  private void dataBufferAppend(char c)
3428
  {
3429
    // Expand buffer if necessary.
3430
    if (dataBufferPos >= dataBuffer.length)
3431
      {
3432
        dataBuffer = (char[]) extendArray(dataBuffer,
3433
                                          dataBuffer.length, dataBufferPos);
3434
      }
3435
    dataBuffer[dataBufferPos++] = c;
3436
  }
3437
 
3438
  /**
3439
   * Add a string to the data buffer.
3440
   */
3441
  private void dataBufferAppend(String s)
3442
  {
3443
    dataBufferAppend(s.toCharArray(), 0, s.length());
3444
  }
3445
 
3446
  /**
3447
   * Append (part of) a character array to the data buffer.
3448
   */
3449
  private void dataBufferAppend(char[] ch, int start, int length)
3450
  {
3451
    dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length,
3452
                                      dataBufferPos + length);
3453
 
3454
    System.arraycopy(ch, start, dataBuffer, dataBufferPos, length);
3455
    dataBufferPos += length;
3456
  }
3457
 
3458
  /**
3459
   * Normalise space characters in the data buffer.
3460
   */
3461
  private void dataBufferNormalize()
3462
  {
3463
    int i = 0;
3464
    int j = 0;
3465
    int end = dataBufferPos;
3466
 
3467
    // Skip spaces at the start.
3468
    while (j < end && dataBuffer[j] == ' ')
3469
      {
3470
        j++;
3471
      }
3472
 
3473
    // Skip whitespace at the end.
3474
    while (end > j && dataBuffer[end - 1] == ' ')
3475
      {
3476
        end --;
3477
      }
3478
 
3479
    // Start copying to the left.
3480
    while (j < end)
3481
      {
3482
 
3483
        char c = dataBuffer[j++];
3484
 
3485
        // Normalise all other spaces to
3486
        // a single space.
3487
        if (c == ' ')
3488
          {
3489
            while (j < end && dataBuffer[j++] == ' ')
3490
              {
3491
                continue;
3492
              }
3493
            dataBuffer[i++] = ' ';
3494
            dataBuffer[i++] = dataBuffer[j - 1];
3495
          }
3496
        else
3497
          {
3498
            dataBuffer[i++] = c;
3499
          }
3500
      }
3501
 
3502
    // The new length is <= the old one.
3503
    dataBufferPos = i;
3504
  }
3505
 
3506
  /**
3507
   * Convert the data buffer to a string.
3508
   */
3509
  private String dataBufferToString()
3510
  {
3511
    String s = new String(dataBuffer, 0, dataBufferPos);
3512
    dataBufferPos = 0;
3513
    return s;
3514
  }
3515
 
3516
  /**
3517
   * Flush the contents of the data buffer to the handler, as
3518
   * appropriate, and reset the buffer for new input.
3519
   */
3520
  private void dataBufferFlush()
3521
    throws SAXException
3522
  {
3523
    if (currentElementContent == CONTENT_ELEMENTS
3524
        && dataBufferPos > 0
3525
        && !inCDATA)
3526
      {
3527
        // We can't just trust the buffer to be whitespace, there
3528
        // are (error) cases when it isn't
3529
        for (int i = 0; i < dataBufferPos; i++)
3530
          {
3531
            if (!isWhitespace(dataBuffer[i]))
3532
              {
3533
                handler.charData(dataBuffer, 0, dataBufferPos);
3534
                dataBufferPos = 0;
3535
              }
3536
          }
3537
        if (dataBufferPos > 0)
3538
          {
3539
            handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos);
3540
            dataBufferPos = 0;
3541
          }
3542
      }
3543
    else if (dataBufferPos > 0)
3544
      {
3545
        handler.charData(dataBuffer, 0, dataBufferPos);
3546
        dataBufferPos = 0;
3547
      }
3548
  }
3549
 
3550
  /**
3551
   * Require a string to appear, or throw an exception.
3552
   * <p><em>Precondition:</em> Entity expansion is not required.
3553
   * <p><em>Precondition:</em> data buffer has no characters that
3554
   * will get sent to the application.
3555
   */
3556
  private void require(String delim)
3557
    throws SAXException, IOException
3558
  {
3559
    int length = delim.length();
3560
    char[] ch;
3561
 
3562
    if (length < dataBuffer.length)
3563
      {
3564
        ch = dataBuffer;
3565
        delim.getChars(0, length, ch, 0);
3566
      }
3567
    else
3568
      {
3569
        ch = delim.toCharArray();
3570
      }
3571
 
3572
    if (USE_CHEATS && length <= (readBufferLength - readBufferPos))
3573
      {
3574
        int offset = readBufferPos;
3575
 
3576
        for (int i = 0; i < length; i++, offset++)
3577
          {
3578
            if (ch[i] != readBuffer[offset])
3579
              {
3580
                error ("required string", null, delim);
3581
              }
3582
          }
3583
        readBufferPos = offset;
3584
 
3585
      }
3586
    else
3587
      {
3588
        for (int i = 0; i < length; i++)
3589
          {
3590
            require(ch[i]);
3591
          }
3592
      }
3593
  }
3594
 
3595
  /**
3596
   * Require a character to appear, or throw an exception.
3597
   */
3598
  private void require(char delim)
3599
    throws SAXException, IOException
3600
  {
3601
    char c = readCh();
3602
 
3603
    if (c != delim)
3604
      {
3605
        error("required character", c, new Character(delim).toString());
3606
      }
3607
  }
3608
 
3609
  /**
3610
   * Create an interned string from a character array.
3611
   * &AElig;lfred uses this method to create an interned version
3612
   * of all names and name tokens, so that it can test equality
3613
   * with <code>==</code> instead of <code>String.equals ()</code>.
3614
   *
3615
   * <p>This is much more efficient than constructing a non-interned
3616
   * string first, and then interning it.
3617
   *
3618
   * @param ch an array of characters for building the string.
3619
   * @param start the starting position in the array.
3620
   * @param length the number of characters to place in the string.
3621
   * @return an interned string.
3622
   * @see #intern (String)
3623
   * @see java.lang.String#intern
3624
   */
3625
  public String intern(char[] ch, int start, int length)
3626
  {
3627
    int index = 0;
3628
    int hash = 0;
3629
    Object[] bucket;
3630
 
3631
    // Generate a hash code.  This is a widely used string hash,
3632
    // often attributed to Brian Kernighan.
3633
    for (int i = start; i < start + length; i++)
3634
      {
3635
        hash = 31 * hash + ch[i];
3636
      }
3637
    hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH;
3638
 
3639
    // Get the bucket -- consists of {array,String} pairs
3640
    if ((bucket = symbolTable[hash]) == null)
3641
      {
3642
        // first string in this bucket
3643
        bucket = new Object[8];
3644
 
3645
        // Search for a matching tuple, and
3646
        // return the string if we find one.
3647
      }
3648
    else
3649
      {
3650
        while (index < bucket.length)
3651
          {
3652
            char[] chFound = (char[]) bucket[index];
3653
 
3654
            // Stop when we hit an empty entry.
3655
            if (chFound == null)
3656
              {
3657
                break;
3658
              }
3659
 
3660
            // If they're the same length, check for a match.
3661
            if (chFound.length == length)
3662
              {
3663
                for (int i = 0; i < chFound.length; i++)
3664
                  {
3665
                    // continue search on failure
3666
                    if (ch[start + i] != chFound[i])
3667
                      {
3668
                        break;
3669
                      }
3670
                    else if (i == length - 1)
3671
                      {
3672
                        // That's it, we have a match!
3673
                        return (String) bucket[index + 1];
3674
                      }
3675
                  }
3676
              }
3677
            index += 2;
3678
          }
3679
        // Not found -- we'll have to add it.
3680
 
3681
        // Do we have to grow the bucket?
3682
        bucket = (Object[]) extendArray(bucket, bucket.length, index);
3683
      }
3684
    symbolTable[hash] = bucket;
3685
 
3686
    // OK, add it to the end of the bucket -- "local" interning.
3687
    // Intern "globally" to let applications share interning benefits.
3688
    // That is, "!=" and "==" work on our strings, not just equals().
3689
    String s = new String(ch, start, length).intern();
3690
    bucket[index] = s.toCharArray();
3691
    bucket[index + 1] = s;
3692
    return s;
3693
  }
3694
 
3695
  /**
3696
   * Ensure the capacity of an array, allocating a new one if
3697
   * necessary.  Usually extends only for name hash collisions.
3698
   */
3699
  private Object extendArray(Object array, int currentSize, int requiredSize)
3700
  {
3701
    if (requiredSize < currentSize)
3702
      {
3703
        return array;
3704
      }
3705
    else
3706
      {
3707
        Object newArray = null;
3708
        int newSize = currentSize * 2;
3709
 
3710
        if (newSize <= requiredSize)
3711
          {
3712
            newSize = requiredSize + 1;
3713
          }
3714
 
3715
        if (array instanceof char[])
3716
          {
3717
            newArray = new char[newSize];
3718
          }
3719
        else if (array instanceof Object[])
3720
          {
3721
            newArray = new Object[newSize];
3722
          }
3723
        else
3724
          {
3725
            throw new RuntimeException();
3726
          }
3727
 
3728
        System.arraycopy(array, 0, newArray, 0, currentSize);
3729
        return newArray;
3730
      }
3731
  }
3732
 
3733
  //////////////////////////////////////////////////////////////////////
3734
  // XML query routines.
3735
  //////////////////////////////////////////////////////////////////////
3736
 
3737
  /**
   * Report the parser's standalone flag.
   * @return the current value of <code>docIsStandalone</code>
   *   (presumably set from the document's standalone declaration --
   *   TODO confirm where it is assigned).
   */
  boolean isStandalone()
3738
  {
3739
    return docIsStandalone;
3740
  }
3741
 
3742
  //
3743
  // Elements
3744
  //
3745
 
3746
  private int getContentType(ElementDecl element, int defaultType)
3747
  {
3748
    int retval;
3749
 
3750
    if (element == null)
3751
      {
3752
        return defaultType;
3753
      }
3754
    retval = element.contentType;
3755
    if (retval == CONTENT_UNDECLARED)
3756
      {
3757
        retval = defaultType;
3758
      }
3759
    return retval;
3760
  }
3761
 
3762
  /**
3763
   * Look up the content type of an element.
3764
   * @param name The element type name.
3765
   * @return An integer constant representing the content type.
3766
   * @see #CONTENT_UNDECLARED
3767
   * @see #CONTENT_ANY
3768
   * @see #CONTENT_EMPTY
3769
   * @see #CONTENT_MIXED
3770
   * @see #CONTENT_ELEMENTS
3771
   */
3772
  public int getElementContentType(String name)
3773
  {
3774
    ElementDecl element = (ElementDecl) elementInfo.get(name);
3775
    return getContentType(element, CONTENT_UNDECLARED);
3776
  }
3777
 
3778
  /**
3779
   * Register an element.
3780
   * Array format:
3781
   *  [0] element type name
3782
   *  [1] content model (mixed, elements only)
3783
   *  [2] attribute hash table
3784
   */
3785
  private void setElement(String name, int contentType,
3786
                          String contentModel, HashMap attributes)
3787
    throws SAXException
3788
  {
3789
    if (skippedPE)
3790
      {
3791
        return;
3792
      }
3793
 
3794
    ElementDecl element = (ElementDecl) elementInfo.get(name);
3795
 
3796
    // first <!ELEMENT ...> or <!ATTLIST ...> for this type?
3797
    if (element == null)
3798
      {
3799
        element = new ElementDecl();
3800
        element.contentType = contentType;
3801
        element.contentModel = contentModel;
3802
        element.attributes = attributes;
3803
        elementInfo.put(name, element);
3804
        return;
3805
      }
3806
 
3807
    // <!ELEMENT ...> declaration?
3808
    if (contentType != CONTENT_UNDECLARED)
3809
      {
3810
        // ... following an associated <!ATTLIST ...>
3811
        if (element.contentType == CONTENT_UNDECLARED)
3812
          {
3813
            element.contentType = contentType;
3814
            element.contentModel = contentModel;
3815
          }
3816
        else
3817
          {
3818
            // VC: Unique Element Type Declaration
3819
            handler.verror("multiple declarations for element type: "
3820
                           + name);
3821
          }
3822
      }
3823
 
3824
    // first <!ATTLIST ...>, before <!ELEMENT ...> ?
3825
    else if (attributes != null)
3826
      {
3827
        element.attributes = attributes;
3828
      }
3829
  }
3830
 
3831
  /**
3832
   * Look up the attribute hash table for an element.
3833
   * The hash table is the second item in the element array.
3834
   */
3835
  private HashMap getElementAttributes(String name)
3836
  {
3837
    ElementDecl element = (ElementDecl) elementInfo.get(name);
3838
    return (element == null) ? null : element.attributes;
3839
  }
3840
 
3841
  //
3842
  // Attributes
3843
  //
3844
 
3845
  /**
3846
   * Get the declared attributes for an element type.
3847
   * @param elname The name of the element type.
3848
   * @return An iterator over all the attributes declared for
3849
   *   a specific element type.  The results will be valid only
3850
   *   after the DTD (if any) has been parsed.
3851
   * @see #getAttributeType
3852
   * @see #getAttributeEnumeration
3853
   * @see #getAttributeDefaultValueType
3854
   * @see #getAttributeDefaultValue
3855
   * @see #getAttributeExpandedValue
3856
   */
3857
  private Iterator declaredAttributes(ElementDecl element)
3858
  {
3859
    HashMap attlist;
3860
 
3861
    if (element == null)
3862
      {
3863
        return null;
3864
      }
3865
    if ((attlist = element.attributes) == null)
3866
      {
3867
        return null;
3868
      }
3869
    return attlist.keySet().iterator();
3870
  }
3871
 
3872
  /**
3873
   * Get the declared attributes for an element type.
3874
   * @param elname The name of the element type.
3875
   * @return An iterator over all the attributes declared for
3876
   *   a specific element type.  The results will be valid only
3877
   *   after the DTD (if any) has been parsed.
3878
   * @see #getAttributeType
3879
   * @see #getAttributeEnumeration
3880
   * @see #getAttributeDefaultValueType
3881
   * @see #getAttributeDefaultValue
3882
   * @see #getAttributeExpandedValue
3883
   */
3884
  public Iterator declaredAttributes(String elname)
3885
  {
3886
    return declaredAttributes((ElementDecl) elementInfo.get(elname));
3887
  }
3888
 
3889
  /**
3890
   * Retrieve the declared type of an attribute.
3891
   * @param name The name of the associated element.
3892
   * @param aname The name of the attribute.
3893
   * @return An interend string denoting the type, or null
3894
   *  indicating an undeclared attribute.
3895
   */
3896
  public String getAttributeType(String name, String aname)
3897
  {
3898
    AttributeDecl attribute = getAttribute(name, aname);
3899
    return (attribute == null) ? null : attribute.type;
3900
  }
3901
 
3902
  /**
3903
   * Retrieve the allowed values for an enumerated attribute type.
3904
   * @param name The name of the associated element.
3905
   * @param aname The name of the attribute.
3906
   * @return A string containing the token list.
3907
   */
3908
  public String getAttributeEnumeration(String name, String aname)
3909
  {
3910
    AttributeDecl attribute = getAttribute(name, aname);
3911
    // assert:  attribute.enumeration is "ENUMERATION" or "NOTATION"
3912
    return (attribute == null) ? null : attribute.enumeration;
3913
  }
3914
 
3915
  /**
3916
   * Retrieve the default value of a declared attribute.
3917
   * @param name The name of the associated element.
3918
   * @param aname The name of the attribute.
3919
   * @return The default value, or null if the attribute was
3920
   *   #IMPLIED or simply undeclared and unspecified.
3921
   * @see #getAttributeExpandedValue
3922
   */
3923
  public String getAttributeDefaultValue(String name, String aname)
3924
  {
3925
    AttributeDecl attribute = getAttribute(name, aname);
3926
    return (attribute == null) ? null : attribute.value;
3927
  }
3928
 
3929
    /*
3930
 
3931
// FIXME:  Leaving this in, until W3C finally resolves the confusion
3932
// between parts of the XML 2nd REC about when entity declarations
3933
// are guaranteed to be known.  Current code matches what section 5.1
3934
// (conformance) describes, but some readings of the self-contradicting
3935
// text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that
3936
// attribute expansion/normalization must be deferred in some cases
3937
// (just TRY to identify them!).
3938
 
3939
     * Retrieve the expanded value of a declared attribute.
3940
     * <p>General entities (and char refs) will be expanded (once).
3941
     * @param name The name of the associated element.
3942
     * @param aname The name of the attribute.
3943
     * @return The expanded default value, or null if the attribute was
3944
     *   #IMPLIED or simply undeclared
3945
     * @see #getAttributeDefaultValue
3946
    public String getAttributeExpandedValue (String name, String aname)
3947
    throws Exception
3948
    {
3949
  AttributeDecl attribute = getAttribute (name, aname);
3950
 
3951
  if (attribute == null) {
3952
      return null;
3953
  } else if (attribute.defaultValue == null && attribute.value != null) {
3954
      // we MUST use the same buf for both quotes else the literal
3955
      // can't be properly terminated
3956
      char buf [] = new char [1];
3957
      int  flags = LIT_ENTITY_REF | LIT_ATTRIBUTE;
3958
      String type = getAttributeType (name, aname);
3959
 
3960
      if (type != "CDATA" && type != null)
3961
    flags |= LIT_NORMALIZE;
3962
      buf [0] = '"';
3963
      pushCharArray (null, buf, 0, 1);
3964
      pushString (null, attribute.value);
3965
      pushCharArray (null, buf, 0, 1);
3966
      attribute.defaultValue = readLiteral (flags);
3967
  }
3968
  return attribute.defaultValue;
3969
    }
3970
     */
3971
 
3972
  /**
3973
   * Retrieve the default value mode of a declared attribute.
3974
   * @see #ATTRIBUTE_DEFAULT_SPECIFIED
3975
   * @see #ATTRIBUTE_DEFAULT_IMPLIED
3976
   * @see #ATTRIBUTE_DEFAULT_REQUIRED
3977
   * @see #ATTRIBUTE_DEFAULT_FIXED
3978
   */
3979
  public int getAttributeDefaultValueType(String name, String aname)
3980
  {
3981
    AttributeDecl attribute = getAttribute(name, aname);
3982
    return (attribute == null) ? ATTRIBUTE_DEFAULT_UNDECLARED :
3983
      attribute.valueType;
3984
  }
3985
 
3986
  /**
3987
   * Register an attribute declaration for later retrieval.
3988
   * Format:
3989
   * - String type
3990
   * - String default value
3991
   * - int value type
3992
   * - enumeration
3993
   * - processed default value
3994
   */
3995
  private void setAttribute(String elName, String name, String type,
3996
                            String enumeration, String value, int valueType)
3997
    throws Exception
3998
  {
3999
    HashMap attlist;
4000
 
4001
    if (skippedPE)
4002
      {
4003
        return;
4004
      }
4005
 
4006
    // Create a new hashtable if necessary.
4007
    attlist = getElementAttributes(elName);
4008
    if (attlist == null)
4009
      {
4010
        attlist = new HashMap();
4011
      }
4012
 
4013
    // ignore multiple attribute declarations!
4014
    if (attlist.get(name) != null)
4015
      {
4016
        // warn ...
4017
        return;
4018
      }
4019
    else
4020
      {
4021
        AttributeDecl attribute = new AttributeDecl();
4022
        attribute.type = type;
4023
        attribute.value = value;
4024
        attribute.valueType = valueType;
4025
        attribute.enumeration = enumeration;
4026
        attlist.put(name, attribute);
4027
 
4028
        // save; but don't overwrite any existing <!ELEMENT ...>
4029
        setElement(elName, CONTENT_UNDECLARED, null, attlist);
4030
      }
4031
  }
4032
 
4033
  /**
4034
   * Retrieve the attribute declaration for the given element name and name.
4035
   */
4036
  private AttributeDecl getAttribute(String elName, String name)
4037
  {
4038
    HashMap attlist = getElementAttributes(elName);
4039
    return (attlist == null) ? null : (AttributeDecl) attlist.get(name);
4040
  }
4041
 
4042
  //
4043
  // Entities
4044
  //
4045
 
4046
  /**
4047
   * Find the type of an entity.
4048
   * @returns An integer constant representing the entity type.
4049
   * @see #ENTITY_UNDECLARED
4050
   * @see #ENTITY_INTERNAL
4051
   * @see #ENTITY_NDATA
4052
   * @see #ENTITY_TEXT
4053
   */
4054
  public int getEntityType(String ename)
4055
  {
4056
    EntityInfo entity = (EntityInfo) entityInfo.get(ename);
4057
    return (entity == null) ?  ENTITY_UNDECLARED : entity.type;
4058
  }
4059
 
4060
  /**
4061
   * Return an external entity's identifiers.
4062
   * @param ename The name of the external entity.
4063
   * @return The entity's public identifier, system identifier, and base URI.
4064
   *  Null if the entity was not declared as an external entity.
4065
   * @see #getEntityType
4066
   */
4067
  public ExternalIdentifiers getEntityIds(String ename)
4068
  {
4069
    EntityInfo entity = (EntityInfo) entityInfo.get(ename);
4070
    return (entity == null) ? null : entity.ids;
4071
  }
4072
 
4073
  /**
4074
   * Return an internal entity's replacement text.
4075
   * @param ename The name of the internal entity.
4076
   * @return The entity's replacement text, or null if
4077
   *   the entity was not declared as an internal entity.
4078
   * @see #getEntityType
4079
   */
4080
  public String getEntityValue(String ename)
4081
  {
4082
    EntityInfo entity = (EntityInfo) entityInfo.get(ename);
4083
    return (entity == null) ? null : entity.value;
4084
  }
4085
 
4086
  /**
4087
   * Register an entity declaration for later retrieval.
4088
   */
4089
  private void setInternalEntity(String eName, String value)
4090
    throws SAXException
4091
  {
4092
    if (skippedPE)
4093
      {
4094
        return;
4095
      }
4096
 
4097
    if (entityInfo.get(eName) == null)
4098
      {
4099
        EntityInfo entity = new EntityInfo();
4100
        entity.type = ENTITY_INTERNAL;
4101
        entity.value = value;
4102
        entityInfo.put(eName, entity);
4103
      }
4104
    if (handler.stringInterning)
4105
      {
4106
        if ("lt" == eName || "gt" == eName || "quot" == eName
4107
            || "apos" == eName || "amp" == eName)
4108
          {
4109
            return;
4110
          }
4111
      }
4112
    else
4113
      {
4114
        if ("lt".equals(eName) || "gt".equals(eName) || "quot".equals(eName)
4115
            || "apos".equals(eName) || "amp".equals(eName))
4116
          {
4117
            return;
4118
          }
4119
      }
4120
    handler.getDeclHandler().internalEntityDecl(eName, value);
4121
  }
4122
 
4123
  /**
4124
   * Register an external entity declaration for later retrieval.
4125
   */
4126
  private void setExternalEntity(String eName, int eClass,
4127
                                 ExternalIdentifiers ids, String nName)
4128
  {
4129
    if (entityInfo.get(eName) == null)
4130
      {
4131
        EntityInfo entity = new EntityInfo();
4132
        entity.type = eClass;
4133
        entity.ids = ids;
4134
        entity.notationName = nName;
4135
        entityInfo.put(eName, entity);
4136
      }
4137
  }
4138
 
4139
  //
4140
  // Notations.
4141
  //
4142
 
4143
  /**
4144
   * Report a notation declaration, checking for duplicates.
4145
   */
4146
  private void setNotation(String nname, ExternalIdentifiers ids)
4147
    throws SAXException
4148
  {
4149
    if (skippedPE)
4150
      {
4151
        return;
4152
      }
4153
 
4154
    handler.notationDecl(nname, ids.publicId, ids.systemId, ids.baseUri);
4155
    if (notationInfo.get(nname) == null)
4156
      {
4157
        notationInfo.put(nname, nname);
4158
      }
4159
    else
4160
      {
4161
        // VC: Unique Notation Name
4162
        handler.verror("Duplicate notation name decl: " + nname);
4163
      }
4164
  }
4165
 
4166
  //
4167
  // Location.
4168
  //
4169
 
4170
  /**
4171
   * Return the current line number.
4172
   */
4173
  public int getLineNumber()
4174
  {
4175
    return line;
4176
  }
4177
 
4178
  /**
4179
   * Return the current column number.
4180
   */
4181
  public int getColumnNumber()
4182
  {
4183
    return column;
4184
  }
4185
 
4186
  //////////////////////////////////////////////////////////////////////
4187
  // High-level I/O.
4188
  //////////////////////////////////////////////////////////////////////
4189
 
4190
  /**
4191
   * Read a single character from the readBuffer.
4192
   * <p>The readDataChunk () method maintains the buffer.
4193
   * <p>If we hit the end of an entity, try to pop the stack and
4194
   * keep going.
4195
   * <p> (This approach doesn't really enforce XML's rules about
4196
   * entity boundaries, but this is not currently a validating
4197
   * parser).
4198
   * <p>This routine also attempts to keep track of the current
4199
   * position in external entities, but it's not entirely accurate.
4200
   * @return The next available input character.
4201
   * @see #unread (char)
4202
   * @see #readDataChunk
4203
   * @see #readBuffer
4204
   * @see #line
4205
   * @return The next character from the current input source.
4206
   */
4207
  private char readCh()
4208
    throws SAXException, IOException
4209
  {
4210
    // As long as there's nothing in the
4211
    // read buffer, try reading more data
4212
    // (for an external entity) or popping
4213
    // the entity stack (for either).
4214
    while (readBufferPos >= readBufferLength)
4215
      {
4216
        switch (sourceType)
4217
          {
4218
          case INPUT_READER:
4219
          case INPUT_STREAM:
4220
            readDataChunk();
4221
            while (readBufferLength < 1)
4222
              {
4223
                popInput();
4224
                if (readBufferLength < 1)
4225
                  {
4226
                    readDataChunk();
4227
                  }
4228
              }
4229
            break;
4230
 
4231
          default:
4232
 
4233
            popInput();
4234
            break;
4235
          }
4236
      }
4237
 
4238
    char c = readBuffer[readBufferPos++];
4239
 
4240
    if (c == '\n')
4241
      {
4242
        line++;
4243
        column = 0;
4244
      }
4245
    else
4246
      {
4247
        if (c == '<')
4248
          {
4249
            /* the most common return to parseContent () ... NOP */
4250
          }
4251
        else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
4252
                 || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
4253
                     && xmlVersion == XML_11))
4254
          {
4255
            error("illegal XML character U+" + Integer.toHexString(c));
4256
          }
4257
 
4258
        // If we're in the DTD and in a context where PEs get expanded,
4259
        // do so ... 1/14/2000 errata identify those contexts.  There
4260
        // are also spots in the internal subset where PE refs are fatal
4261
        // errors, hence yet another flag.
4262
        else if (c == '%' && expandPE)
4263
          {
4264
            if (peIsError)
4265
              {
4266
                error("PE reference within decl in internal subset.");
4267
              }
4268
            parsePEReference();
4269
            return readCh();
4270
          }
4271
        column++;
4272
      }
4273
 
4274
    return c;
4275
  }
4276
 
4277
  /**
4278
   * Push a single character back onto the current input stream.
4279
   * <p>This method usually pushes the character back onto
4280
   * the readBuffer.
4281
   * <p>I don't think that this would ever be called with
4282
   * readBufferPos = 0, because the methods always reads a character
4283
   * before unreading it, but just in case, I've added a boundary
4284
   * condition.
4285
   * @param c The character to push back.
4286
   * @see #readCh
4287
   * @see #unread (char[])
4288
   * @see #readBuffer
4289
   */
4290
  private void unread(char c)
4291
    throws SAXException
4292
  {
4293
    // Normal condition.
4294
    if (c == '\n')
4295
      {
4296
        line--;
4297
        column = -1;
4298
      }
4299
    if (readBufferPos > 0)
4300
      {
4301
        readBuffer[--readBufferPos] = c;
4302
      }
4303
    else
4304
      {
4305
        pushString(null, new Character(c).toString());
4306
      }
4307
  }
4308
 
4309
  /**
4310
   * Push a char array back onto the current input stream.
4311
   * <p>NOTE: you must <em>never</em> push back characters that you
4312
   * haven't actually read: use pushString () instead.
4313
   * @see #readCh
4314
   * @see #unread (char)
4315
   * @see #readBuffer
4316
   * @see #pushString
4317
   */
4318
  private void unread(char[] ch, int length)
4319
    throws SAXException
4320
  {
4321
    for (int i = 0; i < length; i++)
4322
      {
4323
        if (ch[i] == '\n')
4324
          {
4325
            line--;
4326
            column = -1;
4327
          }
4328
      }
4329
    if (length < readBufferPos)
4330
      {
4331
        readBufferPos -= length;
4332
      }
4333
    else
4334
      {
4335
        pushCharArray(null, ch, 0, length);
4336
      }
4337
  }
4338
 
4339
  /**
4340
   * Push, or skip, a new external input source.
4341
   * The source will be some kind of parsed entity, such as a PE
4342
   * (including the external DTD subset) or content for the body.
4343
   *
4344
   * @param url The java.net.URL object for the entity.
4345
   * @see SAXDriver#resolveEntity
4346
   * @see #pushString
4347
   * @see #sourceType
4348
   * @see #pushInput
4349
   * @see #detectEncoding
4350
   * @see #sourceType
4351
   * @see #readBuffer
4352
   */
4353
  private void pushURL(boolean isPE,
4354
                       String ename,
4355
                       ExternalIdentifiers ids,
4356
                       Reader reader,
4357
                       InputStream stream,
4358
                       String encoding,
4359
                       boolean doResolve)
4360
    throws SAXException, IOException
4361
  {
4362
    boolean ignoreEncoding;
4363
    String systemId;
4364
    InputSource source;
4365
 
4366
    if (!isPE)
4367
      {
4368
        dataBufferFlush();
4369
      }
4370
 
4371
    scratch.setPublicId(ids.publicId);
4372
    scratch.setSystemId(ids.systemId);
4373
 
4374
    // See if we should skip or substitute the entity.
4375
    // If we're not skipping, resolving reports startEntity()
4376
    // and updates the (handler's) stack of URIs.
4377
    if (doResolve)
4378
      {
4379
        // assert (stream == null && reader == null && encoding == null)
4380
        source = handler.resolveEntity(isPE, ename, scratch, ids.baseUri);
4381
        if (source == null)
4382
          {
4383
            handler.warn("skipping entity: " + ename);
4384
            handler.skippedEntity(ename);
4385
            if (isPE)
4386
              {
4387
                skippedPE = true;
4388
              }
4389
            return;
4390
          }
4391
 
4392
        // we might be using alternate IDs/encoding
4393
        systemId = source.getSystemId();
4394
        // The following warning and setting systemId was deleted bcause
4395
        // the application has the option of not setting systemId
4396
        // provided that it has set the characte/byte stream.
4397
        /*
4398
           if (systemId == null) {
4399
           handler.warn ("missing system ID, using " + ids.systemId);
4400
           systemId = ids.systemId;
4401
           }
4402
         */
4403
      }
4404
    else
4405
      {
4406
        // "[document]", or "[dtd]" via getExternalSubset()
4407
        scratch.setCharacterStream(reader);
4408
        scratch.setByteStream(stream);
4409
        scratch.setEncoding(encoding);
4410
        source = scratch;
4411
        systemId = ids.systemId;
4412
        if (handler.stringInterning)
4413
          {
4414
            handler.startExternalEntity(ename, systemId,
4415
                                        "[document]" == ename);
4416
          }
4417
        else
4418
          {
4419
            handler.startExternalEntity(ename, systemId,
4420
                                        "[document]".equals(ename));
4421
          }
4422
      }
4423
 
4424
    // we may have been given I/O streams directly
4425
    if (source.getCharacterStream() != null)
4426
      {
4427
        if (source.getByteStream() != null)
4428
          error("InputSource has two streams!");
4429
        reader = source.getCharacterStream();
4430
      }
4431
    else if (source.getByteStream() != null)
4432
      {
4433
        encoding = source.getEncoding();
4434
        if (encoding == null)
4435
          {
4436
            stream = source.getByteStream();
4437
          }
4438
        else
4439
          {
4440
            try
4441
              {
4442
                reader = new InputStreamReader(source.getByteStream(),
4443
                                               encoding);
4444
              }
4445
            catch (IOException e)
4446
              {
4447
                stream = source.getByteStream();
4448
              }
4449
          }
4450
      }
4451
    else if (systemId == null)
4452
      {
4453
        error("InputSource has no URI!");
4454
      }
4455
    scratch.setCharacterStream(null);
4456
    scratch.setByteStream(null);
4457
    scratch.setEncoding(null);
4458
 
4459
    // Push the existing status.
4460
    pushInput(ename);
4461
 
4462
    // Create a new read buffer.
4463
    // (Note the four-character margin)
4464
    readBuffer = new char[READ_BUFFER_MAX + 4];
4465
    readBufferPos = 0;
4466
    readBufferLength = 0;
4467
    readBufferOverflow = -1;
4468
    is = null;
4469
    line = 1;
4470
    column = 0;
4471
    currentByteCount = 0;
4472
 
4473
    // If there's an explicit character stream, just
4474
    // ignore encoding declarations.
4475
    if (reader != null)
4476
      {
4477
        sourceType = INPUT_READER;
4478
        this.reader = reader;
4479
        tryEncodingDecl(true);
4480
        return;
4481
      }
4482
 
4483
    // Else we handle the conversion, and need to ensure
4484
    // it's done right.
4485
    sourceType = INPUT_STREAM;
4486
    if (stream != null)
4487
      {
4488
        is = stream;
4489
      }
4490
    else
4491
      {
4492
        // We have to open our own stream to the URL.
4493
        URL url = new URL(systemId);
4494
 
4495
        externalEntity = url.openConnection();
4496
        externalEntity.connect();
4497
        is = externalEntity.getInputStream();
4498
      }
4499
 
4500
    // If we get to here, there must be
4501
    // an InputStream available.
4502
    if (!is.markSupported())
4503
      {
4504
        is = new BufferedInputStream(is);
4505
      }
4506
 
4507
    // Get any external encoding label.
4508
    if (encoding == null && externalEntity != null)
4509
      {
4510
        // External labels can be untrustworthy; filesystems in
4511
        // particular often have the wrong default for content
4512
        // that wasn't locally originated.  Those we autodetect.
4513
        if (!"file".equals(externalEntity.getURL().getProtocol()))
4514
          {
4515
            int temp;
4516
 
4517
            // application/xml;charset=something;otherAttr=...
4518
            // ... with many variants on 'something'
4519
            encoding = externalEntity.getContentType();
4520
 
4521
            // MHK code (fix for Saxon 5.5.1/007):
4522
            // protect against encoding==null
4523
            if (encoding == null)
4524
              {
4525
                temp = -1;
4526
              }
4527
            else
4528
              {
4529
                temp = encoding.indexOf("charset");
4530
              }
4531
 
4532
            // RFC 2376 sez MIME text defaults to ASCII, but since the
4533
            // JDK will create a MIME type out of thin air, we always
4534
            // autodetect when there's no explicit charset attribute.
4535
            if (temp < 0)
4536
              {
4537
                encoding = null;  // autodetect
4538
              }
4539
            else
4540
              {
4541
                // only this one attribute
4542
                if ((temp = encoding.indexOf(';')) > 0)
4543
                  {
4544
                    encoding = encoding.substring(0, temp);
4545
                  }
4546
 
4547
                if ((temp = encoding.indexOf('=', temp + 7)) > 0)
4548
                  {
4549
                    encoding = encoding.substring(temp + 1);
4550
 
4551
                    // attributes can have comment fields (RFC 822)
4552
                    if ((temp = encoding.indexOf('(')) > 0)
4553
                      {
4554
                        encoding = encoding.substring(0, temp);
4555
                      }
4556
                    // ... and values may be quoted
4557
                    if ((temp = encoding.indexOf('"')) > 0)
4558
                      {
4559
                        encoding =
4560
                          encoding.substring(temp + 1,
4561
                                             encoding.indexOf('"', temp + 2));
4562
                      }
4563
                    encoding = encoding.trim();
4564
                  }
4565
                else
4566
                  {
4567
                    handler.warn("ignoring illegal MIME attribute: "
4568
                                 + encoding);
4569
                    encoding = null;
4570
                  }
4571
              }
4572
          }
4573
      }
4574
 
4575
    // if we got an external encoding label, use it ...
4576
    if (encoding != null)
4577
      {
4578
        this.encoding = ENCODING_EXTERNAL;
4579
        setupDecoding(encoding);
4580
        ignoreEncoding = true;
4581
 
4582
        // ... else autodetect from first bytes.
4583
      }
4584
    else
4585
      {
4586
        detectEncoding();
4587
        ignoreEncoding = false;
4588
      }
4589
 
4590
    // Read any XML or text declaration.
4591
    // If we autodetected, it may tell us the "real" encoding.
4592
    try
4593
      {
4594
        tryEncodingDecl(ignoreEncoding);
4595
      }
4596
    catch (UnsupportedEncodingException x)
4597
      {
4598
        encoding = x.getMessage();
4599
 
4600
        // if we don't handle the declared encoding,
4601
        // try letting a JVM InputStreamReader do it
4602
        try
4603
          {
4604
            if (sourceType != INPUT_STREAM)
4605
              {
4606
                throw x;
4607
              }
4608
 
4609
            is.reset();
4610
            readBufferPos = 0;
4611
            readBufferLength = 0;
4612
            readBufferOverflow = -1;
4613
            line = 1;
4614
            currentByteCount = column = 0;
4615
 
4616
            sourceType = INPUT_READER;
4617
            this.reader = new InputStreamReader(is, encoding);
4618
            is = null;
4619
 
4620
            tryEncodingDecl(true);
4621
 
4622
          }
4623
        catch (IOException e)
4624
          {
4625
            error("unsupported text encoding",
4626
                  encoding,
4627
                  null);
4628
          }
4629
      }
4630
  }
4631
 
4632
  /**
4633
   * Check for an encoding declaration.  This is the second part of the
4634
   * XML encoding autodetection algorithm, relying on detectEncoding to
4635
   * get to the point that this part can read any encoding declaration
4636
   * in the document (using only US-ASCII characters).
4637
   *
4638
   * <p> Because this part starts to fill parser buffers with this data,
4639
   * it's tricky to setup a reader so that Java's built-in decoders can be
4640
   * used for the character encodings that aren't built in to this parser
4641
   * (such as EUC-JP, KOI8-R, Big5, etc).
4642
   *
4643
   * @return any encoding in the declaration, uppercased; or null
4644
   * @see detectEncoding
4645
   */
4646
  private String tryEncodingDecl(boolean ignoreEncoding)
4647
    throws SAXException, IOException
4648
  {
4649
    // Read the XML/text declaration.
4650
    if (tryRead("<?xml"))
4651
      {
4652
        if (tryWhitespace())
4653
          {
4654
            if (inputStack.size() > 0)
4655
              {
4656
                return parseTextDecl(ignoreEncoding);
4657
              }
4658
            else
4659
              {
4660
                return parseXMLDecl(ignoreEncoding);
4661
              }
4662
          }
4663
        else
4664
          {
4665
            // <?xml-stylesheet ...?> or similar
4666
            unread('l');
4667
            unread('m');
4668
            unread('x');
4669
            unread('?');
4670
            unread('<');
4671
          }
4672
      }
4673
    return null;
4674
  }
4675
 
4676
  /**
4677
   * Attempt to detect the encoding of an entity.
4678
   * <p>The trick here (as suggested in the XML standard) is that
4679
   * any entity not in UTF-8, or in UCS-2 with a byte-order mark,
4680
   * <b>must</b> begin with an XML declaration or an encoding
4681
   * declaration; we simply have to look for "&lt;?xml" in various
4682
   * encodings.
4683
   * <p>This method has no way to distinguish among 8-bit encodings.
4684
   * Instead, it sets up for UTF-8, then (possibly) revises its assumption
4685
   * later in setupDecoding ().  Any ASCII-derived 8-bit encoding
4686
   * should work, but most will be rejected later by setupDecoding ().
4687
   * @see #tryEncoding (byte[], byte, byte, byte, byte)
4688
   * @see #tryEncoding (byte[], byte, byte)
4689
   * @see #setupDecoding
4690
   */
4691
  private void detectEncoding()
4692
    throws SAXException, IOException
4693
  {
4694
    byte[] signature = new byte[4];
4695
 
4696
    // Read the first four bytes for
4697
    // autodetection.
4698
    is.mark(4);
4699
    is.read(signature);
4700
    is.reset();
4701
 
4702
    //
4703
    // FIRST:  four byte encodings (who uses these?)
4704
    //
4705
    if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4706
                    (byte) 0x00, (byte) 0x3c))
4707
      {
4708
        // UCS-4 must begin with "<?xml"
4709
        // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
4710
        // "UTF-32BE"
4711
        encoding = ENCODING_UCS_4_1234;
4712
      }
4713
    else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4714
                         (byte) 0x00, (byte) 0x00))
4715
      {
4716
        // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
4717
        // "UTF-32LE"
4718
        encoding = ENCODING_UCS_4_4321;
4719
      }
4720
    else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4721
                         (byte) 0x3c, (byte) 0x00))
4722
      {
4723
        // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
4724
        encoding = ENCODING_UCS_4_2143;
4725
      }
4726
    else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4727
                         (byte) 0x00, (byte) 0x00))
4728
      {
4729
        // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
4730
        encoding = ENCODING_UCS_4_3412;
4731
 
4732
        // 00 00 fe ff UCS_4_1234 (with BOM)
4733
        // ff fe 00 00 UCS_4_4321 (with BOM)
4734
      }
4735
 
4736
    //
4737
    // SECOND:  two byte encodings
4738
    // note ... with 1/14/2000 errata the XML spec identifies some
4739
    // more "broken UTF-16" autodetection cases, with no XML decl,
4740
    // which we don't handle here (that's legal too).
4741
    //
4742
    else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff))
4743
      {
4744
        // UCS-2 with a byte-order marker. (UTF-16)
4745
        // 0xfe 0xff: UCS-2, big-endian (12)
4746
        encoding = ENCODING_UCS_2_12;
4747
        is.read(); is.read();
4748
      }
4749
    else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe))
4750
      {
4751
        // UCS-2 with a byte-order marker. (UTF-16)
4752
        // 0xff 0xfe: UCS-2, little-endian (21)
4753
        encoding = ENCODING_UCS_2_21;
4754
        is.read(); is.read();
4755
      }
4756
    else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4757
                         (byte) 0x00, (byte) 0x3f))
4758
      {
4759
        // UTF-16BE (otherwise, malformed UTF-16)
4760
        // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
4761
        encoding = ENCODING_UCS_2_12;
4762
        error("no byte-order mark for UCS-2 entity");
4763
      }
4764
    else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4765
                         (byte) 0x3f, (byte) 0x00))
4766
      {
4767
        // UTF-16LE (otherwise, malformed UTF-16)
4768
        // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
4769
        encoding = ENCODING_UCS_2_21;
4770
        error("no byte-order mark for UCS-2 entity");
4771
      }
4772
 
4773
    //
4774
    // THIRD:  ASCII-derived encodings, fixed and variable lengths
4775
    //
4776
    else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f,
4777
                         (byte) 0x78, (byte) 0x6d))
4778
      {
4779
        // ASCII derived
4780
        // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
4781
        encoding = ENCODING_UTF_8;
4782
        prefetchASCIIEncodingDecl();
4783
      }
4784
    else if (signature[0] == (byte) 0xef
4785
             && signature[1] == (byte) 0xbb
4786
             && signature[2] == (byte) 0xbf)
4787
      {
4788
        // 0xef 0xbb 0xbf: UTF-8 BOM (not part of document text)
4789
        // this un-needed notion slipped into XML 2nd ed through a
4790
        // "non-normative" erratum; now required by MSFT and UDDI,
4791
        // and E22 made it normative.
4792
        encoding = ENCODING_UTF_8;
4793
        is.read(); is.read(); is.read();
4794
      }
4795
    else
4796
      {
4797
        // 4c 6f a7 94 ... we don't understand EBCDIC flavors
4798
        // ... but we COULD at least kick in some fixed code page
4799
 
4800
        // (default) UTF-8 without encoding/XML declaration
4801
        encoding = ENCODING_UTF_8;
4802
      }
4803
  }
4804
 
4805
  /**
4806
   * Check for a four-byte signature.
4807
   * <p>Utility routine for detectEncoding ().
4808
   * <p>Always looks for some part of "<?XML" in a specific encoding.
4809
   * @param sig The first four bytes read.
4810
   * @param b1 The first byte of the signature
4811
   * @param b2 The second byte of the signature
4812
   * @param b3 The third byte of the signature
4813
   * @param b4 The fourth byte of the signature
4814
   * @see #detectEncoding
4815
   */
4816
  private static boolean tryEncoding(byte[] sig, byte b1, byte b2,
4817
                                     byte b3, byte b4)
4818
  {
4819
    return (sig[0] == b1 && sig[1] == b2
4820
            && sig[2] == b3 && sig[3] == b4);
4821
  }
4822
 
4823
  /**
4824
   * Check for a two-byte signature.
4825
   * <p>Looks for a UCS-2 byte-order mark.
4826
   * <p>Utility routine for detectEncoding ().
4827
   * @param sig The first four bytes read.
4828
   * @param b1 The first byte of the signature
4829
   * @param b2 The second byte of the signature
4830
   * @see #detectEncoding
4831
   */
4832
  private static boolean tryEncoding(byte[] sig, byte b1, byte b2)
4833
  {
4834
    return ((sig[0] == b1) && (sig[1] == b2));
4835
  }
4836
 
4837
  /**
4838
   * This method pushes a string back onto input.
4839
   * <p>It is useful either as the expansion of an internal entity,
4840
   * or for backtracking during the parse.
4841
   * <p>Call pushCharArray () to do the actual work.
4842
   * @param s The string to push back onto input.
4843
   * @see #pushCharArray
4844
   */
4845
  private void pushString(String ename, String s)
4846
    throws SAXException
4847
  {
4848
    char[] ch = s.toCharArray();
4849
    pushCharArray(ename, ch, 0, ch.length);
4850
  }
4851
 
4852
  /**
4853
   * Push a new internal input source.
4854
   * <p>This method is useful for expanding an internal entity,
4855
   * or for unreading a string of characters.  It creates a new
4856
   * readBuffer containing the characters in the array, instead
4857
   * of characters converted from an input byte stream.
4858
   * @param ch The char array to push.
4859
   * @see #pushString
4860
   * @see #pushURL
4861
   * @see #readBuffer
4862
   * @see #sourceType
4863
   * @see #pushInput
4864
   */
4865
  private void pushCharArray(String ename, char[] ch, int start, int length)
4866
    throws SAXException
4867
  {
4868
    // Push the existing status
4869
    pushInput(ename);
4870
    if (ename != null && doReport)
4871
      {
4872
        dataBufferFlush();
4873
        handler.startInternalEntity(ename);
4874
      }
4875
    sourceType = INPUT_INTERNAL;
4876
    readBuffer = ch;
4877
    readBufferPos = start;
4878
    readBufferLength = length;
4879
    readBufferOverflow = -1;
4880
  }
4881
 
4882
  /**
4883
   * Save the current input source onto the stack.
4884
   * <p>This method saves all of the global variables associated with
4885
   * the current input source, so that they can be restored when a new
4886
   * input source has finished.  It also tests for entity recursion.
4887
   * <p>The method saves the following global variables onto a stack
4888
   * using a fixed-length array:
4889
   * <ol>
4890
   * <li>sourceType
4891
   * <li>externalEntity
4892
   * <li>readBuffer
4893
   * <li>readBufferPos
4894
   * <li>readBufferLength
4895
   * <li>line
4896
   * <li>encoding
4897
   * </ol>
4898
   * @param ename The name of the entity (if any) causing the new input.
4899
   * @see #popInput
4900
   * @see #sourceType
4901
   * @see #externalEntity
4902
   * @see #readBuffer
4903
   * @see #readBufferPos
4904
   * @see #readBufferLength
4905
   * @see #line
4906
   * @see #encoding
4907
   */
4908
  private void pushInput(String ename)
4909
    throws SAXException
4910
  {
4911
    // Check for entity recursion.
4912
    if (ename != null)
4913
      {
4914
        Iterator entities = entityStack.iterator();
4915
        while (entities.hasNext())
4916
          {
4917
            String e = (String) entities.next();
4918
            if (e != null && e == ename)
4919
              {
4920
                error("recursive reference to entity", ename, null);
4921
              }
4922
          }
4923
      }
4924
    entityStack.addLast(ename);
4925
 
4926
    // Don't bother if there is no current input.
4927
    if (sourceType == INPUT_NONE)
4928
      {
4929
        return;
4930
      }
4931
 
4932
    // Set up a snapshot of the current
4933
    // input source.
4934
    Input input = new Input();
4935
 
4936
    input.sourceType = sourceType;
4937
    input.externalEntity = externalEntity;
4938
    input.readBuffer = readBuffer;
4939
    input.readBufferPos = readBufferPos;
4940
    input.readBufferLength = readBufferLength;
4941
    input.line = line;
4942
    input.encoding = encoding;
4943
    input.readBufferOverflow = readBufferOverflow;
4944
    input.is = is;
4945
    input.currentByteCount = currentByteCount;
4946
    input.column = column;
4947
    input.reader = reader;
4948
 
4949
    // Push it onto the stack.
4950
    inputStack.addLast(input);
4951
  }
4952
 
4953
  /**
4954
   * Restore a previous input source.
4955
   * <p>This method restores all of the global variables associated with
4956
   * the current input source.
4957
   * @exception java.io.EOFException
4958
   *    If there are no more entries on the input stack.
4959
   * @see #pushInput
4960
   * @see #sourceType
4961
   * @see #externalEntity
4962
   * @see #readBuffer
4963
   * @see #readBufferPos
4964
   * @see #readBufferLength
4965
   * @see #line
4966
   * @see #encoding
4967
   */
4968
  private void popInput()
4969
    throws SAXException, IOException
4970
  {
4971
    String ename = (String) entityStack.removeLast();
4972
 
4973
    if (ename != null && doReport)
4974
      {
4975
        dataBufferFlush();
4976
      }
4977
    switch (sourceType)
4978
      {
4979
      case INPUT_STREAM:
4980
        handler.endExternalEntity(ename);
4981
        is.close();
4982
        break;
4983
      case INPUT_READER:
4984
        handler.endExternalEntity(ename);
4985
        reader.close();
4986
        break;
4987
      case INPUT_INTERNAL:
4988
        if (ename != null && doReport)
4989
          {
4990
            handler.endInternalEntity(ename);
4991
          }
4992
        break;
4993
      }
4994
 
4995
    // Throw an EOFException if there
4996
    // is nothing else to pop.
4997
    if (inputStack.isEmpty())
4998
      {
4999
        throw new EOFException("no more input");
5000
      }
5001
 
5002
    Input input = (Input) inputStack.removeLast();
5003
 
5004
    sourceType = input.sourceType;
5005
    externalEntity = input.externalEntity;
5006
    readBuffer = input.readBuffer;
5007
    readBufferPos = input.readBufferPos;
5008
    readBufferLength = input.readBufferLength;
5009
    line = input.line;
5010
    encoding = input.encoding;
5011
    readBufferOverflow = input.readBufferOverflow;
5012
    is = input.is;
5013
    currentByteCount = input.currentByteCount;
5014
    column = input.column;
5015
    reader = input.reader;
5016
  }
5017
 
5018
  /**
5019
   * Return true if we can read the expected character.
5020
   * <p>Note that the character will be removed from the input stream
5021
   * on success, but will be put back on failure.  Do not attempt to
5022
   * read the character again if the method succeeds.
5023
   * @param delim The character that should appear next.  For a
5024
   *        insensitive match, you must supply this in upper-case.
5025
   * @return true if the character was successfully read, or false if
5026
   *   it was not.
5027
   * @see #tryRead (String)
5028
   */
5029
  private boolean tryRead(char delim)
5030
    throws SAXException, IOException
5031
  {
5032
    char c;
5033
 
5034
    // Read the character
5035
    c = readCh();
5036
 
5037
    // Test for a match, and push the character
5038
    // back if the match fails.
5039
    if (c == delim)
5040
      {
5041
        return true;
5042
      }
5043
    else
5044
      {
5045
        unread(c);
5046
        return false;
5047
      }
5048
  }
5049
 
5050
  /**
5051
   * Return true if we can read the expected string.
5052
   * <p>This is simply a convenience method.
5053
   * <p>Note that the string will be removed from the input stream
5054
   * on success, but will be put back on failure.  Do not attempt to
5055
   * read the string again if the method succeeds.
5056
   * <p>This method will push back a character rather than an
5057
   * array whenever possible (probably the majority of cases).
5058
   * @param delim The string that should appear next.
5059
   * @return true if the string was successfully read, or false if
5060
   *   it was not.
5061
   * @see #tryRead (char)
5062
   */
5063
  private boolean tryRead(String delim)
5064
    throws SAXException, IOException
5065
  {
5066
    return tryRead(delim.toCharArray());
5067
  }
5068
 
5069
  private boolean tryRead(char[] ch)
5070
    throws SAXException, IOException
5071
  {
5072
    char c;
5073
 
5074
    // Compare the input, character-
5075
    // by character.
5076
 
5077
    for (int i = 0; i < ch.length; i++)
5078
      {
5079
        c = readCh();
5080
        if (c != ch[i])
5081
          {
5082
            unread(c);
5083
            if (i != 0)
5084
              {
5085
                unread(ch, i);
5086
              }
5087
            return false;
5088
          }
5089
      }
5090
    return true;
5091
  }
5092
 
5093
  /**
5094
   * Return true if we can read some whitespace.
5095
   * <p>This is simply a convenience method.
5096
   * <p>This method will push back a character rather than an
5097
   * array whenever possible (probably the majority of cases).
5098
   * @return true if whitespace was found.
5099
   */
5100
  private boolean tryWhitespace()
5101
    throws SAXException, IOException
5102
  {
5103
    char c;
5104
    c = readCh();
5105
    if (isWhitespace(c))
5106
      {
5107
        skipWhitespace();
5108
        return true;
5109
      }
5110
    else
5111
      {
5112
        unread(c);
5113
        return false;
5114
      }
5115
  }
5116
 
5117
  /**
5118
   * Read all data until we find the specified string.
5119
   * This is useful for scanning CDATA sections and PIs.
5120
   * <p>This is inefficient right now, since it calls tryRead ()
5121
   * for every character.
5122
   * @param delim The string delimiter
5123
   * @see #tryRead (String, boolean)
5124
   * @see #readCh
5125
   */
5126
  private void parseUntil(String delim)
5127
    throws SAXException, IOException
5128
  {
5129
    parseUntil(delim.toCharArray());
5130
  }
5131
 
5132
  private void parseUntil(char[] delim)
5133
    throws SAXException, IOException
5134
  {
5135
    char c;
5136
    int startLine = line;
5137
 
5138
    try
5139
      {
5140
        while (!tryRead(delim))
5141
          {
5142
            c = readCh();
5143
            dataBufferAppend(c);
5144
          }
5145
      }
5146
    catch (EOFException e)
5147
      {
5148
        error("end of input while looking for delimiter "
5149
              + "(started on line " + startLine
5150
              + ')', null, new String(delim));
5151
      }
5152
  }
5153
 
5154
  //////////////////////////////////////////////////////////////////////
5155
  // Low-level I/O.
5156
  //////////////////////////////////////////////////////////////////////
5157
 
5158
  /**
5159
   * Prefetch US-ASCII XML/text decl from input stream into read buffer.
5160
   * Doesn't buffer more than absolutely needed, so that when an encoding
5161
   * decl says we need to create an InputStreamReader, we can discard our
5162
   * buffer and reset().  Caller knows the first chars of the decl exist
5163
   * in the input stream.
5164
   */
5165
  private void prefetchASCIIEncodingDecl()
5166
    throws SAXException, IOException
5167
  {
5168
    int ch;
5169
    readBufferPos = readBufferLength = 0;
5170
 
5171
    is.mark(readBuffer.length);
5172
    while (true)
5173
      {
5174
        ch = is.read();
5175
        readBuffer[readBufferLength++] = (char) ch;
5176
        switch (ch)
5177
          {
5178
          case (int) '>':
5179
            return;
5180
          case -1:
5181
            error("file ends before end of XML or encoding declaration.",
5182
                  null, "?>");
5183
          }
5184
        if (readBuffer.length == readBufferLength)
5185
          {
5186
            error("unfinished XML or encoding declaration");
5187
          }
5188
      }
5189
  }
5190
 
5191
  /**
5192
   * Read a chunk of data from an external input source.
5193
   * <p>This is simply a front-end that fills the rawReadBuffer
5194
   * with bytes, then calls the appropriate encoding handler.
5195
   * @see #encoding
5196
   * @see #rawReadBuffer
5197
   * @see #readBuffer
5198
   * @see #filterCR
5199
   * @see #copyUtf8ReadBuffer
5200
   * @see #copyIso8859_1ReadBuffer
5201
   * @see #copyUcs_2ReadBuffer
5202
   * @see #copyUcs_4ReadBuffer
5203
   */
5204
  private void readDataChunk()
5205
    throws SAXException, IOException
5206
  {
5207
    int count;
5208
 
5209
    // See if we have any overflow (filterCR sets for CR at end)
5210
    if (readBufferOverflow > -1)
5211
      {
5212
        readBuffer[0] = (char) readBufferOverflow;
5213
        readBufferOverflow = -1;
5214
        readBufferPos = 1;
5215
        sawCR = true;
5216
      }
5217
    else
5218
      {
5219
        readBufferPos = 0;
5220
        sawCR = false;
5221
      }
5222
 
5223
    // input from a character stream.
5224
    if (sourceType == INPUT_READER)
5225
      {
5226
        count = reader.read(readBuffer,
5227
                            readBufferPos, READ_BUFFER_MAX - readBufferPos);
5228
        if (count < 0)
5229
          {
5230
            readBufferLength = readBufferPos;
5231
          }
5232
        else
5233
          {
5234
            readBufferLength = readBufferPos + count;
5235
          }
5236
        if (readBufferLength > 0)
5237
          {
5238
            filterCR(count >= 0);
5239
          }
5240
        sawCR = false;
5241
        return;
5242
      }
5243
 
5244
    // Read as many bytes as possible into the raw buffer.
5245
    count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX);
5246
 
5247
    // Dispatch to an encoding-specific reader method to populate
5248
    // the readBuffer.  In most parser speed profiles, these routines
5249
    // show up at the top of the CPU usage chart.
5250
    if (count > 0)
5251
      {
5252
        switch (encoding)
5253
          {
5254
            // one byte builtins
5255
          case ENCODING_ASCII:
5256
            copyIso8859_1ReadBuffer(count, (char) 0x0080);
5257
            break;
5258
          case ENCODING_UTF_8:
5259
            copyUtf8ReadBuffer(count);
5260
            break;
5261
          case ENCODING_ISO_8859_1:
5262
            copyIso8859_1ReadBuffer(count, (char) 0);
5263
            break;
5264
 
5265
            // two byte builtins
5266
          case ENCODING_UCS_2_12:
5267
            copyUcs2ReadBuffer(count, 8, 0);
5268
            break;
5269
          case ENCODING_UCS_2_21:
5270
            copyUcs2ReadBuffer(count, 0, 8);
5271
            break;
5272
 
5273
            // four byte builtins
5274
          case ENCODING_UCS_4_1234:
5275
            copyUcs4ReadBuffer(count, 24, 16, 8, 0);
5276
            break;
5277
          case ENCODING_UCS_4_4321:
5278
            copyUcs4ReadBuffer(count, 0, 8, 16, 24);
5279
            break;
5280
          case ENCODING_UCS_4_2143:
5281
            copyUcs4ReadBuffer(count, 16, 24, 0, 8);
5282
            break;
5283
          case ENCODING_UCS_4_3412:
5284
            copyUcs4ReadBuffer(count, 8, 0, 24, 16);
5285
            break;
5286
          }
5287
      }
5288
    else
5289
      {
5290
        readBufferLength = readBufferPos;
5291
      }
5292
 
5293
    readBufferPos = 0;
5294
 
5295
    // Filter out all carriage returns if we've seen any
5296
    // (including any saved from a previous read)
5297
    if (sawCR)
5298
      {
5299
        filterCR(count >= 0);
5300
        sawCR = false;
5301
 
5302
        // must actively report EOF, lest some CRs get lost.
5303
        if (readBufferLength == 0 && count >= 0)
5304
          {
5305
            readDataChunk();
5306
          }
5307
      }
5308
 
5309
    if (count > 0)
5310
      {
5311
        currentByteCount += count;
5312
      }
5313
  }
5314
 
5315
  /**
5316
   * Filter carriage returns in the read buffer.
5317
   * CRLF becomes LF; CR becomes LF.
5318
   * @param moreData true iff more data might come from the same source
5319
   * @see #readDataChunk
5320
   * @see #readBuffer
5321
   * @see #readBufferOverflow
5322
   */
5323
  private void filterCR(boolean moreData)
5324
  {
5325
    int i, j;
5326
 
5327
    readBufferOverflow = -1;
5328
 
5329
loop:
5330
    for (i = j = readBufferPos; j < readBufferLength; i++, j++)
5331
      {
5332
        switch (readBuffer[j])
5333
          {
5334
          case '\r':
5335
            if (j == readBufferLength - 1)
5336
              {
5337
                if (moreData)
5338
                  {
5339
                    readBufferOverflow = '\r';
5340
                    readBufferLength--;
5341
                  }
5342
                else   // CR at end of buffer
5343
                  {
5344
                    readBuffer[i++] = '\n';
5345
                  }
5346
                break loop;
5347
              }
5348
            else if (readBuffer[j + 1] == '\n')
5349
              {
5350
                j++;
5351
              }
5352
            readBuffer[i] = '\n';
5353
            break;
5354
 
5355
          case '\n':
5356
          default:
5357
            readBuffer[i] = readBuffer[j];
5358
            break;
5359
          }
5360
      }
5361
    readBufferLength = i;
5362
  }
5363
 
5364
  /**
5365
   * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
5366
   * <p>When readDataChunk () calls this method, the raw bytes are in
5367
   * rawReadBuffer, and the final characters will appear in
5368
   * readBuffer.
5369
   * <p>Note that as of Unicode 3.1, good practice became a requirement,
5370
   * so that each Unicode character has exactly one UTF-8 representation.
5371
   * @param count The number of bytes to convert.
5372
   * @see #readDataChunk
5373
   * @see #rawReadBuffer
5374
   * @see #readBuffer
5375
   * @see #getNextUtf8Byte
5376
   */
5377
  private void copyUtf8ReadBuffer(int count)
5378
    throws SAXException, IOException
5379
  {
5380
    int i = 0;
5381
    int j = readBufferPos;
5382
    int b1;
5383
    char c = 0;
5384
 
5385
    /*
5386
    // check once, so the runtime won't (if it's smart enough)
5387
    if (count < 0 || count > rawReadBuffer.length)
5388
    throw new ArrayIndexOutOfBoundsException (Integer.toString (count));
5389
     */
5390
 
5391
    while (i < count)
5392
      {
5393
        b1 = rawReadBuffer[i++];
5394
 
5395
        // Determine whether we are dealing
5396
        // with a one-, two-, three-, or four-
5397
        // byte sequence.
5398
        if (b1 < 0)
5399
          {
5400
            if ((b1 & 0xe0) == 0xc0)
5401
              {
5402
                // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
5403
                c = (char) (((b1 & 0x1f) << 6)
5404
                            | getNextUtf8Byte(i++, count));
5405
                if (c < 0x0080)
5406
                  {
5407
                    encodingError("Illegal two byte UTF-8 sequence",
5408
                                  c, 0);
5409
                  }
5410
 
5411
                //Sec 2.11
5412
                // [1] the two-character sequence #xD #xA
5413
                // [2] the two-character sequence #xD #x85
5414
                if ((c == 0x0085 || c == 0x000a) && sawCR)
5415
                  {
5416
                    continue;
5417
                  }
5418
 
5419
                // Sec 2.11
5420
                // [3] the single character #x85
5421
 
5422
                if (c == 0x0085 && xmlVersion == XML_11)
5423
                  {
5424
                    readBuffer[j++] = '\r';
5425
                  }
5426
              }
5427
            else if ((b1 & 0xf0) == 0xe0)
5428
              {
5429
                // 3-byte sequence:
5430
                // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
5431
                // most CJKV characters
5432
                c = (char) (((b1 & 0x0f) << 12) |
5433
                            (getNextUtf8Byte(i++, count) << 6) |
5434
                            getNextUtf8Byte(i++, count));
5435
                //sec 2.11
5436
                //[4] the single character #x2028
5437
                if (c == 0x2028 && xmlVersion == XML_11)
5438
                  {
5439
                    readBuffer[j++] = '\r';
5440
                    sawCR = true;
5441
                    continue;
5442
                  }
5443
                if (c < 0x0800 || (c >= 0xd800 && c <= 0xdfff))
5444
                  {
5445
                    encodingError("Illegal three byte UTF-8 sequence",
5446
                                  c, 0);
5447
                  }
5448
              }
5449
            else if ((b1 & 0xf8) == 0xf0)
5450
              {
5451
                // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
5452
                //     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
5453
                // (uuuuu = wwww + 1)
5454
                // "Surrogate Pairs" ... from the "Astral Planes"
5455
                // Unicode 3.1 assigned the first characters there
5456
                int iso646 = b1 & 07;
5457
                iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
5458
                iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
5459
                iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
5460
 
5461
                if (iso646 <= 0xffff)
5462
                  {
5463
                    encodingError("Illegal four byte UTF-8 sequence",
5464
                                  iso646, 0);
5465
                  }
5466
                else
5467
                  {
5468
                    if (iso646 > 0x0010ffff)
5469
                      {
5470
                        encodingError("UTF-8 value out of range for Unicode",
5471
                                      iso646, 0);
5472
                      }
5473
                    iso646 -= 0x010000;
5474
                    readBuffer[j++] = (char) (0xd800 | (iso646 >> 10));
5475
                    readBuffer[j++] = (char) (0xdc00 | (iso646 & 0x03ff));
5476
                    continue;
5477
                  }
5478
              }
5479
            else
5480
              {
5481
                // The five and six byte encodings aren't supported;
5482
                // they exceed the Unicode (and XML) range.
5483
                encodingError("unsupported five or six byte UTF-8 sequence",
5484
                              0xff & b1, i);
5485
                // NOTREACHED
5486
                c = 0;
5487
              }
5488
          }
5489
        else
5490
          {
5491
            // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
5492
            // (US-ASCII character, "common" case, one branch to here)
5493
            c = (char) b1;
5494
          }
5495
        readBuffer[j++] = c;
5496
        if (c == '\r')
5497
          {
5498
            sawCR = true;
5499
          }
5500
      }
5501
    // How many characters have we read?
5502
    readBufferLength = j;
5503
  }
5504
 
5505
  /**
5506
   * Return the next byte value in a UTF-8 sequence.
5507
   * If it is not possible to get a byte from the current
5508
   * entity, throw an exception.
5509
   * @param pos The current position in the rawReadBuffer.
5510
   * @param count The number of bytes in the rawReadBuffer
5511
   * @return The significant six bits of a non-initial byte in
5512
   *   a UTF-8 sequence.
5513
   * @exception EOFException If the sequence is incomplete.
5514
   */
5515
  private int getNextUtf8Byte(int pos, int count)
5516
    throws SAXException, IOException
5517
  {
5518
    int val;
5519
 
5520
    // Take a character from the buffer
5521
    // or from the actual input stream.
5522
    if (pos < count)
5523
      {
5524
        val = rawReadBuffer[pos];
5525
      }
5526
    else
5527
      {
5528
        val = is.read();
5529
        if (val == -1)
5530
          {
5531
            encodingError("unfinished multi-byte UTF-8 sequence at EOF",
5532
                          -1, pos);
5533
          }
5534
      }
5535
 
5536
    // Check for the correct bits at the start.
5537
    if ((val & 0xc0) != 0x80)
5538
      {
5539
        encodingError("bad continuation of multi-byte UTF-8 sequence",
5540
                      val, pos + 1);
5541
      }
5542
 
5543
    // Return the significant bits.
5544
    return (val & 0x3f);
5545
  }
5546
 
5547
  /**
5548
   * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into
5549
   * UTF-16 characters.
5550
   *
5551
   * <p>When readDataChunk () calls this method, the raw bytes are in
5552
   * rawReadBuffer, and the final characters will appear in
5553
   * readBuffer.
5554
   *
5555
   * @param count The number of bytes to convert.
5556
   * @param mask For ASCII conversion, 0x7f; else, 0xff.
5557
   * @see #readDataChunk
5558
   * @see #rawReadBuffer
5559
   * @see #readBuffer
5560
   */
5561
  private void copyIso8859_1ReadBuffer(int count, char mask)
5562
    throws IOException
5563
  {
5564
    int i, j;
5565
    for (i = 0, j = readBufferPos; i < count; i++, j++)
5566
      {
5567
        char c = (char) (rawReadBuffer[i] & 0xff);
5568
        if ((c & mask) != 0)
5569
          {
5570
            throw new CharConversionException("non-ASCII character U+"
5571
                                              + Integer.toHexString(c));
5572
          }
5573
        if (c == 0x0085 && xmlVersion == XML_11)
5574
          {
5575
            c = '\r';
5576
          }
5577
        readBuffer[j] = c;
5578
        if (c == '\r')
5579
          {
5580
            sawCR = true;
5581
          }
5582
      }
5583
    readBufferLength = j;
5584
  }
5585
 
5586
  /**
5587
   * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters
5588
   * (as used in Java string manipulation).
5589
   *
5590
   * <p>When readDataChunk () calls this method, the raw bytes are in
5591
   * rawReadBuffer, and the final characters will appear in
5592
   * readBuffer.
5593
   * @param count The number of bytes to convert.
5594
   * @param shift1 The number of bits to shift byte 1.
5595
   * @param shift2 The number of bits to shift byte 2
5596
   * @see #readDataChunk
5597
   * @see #rawReadBuffer
5598
   * @see #readBuffer
5599
   */
5600
  private void copyUcs2ReadBuffer(int count, int shift1, int shift2)
5601
    throws SAXException
5602
  {
5603
    int j = readBufferPos;
5604
 
5605
    if (count > 0 && (count % 2) != 0)
5606
      {
5607
        encodingError("odd number of bytes in UCS-2 encoding", -1, count);
5608
      }
5609
    // The loops are faster with less internal brancing; hence two
5610
    if (shift1 == 0)
5611
      {  // "UTF-16-LE"
5612
        for (int i = 0; i < count; i += 2)
5613
          {
5614
            char c = (char) (rawReadBuffer[i + 1] << 8);
5615
            c |= 0xff & rawReadBuffer[i];
5616
            readBuffer[j++] = c;
5617
            if (c == '\r')
5618
              {
5619
                sawCR = true;
5620
              }
5621
          }
5622
      }
5623
    else
5624
      {  // "UTF-16-BE"
5625
        for (int i = 0; i < count; i += 2)
5626
          {
5627
            char c = (char) (rawReadBuffer[i] << 8);
5628
            c |= 0xff & rawReadBuffer[i + 1];
5629
            readBuffer[j++] = c;
5630
            if (c == '\r')
5631
              {
5632
                sawCR = true;
5633
              }
5634
          }
5635
      }
5636
    readBufferLength = j;
5637
  }
5638
 
5639
  /**
5640
   * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
5641
   *
5642
   * <p>When readDataChunk () calls this method, the raw bytes are in
5643
   * rawReadBuffer, and the final characters will appear in
5644
   * readBuffer.
5645
   * <p>Java has Unicode chars, and this routine uses surrogate pairs
5646
   * for ISO-10646 values between 0x00010000 and 0x000fffff.  An
5647
   * exception is thrown if the ISO-10646 character has no Unicode
5648
   * representation.
5649
   *
5650
   * @param count The number of bytes to convert.
5651
   * @param shift1 The number of bits to shift byte 1.
5652
   * @param shift2 The number of bits to shift byte 2
5653
   * @param shift3 The number of bits to shift byte 2
5654
   * @param shift4 The number of bits to shift byte 2
5655
   * @see #readDataChunk
5656
   * @see #rawReadBuffer
5657
   * @see #readBuffer
5658
   */
5659
  private void copyUcs4ReadBuffer(int count, int shift1, int shift2,
5660
                                  int shift3, int shift4)
5661
    throws SAXException
5662
  {
5663
    int j = readBufferPos;
5664
 
5665
    if (count > 0 && (count % 4) != 0)
5666
      {
5667
        encodingError("number of bytes in UCS-4 encoding " +
5668
                      "not divisible by 4",
5669
                      -1, count);
5670
      }
5671
    for (int i = 0; i < count; i += 4)
5672
      {
5673
        int value = (((rawReadBuffer [i] & 0xff) << shift1) |
5674
                     ((rawReadBuffer [i + 1] & 0xff) << shift2) |
5675
                     ((rawReadBuffer [i + 2] & 0xff) << shift3) |
5676
                     ((rawReadBuffer [i + 3] & 0xff) << shift4));
5677
        if (value < 0x0000ffff)
5678
          {
5679
            readBuffer [j++] = (char) value;
5680
            if (value == (int) '\r')
5681
              {
5682
                sawCR = true;
5683
              }
5684
          }
5685
        else if (value < 0x0010ffff)
5686
          {
5687
            value -= 0x010000;
5688
            readBuffer[j++] = (char) (0xd8 | ((value >> 10) & 0x03ff));
5689
            readBuffer[j++] = (char) (0xdc | (value & 0x03ff));
5690
          }
5691
        else
5692
          {
5693
            encodingError("UCS-4 value out of range for Unicode",
5694
                          value, i);
5695
          }
5696
      }
5697
    readBufferLength = j;
5698
  }
5699
 
5700
  /**
5701
   * Report a character encoding error.
5702
   */
5703
  private void encodingError(String message, int value, int offset)
5704
    throws SAXException
5705
  {
5706
    if (value != -1)
5707
      {
5708
        message = message + " (character code: 0x" +
5709
          Integer.toHexString(value) + ')';
5710
        error(message);
5711
      }
5712
  }
5713
 
5714
  //////////////////////////////////////////////////////////////////////
5715
  // Local Variables.
5716
  //////////////////////////////////////////////////////////////////////
5717
 
5718
  /**
5719
   * Re-initialize the variables for each parse.
5720
   */
5721
  private void initializeVariables()
5722
  {
5723
    // First line
5724
    line = 1;
5725
    column = 0;
5726
 
5727
    // Set up the buffers for data and names
5728
    dataBufferPos = 0;
5729
    dataBuffer = new char[DATA_BUFFER_INITIAL];
5730
    nameBufferPos = 0;
5731
    nameBuffer = new char[NAME_BUFFER_INITIAL];
5732
 
5733
    // Set up the DTD hash tables
5734
    elementInfo = new HashMap();
5735
    entityInfo = new HashMap();
5736
    notationInfo = new HashMap();
5737
    skippedPE = false;
5738
 
5739
    // Set up the variables for the current
5740
    // element context.
5741
    currentElement = null;
5742
    currentElementContent = CONTENT_UNDECLARED;
5743
 
5744
    // Set up the input variables
5745
    sourceType = INPUT_NONE;
5746
    inputStack = new LinkedList();
5747
    entityStack = new LinkedList();
5748
    externalEntity = null;
5749
    tagAttributePos = 0;
5750
    tagAttributes = new String[100];
5751
    rawReadBuffer = new byte[READ_BUFFER_MAX];
5752
    readBufferOverflow = -1;
5753
 
5754
    scratch = new InputSource();
5755
 
5756
    inLiteral = false;
5757
    expandPE = false;
5758
    peIsError = false;
5759
 
5760
    doReport = false;
5761
 
5762
    inCDATA = false;
5763
 
5764
    symbolTable = new Object[SYMBOL_TABLE_LENGTH][];
5765
  }
5766
 
5767
  static class ExternalIdentifiers
5768
  {
5769
 
5770
    String publicId;
5771
    String systemId;
5772
    String baseUri;
5773
 
5774
    ExternalIdentifiers()
5775
    {
5776
    }
5777
 
5778
    ExternalIdentifiers(String publicId, String systemId, String baseUri)
5779
    {
5780
      this.publicId = publicId;
5781
      this.systemId = systemId;
5782
      this.baseUri = baseUri;
5783
    }
5784
 
5785
  }
5786
 
5787
  static class EntityInfo
5788
  {
5789
 
5790
    int type;
5791
    ExternalIdentifiers ids;
5792
    String value;
5793
    String notationName;
5794
 
5795
  }
5796
 
5797
  static class AttributeDecl
5798
  {
5799
 
5800
    String type;
5801
    String value;
5802
    int valueType;
5803
    String enumeration;
5804
    String defaultValue;
5805
 
5806
  }
5807
 
5808
  static class ElementDecl
5809
  {
5810
 
5811
    int contentType;
5812
    String contentModel;
5813
    HashMap attributes;
5814
 
5815
  }
5816
 
5817
  static class Input
5818
  {
5819
 
5820
    int sourceType;
5821
    URLConnection externalEntity;
5822
    char[] readBuffer;
5823
    int readBufferPos;
5824
    int readBufferLength;
5825
    int line;
5826
    int encoding;
5827
    int readBufferOverflow;
5828
    InputStream is;
5829
    int currentByteCount;
5830
    int column;
5831
    Reader reader;
5832
 
5833
  }
5834
 
5835
}
5836
 

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.