OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libjava/] [classpath/] [gnu/] [javax/] [swing/] [text/] [html/] [parser/] [support/] [Parser.java] - Blame information for rev 769

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 769 jeremybenn
/* Parser.java -- HTML parser.
2
   Copyright (C) 2005 Free Software Foundation, Inc.
3
 
4
This file is part of GNU Classpath.
5
 
6
GNU Classpath is free software; you can redistribute it and/or modify
7
it under the terms of the GNU General Public License as published by
8
the Free Software Foundation; either version 2, or (at your option)
9
any later version.
10
 
11
GNU Classpath is distributed in the hope that it will be useful, but
12
WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
General Public License for more details.
15
 
16
You should have received a copy of the GNU General Public License
17
along with GNU Classpath; see the file COPYING.  If not, write to the
18
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
02110-1301 USA.
20
 
21
Linking this library statically or dynamically with other modules is
22
making a combined work based on this library.  Thus, the terms and
23
conditions of the GNU General Public License cover the whole
24
combination.
25
 
26
As a special exception, the copyright holders of this library give you
27
permission to link this library with independent modules to produce an
28
executable, regardless of the license terms of these independent
29
modules, and to copy and distribute the resulting executable under
30
terms of your choice, provided that you also meet, for each linked
31
independent module, the terms and conditions of the license of that
32
module.  An independent module is a module which is not derived from
33
or based on this library.  If you modify this library, you may extend
34
this exception to your version of the library, but you are not
35
obligated to do so.  If you do not wish to do so, delete this
36
exception statement from your version. */
37
 
38
 
39
package gnu.javax.swing.text.html.parser.support;
40
 
41
import gnu.java.lang.CPStringBuilder;
42
 
43
import gnu.javax.swing.text.html.parser.htmlAttributeSet;
44
import gnu.javax.swing.text.html.parser.htmlValidator;
45
import gnu.javax.swing.text.html.parser.support.low.Constants;
46
import gnu.javax.swing.text.html.parser.support.low.ParseException;
47
import gnu.javax.swing.text.html.parser.support.low.ReaderTokenizer;
48
import gnu.javax.swing.text.html.parser.support.low.Token;
49
import gnu.javax.swing.text.html.parser.support.low.node;
50
import gnu.javax.swing.text.html.parser.support.low.pattern;
51
 
52
import java.io.IOException;
53
import java.io.Reader;
54
 
55
import java.util.Comparator;
56
import java.util.Set;
57
import java.util.TreeSet;
58
import java.util.Vector;
59
 
60
import javax.swing.text.ChangedCharSetException;
61
import javax.swing.text.SimpleAttributeSet;
62
import javax.swing.text.html.HTML;
63
import javax.swing.text.html.parser.AttributeList;
64
import javax.swing.text.html.parser.DTD;
65
import javax.swing.text.html.parser.DTDConstants;
66
import javax.swing.text.html.parser.Element;
67
import javax.swing.text.html.parser.Entity;
68
import javax.swing.text.html.parser.TagElement;
69
 
70
/**
71
 * <p>A simple error-tolerant HTML parser that uses a DTD document
72
 * to access data on the possible tokens, arguments and syntax.</p>
73
 * <p> The parser reads an HTML content from a Reader and calls various
74
 * notifying methods (which should be overridden in a subclass)
75
 * when tags or data are encountered.</p>
76
 * <p>Some HTML elements need no opening or closing tags. The
77
 * task of this parser is to invoke the tag handling methods also when
78
 * the tags are not explicitly specified and must be supposed using
79
 * information, stored in the DTD.
80
 * For  example, parsing the document
81
 * <p>&lt;table&gt;&lt;tr&gt;&lt;td&gt;a&lt;td&gt;b&lt;td&gt;c&lt;/tr&gt; <br>
82
 * will invoke the handling methods in exactly the same order
83
 * (and with the same parameters) as if parsing the document: <br>
84
 * <em>&lt;html&gt;&lt;head&gt;&lt;/head&gt;&lt;body&gt;&lt;table&gt;&lt;
85
 * tbody&gt;</em>&lt;tr&gt;&lt;td&gt;a<em>&lt;/td&gt;</em>&lt;td&gt;b<em>
86
 * &lt;/td&gt;</em>&lt;td&gt;c<em>&lt;/td&gt;&lt;/tr&gt;</em>&lt;
87
 * <em>/tbody&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;</em></p>
88
 * (supposed tags are given in italics). The parser also supports
89
 * obsolete elements of HTML syntax.<p>
90
 * </p>
91
 * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
92
 */
93
public class Parser
94
  extends ReaderTokenizer
95
  implements DTDConstants
96
{
97
  /**
98
   * The current html tag.
99
   */
100
  public Token hTag = new Token();
101
 
102
  /**
103
   * The document template description that will be used to parse the documents.
104
   */
105
  protected DTD dtd;
106
 
107
  /**
108
   * The value of this field determines whether or not the Parser will be
109
   * strict in enforcing SGML compatibility. The default value is false,
110
   * stating that the parser should do everything to parse and get at least
111
   * some information even from the incorrectly written HTML input.
112
   */
113
  protected boolean strict;
114
 
115
  /**
116
   * This field has positive values in preformatted tags.
117
   */
118
  protected int preformatted = 0;
119
 
120
  /**
121
   * The set of the document tags. This field is used for supporting
122
   * markFirstTime().
123
   */
124
  private Set documentTags =
125
    new TreeSet(new Comparator()
126
      {
127
        public int compare(Object a, Object b)
128
        {
129
          return ((String) a).compareToIgnoreCase((String) b);
130
        }
131
      }
132
               );
133
 
134
  /**
135
  * The buffer to collect the incremental output like text or comment.
136
  */
137
  private final StringBuffer buffer = new StringBuffer();
138
 
139
  /**
140
   * The buffer to store the document title.
141
   */
142
  private final StringBuffer title = new StringBuffer();
143
 
144
  /**
145
   * The current token.
146
   */
147
  private Token t;
148
 
149
  /**
150
   * True means that the 'title' tag of this document has
151
   * already been handled.
152
   */
153
  private boolean titleHandled;
154
 
155
  /**
156
   * True means that the 'title' tag is currently open and all
157
   * text is also added to the title buffer.
158
   */
159
  private boolean titleOpen;
160
 
161
  /**
162
   * The attributes of the current HTML element.
163
   * Package-private to avoid an accessor method.
164
   */
165
  htmlAttributeSet attributes =
166
    htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
167
 
168
  /**
169
   * The validator, controlling the forcible closing of the tags that
170
   * (in accordance to dtd) are not allowed in the current context.
171
   */
172
  private htmlValidator validator;
173
 
174
  /**
175
   * Provides the default values for parameters in the case when these
176
   * values are defined in the DTD.
177
   */
178
  private parameterDefaulter defaulter;
179
 
180
  /**
181
   * The text pre-processor for handling line ends and tabs.
182
   */
183
  private textPreProcessor textProcessor = new textPreProcessor();
184
 
185
  /**
186
   * Creates a new Parser that uses the given
187
   * {@link javax.swing.text.html.parser.DTD }. The only standard way
188
   * to get an instance of DTD is to construct it manually, filling in
189
   * all required fields.
190
   * @param a_dtd The DTD to use. The parser behaviour after passing null
191
   * as an argument is not documented and may vary between implementations.
192
   */
193
  public Parser(DTD a_dtd)
194
  {
195
    if (a_dtd == null)
196
      dtd = gnu.javax.swing.text.html.parser.HTML_401F.getInstance();
197
    else
198
      dtd = a_dtd;
199
 
200
    defaulter = new parameterDefaulter(dtd);
201
 
202
    validator =
203
      new htmlValidator(dtd)
204
        {
205
          /**
206
           * Handles the error message. This method must be overridden to pass
207
           * the message where required.
208
           * @param msg The message text.
209
           */
210
          protected void s_error(String msg)
211
          {
212
            error(msg);
213
          }
214
 
215
          /**
216
           * The method is called when the tag validator decides to close the
217
           * tag on its own initiative. After reaching the end of stream,
218
           * The tag validator closes all unclosed elements that are required
219
           * to have the end (closing) tag.
220
           *
221
           * @param tElement The tag being fictionally (forcibly) closed.
222
           */
223
          protected void handleSupposedEndTag(Element tElement)
224
          {
225
            // The tag is cloned as the original tElement is the
226
            // element from the starting tag - may be accidently used
227
            // somewhere else.
228
            TagElement tag = makeTag(tElement, true);
229
            _handleEndTag_remaining(tag);
230
          }
231
 
232
          /**
233
           * The method is called when the the tag validator decides to open
234
           * the new tag on its own initiative. The tags, opened in this
235
           * way, are HTML, HEAD and BODY. The attribute set is temporary
236
           * assigned to the empty one, the previous value is
237
           * restored before return.
238
           *
239
           * @param tElement The tag being fictionally (forcibly) closed.
240
           */
241
          protected void handleSupposedStartTag(Element tElement)
242
          {
243
            TagElement tag = makeTag(tElement, true);
244
            htmlAttributeSet were = attributes;
245
            attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
246
            _handleStartTag(tag);
247
            attributes = were;
248
          }
249
        };
250
  }
251
 
252
  /**
253
   * Get the attributes of the current tag.
254
   * @return The attribute set, representing the attributes of the current tag.
255
   */
256
  public SimpleAttributeSet getAttributes()
257
  {
258
    return new SimpleAttributeSet(attributes);
259
  }
260
 
261
  /**
262
   * Invokes the error handler. The default method in this implementation
263
   * delegates the call to handleError, also providing the current line.
264
   */
265
  public void error(String msg)
266
  {
267
    error(msg, getTokenAhead());
268
  }
269
 
270
  public void error(String msg, Token atToken)
271
  {
272
    if (atToken != null)
273
      handleError(atToken.where.beginLine,
274
                  msg + ": line " + atToken.where.beginLine +
275
                  ", absolute pos " + atToken.where.startPosition
276
                 );
277
    else
278
      handleError(0, msg);
279
  }
280
 
281
  /**
282
   * Invokes the error handler. The default method in this implementation
283
   * delegates the call to error (parm1+": '"+parm2+"'").
284
   */
285
  public void error(String msg, String invalid)
286
  {
287
    error(msg + ": '" + invalid + "'");
288
  }
289
 
290
  /**
291
   * Invokes the error handler. The default method in this implementation
292
   * delegates the call to error (parm1+" "+ parm2+" "+ parm3).
293
   */
294
  public void error(String parm1, String parm2, String parm3)
295
  {
296
    error(parm1 + " " + parm2 + " " + parm3);
297
  }
298
 
299
  /**
300
   * Invokes the error handler. The default method in this implementation
301
   * delegates the call to error (parm1+" "+ parm2+" "+ parm3+" "+ parm4).
302
   */
303
  public void error(String parm1, String parm2, String parm3, String parm4)
304
  {
305
    error(parm1 + " " + parm2 + " " + parm3 + " " + parm4);
306
  }
307
 
308
  public void flushAttributes()
309
  {
310
  }
311
 
312
  /**
313
   * Parse the HTML text, calling various methods in response to the
314
   * occurence of the corresponding HTML constructions.
315
   * @param reader The reader to read the source HTML from.
316
   * @throws IOException If the reader throws one.
317
   */
318
  public synchronized void parse(Reader reader)
319
                          throws IOException
320
  {
321
    reset(reader);
322
    restart();
323
    try
324
      {
325
        parseDocument();
326
        validator.closeAll();
327
      }
328
    catch (ParseException ex)
329
      {
330
        if (ex != null)
331
          {
332
            error("Unable to continue parsing the document", ex.getMessage());
333
 
334
            Throwable cause = ex.getCause();
335
            if (cause instanceof IOException)
336
              throw (IOException) cause;
337
          }
338
      }
339
  }
340
 
341
  /**
342
   * Parses DTD markup declaration. Currently returns null without action.
343
   * @return null.
344
   * @throws IOException
345
   */
346
  public String parseDTDMarkup()
347
                        throws IOException
348
  {
349
    return null;
350
  }
351
 
352
  /**
353
   * Parse SGML insertion ( &lt;! ... &gt; ). When the
354
   * the SGML insertion is found, this method is called, passing
355
   * SGML in the string buffer as a parameter. The default method
356
   * returns false without action and can be overridden to
357
   * implement user - defined SGML support.
358
   * <p>
359
   * If you need more information about SGML insertions in HTML documents,
360
   * the author suggests to read SGML tutorial on
361
   * {@link http://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html}.
362
   * We also recommend Goldfarb C.F (1991) <i>The SGML Handbook</i>,
363
   * Oxford University Press, 688 p, ISBN: 0198537379.
364
   * </p>
365
   * @param strBuff
366
   * @return true if this is a valid DTD markup declaration.
367
   * @throws IOException
368
   */
369
  public boolean parseMarkupDeclarations(StringBuffer strBuff)
370
                                  throws IOException
371
  {
372
    return false;
373
  }
374
 
375
  /**
376
   * Get the first line of the last parsed token.
377
   */
378
  protected int getCurrentLine()
379
  {
380
    return hTag.where.beginLine;
381
  }
382
 
383
  /**
384
   * Read parseable character data, add to buffer.
385
   * @param clearBuffer If true, buffer if filled by CDATA section,
386
   * otherwise the section is appended to the existing content of the
387
   * buffer.
388
   *
389
   * @throws ParseException
390
   */
391
  protected void CDATA(boolean clearBuffer)
392
                throws ParseException
393
  {
394
    Token start = hTag = getTokenAhead();
395
 
396
    if (clearBuffer)
397
      buffer.setLength(0);
398
 
399
    // Handle expected EOF.
400
    if (start.kind == EOF)
401
      return;
402
 
403
    read:
404
    while (true)
405
      {
406
        t = getTokenAhead();
407
        if (t.kind == EOF)
408
          {
409
            error("unexpected eof", t);
410
            break read;
411
          }
412
        else if (t.kind == BEGIN)
413
          break read;
414
        else if (t.kind == Constants.ENTITY)
415
          {
416
            resolveAndAppendEntity(t);
417
            getNextToken();
418
          }
419
        else
420
          {
421
            append(t);
422
            getNextToken();
423
          }
424
      }
425
    hTag = new Token(start, getTokenAhead(0));
426
    if (buffer.length() != 0)
427
      _handleText();
428
  }
429
 
430
  /**
431
  * Process Comment. This method skips till --> without
432
  * taking SGML constructs into consideration.  The supported SGML
433
  * constructs are handled separately.
434
  */
435
  protected void Comment()
436
                  throws ParseException
437
  {
438
    buffer.setLength(0);
439
 
440
    Token start = hTag = mustBe(BEGIN);
441
    optional(WS);
442
    mustBe(EXCLAMATION);
443
    optional(WS);
444
    mustBe(DOUBLE_DASH);
445
 
446
    Token t;
447
    Token last;
448
 
449
    comment:
450
    while (true)
451
      {
452
        t = getTokenAhead();
453
        if (t.kind == EOF)
454
          {
455
            handleEOFInComment();
456
            last = t;
457
            break comment;
458
          }
459
        else if (COMMENT_END.matches(this))
460
          {
461
            mustBe(DOUBLE_DASH);
462
            optional(WS);
463
            last = mustBe(END);
464
            break comment;
465
          }
466
        else if (COMMENT_TRIPLEDASH_END.matches(this))
467
          {
468
            mustBe(DOUBLE_DASH);
469
            t = mustBe(NUMTOKEN);
470
            if (t.getImage().equals("-"))
471
              {
472
                append(t);
473
                last = mustBe(END);
474
                break comment;
475
              }
476
            else
477
              {
478
                buffer.append("--");
479
                append(t);
480
                t = getTokenAhead();
481
              }
482
          }
483
        else
484
        /* The lllll-- can match as NUMTOKEN */
485
        if ((t.getImage().endsWith("--")) &&
486
            (
487
              getTokenAhead(1).kind == END ||
488
              (getTokenAhead(1).kind == WS && getTokenAhead(2).kind == END)
489
            )
490
           )
491
          {
492
            buffer.append(t.getImage().substring(0, t.getImage().length() - 2));
493
 
494
            /* Skip the closing > that we have already checked. */
495
            last = mustBe(t.kind);
496
            break comment;
497
          }
498
        else
499
          append(t);
500
        mustBe(t.kind);
501
      }
502
    hTag = new Token(start, last);
503
 
504
    // Consume any whitespace immediately following a comment.
505
    optional(WS);
506
    handleComment();
507
  }
508
 
509
  /**
510
  * Read a script. The text, returned without any changes,
511
  * is terminated only by the closing tag SCRIPT.
512
  */
513
  protected void Script()
514
                 throws ParseException
515
  {
516
    Token name;
517
 
518
    Token start = hTag = mustBe(BEGIN);
519
    optional(WS);
520
 
521
    name = mustBe(SCRIPT);
522
 
523
    optional(WS);
524
 
525
    restOfTag(false, name, start);
526
 
527
    buffer.setLength(0);
528
 
529
    while (!SCRIPT_CLOSE.matches(this))
530
      {
531
        append(getNextToken());
532
      }
533
 
534
    consume(SCRIPT_CLOSE);
535
 
536
    _handleText();
537
 
538
    endTag(false);
539
    _handleEndTag(makeTagElement(name.getImage(), false));
540
  }
541
 
542
  /**
543
  * Process SGML insertion that is not a comment.
544
  */
545
  protected void Sgml()
546
               throws ParseException
547
  {
548
    if (COMMENT_OPEN.matches(this))
549
      Comment();
550
    else // skip till ">"
551
      {
552
        Token start = hTag = mustBe(BEGIN);
553
        optional(WS);
554
        mustBe(EXCLAMATION);
555
 
556
        buffer.setLength(0);
557
        read:
558
        while (true)
559
          {
560
            t = getNextToken();
561
            if (t.kind == Constants.ENTITY)
562
              {
563
                resolveAndAppendEntity(t);
564
              }
565
            else if (t.kind == EOF)
566
              {
567
                error("unexpected eof", t);
568
                break read;
569
              }
570
            else if (t.kind == END)
571
              break read;
572
            else
573
              append(t);
574
          }
575
 
576
        try
577
          {
578
            parseMarkupDeclarations(buffer);
579
          }
580
        catch (IOException ex)
581
          {
582
            error("Unable to parse SGML insertion: '" + buffer + "'",
583
                  new Token(start, t)
584
                 );
585
          }
586
      }
587
    // Consume any whitespace that follows the Sgml insertion.
588
    optional(WS);
589
  }
590
 
591
  /**
592
  * Read a style definition. The text, returned without any changes,
593
  * is terminated only by the closing tag STYLE.
594
  */
595
  protected void Style()
596
                throws ParseException
597
  {
598
    Token name;
599
 
600
    Token start = hTag = mustBe(BEGIN);
601
    optional(WS);
602
 
603
    name = mustBe(STYLE);
604
 
605
    optional(WS);
606
 
607
    restOfTag(false, name, start);
608
 
609
    buffer.setLength(0);
610
 
611
    while (!STYLE_CLOSE.matches(this))
612
      {
613
        append(getNextToken());
614
      }
615
 
616
    consume(STYLE_CLOSE);
617
 
618
    _handleText();
619
 
620
    endTag(false);
621
    _handleEndTag(makeTagElement(name.getImage(), false));
622
  }
623
 
624
  /**
625
   * Read a html tag.
626
   */
627
  protected void Tag()
628
              throws ParseException
629
  {
630
    mark(true);
631
 
632
    boolean closing = false;
633
    Token name;
634
    Token start = hTag = mustBe(BEGIN);
635
 
636
    optional(WS);
637
    name = getNextToken();
638
    optional(WS);
639
 
640
    if (name.kind == SLASH)
641
      {
642
        closing = true;
643
        name = getNextToken();
644
      }
645
 
646
    restOfTag(closing, name, start);
647
  }
648
 
649
  /**
650
   * A hook, for operations, preceeding call to handleText.
651
   * Handle text in a string buffer.
652
   * In non - preformatted mode, all line breaks immediately following the
653
   * start tag and immediately before an end tag is discarded,
654
   * \r, \n and \t are replaced by spaces, multiple space are replaced
655
   * by the single one and the result is  moved into array,
656
   * passing it  to handleText().
657
   */
658
  protected void _handleText()
659
  {
660
    char[] text;
661
 
662
    if (preformatted > 0)
663
      text = textProcessor.preprocessPreformatted(buffer);
664
    else
665
      text = textProcessor.preprocess(buffer);
666
 
667
    if (text != null && text.length > 0
668
        // According to the specs we need to discard whitespace immediately
669
        // before a closing tag.
670
        && (text.length > 1 || text[0] != ' ' || ! TAG_CLOSE.matches(this)))
671
      {
672
        TagElement pcdata = new TagElement(dtd.getElement("#pcdata"));
673
        attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
674
        _handleEmptyTag(pcdata);
675
 
676
        handleText(text);
677
        if (titleOpen)
678
          title.append(text);
679
      }
680
  }
681
 
682
  /**
683
   * Add the image of this token to the buffer.
684
   * @param t A token to append.
685
   */
686
  protected final void append(Token t)
687
  {
688
    if (t.kind != EOF)
689
      t.appendTo(buffer);
690
  }
691
 
692
  /**
693
   * Consume pattern that must match.
694
   * @param p A pattern to consume.
695
   */
696
  protected final void consume(pattern p)
697
  {
698
    node n;
699
    for (int i = 0; i < p.nodes.length; i++)
700
      {
701
        n = p.nodes [ i ];
702
        if (n.optional)
703
          optional(n.kind);
704
        else
705
          mustBe(n.kind);
706
      }
707
  }
708
 
709
  /**
710
   * The method is called when the HTML end (closing) tag is found or if
711
   * the parser concludes that the one should be present in the
712
   * current position. The method is called immediatly
713
   * before calling the handleEndTag().
714
   * @param omitted True if the tag is no actually present in the document,
715
   * but is supposed by the parser (like &lt;/html&gt; at the end of the
716
   * document).
717
   */
718
  protected void endTag(boolean omitted)
719
  {
720
  }
721
 
722
  /**
723
   * Handle HTML comment. The default method returns without action.
724
   * @param comment
725
   */
726
  protected void handleComment(char[] comment)
727
  {
728
  }
729
 
730
  /**
731
   * This is additionally called in when the HTML content terminates
732
   * without closing the HTML comment. This can only happen if the
733
   * HTML document contains errors (for example, the closing --;gt is
734
   * missing.
735
   */
736
  protected void handleEOFInComment()
737
  {
738
    error("Unclosed comment");
739
  }
740
 
741
  /**
742
   * Handle the tag with no content, like &lt;br&gt;. The method is
743
   * called for the elements that, in accordance with the current DTD,
744
   * has an empty content.
745
   * @param tag The tag being handled.
746
   * @throws javax.swing.text.ChangedCharSetException
747
   */
748
  protected void handleEmptyTag(TagElement tag)
749
                         throws javax.swing.text.ChangedCharSetException
750
  {
751
  }
752
 
753
  /**
754
   * The method is called when the HTML closing tag ((like &lt;/table&gt;)
755
   * is found or if the parser concludes that the one should be present
756
   * in the current position.
757
   * @param tag The tag
758
   */
759
  protected void handleEndTag(TagElement tag)
760
  {
761
  }
762
 
763
  /* Handle error that has occured in the given line. */
764
  protected void handleError(int line, String message)
765
  {
766
  }
767
 
768
  /**
769
   * The method is called when the HTML opening tag ((like &lt;table&gt;)
770
   * is found or if the parser concludes that the one should be present
771
   * in the current position.
772
   * @param tag The tag
773
   */
774
  protected void handleStartTag(TagElement tag)
775
  {
776
  }
777
 
778
  /**
779
   * Handle the text section.
780
   * <p> For non-preformatted section, the parser replaces
781
   * \t, \r and \n by spaces and then multiple spaces
782
   * by a single space. Additionaly, all whitespace around
783
   * tags is discarded.
784
   * </p>
785
   * <p> For pre-formatted text (inside TEXAREA and PRE), the parser preserves
786
   * all tabs and spaces, but removes <b>one</b>  bounding \r, \n or \r\n,
787
   * if it is present. Additionally, it replaces each occurence of \r or \r\n
788
   * by a single \n.</p>
789
   *
790
   * @param text A section text.
791
   */
792
  protected void handleText(char[] text)
793
  {
794
  }
795
 
796
  /**
797
   * Handle HTML &lt;title&gt; tag. This method is invoked when
798
   * both title starting and closing tags are already behind.
799
   * The passed argument contains the concatenation of all
800
   * title text sections.
801
   * @param title The title text.
802
   */
803
  protected void handleTitle(char[] title)
804
  {
805
  }
806
 
807
  /**
808
   * Constructs the tag from the given element. In this implementation,
809
   * this is defined, but never called.
810
   * @return the tag
811
   */
812
  protected TagElement makeTag(Element element)
813
  {
814
    return makeTag(element, false);
815
  }
816
 
817
  /**
818
   * Constructs the tag from the given element.
819
   * @param the tag base {@link javax.swing.text.html.parser.Element}
820
   * @param isSupposed true if the tag is not actually present in the
821
   * html input, but the parser supposes that it should to occur in
822
   * the current location.
823
   * @return the tag
824
   */
825
  protected TagElement makeTag(Element element, boolean isSupposed)
826
  {
827
    return new TagElement(element, isSupposed);
828
  }
829
 
830
  /**
831
   * This is called when the tag, representing the given element,
832
   * occurs first time in the document.
833
   * @param element
834
   */
835
  protected void markFirstTime(Element element)
836
  {
837
  }
838
 
839
  /**
840
   * Consume the token that was checked before and hence MUST be present.
841
   * @param kind The kind of token to consume.
842
   */
843
  protected Token mustBe(int kind)
844
  {
845
    if (getTokenAhead().kind == kind)
846
      return getNextToken();
847
    else
848
      {
849
        String ei = "";
850
        if (kind < 1000)
851
          ei = " ('" + (char) kind + "') ";
852
        throw new AssertionError("The token of kind " + kind + ei +
853
                                 " MUST be here,"
854
                                );
855
      }
856
  }
857
 
858
  /**
859
   * Handle attribute without value. The default method uses
860
   * the only allowed attribute value from DTD.
861
   * If the attribute is unknown or allows several values,
862
   * the HTML.NULL_ATTRIBUTE_VALUE is used. The attribute with
863
   * this value is added to the attribute set.
864
   * @param element The name of element.
865
   * @param attribute The name of attribute without value.
866
   */
867
  protected void noValueAttribute(String element, String attribute)
868
  {
869
    Object value = HTML.NULL_ATTRIBUTE_VALUE;
870
 
871
    Element e = dtd.elementHash.get(element.toLowerCase());
872
    if (e != null)
873
      {
874
        AttributeList attr = e.getAttribute(attribute);
875
        if (attr != null)
876
          {
877
            Vector values = attr.values;
878
            if (values != null && values.size() == 1)
879
              value = values.get(0);
880
          }
881
      }
882
    attributes.addAttribute(attribute, value);
883
  }
884
 
885
  /**
886
   * Consume the optional token, if present.
887
   * @param kind The kind of token to consume.
888
   */
889
  protected Token optional(int kind)
890
  {
891
    if (getTokenAhead().kind == kind)
892
      return getNextToken();
893
    else
894
      return null;
895
  }
896
 
897
  /** Parse the html document. */
898
  protected void parseDocument()
899
                        throws ParseException
900
  {
901
    // Read up any initial whitespace.
902
    optional(WS);
903
    while (getTokenAhead().kind != EOF)
904
      {
905
        advanced = false;
906
        if (TAG.matches(this))
907
          Tag();
908
        else if (COMMENT_OPEN.matches(this))
909
          Comment();
910
        else if (STYLE_OPEN.matches(this))
911
          Style();
912
        else if (SCRIPT_OPEN.matches(this))
913
          Script();
914
        else if (SGML.matches(this))
915
          Sgml();
916
        else
917
          CDATA(true);
918
 
919
        // Surely HTML error, treat as a text.
920
        if (!advanced)
921
          {
922
            Token wrong = getNextToken();
923
            error("unexpected '" + wrong.getImage() + "'", wrong);
924
            buffer.setLength(0);
925
            buffer.append(wrong.getImage());
926
            _handleText();
927
          }
928
      }
929
  }
930
 
931
  /**
932
   * Read the element attributes, adding them into attribute set.
933
   * @param element The element name (needed to access attribute
934
   * information in dtd).
935
   */
936
  protected void readAttributes(String element)
937
  {
938
    Token name;
939
    Token value;
940
    Token next;
941
    String attrValue;
942
 
943
    attributes = new htmlAttributeSet();
944
 
945
    optional(WS);
946
 
947
    attributeReading:
948
      while (getTokenAhead().kind == NUMTOKEN)
949
      {
950
        name = getNextToken();
951
        optional(WS);
952
 
953
        next = getTokenAhead();
954
        if (next.kind == EQ)
955
          {
956
            mustBe(EQ);
957
            optional(WS);
958
 
959
            next = getNextToken();
960
 
961
            switch (next.kind)
962
              {
963
              case QUOT:
964
 
965
                // read "quoted" attribute.
966
                buffer.setLength(0);
967
                readTillTokenE(QUOT);
968
                attrValue = buffer.toString();
969
                break;
970
 
971
              case AP:
972
 
973
                // read 'quoted' attribute.
974
                buffer.setLength(0);
975
                readTillTokenE(AP);
976
                attrValue = buffer.toString();
977
                break;
978
 
979
              // read unquoted attribute.
980
              case NUMTOKEN:
981
                value = next;
982
                optional(WS);
983
 
984
                // Check maybe the opening quote is missing.
985
                next = getTokenAhead();
986
                if (bQUOTING.get(next.kind))
987
                  {
988
                    hTag = next;
989
                    error("The value without opening quote is closed with '"
990
                          + next.getImage() + "'");
991
                    attrValue = value.getImage();
992
                  }
993
                else if (next.kind == SLASH || next.kind == OTHER)
994
                // The slash and other characters (like %) in this context is
995
                // treated as the ordinary
996
                // character, not as a token. The character may be part of
997
                // the unquoted URL.
998
                  {
999
                    CPStringBuilder image = new CPStringBuilder(value.getImage());
1000
                    while (next.kind == NUMTOKEN || next.kind == SLASH
1001
                           || next.kind == OTHER)
1002
                      {
1003
                        image.append(getNextToken().getImage());
1004
                        next = getTokenAhead();
1005
                      }
1006
                    attrValue = image.toString();
1007
                  }
1008
                else
1009
                  attrValue = value.getImage();
1010
                break;
1011
 
1012
              case SLASH:
1013
                value = next;
1014
                optional(WS);
1015
 
1016
                // Check maybe the opening quote is missing.
1017
                next = getTokenAhead();
1018
                if (bQUOTING.get(next.kind))
1019
                  {
1020
                    hTag = next;
1021
                    error("The value without opening quote is closed with '"
1022
                          + next.getImage() + "'");
1023
                    attrValue = value.getImage();
1024
                  }
1025
                else if (next.kind == NUMTOKEN || next.kind == SLASH)
1026
                // The slash in this context is treated as the ordinary
1027
                // character, not as a token. The slash may be part of
1028
                // the unquoted URL.
1029
                  {
1030
                    CPStringBuilder image = new CPStringBuilder(value.getImage());
1031
                    while (next.kind == NUMTOKEN || next.kind == SLASH)
1032
                      {
1033
                        image.append(getNextToken().getImage());
1034
                        next = getTokenAhead();
1035
                      }
1036
                    attrValue = image.toString();
1037
                  }
1038
                else
1039
                  attrValue = value.getImage();
1040
                break;
1041
              default:
1042
                break attributeReading;
1043
              }
1044
            attributes.addAttribute(name.getImage(), attrValue);
1045
            optional(WS);
1046
          }
1047
        else
1048
          // The '=' is missing: attribute without value.
1049
          {
1050
            noValueAttribute(element, name.getImage());
1051
          }
1052
      }
1053
  }
1054
 
1055
  /**
1056
   * Return string, corresponding the given named entity. The name is passed
1057
   * with the preceeding &, but without the ending semicolon.
1058
   */
1059
  protected String resolveNamedEntity(final String a_tag)
1060
  {
1061
    // Discard &
1062
    if (!a_tag.startsWith("&"))
1063
      throw new AssertionError("Named entity " + a_tag +
1064
                               " must start witn '&'."
1065
                              );
1066
 
1067
    String tag = a_tag.substring(1);
1068
 
1069
    try
1070
      {
1071
        Entity entity = dtd.getEntity(tag);
1072
        if (entity != null)
1073
          return entity.getString();
1074
 
1075
        entity = dtd.getEntity(tag.toLowerCase());
1076
 
1077
        if (entity != null)
1078
          {
1079
            error("The name of this entity should be in lowercase", a_tag);
1080
            return entity.getString();
1081
          }
1082
      }
1083
    catch (IndexOutOfBoundsException ibx)
1084
      {
1085
        /* The error will be reported. */
1086
      }
1087
 
1088
    error("Unknown named entity", a_tag);
1089
    return a_tag;
1090
  }
1091
 
1092
  /**
1093
   * Return char, corresponding the given numeric entity.
1094
   * The name is passed with the preceeding &#, but without
1095
   * the ending semicolon.
1096
   */
1097
  protected char resolveNumericEntity(final String a_tag)
1098
  {
1099
    // Discard &#
1100
    if (!a_tag.startsWith("&#"))
1101
      throw new AssertionError("Numeric entity " + a_tag +
1102
                               " must start witn '&#'."
1103
                              );
1104
 
1105
    String tag = a_tag.substring(2);
1106
 
1107
    try
1108
      {
1109
        // Determine the encoding type:
1110
        char cx = tag.charAt(0);
1111
        if (cx == 'x' || cx == 'X') // Hexadecimal &#Xnnn;
1112
 
1113
          return (char) Integer.parseInt(tag.substring(1), 16);
1114
 
1115
        return (char) Integer.parseInt(tag);
1116
      }
1117
 
1118
    /* The error will be reported. */
1119
    catch (NumberFormatException nex)
1120
      {
1121
      }
1122
    catch (IndexOutOfBoundsException ix)
1123
      {
1124
      }
1125
 
1126
    error("Invalid numeric entity", a_tag);
1127
    return '?';
1128
  }
1129
 
1130
  /**
1131
   * Reset all fields into the intial default state, preparing the
1132
   * parset for parsing the next document.
1133
   */
1134
  protected void restart()
1135
  {
1136
    documentTags.clear();
1137
    titleHandled = false;
1138
    titleOpen = false;
1139
    buffer.setLength(0);
1140
    title.setLength(0);
1141
    validator.restart();
1142
  }
1143
 
1144
  /**
1145
   * The method is called when the HTML opening tag ((like &lt;table&gt;)
1146
   * is found or if the parser concludes that the one should be present
1147
   * in the current position. The method is called immediately before
1148
   * calling the handleStartTag.
1149
   * @param tag The tag
1150
   */
1151
  protected void startTag(TagElement tag)
1152
                   throws ChangedCharSetException
1153
  {
1154
  }
1155
 
1156
  /**
1157
   * Handle a complete element, when the tag content is already present in the
1158
   * buffer and both starting and heading tags behind. This is called
1159
   * in the case when the tag text must not be parsed for the nested
1160
   * elements (elements STYLE and SCRIPT).
1161
   */
1162
  private void _handleCompleteElement(TagElement tag)
1163
  {
1164
    _handleStartTag(tag);
1165
 
1166
    // Suppress inclusion of the SCRIPT ans STYLE texts into the title.
1167
    HTML.Tag h = tag.getHTMLTag();
1168
    if (h == HTML.Tag.SCRIPT || h == HTML.Tag.STYLE)
1169
      {
1170
        boolean tmp = titleOpen;
1171
        titleOpen = false;
1172
        _handleText();
1173
        titleOpen = tmp;
1174
      }
1175
    else
1176
      _handleText();
1177
 
1178
    _handleEndTag(tag);
1179
  }
1180
 
1181
  /**
1182
   * A hooks for operations, preceeding call to handleEmptyTag().
1183
   * Handle the tag with no content, like &lt;br&gt;. As no any
1184
   * nested tags are expected, the tag validator is not involved.
1185
   * @param tag The tag being handled.
1186
   */
1187
  private void _handleEmptyTag(TagElement tag)
1188
  {
1189
    try
1190
      {
1191
        validator.validateTag(tag, attributes);
1192
        handleEmptyTag(tag);
1193
        HTML.Tag h = tag.getHTMLTag();
1194
        // When a block tag is closed, consume whitespace that follows after
1195
        // it.
1196
        // For some unknown reason a FRAME tag is not treated as block element.
1197
        // However in this case it should be treated as such.
1198
        if (isBlock(h))
1199
          optional(WS);
1200
      }
1201
    catch (ChangedCharSetException ex)
1202
      {
1203
        error("Changed charset exception:", ex.getMessage());
1204
      }
1205
  }
1206
 
1207
  /**
1208
   * A hooks for operations, preceeding call to handleEndTag().
1209
   * The method is called when the HTML closing tag
1210
   * is found. Calls handleTitle after closing the 'title' tag.
1211
   * @param tag The tag
1212
   */
1213
  private void _handleEndTag(TagElement tag)
1214
  {
1215
    if (validator.closeTag(tag))
1216
       _handleEndTag_remaining(tag);
1217
  }
1218
 
1219
  /**
1220
   * Actions that are also required if the closing action was
1221
   * initiated by the tag validator.
1222
   * Package-private to avoid an accessor method.
1223
   */
1224
  void _handleEndTag_remaining(TagElement tag)
1225
  {
1226
    HTML.Tag h = tag.getHTMLTag();
1227
 
1228
    handleEndTag(tag);
1229
    endTag(tag.fictional());
1230
 
1231
    if (h.isPreformatted())
1232
      preformatted--;
1233
    if (preformatted < 0)
1234
      preformatted = 0;
1235
 
1236
    // When a block tag is closed, consume whitespace that follows after
1237
    // it.
1238
    if (isBlock(h))
1239
      optional(WS);
1240
 
1241
    if (h == HTML.Tag.TITLE)
1242
      {
1243
        titleOpen = false;
1244
        titleHandled = true;
1245
 
1246
        char[] a = new char[ title.length() ];
1247
        title.getChars(0, a.length, a, 0);
1248
        handleTitle(a);
1249
      }
1250
  }
1251
 
1252
  /**
1253
   * A hooks for operations, preceeding call to handleStartTag().
1254
   * The method is called when the HTML opening tag ((like &lt;table&gt;)
1255
   * is found.
1256
   * Package-private to avoid an accessor method.
1257
   * @param tag The tag
1258
   */
1259
  void _handleStartTag(TagElement tag)
1260
  {
1261
    validator.openTag(tag, attributes);
1262
    startingTag(tag);
1263
    handleStartTag(tag);
1264
 
1265
    HTML.Tag h = tag.getHTMLTag();
1266
 
1267
    if (isBlock(h))
1268
      optional(WS);
1269
 
1270
    if (h.isPreformatted())
1271
      preformatted++;
1272
 
1273
    if (h == HTML.Tag.TITLE)
1274
      {
1275
        if (titleHandled)
1276
          error("Repetetive <TITLE> tag");
1277
        titleOpen = true;
1278
        titleHandled = false;
1279
      }
1280
  }
1281
 
1282
  /**
1283
   * Resume parsing after heavy errors in HTML tag structure.
1284
   * @throws ParseException
1285
   */
1286
  private void forciblyCloseTheTag()
1287
                            throws ParseException
1288
  {
1289
    int closeAt = 0;
1290
    buffer.setLength(0);
1291
 
1292
    ahead:
1293
    for (int i = 1; i < 100; i++)
1294
      {
1295
        t = getTokenAhead(i - 1);
1296
        if (t.kind == EOF || t.kind == BEGIN)
1297
          break ahead;
1298
        if (t.kind == END)
1299
          {
1300
            /* Closing '>' found. */
1301
            closeAt = i;
1302
            break ahead;
1303
          }
1304
      }
1305
    if (closeAt > 0)
1306
      {
1307
        buffer.append("Ignoring '");
1308
        for (int i = 1; i <= closeAt; i++)
1309
          {
1310
            t = getNextToken();
1311
            append(t);
1312
          }
1313
        buffer.append('\'');
1314
        error(buffer.toString());
1315
      }
1316
  }
1317
 
1318
  /**
1319
   * Handle comment in string buffer. You can avoid allocating a char
1320
   * array each time by processing your comment directly here.
1321
   */
1322
  private void handleComment()
1323
  {
1324
    char[] a = new char[ buffer.length() ];
1325
    buffer.getChars(0, a.length, a, 0);
1326
    handleComment(a);
1327
  }
1328
 
1329
  private TagElement makeTagElement(String name, boolean isSupposed)
1330
  {
1331
    Element e = dtd.elementHash.get(name.toLowerCase());
1332
    if (e == null)
1333
      {
1334
        error("Unknown tag <" + name + ">");
1335
        e = dtd.getElement(name);
1336
        e.name = name.toUpperCase();
1337
        e.index = -1;
1338
      }
1339
 
1340
    if (!documentTags.contains(e.name))
1341
      {
1342
        markFirstTime(e);
1343
        documentTags.add(e.name);
1344
      }
1345
 
1346
    return makeTag(e, isSupposed);
1347
  }
1348
 
1349
  /**
1350
   * Read till the given token, resolving entities. Consume the given
1351
   * token without adding it to buffer.
1352
   * @param till The token to read till
1353
   * @throws ParseException
1354
   */
1355
  private void readTillTokenE(int till)
1356
                       throws ParseException
1357
  {
1358
    buffer.setLength(0);
1359
    read:
1360
    while (true)
1361
      {
1362
        t = getNextToken();
1363
        if (t.kind == Constants.ENTITY)
1364
          {
1365
            resolveAndAppendEntity(t);
1366
          }
1367
        else if (t.kind == EOF)
1368
          {
1369
            error("unexpected eof", t);
1370
            break read;
1371
          }
1372
        else if (t.kind == till)
1373
          break read;
1374
        else if (t.kind == WS)
1375
          {
1376
            // Processing whitespace in accordance with CDATA rules:
1377
            String s = t.getImage();
1378
            char c;
1379
            for (int i = 0; i < s.length(); i++)
1380
              {
1381
                c = s.charAt(i);
1382
                if (c == '\r')
1383
                  buffer.append(' '); // CR replaced by space
1384
                else if (c == '\n')
1385
                  { /* LF ignored */ }
1386
                else if (c == '\t')
1387
                  buffer.append(' '); // Tab replaced by space
1388
                else
1389
                  buffer.append(c);
1390
              }
1391
          }
1392
        else
1393
          append(t);
1394
      }
1395
  }
1396
 
1397
  /**
1398
   * Resolve the entity and append it to the end of buffer.
1399
   * @param entity
1400
   */
1401
  private void resolveAndAppendEntity(Token entity)
1402
  {
1403
    switch (entity.category)
1404
      {
1405
        case ENTITY_NAMED :
1406
          buffer.append(resolveNamedEntity(entity.getImage()));
1407
          break;
1408
 
1409
        case ENTITY_NUMERIC :
1410
          buffer.append(resolveNumericEntity(entity.getImage()));
1411
          break;
1412
 
1413
        default :
1414
          throw new AssertionError("Invalid entity category " +
1415
                                   entity.category
1416
                                  );
1417
      }
1418
  }
1419
 
1420
  /**
1421
   * Handle the remaining of HTML tags. This is a common end for
1422
   * TAG, SCRIPT and STYLE.
1423
   * @param closing True for closing tags ( &lt;/TAG&gt; ).
1424
   * @param name Name of element
1425
   * @param start Token where element has started
1426
   * @throws ParseException
1427
   */
1428
  private void restOfTag(boolean closing, Token name, Token start)
1429
                  throws ParseException
1430
  {
1431
    boolean end = false;
1432
    Token next;
1433
 
1434
    optional(WS);
1435
 
1436
    readAttributes(name.getImage());
1437
 
1438
    optional(WS);
1439
 
1440
    next = getTokenAhead();
1441
    if (next.kind == END)
1442
      {
1443
        mustBe(END);
1444
        end = true;
1445
      }
1446
 
1447
    hTag = new Token(start, next);
1448
 
1449
    if (!end)
1450
      {
1451
        // The tag body contains errors. If additionally the tag
1452
        // name is not valid, this construction is treated as text.
1453
        if (dtd.elementHash.get(name.getImage().toLowerCase()) == null &&
1454
            backupMode
1455
           )
1456
          {
1457
            error("Errors in tag body and unknown tag name. " +
1458
                  "Treating the tag as a text."
1459
                 );
1460
            reset();
1461
 
1462
            hTag = mustBe(BEGIN);
1463
            buffer.setLength(0);
1464
            buffer.append(hTag.getImage());
1465
            CDATA(false);
1466
            return;
1467
          }
1468
        else
1469
          {
1470
            error("Forcibly closing invalid parameter list");
1471
            forciblyCloseTheTag();
1472
          }
1473
      }
1474
 
1475
    if (closing)
1476
      {
1477
        endTag(false);
1478
        _handleEndTag(makeTagElement(name.getImage(), false));
1479
      }
1480
    else
1481
      {
1482
        TagElement te = makeTagElement(name.getImage(), false);
1483
        if (te.getElement().type == DTDConstants.EMPTY)
1484
          _handleEmptyTag(te);
1485
        else
1486
          {
1487
            // According to the specs we need to consume whitespace following
1488
            // immediately after a opening tag.
1489
            optional(WS);
1490
            _handleStartTag(te);
1491
          }
1492
      }
1493
  }
1494
 
1495
  /**
1496
   * This should fire additional actions in response to the
1497
   * ChangedCharSetException.  The current implementation
1498
   * does nothing.
1499
   * @param tag
1500
   */
1501
  private void startingTag(TagElement tag)
1502
  {
1503
    try
1504
      {
1505
        startTag(tag);
1506
      }
1507
    catch (ChangedCharSetException cax)
1508
      {
1509
        error("Invalid change of charset");
1510
      }
1511
  }
1512
 
1513
  private void ws_error()
1514
  {
1515
    error("Whitespace here is not permitted");
1516
  }
1517
 
1518
  /**
1519
   * Returns true when the specified tag should be considered a block tag
1520
   * wrt whitespace handling. We need this special handling, since there
1521
   * are a couple of tags that we must treat as block tags but which aren't
1522
   * officially block tags.
1523
   *
1524
   * @param tag the tag to check
1525
   * @return true when the specified tag should be considered a block tag
1526
   *         wrt whitespace handling
1527
   */
1528
  private boolean isBlock(HTML.Tag tag)
1529
  {
1530
    return tag.isBlock() || tag == HTML.Tag.STYLE || tag == HTML.Tag.FRAME;
1531
  }
1532
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.