OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [exp/] [html/] [token.go] - Blame information for rev 868

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 747 jeremybenn
// Copyright 2010 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
 
5
package html
6
 
7
import (
8
        "bytes"
9
        "io"
10
        "strconv"
11
        "strings"
12
)
13
 
14
// A TokenType is the type of a Token.
type TokenType int

const (
	// ErrorToken means that an error occurred during tokenization.
	ErrorToken TokenType = iota
	// TextToken means a text node.
	TextToken
	// A StartTagToken looks like <a>.
	StartTagToken
	// An EndTagToken looks like </a>.
	EndTagToken
	// A SelfClosingTagToken tag looks like <br/>.
	SelfClosingTagToken
	// A CommentToken looks like <!--x-->.
	CommentToken
	// A DoctypeToken looks like <!DOCTYPE x>
	DoctypeToken
)

// String returns a string representation of the TokenType. Values outside
// the declared constants are rendered as "Invalid(n)", where n is the
// decimal value of t.
func (t TokenType) String() string {
	switch t {
	case ErrorToken:
		return "Error"
	case TextToken:
		return "Text"
	case StartTagToken:
		return "StartTag"
	case EndTagToken:
		return "EndTag"
	case SelfClosingTagToken:
		return "SelfClosingTag"
	case CommentToken:
		return "Comment"
	case DoctypeToken:
		return "Doctype"
	}
	return "Invalid(" + strconv.Itoa(int(t)) + ")"
}
54
 
55
// An Attribute is an attribute namespace-key-value triple. Namespace is
// non-empty for foreign attributes like xlink, Key is alphabetic (and hence
// does not contain escapable characters like '&', '<' or '>'), and Val is
// unescaped (it looks like "a<b" rather than "a&lt;b").
//
// Namespace is only used by the parser, not the tokenizer.
type Attribute struct {
	Namespace, Key, Val string
}
64
 
65
// A Token consists of a TokenType and some Data (tag name for start and end
66
// tags, content for text, comments and doctypes). A tag Token may also contain
67
// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a
68
// rather than "a<b").
69
type Token struct {
70
        Type TokenType
71
        Data string
72
        Attr []Attribute
73
}
74
 
75
// tagString returns a string representation of a tag Token's Data and Attr.
76
func (t Token) tagString() string {
77
        if len(t.Attr) == 0 {
78
                return t.Data
79
        }
80
        buf := bytes.NewBufferString(t.Data)
81
        for _, a := range t.Attr {
82
                buf.WriteByte(' ')
83
                buf.WriteString(a.Key)
84
                buf.WriteString(`="`)
85
                escape(buf, a.Val)
86
                buf.WriteByte('"')
87
        }
88
        return buf.String()
89
}
90
 
91
// String returns a string representation of the Token.
92
func (t Token) String() string {
93
        switch t.Type {
94
        case ErrorToken:
95
                return ""
96
        case TextToken:
97
                return EscapeString(t.Data)
98
        case StartTagToken:
99
                return "<" + t.tagString() + ">"
100
        case EndTagToken:
101
                return ""
102
        case SelfClosingTagToken:
103
                return "<" + t.tagString() + "/>"
104
        case CommentToken:
105
                return ""
106
        case DoctypeToken:
107
                return ""
108
        }
109
        return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
110
}
111
 
112
// span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
// the end is exclusive.
type span struct {
	start, end int
}
117
 
118
// A Tokenizer returns a stream of HTML Tokens.
119
type Tokenizer struct {
120
        // r is the source of the HTML text.
121
        r io.Reader
122
        // tt is the TokenType of the current token.
123
        tt TokenType
124
        // err is the first error encountered during tokenization. It is possible
125
        // for tt != Error && err != nil to hold: this means that Next returned a
126
        // valid token but the subsequent Next call will return an error token.
127
        // For example, if the HTML text input was just "plain", then the first
128
        // Next call would set z.err to io.EOF but return a TextToken, and all
129
        // subsequent Next calls would return an ErrorToken.
130
        // err is never reset. Once it becomes non-nil, it stays non-nil.
131
        err error
132
        // buf[raw.start:raw.end] holds the raw bytes of the current token.
133
        // buf[raw.end:] is buffered input that will yield future tokens.
134
        raw span
135
        buf []byte
136
        // buf[data.start:data.end] holds the raw bytes of the current token's data:
137
        // a text token's text, a tag token's tag name, etc.
138
        data span
139
        // pendingAttr is the attribute key and value currently being tokenized.
140
        // When complete, pendingAttr is pushed onto attr. nAttrReturned is
141
        // incremented on each call to TagAttr.
142
        pendingAttr   [2]span
143
        attr          [][2]span
144
        nAttrReturned int
145
        // rawTag is the "script" in "" that closes the next token. If
146
        // non-empty, the subsequent call to Next will return a raw or RCDATA text
147
        // token: one that treats "

" as text instead of an element.

148
        // rawTag's contents are lower-cased.
149
        rawTag string
150
        // textIsRaw is whether the current text token's data is not escaped.
151
        textIsRaw bool
152
}
153
 
154
// Err returns the error associated with the most recent ErrorToken token.
155
// This is typically io.EOF, meaning the end of tokenization.
156
func (z *Tokenizer) Err() error {
157
        if z.tt != ErrorToken {
158
                return nil
159
        }
160
        return z.err
161
}
162
 
163
// readByte returns the next byte from the input stream, doing a buffered read
164
// from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a contiguous byte
165
// slice that holds all the bytes read so far for the current token.
166
// It sets z.err if the underlying reader returns an error.
167
// Pre-condition: z.err == nil.
168
func (z *Tokenizer) readByte() byte {
169
        if z.raw.end >= len(z.buf) {
170
                // Our buffer is exhausted and we have to read from z.r.
171
                // We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length
172
                // z.raw.end - z.raw.start is more than half the capacity of z.buf, then we
173
                // allocate a new buffer before the copy.
174
                c := cap(z.buf)
175
                d := z.raw.end - z.raw.start
176
                var buf1 []byte
177
                if 2*d > c {
178
                        buf1 = make([]byte, d, 2*c)
179
                } else {
180
                        buf1 = z.buf[:d]
181
                }
182
                copy(buf1, z.buf[z.raw.start:z.raw.end])
183
                if x := z.raw.start; x != 0 {
184
                        // Adjust the data/attr spans to refer to the same contents after the copy.
185
                        z.data.start -= x
186
                        z.data.end -= x
187
                        z.pendingAttr[0].start -= x
188
                        z.pendingAttr[0].end -= x
189
                        z.pendingAttr[1].start -= x
190
                        z.pendingAttr[1].end -= x
191
                        for i := range z.attr {
192
                                z.attr[i][0].start -= x
193
                                z.attr[i][0].end -= x
194
                                z.attr[i][1].start -= x
195
                                z.attr[i][1].end -= x
196
                        }
197
                }
198
                z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d]
199
                // Now that we have copied the live bytes to the start of the buffer,
200
                // we read from z.r into the remainder.
201
                n, err := z.r.Read(buf1[d:cap(buf1)])
202
                if err != nil {
203
                        z.err = err
204
                        return 0
205
                }
206
                z.buf = buf1[:d+n]
207
        }
208
        x := z.buf[z.raw.end]
209
        z.raw.end++
210
        return x
211
}
212
 
213
// skipWhiteSpace skips past any white space.
214
func (z *Tokenizer) skipWhiteSpace() {
215
        if z.err != nil {
216
                return
217
        }
218
        for {
219
                c := z.readByte()
220
                if z.err != nil {
221
                        return
222
                }
223
                switch c {
224
                case ' ', '\n', '\r', '\t', '\f':
225
                        // No-op.
226
                default:
227
                        z.raw.end--
228
                        return
229
                }
230
        }
231
}
232
 
233
// readRawOrRCDATA reads until the next "", where "foo" is z.rawTag and
234
// is typically something like "script" or "textarea".
235
func (z *Tokenizer) readRawOrRCDATA() {
236
loop:
237
        for {
238
                c := z.readByte()
239
                if z.err != nil {
240
                        break loop
241
                }
242
                if c != '<' {
243
                        continue loop
244
                }
245
                c = z.readByte()
246
                if z.err != nil {
247
                        break loop
248
                }
249
                if c != '/' {
250
                        continue loop
251
                }
252
                for i := 0; i < len(z.rawTag); i++ {
253
                        c = z.readByte()
254
                        if z.err != nil {
255
                                break loop
256
                        }
257
                        if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {
258
                                continue loop
259
                        }
260
                }
261
                c = z.readByte()
262
                if z.err != nil {
263
                        break loop
264
                }
265
                switch c {
266
                case ' ', '\n', '\r', '\t', '\f', '/', '>':
267
                        // The 3 is 2 for the leading "
268
                        z.raw.end -= 3 + len(z.rawTag)
269
                        break loop
270
                case '<':
271
                        // Step back one, to catch "".
272
                        z.raw.end--
273
                }
274
        }
275
        z.data.end = z.raw.end
276
        // A textarea's or title's RCDATA can contain escaped entities.
277
        z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title"
278
        z.rawTag = ""
279
}
280
 
281
// readComment reads the next comment token starting with ".
288
                        z.data.end = z.data.start
289
                }
290
        }()
291
        for dashCount := 2; ; {
292
                c := z.readByte()
293
                if z.err != nil {
294
                        // Ignore up to two dashes at EOF.
295
                        if dashCount > 2 {
296
                                dashCount = 2
297
                        }
298
                        z.data.end = z.raw.end - dashCount
299
                        return
300
                }
301
                switch c {
302
                case '-':
303
                        dashCount++
304
                        continue
305
                case '>':
306
                        if dashCount >= 2 {
307
                                z.data.end = z.raw.end - len("-->")
308
                                return
309
                        }
310
                case '!':
311
                        if dashCount >= 2 {
312
                                c = z.readByte()
313
                                if z.err != nil {
314
                                        z.data.end = z.raw.end
315
                                        return
316
                                }
317
                                if c == '>' {
318
                                        z.data.end = z.raw.end - len("--!>")
319
                                        return
320
                                }
321
                        }
322
                }
323
                dashCount = 0
324
        }
325
}
326
 
327
// readUntilCloseAngle reads until the next ">".
328
func (z *Tokenizer) readUntilCloseAngle() {
329
        z.data.start = z.raw.end
330
        for {
331
                c := z.readByte()
332
                if z.err != nil {
333
                        z.data.end = z.raw.end
334
                        return
335
                }
336
                if c == '>' {
337
                        z.data.end = z.raw.end - len(">")
338
                        return
339
                }
340
        }
341
}
342
 
343
// readMarkupDeclaration reads the next token starting with "
344
// a "", a "", or "
345
// "
346
func (z *Tokenizer) readMarkupDeclaration() TokenType {
347
        z.data.start = z.raw.end
348
        var c [2]byte
349
        for i := 0; i < 2; i++ {
350
                c[i] = z.readByte()
351
                if z.err != nil {
352
                        z.data.end = z.raw.end
353
                        return CommentToken
354
                }
355
        }
356
        if c[0] == '-' && c[1] == '-' {
357
                z.readComment()
358
                return CommentToken
359
        }
360
        z.raw.end -= 2
361
        const s = "DOCTYPE"
362
        for i := 0; i < len(s); i++ {
363
                c := z.readByte()
364
                if z.err != nil {
365
                        z.data.end = z.raw.end
366
                        return CommentToken
367
                }
368
                if c != s[i] && c != s[i]+('a'-'A') {
369
                        // Back up to read the fragment of "DOCTYPE" again.
370
                        z.raw.end = z.data.start
371
                        z.readUntilCloseAngle()
372
                        return CommentToken
373
                }
374
        }
375
        if z.skipWhiteSpace(); z.err != nil {
376
                z.data.start = z.raw.end
377
                z.data.end = z.raw.end
378
                return DoctypeToken
379
        }
380
        z.readUntilCloseAngle()
381
        return DoctypeToken
382
}
383
 
384
// startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end]
385
// case-insensitively matches any element of ss.
386
func (z *Tokenizer) startTagIn(ss ...string) bool {
387
loop:
388
        for _, s := range ss {
389
                if z.data.end-z.data.start != len(s) {
390
                        continue loop
391
                }
392
                for i := 0; i < len(s); i++ {
393
                        c := z.buf[z.data.start+i]
394
                        if 'A' <= c && c <= 'Z' {
395
                                c += 'a' - 'A'
396
                        }
397
                        if c != s[i] {
398
                                continue loop
399
                        }
400
                }
401
                return true
402
        }
403
        return false
404
}
405
 
406
// readStartTag reads the next start tag token. The opening "
407
// been consumed, where 'a' means anything in [A-Za-z].
408
func (z *Tokenizer) readStartTag() TokenType {
409
        z.attr = z.attr[:0]
410
        z.nAttrReturned = 0
411
        // Read the tag name and attribute key/value pairs.
412
        z.readTagName()
413
        if z.skipWhiteSpace(); z.err != nil {
414
                return ErrorToken
415
        }
416
        for {
417
                c := z.readByte()
418
                if z.err != nil || c == '>' {
419
                        break
420
                }
421
                z.raw.end--
422
                z.readTagAttrKey()
423
                z.readTagAttrVal()
424
                // Save pendingAttr if it has a non-empty key.
425
                if z.pendingAttr[0].start != z.pendingAttr[0].end {
426
                        z.attr = append(z.attr, z.pendingAttr)
427
                }
428
                if z.skipWhiteSpace(); z.err != nil {
429
                        break
430
                }
431
        }
432
        // Several tags flag the tokenizer's next token as raw.
433
        c, raw := z.buf[z.data.start], false
434
        if 'A' <= c && c <= 'Z' {
435
                c += 'a' - 'A'
436
        }
437
        switch c {
438
        case 'i':
439
                raw = z.startTagIn("iframe")
440
        case 'n':
441
                raw = z.startTagIn("noembed", "noframes", "noscript")
442
        case 'p':
443
                raw = z.startTagIn("plaintext")
444
        case 's':
445
                raw = z.startTagIn("script", "style")
446
        case 't':
447
                raw = z.startTagIn("textarea", "title")
448
        case 'x':
449
                raw = z.startTagIn("xmp")
450
        }
451
        if raw {
452
                z.rawTag = strings.ToLower(string(z.buf[z.data.start:z.data.end]))
453
        }
454
        // Look for a self-closing token like "
".
455
        if z.err == nil && z.buf[z.raw.end-2] == '/' {
456
                return SelfClosingTagToken
457
        }
458
        return StartTagToken
459
}
460
 
461
// readEndTag reads the next end tag token. The opening "
462
// been consumed, where 'a' means anything in [A-Za-z].
463
func (z *Tokenizer) readEndTag() {
464
        z.attr = z.attr[:0]
465
        z.nAttrReturned = 0
466
        z.readTagName()
467
        for {
468
                c := z.readByte()
469
                if z.err != nil || c == '>' {
470
                        return
471
                }
472
        }
473
}
474
 
475
// readTagName sets z.data to the "div" in "
". The reader (z.raw.end)
476
// is positioned such that the first byte of the tag name (the "d" in "
477
// has already been consumed.
478
func (z *Tokenizer) readTagName() {
479
        z.data.start = z.raw.end - 1
480
        for {
481
                c := z.readByte()
482
                if z.err != nil {
483
                        z.data.end = z.raw.end
484
                        return
485
                }
486
                switch c {
487
                case ' ', '\n', '\r', '\t', '\f':
488
                        z.data.end = z.raw.end - 1
489
                        return
490
                case '/', '>':
491
                        z.raw.end--
492
                        z.data.end = z.raw.end
493
                        return
494
                }
495
        }
496
}
497
 
498
// readTagAttrKey sets z.pendingAttr[0] to the "k" in "
".
499
// Precondition: z.err == nil.
500
func (z *Tokenizer) readTagAttrKey() {
501
        z.pendingAttr[0].start = z.raw.end
502
        for {
503
                c := z.readByte()
504
                if z.err != nil {
505
                        z.pendingAttr[0].end = z.raw.end
506
                        return
507
                }
508
                switch c {
509
                case ' ', '\n', '\r', '\t', '\f', '/':
510
                        z.pendingAttr[0].end = z.raw.end - 1
511
                        return
512
                case '=', '>':
513
                        z.raw.end--
514
                        z.pendingAttr[0].end = z.raw.end
515
                        return
516
                }
517
        }
518
}
519
 
520
// readTagAttrVal sets z.pendingAttr[1] to the "v" in "
".
521
func (z *Tokenizer) readTagAttrVal() {
522
        z.pendingAttr[1].start = z.raw.end
523
        z.pendingAttr[1].end = z.raw.end
524
        if z.skipWhiteSpace(); z.err != nil {
525
                return
526
        }
527
        c := z.readByte()
528
        if z.err != nil {
529
                return
530
        }
531
        if c != '=' {
532
                z.raw.end--
533
                return
534
        }
535
        if z.skipWhiteSpace(); z.err != nil {
536
                return
537
        }
538
        quote := z.readByte()
539
        if z.err != nil {
540
                return
541
        }
542
        switch quote {
543
        case '>':
544
                z.raw.end--
545
                return
546
 
547
        case '\'', '"':
548
                z.pendingAttr[1].start = z.raw.end
549
                for {
550
                        c := z.readByte()
551
                        if z.err != nil {
552
                                z.pendingAttr[1].end = z.raw.end
553
                                return
554
                        }
555
                        if c == quote {
556
                                z.pendingAttr[1].end = z.raw.end - 1
557
                                return
558
                        }
559
                }
560
 
561
        default:
562
                z.pendingAttr[1].start = z.raw.end - 1
563
                for {
564
                        c := z.readByte()
565
                        if z.err != nil {
566
                                z.pendingAttr[1].end = z.raw.end
567
                                return
568
                        }
569
                        switch c {
570
                        case ' ', '\n', '\r', '\t', '\f':
571
                                z.pendingAttr[1].end = z.raw.end - 1
572
                                return
573
                        case '>':
574
                                z.raw.end--
575
                                z.pendingAttr[1].end = z.raw.end
576
                                return
577
                        }
578
                }
579
        }
580
}
581
 
582
// Next scans the next token and returns its type.
583
func (z *Tokenizer) Next() TokenType {
584
        if z.err != nil {
585
                z.tt = ErrorToken
586
                return z.tt
587
        }
588
        z.raw.start = z.raw.end
589
        z.data.start = z.raw.end
590
        z.data.end = z.raw.end
591
        if z.rawTag != "" {
592
                if z.rawTag == "plaintext" {
593
                        // Read everything up to EOF.
594
                        for z.err == nil {
595
                                z.readByte()
596
                        }
597
                        z.textIsRaw = true
598
                } else {
599
                        z.readRawOrRCDATA()
600
                }
601
                if z.data.end > z.data.start {
602
                        z.tt = TextToken
603
                        return z.tt
604
                }
605
        }
606
        z.textIsRaw = false
607
 
608
loop:
609
        for {
610
                c := z.readByte()
611
                if z.err != nil {
612
                        break loop
613
                }
614
                if c != '<' {
615
                        continue loop
616
                }
617
 
618
                // Check if the '<' we have just read is part of a tag, comment
619
                // or doctype. If not, it's part of the accumulated text token.
620
                c = z.readByte()
621
                if z.err != nil {
622
                        break loop
623
                }
624
                var tokenType TokenType
625
                switch {
626
                case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
627
                        tokenType = StartTagToken
628
                case c == '/':
629
                        tokenType = EndTagToken
630
                case c == '!' || c == '?':
631
                        // We use CommentToken to mean any of "",
632
                        // "" and "".
633
                        tokenType = CommentToken
634
                default:
635
                        continue
636
                }
637
 
638
                // We have a non-text token, but we might have accumulated some text
639
                // before that. If so, we return the text first, and return the non-
640
                // text token on the subsequent call to Next.
641
                if x := z.raw.end - len("
642
                        z.raw.end = x
643
                        z.data.end = x
644
                        z.tt = TextToken
645
                        return z.tt
646
                }
647
                switch tokenType {
648
                case StartTagToken:
649
                        z.tt = z.readStartTag()
650
                        return z.tt
651
                case EndTagToken:
652
                        c = z.readByte()
653
                        if z.err != nil {
654
                                break loop
655
                        }
656
                        if c == '>' {
657
                                // "" does not generate a token at all.
658
                                // Reset the tokenizer state and start again.
659
                                z.raw.start = z.raw.end
660
                                z.data.start = z.raw.end
661
                                z.data.end = z.raw.end
662
                                continue loop
663
                        }
664
                        if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
665
                                z.readEndTag()
666
                                z.tt = EndTagToken
667
                                return z.tt
668
                        }
669
                        z.raw.end--
670
                        z.readUntilCloseAngle()
671
                        z.tt = CommentToken
672
                        return z.tt
673
                case CommentToken:
674
                        if c == '!' {
675
                                z.tt = z.readMarkupDeclaration()
676
                                return z.tt
677
                        }
678
                        z.raw.end--
679
                        z.readUntilCloseAngle()
680
                        z.tt = CommentToken
681
                        return z.tt
682
                }
683
        }
684
        if z.raw.start < z.raw.end {
685
                z.data.end = z.raw.end
686
                z.tt = TextToken
687
                return z.tt
688
        }
689
        z.tt = ErrorToken
690
        return z.tt
691
}
692
 
693
// Raw returns the unmodified text of the current token. Calling Next, Token,
694
// Text, TagName or TagAttr may change the contents of the returned slice.
695
func (z *Tokenizer) Raw() []byte {
696
        return z.buf[z.raw.start:z.raw.end]
697
}
698
 
699
// Text returns the unescaped text of a text, comment or doctype token. The
700
// contents of the returned slice may change on the next call to Next.
701
func (z *Tokenizer) Text() []byte {
702
        switch z.tt {
703
        case TextToken, CommentToken, DoctypeToken:
704
                s := z.buf[z.data.start:z.data.end]
705
                z.data.start = z.raw.end
706
                z.data.end = z.raw.end
707
                if !z.textIsRaw {
708
                        s = unescape(s)
709
                }
710
                return s
711
        }
712
        return nil
713
}
714
 
715
// TagName returns the lower-cased name of a tag token (the `img` out of
716
// ``) and whether the tag has attributes.
717
// The contents of the returned slice may change on the next call to Next.
718
func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
719
        if z.data.start < z.data.end {
720
                switch z.tt {
721
                case StartTagToken, EndTagToken, SelfClosingTagToken:
722
                        s := z.buf[z.data.start:z.data.end]
723
                        z.data.start = z.raw.end
724
                        z.data.end = z.raw.end
725
                        return lower(s), z.nAttrReturned < len(z.attr)
726
                }
727
        }
728
        return nil, false
729
}
730
 
731
// TagAttr returns the lower-cased key and unescaped value of the next unparsed
732
// attribute for the current tag token and whether there are more attributes.
733
// The contents of the returned slices may change on the next call to Next.
734
func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
735
        if z.nAttrReturned < len(z.attr) {
736
                switch z.tt {
737
                case StartTagToken, SelfClosingTagToken:
738
                        x := z.attr[z.nAttrReturned]
739
                        z.nAttrReturned++
740
                        key = z.buf[x[0].start:x[0].end]
741
                        val = z.buf[x[1].start:x[1].end]
742
                        return lower(key), unescape(val), z.nAttrReturned < len(z.attr)
743
                }
744
        }
745
        return nil, nil, false
746
}
747
 
748
// Token returns the next Token. The result's Data and Attr values remain valid
749
// after subsequent Next calls.
750
func (z *Tokenizer) Token() Token {
751
        t := Token{Type: z.tt}
752
        switch z.tt {
753
        case TextToken, CommentToken, DoctypeToken:
754
                t.Data = string(z.Text())
755
        case StartTagToken, SelfClosingTagToken:
756
                var attr []Attribute
757
                name, moreAttr := z.TagName()
758
                for moreAttr {
759
                        var key, val []byte
760
                        key, val, moreAttr = z.TagAttr()
761
                        attr = append(attr, Attribute{"", string(key), string(val)})
762
                }
763
                t.Data = string(name)
764
                t.Attr = attr
765
        case EndTagToken:
766
                name, _ := z.TagName()
767
                t.Data = string(name)
768
        }
769
        return t
770
}
771
 
772
// NewTokenizer returns a new HTML Tokenizer for the given Reader.
773
// The input is assumed to be UTF-8 encoded.
774
func NewTokenizer(r io.Reader) *Tokenizer {
775
        return &Tokenizer{
776
                r:   r,
777
                buf: make([]byte, 0, 4096),
778
        }
779
}

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.