OpenCores Subversion repository: openrisc
URL: https://opencores.org/ocsvn/openrisc/openrisc/trunk
File: openrisc/trunk/gnu-dev/or1k-gcc/libgo/go/exp/html/token.go (rev 747, author jeremybenn)

// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
        "bytes"
        "io"
        "strconv"
        "strings"
)

// A TokenType is the type of a Token.
type TokenType int

const (
        // ErrorToken means that an error occurred during tokenization.
        ErrorToken TokenType = iota
        // TextToken means a text node.
        TextToken
        // A StartTagToken looks like <a>.
        StartTagToken
        // An EndTagToken looks like </a>.
        EndTagToken
        // A SelfClosingTagToken tag looks like <br/>.
        SelfClosingTagToken
        // A CommentToken looks like <!--x-->.
        CommentToken
        // A DoctypeToken looks like <!DOCTYPE x>
        DoctypeToken
)

// String returns a string representation of the TokenType.
func (t TokenType) String() string {
        switch t {
        case ErrorToken:
                return "Error"
        case TextToken:
                return "Text"
        case StartTagToken:
                return "StartTag"
        case EndTagToken:
                return "EndTag"
        case SelfClosingTagToken:
                return "SelfClosingTag"
        case CommentToken:
                return "Comment"
        case DoctypeToken:
                return "Doctype"
        }
        return "Invalid(" + strconv.Itoa(int(t)) + ")"
}

// An Attribute is an attribute namespace-key-value triple. Namespace is
// non-empty for foreign attributes like xlink, Key is alphabetic (and hence
// does not contain escapable characters like '&', '<' or '>'), and Val is
// unescaped (it looks like "a<b" rather than "a&lt;b").
//
// Namespace is only used by the parser, not the tokenizer.
type Attribute struct {
        Namespace, Key, Val string
}

// A Token consists of a TokenType and some Data (tag name for start and end
// tags, content for text, comments and doctypes). A tag Token may also contain
// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
// rather than "a&lt;b").
type Token struct {
        Type TokenType
        Data string
        Attr []Attribute
}

// tagString returns a string representation of a tag Token's Data and Attr.
func (t Token) tagString() string {
        if len(t.Attr) == 0 {
                return t.Data
        }
        buf := bytes.NewBufferString(t.Data)
        for _, a := range t.Attr {
                buf.WriteByte(' ')
                buf.WriteString(a.Key)
                buf.WriteString(`="`)
                escape(buf, a.Val)
                buf.WriteByte('"')
        }
        return buf.String()
}

// String returns a string representation of the Token.
func (t Token) String() string {
        switch t.Type {
        case ErrorToken:
                return ""
        case TextToken:
                return EscapeString(t.Data)
        case StartTagToken:
                return "<" + t.tagString() + ">"
        case EndTagToken:
                return "</" + t.tagString() + ">"
        case SelfClosingTagToken:
                return "<" + t.tagString() + "/>"
        case CommentToken:
                return "<!--" + t.Data + "-->"
        case DoctypeToken:
                return "<!DOCTYPE " + t.Data + ">"
        }
        return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
}
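
// Illustrative String round-trips (an editor's sketch, not part of the
// original file):
//
//      Token{Type: StartTagToken, Data: "a",
//              Attr: []Attribute{{"", "href", "/"}}}.String() == `<a href="/">`
//      Token{Type: EndTagToken, Data: "a"}.String() == "</a>"
//      Token{Type: CommentToken, Data: "x"}.String() == "<!--x-->"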

// span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
// the end is exclusive.
type span struct {
        start, end int
}

// A Tokenizer returns a stream of HTML Tokens.
type Tokenizer struct {
        // r is the source of the HTML text.
        r io.Reader
        // tt is the TokenType of the current token.
        tt TokenType
        // err is the first error encountered during tokenization. It is possible
        // for tt != Error && err != nil to hold: this means that Next returned a
        // valid token but the subsequent Next call will return an error token.
        // For example, if the HTML text input was just "plain", then the first
        // Next call would set z.err to io.EOF but return a TextToken, and all
        // subsequent Next calls would return an ErrorToken.
        // err is never reset. Once it becomes non-nil, it stays non-nil.
        err error
        // buf[raw.start:raw.end] holds the raw bytes of the current token.
        // buf[raw.end:] is buffered input that will yield future tokens.
        raw span
        buf []byte
        // buf[data.start:data.end] holds the raw bytes of the current token's data:
        // a text token's text, a tag token's tag name, etc.
        data span
        // pendingAttr is the attribute key and value currently being tokenized.
        // When complete, pendingAttr is pushed onto attr. nAttrReturned is
        // incremented on each call to TagAttr.
        pendingAttr   [2]span
        attr          [][2]span
        nAttrReturned int
        // rawTag is the "script" in "</script>" that closes the next token. If
        // non-empty, the subsequent call to Next will return a raw or RCDATA text
        // token: one that treats "<p>" as text instead of an element.
        // rawTag's contents are lower-cased.
        rawTag string
        // textIsRaw is whether the current text token's data is not escaped.
        textIsRaw bool
}

// Err returns the error associated with the most recent ErrorToken token.
// This is typically io.EOF, meaning the end of tokenization.
func (z *Tokenizer) Err() error {
        if z.tt != ErrorToken {
                return nil
        }
        return z.err
}

// readByte returns the next byte from the input stream, doing a buffered read
// from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a contiguous byte
// slice that holds all the bytes read so far for the current token.
// It sets z.err if the underlying reader returns an error.
// Pre-condition: z.err == nil.
func (z *Tokenizer) readByte() byte {
        if z.raw.end >= len(z.buf) {
                // Our buffer is exhausted and we have to read from z.r.
                // We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length
                // z.raw.end - z.raw.start is more than half the capacity of z.buf, then we
                // allocate a new buffer before the copy.
                c := cap(z.buf)
                d := z.raw.end - z.raw.start
                var buf1 []byte
                if 2*d > c {
                        buf1 = make([]byte, d, 2*c)
                } else {
                        buf1 = z.buf[:d]
                }
                copy(buf1, z.buf[z.raw.start:z.raw.end])
                if x := z.raw.start; x != 0 {
                        // Adjust the data/attr spans to refer to the same contents after the copy.
                        z.data.start -= x
                        z.data.end -= x
                        z.pendingAttr[0].start -= x
                        z.pendingAttr[0].end -= x
                        z.pendingAttr[1].start -= x
                        z.pendingAttr[1].end -= x
                        for i := range z.attr {
                                z.attr[i][0].start -= x
                                z.attr[i][0].end -= x
                                z.attr[i][1].start -= x
                                z.attr[i][1].end -= x
                        }
                }
                z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d]
                // Now that we have copied the live bytes to the start of the buffer,
                // we read from z.r into the remainder.
                n, err := z.r.Read(buf1[d:cap(buf1)])
                if err != nil {
                        z.err = err
                        return 0
                }
                z.buf = buf1[:d+n]
        }
        x := z.buf[z.raw.end]
        z.raw.end++
        return x
}
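
// A worked example of the compaction above (an editor's illustration, not in
// the original file): with cap(z.buf) == 4096, len(z.buf) == 4096 and the
// current token spanning z.raw == span{4000, 4096}, d is 96 and 2*d <= 4096,
// so the 96 live bytes are copied to z.buf[0:96] in place, every recorded
// span shifts left by 4000, and the following Read refills z.buf[96:4096].
// Had the token spanned more than half the buffer, a new buffer of twice the
// old capacity would have been allocated first, so a single token can grow
// beyond the initial buffer size.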

// skipWhiteSpace skips past any white space.
func (z *Tokenizer) skipWhiteSpace() {
        if z.err != nil {
                return
        }
        for {
                c := z.readByte()
                if z.err != nil {
                        return
                }
                switch c {
                case ' ', '\n', '\r', '\t', '\f':
                        // No-op.
                default:
                        z.raw.end--
                        return
                }
        }
}

// readRawOrRCDATA reads until the next "</foo>", where "foo" is z.rawTag and
// is typically something like "script" or "textarea".
func (z *Tokenizer) readRawOrRCDATA() {
loop:
        for {
                c := z.readByte()
                if z.err != nil {
                        break loop
                }
                if c != '<' {
                        continue loop
                }
                c = z.readByte()
                if z.err != nil {
                        break loop
                }
                if c != '/' {
                        continue loop
                }
                for i := 0; i < len(z.rawTag); i++ {
                        c = z.readByte()
                        if z.err != nil {
                                break loop
                        }
                        if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {
                                continue loop
                        }
                }
                c = z.readByte()
                if z.err != nil {
                        break loop
                }
                switch c {
                case ' ', '\n', '\r', '\t', '\f', '/', '>':
                        // The 3 is 2 for the leading "</" plus 1 for the trailing character c.
                        z.raw.end -= 3 + len(z.rawTag)
                        break loop
                case '<':
                        // Step back one, to catch "</foo</foo>".
                        z.raw.end--
                }
        }
        z.data.end = z.raw.end
        // A textarea's or title's RCDATA can contain escaped entities.
        z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title"
        z.rawTag = ""
}
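
// For example (an editor's illustration, not in the original file): after a
// <script> start tag sets rawTag to "script", the input
//      x = '<p>';</script>
// yields a single raw text token "x = '<p>';", because "<p>" is not treated
// as an element inside raw text and the scan only stops at a case-insensitive
// "</script" followed by white space, '/' or '>'.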

// readComment reads the next comment token starting with "<!--". The opening
// "<!--" has already been consumed.
func (z *Tokenizer) readComment() {
        z.data.start = z.raw.end
        defer func() {
                if z.data.end < z.data.start {
                        // It's a comment with no data, like <!-->.
                        z.data.end = z.data.start
                }
        }()
        for dashCount := 2; ; {
                c := z.readByte()
                if z.err != nil {
                        // Ignore up to two dashes at EOF.
                        if dashCount > 2 {
                                dashCount = 2
                        }
                        z.data.end = z.raw.end - dashCount
                        return
                }
                switch c {
                case '-':
                        dashCount++
                        continue
                case '>':
                        if dashCount >= 2 {
                                z.data.end = z.raw.end - len("-->")
                                return
                        }
                case '!':
                        if dashCount >= 2 {
                                c = z.readByte()
                                if z.err != nil {
                                        z.data.end = z.raw.end
                                        return
                                }
                                if c == '>' {
                                        z.data.end = z.raw.end - len("--!>")
                                        return
                                }
                        }
                }
                dashCount = 0
        }
}
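
// Illustrative comment forms (an editor's sketch, not in the original file):
//      "<!--a-->"  -> data "a"
//      "<!---->"   -> data ""
//      "<!--a--!>" -> data "a" (the "--!>" close is accepted)
//      "<!--a--"   -> data "a" at EOF (up to two trailing dashes are ignored)
//      "<!-->"     -> data "" (fixed up by the deferred function above)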

// readUntilCloseAngle reads until the next ">".
func (z *Tokenizer) readUntilCloseAngle() {
        z.data.start = z.raw.end
        for {
                c := z.readByte()
                if z.err != nil {
                        z.data.end = z.raw.end
                        return
                }
                if c == '>' {
                        z.data.end = z.raw.end - len(">")
                        return
                }
        }
}

// readMarkupDeclaration reads the next token starting with "<!". It might be
// a "<!--comment-->", a "<!DOCTYPE foo>", or "<!a bogus comment". The opening
// "<!" has already been consumed.
func (z *Tokenizer) readMarkupDeclaration() TokenType {
        z.data.start = z.raw.end
        var c [2]byte
        for i := 0; i < 2; i++ {
                c[i] = z.readByte()
                if z.err != nil {
                        z.data.end = z.raw.end
                        return CommentToken
                }
        }
        if c[0] == '-' && c[1] == '-' {
                z.readComment()
                return CommentToken
        }
        z.raw.end -= 2
        const s = "DOCTYPE"
        for i := 0; i < len(s); i++ {
                c := z.readByte()
                if z.err != nil {
                        z.data.end = z.raw.end
                        return CommentToken
                }
                if c != s[i] && c != s[i]+('a'-'A') {
                        // Back up to read the fragment of "DOCTYPE" again.
                        z.raw.end = z.data.start
                        z.readUntilCloseAngle()
                        return CommentToken
                }
        }
        if z.skipWhiteSpace(); z.err != nil {
                z.data.start = z.raw.end
                z.data.end = z.raw.end
                return DoctypeToken
        }
        z.readUntilCloseAngle()
        return DoctypeToken
}
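
// Dispatch examples for input after "<!" (an editor's sketch, not in the
// original file):
//      "<!--c-->"        -> CommentToken, data "c"
//      "<!DOCTYPE html>" -> DoctypeToken, data "html"
//      "<!doctype html>" -> DoctypeToken (the match is ASCII case-insensitive)
//      "<!bogus>"        -> CommentToken, data "bogus"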

// startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end]
// case-insensitively matches any element of ss.
func (z *Tokenizer) startTagIn(ss ...string) bool {
loop:
        for _, s := range ss {
                if z.data.end-z.data.start != len(s) {
                        continue loop
                }
                for i := 0; i < len(s); i++ {
                        c := z.buf[z.data.start+i]
                        if 'A' <= c && c <= 'Z' {
                                c += 'a' - 'A'
                        }
                        if c != s[i] {
                                continue loop
                        }
                }
                return true
        }
        return false
}

// readStartTag reads the next start tag token. The opening "<a" has already
// been consumed, where 'a' means anything in [A-Za-z].
func (z *Tokenizer) readStartTag() TokenType {
        z.attr = z.attr[:0]
        z.nAttrReturned = 0
        // Read the tag name and attribute key/value pairs.
        z.readTagName()
        if z.skipWhiteSpace(); z.err != nil {
                return ErrorToken
        }
        for {
                c := z.readByte()
                if z.err != nil || c == '>' {
                        break
                }
                z.raw.end--
                z.readTagAttrKey()
                z.readTagAttrVal()
                // Save pendingAttr if it has a non-empty key.
                if z.pendingAttr[0].start != z.pendingAttr[0].end {
                        z.attr = append(z.attr, z.pendingAttr)
                }
                if z.skipWhiteSpace(); z.err != nil {
                        break
                }
        }
        // Several tags flag the tokenizer's next token as raw.
        c, raw := z.buf[z.data.start], false
        if 'A' <= c && c <= 'Z' {
                c += 'a' - 'A'
        }
        switch c {
        case 'i':
                raw = z.startTagIn("iframe")
        case 'n':
                raw = z.startTagIn("noembed", "noframes", "noscript")
        case 'p':
                raw = z.startTagIn("plaintext")
        case 's':
                raw = z.startTagIn("script", "style")
        case 't':
                raw = z.startTagIn("textarea", "title")
        case 'x':
                raw = z.startTagIn("xmp")
        }
        if raw {
                z.rawTag = strings.ToLower(string(z.buf[z.data.start:z.data.end]))
        }
        // Look for a self-closing token like "<br/>".
        if z.err == nil && z.buf[z.raw.end-2] == '/' {
                return SelfClosingTagToken
        }
        return StartTagToken
}
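
// Self-closing detection above inspects the byte just before the closing '>'
// (an editor's note, not in the original file):
//      "<br/>" and "<br />" -> SelfClosingTagToken
//      "<br>"  and "<br >"  -> StartTagToken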

// readEndTag reads the next end tag token. The opening "</a" has already
// been consumed, where 'a' means anything in [A-Za-z].
func (z *Tokenizer) readEndTag() {
        z.attr = z.attr[:0]
        z.nAttrReturned = 0
        z.readTagName()
        for {
                c := z.readByte()
                if z.err != nil || c == '>' {
                        return
                }
        }
}

// readTagName sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)
// is positioned such that the first byte of the tag name (the "d" in "<div")
// has already been consumed.
func (z *Tokenizer) readTagName() {
        z.data.start = z.raw.end - 1
        for {
                c := z.readByte()
                if z.err != nil {
                        z.data.end = z.raw.end
                        return
                }
                switch c {
                case ' ', '\n', '\r', '\t', '\f':
                        z.data.end = z.raw.end - 1
                        return
                case '/', '>':
                        z.raw.end--
                        z.data.end = z.raw.end
                        return
                }
        }
}

// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<div k=v>".
// Precondition: z.err == nil.
func (z *Tokenizer) readTagAttrKey() {
        z.pendingAttr[0].start = z.raw.end
        for {
                c := z.readByte()
                if z.err != nil {
                        z.pendingAttr[0].end = z.raw.end
                        return
                }
                switch c {
                case ' ', '\n', '\r', '\t', '\f', '/':
                        z.pendingAttr[0].end = z.raw.end - 1
                        return
                case '=', '>':
                        z.raw.end--
                        z.pendingAttr[0].end = z.raw.end
                        return
                }
        }
}

// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<div k=v>".
func (z *Tokenizer) readTagAttrVal() {
        z.pendingAttr[1].start = z.raw.end
        z.pendingAttr[1].end = z.raw.end
        if z.skipWhiteSpace(); z.err != nil {
                return
        }
        c := z.readByte()
        if z.err != nil {
                return
        }
        if c != '=' {
                z.raw.end--
                return
        }
        if z.skipWhiteSpace(); z.err != nil {
                return
        }
        quote := z.readByte()
        if z.err != nil {
                return
        }
        switch quote {
        case '>':
                z.raw.end--
                return

        case '\'', '"':
                z.pendingAttr[1].start = z.raw.end
                for {
                        c := z.readByte()
                        if z.err != nil {
                                z.pendingAttr[1].end = z.raw.end
                                return
                        }
                        if c == quote {
                                z.pendingAttr[1].end = z.raw.end - 1
                                return
                        }
                }

        default:
                z.pendingAttr[1].start = z.raw.end - 1
                for {
                        c := z.readByte()
                        if z.err != nil {
                                z.pendingAttr[1].end = z.raw.end
                                return
                        }
                        switch c {
                        case ' ', '\n', '\r', '\t', '\f':
                                z.pendingAttr[1].end = z.raw.end - 1
                                return
                        case '>':
                                z.raw.end--
                                z.pendingAttr[1].end = z.raw.end
                                return
                        }
                }
        }
}
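
// Attribute-value forms handled above (an editor's sketch, not in the
// original file):
//      <div k="v"> or <div k='v'> -> raw value "v" (quoted branch)
//      <div k=v>                  -> raw value "v" (unquoted, ends at white space or '>')
//      <div k>                    -> empty value (no '=' follows the key)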

// Next scans the next token and returns its type.
func (z *Tokenizer) Next() TokenType {
        if z.err != nil {
                z.tt = ErrorToken
                return z.tt
        }
        z.raw.start = z.raw.end
        z.data.start = z.raw.end
        z.data.end = z.raw.end
        if z.rawTag != "" {
                if z.rawTag == "plaintext" {
                        // Read everything up to EOF.
                        for z.err == nil {
                                z.readByte()
                        }
                        z.textIsRaw = true
                } else {
                        z.readRawOrRCDATA()
                }
                if z.data.end > z.data.start {
                        z.tt = TextToken
                        return z.tt
                }
        }
        z.textIsRaw = false

loop:
        for {
                c := z.readByte()
                if z.err != nil {
                        break loop
                }
                if c != '<' {
                        continue loop
                }

                // Check if the '<' we have just read is part of a tag, comment
                // or doctype. If not, it's part of the accumulated text token.
                c = z.readByte()
                if z.err != nil {
                        break loop
                }
                var tokenType TokenType
                switch {
                case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
                        tokenType = StartTagToken
                case c == '/':
                        tokenType = EndTagToken
                case c == '!' || c == '?':
                        // We use CommentToken to mean any of "<!--actual comments-->",
                        // "<!DOCTYPE declarations>" and "<?xml processing instructions?>".
                        tokenType = CommentToken
                default:
                        continue
                }

                // We have a non-text token, but we might have accumulated some text
                // before that. If so, we return the text first, and return the non-
                // text token on the subsequent call to Next.
                if x := z.raw.end - len("<a"); z.raw.start < x {
                        z.raw.end = x
                        z.data.end = x
                        z.tt = TextToken
                        return z.tt
                }
                switch tokenType {
                case StartTagToken:
                        z.tt = z.readStartTag()
                        return z.tt
                case EndTagToken:
                        c = z.readByte()
                        if z.err != nil {
                                break loop
                        }
                        if c == '>' {
                                // "</>" does not generate a token at all.
                                // Reset the tokenizer state and start again.
                                z.raw.start = z.raw.end
                                z.data.start = z.raw.end
                                z.data.end = z.raw.end
                                continue loop
                        }
                        if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
                                z.readEndTag()
                                z.tt = EndTagToken
                                return z.tt
                        }
                        z.raw.end--
                        z.readUntilCloseAngle()
                        z.tt = CommentToken
                        return z.tt
                case CommentToken:
                        if c == '!' {
                                z.tt = z.readMarkupDeclaration()
                                return z.tt
                        }
                        z.raw.end--
                        z.readUntilCloseAngle()
                        z.tt = CommentToken
                        return z.tt
                }
        }
        if z.raw.start < z.raw.end {
                z.data.end = z.raw.end
                z.tt = TextToken
                return z.tt
        }
        z.tt = ErrorToken
        return z.tt
}
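
// Illustrative Next sequences (an editor's sketch, not in the original file):
//      "a<b>"     -> TextToken "a", then StartTagToken "b": accumulated text
//                    is returned first, the tag on the following call
//      "<a b=c/>" -> SelfClosingTagToken
//      "</>"      -> no token; the tokenizer resets and keeps scanning
//      "x<3"      -> TextToken "x<3": '3' cannot start a tag, so '<' stays text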

// Raw returns the unmodified text of the current token. Calling Next, Token,
// Text, TagName or TagAttr may change the contents of the returned slice.
func (z *Tokenizer) Raw() []byte {
        return z.buf[z.raw.start:z.raw.end]
}

// Text returns the unescaped text of a text, comment or doctype token. The
// contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) Text() []byte {
        switch z.tt {
        case TextToken, CommentToken, DoctypeToken:
                s := z.buf[z.data.start:z.data.end]
                z.data.start = z.raw.end
                z.data.end = z.raw.end
                if !z.textIsRaw {
                        s = unescape(s)
                }
                return s
        }
        return nil
}

// TagName returns the lower-cased name of a tag token (the `img` out of
// `<IMG SRC="foo">`) and whether the tag has attributes.
// The contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
        if z.data.start < z.data.end {
                switch z.tt {
                case StartTagToken, EndTagToken, SelfClosingTagToken:
                        s := z.buf[z.data.start:z.data.end]
                        z.data.start = z.raw.end
                        z.data.end = z.raw.end
                        return lower(s), z.nAttrReturned < len(z.attr)
                }
        }
        return nil, false
}

// TagAttr returns the lower-cased key and unescaped value of the next unparsed
// attribute for the current tag token and whether there are more attributes.
// The contents of the returned slices may change on the next call to Next.
func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
        if z.nAttrReturned < len(z.attr) {
                switch z.tt {
                case StartTagToken, SelfClosingTagToken:
                        x := z.attr[z.nAttrReturned]
                        z.nAttrReturned++
                        key = z.buf[x[0].start:x[0].end]
                        val = z.buf[x[1].start:x[1].end]
                        return lower(key), unescape(val), z.nAttrReturned < len(z.attr)
                }
        }
        return nil, nil, false
}
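
// collectAttrs is an editor's usage sketch (not part of the original file)
// for the allocation-light API above. TagName and TagAttr return sub-slices
// of the Tokenizer's buffer, so anything kept past the next call to Next must
// be copied, here via the string conversions.
func collectAttrs(z *Tokenizer) map[string]string {
        attrs := map[string]string{}
        _, moreAttr := z.TagName()
        for moreAttr {
                var key, val []byte
                key, val, moreAttr = z.TagAttr()
                attrs[string(key)] = string(val)
        }
        return attrs
}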

// Token returns the next Token. The result's Data and Attr values remain valid
// after subsequent Next calls.
func (z *Tokenizer) Token() Token {
        t := Token{Type: z.tt}
        switch z.tt {
        case TextToken, CommentToken, DoctypeToken:
                t.Data = string(z.Text())
        case StartTagToken, SelfClosingTagToken:
                var attr []Attribute
                name, moreAttr := z.TagName()
                for moreAttr {
                        var key, val []byte
                        key, val, moreAttr = z.TagAttr()
                        attr = append(attr, Attribute{"", string(key), string(val)})
                }
                t.Data = string(name)
                t.Attr = attr
        case EndTagToken:
                name, _ := z.TagName()
                t.Data = string(name)
        }
        return t
}

// NewTokenizer returns a new HTML Tokenizer for the given Reader.
// The input is assumed to be UTF-8 encoded.
func NewTokenizer(r io.Reader) *Tokenizer {
        return &Tokenizer{
                r:   r,
                buf: make([]byte, 0, 4096),
        }
}
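
// tokenizeAll is an editor's usage sketch (not part of the original file): it
// drains a Tokenizer, collecting Tokens until an ErrorToken appears. io.EOF
// means the input was simply exhausted; any other error is returned.
func tokenizeAll(r io.Reader) ([]Token, error) {
        z := NewTokenizer(r)
        var tokens []Token
        for z.Next() != ErrorToken {
                tokens = append(tokens, z.Token())
        }
        if err := z.Err(); err != io.EOF {
                return nil, err
        }
        return tokens, nil
}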
