https://opencores.org/ocsvn/openrisc/openrisc/trunk
openrisc/trunk/gnu-dev/or1k-gcc/libgo/go/exp/html/token.go (Rev 747)
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bytes"
	"io"
	"strconv"
	"strings"
)

// A TokenType is the type of a Token.
type TokenType int

const (
	// ErrorToken means that an error occurred during tokenization.
	ErrorToken TokenType = iota
	// TextToken means a text node.
	TextToken
	// A StartTagToken looks like <a>.
	StartTagToken
	// An EndTagToken looks like </a>.
	EndTagToken
	// A SelfClosingTagToken looks like <br/>.
	SelfClosingTagToken
	// A CommentToken looks like <!--x-->.
	CommentToken
	// A DoctypeToken looks like <!DOCTYPE x>.
	DoctypeToken
)

// String returns a string representation of the TokenType.
func (t TokenType) String() string {
	switch t {
	case ErrorToken:
		return "Error"
	case TextToken:
		return "Text"
	case StartTagToken:
		return "StartTag"
	case EndTagToken:
		return "EndTag"
	case SelfClosingTagToken:
		return "SelfClosingTag"
	case CommentToken:
		return "Comment"
	case DoctypeToken:
		return "Doctype"
	}
	return "Invalid(" + strconv.Itoa(int(t)) + ")"
}

// An Attribute is an attribute namespace-key-value triple. Namespace is
// non-empty for foreign attributes like xlink, Key is alphabetic (and hence
// does not contain escapable characters like '&', '<' or '>'), and Val is
// unescaped (it looks like "a<b" rather than "a&lt;b").
//
// Namespace is only used by the parser, not the tokenizer.
type Attribute struct {
	Namespace, Key, Val string
}

// A Token consists of a TokenType and some Data (tag name for start and end
// tags, content for text, comments and doctypes). A tag Token may also contain
// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
// rather than "a&lt;b").
type Token struct {
	Type TokenType
	Data string
	Attr []Attribute
}

// tagString returns a string representation of a tag Token's Data and Attr.
func (t Token) tagString() string {
	if len(t.Attr) == 0 {
		return t.Data
	}
	buf := bytes.NewBufferString(t.Data)
	for _, a := range t.Attr {
		buf.WriteByte(' ')
		buf.WriteString(a.Key)
		buf.WriteString(`="`)
		escape(buf, a.Val)
		buf.WriteByte('"')
	}
	return buf.String()
}

// String returns a string representation of the Token.
func (t Token) String() string {
	switch t.Type {
	case ErrorToken:
		return ""
	case TextToken:
		return EscapeString(t.Data)
	case StartTagToken:
		return "<" + t.tagString() + ">"
	case EndTagToken:
		return "</" + t.tagString() + ">"
	case SelfClosingTagToken:
		return "<" + t.tagString() + "/>"
	case CommentToken:
		return "<!--" + t.Data + "-->"
	case DoctypeToken:
		return "<!DOCTYPE " + t.Data + ">"
	}
	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
}
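// A minimal sketch of how the String methods render a tag token (not part of
// the original file; the values are illustrative only), assuming "fmt" is
// imported by the caller:
//
//	t := Token{
//		Type: StartTagToken,
//		Data: "a",
//		Attr: []Attribute{{Key: "href", Val: "a<b"}},
//	}
//	fmt.Println(t) // prints <a href="a&lt;b">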
// span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
// the end is exclusive.
type span struct {
	start, end int
}

// A Tokenizer returns a stream of HTML Tokens.
type Tokenizer struct {
	// r is the source of the HTML text.
	r io.Reader
	// tt is the TokenType of the current token.
	tt TokenType
	// err is the first error encountered during tokenization. It is possible
	// for tt != Error && err != nil to hold: this means that Next returned a
	// valid token but the subsequent Next call will return an error token.
	// For example, if the HTML text input was just "plain", then the first
	// Next call would set z.err to io.EOF but return a TextToken, and all
	// subsequent Next calls would return an ErrorToken.
	// err is never reset. Once it becomes non-nil, it stays non-nil.
	err error
	// buf[raw.start:raw.end] holds the raw bytes of the current token.
	// buf[raw.end:] is buffered input that will yield future tokens.
	raw span
	buf []byte
	// buf[data.start:data.end] holds the raw bytes of the current token's data:
	// a text token's text, a tag token's tag name, etc.
	data span
	// pendingAttr is the attribute key and value currently being tokenized.
	// When complete, pendingAttr is pushed onto attr. nAttrReturned is
	// incremented on each call to TagAttr.
	pendingAttr   [2]span
	attr          [][2]span
	nAttrReturned int
	// rawTag is the "script" in "</script>" that closes the next token. If
	// non-empty, the subsequent call to Next will return a raw or RCDATA text
	// token: one that treats "<p>" as text instead of an element.
	// rawTag's contents are lower-cased.
	rawTag string
	// textIsRaw is whether the current text token's data is not escaped.
	textIsRaw bool
}

// Err returns the error associated with the most recent ErrorToken token.
// This is typically io.EOF, meaning the end of tokenization.
func (z *Tokenizer) Err() error {
	if z.tt != ErrorToken {
		return nil
	}
	return z.err
}

// readByte returns the next byte from the input stream, doing a buffered read
// from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a contiguous byte
// slice that holds all the bytes read so far for the current token.
// It sets z.err if the underlying reader returns an error.
// Pre-condition: z.err == nil.
func (z *Tokenizer) readByte() byte {
	if z.raw.end >= len(z.buf) {
		// Our buffer is exhausted and we have to read from z.r.
		// We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length
		// z.raw.end - z.raw.start is more than half the capacity of z.buf, then we
		// allocate a new buffer before the copy.
		c := cap(z.buf)
		d := z.raw.end - z.raw.start
		var buf1 []byte
		if 2*d > c {
			buf1 = make([]byte, d, 2*c)
		} else {
			buf1 = z.buf[:d]
		}
		copy(buf1, z.buf[z.raw.start:z.raw.end])
		if x := z.raw.start; x != 0 {
			// Adjust the data/attr spans to refer to the same contents after the copy.
			z.data.start -= x
			z.data.end -= x
			z.pendingAttr[0].start -= x
			z.pendingAttr[0].end -= x
			z.pendingAttr[1].start -= x
			z.pendingAttr[1].end -= x
			for i := range z.attr {
				z.attr[i][0].start -= x
				z.attr[i][0].end -= x
				z.attr[i][1].start -= x
				z.attr[i][1].end -= x
			}
		}
		z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d]
		// Now that we have copied the live bytes to the start of the buffer,
		// we read from z.r into the remainder.
		n, err := z.r.Read(buf1[d:cap(buf1)])
		if err != nil {
			z.err = err
			return 0
		}
		z.buf = buf1[:d+n]
	}
	x := z.buf[z.raw.end]
	z.raw.end++
	return x
}
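// As the err field's comment above notes, a valid token and a pending error
// can coexist. A minimal sketch of that behavior (not part of the original
// file), assuming "strings" is imported by the caller:
//
//	z := NewTokenizer(strings.NewReader("plain"))
//	z.Next() // TextToken: "plain" is returned even though EOF was hit
//	z.Next() // ErrorToken: z.Err() now reports io.EOF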
// skipWhiteSpace skips past any white space.
func (z *Tokenizer) skipWhiteSpace() {
	if z.err != nil {
		return
	}
	for {
		c := z.readByte()
		if z.err != nil {
			return
		}
		switch c {
		case ' ', '\n', '\r', '\t', '\f':
			// No-op.
		default:
			z.raw.end--
			return
		}
	}
}

// readRawOrRCDATA reads until the next "</foo>", where "foo" is z.rawTag and
// is typically something like "script" or "textarea".
func (z *Tokenizer) readRawOrRCDATA() {
loop:
	for {
		c := z.readByte()
		if z.err != nil {
			break loop
		}
		if c != '<' {
			continue loop
		}
		c = z.readByte()
		if z.err != nil {
			break loop
		}
		if c != '/' {
			continue loop
		}
		for i := 0; i < len(z.rawTag); i++ {
			c = z.readByte()
			if z.err != nil {
				break loop
			}
			if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {
				continue loop
			}
		}
		c = z.readByte()
		if z.err != nil {
			break loop
		}
		switch c {
		case ' ', '\n', '\r', '\t', '\f', '/', '>':
			// The 3 is 2 for the leading "</" plus 1 for the trailing character c.
			z.raw.end -= 3 + len(z.rawTag)
			break loop
		case '<':
			// Step back one, to catch "</foo</foo>".
			z.raw.end--
		}
	}
	z.data.end = z.raw.end
	// A textarea's or title's RCDATA can contain escaped entities.
	z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title"
	z.rawTag = ""
}

// readComment reads the next comment token starting with "<!--". The opening
// "<!--" has already been consumed.
func (z *Tokenizer) readComment() {
	z.data.start = z.raw.end
	defer func() {
		if z.data.end < z.data.start {
			// It's a comment with no data, like <!-->.
			z.data.end = z.data.start
		}
	}()
	for dashCount := 2; ; {
		c := z.readByte()
		if z.err != nil {
			// Ignore up to two dashes at EOF.
			if dashCount > 2 {
				dashCount = 2
			}
			z.data.end = z.raw.end - dashCount
			return
		}
		switch c {
		case '-':
			dashCount++
			continue
		case '>':
			if dashCount >= 2 {
				z.data.end = z.raw.end - len("-->")
				return
			}
		case '!':
			if dashCount >= 2 {
				c = z.readByte()
				if z.err != nil {
					z.data.end = z.raw.end
					return
				}
				if c == '>' {
					z.data.end = z.raw.end - len("--!>")
					return
				}
			}
		}
		dashCount = 0
	}
}

// readUntilCloseAngle reads until the next ">".
func (z *Tokenizer) readUntilCloseAngle() {
	z.data.start = z.raw.end
	for {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return
		}
		if c == '>' {
			z.data.end = z.raw.end - len(">")
			return
		}
	}
}

// readMarkupDeclaration reads the next token starting with "<!". It might be
// a "<!--comment-->", a "<!DOCTYPE foo>", or "<!a bogus comment". The opening
// "<!" has already been consumed.
func (z *Tokenizer) readMarkupDeclaration() TokenType {
	z.data.start = z.raw.end
	var c [2]byte
	for i := 0; i < 2; i++ {
		c[i] = z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return CommentToken
		}
	}
	if c[0] == '-' && c[1] == '-' {
		z.readComment()
		return CommentToken
	}
	z.raw.end -= 2
	const s = "DOCTYPE"
	for i := 0; i < len(s); i++ {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return CommentToken
		}
		if c != s[i] && c != s[i]+('a'-'A') {
			// Back up to read the fragment of "DOCTYPE" again.
			z.raw.end = z.data.start
			z.readUntilCloseAngle()
			return CommentToken
		}
	}
	if z.skipWhiteSpace(); z.err != nil {
		z.data.start = z.raw.end
		z.data.end = z.raw.end
		return DoctypeToken
	}
	z.readUntilCloseAngle()
	return DoctypeToken
}

// startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end]
// case-insensitively matches any element of ss.
func (z *Tokenizer) startTagIn(ss ...string) bool {
loop:
	for _, s := range ss {
		if z.data.end-z.data.start != len(s) {
			continue loop
		}
		for i := 0; i < len(s); i++ {
			c := z.buf[z.data.start+i]
			if 'A' <= c && c <= 'Z' {
				c += 'a' - 'A'
			}
			if c != s[i] {
				continue loop
			}
		}
		return true
	}
	return false
}

// readStartTag reads the next start tag token. The opening "<a" has already
// been consumed, where 'a' means anything in [A-Za-z].
func (z *Tokenizer) readStartTag() TokenType {
	z.attr = z.attr[:0]
	z.nAttrReturned = 0
	// Read the tag name and attribute key/value pairs.
	z.readTagName()
	if z.skipWhiteSpace(); z.err != nil {
		return ErrorToken
	}
	for {
		c := z.readByte()
		if z.err != nil || c == '>' {
			break
		}
		z.raw.end--
		z.readTagAttrKey()
		z.readTagAttrVal()
		// Save pendingAttr if it has a non-empty key.
		if z.pendingAttr[0].start != z.pendingAttr[0].end {
			z.attr = append(z.attr, z.pendingAttr)
		}
		if z.skipWhiteSpace(); z.err != nil {
			break
		}
	}
	// Several tags flag the tokenizer's next token as raw.
	c, raw := z.buf[z.data.start], false
	if 'A' <= c && c <= 'Z' {
		c += 'a' - 'A'
	}
	switch c {
	case 'i':
		raw = z.startTagIn("iframe")
	case 'n':
		raw = z.startTagIn("noembed", "noframes", "noscript")
	case 'p':
		raw = z.startTagIn("plaintext")
	case 's':
		raw = z.startTagIn("script", "style")
	case 't':
		raw = z.startTagIn("textarea", "title")
	case 'x':
		raw = z.startTagIn("xmp")
	}
	if raw {
		z.rawTag = strings.ToLower(string(z.buf[z.data.start:z.data.end]))
	}
	// Look for a self-closing token like "<br/>".
	if z.err == nil && z.buf[z.raw.end-2] == '/' {
		return SelfClosingTagToken
	}
	return StartTagToken
}

// readEndTag reads the next end tag token. The opening "</a" has already
// been consumed, where 'a' means anything in [A-Za-z].
func (z *Tokenizer) readEndTag() {
	z.attr = z.attr[:0]
	z.nAttrReturned = 0
	z.readTagName()
	for {
		c := z.readByte()
		if z.err != nil || c == '>' {
			return
		}
	}
}

// readTagName sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)
// is positioned such that the first byte of the tag name (the "d" in "<div")
// has already been consumed.
func (z *Tokenizer) readTagName() {
	z.data.start = z.raw.end - 1
	for {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return
		}
		switch c {
		case ' ', '\n', '\r', '\t', '\f':
			z.data.end = z.raw.end - 1
			return
		case '/', '>':
			z.raw.end--
			z.data.end = z.raw.end
			return
		}
	}
}

// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<div k=v>".
// Precondition: z.err == nil.
func (z *Tokenizer) readTagAttrKey() {
	z.pendingAttr[0].start = z.raw.end
	for {
		c := z.readByte()
		if z.err != nil {
			z.pendingAttr[0].end = z.raw.end
			return
		}
		switch c {
		case ' ', '\n', '\r', '\t', '\f', '/':
			z.pendingAttr[0].end = z.raw.end - 1
			return
		case '=', '>':
			z.raw.end--
			z.pendingAttr[0].end = z.raw.end
			return
		}
	}
}

// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<div k=v>".
func (z *Tokenizer) readTagAttrVal() {
	z.pendingAttr[1].start = z.raw.end
	z.pendingAttr[1].end = z.raw.end
	if z.skipWhiteSpace(); z.err != nil {
		return
	}
	c := z.readByte()
	if z.err != nil {
		return
	}
	if c != '=' {
		z.raw.end--
		return
	}
	if z.skipWhiteSpace(); z.err != nil {
		return
	}
	quote := z.readByte()
	if z.err != nil {
		return
	}
	switch quote {
	case '>':
		z.raw.end--
		return
	case '\'', '"':
		z.pendingAttr[1].start = z.raw.end
		for {
			c := z.readByte()
			if z.err != nil {
				z.pendingAttr[1].end = z.raw.end
				return
			}
			if c == quote {
				z.pendingAttr[1].end = z.raw.end - 1
				return
			}
		}
	default:
		z.pendingAttr[1].start = z.raw.end - 1
		for {
			c := z.readByte()
			if z.err != nil {
				z.pendingAttr[1].end = z.raw.end
				return
			}
			switch c {
			case ' ', '\n', '\r', '\t', '\f':
				z.pendingAttr[1].end = z.raw.end - 1
				return
			case '>':
				z.raw.end--
				z.pendingAttr[1].end = z.raw.end
				return
			}
		}
	}
}
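// Putting the raw-tag machinery together: readStartTag sets rawTag for tags
// like <script>, and the following call to Next hands the text off to
// readRawOrRCDATA, so markup-like characters inside it stay text. A sketch of
// the observable token stream (not part of the original file), assuming
// "strings" is imported by the caller:
//
//	z := NewTokenizer(strings.NewReader(`<script>if (a<b) f()</script>`))
//	z.Next() // StartTagToken "script"; rawTag is now "script"
//	z.Next() // TextToken with raw data `if (a<b) f()`; the '<' is not a tag
//	z.Next() // EndTagToken "script"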
// Next scans the next token and returns its type.
func (z *Tokenizer) Next() TokenType {
	if z.err != nil {
		z.tt = ErrorToken
		return z.tt
	}
	z.raw.start = z.raw.end
	z.data.start = z.raw.end
	z.data.end = z.raw.end
	if z.rawTag != "" {
		if z.rawTag == "plaintext" {
			// Read everything up to EOF.
			for z.err == nil {
				z.readByte()
			}
			z.textIsRaw = true
		} else {
			z.readRawOrRCDATA()
		}
		if z.data.end > z.data.start {
			z.tt = TextToken
			return z.tt
		}
	}
	z.textIsRaw = false

loop:
	for {
		c := z.readByte()
		if z.err != nil {
			break loop
		}
		if c != '<' {
			continue loop
		}

		// Check if the '<' we have just read is part of a tag, comment
		// or doctype. If not, it's part of the accumulated text token.
		c = z.readByte()
		if z.err != nil {
			break loop
		}
		var tokenType TokenType
		switch {
		case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
			tokenType = StartTagToken
		case c == '/':
			tokenType = EndTagToken
		case c == '!' || c == '?':
			// We use CommentToken to mean any of "<!--actual comments-->",
			// "<!DOCTYPE declarations>" and "<?xml processing instructions?>".
			tokenType = CommentToken
		default:
			continue
		}

		// We have a non-text token, but we might have accumulated some text
		// before that. If so, we return the text first, and return the non-
		// text token on the subsequent call to Next.
		if x := z.raw.end - len("<a"); z.raw.start < x {
			z.raw.end = x
			z.data.end = x
			z.tt = TextToken
			return z.tt
		}
		switch tokenType {
		case StartTagToken:
			z.tt = z.readStartTag()
			return z.tt
		case EndTagToken:
			c = z.readByte()
			if z.err != nil {
				break loop
			}
			if c == '>' {
				// "</>" does not generate a token at all.
				// Reset the tokenizer state and start again.
				z.raw.start = z.raw.end
				z.data.start = z.raw.end
				z.data.end = z.raw.end
				continue loop
			}
			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
				z.readEndTag()
				z.tt = EndTagToken
				return z.tt
			}
			z.raw.end--
			z.readUntilCloseAngle()
			z.tt = CommentToken
			return z.tt
		case CommentToken:
			if c == '!' {
				z.tt = z.readMarkupDeclaration()
				return z.tt
			}
			z.raw.end--
			z.readUntilCloseAngle()
			z.tt = CommentToken
			return z.tt
		}
	}
	if z.raw.start < z.raw.end {
		z.data.end = z.raw.end
		z.tt = TextToken
		return z.tt
	}
	z.tt = ErrorToken
	return z.tt
}
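// The usual way to drive Next is a loop that stops at ErrorToken and then
// distinguishes end-of-input from a real read error via Err. A minimal sketch
// (not part of the original file), assuming "fmt", "io" and "strings" are
// imported by the caller:
//
//	z := NewTokenizer(strings.NewReader("<p>Hello</p>"))
//	for {
//		if z.Next() == ErrorToken {
//			if z.Err() != io.EOF {
//				// A genuine read error, not just end of input.
//			}
//			break
//		}
//		fmt.Println(z.Token())
//	}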
// Raw returns the unmodified text of the current token. Calling Next, Token,
// Text, TagName or TagAttr may change the contents of the returned slice.
func (z *Tokenizer) Raw() []byte {
	return z.buf[z.raw.start:z.raw.end]
}

// Text returns the unescaped text of a text, comment or doctype token. The
// contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) Text() []byte {
	switch z.tt {
	case TextToken, CommentToken, DoctypeToken:
		s := z.buf[z.data.start:z.data.end]
		z.data.start = z.raw.end
		z.data.end = z.raw.end
		if !z.textIsRaw {
			s = unescape(s)
		}
		return s
	}
	return nil
}

// TagName returns the lower-cased name of a tag token (the `img` out of
// `<IMG SRC="foo">`) and whether the tag has attributes.
// The contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
	if z.data.start < z.data.end {
		switch z.tt {
		case StartTagToken, EndTagToken, SelfClosingTagToken:
			s := z.buf[z.data.start:z.data.end]
			z.data.start = z.raw.end
			z.data.end = z.raw.end
			return lower(s), z.nAttrReturned < len(z.attr)
		}
	}
	return nil, false
}

// TagAttr returns the lower-cased key and unescaped value of the next unparsed
// attribute for the current tag token and whether there are more attributes.
// The contents of the returned slices may change on the next call to Next.
func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
	if z.nAttrReturned < len(z.attr) {
		switch z.tt {
		case StartTagToken, SelfClosingTagToken:
			x := z.attr[z.nAttrReturned]
			z.nAttrReturned++
			key = z.buf[x[0].start:x[0].end]
			val = z.buf[x[1].start:x[1].end]
			return lower(key), unescape(val), z.nAttrReturned < len(z.attr)
		}
	}
	return nil, nil, false
}
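// TagName and TagAttr return sub-slices of the tokenizer's buffer rather than
// freshly allocated strings, which is why their results are only valid until
// the next call to Next. A low-level sketch (not part of the original file),
// assuming "fmt" and "strings" are imported by the caller:
//
//	z := NewTokenizer(strings.NewReader(`<IMG SRC="foo">`))
//	if z.Next() == StartTagToken {
//		name, hasAttr := z.TagName() // name is []byte("img"), lower-cased
//		for hasAttr {
//			var key, val []byte
//			key, val, hasAttr = z.TagAttr() // key "src", val "foo"
//			fmt.Printf("%s: %s=%s\n", name, key, val)
//		}
//	}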
// Token returns the next Token. The result's Data and Attr values remain valid
// after subsequent Next calls.
func (z *Tokenizer) Token() Token {
	t := Token{Type: z.tt}
	switch z.tt {
	case TextToken, CommentToken, DoctypeToken:
		t.Data = string(z.Text())
	case StartTagToken, SelfClosingTagToken:
		var attr []Attribute
		name, moreAttr := z.TagName()
		for moreAttr {
			var key, val []byte
			key, val, moreAttr = z.TagAttr()
			attr = append(attr, Attribute{"", string(key), string(val)})
		}
		t.Data = string(name)
		t.Attr = attr
	case EndTagToken:
		name, _ := z.TagName()
		t.Data = string(name)
	}
	return t
}

// NewTokenizer returns a new HTML Tokenizer for the given Reader.
// The input is assumed to be UTF-8 encoded.
func NewTokenizer(r io.Reader) *Tokenizer {
	return &Tokenizer{
		r:   r,
		buf: make([]byte, 0, 4096),
	}
}
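// A complete, self-contained usage sketch (not part of the original file; the
// import path is illustrative and depends on where this package is vendored):
//
//	package main
//
//	import (
//		"fmt"
//		"strings"
//
//		"exp/html"
//	)
//
//	func main() {
//		z := html.NewTokenizer(strings.NewReader("<p id=x>Hi</p>"))
//		for z.Next() != html.ErrorToken {
//			t := z.Token()
//			fmt.Printf("%-8v %q %v\n", t.Type, t.Data, t.Attr)
//		}
//	}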
