OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [html/] [escape.go] - Blame information for rev 860

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 747 jeremybenn
// Copyright 2010 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
 
5
// Package html provides functions for escaping and unescaping HTML text.
6
package html
7
 
8
import (
9
        "bytes"
10
        "strings"
11
        "unicode/utf8"
12
)
13
 
14
// writer is the minimal output sink that escape needs: anything able to
// accept a string, such as *bytes.Buffer.
type writer interface {
	WriteString(s string) (n int, err error)
}
17
 
18
// replacementTable maps the numeric character references 0x80 through 0x9F,
// which old documents wrote assuming the Windows-1252 encoding, to the code
// points they were meant to denote. Index k holds the replacement for
// reference 0x80+k.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
//
// 0x00->'\uFFFD' is handled programmatically, and 0x0D->'\u000D' is a no-op,
// so neither appears here.
var replacementTable = [0x9F - 0x80 + 1]rune{
	0x20AC, // 0x80: euro sign
	0x0081, // 0x81: unchanged
	0x201A, // 0x82: single low-9 quotation mark
	0x0192, // 0x83: small f with hook
	0x201E, // 0x84: double low-9 quotation mark
	0x2026, // 0x85: horizontal ellipsis
	0x2020, // 0x86: dagger
	0x2021, // 0x87: double dagger
	0x02C6, // 0x88: modifier circumflex accent
	0x2030, // 0x89: per mille sign
	0x0160, // 0x8A: capital S with caron
	0x2039, // 0x8B: single left-pointing angle quotation mark
	0x0152, // 0x8C: capital ligature OE
	0x008D, // 0x8D: unchanged
	0x017D, // 0x8E: capital Z with caron
	0x008F, // 0x8F: unchanged
	0x0090, // 0x90: unchanged
	0x2018, // 0x91: left single quotation mark
	0x2019, // 0x92: right single quotation mark
	0x201C, // 0x93: left double quotation mark
	0x201D, // 0x94: right double quotation mark
	0x2022, // 0x95: bullet
	0x2013, // 0x96: en dash
	0x2014, // 0x97: em dash
	0x02DC, // 0x98: small tilde
	0x2122, // 0x99: trade mark sign
	0x0161, // 0x9A: small s with caron
	0x203A, // 0x9B: single right-pointing angle quotation mark
	0x0153, // 0x9C: small ligature oe
	0x009D, // 0x9D: unchanged
	0x017E, // 0x9E: small z with caron
	0x0178, // 0x9F: capital Y with diaeresis
}
57
 
58
// unescapeEntity reads an entity like "<" from b[src:] and writes the
59
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
60
// Precondition: b[src] == '&' && dst <= src.
61
// attribute should be true if parsing an attribute value.
62
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
63
        // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
64
 
65
        // i starts at 1 because we already know that s[0] == '&'.
66
        i, s := 1, b[src:]
67
 
68
        if len(s) <= 1 {
69
                b[dst] = b[src]
70
                return dst + 1, src + 1
71
        }
72
 
73
        if s[i] == '#' {
74
                if len(s) <= 3 { // We need to have at least "&#.".
75
                        b[dst] = b[src]
76
                        return dst + 1, src + 1
77
                }
78
                i++
79
                c := s[i]
80
                hex := false
81
                if c == 'x' || c == 'X' {
82
                        hex = true
83
                        i++
84
                }
85
 
86
                x := '\x00'
87
                for i < len(s) {
88
                        c = s[i]
89
                        i++
90
                        if hex {
91
                                if '0' <= c && c <= '9' {
92
                                        x = 16*x + rune(c) - '0'
93
                                        continue
94
                                } else if 'a' <= c && c <= 'f' {
95
                                        x = 16*x + rune(c) - 'a' + 10
96
                                        continue
97
                                } else if 'A' <= c && c <= 'F' {
98
                                        x = 16*x + rune(c) - 'A' + 10
99
                                        continue
100
                                }
101
                        } else if '0' <= c && c <= '9' {
102
                                x = 10*x + rune(c) - '0'
103
                                continue
104
                        }
105
                        if c != ';' {
106
                                i--
107
                        }
108
                        break
109
                }
110
 
111
                if i <= 3 { // No characters matched.
112
                        b[dst] = b[src]
113
                        return dst + 1, src + 1
114
                }
115
 
116
                if 0x80 <= x && x <= 0x9F {
117
                        // Replace characters from Windows-1252 with UTF-8 equivalents.
118
                        x = replacementTable[x-0x80]
119
                } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
120
                        // Replace invalid characters with the replacement character.
121
                        x = '\uFFFD'
122
                }
123
 
124
                return dst + utf8.EncodeRune(b[dst:], x), src + i
125
        }
126
 
127
        // Consume the maximum number of characters possible, with the
128
        // consumed characters matching one of the named references.
129
 
130
        for i < len(s) {
131
                c := s[i]
132
                i++
133
                // Lower-cased characters are more common in entities, so we check for them first.
134
                if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
135
                        continue
136
                }
137
                if c != ';' {
138
                        i--
139
                }
140
                break
141
        }
142
 
143
        entityName := string(s[1:i])
144
        if entityName == "" {
145
                // No-op.
146
        } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
147
                // No-op.
148
        } else if x := entity[entityName]; x != 0 {
149
                return dst + utf8.EncodeRune(b[dst:], x), src + i
150
        } else if x := entity2[entityName]; x[0] != 0 {
151
                dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
152
                return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
153
        } else if !attribute {
154
                maxLen := len(entityName) - 1
155
                if maxLen > longestEntityWithoutSemicolon {
156
                        maxLen = longestEntityWithoutSemicolon
157
                }
158
                for j := maxLen; j > 1; j-- {
159
                        if x := entity[entityName[:j]]; x != 0 {
160
                                return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
161
                        }
162
                }
163
        }
164
 
165
        dst1, src1 = dst+i, src+i
166
        copy(b[dst:dst1], b[src:src1])
167
        return dst1, src1
168
}
169
 
170
// unescape unescapes b's entities in-place, so that "a<b" becomes "a
171
func unescape(b []byte) []byte {
172
        for i, c := range b {
173
                if c == '&' {
174
                        dst, src := unescapeEntity(b, i, i, false)
175
                        for src < len(b) {
176
                                c := b[src]
177
                                if c == '&' {
178
                                        dst, src = unescapeEntity(b, dst, src, false)
179
                                } else {
180
                                        b[dst] = c
181
                                        dst, src = dst+1, src+1
182
                                }
183
                        }
184
                        return b[0:dst]
185
                }
186
        }
187
        return b
188
}
189
 
190
// lower converts every ASCII upper-case letter in b to lower case, in place,
// and returns b; for example "aBc" becomes "abc". All other bytes are left
// untouched.
func lower(b []byte) []byte {
	const caseDelta = 'a' - 'A'
	for idx := 0; idx < len(b); idx++ {
		if ch := b[idx]; ch >= 'A' && ch <= 'Z' {
			b[idx] = ch + caseDelta
		}
	}
	return b
}
199
 
200
// escapedChars is the set of characters that escape and EscapeString search
// for when deciding whether, and where, a string needs escaping.
const escapedChars = `&'<>"`
201
 
202
func escape(w writer, s string) error {
203
        i := strings.IndexAny(s, escapedChars)
204
        for i != -1 {
205
                if _, err := w.WriteString(s[:i]); err != nil {
206
                        return err
207
                }
208
                var esc string
209
                switch s[i] {
210
                case '&':
211
                        esc = "&"
212
                case '\'':
213
                        esc = "'"
214
                case '<':
215
                        esc = "<"
216
                case '>':
217
                        esc = ">"
218
                case '"':
219
                        esc = """
220
                default:
221
                        panic("unrecognized escape character")
222
                }
223
                s = s[i+1:]
224
                if _, err := w.WriteString(esc); err != nil {
225
                        return err
226
                }
227
                i = strings.IndexAny(s, escapedChars)
228
        }
229
        _, err := w.WriteString(s)
230
        return err
231
}
232
 
233
// EscapeString escapes special characters like "<" to become "<". It
234
// escapes only five such characters: amp, apos, lt, gt and quot.
235
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
236
// always true.
237
func EscapeString(s string) string {
238
        if strings.IndexAny(s, escapedChars) == -1 {
239
                return s
240
        }
241
        var buf bytes.Buffer
242
        escape(&buf, s)
243
        return buf.String()
244
}
245
 
246
// UnescapeString unescapes entities like "<" to become "<". It unescapes a
247
// larger range of entities than EscapeString escapes. For example, "á"
248
// unescapes to "รก", as does "á" and "&xE1;".
249
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
250
// always true.
251
func UnescapeString(s string) string {
252
        for _, c := range s {
253
                if c == '&' {
254
                        return string(unescape([]byte(s)))
255
                }
256
        }
257
        return s
258
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.