OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [exp/] [html/] [escape.go] - Blame information for rev 867

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 747 jeremybenn
// Copyright 2010 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
 
5
package html
6
 
7
import (
8
        "bytes"
9
        "strings"
10
        "unicode/utf8"
11
)
12
 
13
// replacementTable maps the numeric character references 0x80 through 0x9F,
// which old documents wrote assuming the Windows-1252 encoding, to the code
// points they were meant to denote. The entry for reference value v lives at
// index v-0x80.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
//
// Two other replacements are not stored here: 0x00->'\uFFFD' is handled
// programmatically, and 0x0D->'\u000D' is a no-op.
var replacementTable = [...]rune{
	// 0x80 - 0x87
	'\u20AC', '\u0081', '\u201A', '\u0192', '\u201E', '\u2026', '\u2020', '\u2021',
	// 0x88 - 0x8F
	'\u02C6', '\u2030', '\u0160', '\u2039', '\u0152', '\u008D', '\u017D', '\u008F',
	// 0x90 - 0x97
	'\u0090', '\u2018', '\u2019', '\u201C', '\u201D', '\u2022', '\u2013', '\u2014',
	// 0x98 - 0x9F
	'\u02DC', '\u2122', '\u0161', '\u203A', '\u0153', '\u009D', '\u017E', '\u0178',
}
52
 
53
// unescapeEntity reads an entity like "<" from b[src:] and writes the
54
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
55
// Precondition: b[src] == '&' && dst <= src.
56
// attribute should be true if parsing an attribute value.
57
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
58
        // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
59
 
60
        // i starts at 1 because we already know that s[0] == '&'.
61
        i, s := 1, b[src:]
62
 
63
        if len(s) <= 1 {
64
                b[dst] = b[src]
65
                return dst + 1, src + 1
66
        }
67
 
68
        if s[i] == '#' {
69
                if len(s) <= 3 { // We need to have at least "&#.".
70
                        b[dst] = b[src]
71
                        return dst + 1, src + 1
72
                }
73
                i++
74
                c := s[i]
75
                hex := false
76
                if c == 'x' || c == 'X' {
77
                        hex = true
78
                        i++
79
                }
80
 
81
                x := '\x00'
82
                for i < len(s) {
83
                        c = s[i]
84
                        i++
85
                        if hex {
86
                                if '0' <= c && c <= '9' {
87
                                        x = 16*x + rune(c) - '0'
88
                                        continue
89
                                } else if 'a' <= c && c <= 'f' {
90
                                        x = 16*x + rune(c) - 'a' + 10
91
                                        continue
92
                                } else if 'A' <= c && c <= 'F' {
93
                                        x = 16*x + rune(c) - 'A' + 10
94
                                        continue
95
                                }
96
                        } else if '0' <= c && c <= '9' {
97
                                x = 10*x + rune(c) - '0'
98
                                continue
99
                        }
100
                        if c != ';' {
101
                                i--
102
                        }
103
                        break
104
                }
105
 
106
                if i <= 3 { // No characters matched.
107
                        b[dst] = b[src]
108
                        return dst + 1, src + 1
109
                }
110
 
111
                if 0x80 <= x && x <= 0x9F {
112
                        // Replace characters from Windows-1252 with UTF-8 equivalents.
113
                        x = replacementTable[x-0x80]
114
                } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
115
                        // Replace invalid characters with the replacement character.
116
                        x = '\uFFFD'
117
                }
118
 
119
                return dst + utf8.EncodeRune(b[dst:], x), src + i
120
        }
121
 
122
        // Consume the maximum number of characters possible, with the
123
        // consumed characters matching one of the named references.
124
 
125
        for i < len(s) {
126
                c := s[i]
127
                i++
128
                // Lower-cased characters are more common in entities, so we check for them first.
129
                if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
130
                        continue
131
                }
132
                if c != ';' {
133
                        i--
134
                }
135
                break
136
        }
137
 
138
        entityName := string(s[1:i])
139
        if entityName == "" {
140
                // No-op.
141
        } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
142
                // No-op.
143
        } else if x := entity[entityName]; x != 0 {
144
                return dst + utf8.EncodeRune(b[dst:], x), src + i
145
        } else if x := entity2[entityName]; x[0] != 0 {
146
                dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
147
                return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
148
        } else if !attribute {
149
                maxLen := len(entityName) - 1
150
                if maxLen > longestEntityWithoutSemicolon {
151
                        maxLen = longestEntityWithoutSemicolon
152
                }
153
                for j := maxLen; j > 1; j-- {
154
                        if x := entity[entityName[:j]]; x != 0 {
155
                                return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
156
                        }
157
                }
158
        }
159
 
160
        dst1, src1 = dst+i, src+i
161
        copy(b[dst:dst1], b[src:src1])
162
        return dst1, src1
163
}
164
 
165
// unescape unescapes b's entities in-place, so that "a<b" becomes "a
166
func unescape(b []byte) []byte {
167
        for i, c := range b {
168
                if c == '&' {
169
                        dst, src := unescapeEntity(b, i, i, false)
170
                        for src < len(b) {
171
                                c := b[src]
172
                                if c == '&' {
173
                                        dst, src = unescapeEntity(b, dst, src, false)
174
                                } else {
175
                                        b[dst] = c
176
                                        dst, src = dst+1, src+1
177
                                }
178
                        }
179
                        return b[0:dst]
180
                }
181
        }
182
        return b
183
}
184
 
185
// lower converts every ASCII upper-case letter (A-Z) in b to lower case,
// modifying b in-place, so that "aBc" becomes "abc". All other bytes are left
// as they are. It returns b.
func lower(b []byte) []byte {
	const caseOffset = 'a' - 'A'
	for i := 0; i < len(b); i++ {
		if ch := b[i]; ch >= 'A' && ch <= 'Z' {
			b[i] = ch + caseOffset
		}
	}
	return b
}
194
 
195
const escapedChars = `&'<>"`
196
 
197
func escape(w writer, s string) error {
198
        i := strings.IndexAny(s, escapedChars)
199
        for i != -1 {
200
                if _, err := w.WriteString(s[:i]); err != nil {
201
                        return err
202
                }
203
                var esc string
204
                switch s[i] {
205
                case '&':
206
                        esc = "&"
207
                case '\'':
208
                        esc = "'"
209
                case '<':
210
                        esc = "<"
211
                case '>':
212
                        esc = ">"
213
                case '"':
214
                        esc = """
215
                default:
216
                        panic("unrecognized escape character")
217
                }
218
                s = s[i+1:]
219
                if _, err := w.WriteString(esc); err != nil {
220
                        return err
221
                }
222
                i = strings.IndexAny(s, escapedChars)
223
        }
224
        _, err := w.WriteString(s)
225
        return err
226
}
227
 
228
// EscapeString escapes special characters like "<" to become "<". It
229
// escapes only five such characters: amp, apos, lt, gt and quot.
230
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
231
// always true.
232
func EscapeString(s string) string {
233
        if strings.IndexAny(s, escapedChars) == -1 {
234
                return s
235
        }
236
        var buf bytes.Buffer
237
        escape(&buf, s)
238
        return buf.String()
239
}
240
 
241
// UnescapeString unescapes entities like "<" to become "<". It unescapes a
242
// larger range of entities than EscapeString escapes. For example, "á"
243
// unescapes to "รก", as does "á" and "&xE1;".
244
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
245
// always true.
246
func UnescapeString(s string) string {
247
        for _, c := range s {
248
                if c == '&' {
249
                        return string(unescape([]byte(s)))
250
                }
251
        }
252
        return s
253
}

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.