// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
|
4 |
|
|
|
5 |
|
|
package unicode
|
6 |
|
|
|
7 |
|
|
// Property bits stored per code point below U+0100, so that the
// classification functions can answer with a single table lookup.
// Each of the first eight names occupies its own bit.
const (
	pC  = 1 << iota // control character
	pP              // punctuation character
	pN              // numeral
	pS              // symbolic character
	pZ              // spacing character
	pLu             // upper-case letter
	pLl             // lower-case letter
	pp              // printable character, by Go's definition

	// pg marks a graphical character by the Unicode definition:
	// anything printable plus the spacing characters.
	pg = pZ | pp
)
|
19 |
|
|
|
20 |
|
|
// GraphicRanges defines the set of graphic characters according to Unicode.
|
21 |
|
|
var GraphicRanges = []*RangeTable{
|
22 |
|
|
L, M, N, P, S, Zs,
|
23 |
|
|
}
|
24 |
|
|
|
25 |
|
|
// PrintRanges defines the set of printable characters according to Go.
|
26 |
|
|
// ASCII space, U+0020, is handled separately.
|
27 |
|
|
var PrintRanges = []*RangeTable{
|
28 |
|
|
L, M, N, P, S,
|
29 |
|
|
}
|
30 |
|
|
|
31 |
|
|
// IsGraphic reports whether the rune is defined as a Graphic by Unicode.
|
32 |
|
|
// Such characters include letters, marks, numbers, punctuation, symbols, and
|
33 |
|
|
// spaces, from categories L, M, N, P, S, Zs.
|
34 |
|
|
func IsGraphic(r rune) bool {
|
35 |
|
|
// We convert to uint32 to avoid the extra test for negative,
|
36 |
|
|
// and in the index we convert to uint8 to avoid the range check.
|
37 |
|
|
if uint32(r) <= MaxLatin1 {
|
38 |
|
|
return properties[uint8(r)]&pg != 0
|
39 |
|
|
}
|
40 |
|
|
return IsOneOf(GraphicRanges, r)
|
41 |
|
|
}
|
42 |
|
|
|
43 |
|
|
// IsPrint reports whether the rune is defined as printable by Go. Such
|
44 |
|
|
// characters include letters, marks, numbers, punctuation, symbols, and the
|
45 |
|
|
// ASCII space character, from categories L, M, N, P, S and the ASCII space
|
46 |
|
|
// character. This categorization is the same as IsGraphic except that the
|
47 |
|
|
// only spacing character is ASCII space, U+0020.
|
48 |
|
|
func IsPrint(r rune) bool {
|
49 |
|
|
if uint32(r) <= MaxLatin1 {
|
50 |
|
|
return properties[uint8(r)]&pp != 0
|
51 |
|
|
}
|
52 |
|
|
return IsOneOf(PrintRanges, r)
|
53 |
|
|
}
|
54 |
|
|
|
55 |
|
|
// IsOneOf reports whether the rune is a member of one of the ranges.
|
56 |
|
|
// The rune is known to be above Latin-1.
|
57 |
|
|
func IsOneOf(set []*RangeTable, r rune) bool {
|
58 |
|
|
for _, inside := range set {
|
59 |
|
|
if Is(inside, r) {
|
60 |
|
|
return true
|
61 |
|
|
}
|
62 |
|
|
}
|
63 |
|
|
return false
|
64 |
|
|
}
|
65 |
|
|
|
66 |
|
|
// IsControl reports whether the rune is a control character.
|
67 |
|
|
// The C (Other) Unicode category includes more code points
|
68 |
|
|
// such as surrogates; use Is(C, rune) to test for them.
|
69 |
|
|
func IsControl(r rune) bool {
|
70 |
|
|
if uint32(r) <= MaxLatin1 {
|
71 |
|
|
return properties[uint8(r)]&pC != 0
|
72 |
|
|
}
|
73 |
|
|
// All control characters are < Latin1Max.
|
74 |
|
|
return false
|
75 |
|
|
}
|
76 |
|
|
|
77 |
|
|
// IsLetter reports whether the rune is a letter (category L).
|
78 |
|
|
func IsLetter(r rune) bool {
|
79 |
|
|
if uint32(r) <= MaxLatin1 {
|
80 |
|
|
return properties[uint8(r)]&(pLu|pLl) != 0
|
81 |
|
|
}
|
82 |
|
|
return Is(Letter, r)
|
83 |
|
|
}
|
84 |
|
|
|
85 |
|
|
// IsMark reports whether the rune is a mark character (category M).
|
86 |
|
|
func IsMark(r rune) bool {
|
87 |
|
|
// There are no mark characters in Latin-1.
|
88 |
|
|
return Is(Mark, r)
|
89 |
|
|
}
|
90 |
|
|
|
91 |
|
|
// IsNumber reports whether the rune is a number (category N).
|
92 |
|
|
func IsNumber(r rune) bool {
|
93 |
|
|
if uint32(r) <= MaxLatin1 {
|
94 |
|
|
return properties[uint8(r)]&pN != 0
|
95 |
|
|
}
|
96 |
|
|
return Is(Number, r)
|
97 |
|
|
}
|
98 |
|
|
|
99 |
|
|
// IsPunct reports whether the rune is a Unicode punctuation character
|
100 |
|
|
// (category P).
|
101 |
|
|
func IsPunct(r rune) bool {
|
102 |
|
|
if uint32(r) <= MaxLatin1 {
|
103 |
|
|
return properties[uint8(r)]&pP != 0
|
104 |
|
|
}
|
105 |
|
|
return Is(Punct, r)
|
106 |
|
|
}
|
107 |
|
|
|
108 |
|
|
// IsSpace reports whether the rune is a space character as defined
|
109 |
|
|
// by Unicode's White Space property; in the Latin-1 space
|
110 |
|
|
// this is
|
111 |
|
|
// '\t', '\n', '\v', '\f', '\r', ' ', U+0085 (NEL), U+00A0 (NBSP).
|
112 |
|
|
// Other definitions of spacing characters are set by category
|
113 |
|
|
// Z and property Pattern_White_Space.
|
114 |
|
|
func IsSpace(r rune) bool {
|
115 |
|
|
// This property isn't the same as Z; special-case it.
|
116 |
|
|
if uint32(r) <= MaxLatin1 {
|
117 |
|
|
switch r {
|
118 |
|
|
case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:
|
119 |
|
|
return true
|
120 |
|
|
}
|
121 |
|
|
return false
|
122 |
|
|
}
|
123 |
|
|
return Is(White_Space, r)
|
124 |
|
|
}
|
125 |
|
|
|
126 |
|
|
// IsSymbol reports whether the rune is a symbolic character.
|
127 |
|
|
func IsSymbol(r rune) bool {
|
128 |
|
|
if uint32(r) <= MaxLatin1 {
|
129 |
|
|
return properties[uint8(r)]&pS != 0
|
130 |
|
|
}
|
131 |
|
|
return Is(Symbol, r)
|
132 |
|
|
}
|