1 |
199 |
simons |
/* Capitalization rules for HPFS */
|
2 |
|
|
|
3 |
|
|
/* In OS/2, HPFS filenames preserve upper and lower case letter distinctions
|
4 |
|
|
but filename matching ignores case. That is, creating a file "Foo"
|
5 |
|
|
actually creates a file named "Foo" which can be looked up as "Foo",
|
6 |
|
|
"foo", or "FOO", among other possibilities.
|
7 |
|
|
|
8 |
|
|
Also, HPFS is internationalized -- a table giving the uppercase
|
9 |
|
|
equivalent of every character is stored in the filesystem, so that
|
10 |
|
|
any national character set may be used. If several different
|
11 |
|
|
national character sets are in use, several tables are stored
|
12 |
|
|
in the filesystem.
|
13 |
|
|
|
14 |
|
|
It would be perfectly reasonable for Linux HPFS to act as a Unix
|
15 |
|
|
filesystem and match "Foo" only if asked for "Foo" exactly. But
|
16 |
|
|
the sort order of HPFS directories is case-insensitive, so Linux
|
17 |
|
|
still has to know the capitalization rules used by OS/2. Because
|
18 |
|
|
of this, it turns out to be more natural for us to be case-insensitive
|
19 |
|
|
than not.
|
20 |
|
|
|
21 |
|
|
Currently the standard character set used by Linux is Latin-1.
|
22 |
|
|
Work is underway to permit people to use UTF-8 instead, therefore
|
23 |
|
|
all code that depends on the character set is segregated here.
|
24 |
|
|
|
25 |
|
|
(It would be wonderful if Linux HPFS could be independent of what
|
26 |
|
|
character set is in use on the Linux side, but because of the
|
27 |
|
|
necessary case folding this is impossible.)
|
28 |
|
|
|
29 |
|
|
There is a map from Latin-1 into code page 850 for every printing
|
30 |
|
|
character in Latin-1. The NLS documentation of OS/2 shows that
|
31 |
|
|
everybody has 850 available unless they don't have Western Latin
|
32 |
|
|
chars available at all (so fitting them to Linux without Unicode
|
33 |
|
|
is a doomed exercise).
|
34 |
|
|
|
35 |
|
|
It is not clear exactly how HPFS.IFS handles the situation when
|
36 |
|
|
multiple code pages are in use. Experiments show that
|
37 |
|
|
|
38 |
|
|
- tables on the disk give uppercasing rules for the installed code pages
|
39 |
|
|
|
40 |
|
|
- each directory entry is tagged with what code page was current
|
41 |
|
|
when that name was created
|
42 |
|
|
|
43 |
|
|
- doing just CHCP, without changing what's on the disk in any way,
|
44 |
|
|
can change what DIR reports, and what name a case-folded match
|
45 |
|
|
will match.
|
46 |
|
|
|
47 |
|
|
This means, I think, that HPFS.IFS operates in the current code
|
48 |
|
|
page, without regard to the uppercasing information recorded in
|
49 |
|
|
the tables on the disk. It does record the uppercasing rules
|
50 |
|
|
it used, perhaps for CHKDSK, but it does not appear to use them
|
51 |
|
|
itself.
|
52 |
|
|
|
53 |
|
|
So: Linux, a Latin-1 system, will operate in code page 850. We
|
54 |
|
|
recode between 850 and Latin-1 when dealing with the names actually
|
55 |
|
|
on the disk. We don't use the uppercasing tables either.
|
56 |
|
|
|
57 |
|
|
In a hypothetical UTF-8 implementation, one reasonable way to
|
58 |
|
|
proceed that matches OS/2 (for least surprise) is: do case
|
59 |
|
|
translation in UTF-8, and recode to/from one of the code pages
|
60 |
|
|
available on the mounted filesystem. Reject as invalid any name
|
61 |
|
|
containing chars that can't be represented on disk by one of the
|
62 |
|
|
code pages OS/2 is using. Recoding from on-disk names to UTF-8
|
63 |
|
|
could use the code page tags, though this is not what OS/2 does. */
|
64 |
|
|
|
65 |
|
|
|
66 |
|
|
/* Recoding table from code page 850 to Latin-1.  Entry i holds the
   Latin-1 value for cp850 code 128 + i; cp850_to_latin1 () performs the
   lookup.  The leading comment on each row is the cp850 code of the
   row's first entry. */
static const unsigned char tb_cp850_to_latin1[128] = {
  /* 0x80 */ 199, 252, 233, 226, 228, 224, 229, 231,
  /* 0x88 */ 234, 235, 232, 239, 238, 236, 196, 197,
  /* 0x90 */ 201, 230, 198, 244, 246, 242, 251, 249,
  /* 0x98 */ 255, 214, 220, 248, 163, 216, 215, 159,
  /* 0xA0 */ 225, 237, 243, 250, 241, 209, 170, 186,
  /* 0xA8 */ 191, 174, 172, 189, 188, 161, 171, 187,
  /* 0xB0 */ 155, 156, 157, 144, 151, 193, 194, 192,
  /* 0xB8 */ 169, 135, 128, 131, 133, 162, 165, 147,
  /* 0xC0 */ 148, 153, 152, 150, 145, 154, 227, 195,
  /* 0xC8 */ 132, 130, 137, 136, 134, 129, 138, 164,
  /* 0xD0 */ 240, 208, 202, 203, 200, 158, 205, 206,
  /* 0xD8 */ 207, 149, 146, 141, 140, 166, 204, 139,
  /* 0xE0 */ 211, 223, 212, 210, 245, 213, 181, 254,
  /* 0xE8 */ 222, 218, 219, 217, 253, 221, 175, 180,
  /* 0xF0 */ 173, 177, 143, 190, 182, 167, 247, 184,
  /* 0xF8 */ 176, 168, 183, 185, 179, 178, 142, 160,
};
|
85 |
|
|
|
86 |
|
|
#if 0
/* Recoding table from Latin-1 to code page 850 (currently compiled out).
   Entry i holds the cp850 value for Latin-1 code 128 + i.  The leading
   comment on each row is the Latin-1 code of the row's first entry. */
static const unsigned char tb_latin1_to_cp850[128] = {
  /* 0x80 */ 186, 205, 201, 187, 200, 188, 204, 185,
  /* 0x88 */ 203, 202, 206, 223, 220, 219, 254, 242,
  /* 0x90 */ 179, 196, 218, 191, 192, 217, 195, 180,
  /* 0x98 */ 194, 193, 197, 176, 177, 178, 213, 159,
  /* 0xA0 */ 255, 173, 189, 156, 207, 190, 221, 245,
  /* 0xA8 */ 249, 184, 166, 174, 170, 240, 169, 238,
  /* 0xB0 */ 248, 241, 253, 252, 239, 230, 244, 250,
  /* 0xB8 */ 247, 251, 167, 175, 172, 171, 243, 168,
  /* 0xC0 */ 183, 181, 182, 199, 142, 143, 146, 128,
  /* 0xC8 */ 212, 144, 210, 211, 222, 214, 215, 216,
  /* 0xD0 */ 209, 165, 227, 224, 226, 229, 153, 158,
  /* 0xD8 */ 157, 235, 233, 234, 154, 237, 232, 225,
  /* 0xE0 */ 133, 160, 131, 198, 132, 134, 145, 135,
  /* 0xE8 */ 138, 130, 136, 137, 141, 161, 140, 139,
  /* 0xF0 */ 208, 164, 149, 162, 147, 228, 148, 246,
  /* 0xF8 */ 155, 151, 163, 150, 129, 236, 231, 152,
};
#endif
|
107 |
|
|
|
108 |
|
|
/* Latin-1 code points that bound the accented-letter ranges used by
   latin1_upcase () and latin1_downcase ().  Each range contains one
   non-letter (MULTIPLY / divide) that must be left alone. */
#define A_GRAVE  0xC0	/* first accented capital */
#define THORN    0xDE	/* last accented capital */
#define MULTIPLY 0xD7	/* multiplication sign, inside the capital range */
#define a_grave  0xE0	/* first accented small letter */
#define thorn    0xFE	/* last accented small letter */
#define divide   0xF7	/* division sign, inside the small-letter range */
|
114 |
|
|
|
115 |
|
|
static inline unsigned latin1_upcase (unsigned c)
|
116 |
|
|
{
|
117 |
|
|
if (c - 'a' <= 'z' - 'a'
|
118 |
|
|
|| (c - a_grave <= thorn - a_grave
|
119 |
|
|
&& c != divide))
|
120 |
|
|
return c - 'a' + 'A';
|
121 |
|
|
else
|
122 |
|
|
return c;
|
123 |
|
|
}
|
124 |
|
|
|
125 |
|
|
static inline unsigned latin1_downcase (unsigned c)
|
126 |
|
|
{
|
127 |
|
|
if (c - 'A' <= 'Z' - 'A'
|
128 |
|
|
|| (c - A_GRAVE <= THORN - A_GRAVE
|
129 |
|
|
&& c != MULTIPLY))
|
130 |
|
|
return c + 'a' - 'A';
|
131 |
|
|
else
|
132 |
|
|
return c;
|
133 |
|
|
}
|
134 |
|
|
|
135 |
|
|
#if 0
/* Recode one Latin-1 character to code page 850 (currently compiled out).
   Values below 128 are returned unchanged; 128..255 are looked up in
   tb_latin1_to_cp850.  Anything larger is not a Latin-1 character and is
   returned unchanged instead of indexing past the end of the table. */
static inline unsigned latin1_to_cp850 (unsigned c)
{
  /* Wraps to a huge value when c < 128, so the single bound check below
     rejects both too-small and too-large inputs. */
  const unsigned idx = c - 128;

  if (idx < sizeof tb_latin1_to_cp850)
    return tb_latin1_to_cp850[idx];

  return c;
}
#endif
|
144 |
|
|
|
145 |
|
|
static inline unsigned cp850_to_latin1 (unsigned c)
|
146 |
|
|
{
|
147 |
|
|
if ((signed) c - 128 >= 0)
|
148 |
|
|
return tb_cp850_to_latin1[c - 128];
|
149 |
|
|
else
|
150 |
|
|
return c;
|
151 |
|
|
}
|
152 |
|
|
|
153 |
|
|
/* Take a character as stored on disk (code page 850), recode it to
   Latin-1 and return its Latin-1 uppercase form. */
unsigned hpfs_char_to_upper_linux (unsigned c)
{
  const unsigned latin1 = cp850_to_latin1 (c);

  return latin1_upcase (latin1);
}
|
157 |
|
|
|
158 |
|
|
/* Return the Latin-1 uppercase form of a character that is already in
   the Linux-side (Latin-1) character set; no recoding is done. */
unsigned linux_char_to_upper_linux (unsigned c)
{
  const unsigned upper = latin1_upcase (c);

  return upper;
}
|
162 |
|
|
|
163 |
|
|
/* Take a character as stored on disk (code page 850), recode it to
   Latin-1 and return its Latin-1 lowercase form. */
unsigned hpfs_char_to_lower_linux (unsigned c)
{
  const unsigned latin1 = cp850_to_latin1 (c);

  return latin1_downcase (latin1);
}
|
167 |
|
|
|
168 |
|
|
/* Recode a character as stored on disk (code page 850) to Latin-1
   without changing its case. */
unsigned hpfs_char_to_linux (unsigned c)
{
  const unsigned latin1 = cp850_to_latin1 (c);

  return latin1;
}
|