OpenCores
URL https://opencores.org/ocsvn/openrisc_me/openrisc_me/trunk

Subversion Repositories openrisc_me

[/] [openrisc/] [trunk/] [gnu-src/] [newlib-1.17.0/] [newlib/] [testsuite/] [newlib.locale/] [UTF-8.c] - Blame information for rev 407

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 148 jeremybenn
/*
2
 *  Copyright (C) 2002 by Red Hat, Incorporated. All rights reserved.
3
 *
4
 *  Permission to use, copy, modify, and distribute this software
5
 *  is freely granted, provided that this notice is preserved.
6
 *
7
 *  Tests gleaned from Markus Kuhn's UTF-8 and Unicode FAQ,
8
 *  and specifically, his UTF-8-test.txt decoder stress test file.
9
 */
10
 
11
#include <stdio.h>
12
#include <stdlib.h>
13
#include <locale.h>
14
 
15
#define MAX_BYTES 65
16
 
17
/* Forward declaration: main() calls num_invalid() before its definition
   at the end of this file.  */
int num_invalid(const char *s, int len);
18
 
19
/* Test 2.1 data: row N-1 holds the N-byte sequence for the first (lowest)
   code point of that length.  Bytes not listed in an initializer are zero.  */
char first[6][6] = {
  {0x0},                                   /* U-00000000 */
  {0xc2, 0x80},                            /* U-00000080 */
  {0xe0, 0xa0, 0x80},                      /* U-00000800 */
  {0xf0, 0x90, 0x80, 0x80},                /* U-00010000 */
  {0xf8, 0x88, 0x80, 0x80, 0x80},          /* U-00200000 */
  {0xfc, 0x84, 0x80, 0x80, 0x80, 0x80}     /* U-04000000 */
};
27
 
28
/* Test 2.2 data: row N-1 holds the N-byte sequence for the last (highest)
   code point of that length.  Bytes not listed in an initializer are zero.  */
char last[6][6] = {
  {0x7f},                                  /* U-0000007F */
  {0xdf, 0xbf},                            /* U-000007FF */
  {0xef, 0xbf, 0xbf},                      /* U-0000FFFF */
  {0xf7, 0xbf, 0xbf, 0xbf},                /* U-001FFFFF */
  {0xfb, 0xbf, 0xbf, 0xbf, 0xbf},          /* U-03FFFFFF */
  {0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf}     /* U-7FFFFFFF */
};
36
 
37
/* Test 2.3 data: other boundary code points.  Rows 0-2 are 3-byte
   sequences and rows 3-4 are 4-byte sequences; main() relies on that split
   when it checks the length returned by mbtowc.  */
char boundary[5][6] = {
  {0xed, 0x9f, 0xbf},                      /* U-0000D7FF */
  {0xee, 0x80, 0x80},                      /* U-0000E000 */
  {0xef, 0xbf, 0xbd},                      /* U-0000FFFD */
  {0xf4, 0x8f, 0xbf, 0xbf},                /* U-0010FFFF */
  {0xf4, 0x90, 0x80, 0x80}                 /* U-00110000 */
};
44
 
45
/* Test 3.1 data: continuation bytes with no preceding start byte.
   Rows 0 and 1 are the single bytes 0x80 and 0xbf.  Row I (2 <= I <= 7)
   holds I bytes alternating 0x80/0xbf; main() passes I as the length.  */
char continuation_bytes[8][7] = {
  {0x80},
  {0xbf},
  {0x80, 0xbf},
  {0x80, 0xbf, 0x80},
  {0x80, 0xbf, 0x80, 0xbf},
  {0x80, 0xbf, 0x80, 0xbf, 0x80},
  {0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf},
  {0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf, 0x80}
};
55
 
56
/* Test 3.1.9 buffer: filled at run time by main() with the bytes
   0x80 .. 0xbf, one per element (64 values).  */
char all_continuation_bytes[64];
57
 
58
 
59
/* Test 3.2 buffers ("lonely start characters"): main() fills each one at
   run time with every possible first byte of an N-byte sequence.
   The sizes match the ranges written by main():
     two-byte   0xc0 .. 0xdf  (32)
     three-byte 0xe0 .. 0xef  (16)
     four-byte  0xf0 .. 0xf7  (8)
     five-byte  0xf8 .. 0xfb  (4)
     six-byte   0xfc .. 0xfd  (2)  */
char all_two_byte_seq[32];
char all_three_byte_seq[16];
char all_four_byte_seq[8];
char all_five_byte_seq[4];
char all_six_byte_seq[2];
64
 
65
/* Test 3.3 data: multi-byte sequences with the last continuation byte
   missing.  Rows 0-4 hold 1..5 bytes and rows 5-9 hold 1..5 bytes again;
   main() passes exactly that many bytes to mbtowc.  The trailing comment
   on each row names the character the complete sequence would encode.  */
char incomplete_seq[10][6] = {
  {0xc2},                            /* U-00000080 */
  {0xe0, 0x80},                      /* U-00000800 */
  {0xf0, 0x80, 0x80},                /* U-00010000 */
  {0xf8, 0x80, 0x80, 0x80},          /* U-00200000 */
  {0xfc, 0x80, 0x80, 0x80, 0x80},    /* U-04000000 */
  {0xdf},                            /* U-000007FF */
  {0xef, 0xbf},                      /* U-0000FFFF */
  {0xf7, 0xbf, 0xbf},                /* U-001FFFFF */
  {0xfb, 0xbf, 0xbf, 0xbf},          /* U-03FFFFFF */
  {0xfd, 0xbf, 0xbf, 0xbf, 0xbf}     /* U-7FFFFFFF */
};
77
 
78
/* Buffer reserved for test 3.4 (concatenation of incomplete sequences).
   Nothing in the visible code reads or writes it: main() documents that
   test 3.4 is excluded.  */
char incomplete_seq_concat[30];
79
 
80
/* Test 3.5 data: the bytes 0xfe and 0xff, which never occur in UTF-8.
   Rows 0 and 1 are used with a length of 1, row 2 with a length of 4.  */
char impossible_bytes[3][4] = {
  {0xfe},
  {0xff},
  {0xfe, 0xfe, 0xff, 0xff}
};
85
 
86
/* Test 4.1 data: overlong encodings of the ASCII character 0x2f ('/').
   Row K uses K+2 bytes; every payload bit other than the low six bits of
   the final byte (0xaf) is zero.  */
char overlong[5][6] = {
  {0xc0, 0xaf},
  {0xe0, 0x80, 0xaf},
  {0xf0, 0x80, 0x80, 0xaf},
  {0xf8, 0x80, 0x80, 0x80, 0xaf},
  {0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf}
};
93
 
94
/* Test 4.2 data: the highest code point that is still overlong for each
   sequence length (row K uses K+2 bytes).  */
char overlong_max[5][6] = {
  {0xc1, 0xbf},                            /* U-0000007F */
  {0xe0, 0x9f, 0xbf},                      /* U-000007FF */
  {0xf0, 0x8f, 0xbf, 0xbf},                /* U-0000FFFF */
  {0xf8, 0x87, 0xbf, 0xbf, 0xbf},          /* U-001FFFFF */
  {0xfc, 0x83, 0xbf, 0xbf, 0xbf, 0xbf}     /* U-03FFFFFF */
};
101
 
102
/* Test 4.3 data: overlong encodings of the NUL character (all payload
   bits zero).  Row K uses K+2 bytes.  */
char overlong_nul[5][6] = {
  {0xc0, 0x80},
  {0xe0, 0x80, 0x80},
  {0xf0, 0x80, 0x80, 0x80},
  {0xf8, 0x80, 0x80, 0x80, 0x80},
  {0xfc, 0x80, 0x80, 0x80, 0x80, 0x80}
};
109
 
110
/* Test 5.1 data: 3-byte encodings of single UTF-16 surrogate code points.
   Each row is exactly 3 bytes with no terminator, so main() always passes
   a length of 3.  */
char single_surrogates[7][3] = {
  {0xed, 0xa0, 0x80},                      /* U-0000D800 */
  {0xed, 0xad, 0xbf},                      /* U-0000DB7F */
  {0xed, 0xae, 0x80},                      /* U-0000DB80 */
  {0xed, 0xaf, 0xbf},                      /* U-0000DBFF */
  {0xed, 0xb0, 0x80},                      /* U-0000DC00 */
  {0xed, 0xbe, 0x80},                      /* U-0000DF80 */
  {0xed, 0xbf, 0xbf}                       /* U-0000DFFF */
};
119
 
120
/* Test 5.2 data: eight pairs of UTF-16 surrogates, each pair written as
   two consecutive 3-byte sequences (high surrogate, then low surrogate).
   Each row is exactly 6 bytes with no terminator.  */
char paired_surrogates[8][6] = {
  {0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80},    /* U-D800 U-DC00 */
  {0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf},    /* U-D800 U-DFFF */
  {0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80},    /* U-DB7F U-DC00 */
  {0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf},    /* U-DB7F U-DFFF */
  {0xed, 0xae, 0x80, 0xed, 0xb0, 0x80},    /* U-DB80 U-DC00 */
  {0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf},    /* U-DB80 U-DFFF */
  {0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80},    /* U-DBFF U-DC00 */
  {0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf}     /* U-DBFF U-DFFF */
};
130
 
131
/* Test 5.3 data ("other illegal code positions").  Each row is two bytes
   followed by a zero byte; main() passes a length of 3.

   NOTE(review): these rows are the raw bytes 0xff 0xfe and 0xff 0xff, not
   the UTF-8 encodings of U+FFFE (0xef 0xbf 0xbe) and U+FFFF
   (0xef 0xbf 0xbf).  As written, the test rejects on the leading 0xff
   byte, the same condition test 3.5 already covers.  The data is left
   unchanged because test 2.2.3 in this file treats 0xef 0xbf 0xbf as a
   decodable sequence -- confirm the intended expectation before changing
   it.  */
char illegal_pos[2][3] = {
  {0xff, 0xfe},
  {0xff, 0xff}
};
135
 
136
int main()
137
  {
138
    wchar_t wchar;
139
    int retval;
140
    int i;
141
 
142
    if (!setlocale(LC_CTYPE, "C-UTF-8"))
143
      {
144
        printf("Failed to set C-UTF-8 locale.\n");
145
        return 1;
146
      }
147
    else
148
      printf("Set C-UTF-8 locale.\n");
149
 
150
    /* 2  Boundary condition test cases */
151
    /* 2.1  First possible sequence of a certain length */
152
    retval = mbtowc(&wchar, first[0], MAX_BYTES);
153
    if (retval == 0)
154
      printf("2.1.1: U-%08d\n", wchar);
155
    else
156
      printf("2.1.1: Invalid\n");
157
 
158
    for (i = 2; i < 7; i++)
159
    {
160
      retval = mbtowc (&wchar, first[i-1], MAX_BYTES);
161
      if (retval == i)
162
        printf("2.1.%d: U-%08x\n", i, wchar);
163
      else
164
        printf("2.1.%d: Invalid\n", i);
165
    }
166
 
167
    /* 2.2  Last possible sequence of a certain length */
168
    for (i = 1; i < 7; i++)
169
    {
170
      retval = mbtowc (&wchar, last[i-1], MAX_BYTES);
171
      if (retval == i)
172
        printf("2.2.%d: U-%08x\n", i, wchar);
173
      else
174
        printf("2.2.%d: Invalid\n", i);
175
    }
176
 
177
    /* 2.3  Other boundary conditions */
178
    for (i = 1; i < 6; i++)
179
      {
180
        retval = mbtowc (&wchar, boundary[i-1], MAX_BYTES);
181
        if ((i < 4 && retval == 3) || (i > 3 && retval == 4))
182
          printf("2.3.%d: U-%08x\n", i, wchar);
183
        else
184
          printf("2.3.%d: Invalid\n", i);
185
      }
186
 
187
    /* 3  Malformed sequences */
188
    /* 3.1  Unexpected continuation bytes */
189
    retval = mbtowc (&wchar, continuation_bytes[0], MAX_BYTES);
190
    if (retval == 1)
191
      printf("3.1.1: U-%08x\n", wchar);
192
    else
193
      printf("3.1.1: 1 Invalid\n");
194
 
195
    retval = mbtowc (&wchar, continuation_bytes[1], MAX_BYTES);
196
    if (retval == 1)
197
      printf("3.1.2: U-%08x\n", wchar);
198
    else
199
      printf("3.1.2: 1 Invalid\n");
200
 
201
    for(i=2; i< 8; i++)
202
      {
203
        retval = num_invalid(continuation_bytes[i], i);
204
        if (retval == -1)
205
          printf("3.1.%d: Valid Character Found\n", i+1);
206
        else
207
          printf("3.1.%d: %d Invalid\n", i+1, retval);
208
      }
209
 
210
    for(i = 0x80; i < 0xc0; i++)
211
      all_continuation_bytes[i-0x80] = i;
212
 
213
    retval = num_invalid(all_continuation_bytes, 0xc0 - 0x80);
214
    if (retval == -1)
215
      printf("3.1.9: Valid Character Found\n");
216
    else
217
      printf("3.1.9: %d Invalid\n", retval);
218
 
219
    /* 3.2  Lonely start characters */
220
    for(i = 0xc0; i < 0xe0; i++)
221
      all_two_byte_seq[i-0xc0] = i;
222
 
223
    retval = num_invalid(all_two_byte_seq, 0xe0 - 0xc0);
224
    if (retval == -1)
225
      printf("3.2.1: Valid Character Found\n");
226
    else
227
      printf("3.2.1: %d Invalid\n", retval);
228
 
229
    for(i = 0xe0; i < 0xf0; i++)
230
      all_three_byte_seq[i-0xe0] = i;
231
 
232
    retval = num_invalid(all_three_byte_seq, 0xf0 - 0xe0);
233
    if (retval == -1)
234
      printf("3.2.2: Valid Character Found\n");
235
    else
236
      printf("3.2.2: %d Invalid\n", retval);
237
 
238
    for(i = 0xf0; i < 0xf8; i++)
239
      all_four_byte_seq[i-0xf0] = i;
240
 
241
    retval = num_invalid(all_four_byte_seq, 0xf8 - 0xf0);
242
    if (retval == -1)
243
      printf("3.2.3: Valid Character Found\n");
244
    else
245
      printf("3.2.3: %d Invalid\n", retval);
246
 
247
    for(i = 0xf8; i < 0xfc; i++)
248
      all_five_byte_seq[i-0xf8] = i;
249
 
250
    retval = num_invalid(all_five_byte_seq, 0xfc - 0xf8);
251
    if (retval == -1)
252
      printf("3.2.4: Valid Character Found\n");
253
    else
254
      printf("3.2.4: %d Invalid\n", retval);
255
 
256
    for(i = 0xfc; i < 0xfe; i++)
257
      all_six_byte_seq[i-0xfc] = i;
258
 
259
    retval = num_invalid(all_six_byte_seq, 0xfe - 0xfc);
260
    if (retval == -1)
261
      printf("3.2.5: Valid Character Found\n");
262
    else
263
      printf("3.2.5: %d Invalid\n", retval);
264
 
265
    /* 3.3  Sequences with last continuation byte missing */
266
    for(i = 1; i < 6; i++)
267
      {
268
        retval = mbtowc(&wchar, incomplete_seq[i-1], i);
269
        if(retval == -1)
270
          printf("3.3.%d: 1 Invalid\n", i);
271
        else
272
          printf("3.3.%d: Valid Character Found\n", i);
273
      }
274
 
275
    for(i = 6; i < 11; i++)
276
      {
277
        retval = mbtowc(&wchar, incomplete_seq[i-1], i - 5);
278
        if(retval == -1)
279
          printf("3.3.%d: 1 Invalid\n", i);
280
        else
281
          printf("3.3.%d: Valid Character Found\n", i);
282
      }
283
 
284
    /* 3.4  Concatenation of incomplete sequences */
285
    /* This test is excluded because the mbtowc function does not return the
286
       number of bytes read in an invalid multi-byte sequence. */
287
 
288
    /* 3.5  Impossible bytes */
289
    retval = mbtowc(&wchar, impossible_bytes[0], 1);
290
    if(retval == -1)
291
      printf("3.5.1: 1 Invalid\n");
292
    else
293
      printf("3.5.1: Valid Character Found\n");
294
 
295
    retval = mbtowc(&wchar, impossible_bytes[1], 1);
296
    if(retval == -1)
297
      printf("3.5.2: 1 Invalid\n");
298
    else
299
      printf("3.5.2: Valid Character Found\n");
300
 
301
    retval = mbtowc(&wchar, impossible_bytes[2], 4);
302
    if(retval == -1)
303
      printf("3.5.3: 1 Invalid\n");
304
    else
305
      printf("3.5.3: Valid Character Found\n");
306
 
307
    /* 4  Overlong sequences */
308
    /* 4.1  Examples of an overlong ASCII character */
309
    for(i = 2; i < 7; i++)
310
      {
311
        retval = mbtowc(&wchar, overlong[i-2], i);
312
        if(retval == -1)
313
          printf("4.1.%d: 1 Invalid\n", i-1);
314
        else
315
          printf("4.1.%d: Valid Character Found\n", i-1);
316
      }
317
 
318
    /* 4.2  Maximum overlong sequences */
319
    for(i = 2; i < 7; i++)
320
      {
321
        retval = mbtowc(&wchar, overlong_max[i-2], i);
322
        if(retval == -1)
323
          printf("4.2.%d: 1 Invalid\n", i-1);
324
        else
325
          printf("4.2.%d: Valid Character Found\n", i-1);
326
      }
327
 
328
    /* 4.3  Overlong representation of the NUL character */
329
    for(i = 2; i < 7; i++)
330
      {
331
        retval = mbtowc(&wchar, overlong_nul[i-2], i);
332
        if(retval == -1)
333
          printf("4.3.%d: 1 Invalid\n", i-1);
334
        else
335
          printf("4.3.%d: Valid Character Found\n", i-1);
336
      }
337
 
338
    /* 5  Illegal code positions */
339
    /* 5.1 Single UTF-16 surrogates */
340
    for (i = 1; i < 8; i++)
341
      {
342
        retval = mbtowc(&wchar, single_surrogates[i-1], 3);
343
        if(retval == -1)
344
          printf("5.1.%d: 1 Invalid\n", i);
345
        else
346
          printf("5.1.%d: Valid Character Found\n", i);
347
      }
348
 
349
    /* 5.2 Paired UTF-16 surrogates */
350
    for (i = 1; i < 8; i++)
351
      {
352
        retval = mbtowc(&wchar, paired_surrogates[i-1], 6);
353
        if(retval == -1)
354
          printf("5.2.%d: 1 Invalid\n", i);
355
        else
356
          printf("5.2.%d: Valid Character Found\n", i);
357
      }
358
 
359
    /* 5.3 Other illegal code positions */
360
    retval = mbtowc(&wchar, illegal_pos[0], 3);
361
    if(retval == -1)
362
      printf("5.3.1: 1 Invalid\n");
363
    else
364
      printf("5.3.1: Valid Character Found\n");
365
 
366
    retval = mbtowc(&wchar, illegal_pos[1], 3);
367
    if(retval == -1)
368
      printf("5.3.2: 1 Invalid\n");
369
    else
370
      printf("5.3.2: Valid Character Found\n");
371
 
372
    return 0;
373
  }
374
 
375
/* Count the offsets in S[0 .. LEN-1] at which mbtowc refuses to decode.

   Each offset I is probed independently by handing mbtowc the LEN - I
   bytes starting at S + I.  Before every probe the conversion state kept
   by mbtowc is returned to its initial state (a call with a null string
   pointer does that), so an invalid or incomplete sequence seen earlier,
   here or in the caller, cannot change the verdict for the current
   offset.

   Returns the number of offsets for which mbtowc returned -1, or -1 as
   soon as any probe returns something else (a decoded character,
   including the 0 returned for a NUL byte).  A LEN of zero or less yields
   0.  */
int
num_invalid(const char *s, int len)
{
  int num_inv = 0;
  int i;
  wchar_t wchar;

  for (i = 0; i < len; i++)
    {
      (void) mbtowc (NULL, NULL, 0);
      if (mbtowc (&wchar, s + i, (size_t) (len - i)) != -1)
        return -1;
      num_inv++;
    }
  return num_inv;
}

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.