OpenCores
URL https://opencores.org/ocsvn/or1k_old/or1k_old/trunk

Subversion Repositories or1k_old

[/] [or1k_old/] [trunk/] [linux/] [uClibc/] [libc/] [misc/] [wchar/] [wchar.c] - Blame information for rev 1782

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 1325 phoenix
 
2
/*  Copyright (C) 2002     Manuel Novoa III
3
 *
4
 *  This library is free software; you can redistribute it and/or
5
 *  modify it under the terms of the GNU Library General Public
6
 *  License as published by the Free Software Foundation; either
7
 *  version 2 of the License, or (at your option) any later version.
8
 *
9
 *  This library is distributed in the hope that it will be useful,
10
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
 *  Library General Public License for more details.
13
 *
14
 *  You should have received a copy of the GNU Library General Public
15
 *  License along with this library; if not, write to the Free
16
 *  Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
 */
18
 
19
/*  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!
20
 *
21
 *  Besides uClibc, I'm using this code in my libc for elks, which is
22
 *  a 16-bit environment with a fairly limited compiler.  It would make
23
 *  things much easier for me if this file isn't modified unnecessarily.
24
 *  In particular, please put any new or replacement functions somewhere
25
 *  else, and modify the makefile to use your version instead.
26
 *  Thanks.  Manuel
27
 *
28
 *  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION! */
29
 
30
 
31
/* May 23, 2002     Initial Notes:
32
 *
33
 * I'm still tweaking this stuff, but it passes the tests I've thrown
34
 * at it, and Erik needs it for the gcc port.  The glibc extension
35
 * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
36
 * in the glibc source.  I also need to fix the behavior of
37
 * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
38
 *
39
 * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
40
 * file on my platform (x86) show about 5-10% faster conversion speed than
41
 * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
42
 * individual mbrtowc()/wcrtomb() calls.
43
 *
44
 * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
45
 * as a fail-safe UTF-8 decoder appropriate for a terminal, etc.  which
46
 * needs to deal gracefully with whatever is sent to it.  In that mode,
47
 * it passes Markus Kuhn's UTF-8-test.txt stress test.  I plan to add
48
 * an arg to force that behavior, so the interface will be changing.
49
 *
50
 * I need to fix the error checking for 16-bit wide chars.  This isn't
51
 * an issue for uClibc, but may be for ELKS.  I'm currently not sure
52
 * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
53
 *
54
 * July 1, 2002
55
 *
56
 * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
57
 * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
58
 *    locales.
59
 * Enabled building of a C/POSIX-locale-only version, so full locale support
60
 *    no longer needs to be enabled.
61
 *
62
 * Nov 4, 2002
63
 *
64
 * Fixed a bug in _wchar_wcsntoutf8s().  Don't store wcs position if dst is NULL.
65
 * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
66
 *   order to support %ls in printf.  See comments below for details.
67
 * Change behaviour of wc<->mb functions when in the C locale.  Now they do
68
 *   a 1-1 map for the range 0x80-UCHAR_MAX.  This is for backwards compatibility
69
 *   and consistency with the standard's requirement that a printf format string be
70
 *   a valid multibyte string beginning and ending in its initial shift state.
71
 *
72
 * Nov 5, 2002
73
 *
74
 * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
75
 *
76
 * Nov 7, 2002
77
 *
78
 * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
79
 *   Added some size/speed optimizations and integrated it into my locale
80
 *   framework.  Minimally tested at the moment, but the stub C-locale
81
 *   version (which most people would probably be using) should be fine.
82
 *
83
 * Nov 21, 2002
84
 *
85
 * Revert the wc<->mb changes from earlier this month involving the C-locale.
86
 * Add a couple of ugly hacks to support *wprintf.
87
 * Add a mini iconv() and iconv implementation (requires locale support).
88
 *
89
 * Aug 1, 2003
90
 * Bug fix for mbrtowc.
91
 *
92
 * Aug 18, 2003
93
 * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ.
94
 *
95
 * Manuel
96
 */
97
 
98
#define _GNU_SOURCE
99
#define _ISOC99_SOURCE
100
#include <errno.h>
101
#include <stddef.h>
102
#include <limits.h>
103
#include <stdint.h>
104
#include <inttypes.h>
105
#include <stdlib.h>
106
#include <stdio.h>
107
#include <assert.h>
108
#include <locale.h>
109
#include <wchar.h>
110
#include <bits/uClibc_uwchar.h>
111
 
112
/**********************************************************************/
113
/*
 * Locale-dependent configuration.
 *
 * With locale support, ENCODING reads the encoding field of the current
 * locale data and the short Cc2wc_ and Cwc2c_ names alias the
 * corresponding __LOCALE_DATA_ table parameters.
 *
 * Without locale support, ENCODING is fixed at __ctype_encoding_7_bit,
 * defining either 8-bit or UTF-8 locale support is a build error, and the
 * two UTF-8 work functions are excluded by undefining their L_ selectors.
 */
#ifdef __UCLIBC_HAS_LOCALE__
#ifdef __UCLIBC_MJN3_ONLY__
#ifdef L_iswspace
/* generates one warning */
#warning TODO: Fix Cc2wc* and Cwc2c* defines!
#endif
#endif /* __UCLIBC_MJN3_ONLY__ */

#define ENCODING		((__UCLIBC_CURLOCALE_DATA).encoding)

#define Cc2wc_IDX_SHIFT		__LOCALE_DATA_Cc2wc_IDX_SHIFT
#define Cc2wc_ROW_LEN		__LOCALE_DATA_Cc2wc_ROW_LEN
#define Cwc2c_DOMAIN_MAX	__LOCALE_DATA_Cwc2c_DOMAIN_MAX
#define Cwc2c_TI_SHIFT		__LOCALE_DATA_Cwc2c_TI_SHIFT
#define Cwc2c_TT_SHIFT		__LOCALE_DATA_Cwc2c_TT_SHIFT
#define Cwc2c_TI_LEN		__LOCALE_DATA_Cwc2c_TI_LEN

#ifndef __CTYPE_HAS_UTF_8_LOCALES
#warning __CTYPE_HAS_UTF_8_LOCALES not set!
#endif

#else  /* __UCLIBC_HAS_LOCALE__ */

#ifdef __UCLIBC_MJN3_ONLY__
#ifdef L_btowc
/* emit only once */
#warning fix preprocessor logic testing locale settings
#endif
#endif

#define ENCODING (__ctype_encoding_7_bit)
#ifdef __CTYPE_HAS_8_BIT_LOCALES
#error __CTYPE_HAS_8_BIT_LOCALES is defined!
#endif
#ifdef __CTYPE_HAS_UTF_8_LOCALES
#error __CTYPE_HAS_UTF_8_LOCALES is defined!
#endif
#undef L__wchar_utf8sntowcs
#undef L__wchar_wcsntoutf8s

#endif /* __UCLIBC_HAS_LOCALE__ */
154
/**********************************************************************/
155
 
156
/* Longest UTF-8 sequence handled: 6 bytes when wchar_t can hold values
 * above 0xffff, otherwise 3 bytes. */
#if WCHAR_MAX > 0xffffUL
#define UTF_8_MAX_LEN 6
#else
#define UTF_8_MAX_LEN 3
#endif

/* Enables the additional validity checks guarded by #ifdef KUHN in the
 * UTF-8 work functions (surrogate range and 0xfffe/0xffff rejection). */
#define KUHN 1
163
 
164
/* Implementation-specific work functions. */
165
 
166
/* UTF-8 multibyte -> wide conversion: at most wn wide chars are produced
 * from at most n bytes starting at *src, using ps as conversion state. */
extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
								 const char **__restrict src, size_t n,
								 mbstate_t *ps, int allow_continuation);

/* Wide -> UTF-8 multibyte conversion: at most n bytes are produced from
 * at most wn wide chars starting at *src. */
extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
								 const wchar_t **__restrict src, size_t wn);
172
 
173
/* glibc extensions. */
174
 
175
/* Internal name of mbsnrtowcs(): NMC bounds the bytes read from *src,
 * len bounds the wide chars written to dst. */
extern size_t __mbsnrtowcs(wchar_t *__restrict dst,
						   const char **__restrict src,
						   size_t NMC, size_t len, mbstate_t *__restrict ps);

/* Internal name of wcsnrtombs(): NWC bounds the wide chars read from
 * *src, len bounds the bytes written to dst. */
extern size_t __wcsnrtombs(char *__restrict dst,
						   const wchar_t **__restrict src,
						   size_t NWC, size_t len, mbstate_t *__restrict ps);
182
 
183
/**********************************************************************/
184
#ifdef L_btowc
185
 
186
/*
 * btowc - convert a single byte to a wide character.
 *
 * c: the byte to convert, or EOF.
 *
 * Returns the wide character corresponding to c, or WEOF if c is EOF or
 * does not convert.  Without 8-bit locale support only 0..0x7f convert.
 */
wint_t btowc(int c)
{
#ifdef __CTYPE_HAS_8_BIT_LOCALES

	wchar_t wc;
	unsigned char buf[1];
	mbstate_t mbstate;

	if (c != EOF) {
		*buf = (unsigned char) c;
		mbstate.mask = 0;		/* Initialize the mbstate. */
		/* buf holds a raw byte; cast to the pointer type mbrtowc takes.
		 * Results 0 and 1 are successes; the error values (size_t)-1 and
		 * (size_t)-2 compare greater than 1. */
		if (mbrtowc(&wc, (const char *) buf, 1, &mbstate) <= 1) {
			return wc;
		}
	}
	return WEOF;

#else  /*  __CTYPE_HAS_8_BIT_LOCALES */

#ifdef __UCLIBC_HAS_LOCALE__
	assert((ENCODING == __ctype_encoding_7_bit)
		   || (ENCODING == __ctype_encoding_utf8));
#endif /* __UCLIBC_HAS_LOCALE__ */

	/* If we don't have 8-bit locale support, then this is trivial since
	 * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales.
	 * The unsigned compare also rejects EOF and other negative values. */
	return (((unsigned int)c) < 0x80) ? c : WEOF;

#endif /*  __CTYPE_HAS_8_BIT_LOCALES */
}
216
 
217
#endif
218
/**********************************************************************/
219
#ifdef L_wctob
220
 
221
/* Note: We completely ignore ps in all currently supported conversions. */
222
 
223
/*
 * wctob - convert a wide character to a single byte.
 *
 * c: the wide character to convert.
 *
 * Returns the byte value if c converts to exactly one byte, otherwise EOF.
 * Without 8-bit locale support only 0..0x7f convert.
 */
int wctob(wint_t c)
{
#ifdef __CTYPE_HAS_8_BIT_LOCALES

	unsigned char buf[MB_LEN_MAX];

	/* buf is kept unsigned so the returned byte is never negative; cast
	 * to the pointer type wcrtomb takes. */
	return (wcrtomb((char *) buf, c, NULL) == 1) ? *buf : EOF;

#else  /*  __CTYPE_HAS_8_BIT_LOCALES */

#ifdef __UCLIBC_HAS_LOCALE__
	assert((ENCODING == __ctype_encoding_7_bit)
		   || (ENCODING == __ctype_encoding_utf8));
#endif /* __UCLIBC_HAS_LOCALE__ */

	/* If we don't have 8-bit locale support, then this is trivial since
	 * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */

	/* TODO: need unsigned version of wint_t... */
/*	return (((unsigned int)c) < 0x80) ? c : WEOF; */
	return ((c >= 0) && (c < 0x80)) ? c : EOF;

#endif /*  __CTYPE_HAS_8_BIT_LOCALES */
}
247
 
248
#endif
249
/**********************************************************************/
250
#ifdef L_mbsinit
251
 
252
int mbsinit(const mbstate_t *ps)
253
{
254
        return !ps || !ps->mask;
255
}
256
 
257
#endif
258
/**********************************************************************/
259
#ifdef L_mbrlen
260
 
261
/* mbrlen is a weak alias for __mbrlen. */
size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
	 __attribute__ ((__weak__, __alias__("__mbrlen")));

/*
 * __mbrlen - length in bytes of the next multibyte character.
 *
 * Forwards to mbrtowc() with a NULL destination and returns its result.
 * When ps is NULL a state object private to this function is used.
 */
size_t __mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
{
	static mbstate_t mbstate;	/* Rely on bss 0-init. */

	return mbrtowc(NULL, s, n, (ps != NULL) ? ps : &mbstate);
}
270
 
271
#endif
272
/**********************************************************************/
273
#ifdef L_mbrtowc
274
 
275
size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
276
                           size_t n, mbstate_t *__restrict ps)
277
{
278
        static mbstate_t mbstate;       /* Rely on bss 0-init. */
279
        wchar_t wcbuf[1];
280
        const char *p;
281
        size_t r;
282
        char empty_string[1];           /* Avoid static to be fPIC friendly. */
283
 
284
        if (!ps) {
285
                ps = &mbstate;
286
        }
287
 
288
        if (!s) {
289
                pwc = (wchar_t *) s;    /* NULL */
290
                empty_string[0] = 0;      /* Init the empty string when necessary. */
291
                s = empty_string;
292
                n = 1;
293
        } else if (!n) {
294
                return (ps->mask && (ps->wc == 0xffffU)) /* TODO: change error code? */
295
                        ? ((size_t) -1) : ((size_t) -2);
296
        }
297
 
298
        p = s;
299
 
300
#ifdef __CTYPE_HAS_UTF_8_LOCALES
301
        /* Need to do this here since mbsrtowcs doesn't allow incompletes. */
302
        if (ENCODING == __ctype_encoding_utf8) {
303
                if (!pwc) {
304
                        pwc = wcbuf;
305
                }
306
                r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
307
                return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */
308
        }
309
#endif
310
 
311
#ifdef __UCLIBC_MJN3_ONLY__
312
#warning TODO: This adds a trailing nul!
313
#endif /* __UCLIBC_MJN3_ONLY__ */
314
 
315
        r = __mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);
316
 
317
        if (((ssize_t) r) >= 0) {
318
                if (pwc) {
319
                        *pwc = *wcbuf;
320
                }
321
        }
322
        return (size_t) r;
323
}
324
 
325
#endif
326
/**********************************************************************/
327
#ifdef L_wcrtomb
328
 
329
/* Note: We completely ignore ps in all currently supported conversions. */
330
/* TODO: Check for valid state anyway? */
331
 
332
/*
 * wcrtomb - convert one wide character to its multibyte representation.
 *
 * s:  destination buffer; when NULL an internal buffer is used and wc is
 *     treated as 0.
 * wc: the wide character to convert.
 * ps: conversion state, passed through to __wcsnrtombs().
 *
 * Converts exactly one wide character with a destination limit of
 * MB_LEN_MAX bytes via __wcsnrtombs().  A zero result from that call is
 * reported as 1; any other result is returned unchanged.
 */
size_t wcrtomb(register char *__restrict s, wchar_t wc,
			   mbstate_t *__restrict ps)
{
#ifdef __UCLIBC_MJN3_ONLY__
#warning TODO: Should wcsnrtombs nul-terminate unconditionally?  Check glibc.
#endif /* __UCLIBC_MJN3_ONLY__ */
	wchar_t wcbuf[1];
	const wchar_t *pwc;
	size_t r;
	char buf[MB_LEN_MAX];

	if (!s) {
		s = buf;
		wc = 0;
	}

	pwc = wcbuf;
	wcbuf[0] = wc;

	r = __wcsnrtombs(s, &pwc, 1, MB_LEN_MAX, ps);
	return (r != 0) ? r : 1;
}
354
 
355
#endif
356
/**********************************************************************/
357
#ifdef L_mbsrtowcs
358
 
359
/*
 * mbsrtowcs - convert a multibyte string to a wide-character string.
 *
 * Forwards to __mbsnrtowcs() with no limit on the number of source bytes
 * (SIZE_MAX).  When ps is NULL a state object private to this function
 * is used.
 */
size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
				 size_t len, mbstate_t *__restrict ps)
{
	static mbstate_t mbstate;	/* Rely on bss 0-init. */

	return __mbsnrtowcs(dst, src, SIZE_MAX, len,
						((ps != NULL) ? ps : &mbstate));
}
367
 
368
#endif
369
/**********************************************************************/
370
#ifdef L_wcsrtombs
371
 
372
/* Note: We completely ignore ps in all currently supported conversions.
373
 
374
 * TODO: Check for valid state anyway? */
375
 
376
/*
 * wcsrtombs - convert a wide-character string to a multibyte string.
 *
 * Forwards to __wcsnrtombs() with no limit on the number of source wide
 * characters (SIZE_MAX); ps is passed through unchanged.
 */
size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
				 size_t len, mbstate_t *__restrict ps)
{
	return __wcsnrtombs(dst, src, SIZE_MAX, len, ps);
}
381
 
382
#endif
383
/**********************************************************************/
384
#ifdef L__wchar_utf8sntowcs
385
 
386
/* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
387
 * UTF-8-test.txt stress test.
388
 */
389
/*  #define DECODER */
390
 
391
#ifdef DECODER
392
#ifndef KUHN
393
#define KUHN
394
#endif
395
#endif
396
 
397
size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
398
                                                  const char **__restrict src, size_t n,
399
                                                  mbstate_t *ps, int allow_continuation)
400
{
401
        register const char *s;
402
        __uwchar_t mask;
403
        __uwchar_t wc;
404
        wchar_t wcbuf[1];
405
        size_t count;
406
        int incr;
407
 
408
        s = *src;
409
 
410
        assert(s != NULL);
411
        assert(ps != NULL);
412
 
413
        incr = 1;
414
        /* NOTE: The following is an AWFUL HACK!  In order to support %s in
415
         * wprintf, we need to be able to compute the number of wchars needed
416
         * for the mbs conversion, not to exceed the precision specified.
417
         * But if dst is NULL, the return value is the length assuming a
418
         * sufficiently sized buffer.  So, we allow passing of (wchar_t *) ps
419
         * as pwc in order to flag that we really want the length, subject
420
         * to the restricted buffer size and no partial conversions.
421
         * See mbsnrtowcs() as well. */
422
        if (!pwc || (pwc == ((wchar_t *)ps))) {
423
                if (!pwc) {
424
                        wn = SIZE_MAX;
425
                }
426
                pwc = wcbuf;
427
                incr = 0;
428
        }
429
 
430
        /* This is really here only to support the glibc extension function
431
         * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
432
         * check on the validity of the mbstate. */
433
        if (!(count = wn)) {
434
                return 0;
435
        }
436
 
437
        if ((mask = (__uwchar_t) ps->mask) != 0) { /* A continuation... */
438
#ifdef DECODER
439
                wc = (__uwchar_t) ps->wc;
440
                if (n) {
441
                        goto CONTINUE;
442
                }
443
                goto DONE;
444
#else
445
                if ((wc = (__uwchar_t) ps->wc) != 0xffffU) {
446
                        /* TODO: change error code here and below? */
447
                        if (n) {
448
                                goto CONTINUE;
449
                        }
450
                        goto DONE;
451
                }
452
                __set_errno(EILSEQ);
453
                return (size_t) -1;             /* We're in an error state. */
454
#endif
455
        }
456
 
457
        do {
458
                if (!n) {
459
                        goto DONE;
460
                }
461
                --n;
462
                if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
463
                        mask = 0x40;
464
#ifdef __UCLIBC_MJN3_ONLY__
465
#warning TODO: Fix range for 16 bit wchar_t case.
466
#endif
467
                        if ( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) {
468
                                goto START;
469
                        }
470
                BAD:
471
#ifdef DECODER
472
                        wc = 0xfffdU;
473
                        goto COMPLETE;
474
#else
475
                        ps->mask = mask;
476
                        ps->wc = 0xffffU;
477
                        __set_errno(EILSEQ);
478
                        return (size_t) -1;     /* Illegal start byte! */
479
#endif
480
 
481
                CONTINUE:
482
                        while (n) {
483
                                --n;
484
                                if ((*s & 0xc0) != 0x80) {
485
                                        goto BAD;
486
                                }
487
                                mask <<= 5;
488
                                wc <<= 6;
489
                                wc += (*s & 0x3f);      /* keep seperate for bcc (smaller code) */
490
                                ++s;
491
                        START:
492
                                wc &= ~(mask << 1);
493
 
494
                                if ((wc & mask) == 0) {  /* Character completed. */
495
                                        if ((mask >>= 5) == 0x40) {
496
                                                mask += mask;
497
                                        }
498
                                        /* Check for invalid sequences (longer than necessary)
499
                                         * and invalid chars.  */
500
                                        if ( (wc < mask) /* Sequence not minimal length. */
501
#ifdef KUHN
502
#if UTF_8_MAX_LEN == 3
503
#error broken since mask can overflow!!
504
                                                 /* For plane 0, these are the only defined values.*/
505
                                                 || (wc > 0xfffdU)
506
#else
507
                                                 /* Note that we don't need to worry about exceeding */
508
                                                 /* 31 bits as that is the most that UTF-8 provides. */
509
                                                 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
510
#endif
511
                                                 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
512
#endif /* KUHN */
513
                                                 ) {
514
                                                goto BAD;
515
                                        }
516
                                        goto COMPLETE;
517
                                }
518
                        }
519
                        /* Character potentially valid but incomplete. */
520
                        if (!allow_continuation) {
521
                                if (count != wn) {
522
                                        return 0;
523
                                }
524
                                /* NOTE: The following can fail if you allow and then disallow
525
                                 * continuation!!! */
526
#if UTF_8_MAX_LEN == 3
527
#error broken since mask can overflow!!
528
#endif
529
                                /* Need to back up... */
530
                                do {
531
                                        --s;
532
                                } while ((mask >>= 5) >= 0x40);
533
                                goto DONE;
534
                        }
535
                        ps->mask = (wchar_t) mask;
536
                        ps->wc = (wchar_t) wc;
537
                        *src = s;
538
                        return (size_t) -2;
539
                }
540
        COMPLETE:
541
                *pwc = wc;
542
                pwc += incr;
543
        }
544
#ifdef DECODER
545
        while (--count);
546
#else
547
        while (wc && --count);
548
 
549
        if (!wc) {
550
                s = NULL;
551
        }
552
#endif
553
 
554
 DONE:
555
        /* ps->wc is irrelavent here. */
556
        ps->mask = 0;
557
        if (pwc != wcbuf) {
558
                *src = s;
559
        }
560
 
561
        return wn - count;
562
}
563
 
564
#endif
565
/**********************************************************************/
566
#ifdef L__wchar_wcsntoutf8s
567
 
568
size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
569
                                                  const wchar_t **__restrict src, size_t wn)
570
{
571
        register char *p;
572
        size_t len, t;
573
        __uwchar_t wc;
574
        const __uwchar_t *swc;
575
        int store;
576
        char buf[MB_LEN_MAX];
577
        char m;
578
 
579
        store = 1;
580
        /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
581
         * printf, we need to be able to compute the number of bytes needed
582
         * for the mbs conversion, not to exceed the precision specified.
583
         * But if dst is NULL, the return value is the length assuming a
584
         * sufficiently sized buffer.  So, we allow passing of (char *) src
585
         * as dst in order to flag that we really want the length, subject
586
         * to the restricted buffer size and no partial conversions.
587
         * See wcsnrtombs() as well. */
588
        if (!s || (s == ((char *) src))) {
589
                if (!s) {
590
                        n = SIZE_MAX;
591
                }
592
            s = buf;
593
                store = 0;
594
        }
595
 
596
        t = n;
597
        swc = (const __uwchar_t *) *src;
598
 
599
        assert(swc != NULL);
600
 
601
        while (wn && t) {
602
                wc = *swc;
603
 
604
                *s = wc;
605
                len = 1;
606
 
607
                if (wc >= 0x80) {
608
#ifdef KUHN
609
                        if (
610
#if UTF_8_MAX_LEN == 3
611
                                /* For plane 0, these are the only defined values.*/
612
                                /* Note that we don't need to worry about exceeding */
613
                                /* 31 bits as that is the most that UTF-8 provides. */
614
                                (wc > 0xfffdU)
615
#else
616
                                /* UTF_8_MAX_LEN == 6 */
617
                                (wc > 0x7fffffffUL)
618
                                || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
619
#endif
620
                                || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
621
                                ) {
622
                                __set_errno(EILSEQ);
623
                                return (size_t) -1;
624
                        }
625
#else  /* KUHN */
626
#if UTF_8_MAX_LEN != 3
627
                        if (wc > 0x7fffffffUL) { /* Value too large. */
628
                                __set_errno(EILSEQ);
629
                                return (size_t) -1;
630
                        }
631
#endif
632
#endif /* KUHN */
633
 
634
                        wc >>= 1;
635
                        p = s;
636
                        do {
637
                                ++p;
638
                        } while (wc >>= 5);
639
                        wc = *swc;
640
                        if ((len = p - s) > t) { /* Not enough space. */
641
                                break;
642
                        }
643
 
644
                        m = 0x80;
645
                        while( p>s ) {
646
                                m = (m >> 1) | 0x80;
647
                                *--p = (wc & 0x3f) | 0x80;
648
                                wc >>= 6;
649
                        }
650
                        *s |= (m << 1);
651
                } else if (wc == 0) {    /* End of string. */
652
                        swc = NULL;
653
                        break;
654
                }
655
 
656
                ++swc;
657
                --wn;
658
                t -= len;
659
                if (store) {
660
                        s += len;
661
                }
662
        }
663
 
664
        if (store) {
665
                *src = (const wchar_t *) swc;
666
        }
667
 
668
        return n - t;
669
}
670
 
671
 
672
#endif
673
/**********************************************************************/
674
#ifdef L___mbsnrtowcs
675
 
676
/* WARNING: We treat len as SIZE_MAX when dst is NULL! */
677
 
678
/* mbsnrtowcs is a weak alias for __mbsnrtowcs. */
size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
				  size_t NMC, size_t len, mbstate_t *__restrict ps)
	 __attribute__ ((__weak__, __alias__("__mbsnrtowcs")));

/*
 * __mbsnrtowcs - convert at most NMC bytes of a multibyte string into at
 * most len wide characters.
 *
 * dst: destination.  NULL requests a length-only conversion with len
 *      treated as SIZE_MAX; passing (wchar_t *) ps requests a length-only
 *      conversion that still honours len.
 * src: in/out pointer to the source bytes.
 * NMC: maximum number of source bytes to examine.
 * len: maximum number of wide chars to produce.
 * ps:  conversion state; NULL selects a state private to this function.
 *
 * In a UTF-8 locale the work is delegated to _wchar_utf8sntowcs() and an
 * incomplete-character result ((size_t)-2) is reported as 0.  Otherwise
 * each byte maps to one wide char: through the locale tables for an
 * 8-bit encoding, or directly for the 7-bit encoding.
 *
 * Returns the number of wide chars produced; a nul is stored but not
 * counted and sets *src to NULL.  Returns (size_t)-1 with errno set to
 * EILSEQ for a byte that has no mapping.  *src is updated only when
 * output is actually being stored.
 */
size_t __mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
					size_t NMC, size_t len, mbstate_t *__restrict ps)
{
	static mbstate_t mbstate;	/* Rely on bss 0-init. */
	wchar_t wcbuf[1];
	const char *s;
	size_t count;
	int incr;

	if (!ps) {
		ps = &mbstate;
	}

#ifdef __CTYPE_HAS_UTF_8_LOCALES
	if (ENCODING == __ctype_encoding_utf8) {
		size_t r;
		return ((r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1))
				!= (size_t) -2) ? r : 0;
	}
#endif
	incr = 1;
	/* NOTE: The following is an AWFUL HACK!  In order to support %s in
	 * wprintf, we need to be able to compute the number of wchars needed
	 * for the mbs conversion, not to exceed the precision specified.
	 * But if dst is NULL, the return value is the length assuming a
	 * sufficiently sized buffer.  So, we allow passing of ((wchar_t *)ps)
	 * as dst in order to flag that we really want the length, subject
	 * to the restricted buffer size and no partial conversions.
	 * See _wchar_utf8sntowcs() as well. */
	if (!dst || (dst == ((wchar_t *)ps))) {
		if (!dst) {
			len = SIZE_MAX;
		}
		dst = wcbuf;			/* Every result overwrites the same slot. */
		incr = 0;
	}

	/* Since all the following encodings are single-byte encodings... */
	if (len > NMC) {
		len = NMC;
	}

	count = len;
	s = *src;

#ifdef __CTYPE_HAS_8_BIT_LOCALES
	if (ENCODING == __ctype_encoding_8_bit) {
		wchar_t wc;
		while (count) {
			if ((wc = ((unsigned char)(*s))) >= 0x80) {	/* Non-ASCII... */
				/* Two-level table lookup; a zero entry means the byte
				 * has no wide-character mapping. */
				wc -= 0x80;
				wc = __UCLIBC_CURLOCALE_DATA.tbl8c2wc[
						  (__UCLIBC_CURLOCALE_DATA.idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
						   << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
				if (!wc) {
					goto BAD;
				}
			}
			if (!(*dst = wc)) {	/* Nul stored: end of string. */
				s = NULL;
				break;
			}
			dst += incr;
			++s;
			--count;
		}
		if (dst != wcbuf) {
			*src = s;
		}
		return len - count;
	}
#endif

#ifdef __UCLIBC_HAS_LOCALE__
	assert(ENCODING == __ctype_encoding_7_bit);
#endif

	while (count) {
		if ((*dst = (unsigned char) *s) == 0) {	/* Nul stored: end of string. */
			s = NULL;
			break;
		}
		if (*dst >= 0x80) {
#ifdef __CTYPE_HAS_8_BIT_LOCALES
		BAD:
#endif
			__set_errno(EILSEQ);
			return (size_t) -1;
		}
		++s;
		dst += incr;
		--count;
	}
	if (dst != wcbuf) {
		*src = s;
	}
	return len - count;
}
780
 
781
#endif
782
/**********************************************************************/
783
#ifdef L___wcsnrtombs
784
 
785
/* WARNING: We treat len as SIZE_MAX when dst is NULL! */
786
 
787
/* Note: We completely ignore ps in all currently supported conversions.
788
 * TODO: Check for valid state anyway? */
789
 
790
/*
 * __wcsnrtombs (also exported through the weak alias wcsnrtombs)
 *
 * Converts at most NWC wide characters taken from *src into at most len
 * bytes, stopping early once a terminating nul has been converted.
 *
 *   dst  Output buffer.  If NULL, nothing is stored and len is replaced by
 *        SIZE_MAX.  If dst == (char *) src (the printf %ls hack described
 *        below), nothing is stored either but len is still honoured.
 *   src  In/out.  Only when bytes are really being stored is *src updated:
 *        to NULL if the terminating nul was converted, otherwise to the
 *        first wide character that was not converted.
 *   NWC  Upper bound on the number of wide characters consumed.
 *   len  Upper bound on the number of bytes produced.
 *   ps   Not referenced anywhere in this body.
 *
 * Returns the number of bytes produced, not counting a terminating nul.
 * In the 7-bit path a wide character >= 0x80 makes the function set errno
 * to EILSEQ and return (size_t) -1.  In the 8-bit path, because
 * __WCHAR_REPLACEMENT_CHAR is defined just before its use, characters
 * without a table entry are emitted as '?' instead of failing.
 * The UTF-8 case is delegated entirely to _wchar_wcsntoutf8s().
 */
size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
791
                                  size_t NWC, size_t len, mbstate_t *__restrict ps)
792
         __attribute__ ((__weak__, __alias__("__wcsnrtombs")));
793
 
794
size_t __wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
795
                                        size_t NWC, size_t len, mbstate_t *__restrict ps)
796
{
797
        const __uwchar_t *s;
798
        size_t count;
799
        int incr;
800
        char buf[MB_LEN_MAX];
801
 
802
#ifdef __CTYPE_HAS_UTF_8_LOCALES
803
        if (ENCODING == __ctype_encoding_utf8) {
804
                return _wchar_wcsntoutf8s(dst, len, src, NWC);
805
        }
806
#endif /* __CTYPE_HAS_UTF_8_LOCALES */
807
 
808
        /* incr is the step applied to dst after each byte: 1 when storing,
         * 0 when only counting (every byte then lands in the same slot). */
        incr = 1;
809
        /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
810
         * printf, we need to be able to compute the number of bytes needed
811
         * for the mbs conversion, not to exceed the precision specified.
812
         * But if dst is NULL, the return value is the length assuming a
813
         * sufficiently sized buffer.  So, we allow passing of (char *) src
814
         * as dst in order to flag that we really want the length, subject
815
         * to the restricted buffer size and no partial conversions.
816
         * See _wchar_wcsntoutf8s() as well. */
817
        if (!dst || (dst == ((char *) src))) {
818
                if (!dst) {
819
                        len = SIZE_MAX;
820
                }
821
                /* Counting only: redirect all writes to the local scratch buffer. */
                dst = buf;
822
                incr = 0;
823
        }
824
 
825
        /* Since all the following encodings are single-byte encodings... */
826
        if (len > NWC) {
827
                len = NWC;
828
        }
829
 
830
        /* count is the remaining budget; len - count is the number produced. */
        count = len;
831
        s = (const __uwchar_t *) *src;
832
 
833
#ifdef __CTYPE_HAS_8_BIT_LOCALES
834
        if (ENCODING == __ctype_encoding_8_bit) {
835
                __uwchar_t wc;
836
                __uwchar_t u;
837
                while (count) {
838
                        if ((wc = *s) <= 0x7f) {
839
                                /* Values up to 0x7f are copied unchanged; a nul ends the
                                 * conversion and is reported by setting s to NULL. */
                                if (!(*dst = (unsigned char) wc)) {
840
                                        s = NULL;
841
                                        break;
842
                                }
843
                        } else {
844
                                /* u stays 0 unless the lookup below yields a byte. */
                                u = 0;
845
                                if (wc <= Cwc2c_DOMAIN_MAX) {
846
                                        /* Three chained table lookups: idx8wc2c selects a row
                                         * of tbl8wc2c, whose entry selects the final slot
                                         * located after the first Cwc2c_TI_LEN entries. */
                                        u = __UCLIBC_CURLOCALE_DATA.idx8wc2c[wc >> (Cwc2c_TI_SHIFT
847
                                                                                                                + Cwc2c_TT_SHIFT)];
848
                                        u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
849
                                                                        + ((wc >> Cwc2c_TT_SHIFT)
850
                                                                           & ((1 << Cwc2c_TI_SHIFT)-1))];
851
                                        u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[Cwc2c_TI_LEN
852
                                                                        + (u << Cwc2c_TT_SHIFT)
853
                                                                        + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
854
                                }
855
 
856
/* Defined unconditionally here, so the #else branch below (and the BAD
 * label further down) is compiled out. */
#define __WCHAR_REPLACEMENT_CHAR '?'
857
#ifdef __WCHAR_REPLACEMENT_CHAR
858
                                *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
859
#else  /* __WCHAR_REPLACEMENT_CHAR */
860
                                if (!u) {
861
                                        goto BAD;
862
                                }
863
                                *dst = (unsigned char) u;
864
#endif /* __WCHAR_REPLACEMENT_CHAR */
865
                        }
866
                        ++s;
867
                        dst += incr;
868
                        --count;
869
                }
870
                /* dst still equals buf only in counting mode; leave *src alone then. */
                if (dst != buf) {
871
                        *src = (const wchar_t *) s;
872
                }
873
                return len - count;
874
        }
875
#endif /* __CTYPE_HAS_8_BIT_LOCALES */
876
 
877
#ifdef __UCLIBC_HAS_LOCALE__
878
        assert(ENCODING == __ctype_encoding_7_bit);
879
#endif
880
 
881
        /* 7-bit path: anything >= 0x80 is an error. */
        while (count) {
882
                if (*s >= 0x80) {
883
#if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
884
                BAD:
885
#endif
886
                        __set_errno(EILSEQ);
887
                        return (size_t) -1;
888
                }
889
                /* A stored nul terminates the conversion; s = NULL reports it. */
                if ((*dst = (unsigned char) *s) == 0) {
890
                        s = NULL;
891
                        break;
892
                }
893
                ++s;
894
                dst += incr;
895
                --count;
896
        }
897
        /* Same rule as above: *src is only updated when bytes were stored. */
        if (dst != buf) {
898
                *src = (const wchar_t *) s;
899
        }
900
        return len - count;
901
}
902
 
903
#endif
904
/**********************************************************************/
905
#ifdef L_wcswidth
906
 
907
#ifdef __UCLIBC_MJN3_ONLY__
908
#warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating.
909
#warning TODO: Update wcwidth to match latest by Kuhn.
910
#endif
911
 
912
#if defined(__UCLIBC_HAS_LOCALE__) && \
913
( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
914
 
915
/* First-level index for wcswidth()'s handling of wide characters in the
 * range 0x100..0xffff.  It is indexed by the high byte (wc >> 8); entry h
 * and entry h+1 are used as the lower and upper bounds of the slice of
 * new_tbl[] / new_wtbl[] that is binary-searched for that high byte.  The
 * extra final entry provides the upper bound for high byte 0xff. */
static const unsigned char new_idx[] = {
916
        0,    5,    5,    6,   10,   15,   28,   39,
917
        48,   48,   71,   94,  113,  128,  139,  154,
918
        175,  186,  188,  188,  188,  188,  188,  188,
919
        203,  208,  208,  208,  208,  208,  208,  208,
920
        208,  219,  219,  219,  222,  222,  222,  222,
921
        222,  222,  222,  222,  222,  222,  222,  224,
922
        224,  231,  231,  231,  231,  231,  231,  231,
923
        231,  231,  231,  231,  231,  231,  231,  231,
924
        231,  231,  231,  231,  231,  231,  231,  231,
925
        231,  231,  231,  231,  231,  231,  231,  231,
926
        231,  231,  231,  231,  231,  231,  231,  231,
927
        231,  231,  231,  231,  231,  231,  231,  231,
928
        231,  231,  231,  231,  231,  231,  231,  231,
929
        231,  231,  231,  231,  231,  231,  231,  231,
930
        231,  231,  231,  231,  231,  231,  231,  231,
931
        231,  231,  231,  231,  231,  231,  231,  231,
932
        231,  231,  231,  231,  231,  231,  231,  231,
933
        231,  231,  231,  231,  231,  231,  231,  231,
934
        231,  231,  231,  231,  231,  231,  231,  231,
935
        231,  231,  231,  231,  231,  231,  231,  231,
936
        231,  231,  231,  231,  231,  233,  233,  233,
937
        233,  233,  233,  233,  234,  234,  234,  234,
938
        234,  234,  234,  234,  234,  234,  234,  234,
939
        234,  234,  234,  234,  234,  234,  234,  234,
940
        234,  234,  234,  234,  234,  234,  234,  234,
941
        234,  234,  234,  234,  234,  234,  234,  234,
942
        234,  234,  234,  234,  234,  234,  234,  234,
943
        236,  236,  236,  236,  236,  236,  236,  236,
944
        236,  236,  236,  236,  236,  236,  236,  236,
945
        236,  236,  236,  236,  236,  236,  236,  236,
946
        236,  236,  236,  236,  236,  236,  236,  236,
947
        236,  237,  237,  238,  241,  241,  242,  249,
948
        255,
949
};
950
 
951
/* Second-level table for wcswidth(): low-byte boundary values.  Within the
 * slice selected through new_idx[], wcswidth() binary-searches these values
 * against (wc & 0xff); the index it settles on is then used to read the
 * width from new_wtbl[].
 * NOTE(review): the search only works if every slice is in ascending
 * order -- that appears to hold for the data below but is not checked. */
static const unsigned char new_tbl[] = {
952
        0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
953
        0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
954
        0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
955
        0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
956
        0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
957
        0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
958
        0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
959
        0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
960
        0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
961
        0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
962
        0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
963
        0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
964
        0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
965
        0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
966
        0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
967
        0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
968
        0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
969
        0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
970
        0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
971
        0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
972
        0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
973
        0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
974
        0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
975
        0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
976
        0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
977
        0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
978
        0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
979
        0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
980
        0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
981
        0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
982
        0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
983
        0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
984
};
985
 
986
/* Width table for wcswidth(), read with the index found by the binary
 * search over new_tbl[]: the selected value is added directly to the
 * running column count.  The values present are -1, 0, 1 and 2; the
 * lookup site in wcswidth() carries the remark "none should be -1" for
 * the entries it can actually reach. */
static const signed char new_wtbl[] = {
987
        0,   -1,    1,   -1,    1,    1,    0,    1,
988
        0,    1,    1,    0,    1,    0,    1,    1,
989
        0,    1,    0,    1,    0,    1,    0,    1,
990
        0,    1,    0,    1,    1,    0,    1,    0,
991
        1,    0,    1,    0,    1,    0,    1,    1,
992
        0,    1,    0,    1,    0,    1,    0,    1,
993
        1,    0,    1,    0,    1,    0,    1,    0,
994
        1,    0,    1,    0,    1,    0,    1,    0,
995
        1,    0,    1,    0,    1,    0,    1,    1,
996
        0,    1,    0,    1,    0,    1,    0,    1,
997
        0,    1,    0,    1,    0,    1,    0,    1,
998
        0,    1,    0,    1,    0,    1,    1,    0,
999
        1,    0,    1,    0,    1,    0,    1,    0,
1000
        1,    0,    1,    0,    1,    0,    1,    0,
1001
        1,    1,    0,    1,    0,    1,    0,    1,
1002
        0,    1,    0,    1,    0,    1,    0,    1,
1003
        1,    0,    1,    0,    1,    0,    1,    0,
1004
        1,    0,    1,    1,    0,    1,    0,    1,
1005
        0,    1,    0,    1,    0,    1,    0,    1,
1006
        0,    1,    1,    0,    1,    0,    1,    0,
1007
        1,    0,    1,    0,    1,    0,    1,    0,
1008
        1,    0,    1,    0,    1,    0,    1,    1,
1009
        0,    1,    0,    1,    0,    1,    0,    1,
1010
        0,    1,    2,    0,    1,    0,    1,    0,
1011
        1,    0,    1,    0,    1,    0,    1,    0,
1012
        1,    0,    1,    1,    0,    1,    0,    1,
1013
        1,    0,    1,    0,    1,    0,    1,    0,
1014
        1,    0,    1,    1,    2,    1,    1,    2,
1015
        2,    0,    2,    1,    2,    0,    2,    2,
1016
        1,    1,    2,    1,    1,    2,    1,    0,
1017
        1,    1,    0,    1,    0,    1,    2,    1,
1018
        0,    2,    1,    2,    1,    0,    1,
1019
};
1020
 
1021
/*
 * wcswidth (locale-aware build)
 *
 * Sums the column widths of the wide characters in pwcs, looking at no
 * more than n of them and stopping at the first nul.
 *
 * A validation pass that depends on ENCODING runs first:
 *   - 7-bit:  any value that does not survive a cast to unsigned char
 *             makes the function return -1;
 *   - 8-bit:  __wcsnrtombs() is run in counting mode (dst == NULL) and a
 *             (size_t) -1 result makes the function return -1;
 *   - UTF-8 (only when KUHN is defined): 0xfffe, 0xffff and the range
 *             0xd800..0xdfff make the function return -1.
 *
 * The counting pass then classifies each character:
 *   - <= 0xff:   values below 32 and values 0x7f..0x9f return -1, all
 *                others count as one column;
 *   - <= 0xffff: width comes from the new_idx / new_tbl / new_wtbl tables;
 *   - above:     the listed ranges inside 0x1d167..0x1d1ad, 0xe0001 and
 *                0xe0020..0xe007f count as 0; 0x20000..0x2ffff counts as 2;
 *                everything else counts as 1.
 *
 * Returns the total width, or -1 as described above.
 */
int wcswidth(const wchar_t *pwcs, size_t n)
1022
{
1023
    int h, l, m, count;
1024
    wchar_t wc;
1025
    unsigned char b;
1026
 
1027
        if (ENCODING == __ctype_encoding_7_bit) {
1028
                size_t i;
1029
 
1030
                for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1031
                        if (pwcs[i] != ((unsigned char)(pwcs[i]))) {
1032
                                return -1;
1033
                        }
1034
                }
1035
        }
1036
#ifdef __CTYPE_HAS_8_BIT_LOCALES
1037
        else if (ENCODING == __ctype_encoding_8_bit) {
1038
                mbstate_t mbstate;
1039
 
1040
                mbstate.mask = 0;                        /* Initialize the mbstate. */
1041
                /* dst == NULL: conversion is only simulated, pwcs is not advanced. */
                if (__wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
1042
                        return -1;
1043
                }
1044
        }
1045
#endif /* __CTYPE_HAS_8_BIT_LOCALES */
1046
#if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
1047
        /* For stricter handling of allowed unicode values... see comments above. */
1048
        else if (ENCODING == __ctype_encoding_utf8) {
1049
                size_t i;
1050
 
1051
                for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1052
                        /* Unsigned subtract-and-compare range tests:
                         * 0xfffe..0xffff, then 0xd800..0xdfff. */
                        if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
1053
                                 || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
1054
                                ) {
1055
                                return -1;
1056
                        }
1057
                }
1058
        }
1059
#endif /* __CTYPE_HAS_UTF_8_LOCALES */
1060
 
1061
    for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1062
                if (wc <= 0xff) {
1063
                        /* If we're here, wc != 0. */
1064
                        if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1065
                                return -1;
1066
                        }
1067
                        ++count;
1068
                        continue;
1069
                }
1070
                if (((unsigned int) wc) <= 0xffff) {
1071
                        /* b is the low byte, h the high byte.  new_idx[h] and
                         * new_idx[h+1] bound the table slice for this high byte;
                         * the loop halves [l, h) until l is the last slot whose
                         * new_tbl value is <= b. */
                        b = wc & 0xff;
1072
                        h = (wc >> 8);
1073
                        l = new_idx[h];
1074
                        h = new_idx[h+1];
1075
                        while ((m = (l+h) >> 1) != l) {
1076
                                if (b >= new_tbl[m]) {
1077
                                        l = m;
1078
                                } else {                /* b < new_tbl[m] */
1079
                                        h = m;
1080
                                }
1081
                        }
1082
                        count += new_wtbl[l]; /* none should be -1. */
1083
                        continue;
1084
                }
1085
 
1086
                /* Redo this to minimize average number of compares?*/
1087
                if (wc >= 0x1d167) {
1088
                        if (wc <= 0x1d1ad) {
1089
                                /* Zero-width sub-ranges: 0x1d167..0x1d169,
                                 * 0x1d173..0x1d182, 0x1d185..0x1d18b, 0x1d1aa..0x1d1ad. */
                                if ((wc <= 0x1d169
1090
                                         || (wc >= 0x1d173
1091
                                                 && (wc <= 0x1d182
1092
                                                         || (wc >= 0x1d185
1093
                                                                 && (wc <= 0x1d18b
1094
                                                                         || (wc >= 0x1d1aa))))))
1095
                                        ) {
1096
                                        continue;
1097
                                }
1098
                        } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
1099
                                continue;
1100
                        } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
1101
                                ++count;                /* need 2.. add one here */
1102
                        }
1103
#if (WCHAR_MAX > 0x7fffffffL)
1104
                        else if (wc > 0x7fffffffL) {
1105
                                return -1;
1106
                        }
1107
#endif /* (WCHAR_MAX > 0x7fffffffL) */
1108
                }
1109
 
1110
                /* Default contribution of one column (second column for the
                 * double-width range handled above). */
                ++count;
1111
    }
1112
 
1113
    return count;
1114
}
1115
 
1116
#else  /*  __UCLIBC_HAS_LOCALE__ */
1117
 
1118
/*
 * wcswidth (build without locale support)
 *
 * Returns the number of columns taken by the first n wide characters of
 * pwcs, stopping early at a nul.  Only values 0x20..0x7e and 0xa0..0xff
 * are accepted, each counting as one column; any other value makes the
 * function return -1.
 */
int wcswidth(const wchar_t *pwcs, size_t n)
{
        int width = 0;
        size_t i;

        for (i = 0; i < n; i++) {
                wchar_t ch = pwcs[i];

                if (ch == 0) {
                        break;                  /* end of string */
                }
                if (ch > 0xff) {
                        return -1;              /* outside the single-byte range */
                }
                if (ch < 32) {
                        return -1;              /* control character (or negative value) */
                }
                if ((ch >= 0x7f) && (ch < 0xa0)) {
                        return -1;              /* 0x7f and the 0x80..0x9f block */
                }
                ++width;
        }

        return width;
}
1138
 
1139
#endif /*  __UCLIBC_HAS_LOCALE__ */
1140
 
1141
#endif
1142
/**********************************************************************/
1143
#ifdef L_wcwidth
1144
 
1145
/* wcwidth -- column width of a single wide character, obtained by handing
 * a one-element sequence to wcswidth().  The result is whatever wcswidth()
 * returns for it. */
int wcwidth(wchar_t wc)
{
        const wchar_t *single = &wc;

        return wcswidth(single, 1);
}
1149
 
1150
#endif
1151
/**********************************************************************/
1152
 
1153
 
1154
/* Conversion descriptor handed out by iconv_open() (cast to iconv_t) and
 * released by iconv_close(). */
typedef struct {
1155
        /* Output-side state; the visible code only ever resets its mask to 0. */
        mbstate_t tostate;
1156
        /* Input-side state; passed to the UTF-8 decoder in iconv(). */
        mbstate_t fromstate;
1157
        /* Codeset id of the target, as returned by find_codeset(). */
        int tocodeset;
1158
        /* Current codeset id of the source; iconv() may flip bit 0 after a
         * byte-swapped BOM. */
        int fromcodeset;
1159
        /* Non-zero while a BOM is still to be looked for on input. */
        int frombom;
1160
        /* Non-zero while a BOM is still to be emitted on output. */
        int tobom;
1161
        /* The three *0 members keep the values chosen by iconv_open(); iconv()
         * copies them back when asked to reset the conversion state. */
        int fromcodeset0;
1162
        int frombom0;
1163
        int tobom0;
1164
        int skip_invalid_input;         /* To support iconv -c option. */
1165
} _UC_iconv_t;
1166
 
1167
 
1168
 
1169
#ifdef L_iconv
1170
 
1171
#include <iconv.h>
1172
#include <string.h>
1173
#include <endian.h>
1174
#include <byteswap.h>
1175
 
1176
#if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
1177
#error unsupported endianness for iconv
1178
#endif
1179
 
1180
#ifndef __CTYPE_HAS_8_BIT_LOCALES
1181
#error currently iconv requires 8 bit locales
1182
#endif
1183
#ifndef __CTYPE_HAS_UTF_8_LOCALES
1184
#error currently iconv requires UTF-8 locales
1185
#endif
1186
 
1187
 
1188
/* Codeset ids used by iconv().  IC_WCHAR_T and IC_MULTIBYTE deliberately
 * share the value 0xe0: iconv() treats every id >= IC_MULTIBYTE as a
 * fixed-width 2- or 4-byte encoding.  For each byte order the IC_UCS_* and
 * IC_UTF_* values equal the ids stored in __iconv_codesets[] for the
 * corresponding "...BE" names; iconv() also uses them as bit masks.
 * IC_UTF_8 and IC_ASCII match the ids of "UTF-8" and "ASCII"/"US-ASCII"
 * in that table. */
enum {
1189
        IC_WCHAR_T = 0xe0,
1190
        IC_MULTIBYTE = 0xe0,
1191
#if __BYTE_ORDER == __BIG_ENDIAN
1192
        IC_UCS_4 =      0xec,
1193
        IC_UTF_32 = 0xe4,
1194
        IC_UCS_2 =      0xe2,
1195
        IC_UTF_16 = 0xea,
1196
#else
1197
        IC_UCS_4 =      0xed,
1198
        IC_UTF_32 = 0xe5,
1199
        IC_UCS_2 =      0xe3,
1200
        IC_UTF_16 = 0xeb,
1201
#endif
1202
        IC_UTF_8 = 2,
1203
        IC_ASCII = 1
1204
};
1205
 
1206
/* For the multibyte codeset ids (values >= IC_MULTIBYTE):
1207
 * bit 0 means swap endian
1208
 * bit 1 means 2 byte
1209
 * bit 2 means 4 byte
1210
 *
1211
 */
1212
 
1213
/*
 * Table of the codeset names understood by find_codeset().
 *
 * The table is a sequence of variable-length entries packed into one
 * string.  Each entry is laid out as
 *
 *     byte 0    total entry length, i.e. the offset to the next entry
 *     byte 1    codeset id returned to the caller
 *     byte 2..  nul-terminated name
 *
 * so the length byte must be exactly 2 + strlen(name) + 1.  The final
 * entry ("ASCII") is the one exception: its length byte stops short of the
 * nul, so that the string literal's own terminator serves both as the end
 * of the name and as the end-of-table marker.
 *
 * Every escape below is followed by a separate string literal so that a
 * hex escape cannot absorb the first letters of the name.
 *
 * NOTE(review): the little-endian branch gives the plain "UTF-32" and
 * "UTF-16" names ids with bit 0x10 set (0xf4 / 0xfa), which iconv_open()
 * turns into its BOM flags; the big-endian branch uses 0xe4 / 0xea, without
 * that bit.  Confirm whether the big-endian ids should carry it as well.
 */
const unsigned char __iconv_codesets[] =
        "\x0a\xe0""WCHAR_T\x00"         /* superset of UCS-4 but platform-endian */
#if __BYTE_ORDER == __BIG_ENDIAN
        "\x08\xec""UCS-4\x00"           /* always BE */
        "\x0a\xec""UCS-4BE\x00"
        "\x0a\xed""UCS-4LE\x00"
        /* Was "\x09\fe4": '\f' is a form feed, which made this entry 11 bytes
         * long under a length byte of 9 and derailed the table walk. */
        "\x09\xe4""UTF-32\x00"          /* platform endian with BOM */
        "\x0b\xe4""UTF-32BE\x00"
        "\x0b\xe5""UTF-32LE\x00"
        "\x08\xe2""UCS-2\x00"           /* always BE */
        "\x0a\xe2""UCS-2BE\x00"
        "\x0a\xe3""UCS-2LE\x00"
        "\x09\xea""UTF-16\x00"          /* platform endian with BOM */
        "\x0b\xea""UTF-16BE\x00"
        "\x0b\xeb""UTF-16LE\x00"
#elif __BYTE_ORDER == __LITTLE_ENDIAN
        "\x08\xed""UCS-4\x00"           /* always BE */
        "\x0a\xed""UCS-4BE\x00"
        "\x0a\xec""UCS-4LE\x00"
        "\x09\xf4""UTF-32\x00"          /* platform endian with BOM */
        "\x0b\xe5""UTF-32BE\x00"
        "\x0b\xe4""UTF-32LE\x00"
        "\x08\xe3""UCS-2\x00"           /* always BE */
        "\x0a\xe3""UCS-2BE\x00"
        "\x0a\xe2""UCS-2LE\x00"
        "\x09\xfa""UTF-16\x00"          /* platform endian with BOM */
        "\x0b\xeb""UTF-16BE\x00"
        "\x0b\xea""UTF-16LE\x00"
#endif
        "\x08\x02""UTF-8\x00"
        "\x0b\x01""US-ASCII\x00"
        "\x07\x01""ASCII";                      /* Must be last! (special case to save a nul) */
1245
 
1246
static int find_codeset(const char *name)
1247
{
1248
        const unsigned char *s;
1249
        int codeset;
1250
 
1251
        for (s = __iconv_codesets ; *s ; s += *s) {
1252
                if (!strcasecmp(s+2, name)) {
1253
                        return s[1];
1254
                }
1255
        }
1256
 
1257
        /* The following is ripped from find_locale in locale.c. */
1258
 
1259
        /* TODO: maybe CODESET_LIST + *s ??? */
1260
        /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
1261
        codeset = 2;
1262
        s = __LOCALE_DATA_CODESET_LIST;
1263
        do {
1264
                ++codeset;              /* Increment codeset first. */
1265
                if (!strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) {
1266
                        return codeset;
1267
                }
1268
        } while (*++s);
1269
 
1270
        return 0;                        /* No matching codeset! */
1271
}
1272
 
1273
/*
 * iconv_open -- create a conversion descriptor from fromcode to tocode.
 *
 * Both names are resolved with find_codeset(); tocode is looked up first
 * and fromcode only if tocode was recognised.
 *
 * Returns a heap-allocated descriptor cast to iconv_t.  On failure returns
 * (iconv_t)(-1): errno is set to EINVAL here when a name is unknown, and is
 * left as malloc() left it when the allocation fails.
 */
iconv_t weak_function iconv_open(const char *tocode, const char *fromcode)
{
        _UC_iconv_t *desc;
        int to_id;
        int from_id;

        to_id = find_codeset(tocode);
        if (to_id == 0) {
                __set_errno(EINVAL);
                return (iconv_t)(-1);
        }

        from_id = find_codeset(fromcode);
        if (from_id == 0) {
                __set_errno(EINVAL);
                return (iconv_t)(-1);
        }

        desc = malloc(sizeof(_UC_iconv_t));
        if (desc == NULL) {
                return (iconv_t)(-1);
        }

        /* Bit 0x10 of a codeset id is extracted as the BOM flag.  The
         * "...0" members remember the initial values. */
        desc->tocodeset = to_id;
        desc->tobom = (to_id & 0x10) >> 4;
        desc->tobom0 = desc->tobom;

        desc->fromcodeset = from_id;
        desc->fromcodeset0 = from_id;
        desc->frombom = (from_id & 0x10) >> 4;
        desc->frombom0 = desc->frombom;

        desc->fromstate.mask = 0;
        desc->tostate.mask = 0;
        desc->skip_invalid_input = 0;

        return (iconv_t) desc;
}
1293
 
1294
/*
 * iconv_close -- release a descriptor obtained from iconv_open().
 *
 * iconv_open() reports failure by returning (iconv_t)(-1).  That value is
 * not a heap pointer, so it is rejected here instead of being passed on to
 * free().
 *
 * Returns 0 on success, or -1 with errno set to EBADF when cd is
 * (iconv_t)(-1).
 */
int weak_function iconv_close(iconv_t cd)
{
        if (cd == (iconv_t)(-1)) {
                __set_errno(EBADF);
                return -1;
        }

        free(cd);

        return 0;
}
1300
 
1301
size_t weak_function iconv(iconv_t cd, char **__restrict inbuf,
1302
                                                   size_t *__restrict inbytesleft,
1303
                                                   char **__restrict outbuf,
1304
                                                   size_t *__restrict outbytesleft)
1305
{
1306
        _UC_iconv_t *px = (_UC_iconv_t *) cd;
1307
        size_t nrcount, r;
1308
        wchar_t wc, wc2;
1309
        int inci, inco;
1310
 
1311
        assert(px != (_UC_iconv_t *)(-1));
1312
        assert(sizeof(wchar_t) == 4);
1313
 
1314
        if (!inbuf || !*inbuf) {        /* Need to reinitialze conversion state. */
1315
                /* Note: For shift-state encodings we possibly need to output the
1316
                 * shift sequence to return to initial state! */
1317
                if ((px->fromcodeset & 0xf0) == 0xe0) {
1318
                }
1319
                px->tostate.mask = px->fromstate.mask = 0;
1320
                px->fromcodeset = px->fromcodeset0;
1321
                px->tobom = px->tobom0;
1322
                px->frombom = px->frombom0;
1323
                return 0;
1324
        }
1325
 
1326
        nrcount = 0;
1327
        while (*inbytesleft) {
1328
                if (!*outbytesleft) {
1329
                TOO_BIG:
1330
                        __set_errno(E2BIG);
1331
                        return (size_t) -1;
1332
                }
1333
 
1334
                inci = inco = 1;
1335
                if (px->fromcodeset >= IC_MULTIBYTE) {
1336
                        inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
1337
                        if (*inbytesleft < inci) goto INVALID;
1338
                        wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
1339
                                + ((unsigned char)((*inbuf)[1]));
1340
                        if (inci == 4) {
1341
                                wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1342
                                        + ((unsigned char)((*inbuf)[3])) + (wc << 16);
1343
                                if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
1344
                        } else {
1345
                                if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
1346
                                if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
1347
                                         && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
1348
                                        ) {                     /* surrogate */
1349
                                        wc =- 0xd800U;
1350
                                        if (*inbytesleft < 4) goto INVALID;
1351
                                        wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1352
                                                + ((unsigned char)((*inbuf)[3]));
1353
                                        if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
1354
                                        if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
1355
                                                goto ILLEGAL;
1356
                                        }
1357
                                        inci = 4;       /* Change inci here in case skipping illegals. */
1358
                                        wc = 0x10000UL + (wc << 10) + wc2;
1359
                                }
1360
                        }
1361
 
1362
                        if (px->frombom) {
1363
                                px->frombom = 0;
1364
                                if ((wc == 0xfeffU)
1365
                                        || (wc == ((inci == 4)
1366
                                                           ? (((wchar_t) 0xfffe0000UL))
1367
                                                           : ((wchar_t)(0xfffeUL))))
1368
                                        ) {
1369
                                        if (wc != 0xfeffU) {
1370
                                                px->fromcodeset ^= 1; /* toggle endianness */
1371
                                                wc = 0xfeffU;
1372
                                        }
1373
                                        if (!px->frombom) {
1374
                                                goto BOM_SKIP_OUTPUT;
1375
                                        }
1376
                                        goto GOT_BOM;
1377
                                }
1378
                        }
1379
 
1380
                        if (px->fromcodeset != IC_WCHAR_T) {
1381
                                if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
1382
                                                                                 ? 0x7fffffffUL : 0x10ffffUL)
1383
#ifdef KUHN
1384
                                        || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1385
                                        || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1386
#endif
1387
                                        ) {
1388
                                        goto ILLEGAL;
1389
                                }
1390
                        }
1391
                } else if (px->fromcodeset == IC_UTF_8) {
1392
                        const char *p = *inbuf;
1393
                        r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
1394
                        if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
1395
                                if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
1396
                                        assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
1397
                                        if (r == (size_t)(-2)) {
1398
                                        INVALID:
1399
                                                __set_errno(EINVAL);
1400
                                        } else {
1401
                                                px->fromstate.mask = 0;
1402
                                                inci = 1;
1403
                                        ILLEGAL:
1404
                                                if (px->skip_invalid_input) {
1405
                                                        px->skip_invalid_input = 2;     /* flag for iconv utility */
1406
                                                        goto BOM_SKIP_OUTPUT;
1407
                                                }
1408
                                                __set_errno(EILSEQ);
1409
                                        }
1410
                                        return (size_t)(-1);
1411
                                }
1412
#ifdef __UCLIBC_MJN3_ONLY__
1413
#warning TODO: optimize this.
1414
#endif
1415
                                if (p != NULL) { /* incomplete char case */
1416
                                        goto INVALID;
1417
                                }
1418
                                p = *inbuf + 1; /* nul */
1419
                        }
1420
                        inci = p - *inbuf;
1421
                } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */
1422
                        if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
1423
                                goto ILLEGAL;
1424
                        } else {                        /* some other 8-bit ascii-extension codeset */
1425
                                const __codeset_8_bit_t *c8b
1426
                                        = __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
1427
                                wc -= 0x80;
1428
                                wc = __UCLIBC_CURLOCALE_DATA.tbl8c2wc[
1429
                                                         (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
1430
                                                          << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
1431
                                if (!wc) {
1432
                                        goto ILLEGAL;
1433
                                }
1434
                        }
1435
                }
1436
 
1437
 
1438
                if (px->tobom) {
1439
                        inci = 0;
1440
                        wc = 0xfeffU;
1441
        GOT_BOM:
1442
                        px->tobom = 0;
1443
                }
1444
 
1445
                if (px->tocodeset >= IC_MULTIBYTE) {
1446
                        inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
1447
                        if (*outbytesleft < inci) goto TOO_BIG;
1448
                        if (px->tocodeset != IC_WCHAR_T) {
1449
                                if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
1450
                                                                                 ? 0x7fffffffUL : 0x10ffffUL)
1451
#ifdef KUHN
1452
                                        || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1453
                                        || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1454
#endif
1455
                                        ) {
1456
                                REPLACE_32:
1457
                                        wc = 0xfffd;
1458
                                        ++nrcount;
1459
                                }
1460
                        }
1461
                        if (inco == 4) {
1462
                                if (px->tocodeset & 1) wc = bswap_32(wc);
1463
                        } else {
1464
                                if (((__uwchar_t)wc ) > 0xffffU) {
1465
                                        if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
1466
                                                goto REPLACE_32;
1467
                                        }
1468
                                        if (*outbytesleft < (inco = 4)) goto TOO_BIG;
1469
                                        wc2 = 0xdc00U + (wc & 0x3ff);
1470
                                        wc = 0xd800U + ((wc >> 10) & 0x3ff);
1471
                                        if (px->tocodeset & 1) {
1472
                                                wc = bswap_16(wc);
1473
                                                wc2 = bswap_16(wc2);
1474
                                        }
1475
                                        wc += (wc2 << 16);
1476
                                } else if (px->tocodeset & 1) wc = bswap_16(wc);
1477
                        }
1478
                        (*outbuf)[0] = (char)((unsigned char)(wc));
1479
                        (*outbuf)[1] = (char)((unsigned char)(wc >> 8));
1480
                        if (inco == 4) {
1481
                                (*outbuf)[2] = (char)((unsigned char)(wc >> 16));
1482
                                (*outbuf)[3] = (char)((unsigned char)(wc >> 24));
1483
                        }
1484
                } else if (px->tocodeset == IC_UTF_8) {
1485
                        const wchar_t *pw = &wc;
1486
                        do {
1487
                                r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
1488
                                if (r != (size_t)(-1)) {
1489
#ifdef __UCLIBC_MJN3_ONLY__
1490
#warning TODO: What happens for a nul?
1491
#endif
1492
                                        if (r == 0) {
1493
                                                if (wc != 0) {
1494
                                                        goto TOO_BIG;
1495
                                                }
1496
                                                ++r;
1497
                                        }
1498
                                        break;
1499
                                }
1500
                                wc = 0xfffdU;
1501
                                ++nrcount;
1502
                        } while (1);
1503
                        inco = r;
1504
                } else if (((__uwchar_t)(wc)) < 0x80) {
1505
                CHAR_GOOD:
1506
                                **outbuf = wc;
1507
                } else {
1508
                        if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
1509
                                const __codeset_8_bit_t *c8b
1510
                                        = __locale_mmap->codeset_8_bit + px->tocodeset - 3;
1511
                                __uwchar_t u;
1512
                                u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
1513
                                u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
1514
                                                 + ((wc >> Cwc2c_TT_SHIFT)
1515
                                                        & ((1 << Cwc2c_TI_SHIFT)-1))];
1516
                                wc = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[Cwc2c_TI_LEN
1517
                                                 + (u << Cwc2c_TT_SHIFT)
1518
                                                 + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
1519
                                if (wc) {
1520
                                        goto CHAR_GOOD;
1521
                                }
1522
                        }
1523
                        **outbuf = '?';
1524
                        ++nrcount;
1525
                }
1526
 
1527
                *outbuf += inco;
1528
                *outbytesleft -= inco;
1529
        BOM_SKIP_OUTPUT:
1530
                *inbuf += inci;
1531
                *inbytesleft -= inci;
1532
        }
1533
        return nrcount;
1534
}
1535
 
1536
#endif
1537
/**********************************************************************/
1538
#ifdef L_iconv_main
1539
 
1540
#include <stdio.h>
1541
#include <stdlib.h>
1542
#include <string.h>
1543
#include <wchar.h>
1544
#include <iconv.h>
1545
#include <stdarg.h>
1546
#include <libgen.h>
1547
 
1548
/* Table of built-in codeset names, defined elsewhere.  main()'s -l option
 * walks it, using the first byte of each entry as the distance to the next
 * entry and printing the string that starts two bytes into the entry. */
extern const unsigned char __iconv_codesets[];
1549
 
1550
/* Sizes of the input and output staging buffers declared in main(). */
#define IBUF BUFSIZ
1551
#define OBUF BUFSIZ
1552
 
1553
/* Set to argv[0] by main(); used as the prefix of diagnostics. */
char *progname;
1554
/* Nonzero once -s is seen: error_msg() then exits without printing. */
int hide_errors;
1555
 
1556
static void error_msg(const char *fmt, ...)
1557
         __attribute__ ((noreturn, format (printf, 1, 2)));
1558
 
1559
static void error_msg(const char *fmt, ...)
1560
{
1561
        va_list arg;
1562
 
1563
        if (!hide_errors) {
1564
                fprintf(stderr, "%s: ", progname);
1565
                va_start(arg, fmt);
1566
                vfprintf(stderr, fmt, arg);
1567
                va_end(arg);
1568
        }
1569
 
1570
        exit(EXIT_FAILURE);
1571
}
1572
 
1573
int main(int argc, char **argv)
1574
{
1575
        FILE *ifile;
1576
        FILE *ofile = stdout;
1577
        const char *p;
1578
        const char *s;
1579
        static const char opt_chars[] = "tfocsl";
1580
                                      /* 012345 */
1581
        const char *opts[sizeof(opt_chars)]; /* last is infile name */
1582
        iconv_t ic;
1583
        char ibuf[IBUF];
1584
        char obuf[OBUF];
1585
        char *pi;
1586
        char *po;
1587
        size_t ni, no, r, pos;
1588
 
1589
        hide_errors = 0;
1590
 
1591
        for (s = opt_chars ; *s ; s++) {
1592
                opts[ s - opt_chars ] = NULL;
1593
        }
1594
 
1595
        progname = *argv;
1596
        while (--argc) {
1597
                p = *++argv;
1598
                if ((*p != '-') || (*++p == 0)) {
1599
                        break;
1600
                }
1601
                do {
1602
                        if ((s = strchr(opt_chars,*p)) == NULL) {
1603
                        USAGE:
1604
                                s = basename(progname);
1605
                                fprintf(stderr,
1606
                                                "%s [-cs] -f fromcode -t tocode [-o outputfile] [inputfile ...]\n"
1607
                                                "  or\n%s -l\n", s, s);
1608
                                return EXIT_FAILURE;
1609
                        }
1610
                        if ((s - opt_chars) < 3) {
1611
                                if ((--argc == 0) || opts[s - opt_chars]) {
1612
                                        goto USAGE;
1613
                                }
1614
                                opts[s - opt_chars] = *++argv;
1615
                        } else {
1616
                                opts[s - opt_chars] = p;
1617
                        }
1618
                } while (*++p);
1619
        }
1620
 
1621
        if (opts[5]) {                          /* -l */
1622
                fprintf(stderr, "Recognized codesets:\n");
1623
                for (s = __iconv_codesets ; *s ; s += *s) {
1624
                        fprintf(stderr,"  %s\n", s+2);
1625
                }
1626
                s = __LOCALE_DATA_CODESET_LIST;
1627
                do {
1628
                        fprintf(stderr,"  %s\n", __LOCALE_DATA_CODESET_LIST+ (unsigned char)(*s));
1629
                } while (*++s);
1630
 
1631
                return EXIT_SUCCESS;
1632
        }
1633
 
1634
        if (opts[4]) {
1635
                hide_errors = 1;
1636
        }
1637
 
1638
        if (!opts[0] || !opts[1]) {
1639
                goto USAGE;
1640
        }
1641
        if ((ic = iconv_open(opts[0],opts[1])) == ((iconv_t)(-1))) {
1642
                error_msg( "unsupported codeset in %s -> %s conversion\n", opts[0], opts[1]);
1643
        }
1644
        if (opts[3]) {                          /* -c */
1645
                ((_UC_iconv_t *) ic)->skip_invalid_input = 1;
1646
        }
1647
 
1648
        if ((s = opts[2]) != NULL) {
1649
                if (!(ofile = fopen(s, "w"))) {
1650
                        error_msg( "couldn't open %s for writing\n", s);
1651
                }
1652
        }
1653
 
1654
        pos = ni = 0;
1655
        do {
1656
                if (!argc || ((**argv == '-') && !((*argv)[1]))) {
1657
                        ifile = stdin;          /* we don't check for duplicates */
1658
                } else if (!(ifile = fopen(*argv, "r"))) {
1659
                        error_msg( "couldn't open %s for reading\n", *argv);
1660
                }
1661
 
1662
                while ((r = fread(ibuf + ni, 1, IBUF - ni, ifile)) > 0) {
1663
                        pos += r;
1664
                        ni += r;
1665
                        no = OBUF;
1666
                        pi = ibuf;
1667
                        po = obuf;
1668
                        if ((r = iconv(ic, &pi, &ni, &po, &no)) == ((size_t)(-1))) {
1669
                                if ((errno != EINVAL) && (errno != E2BIG)) {
1670
                                        error_msg( "iconv failed at pos %lu : %m\n", (unsigned long) (pos - ni));
1671
                                }
1672
                        }
1673
                        if ((r = OBUF - no) > 0) {
1674
                                if (fwrite(obuf, 1, OBUF - no, ofile) < r) {
1675
                                        error_msg( "write error\n");
1676
                                }
1677
                        }
1678
                        if (ni) {                       /* still bytes in buffer! */
1679
                                memmove(ibuf, pi, ni);
1680
                        }
1681
                }
1682
 
1683
                if (ferror(ifile)) {
1684
                        error_msg( "read error\n");
1685
                }
1686
 
1687
                ++argv;
1688
 
1689
                if (ifile != stdin) {
1690
                        fclose(ifile);
1691
                }
1692
 
1693
        } while (--argc > 0);
1694
 
1695
        iconv_close(ic);
1696
 
1697
        if (ni) {
1698
                error_msg( "incomplete sequence\n");
1699
        }
1700
 
1701
        return (((_UC_iconv_t *) ic)->skip_invalid_input < 2)
1702
                ? EXIT_SUCCESS : EXIT_FAILURE;
1703
}
1704
 
1705
#endif
1706
/**********************************************************************/

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.