OpenCores
URL https://opencores.org/ocsvn/openrisc_me/openrisc_me/trunk

Subversion Repositories openrisc_me

[/] [openrisc/] [trunk/] [gnu-src/] [gcc-4.2.2/] [libcpp/] [charset.c] - Blame information for rev 283

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 38 julius
/* CPP Library - charsets
2
   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
3
   Free Software Foundation, Inc.
4
 
5
   Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
6
 
7
This program is free software; you can redistribute it and/or modify it
8
under the terms of the GNU General Public License as published by the
9
Free Software Foundation; either version 2, or (at your option) any
10
later version.
11
 
12
This program is distributed in the hope that it will be useful,
13
but WITHOUT ANY WARRANTY; without even the implied warranty of
14
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
GNU General Public License for more details.
16
 
17
You should have received a copy of the GNU General Public License
18
along with this program; if not, write to the Free Software
19
Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
20
 
21
#include "config.h"
22
#include "system.h"
23
#include "cpplib.h"
24
#include "internal.h"
25
 
26
/* Character set handling for C-family languages.
27
 
28
   Terminological note: In what follows, "charset" or "character set"
29
   will be taken to mean both an abstract set of characters and an
30
   encoding for that set.
31
 
32
   The C99 standard discusses two character sets: source and execution.
33
   The source character set is used for internal processing in translation
34
   phases 1 through 4; the execution character set is used thereafter.
35
   Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
36
   character encodings (see 3.7.2, 3.7.3 for the standardese meanings
37
   of these terms).  Furthermore, the "basic character set" (listed in
38
   5.2.1p3) is to be encoded in each with values one byte wide, and is
39
   to appear in the initial shift state.
40
 
41
   It is not explicitly mentioned, but there is also a "wide execution
42
   character set" used to encode wide character constants and wide
43
   string literals; this is supposed to be the result of applying the
44
   standard library function mbstowcs() to an equivalent narrow string
45
   (6.4.5p5).  However, the behavior of hexadecimal and octal
46
   \-escapes is at odds with this; they are supposed to be translated
47
   directly to wchar_t values (6.4.4.4p5,6).
48
 
49
   The source character set is not necessarily the character set used
50
   to encode physical source files on disk; translation phase 1 converts
51
   from whatever that encoding is to the source character set.
52
 
53
   The presence of universal character names in C99 (6.4.3 et seq.)
54
   forces the source character set to be isomorphic to ISO 10646,
55
   that is, Unicode.  There is no such constraint on the execution
56
   character set; note also that the conversion from source to
57
   execution character set does not occur for identifiers (5.1.1.2p1#5).
58
 
59
   For convenience of implementation, the source character set's
60
   encoding of the basic character set should be identical to the
61
   execution character set OF THE HOST SYSTEM's encoding of the basic
62
   character set, and it should not be a state-dependent encoding.
63
 
64
   cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
65
   depending on whether the host is based on ASCII or EBCDIC (see
66
   respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
67
   Technical Report #16).  With limited exceptions, it relies on the
68
   system library's iconv() primitive to do charset conversion
69
   (specified in SUSv2).  */
70
 
71
#if !HAVE_ICONV
72
/* Make certain that the uses of iconv(), iconv_open(), iconv_close()
73
   below, which are guarded only by if statements with compile-time
74
   constant conditions, do not cause link errors.  */
75
#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
76
#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
77
#define iconv_close(x)   (void)0
78
#define ICONV_CONST
79
#endif
80
 
81
#if HOST_CHARSET == HOST_CHARSET_ASCII
82
#define SOURCE_CHARSET "UTF-8"
83
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
84
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
85
#define SOURCE_CHARSET "UTF-EBCDIC"
86
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
87
#else
88
#error "Unrecognized basic host character set"
89
#endif
90
 
91
#ifndef EILSEQ
92
#define EILSEQ EINVAL
93
#endif
94
 
95
/* This structure is used for a resizable string buffer throughout.  */
96
/* Don't call it strbuf, as that conflicts with unistd.h on systems
97
   such as DYNIX/ptx where unistd.h includes stropts.h.  */
98
struct _cpp_strbuf
99
{
100
  uchar *text;
101
  size_t asize;
102
  size_t len;
103
};
104
 
105
/* This is enough to hold any string that fits on a single 80-column
106
   line, even if iconv quadruples its size (e.g. conversion from
107
   ASCII to UTF-32) rounded up to a power of two.  */
108
#define OUTBUF_BLOCK_SIZE 256
109
 
110
/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
111
   logic.  This is because a depressing number of systems lack iconv,
112
   or have iconv libraries that do not do these conversions, so
113
   we need a fallback implementation for them.  To ensure the fallback
114
   doesn't break due to neglect, it is used on all systems.
115
 
116
   UTF-32 encoding is nice and simple: a four-byte binary number,
117
   constrained to the range 00000000-7FFFFFFF to avoid questions of
118
   signedness.  We do have to cope with big- and little-endian
119
   variants.
120
 
121
   UTF-16 encoding uses two-byte binary numbers, again in big- and
122
   little-endian variants, for all values in the 00000000-0000FFFF
123
   range.  Values in the 00010000-0010FFFF range are encoded as pairs
124
   of two-byte numbers, called "surrogate pairs": given a number S in
125
   this range, it is mapped to a pair (H, L) as follows:
126
 
127
     H = (S - 0x10000) / 0x400 + 0xD800
128
     L = (S - 0x10000) % 0x400 + 0xDC00
129
 
130
   Two-byte values in the D800...DFFF range are ill-formed except as a
131
   component of a surrogate pair.  Even if the encoding within a
132
   two-byte value is little-endian, the H member of the surrogate pair
133
   comes first.
134
 
135
   There is no way to encode values in the 00110000-7FFFFFFF range,
136
   which is not currently a problem as there are no assigned code
137
   points in that range; however, the author expects that it will
138
   eventually become necessary to abandon UTF-16 due to this
139
   limitation.  Note also that, because of these pairs, UTF-16 does
140
   not meet the requirements of the C standard for a wide character
141
   encoding (see 3.7.3 and 6.4.4.4p11).
142
 
143
   UTF-8 encoding looks like this:
144
 
145
   value range         encoded as
146
   00000000-0000007F   0xxxxxxx
147
   00000080-000007FF   110xxxxx 10xxxxxx
148
   00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
149
   00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
150
   00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
151
   04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
152
 
153
   Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
154
   which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
155
   never occur.  Note also that any value that can be encoded by a
156
   given row of the table can also be encoded by all successive rows,
157
   but this is not done; only the shortest possible encoding for any
158
   given value is valid.  For instance, the character 07C0 could be
159
   encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
160
   FC 80 80 80 9F 80.  Only the first is valid.
161
 
162
   An implementation note: the transformation from UTF-16 to UTF-8, or
163
   vice versa, is easiest done by using UTF-32 as an intermediary.  */
164
 
165
/* Internal primitives which go from an UTF-8 byte stream to native-endian
166
   UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
167
   operation in several places below.  */
168
static inline int
169
one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
170
                     cppchar_t *cp)
171
{
172
  static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
173
  static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
174
 
175
  cppchar_t c;
176
  const uchar *inbuf = *inbufp;
177
  size_t nbytes, i;
178
 
179
  if (*inbytesleftp < 1)
180
    return EINVAL;
181
 
182
  c = *inbuf;
183
  if (c < 0x80)
184
    {
185
      *cp = c;
186
      *inbytesleftp -= 1;
187
      *inbufp += 1;
188
      return 0;
189
    }
190
 
191
  /* The number of leading 1-bits in the first byte indicates how many
192
     bytes follow.  */
193
  for (nbytes = 2; nbytes < 7; nbytes++)
194
    if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
195
      goto found;
196
  return EILSEQ;
197
 found:
198
 
199
  if (*inbytesleftp < nbytes)
200
    return EINVAL;
201
 
202
  c = (c & masks[nbytes-1]);
203
  inbuf++;
204
  for (i = 1; i < nbytes; i++)
205
    {
206
      cppchar_t n = *inbuf++;
207
      if ((n & 0xC0) != 0x80)
208
        return EILSEQ;
209
      c = ((c << 6) + (n & 0x3F));
210
    }
211
 
212
  /* Make sure the shortest possible encoding was used.  */
213
  if (c <=      0x7F && nbytes > 1) return EILSEQ;
214
  if (c <=     0x7FF && nbytes > 2) return EILSEQ;
215
  if (c <=    0xFFFF && nbytes > 3) return EILSEQ;
216
  if (c <=  0x1FFFFF && nbytes > 4) return EILSEQ;
217
  if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
218
 
219
  /* Make sure the character is valid.  */
220
  if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
221
 
222
  *cp = c;
223
  *inbufp = inbuf;
224
  *inbytesleftp -= nbytes;
225
  return 0;
226
}
227
 
228
static inline int
229
one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
230
{
231
  static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
232
  static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
233
  size_t nbytes;
234
  uchar buf[6], *p = &buf[6];
235
  uchar *outbuf = *outbufp;
236
 
237
  nbytes = 1;
238
  if (c < 0x80)
239
    *--p = c;
240
  else
241
    {
242
      do
243
        {
244
          *--p = ((c & 0x3F) | 0x80);
245
          c >>= 6;
246
          nbytes++;
247
        }
248
      while (c >= 0x3F || (c & limits[nbytes-1]));
249
      *--p = (c | masks[nbytes-1]);
250
    }
251
 
252
  if (*outbytesleftp < nbytes)
253
    return E2BIG;
254
 
255
  while (p < &buf[6])
256
    *outbuf++ = *p++;
257
  *outbytesleftp -= nbytes;
258
  *outbufp = outbuf;
259
  return 0;
260
}
261
 
262
/* The following four functions transform one character between the two
263
   encodings named in the function name.  All have the signature
264
   int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
265
           uchar **outbufp, size_t *outbytesleftp)
266
 
267
   BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
268
   interpreted as a boolean indicating whether big-endian or
269
   little-endian encoding is to be used for the member of the pair
270
   that is not UTF-8.
271
 
272
   INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
273
   do for iconv.
274
 
275
   The return value is either 0 for success, or an errno value for
276
   failure, which may be E2BIG (need more space), EILSEQ (ill-formed
277
   input sequence), or EINVAL (incomplete input sequence).  */
278
 
279
static inline int
280
one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
281
                   uchar **outbufp, size_t *outbytesleftp)
282
{
283
  uchar *outbuf;
284
  cppchar_t s = 0;
285
  int rval;
286
 
287
  /* Check for space first, since we know exactly how much we need.  */
288
  if (*outbytesleftp < 4)
289
    return E2BIG;
290
 
291
  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
292
  if (rval)
293
    return rval;
294
 
295
  outbuf = *outbufp;
296
  outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
297
  outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
298
  outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
299
  outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
300
 
301
  *outbufp += 4;
302
  *outbytesleftp -= 4;
303
  return 0;
304
}
305
 
306
static inline int
307
one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
308
                   uchar **outbufp, size_t *outbytesleftp)
309
{
310
  cppchar_t s;
311
  int rval;
312
  const uchar *inbuf;
313
 
314
  if (*inbytesleftp < 4)
315
    return EINVAL;
316
 
317
  inbuf = *inbufp;
318
 
319
  s  = inbuf[bigend ? 0 : 3] << 24;
320
  s += inbuf[bigend ? 1 : 2] << 16;
321
  s += inbuf[bigend ? 2 : 1] << 8;
322
  s += inbuf[bigend ? 3 : 0];
323
 
324
  if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
325
    return EILSEQ;
326
 
327
  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
328
  if (rval)
329
    return rval;
330
 
331
  *inbufp += 4;
332
  *inbytesleftp -= 4;
333
  return 0;
334
}
335
 
336
static inline int
337
one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
338
                   uchar **outbufp, size_t *outbytesleftp)
339
{
340
  int rval;
341
  cppchar_t s = 0;
342
  const uchar *save_inbuf = *inbufp;
343
  size_t save_inbytesleft = *inbytesleftp;
344
  uchar *outbuf = *outbufp;
345
 
346
  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
347
  if (rval)
348
    return rval;
349
 
350
  if (s > 0x0010FFFF)
351
    {
352
      *inbufp = save_inbuf;
353
      *inbytesleftp = save_inbytesleft;
354
      return EILSEQ;
355
    }
356
 
357
  if (s < 0xFFFF)
358
    {
359
      if (*outbytesleftp < 2)
360
        {
361
          *inbufp = save_inbuf;
362
          *inbytesleftp = save_inbytesleft;
363
          return E2BIG;
364
        }
365
      outbuf[bigend ? 1 : 0] = (s & 0x00FF);
366
      outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
367
 
368
      *outbufp += 2;
369
      *outbytesleftp -= 2;
370
      return 0;
371
    }
372
  else
373
    {
374
      cppchar_t hi, lo;
375
 
376
      if (*outbytesleftp < 4)
377
        {
378
          *inbufp = save_inbuf;
379
          *inbytesleftp = save_inbytesleft;
380
          return E2BIG;
381
        }
382
 
383
      hi = (s - 0x10000) / 0x400 + 0xD800;
384
      lo = (s - 0x10000) % 0x400 + 0xDC00;
385
 
386
      /* Even if we are little-endian, put the high surrogate first.
387
         ??? Matches practice?  */
388
      outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
389
      outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
390
      outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
391
      outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
392
 
393
      *outbufp += 4;
394
      *outbytesleftp -= 4;
395
      return 0;
396
    }
397
}
398
 
399
static inline int
400
one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
401
                   uchar **outbufp, size_t *outbytesleftp)
402
{
403
  cppchar_t s;
404
  const uchar *inbuf = *inbufp;
405
  int rval;
406
 
407
  if (*inbytesleftp < 2)
408
    return EINVAL;
409
  s  = inbuf[bigend ? 0 : 1] << 8;
410
  s += inbuf[bigend ? 1 : 0];
411
 
412
  /* Low surrogate without immediately preceding high surrogate is invalid.  */
413
  if (s >= 0xDC00 && s <= 0xDFFF)
414
    return EILSEQ;
415
  /* High surrogate must have a following low surrogate.  */
416
  else if (s >= 0xD800 && s <= 0xDBFF)
417
    {
418
      cppchar_t hi = s, lo;
419
      if (*inbytesleftp < 4)
420
        return EINVAL;
421
 
422
      lo  = inbuf[bigend ? 2 : 3] << 8;
423
      lo += inbuf[bigend ? 3 : 2];
424
 
425
      if (lo < 0xDC00 || lo > 0xDFFF)
426
        return EILSEQ;
427
 
428
      s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
429
    }
430
 
431
  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
432
  if (rval)
433
    return rval;
434
 
435
  /* Success - update the input pointers (one_cppchar_to_utf8 has done
436
     the output pointers for us).  */
437
  if (s <= 0xFFFF)
438
    {
439
      *inbufp += 2;
440
      *inbytesleftp -= 2;
441
    }
442
  else
443
    {
444
      *inbufp += 4;
445
      *inbytesleftp -= 4;
446
    }
447
  return 0;
448
}
449
 
450
/* Helper routine for the next few functions.  The 'const' on
451
   one_conversion means that we promise not to modify what function is
452
   pointed to, which lets the inliner see through it.  */
453
 
454
static inline bool
455
conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
456
                                             uchar **, size_t *),
457
                 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
458
{
459
  const uchar *inbuf;
460
  uchar *outbuf;
461
  size_t inbytesleft, outbytesleft;
462
  int rval;
463
 
464
  inbuf = from;
465
  inbytesleft = flen;
466
  outbuf = to->text + to->len;
467
  outbytesleft = to->asize - to->len;
468
 
469
  for (;;)
470
    {
471
      do
472
        rval = one_conversion (cd, &inbuf, &inbytesleft,
473
                               &outbuf, &outbytesleft);
474
      while (inbytesleft && !rval);
475
 
476
      if (__builtin_expect (inbytesleft == 0, 1))
477
        {
478
          to->len = to->asize - outbytesleft;
479
          return true;
480
        }
481
      if (rval != E2BIG)
482
        {
483
          errno = rval;
484
          return false;
485
        }
486
 
487
      outbytesleft += OUTBUF_BLOCK_SIZE;
488
      to->asize += OUTBUF_BLOCK_SIZE;
489
      to->text = XRESIZEVEC (uchar, to->text, to->asize);
490
      outbuf = to->text + to->asize - outbytesleft;
491
    }
492
}
493
 
494
 
495
/* These functions convert entire strings between character sets.
496
   They all have the signature
497
 
498
   bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
499
 
500
   The input string FROM is converted as specified by the function
501
   name plus the iconv descriptor CD (which may be fake), and the
502
   result appended to TO.  On any error, false is returned, otherwise true.  */
503
 
504
/* These four use the custom conversion code above.  */
505
static bool
506
convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
507
                    struct _cpp_strbuf *to)
508
{
509
  return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
510
}
511
 
512
static bool
513
convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
514
                    struct _cpp_strbuf *to)
515
{
516
  return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
517
}
518
 
519
static bool
520
convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
521
                    struct _cpp_strbuf *to)
522
{
523
  return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
524
}
525
 
526
static bool
527
convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
528
                    struct _cpp_strbuf *to)
529
{
530
  return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
531
}
532
 
533
/* Identity conversion, used when we have no alternative.  */
534
static bool
535
convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
536
                       const uchar *from, size_t flen, struct _cpp_strbuf *to)
537
{
538
  if (to->len + flen > to->asize)
539
    {
540
      to->asize = to->len + flen;
541
      to->text = XRESIZEVEC (uchar, to->text, to->asize);
542
    }
543
  memcpy (to->text + to->len, from, flen);
544
  to->len += flen;
545
  return true;
546
}
547
 
548
/* And this one uses the system iconv primitive.  It's a little
549
   different, since iconv's interface is a little different.  */
550
#if HAVE_ICONV
/* Convert the FLEN bytes at FROM with the iconv descriptor CD and
   append the result to TO, enlarging TO by OUTBUF_BLOCK_SIZE each
   time iconv reports E2BIG.  Returns true once all input has been
   consumed, false if CD cannot be reset or iconv stops for any other
   reason (errno is left as iconv set it).  */
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
                     struct _cpp_strbuf *to)
{
  ICONV_CONST char *src = (ICONV_CONST char *) from;
  size_t src_left = flen;
  char *dst;
  size_t dst_left;

  /* Reset conversion descriptor and check that it is valid.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t) -1)
    return false;

  dst = (char *) to->text + to->len;
  dst_left = to->asize - to->len;

  for (;;)
    {
      /* The return value is not examined; leftover input is what
         signals that iconv stopped early.  */
      iconv (cd, &src, &src_left, &dst, &dst_left);
      if (__builtin_expect (src_left == 0, 1))
        break;

      if (errno != E2BIG)
        return false;

      /* Grow by one block and recompute the write position, since the
         storage may have moved.  */
      dst_left += OUTBUF_BLOCK_SIZE;
      to->asize += OUTBUF_BLOCK_SIZE;
      to->text = XRESIZEVEC (uchar, to->text, to->asize);
      dst = (char *) to->text + to->asize - dst_left;
    }

  to->len = to->asize - dst_left;
  return true;
}
#else
#define convert_using_iconv 0 /* prevent undefined symbol error below */
#endif
588
 
589
/* Arrange for the above custom conversion logic to be used automatically
590
   when conversion between a suitable pair of character sets is requested.  */
591
 
592
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
593
   CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
594
 
595
struct conversion
596
{
597
  const char *pair;
598
  convert_f func;
599
  iconv_t fake_cd;
600
};
601
static const struct conversion conversion_tab[] = {
602
  { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
603
  { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
604
  { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
605
  { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
606
  { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
607
  { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
608
  { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
609
  { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
610
};
611
 
612
/* Subroutine of cpp_init_iconv: initialize and return a
613
   cset_converter structure for conversion from FROM to TO.  If
614
   iconv_open() fails, issue an error and return an identity
615
   converter.  Silently return an identity converter if FROM and TO
616
   are identical.  */
617
static struct cset_converter
618
init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
619
{
620
  struct cset_converter ret;
621
  char *pair;
622
  size_t i;
623
 
624
  if (!strcasecmp (to, from))
625
    {
626
      ret.func = convert_no_conversion;
627
      ret.cd = (iconv_t) -1;
628
      return ret;
629
    }
630
 
631
  pair = (char *) alloca(strlen(to) + strlen(from) + 2);
632
 
633
  strcpy(pair, from);
634
  strcat(pair, "/");
635
  strcat(pair, to);
636
  for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
637
    if (!strcasecmp (pair, conversion_tab[i].pair))
638
      {
639
        ret.func = conversion_tab[i].func;
640
        ret.cd = conversion_tab[i].fake_cd;
641
        return ret;
642
      }
643
 
644
  /* No custom converter - try iconv.  */
645
  if (HAVE_ICONV)
646
    {
647
      ret.func = convert_using_iconv;
648
      ret.cd = iconv_open (to, from);
649
 
650
      if (ret.cd == (iconv_t) -1)
651
        {
652
          if (errno == EINVAL)
653
            cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
654
                       "conversion from %s to %s not supported by iconv",
655
                       from, to);
656
          else
657
            cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
658
 
659
          ret.func = convert_no_conversion;
660
        }
661
    }
662
  else
663
    {
664
      cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
665
                 "no iconv implementation, cannot convert from %s to %s",
666
                 from, to);
667
      ret.func = convert_no_conversion;
668
      ret.cd = (iconv_t) -1;
669
    }
670
  return ret;
671
}
672
 
673
/* If charset conversion is requested, initialize iconv(3) descriptors
674
   for conversion from the source character set to the execution
675
   character sets.  If iconv is not present in the C library, and
676
   conversion is requested, issue an error.  */
677
 
678
void
679
cpp_init_iconv (cpp_reader *pfile)
680
{
681
  const char *ncset = CPP_OPTION (pfile, narrow_charset);
682
  const char *wcset = CPP_OPTION (pfile, wide_charset);
683
  const char *default_wcset;
684
 
685
  bool be = CPP_OPTION (pfile, bytes_big_endian);
686
 
687
  if (CPP_OPTION (pfile, wchar_precision) >= 32)
688
    default_wcset = be ? "UTF-32BE" : "UTF-32LE";
689
  else if (CPP_OPTION (pfile, wchar_precision) >= 16)
690
    default_wcset = be ? "UTF-16BE" : "UTF-16LE";
691
  else
692
    /* This effectively means that wide strings are not supported,
693
       so don't do any conversion at all.  */
694
   default_wcset = SOURCE_CHARSET;
695
 
696
  if (!ncset)
697
    ncset = SOURCE_CHARSET;
698
  if (!wcset)
699
    wcset = default_wcset;
700
 
701
  pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
702
  pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
703
}
704
 
705
/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary.  */
706
void
707
_cpp_destroy_iconv (cpp_reader *pfile)
708
{
709
  if (HAVE_ICONV)
710
    {
711
      if (pfile->narrow_cset_desc.func == convert_using_iconv)
712
        iconv_close (pfile->narrow_cset_desc.cd);
713
      if (pfile->wide_cset_desc.func == convert_using_iconv)
714
        iconv_close (pfile->wide_cset_desc.cd);
715
    }
716
}
717
 
718
/* Utility routine for use by a full compiler.  C is a character taken
719
   from the *basic* source character set, encoded in the host's
720
   execution encoding.  Convert it to (the target's) execution
721
   encoding, and return that value.
722
 
723
   Issues an internal error if C's representation in the narrow
724
   execution character set fails to be a single-byte value (C99
725
   5.2.1p3: "The representation of each member of the source and
726
   execution character sets shall fit in a byte.")  May also issue an
727
   internal error if C fails to be a member of the basic source
728
   character set (testing this exactly is too hard, especially when
729
   the host character set is EBCDIC).  */
730
cppchar_t
731
cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
732
{
733
  uchar sbuf[1];
734
  struct _cpp_strbuf tbuf;
735
 
736
  /* This test is merely an approximation, but it suffices to catch
737
     the most important thing, which is that we don't get handed a
738
     character outside the unibyte range of the host character set.  */
739
  if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
740
    {
741
      cpp_error (pfile, CPP_DL_ICE,
742
                 "character 0x%lx is not in the basic source character set\n",
743
                 (unsigned long)c);
744
      return 0;
745
    }
746
 
747
  /* Being a character in the unibyte range of the host character set,
748
     we can safely splat it into a one-byte buffer and trust that that
749
     is a well-formed string.  */
750
  sbuf[0] = c;
751
 
752
  /* This should never need to reallocate, but just in case... */
753
  tbuf.asize = 1;
754
  tbuf.text = XNEWVEC (uchar, tbuf.asize);
755
  tbuf.len = 0;
756
 
757
  if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
758
    {
759
      cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
760
      return 0;
761
    }
762
  if (tbuf.len != 1)
763
    {
764
      cpp_error (pfile, CPP_DL_ICE,
765
                 "character 0x%lx is not unibyte in execution character set",
766
                 (unsigned long)c);
767
      return 0;
768
    }
769
  c = tbuf.text[0];
770
  free(tbuf.text);
771
  return c;
772
}
773
 
774
 
775
 
776
/* Utility routine that computes a mask of the form 0000...111... with
777
   WIDTH 1-bits.  */
778
static inline size_t
779
width_to_mask (size_t width)
780
{
781
  width = MIN (width, BITS_PER_CPPCHAR_T);
782
  if (width >= CHAR_BIT * sizeof (size_t))
783
    return ~(size_t) 0;
784
  else
785
    return ((size_t) 1 << width) - 1;
786
}
787
 
788
/* A large table of unicode character information.  */
789
/* Per-character flag bits; each table entry carries a bitwise OR of
   these.  */
enum {
  C99 = 1 << 0,  /* Valid in a C99 identifier?  */
  DIG = 1 << 1,  /* Valid in a C99 identifier, but not as the first
                    character?  */
  CXX = 1 << 2,  /* Valid in a C++ identifier?  */
  CID = 1 << 3,  /* NFC representation is not valid in an identifier?  */
  NFC = 1 << 4,  /* Might be valid NFC form?  */
  NKC = 1 << 5,  /* Might be valid NFKC form?  */
  CTX = 1 << 6   /* Certain preceding characters might make it not valid
                    NFC/NFKC form?  */
};
805
 
806
/* Table of character ranges, with the entries supplied by ucnid.h.  */
static const struct {
  unsigned char flags;    /* Bitwise OR of the C99, DIG, CXX, ... flags.  */
  unsigned char combine;  /* Combining class of the character.  */
  unsigned short end;     /* Last character in the range described by
                             this entry.  */
} ucnranges[] = {
#include "ucnid.h"
};
816
 
817
/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
818
   the start of an identifier, and 0 if C is not valid in an
819
   identifier.  We assume C has already gone through the checks of
820
   _cpp_valid_ucn.  Also update NST for C if returning nonzero.  The
821
   algorithm is a simple binary search on the table defined in
822
   ucnid.h.  */
823
 
824
static int
825
ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
826
                         struct normalize_state *nst)
827
{
828
  int mn, mx, md;
829
 
830
  if (c > 0xFFFF)
831
    return 0;
832
 
833
  mn = 0;
834
  mx = ARRAY_SIZE (ucnranges) - 1;
835
  while (mx != mn)
836
    {
837
      md = (mn + mx) / 2;
838
      if (c <= ucnranges[md].end)
839
        mx = md;
840
      else
841
        mn = md + 1;
842
    }
843
 
844
  /* When -pedantic, we require the character to have been listed by
845
     the standard for the current language.  Otherwise, we accept the
846
     union of the acceptable sets for C++98 and C99.  */
847
  if (! (ucnranges[mn].flags & (C99 | CXX)))
848
      return 0;
849
 
850
  if (CPP_PEDANTIC (pfile)
851
      && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
852
          || (CPP_OPTION (pfile, cplusplus)
853
              && !(ucnranges[mn].flags & CXX))))
854
    return 0;
855
 
856
  /* Update NST.  */
857
  if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
858
    nst->level = normalized_none;
859
  else if (ucnranges[mn].flags & CTX)
860
    {
861
      bool safe;
862
      cppchar_t p = nst->previous;
863
 
864
      /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam.  */
865
      if (c == 0x09BE)
866
        safe = p != 0x09C7;  /* Use 09CB instead of 09C7 09BE.  */
867
      else if (c == 0x0B3E)
868
        safe = p != 0x0B47;  /* Use 0B4B instead of 0B47 0B3E.  */
869
      else if (c == 0x0BBE)
870
        safe = p != 0x0BC6 && p != 0x0BC7;  /* Use 0BCA/0BCB instead.  */
871
      else if (c == 0x0CC2)
872
        safe = p != 0x0CC6;  /* Use 0CCA instead of 0CC6 0CC2.  */
873
      else if (c == 0x0D3E)
874
        safe = p != 0x0D46 && p != 0x0D47;  /* Use 0D4A/0D4B instead.  */
875
      /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
876
         and are combined algorithmically from a sequence of the form
877
         1100-1112 1161-1175 11A8-11C2
878
         (if the third is not present, it is treated as 11A7, which is not
879
         really a valid character).
880
         Unfortunately, C99 allows (only) the NFC form, but C++ allows
881
         only the combining characters.  */
882
      else if (c >= 0x1161 && c <= 0x1175)
883
        safe = p < 0x1100 || p > 0x1112;
884
      else if (c >= 0x11A8 && c <= 0x11C2)
885
        safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
886
      else
887
        {
888
          /* Uh-oh, someone updated ucnid.h without updating this code.  */
889
          cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
890
          safe = true;
891
        }
892
      if (!safe && c < 0x1161)
893
        nst->level = normalized_none;
894
      else if (!safe)
895
        nst->level = MAX (nst->level, normalized_identifier_C);
896
    }
897
  else if (ucnranges[mn].flags & NKC)
898
    ;
899
  else if (ucnranges[mn].flags & NFC)
900
    nst->level = MAX (nst->level, normalized_C);
901
  else if (ucnranges[mn].flags & CID)
902
    nst->level = MAX (nst->level, normalized_identifier_C);
903
  else
904
    nst->level = normalized_none;
905
  nst->previous = c;
906
  nst->prev_class = ucnranges[mn].combine;
907
 
908
  /* In C99, UCN digits may not begin identifiers.  */
909
  if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
910
    return 2;
911
 
912
  return 1;
913
}
914
 
915
/* [lex.charset]: The character designated by the universal character
916
   name \UNNNNNNNN is that character whose character short name in
917
   ISO/IEC 10646 is NNNNNNNN; the character designated by the
918
   universal character name \uNNNN is that character whose character
919
   short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
920
   for a universal character name is less than 0x20 or in the range
921
   0x7F-0x9F (inclusive), or if the universal character name
922
   designates a character in the basic source character set, then the
923
   program is ill-formed.
924
 
925
   *PSTR must be preceded by "\u" or "\U"; it is assumed that the
926
   buffer end is delimited by a non-hex digit.  Returns zero if the
927
   UCN has not been consumed.
928
 
929
   Otherwise the nonzero value of the UCN, whether valid or invalid,
930
   is returned.  Diagnostics are emitted for invalid values.  PSTR
931
   is updated to point one beyond the UCN, or to the syntactically
932
   invalid character.
933
 
934
   IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
935
   an identifier, or 2 otherwise.  */
936
 
937
cppchar_t
938
_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
939
                const uchar *limit, int identifier_pos,
940
                struct normalize_state *nst)
941
{
942
  cppchar_t result, c;
943
  unsigned int length;
944
  const uchar *str = *pstr;
945
  const uchar *base = str - 2;
946
 
947
  if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
948
    cpp_error (pfile, CPP_DL_WARNING,
949
               "universal character names are only valid in C++ and C99");
950
  else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
951
    cpp_error (pfile, CPP_DL_WARNING,
952
               "the meaning of '\\%c' is different in traditional C",
953
               (int) str[-1]);
954
 
955
  if (str[-1] == 'u')
956
    length = 4;
957
  else if (str[-1] == 'U')
958
    length = 8;
959
  else
960
    {
961
      cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
962
      length = 4;
963
    }
964
 
965
  result = 0;
966
  do
967
    {
968
      c = *str;
969
      if (!ISXDIGIT (c))
970
        break;
971
      str++;
972
      result = (result << 4) + hex_value (c);
973
    }
974
  while (--length && str < limit);
975
 
976
  /* Partial UCNs are not valid in strings, but decompose into
977
     multiple tokens in identifiers, so we can't give a helpful
978
     error message in that case.  */
979
  if (length && identifier_pos)
980
    return 0;
981
 
982
  *pstr = str;
983
  if (length)
984
    {
985
      cpp_error (pfile, CPP_DL_ERROR,
986
                 "incomplete universal character name %.*s",
987
                 (int) (str - base), base);
988
      result = 1;
989
    }
990
  /* The standard permits $, @ and ` to be specified as UCNs.  We use
991
     hex escapes so that this also works with EBCDIC hosts.  */
992
  else if ((result < 0xa0
993
            && (result != 0x24 && result != 0x40 && result != 0x60))
994
           || (result & 0x80000000)
995
           || (result >= 0xD800 && result <= 0xDFFF))
996
    {
997
      cpp_error (pfile, CPP_DL_ERROR,
998
                 "%.*s is not a valid universal character",
999
                 (int) (str - base), base);
1000
      result = 1;
1001
    }
1002
  else if (identifier_pos && result == 0x24
1003
           && CPP_OPTION (pfile, dollars_in_ident))
1004
    {
1005
      if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1006
        {
1007
          CPP_OPTION (pfile, warn_dollars) = 0;
1008
          cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1009
        }
1010
      NORMALIZE_STATE_UPDATE_IDNUM (nst);
1011
    }
1012
  else if (identifier_pos)
1013
    {
1014
      int validity = ucn_valid_in_identifier (pfile, result, nst);
1015
 
1016
      if (validity == 0)
1017
        cpp_error (pfile, CPP_DL_ERROR,
1018
                   "universal character %.*s is not valid in an identifier",
1019
                   (int) (str - base), base);
1020
      else if (validity == 2 && identifier_pos == 1)
1021
        cpp_error (pfile, CPP_DL_ERROR,
1022
   "universal character %.*s is not valid at the start of an identifier",
1023
                   (int) (str - base), base);
1024
    }
1025
 
1026
  if (result == 0)
1027
    result = 1;
1028
 
1029
  return result;
1030
}
1031
 
1032
/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
1033
   it to the execution character set and write the result into TBUF.
1034
   An advanced pointer is returned.  Issues all relevant diagnostics.  */
1035
static const uchar *
1036
convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
1037
             struct _cpp_strbuf *tbuf, bool wide)
1038
{
1039
  cppchar_t ucn;
1040
  uchar buf[6];
1041
  uchar *bufp = buf;
1042
  size_t bytesleft = 6;
1043
  int rval;
1044
  struct cset_converter cvt
1045
    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1046
  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1047
 
1048
  from++;  /* Skip u/U.  */
1049
  ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst);
1050
 
1051
  rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
1052
  if (rval)
1053
    {
1054
      errno = rval;
1055
      cpp_errno (pfile, CPP_DL_ERROR,
1056
                 "converting UCN to source character set");
1057
    }
1058
  else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
1059
    cpp_errno (pfile, CPP_DL_ERROR,
1060
               "converting UCN to execution character set");
1061
 
1062
  return from;
1063
}
1064
 
1065
/* Subroutine of convert_hex and convert_oct.  N is the representation
1066
   in the execution character set of a numeric escape; write it into the
1067
   string buffer TBUF and update the end-of-string pointer therein.  WIDE
1068
   is true if it's a wide string that's being assembled in TBUF.  This
1069
   function issues no diagnostics and never fails.  */
1070
static void
1071
emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
1072
                     struct _cpp_strbuf *tbuf, bool wide)
1073
{
1074
  if (wide)
1075
    {
1076
      /* We have to render this into the target byte order, which may not
1077
         be our byte order.  */
1078
      bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1079
      size_t width = CPP_OPTION (pfile, wchar_precision);
1080
      size_t cwidth = CPP_OPTION (pfile, char_precision);
1081
      size_t cmask = width_to_mask (cwidth);
1082
      size_t nbwc = width / cwidth;
1083
      size_t i;
1084
      size_t off = tbuf->len;
1085
      cppchar_t c;
1086
 
1087
      if (tbuf->len + nbwc > tbuf->asize)
1088
        {
1089
          tbuf->asize += OUTBUF_BLOCK_SIZE;
1090
          tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1091
        }
1092
 
1093
      for (i = 0; i < nbwc; i++)
1094
        {
1095
          c = n & cmask;
1096
          n >>= cwidth;
1097
          tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
1098
        }
1099
      tbuf->len += nbwc;
1100
    }
1101
  else
1102
    {
1103
      /* Note: this code does not handle the case where the target
1104
         and host have a different number of bits in a byte.  */
1105
      if (tbuf->len + 1 > tbuf->asize)
1106
        {
1107
          tbuf->asize += OUTBUF_BLOCK_SIZE;
1108
          tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1109
        }
1110
      tbuf->text[tbuf->len++] = n;
1111
    }
1112
}
1113
 
1114
/* Convert a hexadecimal escape, pointed to by FROM, to the execution
1115
   character set and write it into the string buffer TBUF.  Returns an
1116
   advanced pointer, and issues diagnostics as necessary.
1117
   No character set translation occurs; this routine always produces the
1118
   execution-set character with numeric value equal to the given hex
1119
   number.  You can, e.g. generate surrogate pairs this way.  */
1120
static const uchar *
1121
convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
1122
             struct _cpp_strbuf *tbuf, bool wide)
1123
{
1124
  cppchar_t c, n = 0, overflow = 0;
1125
  int digits_found = 0;
1126
  size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1127
                  : CPP_OPTION (pfile, char_precision));
1128
  size_t mask = width_to_mask (width);
1129
 
1130
  if (CPP_WTRADITIONAL (pfile))
1131
    cpp_error (pfile, CPP_DL_WARNING,
1132
               "the meaning of '\\x' is different in traditional C");
1133
 
1134
  from++;  /* Skip 'x'.  */
1135
  while (from < limit)
1136
    {
1137
      c = *from;
1138
      if (! hex_p (c))
1139
        break;
1140
      from++;
1141
      overflow |= n ^ (n << 4 >> 4);
1142
      n = (n << 4) + hex_value (c);
1143
      digits_found = 1;
1144
    }
1145
 
1146
  if (!digits_found)
1147
    {
1148
      cpp_error (pfile, CPP_DL_ERROR,
1149
                 "\\x used with no following hex digits");
1150
      return from;
1151
    }
1152
 
1153
  if (overflow | (n != (n & mask)))
1154
    {
1155
      cpp_error (pfile, CPP_DL_PEDWARN,
1156
                 "hex escape sequence out of range");
1157
      n &= mask;
1158
    }
1159
 
1160
  emit_numeric_escape (pfile, n, tbuf, wide);
1161
 
1162
  return from;
1163
}
1164
 
1165
/* Convert an octal escape, pointed to by FROM, to the execution
1166
   character set and write it into the string buffer TBUF.  Returns an
1167
   advanced pointer, and issues diagnostics as necessary.
1168
   No character set translation occurs; this routine always produces the
1169
   execution-set character with numeric value equal to the given octal
1170
   number.  */
1171
static const uchar *
1172
convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
1173
             struct _cpp_strbuf *tbuf, bool wide)
1174
{
1175
  size_t count = 0;
1176
  cppchar_t c, n = 0;
1177
  size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1178
                  : CPP_OPTION (pfile, char_precision));
1179
  size_t mask = width_to_mask (width);
1180
  bool overflow = false;
1181
 
1182
  while (from < limit && count++ < 3)
1183
    {
1184
      c = *from;
1185
      if (c < '0' || c > '7')
1186
        break;
1187
      from++;
1188
      overflow |= n ^ (n << 3 >> 3);
1189
      n = (n << 3) + c - '0';
1190
    }
1191
 
1192
  if (n != (n & mask))
1193
    {
1194
      cpp_error (pfile, CPP_DL_PEDWARN,
1195
                 "octal escape sequence out of range");
1196
      n &= mask;
1197
    }
1198
 
1199
  emit_numeric_escape (pfile, n, tbuf, wide);
1200
 
1201
  return from;
1202
}
1203
 
1204
/* Convert an escape sequence (pointed to by FROM) to its value on
1205
   the target, and to the execution character set.  Do not scan past
1206
   LIMIT.  Write the converted value into TBUF.  Returns an advanced
1207
   pointer.  Handles all relevant diagnostics.  */
1208
static const uchar *
1209
convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
1210
                struct _cpp_strbuf *tbuf, bool wide)
1211
{
1212
  /* Values of \a \b \e \f \n \r \t \v respectively.  */
1213
#if HOST_CHARSET == HOST_CHARSET_ASCII
1214
  static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };
1215
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
1216
  static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };
1217
#else
1218
#error "unknown host character set"
1219
#endif
1220
 
1221
  uchar c;
1222
  struct cset_converter cvt
1223
    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1224
 
1225
  c = *from;
1226
  switch (c)
1227
    {
1228
      /* UCNs, hex escapes, and octal escapes are processed separately.  */
1229
    case 'u': case 'U':
1230
      return convert_ucn (pfile, from, limit, tbuf, wide);
1231
 
1232
    case 'x':
1233
      return convert_hex (pfile, from, limit, tbuf, wide);
1234
      break;
1235
 
1236
    case '0':  case '1':  case '2':  case '3':
1237
    case '4':  case '5':  case '6':  case '7':
1238
      return convert_oct (pfile, from, limit, tbuf, wide);
1239
 
1240
      /* Various letter escapes.  Get the appropriate host-charset
1241
         value into C.  */
1242
    case '\\': case '\'': case '"': case '?': break;
1243
 
1244
    case '(': case '{': case '[': case '%':
1245
      /* '\(', etc, can be used at the beginning of a line in a long
1246
         string split onto multiple lines with \-newline, to prevent
1247
         Emacs or other text editors from getting confused.  '\%' can
1248
         be used to prevent SCCS from mangling printf format strings.  */
1249
      if (CPP_PEDANTIC (pfile))
1250
        goto unknown;
1251
      break;
1252
 
1253
    case 'b': c = charconsts[1];  break;
1254
    case 'f': c = charconsts[3];  break;
1255
    case 'n': c = charconsts[4];  break;
1256
    case 'r': c = charconsts[5];  break;
1257
    case 't': c = charconsts[6];  break;
1258
    case 'v': c = charconsts[7];  break;
1259
 
1260
    case 'a':
1261
      if (CPP_WTRADITIONAL (pfile))
1262
        cpp_error (pfile, CPP_DL_WARNING,
1263
                   "the meaning of '\\a' is different in traditional C");
1264
      c = charconsts[0];
1265
      break;
1266
 
1267
    case 'e': case 'E':
1268
      if (CPP_PEDANTIC (pfile))
1269
        cpp_error (pfile, CPP_DL_PEDWARN,
1270
                   "non-ISO-standard escape sequence, '\\%c'", (int) c);
1271
      c = charconsts[2];
1272
      break;
1273
 
1274
    default:
1275
    unknown:
1276
      if (ISGRAPH (c))
1277
        cpp_error (pfile, CPP_DL_PEDWARN,
1278
                   "unknown escape sequence '\\%c'", (int) c);
1279
      else
1280
        {
1281
          /* diagnostic.c does not support "%03o".  When it does, this
1282
             code can use %03o directly in the diagnostic again.  */
1283
          char buf[32];
1284
          sprintf(buf, "%03o", (int) c);
1285
          cpp_error (pfile, CPP_DL_PEDWARN,
1286
                     "unknown escape sequence: '\\%s'", buf);
1287
        }
1288
    }
1289
 
1290
  /* Now convert what we have to the execution character set.  */
1291
  if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
1292
    cpp_errno (pfile, CPP_DL_ERROR,
1293
               "converting escape sequence to execution character set");
1294
 
1295
  return from + 1;
1296
}
1297
 
1298
/* FROM is an array of cpp_string structures of length COUNT.  These
1299
   are to be converted from the source to the execution character set,
1300
   escape sequences translated, and finally all are to be
1301
   concatenated.  WIDE indicates whether or not to produce a wide
1302
   string.  The result is written into TO.  Returns true for success,
1303
   false for failure.  */
1304
bool
1305
cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
1306
                      cpp_string *to, bool wide)
1307
{
1308
  struct _cpp_strbuf tbuf;
1309
  const uchar *p, *base, *limit;
1310
  size_t i;
1311
  struct cset_converter cvt
1312
    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1313
 
1314
  tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
1315
  tbuf.text = XNEWVEC (uchar, tbuf.asize);
1316
  tbuf.len = 0;
1317
 
1318
  for (i = 0; i < count; i++)
1319
    {
1320
      p = from[i].text;
1321
      if (*p == 'L') p++;
1322
      p++; /* Skip leading quote.  */
1323
      limit = from[i].text + from[i].len - 1; /* Skip trailing quote.  */
1324
 
1325
      for (;;)
1326
        {
1327
          base = p;
1328
          while (p < limit && *p != '\\')
1329
            p++;
1330
          if (p > base)
1331
            {
1332
              /* We have a run of normal characters; these can be fed
1333
                 directly to convert_cset.  */
1334
              if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
1335
                goto fail;
1336
            }
1337
          if (p == limit)
1338
            break;
1339
 
1340
          p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
1341
        }
1342
    }
1343
  /* NUL-terminate the 'to' buffer and translate it to a cpp_string
1344
     structure.  */
1345
  emit_numeric_escape (pfile, 0, &tbuf, wide);
1346
  tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
1347
  to->text = tbuf.text;
1348
  to->len = tbuf.len;
1349
  return true;
1350
 
1351
 fail:
1352
  cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
1353
  free (tbuf.text);
1354
  return false;
1355
}
1356
 
1357
/* Subroutine of do_line and do_linemarker.  Convert escape sequences
1358
   in a string, but do not perform character set conversion.  */
1359
bool
1360
cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
1361
                                  size_t count, cpp_string *to, bool wide)
1362
{
1363
  struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
1364
  bool retval;
1365
 
1366
  pfile->narrow_cset_desc.func = convert_no_conversion;
1367
  pfile->narrow_cset_desc.cd = (iconv_t) -1;
1368
 
1369
  retval = cpp_interpret_string (pfile, from, count, to, wide);
1370
 
1371
  pfile->narrow_cset_desc = save_narrow_cset_desc;
1372
  return retval;
1373
}
1374
 
1375
 
1376
/* Subroutine of cpp_interpret_charconst which performs the conversion
1377
   to a number, for narrow strings.  STR is the string structure returned
1378
   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1379
   cpp_interpret_charconst.  */
1380
static cppchar_t
1381
narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
1382
                         unsigned int *pchars_seen, int *unsignedp)
1383
{
1384
  size_t width = CPP_OPTION (pfile, char_precision);
1385
  size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
1386
  size_t mask = width_to_mask (width);
1387
  size_t i;
1388
  cppchar_t result, c;
1389
  bool unsigned_p;
1390
 
1391
  /* The value of a multi-character character constant, or a
1392
     single-character character constant whose representation in the
1393
     execution character set is more than one byte long, is
1394
     implementation defined.  This implementation defines it to be the
1395
     number formed by interpreting the byte sequence in memory as a
1396
     big-endian binary number.  If overflow occurs, the high bytes are
1397
     lost, and a warning is issued.
1398
 
1399
     We don't want to process the NUL terminator handed back by
1400
     cpp_interpret_string.  */
1401
  result = 0;
1402
  for (i = 0; i < str.len - 1; i++)
1403
    {
1404
      c = str.text[i] & mask;
1405
      if (width < BITS_PER_CPPCHAR_T)
1406
        result = (result << width) | c;
1407
      else
1408
        result = c;
1409
    }
1410
 
1411
  if (i > max_chars)
1412
    {
1413
      i = max_chars;
1414
      cpp_error (pfile, CPP_DL_WARNING,
1415
                 "character constant too long for its type");
1416
    }
1417
  else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
1418
    cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
1419
 
1420
  /* Multichar constants are of type int and therefore signed.  */
1421
  if (i > 1)
1422
    unsigned_p = 0;
1423
  else
1424
    unsigned_p = CPP_OPTION (pfile, unsigned_char);
1425
 
1426
  /* Truncate the constant to its natural width, and simultaneously
1427
     sign- or zero-extend to the full width of cppchar_t.
1428
     For single-character constants, the value is WIDTH bits wide.
1429
     For multi-character constants, the value is INT_PRECISION bits wide.  */
1430
  if (i > 1)
1431
    width = CPP_OPTION (pfile, int_precision);
1432
  if (width < BITS_PER_CPPCHAR_T)
1433
    {
1434
      mask = ((cppchar_t) 1 << width) - 1;
1435
      if (unsigned_p || !(result & (1 << (width - 1))))
1436
        result &= mask;
1437
      else
1438
        result |= ~mask;
1439
    }
1440
  *pchars_seen = i;
1441
  *unsignedp = unsigned_p;
1442
  return result;
1443
}
1444
 
1445
/* Subroutine of cpp_interpret_charconst which performs the conversion
1446
   to a number, for wide strings.  STR is the string structure returned
1447
   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1448
   cpp_interpret_charconst.  */
1449
static cppchar_t
1450
wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
1451
                       unsigned int *pchars_seen, int *unsignedp)
1452
{
1453
  bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1454
  size_t width = CPP_OPTION (pfile, wchar_precision);
1455
  size_t cwidth = CPP_OPTION (pfile, char_precision);
1456
  size_t mask = width_to_mask (width);
1457
  size_t cmask = width_to_mask (cwidth);
1458
  size_t nbwc = width / cwidth;
1459
  size_t off, i;
1460
  cppchar_t result = 0, c;
1461
 
1462
  /* This is finicky because the string is in the target's byte order,
1463
     which may not be our byte order.  Only the last character, ignoring
1464
     the NUL terminator, is relevant.  */
1465
  off = str.len - (nbwc * 2);
1466
  result = 0;
1467
  for (i = 0; i < nbwc; i++)
1468
    {
1469
      c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
1470
      result = (result << cwidth) | (c & cmask);
1471
    }
1472
 
1473
  /* Wide character constants have type wchar_t, and a single
1474
     character exactly fills a wchar_t, so a multi-character wide
1475
     character constant is guaranteed to overflow.  */
1476
  if (off > 0)
1477
    cpp_error (pfile, CPP_DL_WARNING,
1478
               "character constant too long for its type");
1479
 
1480
  /* Truncate the constant to its natural width, and simultaneously
1481
     sign- or zero-extend to the full width of cppchar_t.  */
1482
  if (width < BITS_PER_CPPCHAR_T)
1483
    {
1484
      if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
1485
        result &= mask;
1486
      else
1487
        result |= ~mask;
1488
    }
1489
 
1490
  *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
1491
  *pchars_seen = 1;
1492
  return result;
1493
}
1494
 
1495
/* Interpret a (possibly wide) character constant in TOKEN.
1496
   PCHARS_SEEN points to a variable that is filled in with the number
1497
   of characters seen, and UNSIGNEDP to a variable that indicates
1498
   whether the result has signed type.  */
1499
cppchar_t
1500
cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
1501
                         unsigned int *pchars_seen, int *unsignedp)
1502
{
1503
  cpp_string str = { 0, 0 };
1504
  bool wide = (token->type == CPP_WCHAR);
1505
  cppchar_t result;
1506
 
1507
  /* an empty constant will appear as L'' or '' */
1508
  if (token->val.str.len == (size_t) (2 + wide))
1509
    {
1510
      cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
1511
      return 0;
1512
    }
1513
  else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
1514
    return 0;
1515
 
1516
  if (wide)
1517
    result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1518
  else
1519
    result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1520
 
1521
  if (str.text != token->val.str.text)
1522
    free ((void *)str.text);
1523
 
1524
  return result;
1525
}
1526
 
1527
/* Convert an identifier denoted by ID and LEN, which might contain
1528
   UCN escapes, to the source character set, either UTF-8 or
1529
   UTF-EBCDIC.  Assumes that the identifier is actually a valid identifier.  */
1530
cpp_hashnode *
1531
_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
1532
{
1533
  /* It turns out that a UCN escape always turns into fewer characters
1534
     than the escape itself, so we can allocate a temporary in advance.  */
1535
  uchar * buf = (uchar *) alloca (len + 1);
1536
  uchar * bufp = buf;
1537
  size_t idp;
1538
 
1539
  for (idp = 0; idp < len; idp++)
1540
    if (id[idp] != '\\')
1541
      *bufp++ = id[idp];
1542
    else
1543
      {
1544
        unsigned length = id[idp+1] == 'u' ? 4 : 8;
1545
        cppchar_t value = 0;
1546
        size_t bufleft = len - (bufp - buf);
1547
        int rval;
1548
 
1549
        idp += 2;
1550
        while (length && idp < len && ISXDIGIT (id[idp]))
1551
          {
1552
            value = (value << 4) + hex_value (id[idp]);
1553
            idp++;
1554
            length--;
1555
          }
1556
        idp--;
1557
 
1558
        /* Special case for EBCDIC: if the identifier contains
1559
           a '$' specified using a UCN, translate it to EBCDIC.  */
1560
        if (value == 0x24)
1561
          {
1562
            *bufp++ = '$';
1563
            continue;
1564
          }
1565
 
1566
        rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
1567
        if (rval)
1568
          {
1569
            errno = rval;
1570
            cpp_errno (pfile, CPP_DL_ERROR,
1571
                       "converting UCN to source character set");
1572
            break;
1573
          }
1574
      }
1575
 
1576
  return CPP_HASHNODE (ht_lookup (pfile->hash_table,
1577
                                  buf, bufp - buf, HT_ALLOC));
1578
}
1579
 
1580
/* Convert an input buffer (containing the complete contents of one
1581
   source file) from INPUT_CHARSET to the source character set.  INPUT
1582
   points to the input buffer, SIZE is its allocated size, and LEN is
1583
   the length of the meaningful data within the buffer.  The
1584
   translated buffer is returned, and *ST_SIZE is set to the length of
1585
   the meaningful data within the translated buffer.
1586
 
1587
   INPUT is expected to have been allocated with xmalloc.  This function
1588
   will either return INPUT, or free it and return a pointer to another
1589
   xmalloc-allocated block of memory.  */
1590
uchar *
1591
_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
1592
                    uchar *input, size_t size, size_t len, off_t *st_size)
1593
{
1594
  struct cset_converter input_cset;
1595
  struct _cpp_strbuf to;
1596
 
1597
  input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
1598
  if (input_cset.func == convert_no_conversion)
1599
    {
1600
      to.text = input;
1601
      to.asize = size;
1602
      to.len = len;
1603
    }
1604
  else
1605
    {
1606
      to.asize = MAX (65536, len);
1607
      to.text = XNEWVEC (uchar, to.asize);
1608
      to.len = 0;
1609
 
1610
      if (!APPLY_CONVERSION (input_cset, input, len, &to))
1611
        cpp_error (pfile, CPP_DL_ERROR,
1612
                   "failure to convert %s to %s",
1613
                   CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
1614
 
1615
      free (input);
1616
    }
1617
 
1618
  /* Clean up the mess.  */
1619
  if (input_cset.func == convert_using_iconv)
1620
    iconv_close (input_cset.cd);
1621
 
1622
  /* Resize buffer if we allocated substantially too much, or if we
1623
     haven't enough space for the \n-terminator.  */
1624
  if (to.len + 4096 < to.asize || to.len >= to.asize)
1625
    to.text = XRESIZEVEC (uchar, to.text, to.len + 1);
1626
 
1627
  /* If the file is using old-school Mac line endings (\r only),
1628
     terminate with another \r, not an \n, so that we do not mistake
1629
     the \r\n sequence for a single DOS line ending and erroneously
1630
     issue the "No newline at end of file" diagnostic.  */
1631
  if (to.text[to.len - 1] == '\r')
1632
    to.text[to.len] = '\r';
1633
  else
1634
    to.text[to.len] = '\n';
1635
 
1636
  *st_size = to.len;
1637
  return to.text;
1638
}
1639
 
1640
/* Decide on the default encoding to assume for input files.  */
1641
const char *
1642
_cpp_default_encoding (void)
1643
{
1644
  const char *current_encoding = NULL;
1645
 
1646
  /* We disable this because the default codeset is 7-bit ASCII on
1647
     most platforms, and this causes conversion failures on every
1648
     file in GCC that happens to have one of the upper 128 characters
1649
     in it -- most likely, as part of the name of a contributor.
1650
     We should definitely recognize in-band markers of file encoding,
1651
     like:
1652
     - the appropriate Unicode byte-order mark (FE FF) to recognize
1653
       UTF16 and UCS4 (in both big-endian and little-endian flavors)
1654
       and UTF8
1655
     - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
1656
       distinguish ASCII and EBCDIC.
1657
     - now we can parse something like "#pragma GCC encoding <xyz>
1658
       on the first line, or even Emacs/VIM's mode line tags (there's
1659
       a problem here in that VIM uses the last line, and Emacs has
1660
       its more elaborate "local variables" convention).
1661
     - investigate whether Java has another common convention, which
1662
       would be friendly to support.
1663
     (Zack Weinberg and Paolo Bonzini, May 20th 2004)  */
1664
#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
1665
  setlocale (LC_CTYPE, "");
1666
  current_encoding = nl_langinfo (CODESET);
1667
#endif
1668
  if (current_encoding == NULL || *current_encoding == '\0')
1669
    current_encoding = SOURCE_CHARSET;
1670
 
1671
  return current_encoding;
1672
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.