OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libcpp/] [charset.c] - Blame information for rev 809

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 730 jeremybenn
/* CPP Library - charsets
2
   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008, 2009,
3
   2010 Free Software Foundation, Inc.
4
 
5
   Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
6
 
7
This program is free software; you can redistribute it and/or modify it
8
under the terms of the GNU General Public License as published by the
9
Free Software Foundation; either version 3, or (at your option) any
10
later version.
11
 
12
This program is distributed in the hope that it will be useful,
13
but WITHOUT ANY WARRANTY; without even the implied warranty of
14
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
GNU General Public License for more details.
16
 
17
You should have received a copy of the GNU General Public License
18
along with this program; see the file COPYING3.  If not see
19
<http://www.gnu.org/licenses/>.  */
20
 
21
#include "config.h"
22
#include "system.h"
23
#include "cpplib.h"
24
#include "internal.h"
25
 
26
/* Character set handling for C-family languages.
27
 
28
   Terminological note: In what follows, "charset" or "character set"
29
   will be taken to mean both an abstract set of characters and an
30
   encoding for that set.
31
 
32
   The C99 standard discusses two character sets: source and execution.
33
   The source character set is used for internal processing in translation
34
   phases 1 through 4; the execution character set is used thereafter.
35
   Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
36
   character encodings (see 3.7.2, 3.7.3 for the standardese meanings
37
   of these terms).  Furthermore, the "basic character set" (listed in
38
   5.2.1p3) is to be encoded in each with values one byte wide, and is
39
   to appear in the initial shift state.
40
 
41
   It is not explicitly mentioned, but there is also a "wide execution
42
   character set" used to encode wide character constants and wide
43
   string literals; this is supposed to be the result of applying the
44
   standard library function mbstowcs() to an equivalent narrow string
45
   (6.4.5p5).  However, the behavior of hexadecimal and octal
46
   \-escapes is at odds with this; they are supposed to be translated
47
   directly to wchar_t values (6.4.4.4p5,6).
48
 
49
   The source character set is not necessarily the character set used
50
   to encode physical source files on disk; translation phase 1 converts
51
   from whatever that encoding is to the source character set.
52
 
53
   The presence of universal character names in C99 (6.4.3 et seq.)
54
   forces the source character set to be isomorphic to ISO 10646,
55
   that is, Unicode.  There is no such constraint on the execution
56
   character set; note also that the conversion from source to
57
   execution character set does not occur for identifiers (5.1.1.2p1#5).
58
 
59
   For convenience of implementation, the source character set's
60
   encoding of the basic character set should be identical to the
61
   execution character set OF THE HOST SYSTEM's encoding of the basic
62
   character set, and it should not be a state-dependent encoding.
63
 
64
   cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
65
   depending on whether the host is based on ASCII or EBCDIC (see
66
   respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
67
   Technical Report #16).  With limited exceptions, it relies on the
68
   system library's iconv() primitive to do charset conversion
69
   (specified in SUSv2).  */
70
 
71
/* Stand-ins used when the host provides no iconv.  */
#if !HAVE_ICONV
72
/* Make certain that the uses of iconv(), iconv_open(), iconv_close()
   below, which are guarded only by if statements with compile-time
   constant conditions, do not cause link errors.  The open and convert
   stand-ins set errno to EINVAL and evaluate to the failure value of
   the call they replace; the close stand-in does nothing.  */
75
#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
76
#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
77
#define iconv_close(x)   (void)0
78
#define ICONV_CONST
79
#endif
80
 
81
/* Pick the source character set name from the host's basic charset.
   LAST_POSSIBLY_BASIC_SOURCE_CHAR is the upper bound that
   cpp_host_to_exec_charset applies to its argument before converting
   it.  Any other host charset is a compile-time error.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
82
#define SOURCE_CHARSET "UTF-8"
83
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
84
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
85
#define SOURCE_CHARSET "UTF-EBCDIC"
86
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
87
#else
88
#error "Unrecognized basic host character set"
89
#endif
90
 
91
/* Hosts whose errno.h lacks EILSEQ report ill-formed input as EINVAL,
   so on such hosts it cannot be told apart from incomplete input.  */
#ifndef EILSEQ
92
#define EILSEQ EINVAL
93
#endif
94
 
95
/* This structure is used for a resizable string buffer throughout.  */
96
/* Don't call it strbuf, as that conflicts with unistd.h on systems
97
   such as DYNIX/ptx where unistd.h includes stropts.h.  */
98
/* TEXT is the storage, ASIZE the number of bytes allocated for it and
   LEN the number of bytes already filled.  The converters in this file
   append at TEXT + LEN and enlarge TEXT when ASIZE - LEN bytes are not
   enough.  */
struct _cpp_strbuf
99
{
100
  uchar *text;
101
  size_t asize;
102
  size_t len;
103
};
104
 
105
/* This is enough to hold any string that fits on a single 80-column
106
   line, even if iconv quadruples its size (e.g. conversion from
107
   ASCII to UTF-32) rounded up to a power of two.  */
108
#define OUTBUF_BLOCK_SIZE 256
109
 
110
/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
111
   logic.  This is because a depressing number of systems lack iconv,
112
   or have iconv libraries that do not do these conversions, so
113
   we need a fallback implementation for them.  To ensure the fallback
114
   doesn't break due to neglect, it is used on all systems.
115
 
116
   UTF-32 encoding is nice and simple: a four-byte binary number,
117
   constrained to the range 00000000-7FFFFFFF to avoid questions of
118
   signedness.  We do have to cope with big- and little-endian
119
   variants.
120
 
121
   UTF-16 encoding uses two-byte binary numbers, again in big- and
122
   little-endian variants, for all values in the 00000000-0000FFFF
123
   range.  Values in the 00010000-0010FFFF range are encoded as pairs
124
   of two-byte numbers, called "surrogate pairs": given a number S in
125
   this range, it is mapped to a pair (H, L) as follows:
126
 
127
     H = (S - 0x10000) / 0x400 + 0xD800
128
     L = (S - 0x10000) % 0x400 + 0xDC00
129
 
130
   Two-byte values in the D800...DFFF range are ill-formed except as a
131
   component of a surrogate pair.  Even if the encoding within a
132
   two-byte value is little-endian, the H member of the surrogate pair
133
   comes first.
134
 
135
   There is no way to encode values in the 00110000-7FFFFFFF range,
136
   which is not currently a problem as there are no assigned code
137
   points in that range; however, the author expects that it will
138
   eventually become necessary to abandon UTF-16 due to this
139
   limitation.  Note also that, because of these pairs, UTF-16 does
140
   not meet the requirements of the C standard for a wide character
141
   encoding (see 3.7.3 and 6.4.4.4p11).
142
 
143
   UTF-8 encoding looks like this:
144
 
145
   value range         encoded as
146
   00000000-0000007F   0xxxxxxx
147
   00000080-000007FF   110xxxxx 10xxxxxx
148
   00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
149
   00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
150
   00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
151
   04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
152
 
153
   Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
154
   which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
155
   never occur.  Note also that any value that can be encoded by a
156
   given row of the table can also be encoded by all successive rows,
157
   but this is not done; only the shortest possible encoding for any
158
   given value is valid.  For instance, the character 07C0 could be
159
   encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
160
   FC 80 80 80 9F 80.  Only the first is valid.
161
 
162
   An implementation note: the transformation from UTF-16 to UTF-8, or
163
   vice versa, is easiest done by using UTF-32 as an intermediary.  */
164
 
165
/* Internal primitives which go from an UTF-8 byte stream to native-endian
166
   UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
167
   operation in several places below.  */
168
static inline int
169
one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
170
                     cppchar_t *cp)
171
{
172
  static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
173
  static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
174
 
175
  cppchar_t c;
176
  const uchar *inbuf = *inbufp;
177
  size_t nbytes, i;
178
 
179
  if (*inbytesleftp < 1)
180
    return EINVAL;
181
 
182
  c = *inbuf;
183
  if (c < 0x80)
184
    {
185
      *cp = c;
186
      *inbytesleftp -= 1;
187
      *inbufp += 1;
188
      return 0;
189
    }
190
 
191
  /* The number of leading 1-bits in the first byte indicates how many
192
     bytes follow.  */
193
  for (nbytes = 2; nbytes < 7; nbytes++)
194
    if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
195
      goto found;
196
  return EILSEQ;
197
 found:
198
 
199
  if (*inbytesleftp < nbytes)
200
    return EINVAL;
201
 
202
  c = (c & masks[nbytes-1]);
203
  inbuf++;
204
  for (i = 1; i < nbytes; i++)
205
    {
206
      cppchar_t n = *inbuf++;
207
      if ((n & 0xC0) != 0x80)
208
        return EILSEQ;
209
      c = ((c << 6) + (n & 0x3F));
210
    }
211
 
212
  /* Make sure the shortest possible encoding was used.  */
213
  if (c <=      0x7F && nbytes > 1) return EILSEQ;
214
  if (c <=     0x7FF && nbytes > 2) return EILSEQ;
215
  if (c <=    0xFFFF && nbytes > 3) return EILSEQ;
216
  if (c <=  0x1FFFFF && nbytes > 4) return EILSEQ;
217
  if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
218
 
219
  /* Make sure the character is valid.  */
220
  if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
221
 
222
  *cp = c;
223
  *inbufp = inbuf;
224
  *inbytesleftp -= nbytes;
225
  return 0;
226
}
227
 
228
static inline int
229
one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
230
{
231
  static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
232
  static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
233
  size_t nbytes;
234
  uchar buf[6], *p = &buf[6];
235
  uchar *outbuf = *outbufp;
236
 
237
  nbytes = 1;
238
  if (c < 0x80)
239
    *--p = c;
240
  else
241
    {
242
      do
243
        {
244
          *--p = ((c & 0x3F) | 0x80);
245
          c >>= 6;
246
          nbytes++;
247
        }
248
      while (c >= 0x3F || (c & limits[nbytes-1]));
249
      *--p = (c | masks[nbytes-1]);
250
    }
251
 
252
  if (*outbytesleftp < nbytes)
253
    return E2BIG;
254
 
255
  while (p < &buf[6])
256
    *outbuf++ = *p++;
257
  *outbytesleftp -= nbytes;
258
  *outbufp = outbuf;
259
  return 0;
260
}
261
 
262
/* The following four functions transform one character between the two
263
   encodings named in the function name.  All have the signature
264
   int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
265
           uchar **outbufp, size_t *outbytesleftp)
266
 
267
   BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
268
   interpreted as a boolean indicating whether big-endian or
269
   little-endian encoding is to be used for the member of the pair
270
   that is not UTF-8.
271
 
272
   INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
273
   do for iconv.
274
 
275
   The return value is either 0 for success, or an errno value for
276
   failure, which may be E2BIG (need more space), EILSEQ (ill-formed
277
   input sequence), or EINVAL (incomplete input sequence).  */
278
 
279
static inline int
280
one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
281
                   uchar **outbufp, size_t *outbytesleftp)
282
{
283
  uchar *outbuf;
284
  cppchar_t s = 0;
285
  int rval;
286
 
287
  /* Check for space first, since we know exactly how much we need.  */
288
  if (*outbytesleftp < 4)
289
    return E2BIG;
290
 
291
  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
292
  if (rval)
293
    return rval;
294
 
295
  outbuf = *outbufp;
296
  outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
297
  outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
298
  outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
299
  outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
300
 
301
  *outbufp += 4;
302
  *outbytesleftp -= 4;
303
  return 0;
304
}
305
 
306
static inline int
307
one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
308
                   uchar **outbufp, size_t *outbytesleftp)
309
{
310
  cppchar_t s;
311
  int rval;
312
  const uchar *inbuf;
313
 
314
  if (*inbytesleftp < 4)
315
    return EINVAL;
316
 
317
  inbuf = *inbufp;
318
 
319
  s  = inbuf[bigend ? 0 : 3] << 24;
320
  s += inbuf[bigend ? 1 : 2] << 16;
321
  s += inbuf[bigend ? 2 : 1] << 8;
322
  s += inbuf[bigend ? 3 : 0];
323
 
324
  if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
325
    return EILSEQ;
326
 
327
  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
328
  if (rval)
329
    return rval;
330
 
331
  *inbufp += 4;
332
  *inbytesleftp -= 4;
333
  return 0;
334
}
335
 
336
static inline int
337
one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
338
                   uchar **outbufp, size_t *outbytesleftp)
339
{
340
  int rval;
341
  cppchar_t s = 0;
342
  const uchar *save_inbuf = *inbufp;
343
  size_t save_inbytesleft = *inbytesleftp;
344
  uchar *outbuf = *outbufp;
345
 
346
  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
347
  if (rval)
348
    return rval;
349
 
350
  if (s > 0x0010FFFF)
351
    {
352
      *inbufp = save_inbuf;
353
      *inbytesleftp = save_inbytesleft;
354
      return EILSEQ;
355
    }
356
 
357
  if (s < 0xFFFF)
358
    {
359
      if (*outbytesleftp < 2)
360
        {
361
          *inbufp = save_inbuf;
362
          *inbytesleftp = save_inbytesleft;
363
          return E2BIG;
364
        }
365
      outbuf[bigend ? 1 : 0] = (s & 0x00FF);
366
      outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
367
 
368
      *outbufp += 2;
369
      *outbytesleftp -= 2;
370
      return 0;
371
    }
372
  else
373
    {
374
      cppchar_t hi, lo;
375
 
376
      if (*outbytesleftp < 4)
377
        {
378
          *inbufp = save_inbuf;
379
          *inbytesleftp = save_inbytesleft;
380
          return E2BIG;
381
        }
382
 
383
      hi = (s - 0x10000) / 0x400 + 0xD800;
384
      lo = (s - 0x10000) % 0x400 + 0xDC00;
385
 
386
      /* Even if we are little-endian, put the high surrogate first.
387
         ??? Matches practice?  */
388
      outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
389
      outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
390
      outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
391
      outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
392
 
393
      *outbufp += 4;
394
      *outbytesleftp -= 4;
395
      return 0;
396
    }
397
}
398
 
399
static inline int
400
one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
401
                   uchar **outbufp, size_t *outbytesleftp)
402
{
403
  cppchar_t s;
404
  const uchar *inbuf = *inbufp;
405
  int rval;
406
 
407
  if (*inbytesleftp < 2)
408
    return EINVAL;
409
  s  = inbuf[bigend ? 0 : 1] << 8;
410
  s += inbuf[bigend ? 1 : 0];
411
 
412
  /* Low surrogate without immediately preceding high surrogate is invalid.  */
413
  if (s >= 0xDC00 && s <= 0xDFFF)
414
    return EILSEQ;
415
  /* High surrogate must have a following low surrogate.  */
416
  else if (s >= 0xD800 && s <= 0xDBFF)
417
    {
418
      cppchar_t hi = s, lo;
419
      if (*inbytesleftp < 4)
420
        return EINVAL;
421
 
422
      lo  = inbuf[bigend ? 2 : 3] << 8;
423
      lo += inbuf[bigend ? 3 : 2];
424
 
425
      if (lo < 0xDC00 || lo > 0xDFFF)
426
        return EILSEQ;
427
 
428
      s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
429
    }
430
 
431
  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
432
  if (rval)
433
    return rval;
434
 
435
  /* Success - update the input pointers (one_cppchar_to_utf8 has done
436
     the output pointers for us).  */
437
  if (s <= 0xFFFF)
438
    {
439
      *inbufp += 2;
440
      *inbytesleftp -= 2;
441
    }
442
  else
443
    {
444
      *inbufp += 4;
445
      *inbytesleftp -= 4;
446
    }
447
  return 0;
448
}
449
 
450
/* Helper routine for the next few functions.  The 'const' on
451
   one_conversion means that we promise not to modify what function is
452
   pointed to, which lets the inliner see through it.  */
453
 
454
static inline bool
455
conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
456
                                             uchar **, size_t *),
457
                 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
458
{
459
  const uchar *inbuf;
460
  uchar *outbuf;
461
  size_t inbytesleft, outbytesleft;
462
  int rval;
463
 
464
  inbuf = from;
465
  inbytesleft = flen;
466
  outbuf = to->text + to->len;
467
  outbytesleft = to->asize - to->len;
468
 
469
  for (;;)
470
    {
471
      do
472
        rval = one_conversion (cd, &inbuf, &inbytesleft,
473
                               &outbuf, &outbytesleft);
474
      while (inbytesleft && !rval);
475
 
476
      if (__builtin_expect (inbytesleft == 0, 1))
477
        {
478
          to->len = to->asize - outbytesleft;
479
          return true;
480
        }
481
      if (rval != E2BIG)
482
        {
483
          errno = rval;
484
          return false;
485
        }
486
 
487
      outbytesleft += OUTBUF_BLOCK_SIZE;
488
      to->asize += OUTBUF_BLOCK_SIZE;
489
      to->text = XRESIZEVEC (uchar, to->text, to->asize);
490
      outbuf = to->text + to->asize - outbytesleft;
491
    }
492
}
493
 
494
 
495
/* These functions convert entire strings between character sets.
496
   They all have the signature
497
 
498
   bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
499
 
500
   The input string FROM is converted as specified by the function
501
   name plus the iconv descriptor CD (which may be fake), and the
502
   result appended to TO.  On any error, false is returned, otherwise true.  */
503
 
504
/* These four use the custom conversion code above.  */
505
static bool
506
convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
507
                    struct _cpp_strbuf *to)
508
{
509
  return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
510
}
511
 
512
static bool
513
convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
514
                    struct _cpp_strbuf *to)
515
{
516
  return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
517
}
518
 
519
static bool
520
convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
521
                    struct _cpp_strbuf *to)
522
{
523
  return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
524
}
525
 
526
static bool
527
convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
528
                    struct _cpp_strbuf *to)
529
{
530
  return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
531
}
532
 
533
/* Identity conversion, used when we have no alternative.  */
534
static bool
535
convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
536
                       const uchar *from, size_t flen, struct _cpp_strbuf *to)
537
{
538
  if (to->len + flen > to->asize)
539
    {
540
      to->asize = to->len + flen;
541
      to->text = XRESIZEVEC (uchar, to->text, to->asize);
542
    }
543
  memcpy (to->text + to->len, from, flen);
544
  to->len += flen;
545
  return true;
546
}
547
 
548
/* And this one uses the system iconv primitive.  It's a little
549
   different, since iconv's interface is a little different.  */
550
#if HAVE_ICONV
551
 
552
#define CONVERT_ICONV_GROW_BUFFER \
553
  do { \
554
      outbytesleft += OUTBUF_BLOCK_SIZE; \
555
      to->asize += OUTBUF_BLOCK_SIZE; \
556
      to->text = XRESIZEVEC (uchar, to->text, to->asize); \
557
      outbuf = (char *)to->text + to->asize - outbytesleft; \
558
  } while (0)
559
 
560
static bool
561
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
562
                     struct _cpp_strbuf *to)
563
{
564
  ICONV_CONST char *inbuf;
565
  char *outbuf;
566
  size_t inbytesleft, outbytesleft;
567
 
568
  /* Reset conversion descriptor and check that it is valid.  */
569
  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
570
    return false;
571
 
572
  inbuf = (ICONV_CONST char *)from;
573
  inbytesleft = flen;
574
  outbuf = (char *)to->text + to->len;
575
  outbytesleft = to->asize - to->len;
576
 
577
  for (;;)
578
    {
579
      iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
580
      if (__builtin_expect (inbytesleft == 0, 1))
581
        {
582
          /* Close out any shift states, returning to the initial state.  */
583
          if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
584
            {
585
              if (errno != E2BIG)
586
                return false;
587
 
588
              CONVERT_ICONV_GROW_BUFFER;
589
              if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
590
                return false;
591
            }
592
 
593
          to->len = to->asize - outbytesleft;
594
          return true;
595
        }
596
      if (errno != E2BIG)
597
        return false;
598
 
599
      CONVERT_ICONV_GROW_BUFFER;
600
    }
601
}
602
#else
603
#define convert_using_iconv 0 /* prevent undefined symbol error below */
604
#endif
605
 
606
/* Arrange for the above custom conversion logic to be used automatically
607
   when conversion between a suitable pair of character sets is requested.  */
608
 
609
/* Run the converter described by CONVERTER on the FLEN bytes at FROM,
   appending to TO; evaluates to the converter function's result.
   CONVERTER is expanded twice, so it must be free of side effects.
   Arguments and expansion are parenthesized so any expression can be
   passed and the result can be used inside a larger expression.  */
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
   ((CONVERTER).func ((CONVERTER).cd, (FROM), (FLEN), (TO)))
611
 
612
/* Describes one charset pair handled by the custom converters:
   PAIR is "FROM/TO", compared case-insensitively by init_iconv_desc;
   FUNC is the converter to use; FAKE_CD is stored as the converter's
   descriptor and reaches the per-character routines as BIGEND.  */
struct conversion
613
{
614
  const char *pair;
615
  convert_f func;
616
  iconv_t fake_cd;
617
};
618
/* The pairs served without iconv: UTF-8 to and from UTF-16 and UTF-32,
   with fake descriptor 0 for the LE names and 1 for the BE names.  */
static const struct conversion conversion_tab[] = {
619
  { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
620
  { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
621
  { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
622
  { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
623
  { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
624
  { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
625
  { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
626
  { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
627
};
628
 
629
/* Subroutine of cpp_init_iconv: initialize and return a
630
   cset_converter structure for conversion from FROM to TO.  If
631
   iconv_open() fails, issue an error and return an identity
632
   converter.  Silently return an identity converter if FROM and TO
633
   are identical.  */
634
static struct cset_converter
635
init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
636
{
637
  struct cset_converter ret;
638
  char *pair;
639
  size_t i;
640
 
641
  if (!strcasecmp (to, from))
642
    {
643
      ret.func = convert_no_conversion;
644
      ret.cd = (iconv_t) -1;
645
      ret.width = -1;
646
      return ret;
647
    }
648
 
649
  pair = (char *) alloca(strlen(to) + strlen(from) + 2);
650
 
651
  strcpy(pair, from);
652
  strcat(pair, "/");
653
  strcat(pair, to);
654
  for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
655
    if (!strcasecmp (pair, conversion_tab[i].pair))
656
      {
657
        ret.func = conversion_tab[i].func;
658
        ret.cd = conversion_tab[i].fake_cd;
659
        ret.width = -1;
660
        return ret;
661
      }
662
 
663
  /* No custom converter - try iconv.  */
664
  if (HAVE_ICONV)
665
    {
666
      ret.func = convert_using_iconv;
667
      ret.cd = iconv_open (to, from);
668
      ret.width = -1;
669
 
670
      if (ret.cd == (iconv_t) -1)
671
        {
672
          if (errno == EINVAL)
673
            cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
674
                       "conversion from %s to %s not supported by iconv",
675
                       from, to);
676
          else
677
            cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
678
 
679
          ret.func = convert_no_conversion;
680
        }
681
    }
682
  else
683
    {
684
      cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
685
                 "no iconv implementation, cannot convert from %s to %s",
686
                 from, to);
687
      ret.func = convert_no_conversion;
688
      ret.cd = (iconv_t) -1;
689
      ret.width = -1;
690
    }
691
  return ret;
692
}
693
 
694
/* If charset conversion is requested, initialize iconv(3) descriptors
695
   for conversion from the source character set to the execution
696
   character sets.  If iconv is not present in the C library, and
697
   conversion is requested, issue an error.  */
698
 
699
void
700
cpp_init_iconv (cpp_reader *pfile)
701
{
702
  const char *ncset = CPP_OPTION (pfile, narrow_charset);
703
  const char *wcset = CPP_OPTION (pfile, wide_charset);
704
  const char *default_wcset;
705
 
706
  bool be = CPP_OPTION (pfile, bytes_big_endian);
707
 
708
  if (CPP_OPTION (pfile, wchar_precision) >= 32)
709
    default_wcset = be ? "UTF-32BE" : "UTF-32LE";
710
  else if (CPP_OPTION (pfile, wchar_precision) >= 16)
711
    default_wcset = be ? "UTF-16BE" : "UTF-16LE";
712
  else
713
    /* This effectively means that wide strings are not supported,
714
       so don't do any conversion at all.  */
715
   default_wcset = SOURCE_CHARSET;
716
 
717
  if (!ncset)
718
    ncset = SOURCE_CHARSET;
719
  if (!wcset)
720
    wcset = default_wcset;
721
 
722
  pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
723
  pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
724
  pfile->utf8_cset_desc = init_iconv_desc (pfile, "UTF-8", SOURCE_CHARSET);
725
  pfile->utf8_cset_desc.width = CPP_OPTION (pfile, char_precision);
726
  pfile->char16_cset_desc = init_iconv_desc (pfile,
727
                                             be ? "UTF-16BE" : "UTF-16LE",
728
                                             SOURCE_CHARSET);
729
  pfile->char16_cset_desc.width = 16;
730
  pfile->char32_cset_desc = init_iconv_desc (pfile,
731
                                             be ? "UTF-32BE" : "UTF-32LE",
732
                                             SOURCE_CHARSET);
733
  pfile->char32_cset_desc.width = 32;
734
  pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
735
  pfile->wide_cset_desc.width = CPP_OPTION (pfile, wchar_precision);
736
}
737
 
738
/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary.  */
739
void
740
_cpp_destroy_iconv (cpp_reader *pfile)
741
{
742
  if (HAVE_ICONV)
743
    {
744
      if (pfile->narrow_cset_desc.func == convert_using_iconv)
745
        iconv_close (pfile->narrow_cset_desc.cd);
746
      if (pfile->utf8_cset_desc.func == convert_using_iconv)
747
        iconv_close (pfile->utf8_cset_desc.cd);
748
      if (pfile->char16_cset_desc.func == convert_using_iconv)
749
        iconv_close (pfile->char16_cset_desc.cd);
750
      if (pfile->char32_cset_desc.func == convert_using_iconv)
751
        iconv_close (pfile->char32_cset_desc.cd);
752
      if (pfile->wide_cset_desc.func == convert_using_iconv)
753
        iconv_close (pfile->wide_cset_desc.cd);
754
    }
755
}
756
 
757
/* Utility routine for use by a full compiler.  C is a character taken
758
   from the *basic* source character set, encoded in the host's
759
   execution encoding.  Convert it to (the target's) execution
760
   encoding, and return that value.
761
 
762
   Issues an internal error if C's representation in the narrow
763
   execution character set fails to be a single-byte value (C99
764
   5.2.1p3: "The representation of each member of the source and
765
   execution character sets shall fit in a byte.")  May also issue an
766
   internal error if C fails to be a member of the basic source
767
   character set (testing this exactly is too hard, especially when
768
   the host character set is EBCDIC).  */
769
cppchar_t
770
cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
771
{
772
  uchar sbuf[1];
773
  struct _cpp_strbuf tbuf;
774
 
775
  /* This test is merely an approximation, but it suffices to catch
776
     the most important thing, which is that we don't get handed a
777
     character outside the unibyte range of the host character set.  */
778
  if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
779
    {
780
      cpp_error (pfile, CPP_DL_ICE,
781
                 "character 0x%lx is not in the basic source character set\n",
782
                 (unsigned long)c);
783
      return 0;
784
    }
785
 
786
  /* Being a character in the unibyte range of the host character set,
787
     we can safely splat it into a one-byte buffer and trust that that
788
     is a well-formed string.  */
789
  sbuf[0] = c;
790
 
791
  /* This should never need to reallocate, but just in case... */
792
  tbuf.asize = 1;
793
  tbuf.text = XNEWVEC (uchar, tbuf.asize);
794
  tbuf.len = 0;
795
 
796
  if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
797
    {
798
      cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
799
      return 0;
800
    }
801
  if (tbuf.len != 1)
802
    {
803
      cpp_error (pfile, CPP_DL_ICE,
804
                 "character 0x%lx is not unibyte in execution character set",
805
                 (unsigned long)c);
806
      return 0;
807
    }
808
  c = tbuf.text[0];
809
  free(tbuf.text);
810
  return c;
811
}
812
 
813
 
814
 
815
/* Utility routine that computes a mask of the form 0000...111... with
816
   WIDTH 1-bits.  */
817
static inline size_t
818
width_to_mask (size_t width)
819
{
820
  width = MIN (width, BITS_PER_CPPCHAR_T);
821
  if (width >= CHAR_BIT * sizeof (size_t))
822
    return ~(size_t) 0;
823
  else
824
    return ((size_t) 1 << width) - 1;
825
}
826
 
827
/* A large table of unicode character information.  */
828
/* Flag bits combined in the `flags' member of each ucnranges entry.  */
enum {
829
  /* Valid in a C99 identifier?  */
830
  C99 = 1,
831
  /* Valid in a C99 identifier, but not as the first character?  */
832
  DIG = 2,
833
  /* Valid in a C++ identifier?  */
834
  CXX = 4,
835
  /* NFC representation is not valid in an identifier?  */
836
  CID = 8,
837
  /* Might be valid NFC form?  */
838
  NFC = 16,
839
  /* Might be valid NFKC form?  */
840
  NKC = 32,
841
  /* Certain preceding characters might make it not valid NFC/NFKC form?  */
842
  CTX = 64
843
};
844
 
845
/* Table of character ranges whose entries come from ucnid.h.
   ucn_valid_in_identifier binary-searches it on END, so the entries
   are presumably sorted by increasing END (the contents of ucnid.h
   are not visible here).  END is 16 bits wide; the lookup rejects
   characters above 0xFFFF before searching.  */
static const struct {
845
  /* Bitmap of flags above.  */
846
  unsigned char flags;
847
  /* Combining class of the character.  */
848
  unsigned char combine;
849
  /* Last character in the range described by this entry.  */
850
  unsigned short end;
851
} ucnranges[] = {
852
#include "ucnid.h"
853
};
855
 
856
/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
857
   the start of an identifier, and 0 if C is not valid in an
858
   identifier.  We assume C has already gone through the checks of
859
   _cpp_valid_ucn.  Also update NST for C if returning nonzero.  The
860
   algorithm is a simple binary search on the table defined in
861
   ucnid.h.  */
862
 
863
static int
864
ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
865
                         struct normalize_state *nst)
866
{
867
  int mn, mx, md;
868
 
869
  if (c > 0xFFFF)
870
    return 0;
871
 
872
  mn = 0;
873
  mx = ARRAY_SIZE (ucnranges) - 1;
874
  while (mx != mn)
875
    {
876
      md = (mn + mx) / 2;
877
      if (c <= ucnranges[md].end)
878
        mx = md;
879
      else
880
        mn = md + 1;
881
    }
882
 
883
  /* When -pedantic, we require the character to have been listed by
884
     the standard for the current language.  Otherwise, we accept the
885
     union of the acceptable sets for C++98 and C99.  */
886
  if (! (ucnranges[mn].flags & (C99 | CXX)))
887
      return 0;
888
 
889
  if (CPP_PEDANTIC (pfile)
890
      && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
891
          || (CPP_OPTION (pfile, cplusplus)
892
              && !(ucnranges[mn].flags & CXX))))
893
    return 0;
894
 
895
  /* Update NST.  */
896
  if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
897
    nst->level = normalized_none;
898
  else if (ucnranges[mn].flags & CTX)
899
    {
900
      bool safe;
901
      cppchar_t p = nst->previous;
902
 
903
      /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam.  */
904
      if (c == 0x09BE)
905
        safe = p != 0x09C7;  /* Use 09CB instead of 09C7 09BE.  */
906
      else if (c == 0x0B3E)
907
        safe = p != 0x0B47;  /* Use 0B4B instead of 0B47 0B3E.  */
908
      else if (c == 0x0BBE)
909
        safe = p != 0x0BC6 && p != 0x0BC7;  /* Use 0BCA/0BCB instead.  */
910
      else if (c == 0x0CC2)
911
        safe = p != 0x0CC6;  /* Use 0CCA instead of 0CC6 0CC2.  */
912
      else if (c == 0x0D3E)
913
        safe = p != 0x0D46 && p != 0x0D47;  /* Use 0D4A/0D4B instead.  */
914
      /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
915
         and are combined algorithmically from a sequence of the form
916
         1100-1112 1161-1175 11A8-11C2
917
         (if the third is not present, it is treated as 11A7, which is not
918
         really a valid character).
919
         Unfortunately, C99 allows (only) the NFC form, but C++ allows
920
         only the combining characters.  */
921
      else if (c >= 0x1161 && c <= 0x1175)
922
        safe = p < 0x1100 || p > 0x1112;
923
      else if (c >= 0x11A8 && c <= 0x11C2)
924
        safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
925
      else
926
        {
927
          /* Uh-oh, someone updated ucnid.h without updating this code.  */
928
          cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
929
          safe = true;
930
        }
931
      if (!safe && c < 0x1161)
932
        nst->level = normalized_none;
933
      else if (!safe)
934
        nst->level = MAX (nst->level, normalized_identifier_C);
935
    }
936
  else if (ucnranges[mn].flags & NKC)
937
    ;
938
  else if (ucnranges[mn].flags & NFC)
939
    nst->level = MAX (nst->level, normalized_C);
940
  else if (ucnranges[mn].flags & CID)
941
    nst->level = MAX (nst->level, normalized_identifier_C);
942
  else
943
    nst->level = normalized_none;
944
  nst->previous = c;
945
  nst->prev_class = ucnranges[mn].combine;
946
 
947
  /* In C99, UCN digits may not begin identifiers.  */
948
  if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
949
    return 2;
950
 
951
  return 1;
952
}
953
 
954
/* [lex.charset]: The character designated by the universal character
955
   name \UNNNNNNNN is that character whose character short name in
956
   ISO/IEC 10646 is NNNNNNNN; the character designated by the
957
   universal character name \uNNNN is that character whose character
958
   short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
959
   for a universal character name corresponds to a surrogate code point
960
   (in the range 0xD800-0xDFFF, inclusive), the program is ill-formed.
961
   Additionally, if the hexadecimal value for a universal-character-name
962
   outside a character or string literal corresponds to a control character
963
   (in either of the ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a
964
   character in the basic source character set, the program is ill-formed.
965
 
966
   C99 6.4.3: A universal character name shall not specify a character
967
   whose short identifier is less than 00A0 other than 0024 ($), 0040 (@),
968
   or 0060 (`), nor one in the range D800 through DFFF inclusive.
969
 
970
   *PSTR must be preceded by "\u" or "\U"; it is assumed that the
971
   buffer end is delimited by a non-hex digit.  Returns zero if the
972
   UCN has not been consumed.
973
 
974
   Otherwise the nonzero value of the UCN, whether valid or invalid,
975
   is returned.  Diagnostics are emitted for invalid values.  PSTR
976
   is updated to point one beyond the UCN, or to the syntactically
977
   invalid character.
978
 
979
   IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
980
   an identifier, or 2 otherwise.  */
981
 
982
cppchar_t
983
_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
984
                const uchar *limit, int identifier_pos,
985
                struct normalize_state *nst)
986
{
987
  cppchar_t result, c;
988
  unsigned int length;
989
  const uchar *str = *pstr;
990
  const uchar *base = str - 2;
991
 
992
  if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
993
    cpp_error (pfile, CPP_DL_WARNING,
994
               "universal character names are only valid in C++ and C99");
995
  else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
996
    cpp_warning (pfile, CPP_W_TRADITIONAL,
997
                 "the meaning of '\\%c' is different in traditional C",
998
                 (int) str[-1]);
999
 
1000
  if (str[-1] == 'u')
1001
    length = 4;
1002
  else if (str[-1] == 'U')
1003
    length = 8;
1004
  else
1005
    {
1006
      cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
1007
      length = 4;
1008
    }
1009
 
1010
  result = 0;
1011
  do
1012
    {
1013
      c = *str;
1014
      if (!ISXDIGIT (c))
1015
        break;
1016
      str++;
1017
      result = (result << 4) + hex_value (c);
1018
    }
1019
  while (--length && str < limit);
1020
 
1021
  /* Partial UCNs are not valid in strings, but decompose into
1022
     multiple tokens in identifiers, so we can't give a helpful
1023
     error message in that case.  */
1024
  if (length && identifier_pos)
1025
    return 0;
1026
 
1027
  *pstr = str;
1028
  if (length)
1029
    {
1030
      cpp_error (pfile, CPP_DL_ERROR,
1031
                 "incomplete universal character name %.*s",
1032
                 (int) (str - base), base);
1033
      result = 1;
1034
    }
1035
  /* The C99 standard permits $, @ and ` to be specified as UCNs.  We use
1036
     hex escapes so that this also works with EBCDIC hosts.
1037
     C++0x permits everything below 0xa0 within literals;
1038
     ucn_valid_in_identifier will complain about identifiers.  */
1039
  else if ((result < 0xa0
1040
            && !CPP_OPTION (pfile, cplusplus)
1041
            && (result != 0x24 && result != 0x40 && result != 0x60))
1042
           || (result & 0x80000000)
1043
           || (result >= 0xD800 && result <= 0xDFFF))
1044
    {
1045
      cpp_error (pfile, CPP_DL_ERROR,
1046
                 "%.*s is not a valid universal character",
1047
                 (int) (str - base), base);
1048
      result = 1;
1049
    }
1050
  else if (identifier_pos && result == 0x24
1051
           && CPP_OPTION (pfile, dollars_in_ident))
1052
    {
1053
      if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1054
        {
1055
          CPP_OPTION (pfile, warn_dollars) = 0;
1056
          cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1057
        }
1058
      NORMALIZE_STATE_UPDATE_IDNUM (nst);
1059
    }
1060
  else if (identifier_pos)
1061
    {
1062
      int validity = ucn_valid_in_identifier (pfile, result, nst);
1063
 
1064
      if (validity == 0)
1065
        cpp_error (pfile, CPP_DL_ERROR,
1066
                   "universal character %.*s is not valid in an identifier",
1067
                   (int) (str - base), base);
1068
      else if (validity == 2 && identifier_pos == 1)
1069
        cpp_error (pfile, CPP_DL_ERROR,
1070
   "universal character %.*s is not valid at the start of an identifier",
1071
                   (int) (str - base), base);
1072
    }
1073
 
1074
  if (result == 0)
1075
    result = 1;
1076
 
1077
  return result;
1078
}
1079
 
1080
/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
1081
   it to the execution character set and write the result into TBUF.
1082
   An advanced pointer is returned.  Issues all relevant diagnostics.  */
1083
static const uchar *
1084
convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
1085
             struct _cpp_strbuf *tbuf, struct cset_converter cvt)
1086
{
1087
  cppchar_t ucn;
1088
  uchar buf[6];
1089
  uchar *bufp = buf;
1090
  size_t bytesleft = 6;
1091
  int rval;
1092
  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1093
 
1094
  from++;  /* Skip u/U.  */
1095
  ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst);
1096
 
1097
  rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
1098
  if (rval)
1099
    {
1100
      errno = rval;
1101
      cpp_errno (pfile, CPP_DL_ERROR,
1102
                 "converting UCN to source character set");
1103
    }
1104
  else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
1105
    cpp_errno (pfile, CPP_DL_ERROR,
1106
               "converting UCN to execution character set");
1107
 
1108
  return from;
1109
}
1110
 
1111
/* Subroutine of convert_hex and convert_oct.  N is the representation
1112
   in the execution character set of a numeric escape; write it into the
1113
   string buffer TBUF and update the end-of-string pointer therein.  WIDE
1114
   is true if it's a wide string that's being assembled in TBUF.  This
1115
   function issues no diagnostics and never fails.  */
1116
static void
1117
emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
1118
                     struct _cpp_strbuf *tbuf, struct cset_converter cvt)
1119
{
1120
  size_t width = cvt.width;
1121
 
1122
  if (width != CPP_OPTION (pfile, char_precision))
1123
    {
1124
      /* We have to render this into the target byte order, which may not
1125
         be our byte order.  */
1126
      bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1127
      size_t cwidth = CPP_OPTION (pfile, char_precision);
1128
      size_t cmask = width_to_mask (cwidth);
1129
      size_t nbwc = width / cwidth;
1130
      size_t i;
1131
      size_t off = tbuf->len;
1132
      cppchar_t c;
1133
 
1134
      if (tbuf->len + nbwc > tbuf->asize)
1135
        {
1136
          tbuf->asize += OUTBUF_BLOCK_SIZE;
1137
          tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1138
        }
1139
 
1140
      for (i = 0; i < nbwc; i++)
1141
        {
1142
          c = n & cmask;
1143
          n >>= cwidth;
1144
          tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
1145
        }
1146
      tbuf->len += nbwc;
1147
    }
1148
  else
1149
    {
1150
      /* Note: this code does not handle the case where the target
1151
         and host have a different number of bits in a byte.  */
1152
      if (tbuf->len + 1 > tbuf->asize)
1153
        {
1154
          tbuf->asize += OUTBUF_BLOCK_SIZE;
1155
          tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1156
        }
1157
      tbuf->text[tbuf->len++] = n;
1158
    }
1159
}
1160
 
1161
/* Convert a hexadecimal escape, pointed to by FROM, to the execution
1162
   character set and write it into the string buffer TBUF.  Returns an
1163
   advanced pointer, and issues diagnostics as necessary.
1164
   No character set translation occurs; this routine always produces the
1165
   execution-set character with numeric value equal to the given hex
1166
   number.  You can, e.g. generate surrogate pairs this way.  */
1167
static const uchar *
1168
convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
1169
             struct _cpp_strbuf *tbuf, struct cset_converter cvt)
1170
{
1171
  cppchar_t c, n = 0, overflow = 0;
1172
  int digits_found = 0;
1173
  size_t width = cvt.width;
1174
  size_t mask = width_to_mask (width);
1175
 
1176
  if (CPP_WTRADITIONAL (pfile))
1177
    cpp_warning (pfile, CPP_W_TRADITIONAL,
1178
                 "the meaning of '\\x' is different in traditional C");
1179
 
1180
  from++;  /* Skip 'x'.  */
1181
  while (from < limit)
1182
    {
1183
      c = *from;
1184
      if (! hex_p (c))
1185
        break;
1186
      from++;
1187
      overflow |= n ^ (n << 4 >> 4);
1188
      n = (n << 4) + hex_value (c);
1189
      digits_found = 1;
1190
    }
1191
 
1192
  if (!digits_found)
1193
    {
1194
      cpp_error (pfile, CPP_DL_ERROR,
1195
                 "\\x used with no following hex digits");
1196
      return from;
1197
    }
1198
 
1199
  if (overflow | (n != (n & mask)))
1200
    {
1201
      cpp_error (pfile, CPP_DL_PEDWARN,
1202
                 "hex escape sequence out of range");
1203
      n &= mask;
1204
    }
1205
 
1206
  emit_numeric_escape (pfile, n, tbuf, cvt);
1207
 
1208
  return from;
1209
}
1210
 
1211
/* Convert an octal escape, pointed to by FROM, to the execution
1212
   character set and write it into the string buffer TBUF.  Returns an
1213
   advanced pointer, and issues diagnostics as necessary.
1214
   No character set translation occurs; this routine always produces the
1215
   execution-set character with numeric value equal to the given octal
1216
   number.  */
1217
static const uchar *
1218
convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
1219
             struct _cpp_strbuf *tbuf, struct cset_converter cvt)
1220
{
1221
  size_t count = 0;
1222
  cppchar_t c, n = 0;
1223
  size_t width = cvt.width;
1224
  size_t mask = width_to_mask (width);
1225
  bool overflow = false;
1226
 
1227
  while (from < limit && count++ < 3)
1228
    {
1229
      c = *from;
1230
      if (c < '0' || c > '7')
1231
        break;
1232
      from++;
1233
      overflow |= n ^ (n << 3 >> 3);
1234
      n = (n << 3) + c - '0';
1235
    }
1236
 
1237
  if (n != (n & mask))
1238
    {
1239
      cpp_error (pfile, CPP_DL_PEDWARN,
1240
                 "octal escape sequence out of range");
1241
      n &= mask;
1242
    }
1243
 
1244
  emit_numeric_escape (pfile, n, tbuf, cvt);
1245
 
1246
  return from;
1247
}
1248
 
1249
/* Convert an escape sequence (pointed to by FROM) to its value on
1250
   the target, and to the execution character set.  Do not scan past
1251
   LIMIT.  Write the converted value into TBUF.  Returns an advanced
1252
   pointer.  Handles all relevant diagnostics.  */
1253
static const uchar *
1254
convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
1255
                struct _cpp_strbuf *tbuf, struct cset_converter cvt)
1256
{
1257
  /* Values of \a \b \e \f \n \r \t \v respectively.  */
1258
#if HOST_CHARSET == HOST_CHARSET_ASCII
1259
  static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };
1260
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
1261
  static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };
1262
#else
1263
#error "unknown host character set"
1264
#endif
1265
 
1266
  uchar c;
1267
 
1268
  c = *from;
1269
  switch (c)
1270
    {
1271
      /* UCNs, hex escapes, and octal escapes are processed separately.  */
1272
    case 'u': case 'U':
1273
      return convert_ucn (pfile, from, limit, tbuf, cvt);
1274
 
1275
    case 'x':
1276
      return convert_hex (pfile, from, limit, tbuf, cvt);
1277
      break;
1278
 
1279
    case '0':  case '1':  case '2':  case '3':
1280
    case '4':  case '5':  case '6':  case '7':
1281
      return convert_oct (pfile, from, limit, tbuf, cvt);
1282
 
1283
      /* Various letter escapes.  Get the appropriate host-charset
1284
         value into C.  */
1285
    case '\\': case '\'': case '"': case '?': break;
1286
 
1287
    case '(': case '{': case '[': case '%':
1288
      /* '\(', etc, can be used at the beginning of a line in a long
1289
         string split onto multiple lines with \-newline, to prevent
1290
         Emacs or other text editors from getting confused.  '\%' can
1291
         be used to prevent SCCS from mangling printf format strings.  */
1292
      if (CPP_PEDANTIC (pfile))
1293
        goto unknown;
1294
      break;
1295
 
1296
    case 'b': c = charconsts[1];  break;
1297
    case 'f': c = charconsts[3];  break;
1298
    case 'n': c = charconsts[4];  break;
1299
    case 'r': c = charconsts[5];  break;
1300
    case 't': c = charconsts[6];  break;
1301
    case 'v': c = charconsts[7];  break;
1302
 
1303
    case 'a':
1304
      if (CPP_WTRADITIONAL (pfile))
1305
        cpp_warning (pfile, CPP_W_TRADITIONAL,
1306
                     "the meaning of '\\a' is different in traditional C");
1307
      c = charconsts[0];
1308
      break;
1309
 
1310
    case 'e': case 'E':
1311
      if (CPP_PEDANTIC (pfile))
1312
        cpp_error (pfile, CPP_DL_PEDWARN,
1313
                   "non-ISO-standard escape sequence, '\\%c'", (int) c);
1314
      c = charconsts[2];
1315
      break;
1316
 
1317
    default:
1318
    unknown:
1319
      if (ISGRAPH (c))
1320
        cpp_error (pfile, CPP_DL_PEDWARN,
1321
                   "unknown escape sequence: '\\%c'", (int) c);
1322
      else
1323
        {
1324
          /* diagnostic.c does not support "%03o".  When it does, this
1325
             code can use %03o directly in the diagnostic again.  */
1326
          char buf[32];
1327
          sprintf(buf, "%03o", (int) c);
1328
          cpp_error (pfile, CPP_DL_PEDWARN,
1329
                     "unknown escape sequence: '\\%s'", buf);
1330
        }
1331
    }
1332
 
1333
  /* Now convert what we have to the execution character set.  */
1334
  if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
1335
    cpp_errno (pfile, CPP_DL_ERROR,
1336
               "converting escape sequence to execution character set");
1337
 
1338
  return from + 1;
1339
}
1340
 
1341
/* TYPE is a token type.  The return value is the conversion needed to
1342
   convert from source to execution character set for the given type. */
1343
static struct cset_converter
1344
converter_for_type (cpp_reader *pfile, enum cpp_ttype type)
1345
{
1346
  switch (type)
1347
    {
1348
    default:
1349
        return pfile->narrow_cset_desc;
1350
    case CPP_UTF8STRING:
1351
        return pfile->utf8_cset_desc;
1352
    case CPP_CHAR16:
1353
    case CPP_STRING16:
1354
        return pfile->char16_cset_desc;
1355
    case CPP_CHAR32:
1356
    case CPP_STRING32:
1357
        return pfile->char32_cset_desc;
1358
    case CPP_WCHAR:
1359
    case CPP_WSTRING:
1360
        return pfile->wide_cset_desc;
1361
    }
1362
}
1363
 
1364
/* FROM is an array of cpp_string structures of length COUNT.  These
1365
   are to be converted from the source to the execution character set,
1366
   escape sequences translated, and finally all are to be
1367
   concatenated.  WIDE indicates whether or not to produce a wide
1368
   string.  The result is written into TO.  Returns true for success,
1369
   false for failure.  */
1370
bool
1371
cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
1372
                      cpp_string *to,  enum cpp_ttype type)
1373
{
1374
  struct _cpp_strbuf tbuf;
1375
  const uchar *p, *base, *limit;
1376
  size_t i;
1377
  struct cset_converter cvt = converter_for_type (pfile, type);
1378
 
1379
  tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
1380
  tbuf.text = XNEWVEC (uchar, tbuf.asize);
1381
  tbuf.len = 0;
1382
 
1383
  for (i = 0; i < count; i++)
1384
    {
1385
      p = from[i].text;
1386
      if (*p == 'u')
1387
        {
1388
          if (*++p == '8')
1389
            p++;
1390
        }
1391
      else if (*p == 'L' || *p == 'U') p++;
1392
      if (*p == 'R')
1393
        {
1394
          const uchar *prefix;
1395
 
1396
          /* Skip over 'R"'.  */
1397
          p += 2;
1398
          prefix = p;
1399
          while (*p != '(')
1400
            p++;
1401
          p++;
1402
          limit = from[i].text + from[i].len;
1403
          if (limit >= p + (p - prefix) + 1)
1404
            limit -= (p - prefix) + 1;
1405
 
1406
          /* Raw strings are all normal characters; these can be fed
1407
             directly to convert_cset.  */
1408
          if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf))
1409
            goto fail;
1410
 
1411
          continue;
1412
        }
1413
 
1414
      p++; /* Skip leading quote.  */
1415
      limit = from[i].text + from[i].len - 1; /* Skip trailing quote.  */
1416
 
1417
      for (;;)
1418
        {
1419
          base = p;
1420
          while (p < limit && *p != '\\')
1421
            p++;
1422
          if (p > base)
1423
            {
1424
              /* We have a run of normal characters; these can be fed
1425
                 directly to convert_cset.  */
1426
              if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
1427
                goto fail;
1428
            }
1429
          if (p == limit)
1430
            break;
1431
 
1432
          p = convert_escape (pfile, p + 1, limit, &tbuf, cvt);
1433
        }
1434
    }
1435
  /* NUL-terminate the 'to' buffer and translate it to a cpp_string
1436
     structure.  */
1437
  emit_numeric_escape (pfile, 0, &tbuf, cvt);
1438
  tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
1439
  to->text = tbuf.text;
1440
  to->len = tbuf.len;
1441
  return true;
1442
 
1443
 fail:
1444
  cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
1445
  free (tbuf.text);
1446
  return false;
1447
}
1448
 
1449
/* Subroutine of do_line and do_linemarker.  Convert escape sequences
1450
   in a string, but do not perform character set conversion.  */
1451
bool
1452
cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
1453
                                  size_t count, cpp_string *to,
1454
                                  enum cpp_ttype type ATTRIBUTE_UNUSED)
1455
{
1456
  struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
1457
  bool retval;
1458
 
1459
  pfile->narrow_cset_desc.func = convert_no_conversion;
1460
  pfile->narrow_cset_desc.cd = (iconv_t) -1;
1461
  pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
1462
 
1463
  retval = cpp_interpret_string (pfile, from, count, to, CPP_STRING);
1464
 
1465
  pfile->narrow_cset_desc = save_narrow_cset_desc;
1466
  return retval;
1467
}
1468
 
1469
 
1470
/* Subroutine of cpp_interpret_charconst which performs the conversion
1471
   to a number, for narrow strings.  STR is the string structure returned
1472
   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1473
   cpp_interpret_charconst.  */
1474
static cppchar_t
1475
narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
1476
                         unsigned int *pchars_seen, int *unsignedp)
1477
{
1478
  size_t width = CPP_OPTION (pfile, char_precision);
1479
  size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
1480
  size_t mask = width_to_mask (width);
1481
  size_t i;
1482
  cppchar_t result, c;
1483
  bool unsigned_p;
1484
 
1485
  /* The value of a multi-character character constant, or a
1486
     single-character character constant whose representation in the
1487
     execution character set is more than one byte long, is
1488
     implementation defined.  This implementation defines it to be the
1489
     number formed by interpreting the byte sequence in memory as a
1490
     big-endian binary number.  If overflow occurs, the high bytes are
1491
     lost, and a warning is issued.
1492
 
1493
     We don't want to process the NUL terminator handed back by
1494
     cpp_interpret_string.  */
1495
  result = 0;
1496
  for (i = 0; i < str.len - 1; i++)
1497
    {
1498
      c = str.text[i] & mask;
1499
      if (width < BITS_PER_CPPCHAR_T)
1500
        result = (result << width) | c;
1501
      else
1502
        result = c;
1503
    }
1504
 
1505
  if (i > max_chars)
1506
    {
1507
      i = max_chars;
1508
      cpp_error (pfile, CPP_DL_WARNING,
1509
                 "character constant too long for its type");
1510
    }
1511
  else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
1512
    cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant");
1513
 
1514
  /* Multichar constants are of type int and therefore signed.  */
1515
  if (i > 1)
1516
    unsigned_p = 0;
1517
  else
1518
    unsigned_p = CPP_OPTION (pfile, unsigned_char);
1519
 
1520
  /* Truncate the constant to its natural width, and simultaneously
1521
     sign- or zero-extend to the full width of cppchar_t.
1522
     For single-character constants, the value is WIDTH bits wide.
1523
     For multi-character constants, the value is INT_PRECISION bits wide.  */
1524
  if (i > 1)
1525
    width = CPP_OPTION (pfile, int_precision);
1526
  if (width < BITS_PER_CPPCHAR_T)
1527
    {
1528
      mask = ((cppchar_t) 1 << width) - 1;
1529
      if (unsigned_p || !(result & (1 << (width - 1))))
1530
        result &= mask;
1531
      else
1532
        result |= ~mask;
1533
    }
1534
  *pchars_seen = i;
1535
  *unsignedp = unsigned_p;
1536
  return result;
1537
}
1538
 
1539
/* Subroutine of cpp_interpret_charconst which performs the conversion
1540
   to a number, for wide strings.  STR is the string structure returned
1541
   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1542
   cpp_interpret_charconst.  TYPE is the token type.  */
1543
static cppchar_t
1544
wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
1545
                       unsigned int *pchars_seen, int *unsignedp,
1546
                       enum cpp_ttype type)
1547
{
1548
  bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1549
  size_t width = converter_for_type (pfile, type).width;
1550
  size_t cwidth = CPP_OPTION (pfile, char_precision);
1551
  size_t mask = width_to_mask (width);
1552
  size_t cmask = width_to_mask (cwidth);
1553
  size_t nbwc = width / cwidth;
1554
  size_t off, i;
1555
  cppchar_t result = 0, c;
1556
 
1557
  /* This is finicky because the string is in the target's byte order,
1558
     which may not be our byte order.  Only the last character, ignoring
1559
     the NUL terminator, is relevant.  */
1560
  off = str.len - (nbwc * 2);
1561
  result = 0;
1562
  for (i = 0; i < nbwc; i++)
1563
    {
1564
      c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
1565
      result = (result << cwidth) | (c & cmask);
1566
    }
1567
 
1568
  /* Wide character constants have type wchar_t, and a single
1569
     character exactly fills a wchar_t, so a multi-character wide
1570
     character constant is guaranteed to overflow.  */
1571
  if (str.len > nbwc * 2)
1572
    cpp_error (pfile, CPP_DL_WARNING,
1573
               "character constant too long for its type");
1574
 
1575
  /* Truncate the constant to its natural width, and simultaneously
1576
     sign- or zero-extend to the full width of cppchar_t.  */
1577
  if (width < BITS_PER_CPPCHAR_T)
1578
    {
1579
      if (type == CPP_CHAR16 || type == CPP_CHAR32
1580
          || CPP_OPTION (pfile, unsigned_wchar)
1581
          || !(result & (1 << (width - 1))))
1582
        result &= mask;
1583
      else
1584
        result |= ~mask;
1585
    }
1586
 
1587
  if (type == CPP_CHAR16 || type == CPP_CHAR32
1588
      || CPP_OPTION (pfile, unsigned_wchar))
1589
    *unsignedp = 1;
1590
  else
1591
    *unsignedp = 0;
1592
 
1593
  *pchars_seen = 1;
1594
  return result;
1595
}
1596
 
1597
/* Interpret a (possibly wide) character constant in TOKEN.
1598
   PCHARS_SEEN points to a variable that is filled in with the number
1599
   of characters seen, and UNSIGNEDP to a variable that indicates
1600
   whether the result has signed type.  */
1601
cppchar_t
1602
cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
1603
                         unsigned int *pchars_seen, int *unsignedp)
1604
{
1605
  cpp_string str = { 0, 0 };
1606
  bool wide = (token->type != CPP_CHAR);
1607
  cppchar_t result;
1608
 
1609
  /* an empty constant will appear as L'', u'', U'' or '' */
1610
  if (token->val.str.len == (size_t) (2 + wide))
1611
    {
1612
      cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
1613
      return 0;
1614
    }
1615
  else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, token->type))
1616
    return 0;
1617
 
1618
  if (wide)
1619
    result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
1620
                                    token->type);
1621
  else
1622
    result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1623
 
1624
  if (str.text != token->val.str.text)
1625
    free ((void *)str.text);
1626
 
1627
  return result;
1628
}
1629
 
1630
/* Convert an identifier denoted by ID and LEN, which might contain
1631
   UCN escapes, to the source character set, either UTF-8 or
1632
   UTF-EBCDIC.  Assumes that the identifier is actually a valid identifier.  */
1633
cpp_hashnode *
1634
_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
1635
{
1636
  /* It turns out that a UCN escape always turns into fewer characters
1637
     than the escape itself, so we can allocate a temporary in advance.  */
1638
  uchar * buf = (uchar *) alloca (len + 1);
1639
  uchar * bufp = buf;
1640
  size_t idp;
1641
 
1642
  for (idp = 0; idp < len; idp++)
1643
    if (id[idp] != '\\')
1644
      *bufp++ = id[idp];
1645
    else
1646
      {
1647
        unsigned length = id[idp+1] == 'u' ? 4 : 8;
1648
        cppchar_t value = 0;
1649
        size_t bufleft = len - (bufp - buf);
1650
        int rval;
1651
 
1652
        idp += 2;
1653
        while (length && idp < len && ISXDIGIT (id[idp]))
1654
          {
1655
            value = (value << 4) + hex_value (id[idp]);
1656
            idp++;
1657
            length--;
1658
          }
1659
        idp--;
1660
 
1661
        /* Special case for EBCDIC: if the identifier contains
1662
           a '$' specified using a UCN, translate it to EBCDIC.  */
1663
        if (value == 0x24)
1664
          {
1665
            *bufp++ = '$';
1666
            continue;
1667
          }
1668
 
1669
        rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
1670
        if (rval)
1671
          {
1672
            errno = rval;
1673
            cpp_errno (pfile, CPP_DL_ERROR,
1674
                       "converting UCN to source character set");
1675
            break;
1676
          }
1677
      }
1678
 
1679
  return CPP_HASHNODE (ht_lookup (pfile->hash_table,
1680
                                  buf, bufp - buf, HT_ALLOC));
1681
}
1682
 
1683
/* Convert an input buffer (containing the complete contents of one
1684
   source file) from INPUT_CHARSET to the source character set.  INPUT
1685
   points to the input buffer, SIZE is its allocated size, and LEN is
1686
   the length of the meaningful data within the buffer.  The
1687
   translated buffer is returned, *ST_SIZE is set to the length of
1688
   the meaningful data within the translated buffer, and *BUFFER_START
1689
   is set to the start of the returned buffer.  *BUFFER_START may
1690
   differ from the return value in the case of a BOM or other ignored
1691
   marker information.
1692
 
1693
   INPUT is expected to have been allocated with xmalloc.  This
1694
   function will either set *BUFFER_START to INPUT, or free it and set
1695
   *BUFFER_START to a pointer to another xmalloc-allocated block of
1696
   memory.  */
1697
uchar *
1698
_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
1699
                    uchar *input, size_t size, size_t len,
1700
                    const unsigned char **buffer_start, off_t *st_size)
1701
{
1702
  struct cset_converter input_cset;
1703
  struct _cpp_strbuf to;
1704
  unsigned char *buffer;
1705
 
1706
  input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
1707
  if (input_cset.func == convert_no_conversion)
1708
    {
1709
      to.text = input;
1710
      to.asize = size;
1711
      to.len = len;
1712
    }
1713
  else
1714
    {
1715
      to.asize = MAX (65536, len);
1716
      to.text = XNEWVEC (uchar, to.asize);
1717
      to.len = 0;
1718
 
1719
      if (!APPLY_CONVERSION (input_cset, input, len, &to))
1720
        cpp_error (pfile, CPP_DL_ERROR,
1721
                   "failure to convert %s to %s",
1722
                   CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
1723
 
1724
      free (input);
1725
    }
1726
 
1727
  /* Clean up the mess.  */
1728
  if (input_cset.func == convert_using_iconv)
1729
    iconv_close (input_cset.cd);
1730
 
1731
  /* Resize buffer if we allocated substantially too much, or if we
1732
     haven't enough space for the \n-terminator.  */
1733
  if (to.len + 4096 < to.asize || to.len >= to.asize)
1734
    to.text = XRESIZEVEC (uchar, to.text, to.len + 1);
1735
 
1736
  /* If the file is using old-school Mac line endings (\r only),
1737
     terminate with another \r, not an \n, so that we do not mistake
1738
     the \r\n sequence for a single DOS line ending and erroneously
1739
     issue the "No newline at end of file" diagnostic.  */
1740
  if (to.len && to.text[to.len - 1] == '\r')
1741
    to.text[to.len] = '\r';
1742
  else
1743
    to.text[to.len] = '\n';
1744
 
1745
  buffer = to.text;
1746
  *st_size = to.len;
1747
#if HOST_CHARSET == HOST_CHARSET_ASCII
1748
  /* The HOST_CHARSET test just above ensures that the source charset
1749
     is UTF-8.  So, ignore a UTF-8 BOM if we see one.  Note that
1750
     glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
1751
     BOM -- however, even if it did, we would still need this code due
1752
     to the 'convert_no_conversion' case.  */
1753
  if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb
1754
      && to.text[2] == 0xbf)
1755
    {
1756
      *st_size -= 3;
1757
      buffer += 3;
1758
    }
1759
#endif
1760
 
1761
  *buffer_start = to.text;
1762
  return buffer;
1763
}
1764
 
1765
/* Decide on the default encoding to assume for input files.  */
1766
const char *
1767
_cpp_default_encoding (void)
1768
{
1769
  const char *current_encoding = NULL;
1770
 
1771
  /* We disable this because the default codeset is 7-bit ASCII on
1772
     most platforms, and this causes conversion failures on every
1773
     file in GCC that happens to have one of the upper 128 characters
1774
     in it -- most likely, as part of the name of a contributor.
1775
     We should definitely recognize in-band markers of file encoding,
1776
     like:
1777
     - the appropriate Unicode byte-order mark (FE FF) to recognize
1778
       UTF16 and UCS4 (in both big-endian and little-endian flavors)
1779
       and UTF8
1780
     - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
1781
       distinguish ASCII and EBCDIC.
1782
     - now we can parse something like "#pragma GCC encoding <xyz>
1783
       on the first line, or even Emacs/VIM's mode line tags (there's
1784
       a problem here in that VIM uses the last line, and Emacs has
1785
       its more elaborate "local variables" convention).
1786
     - investigate whether Java has another common convention, which
1787
       would be friendly to support.
1788
     (Zack Weinberg and Paolo Bonzini, May 20th 2004)  */
1789
#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
1790
  setlocale (LC_CTYPE, "");
1791
  current_encoding = nl_langinfo (CODESET);
1792
#endif
1793
  if (current_encoding == NULL || *current_encoding == '\0')
1794
    current_encoding = SOURCE_CHARSET;
1795
 
1796
  return current_encoding;
1797
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.