OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [gcc/] [ada/] [a-suenco.adb] - Blame information for rev 849

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 706 jeremybenn
------------------------------------------------------------------------------
2
--                                                                          --
3
--                         GNAT RUN-TIME COMPONENTS                         --
4
--                                                                          --
5
--                   ADA.STRINGS.UTF_ENCODING.CONVERSIONS                   --
6
--                                                                          --
7
--                                 B o d y                                  --
8
--                                                                          --
9
--          Copyright (C) 2010-2011, Free Software Foundation, Inc.         --
10
--                                                                          --
11
-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12
-- terms of the  GNU General Public License as published  by the Free Soft- --
13
-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
14
-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15
-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16
-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
17
--                                                                          --
18
-- As a special exception under Section 7 of GPL version 3, you are granted --
19
-- additional permissions described in the GCC Runtime Library Exception,   --
20
-- version 3.1, as published by the Free Software Foundation.               --
21
--                                                                          --
22
-- You should have received a copy of the GNU General Public License and    --
23
-- a copy of the GCC Runtime Library Exception along with this program;     --
24
-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
25
-- <http://www.gnu.org/licenses/>.                                          --
26
--                                                                          --
27
-- GNAT was originally developed  by the GNAT team at  New York University. --
28
-- Extensive contributions were provided by Ada Core Technologies Inc.      --
29
--                                                                          --
30
------------------------------------------------------------------------------
31
 
32
package body Ada.Strings.UTF_Encoding.Conversions is
33
   use Interfaces;
34
 
35
   --  Convert from UTF-8/UTF-16BE/LE to UTF-8/UTF-16BE/LE
36
 
37
   function Convert
38
     (Item          : UTF_String;
39
      Input_Scheme  : Encoding_Scheme;
40
      Output_Scheme : Encoding_Scheme;
41
      Output_BOM    : Boolean := False) return UTF_String
42
   is
43
   begin
44
      --  Nothing to do if identical schemes
45
 
46
      if Input_Scheme = Output_Scheme then
47
         return Item;
48
 
49
      --  For remaining cases, one or other of the operands is UTF-16BE/LE
50
      --  encoded, so go through UTF-16 intermediate.
51
 
52
      else
53
         return Convert (UTF_16_Wide_String'(Convert (Item, Input_Scheme)),
54
                         Output_Scheme, Output_BOM);
55
      end if;
56
   end Convert;
57
 
58
   --  Convert from UTF-8/UTF-16BE/LE to UTF-16
59
 
60
   function Convert
61
     (Item          : UTF_String;
62
      Input_Scheme  : Encoding_Scheme;
63
      Output_BOM    : Boolean := False) return UTF_16_Wide_String
64
   is
65
   begin
66
      if Input_Scheme = UTF_8 then
67
         return Convert (Item, Output_BOM);
68
      else
69
         return To_UTF_16 (Item, Input_Scheme, Output_BOM);
70
      end if;
71
   end Convert;
72
 
73
   --  Convert from UTF-8 to UTF-16
74
 
75
   function Convert
76
     (Item       : UTF_8_String;
77
      Output_BOM : Boolean := False) return UTF_16_Wide_String
78
   is
79
      Result : UTF_16_Wide_String (1 .. Item'Length + 1);
80
      --  Maximum length of result, including possible BOM
81
 
82
      Len : Natural := 0;
83
      --  Number of characters stored so far in Result
84
 
85
      Iptr : Natural;
86
      --  Next character to process in Item
87
 
88
      C : Unsigned_8;
89
      --  Input UTF-8 code
90
 
91
      R : Unsigned_16;
92
      --  Output UTF-16 code
93
 
94
      procedure Get_Continuation;
95
      --  Reads a continuation byte of the form 10xxxxxx, shifts R left by 6
96
      --  bits, and or's in the xxxxxx to the low order 6 bits. On return Ptr
97
      --  is incremented. Raises exception if continuation byte does not exist
98
      --  or is invalid.
99
 
100
      ----------------------
101
      -- Get_Continuation --
102
      ----------------------
103
 
104
      procedure Get_Continuation is
105
      begin
106
         if Iptr > Item'Last then
107
            Raise_Encoding_Error (Iptr - 1);
108
 
109
         else
110
            C := To_Unsigned_8 (Item (Iptr));
111
            Iptr := Iptr + 1;
112
 
113
            if C < 2#10_000000# or else C > 2#10_111111# then
114
               Raise_Encoding_Error (Iptr - 1);
115
 
116
            else
117
               R :=
118
                 Shift_Left (R, 6) or Unsigned_16 (C and 2#00_111111#);
119
            end if;
120
         end if;
121
      end Get_Continuation;
122
 
123
   --  Start of processing for Convert
124
 
125
   begin
126
      --  Output BOM if required
127
 
128
      if Output_BOM then
129
         Len := Len + 1;
130
         Result (Len) := BOM_16 (1);
131
      end if;
132
 
133
      --  Skip OK BOM
134
 
135
      Iptr := Item'First;
136
 
137
      if Item'Length >= 3 and then Item (Iptr .. Iptr + 2) = BOM_8 then
138
         Iptr := Iptr + 3;
139
 
140
      --  Error if bad BOM
141
 
142
      elsif Item'Length >= 2
143
        and then (Item (Iptr .. Iptr + 1) = BOM_16BE
144
                    or else
145
                  Item (Iptr .. Iptr + 1) = BOM_16LE)
146
      then
147
         Raise_Encoding_Error (Iptr);
148
 
149
      --  No BOM present
150
 
151
      else
152
         Iptr := Item'First;
153
      end if;
154
 
155
      while Iptr <= Item'Last loop
156
         C := To_Unsigned_8 (Item (Iptr));
157
         Iptr := Iptr + 1;
158
 
159
         --  Codes in the range 16#00# - 16#7F#
160
         --    UTF-8:  0xxxxxxx
161
         --    UTF-16: 00000000_0xxxxxxx
162
 
163
         if C <= 16#7F# then
164
            Len := Len + 1;
165
            Result (Len) := Wide_Character'Val (C);
166
 
167
         --  No initial code can be of the form 10xxxxxx. Such codes are used
168
         --  only for continuations.
169
 
170
         elsif C <= 2#10_111111# then
171
            Raise_Encoding_Error (Iptr - 1);
172
 
173
         --  Codes in the range 16#80# - 16#7FF#
174
         --    UTF-8:  110yyyxx 10xxxxxx
175
         --    UTF-16: 00000yyy_xxxxxxxx
176
 
177
         elsif C <= 2#110_11111# then
178
            R := Unsigned_16 (C and 2#000_11111#);
179
            Get_Continuation;
180
            Len := Len + 1;
181
            Result (Len) := Wide_Character'Val (R);
182
 
183
         --  Codes in the range 16#800# - 16#FFFF#
184
         --    UTF-8:  1110yyyy 10yyyyxx 10xxxxxx
185
         --    UTF-16: yyyyyyyy_xxxxxxxx
186
 
187
         elsif C <= 2#1110_1111# then
188
            R := Unsigned_16 (C and 2#0000_1111#);
189
            Get_Continuation;
190
            Get_Continuation;
191
            Len := Len + 1;
192
            Result (Len) := Wide_Character'Val (R);
193
 
194
            --  Make sure that we don't have a result in the forbidden range
195
            --  reserved for UTF-16 surrogate characters.
196
 
197
            if R in 16#D800# .. 16#DF00# then
198
               Raise_Encoding_Error (Iptr - 3);
199
            end if;
200
 
201
         --  Codes in the range 16#10000# - 16#10FFFF#
202
         --    UTF-8:  11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
203
         --    UTF-16: 110110zz_zzyyyyyy 110111yy_xxxxxxxx
204
         --    Note: zzzz in the output is input zzzzz - 1
205
 
206
         elsif C <= 2#11110_111# then
207
            R := Unsigned_16 (C and 2#00000_111#);
208
            Get_Continuation;
209
 
210
            --  R now has zzzzzyyyy
211
 
212
            R := R - 2#0000_1_0000#;
213
 
214
            --  R now has zzzzyyyy (zzzz minus one for the output)
215
 
216
            Get_Continuation;
217
 
218
            --  R now has zzzzyyyyyyyyxx
219
 
220
            Len := Len + 1;
221
            Result (Len) :=
222
              Wide_Character'Val
223
                (2#110110_00_0000_0000# or Shift_Right (R, 4));
224
 
225
            R := R and 2#1111#;
226
            Get_Continuation;
227
            Len := Len + 1;
228
            Result (Len) :=
229
              Wide_Character'Val (2#110111_00_0000_0000# or R);
230
 
231
         --  Any other code is an error
232
 
233
         else
234
            Raise_Encoding_Error (Iptr - 1);
235
         end if;
236
      end loop;
237
 
238
      return Result (1 .. Len);
239
   end Convert;
240
 
241
   --  Convert from UTF-16 to UTF-8/UTF-16-BE/LE
242
 
243
   function Convert
244
     (Item          : UTF_16_Wide_String;
245
      Output_Scheme : Encoding_Scheme;
246
      Output_BOM    : Boolean := False) return UTF_String
247
   is
248
   begin
249
      if Output_Scheme = UTF_8 then
250
         return Convert (Item, Output_BOM);
251
      else
252
         return From_UTF_16 (Item, Output_Scheme, Output_BOM);
253
      end if;
254
   end Convert;
255
 
256
   --  Convert from UTF-16 to UTF-8
257
 
258
   function Convert
259
     (Item          : UTF_16_Wide_String;
260
      Output_BOM    : Boolean := False) return UTF_8_String
261
   is
262
      Result : UTF_8_String (1 .. 3 * Item'Length + 3);
263
      --  Worst case is 3 output codes for each input code + BOM space
264
 
265
      Len : Natural;
266
      --  Number of result codes stored
267
 
268
      Iptr : Natural;
269
      --  Pointer to next input character
270
 
271
      C1, C2 : Unsigned_16;
272
 
273
      zzzzz    : Unsigned_16;
274
      yyyyyyyy : Unsigned_16;
275
      xxxxxxxx : Unsigned_16;
276
      --  Components of double length case
277
 
278
   begin
279
      Iptr := Item'First;
280
 
281
      --  Skip BOM at start of input
282
 
283
      if Item'Length > 0 and then Item (Iptr) = BOM_16 (1) then
284
         Iptr := Iptr + 1;
285
      end if;
286
 
287
      --  Generate output BOM if required
288
 
289
      if Output_BOM then
290
         Result (1 .. 3) := BOM_8;
291
         Len := 3;
292
      else
293
         Len := 0;
294
      end if;
295
 
296
      --  Loop through input
297
 
298
      while Iptr <= Item'Last loop
299
         C1 := To_Unsigned_16 (Item (Iptr));
300
         Iptr := Iptr + 1;
301
 
302
         --  Codes in the range 16#0000# - 16#007F#
303
         --    UTF-16: 000000000xxxxxxx
304
         --    UTF-8:  0xxxxxxx
305
 
306
         if C1 <= 16#007F# then
307
            Result (Len + 1) := Character'Val (C1);
308
            Len := Len + 1;
309
 
310
         --  Codes in the range 16#80# - 16#7FF#
311
         --    UTF-16: 00000yyyxxxxxxxx
312
         --    UTF-8:  110yyyxx 10xxxxxx
313
 
314
         elsif C1 <= 16#07FF# then
315
            Result (Len + 1) :=
316
              Character'Val
317
                (2#110_00000# or Shift_Right (C1, 6));
318
            Result (Len + 2) :=
319
              Character'Val
320
                (2#10_000000# or (C1 and 2#00_111111#));
321
            Len := Len + 2;
322
 
323
         --  Codes in the range 16#800# - 16#D7FF# or 16#E000# - 16#FFFF#
324
         --    UTF-16: yyyyyyyyxxxxxxxx
325
         --    UTF-8:  1110yyyy 10yyyyxx 10xxxxxx
326
 
327
         elsif C1 <= 16#D7FF# or else C1 >= 16#E000# then
328
            Result (Len + 1) :=
329
              Character'Val
330
                (2#1110_0000# or Shift_Right (C1, 12));
331
            Result (Len + 2) :=
332
              Character'Val
333
                (2#10_000000# or (Shift_Right (C1, 6) and 2#00_111111#));
334
            Result (Len + 3) :=
335
              Character'Val
336
                (2#10_000000# or (C1 and 2#00_111111#));
337
            Len := Len + 3;
338
 
339
         --  Codes in the range 16#10000# - 16#10FFFF#
340
         --    UTF-16: 110110zzzzyyyyyy 110111yyxxxxxxxx
341
         --    UTF-8:  11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
342
         --    Note: zzzzz in the output is input zzzz + 1
343
 
344
         elsif C1 <= 2#110110_11_11111111# then
345
            if Iptr > Item'Last then
346
               Raise_Encoding_Error (Iptr - 1);
347
            else
348
               C2 := To_Unsigned_16 (Item (Iptr));
349
               Iptr := Iptr + 1;
350
            end if;
351
 
352
            if (C2 and 2#111111_00_00000000#) /= 2#110111_00_00000000# then
353
               Raise_Encoding_Error (Iptr - 1);
354
            end if;
355
 
356
            zzzzz    := (Shift_Right (C1, 6) and 2#1111#) + 1;
357
            yyyyyyyy := ((Shift_Left (C1, 2) and 2#111111_00#)
358
                            or
359
                         (Shift_Right (C2, 8) and 2#000000_11#));
360
            xxxxxxxx := C2 and 2#11111111#;
361
 
362
            Result (Len + 1) :=
363
              Character'Val
364
                (2#11110_000# or (Shift_Right (zzzzz, 2)));
365
            Result (Len + 2) :=
366
              Character'Val
367
                (2#10_000000# or Shift_Left (zzzzz and 2#11#, 4)
368
                              or Shift_Right (yyyyyyyy, 4));
369
            Result (Len + 3) :=
370
              Character'Val
371
                (2#10_000000# or Shift_Left (yyyyyyyy and 2#1111#, 4)
372
                              or Shift_Right (xxxxxxxx, 6));
373
            Result (Len + 4) :=
374
              Character'Val
375
                (2#10_000000# or (xxxxxxxx and 2#00_111111#));
376
            Len := Len + 4;
377
 
378
         --  Error if input in 16#DC00# - 16#DFFF# (2nd surrogate with no 1st)
379
 
380
         else
381
            Raise_Encoding_Error (Iptr - 2);
382
         end if;
383
      end loop;
384
 
385
      return Result (1 .. Len);
386
   end Convert;
387
 
388
end Ada.Strings.UTF_Encoding.Conversions;

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.