OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [gcc/] [ada/] [a-suezst.adb] - Blame information for rev 708

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 706 jeremybenn
------------------------------------------------------------------------------
2
--                                                                          --
3
--                         GNAT RUN-TIME COMPONENTS                         --
4
--                                                                          --
5
--                ADA.STRINGS.UTF_ENCODING.WIDE_WIDE_STRINGS                --
6
--                                                                          --
7
--                                 B o d y                                  --
8
--                                                                          --
9
--          Copyright (C) 2010-2011, Free Software Foundation, Inc.         --
10
--                                                                          --
11
-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12
-- terms of the  GNU General Public License as published  by the Free Soft- --
13
-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
14
-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15
-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16
-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
17
--                                                                          --
18
-- As a special exception under Section 7 of GPL version 3, you are granted --
19
-- additional permissions described in the GCC Runtime Library Exception,   --
20
-- version 3.1, as published by the Free Software Foundation.               --
21
--                                                                          --
22
-- You should have received a copy of the GNU General Public License and    --
23
-- a copy of the GCC Runtime Library Exception along with this program;     --
24
-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
25
-- <http://www.gnu.org/licenses/>.                                          --
26
--                                                                          --
27
-- GNAT was originally developed  by the GNAT team at  New York University. --
28
-- Extensive contributions were provided by Ada Core Technologies Inc.      --
29
--                                                                          --
30
------------------------------------------------------------------------------
31
 
32
package body Ada.Strings.UTF_Encoding.Wide_Wide_Strings is
33
   use Interfaces;
34
 
35
   ------------
36
   -- Decode --
37
   ------------
38
 
39
   --  Decode UTF-8/UTF-16BE/UTF-16LE input to Wide_Wide_String
40
 
41
   function Decode
42
     (Item         : UTF_String;
43
      Input_Scheme : Encoding_Scheme) return Wide_Wide_String
44
   is
45
   begin
46
      if Input_Scheme = UTF_8 then
47
         return Decode (Item);
48
      else
49
         return Decode (To_UTF_16 (Item, Input_Scheme));
50
      end if;
51
   end Decode;
52
 
53
   --  Decode UTF-8 input to Wide_Wide_String
54
 
55
   function Decode (Item : UTF_8_String) return Wide_Wide_String is
56
      Result : Wide_Wide_String (1 .. Item'Length);
57
      --  Result string (worst case is same length as input)
58
 
59
      Len : Natural := 0;
60
      --  Length of result stored so far
61
 
62
      Iptr : Natural;
63
      --  Input string pointer
64
 
65
      C : Unsigned_8;
66
      R : Unsigned_32;
67
 
68
      procedure Get_Continuation;
69
      --  Reads a continuation byte of the form 10xxxxxx, shifts R left by 6
70
      --  bits, and or's in the xxxxxx to the low order 6 bits. On return Ptr
71
      --  is incremented. Raises exception if continuation byte does not exist
72
      --  or is invalid.
73
 
74
      ----------------------
75
      -- Get_Continuation --
76
      ----------------------
77
 
78
      procedure Get_Continuation is
79
      begin
80
         if Iptr > Item'Last then
81
            Raise_Encoding_Error (Iptr - 1);
82
 
83
         else
84
            C := To_Unsigned_8 (Item (Iptr));
85
            Iptr := Iptr + 1;
86
 
87
            if C not in 2#10_000000# .. 2#10_111111# then
88
               Raise_Encoding_Error (Iptr - 1);
89
            else
90
               R := Shift_Left (R, 6) or Unsigned_32 (C and 2#00_111111#);
91
            end if;
92
         end if;
93
      end Get_Continuation;
94
 
95
   --  Start of processing for Decode
96
 
97
   begin
98
      Iptr := Item'First;
99
 
100
      --  Skip BOM at start
101
 
102
      if Item'Length >= 3
103
        and then Item (Iptr .. Iptr + 2) = BOM_8
104
      then
105
         Iptr := Iptr + 3;
106
 
107
      --  Error if bad BOM
108
 
109
      elsif Item'Length >= 2
110
        and then (Item (Iptr .. Iptr + 1) = BOM_16BE
111
                    or else
112
                  Item (Iptr .. Iptr + 1) = BOM_16LE)
113
      then
114
         Raise_Encoding_Error (Iptr);
115
      end if;
116
 
117
      --  Loop through input characters
118
 
119
      while Iptr <= Item'Last loop
120
         C := To_Unsigned_8 (Item (Iptr));
121
         Iptr := Iptr + 1;
122
 
123
         --  Codes in the range 16#00# - 16#7F# are represented as
124
         --    0xxxxxxx
125
 
126
         if C <= 16#7F# then
127
            R := Unsigned_32 (C);
128
 
129
         --  No initial code can be of the form 10xxxxxx. Such codes are used
130
         --  only for continuations.
131
 
132
         elsif C <= 2#10_111111# then
133
            Raise_Encoding_Error (Iptr - 1);
134
 
135
         --  Codes in the range 16#80# - 16#7FF# are represented as
136
         --    110yyyxx 10xxxxxx
137
 
138
         elsif C <= 2#110_11111# then
139
            R := Unsigned_32 (C and 2#000_11111#);
140
            Get_Continuation;
141
 
142
         --  Codes in the range 16#800# - 16#FFFF# are represented as
143
         --    1110yyyy 10yyyyxx 10xxxxxx
144
 
145
         elsif C <= 2#1110_1111# then
146
            R := Unsigned_32 (C and 2#0000_1111#);
147
            Get_Continuation;
148
            Get_Continuation;
149
 
150
         --  Codes in the range 16#10000# - 16#10FFFF# are represented as
151
         --    11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
152
 
153
         elsif C <= 2#11110_111# then
154
            R := Unsigned_32 (C and 2#00000_111#);
155
            Get_Continuation;
156
            Get_Continuation;
157
            Get_Continuation;
158
 
159
         --  Any other code is an error
160
 
161
         else
162
            Raise_Encoding_Error (Iptr - 1);
163
         end if;
164
 
165
         Len := Len + 1;
166
         Result (Len) := Wide_Wide_Character'Val (R);
167
      end loop;
168
 
169
      return Result (1 .. Len);
170
   end Decode;
171
 
172
   --  Decode UTF-16 input to Wide_Wide_String
173
 
174
   function Decode (Item : UTF_16_Wide_String) return Wide_Wide_String is
175
      Result : Wide_Wide_String (1 .. Item'Length);
176
      --  Result cannot be longer than the input string
177
 
178
      Len : Natural := 0;
179
      --  Length of result
180
 
181
      Iptr : Natural;
182
      --  Pointer to next element in Item
183
 
184
      C : Unsigned_16;
185
      R : Unsigned_32;
186
 
187
   begin
188
      --  Skip UTF-16 BOM at start
189
 
190
      Iptr := Item'First;
191
 
192
      if Iptr <= Item'Last and then Item (Iptr) = BOM_16 (1) then
193
         Iptr := Iptr + 1;
194
      end if;
195
 
196
      --  Loop through input characters
197
 
198
      while Iptr <= Item'Last loop
199
         C := To_Unsigned_16 (Item (Iptr));
200
         Iptr := Iptr + 1;
201
 
202
         --  Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD#
203
         --  represent their own value.
204
 
205
         if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
206
            Len := Len + 1;
207
            Result (Len) := Wide_Wide_Character'Val (C);
208
 
209
         --  Codes in the range 16#D800#..16#DBFF# represent the first of the
210
         --  two surrogates used to encode the range 16#01_000#..16#10_FFFF".
211
         --  The first surrogate provides 10 high order bits of the result.
212
 
213
         elsif C <= 16#DBFF# then
214
            R := Shift_Left ((Unsigned_32 (C) - 16#D800#), 10);
215
 
216
            --  Error if at end of string
217
 
218
            if Iptr > Item'Last then
219
               Raise_Encoding_Error (Iptr - 1);
220
 
221
            --  Otherwise next character must be valid low order surrogate
222
            --  which provides the low 10 order bits of the result.
223
 
224
            else
225
               C := To_Unsigned_16 (Item (Iptr));
226
               Iptr := Iptr + 1;
227
 
228
               if C not in 16#DC00# .. 16#DFFF# then
229
                  Raise_Encoding_Error (Iptr - 1);
230
 
231
               else
232
                  R := R or (Unsigned_32 (C) mod 2 ** 10);
233
 
234
               --  The final adjustment is to add 16#01_0000 to get the
235
               --  result back in the required 21 bit range.
236
 
237
                  R := R + 16#01_0000#;
238
                  Len := Len + 1;
239
                  Result (Len) := Wide_Wide_Character'Val (R);
240
               end if;
241
            end if;
242
 
243
         --  Remaining codes are invalid
244
 
245
         else
246
            Raise_Encoding_Error (Iptr - 1);
247
         end if;
248
      end loop;
249
 
250
      return Result (1 .. Len);
251
   end Decode;
252
 
253
   ------------
254
   -- Encode --
255
   ------------
256
 
257
   --  Encode Wide_Wide_String in UTF-8, UTF-16BE or UTF-16LE
258
 
259
   function Encode
260
     (Item          : Wide_Wide_String;
261
      Output_Scheme : Encoding_Scheme;
262
      Output_BOM    : Boolean  := False) return UTF_String
263
   is
264
   begin
265
      if Output_Scheme = UTF_8 then
266
         return Encode (Item, Output_BOM);
267
      else
268
         return From_UTF_16 (Encode (Item), Output_Scheme, Output_BOM);
269
      end if;
270
   end Encode;
271
 
272
   --  Encode Wide_Wide_String in UTF-8
273
 
274
   function Encode
275
     (Item       : Wide_Wide_String;
276
      Output_BOM : Boolean  := False) return UTF_8_String
277
   is
278
      Result : String (1 .. 4 * Item'Length + 3);
279
      --  Worst case is four bytes per input byte + space for BOM
280
 
281
      Len  : Natural;
282
      --  Number of output codes stored in Result
283
 
284
      C : Unsigned_32;
285
      --  Single input character
286
 
287
      procedure Store (C : Unsigned_32);
288
      pragma Inline (Store);
289
      --  Store one output code (input is in range 0 .. 255)
290
 
291
      -----------
292
      -- Store --
293
      -----------
294
 
295
      procedure Store (C : Unsigned_32) is
296
      begin
297
         Len := Len + 1;
298
         Result (Len) := Character'Val (C);
299
      end Store;
300
 
301
   --  Start of processing for Encode
302
 
303
   begin
304
      --  Output BOM if required
305
 
306
      if Output_BOM then
307
         Result (1 .. 3) := BOM_8;
308
         Len := 3;
309
      else
310
         Len := 0;
311
      end if;
312
 
313
      --  Loop through characters of input
314
 
315
      for Iptr in Item'Range loop
316
         C := To_Unsigned_32 (Item (Iptr));
317
 
318
         --  Codes in the range 16#00#..16#7F# are represented as
319
         --    0xxxxxxx
320
 
321
         if C <= 16#7F# then
322
            Store (C);
323
 
324
         --  Codes in the range 16#80#..16#7FF# are represented as
325
         --    110yyyxx 10xxxxxx
326
 
327
         elsif C <= 16#7FF# then
328
            Store (2#110_00000# or Shift_Right (C, 6));
329
            Store (2#10_000000# or (C and 2#00_111111#));
330
 
331
         --  Codes in the range 16#800#..16#D7FF# or 16#E000#..16#FFFD# are
332
         --  represented as
333
         --    1110yyyy 10yyyyxx 10xxxxxx
334
 
335
         elsif C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
336
            Store (2#1110_0000# or Shift_Right (C, 12));
337
            Store (2#10_000000# or
338
                     Shift_Right (C and 2#111111_000000#, 6));
339
            Store (2#10_000000# or (C and 2#00_111111#));
340
 
341
         --  Codes in the range 16#10000# - 16#10FFFF# are represented as
342
         --    11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
343
 
344
         elsif C in 16#1_0000# .. 16#10_FFFF# then
345
            Store (2#11110_000# or
346
                     Shift_Right (C, 18));
347
            Store (2#10_000000# or
348
                     Shift_Right (C and 2#111111_000000_000000#, 12));
349
            Store (2#10_000000# or
350
                     Shift_Right (C and 2#111111_000000#, 6));
351
            Store (2#10_000000# or
352
                     (C and 2#00_111111#));
353
 
354
         --  All other codes are invalid
355
 
356
         else
357
            Raise_Encoding_Error (Iptr);
358
         end if;
359
      end loop;
360
 
361
      return Result (1 .. Len);
362
   end Encode;
363
 
364
   --  Encode Wide_Wide_String in UTF-16
365
 
366
   function Encode
367
     (Item       : Wide_Wide_String;
368
      Output_BOM : Boolean  := False) return UTF_16_Wide_String
369
   is
370
      Result : UTF_16_Wide_String (1 .. 2 * Item'Length + 1);
371
      --  Worst case is each input character generates two output characters
372
      --  plus one for possible BOM.
373
 
374
      Len : Integer;
375
      --  Length of output string
376
 
377
      C : Unsigned_32;
378
 
379
   begin
380
      --  Output BOM if needed
381
 
382
      if Output_BOM then
383
         Result (1) := BOM_16 (1);
384
         Len := 1;
385
      else
386
         Len := 0;
387
      end if;
388
 
389
      --  Loop through input characters encoding them
390
 
391
      for Iptr in Item'Range loop
392
         C := To_Unsigned_32 (Item (Iptr));
393
 
394
         --  Codes in the range 16#00_0000#..16#00_D7FF# or 16#E000#..16#FFFD#
395
         --  are output unchanged
396
 
397
         if C <= 16#00_D7FF# or else C in 16#E000# .. 16#FFFD# then
398
            Len := Len + 1;
399
            Result (Len) := Wide_Character'Val (C);
400
 
401
         --  Codes in the range 16#01_0000#..16#10_FFFF# are output using two
402
         --  surrogate characters. First 16#1_0000# is subtracted from the code
403
         --  point to give a 20-bit value. This is then split into two separate
404
         --  10-bit values each of which is represented as a surrogate with the
405
         --  most significant half placed in the first surrogate. The ranges of
406
         --  values used for the two surrogates are 16#D800#-16#DBFF# for the
407
         --  first, most significant surrogate and 16#DC00#-16#DFFF# for the
408
         --  second, least significant surrogate.
409
 
410
         elsif C in 16#1_0000# ..  16#10_FFFF# then
411
            C := C - 16#1_0000#;
412
 
413
            Len := Len + 1;
414
            Result (Len) := Wide_Character'Val (16#D800# + C / 2 ** 10);
415
 
416
            Len := Len + 1;
417
            Result (Len) := Wide_Character'Val (16#DC00# + C mod 2 ** 10);
418
 
419
         --  All other codes are invalid
420
 
421
         else
422
            Raise_Encoding_Error (Iptr);
423
         end if;
424
      end loop;
425
 
426
      return Result (1 .. Len);
427
   end Encode;
428
 
429
end Ada.Strings.UTF_Encoding.Wide_Wide_Strings;

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.