OpenCores
URL https://opencores.org/ocsvn/openrisc_me/openrisc_me/trunk

Subversion Repositories openrisc_me

[/] [openrisc/] [trunk/] [gnu-src/] [gcc-4.5.1/] [gcc/] [ada/] [g-decstr.adb] - Blame information for rev 281

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 281 jeremybenn
------------------------------------------------------------------------------
2
--                                                                          --
3
--                         GNAT RUN-TIME COMPONENTS                         --
4
--                                                                          --
5
--                    G N A T . D E C O D E _ S T R I N G                   --
6
--                                                                          --
7
--                                 S p e c                                  --
8
--                                                                          --
9
--                     Copyright (C) 2007-2008, AdaCore                     --
10
--                                                                          --
11
-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12
-- terms of the  GNU General Public License as published  by the Free Soft- --
13
-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
14
-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15
-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16
-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
17
-- for  more details.  You should have  received  a copy of the GNU General --
18
-- Public License  distributed with GNAT;  see file COPYING.  If not, write --
19
-- to  the  Free Software Foundation,  51  Franklin  Street,  Fifth  Floor, --
20
-- Boston, MA 02110-1301, USA.                                              --
21
--                                                                          --
22
-- As a special exception,  if other files  instantiate  generics from this --
23
-- unit, or you link  this unit with other files  to produce an executable, --
24
-- this  unit  does not  by itself cause  the resulting  executable  to  be --
25
-- covered  by the  GNU  General  Public  License.  This exception does not --
26
-- however invalidate  any other reasons why  the executable file  might be --
27
-- covered by the  GNU Public License.                                      --
28
--                                                                          --
29
-- GNAT was originally developed  by the GNAT team at  New York University. --
30
-- Extensive contributions were provided by Ada Core Technologies Inc.      --
31
--                                                                          --
32
------------------------------------------------------------------------------
33
 
34
--  This package provides a utility routine for converting from an encoded
35
--  string to a corresponding Wide_String or Wide_Wide_String value.
36
 
37
with Interfaces; use Interfaces;
38
 
39
with System.WCh_Cnv; use System.WCh_Cnv;
40
with System.WCh_Con; use System.WCh_Con;
41
 
42
package body GNAT.Decode_String is
43
 
44
   -----------------------
45
   -- Local Subprograms --
46
   -----------------------
47
 
48
   procedure Bad;
49
   pragma No_Return (Bad);
50
   --  Raise error for bad encoding
51
 
52
   procedure Past_End;
53
   pragma No_Return (Past_End);
54
   --  Raise error for off end of string
55
 
56
   ---------
57
   -- Bad --
58
   ---------
59
 
60
   procedure Bad is
61
   begin
62
      raise Constraint_Error with
63
        "bad encoding or character out of range";
64
   end Bad;
65
 
66
   ---------------------------
67
   -- Decode_Wide_Character --
68
   ---------------------------
69
 
70
   procedure Decode_Wide_Character
71
     (Input  : String;
72
      Ptr    : in out Natural;
73
      Result : out Wide_Character)
74
   is
75
      Char : Wide_Wide_Character;
76
   begin
77
      Decode_Wide_Wide_Character (Input, Ptr, Char);
78
 
79
      if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
80
         Bad;
81
      else
82
         Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
83
      end if;
84
   end Decode_Wide_Character;
85
 
86
   ------------------------
87
   -- Decode_Wide_String --
88
   ------------------------
89
 
90
   function Decode_Wide_String (S : String) return Wide_String is
91
      Result : Wide_String (1 .. S'Length);
92
      Length : Natural;
93
   begin
94
      Decode_Wide_String (S, Result, Length);
95
      return Result (1 .. Length);
96
   end Decode_Wide_String;
97
 
98
   procedure Decode_Wide_String
99
     (S      : String;
100
      Result : out Wide_String;
101
      Length : out Natural)
102
   is
103
      Ptr : Natural;
104
 
105
   begin
106
      Ptr := S'First;
107
      Length := 0;
108
      while Ptr <= S'Last loop
109
         if Length >= Result'Last then
110
            Past_End;
111
         end if;
112
 
113
         Length := Length + 1;
114
         Decode_Wide_Character (S, Ptr, Result (Length));
115
      end loop;
116
   end Decode_Wide_String;
117
 
118
   --------------------------------
119
   -- Decode_Wide_Wide_Character --
120
   --------------------------------
121
 
122
   procedure Decode_Wide_Wide_Character
123
     (Input  : String;
124
      Ptr    : in out Natural;
125
      Result : out Wide_Wide_Character)
126
   is
127
      C : Character;
128
 
129
      function In_Char return Character;
130
      pragma Inline (In_Char);
131
      --  Function to get one input character
132
 
133
      -------------
134
      -- In_Char --
135
      -------------
136
 
137
      function In_Char return Character is
138
      begin
139
         if Ptr <= Input'Last then
140
            Ptr := Ptr + 1;
141
            return Input (Ptr - 1);
142
         else
143
            Past_End;
144
         end if;
145
      end In_Char;
146
 
147
   --  Start of processing for Decode_Wide_Wide_Character
148
 
149
   begin
150
      C := In_Char;
151
 
152
      --  Special fast processing for UTF-8 case
153
 
154
      if Encoding_Method = WCEM_UTF8 then
155
         UTF8 : declare
156
            U : Unsigned_32;
157
            W : Unsigned_32;
158
 
159
            procedure Get_UTF_Byte;
160
            pragma Inline (Get_UTF_Byte);
161
            --  Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
162
            --  Reads a byte, and raises CE if the first two bits are not 10.
163
            --  Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
164
 
165
            ------------------
166
            -- Get_UTF_Byte --
167
            ------------------
168
 
169
            procedure Get_UTF_Byte is
170
            begin
171
               U := Unsigned_32 (Character'Pos (In_Char));
172
 
173
               if (U and 2#11000000#) /= 2#10_000000# then
174
                  Bad;
175
               end if;
176
 
177
               W := Shift_Left (W, 6) or (U and 2#00111111#);
178
            end Get_UTF_Byte;
179
 
180
         --  Start of processing for UTF8 case
181
 
182
         begin
183
            --  Note: for details of UTF8 encoding see RFC 3629
184
 
185
            U := Unsigned_32 (Character'Pos (C));
186
 
187
            --  16#00_0000#-16#00_007F#: 0xxxxxxx
188
 
189
            if (U and 2#10000000#) = 2#00000000# then
190
               Result := Wide_Wide_Character'Val (Character'Pos (C));
191
 
192
            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
193
 
194
            elsif (U and 2#11100000#) = 2#110_00000# then
195
               W := U and 2#00011111#;
196
               Get_UTF_Byte;
197
               Result := Wide_Wide_Character'Val (W);
198
 
199
            --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
200
 
201
            elsif (U and 2#11110000#) = 2#1110_0000# then
202
               W := U and 2#00001111#;
203
               Get_UTF_Byte;
204
               Get_UTF_Byte;
205
               Result := Wide_Wide_Character'Val (W);
206
 
207
            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
208
 
209
            elsif (U and 2#11111000#) = 2#11110_000# then
210
               W := U and 2#00000111#;
211
 
212
               for K in 1 .. 3 loop
213
                  Get_UTF_Byte;
214
               end loop;
215
 
216
               Result := Wide_Wide_Character'Val (W);
217
 
218
            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
219
            --                               10xxxxxx 10xxxxxx
220
 
221
            elsif (U and 2#11111100#) = 2#111110_00# then
222
               W := U and 2#00000011#;
223
 
224
               for K in 1 .. 4 loop
225
                  Get_UTF_Byte;
226
               end loop;
227
 
228
               Result := Wide_Wide_Character'Val (W);
229
 
230
            --  All other cases are invalid, note that this includes:
231
 
232
            --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
233
            --                               10xxxxxx 10xxxxxx 10xxxxxx
234
 
235
            --  since Wide_Wide_Character does not include code values
236
            --  greater than 16#03FF_FFFF#.
237
 
238
            else
239
               Bad;
240
            end if;
241
         end UTF8;
242
 
243
      --  All encoding functions other than UTF-8
244
 
245
      else
246
         Non_UTF8 : declare
247
            function Char_Sequence_To_UTF is
248
              new Char_Sequence_To_UTF_32 (In_Char);
249
 
250
         begin
251
            --  For brackets, must test for specific case of [ not followed by
252
            --  quotation, where we must not call Char_Sequence_To_UTF, but
253
            --  instead just return the bracket unchanged.
254
 
255
            if Encoding_Method = WCEM_Brackets
256
              and then C = '['
257
              and then (Ptr > Input'Last or else Input (Ptr) /= '"')
258
            then
259
               Result := '[';
260
 
261
            --  All other cases including [" with Brackets
262
 
263
            else
264
               Result :=
265
                 Wide_Wide_Character'Val
266
                   (Char_Sequence_To_UTF (C, Encoding_Method));
267
            end if;
268
         end Non_UTF8;
269
      end if;
270
   end Decode_Wide_Wide_Character;
271
 
272
   -----------------------------
273
   -- Decode_Wide_Wide_String --
274
   -----------------------------
275
 
276
   function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
277
      Result : Wide_Wide_String (1 .. S'Length);
278
      Length : Natural;
279
   begin
280
      Decode_Wide_Wide_String (S, Result, Length);
281
      return Result (1 .. Length);
282
   end Decode_Wide_Wide_String;
283
 
284
   procedure Decode_Wide_Wide_String
285
     (S      : String;
286
      Result : out Wide_Wide_String;
287
      Length : out Natural)
288
   is
289
      Ptr : Natural;
290
 
291
   begin
292
      Ptr := S'First;
293
      Length := 0;
294
      while Ptr <= S'Last loop
295
         if Length >= Result'Last then
296
            Past_End;
297
         end if;
298
 
299
         Length := Length + 1;
300
         Decode_Wide_Wide_Character (S, Ptr, Result (Length));
301
      end loop;
302
   end Decode_Wide_Wide_String;
303
 
304
   -------------------------
305
   -- Next_Wide_Character --
306
   -------------------------
307
 
308
   procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
309
   begin
310
      if Ptr < Input'First then
311
         Past_End;
312
      end if;
313
 
314
      --  Special efficient encoding for UTF-8 case
315
 
316
      if Encoding_Method = WCEM_UTF8 then
317
         UTF8 : declare
318
            U : Unsigned_32;
319
 
320
            procedure Getc;
321
            pragma Inline (Getc);
322
            --  Gets the character at Input (Ptr) and returns code in U as
323
            --  Unsigned_32 value. On return Ptr is bumped past the character.
324
 
325
            procedure Skip_UTF_Byte;
326
            pragma Inline (Skip_UTF_Byte);
327
            --  Skips past one encoded byte which must be 2#10xxxxxx#
328
 
329
            ----------
330
            -- Getc --
331
            ----------
332
 
333
            procedure Getc is
334
            begin
335
               if Ptr > Input'Last then
336
                  Past_End;
337
               else
338
                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
339
                  Ptr := Ptr + 1;
340
               end if;
341
            end Getc;
342
 
343
            -------------------
344
            -- Skip_UTF_Byte --
345
            -------------------
346
 
347
            procedure Skip_UTF_Byte is
348
            begin
349
               Getc;
350
 
351
               if (U and 2#11000000#) /= 2#10_000000# then
352
                  Bad;
353
               end if;
354
            end Skip_UTF_Byte;
355
 
356
         --  Start of processing for UTF-8 case
357
 
358
         begin
359
            --  16#00_0000#-16#00_007F#: 0xxxxxxx
360
 
361
            Getc;
362
 
363
            if (U and 2#10000000#) = 2#00000000# then
364
               return;
365
 
366
            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
367
 
368
            elsif (U and 2#11100000#) = 2#110_00000# then
369
               Skip_UTF_Byte;
370
 
371
            --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
372
 
373
            elsif (U and 2#11110000#) = 2#1110_0000# then
374
               Skip_UTF_Byte;
375
               Skip_UTF_Byte;
376
 
377
            --  Any other code is invalid, note that this includes:
378
 
379
            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
380
 
381
            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
382
            --                               10xxxxxx 10xxxxxx
383
 
384
            --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
385
            --                               10xxxxxx 10xxxxxx 10xxxxxx
386
 
387
            --  since Wide_Character does not allow codes > 16#FFFF#
388
 
389
            else
390
               Bad;
391
            end if;
392
         end UTF8;
393
 
394
      --  Non-UTF-8 case
395
 
396
      else
397
         declare
398
            Discard : Wide_Character;
399
         begin
400
            Decode_Wide_Character (Input, Ptr, Discard);
401
         end;
402
      end if;
403
   end Next_Wide_Character;
404
 
405
   ------------------------------
406
   -- Next_Wide_Wide_Character --
407
   ------------------------------
408
 
409
   procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
410
   begin
411
      --  Special efficient encoding for UTF-8 case
412
 
413
      if Encoding_Method = WCEM_UTF8 then
414
         UTF8 : declare
415
            U : Unsigned_32;
416
 
417
            procedure Getc;
418
            pragma Inline (Getc);
419
            --  Gets the character at Input (Ptr) and returns code in U as
420
            --  Unsigned_32 value. On return Ptr is bumped past the character.
421
 
422
            procedure Skip_UTF_Byte;
423
            pragma Inline (Skip_UTF_Byte);
424
            --  Skips past one encoded byte which must be 2#10xxxxxx#
425
 
426
            ----------
427
            -- Getc --
428
            ----------
429
 
430
            procedure Getc is
431
            begin
432
               if Ptr > Input'Last then
433
                  Past_End;
434
               else
435
                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
436
                  Ptr := Ptr + 1;
437
               end if;
438
            end Getc;
439
 
440
            -------------------
441
            -- Skip_UTF_Byte --
442
            -------------------
443
 
444
            procedure Skip_UTF_Byte is
445
            begin
446
               Getc;
447
 
448
               if (U and 2#11000000#) /= 2#10_000000# then
449
                  Bad;
450
               end if;
451
            end Skip_UTF_Byte;
452
 
453
         --  Start of processing for UTF-8 case
454
 
455
         begin
456
            if Ptr < Input'First then
457
               Past_End;
458
            end if;
459
 
460
            --  16#00_0000#-16#00_007F#: 0xxxxxxx
461
 
462
            Getc;
463
 
464
            if (U and 2#10000000#) = 2#00000000# then
465
               null;
466
 
467
            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
468
 
469
            elsif (U and 2#11100000#) = 2#110_00000# then
470
               Skip_UTF_Byte;
471
 
472
            --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
473
 
474
            elsif (U and 2#11110000#) = 2#1110_0000# then
475
               Skip_UTF_Byte;
476
               Skip_UTF_Byte;
477
 
478
            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
479
 
480
            elsif (U and 2#11111000#) = 2#11110_000# then
481
               for K in 1 .. 3 loop
482
                  Skip_UTF_Byte;
483
               end loop;
484
 
485
            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
486
            --                               10xxxxxx 10xxxxxx
487
 
488
            elsif (U and 2#11111100#) = 2#111110_00# then
489
               for K in 1 .. 4 loop
490
                  Skip_UTF_Byte;
491
               end loop;
492
 
493
            --  Any other code is invalid, note that this includes:
494
 
495
            --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
496
            --                               10xxxxxx 10xxxxxx 10xxxxxx
497
 
498
            --  since Wide_Wide_Character does not allow codes > 16#03FF_FFFF#
499
 
500
            else
501
               Bad;
502
            end if;
503
         end UTF8;
504
 
505
      --  Non-UTF-8 case
506
 
507
      else
508
         declare
509
            Discard : Wide_Wide_Character;
510
         begin
511
            Decode_Wide_Wide_Character (Input, Ptr, Discard);
512
         end;
513
      end if;
514
   end Next_Wide_Wide_Character;
515
 
516
   --------------
517
   -- Past_End --
518
   --------------
519
 
520
   procedure Past_End is
521
   begin
522
      raise Constraint_Error with "past end of string";
523
   end Past_End;
524
 
525
   -------------------------
526
   -- Prev_Wide_Character --
527
   -------------------------
528
 
529
   procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
530
   begin
531
      if Ptr > Input'Last + 1 then
532
         Past_End;
533
      end if;
534
 
535
      --  Special efficient encoding for UTF-8 case
536
 
537
      if Encoding_Method = WCEM_UTF8 then
538
         UTF8 : declare
539
            U : Unsigned_32;
540
 
541
            procedure Getc;
542
            pragma Inline (Getc);
543
            --  Gets the character at Input (Ptr - 1) and returns code in U as
544
            --  Unsigned_32 value. On return Ptr is decremented by one.
545
 
546
            procedure Skip_UTF_Byte;
547
            pragma Inline (Skip_UTF_Byte);
548
            --  Checks that U is 2#10xxxxxx# and then calls Get
549
 
550
            ----------
551
            -- Getc --
552
            ----------
553
 
554
            procedure Getc is
555
            begin
556
               if Ptr <= Input'First then
557
                  Past_End;
558
               else
559
                  Ptr := Ptr - 1;
560
                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
561
               end if;
562
            end Getc;
563
 
564
            -------------------
565
            -- Skip_UTF_Byte --
566
            -------------------
567
 
568
            procedure Skip_UTF_Byte is
569
            begin
570
               if (U and 2#11000000#) = 2#10_000000# then
571
                  Getc;
572
               else
573
                  Bad;
574
               end if;
575
            end Skip_UTF_Byte;
576
 
577
         --  Start of processing for UTF-8 case
578
 
579
         begin
580
            --  16#00_0000#-16#00_007F#: 0xxxxxxx
581
 
582
            Getc;
583
 
584
            if (U and 2#10000000#) = 2#00000000# then
585
               return;
586
 
587
            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
588
 
589
            else
590
               Skip_UTF_Byte;
591
 
592
               if (U and 2#11100000#) = 2#110_00000# then
593
                  return;
594
 
595
               --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
596
 
597
               else
598
                  Skip_UTF_Byte;
599
 
600
                  if (U and 2#11110000#) = 2#1110_0000# then
601
                     return;
602
 
603
                     --  Any other code is invalid, note that this includes:
604
 
605
                     --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
606
                     --                           10xxxxxx
607
 
608
                     --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
609
                     --                               10xxxxxx 10xxxxxx
610
                     --                               10xxxxxx
611
 
612
                     --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
613
                     --                               10xxxxxx 10xxxxxx
614
                     --                               10xxxxxx 10xxxxxx
615
 
616
                     --  since Wide_Character does not allow codes > 16#FFFF#
617
 
618
                  else
619
                     Bad;
620
                  end if;
621
               end if;
622
            end if;
623
         end UTF8;
624
 
625
      --  Special efficient encoding for brackets case
626
 
627
      elsif Encoding_Method = WCEM_Brackets then
628
         Brackets : declare
629
            P : Natural;
630
            S : Natural;
631
 
632
         begin
633
            --  See if we have "] at end positions
634
 
635
            if Ptr > Input'First + 1
636
              and then Input (Ptr - 1) = ']'
637
              and then Input (Ptr - 2) = '"'
638
            then
639
               P := Ptr - 2;
640
 
641
               --  Loop back looking for [" at start
642
 
643
               while P >= Ptr - 10 loop
644
                  if P <= Input'First + 1 then
645
                     Bad;
646
 
647
                  elsif Input (P - 1) = '"'
648
                    and then Input (P - 2) = '['
649
                  then
650
                     --  Found ["..."], scan forward to check it
651
 
652
                     S := P - 2;
653
                     P := S;
654
                     Next_Wide_Character (Input, P);
655
 
656
                     --  OK if at original pointer, else error
657
 
658
                     if P = Ptr then
659
                        Ptr := S;
660
                        return;
661
                     else
662
                        Bad;
663
                     end if;
664
                  end if;
665
 
666
                  P := P - 1;
667
               end loop;
668
 
669
               --  Falling through loop means more than 8 chars between the
670
               --  enclosing brackets (or simply a missing left bracket)
671
 
672
               Bad;
673
 
674
            --  Here if no bracket sequence present
675
 
676
            else
677
               if Ptr = Input'First then
678
                  Past_End;
679
               else
680
                  Ptr := Ptr - 1;
681
               end if;
682
            end if;
683
         end Brackets;
684
 
685
      --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
686
      --  go to the start of the string and skip forwards till Ptr matches.
687
 
688
      else
689
         Non_UTF_Brackets : declare
690
            Discard : Wide_Character;
691
            PtrS    : Natural;
692
            PtrP    : Natural;
693
 
694
         begin
695
            PtrS := Input'First;
696
 
697
            if Ptr <= PtrS then
698
               Past_End;
699
            end if;
700
 
701
            loop
702
               PtrP := PtrS;
703
               Decode_Wide_Character (Input, PtrS, Discard);
704
 
705
               if PtrS = Ptr then
706
                  Ptr := PtrP;
707
                  return;
708
 
709
               elsif PtrS > Ptr then
710
                  Bad;
711
               end if;
712
            end loop;
713
 
714
         exception
715
            when Constraint_Error =>
716
               Bad;
717
         end Non_UTF_Brackets;
718
      end if;
719
   end Prev_Wide_Character;
720
 
721
   ------------------------------
722
   -- Prev_Wide_Wide_Character --
723
   ------------------------------
724
 
725
   procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
726
   begin
727
      if Ptr > Input'Last + 1 then
728
         Past_End;
729
      end if;
730
 
731
      --  Special efficient encoding for UTF-8 case
732
 
733
      if Encoding_Method = WCEM_UTF8 then
734
         UTF8 : declare
735
            U : Unsigned_32;
736
 
737
            procedure Getc;
738
            pragma Inline (Getc);
739
            --  Gets the character at Input (Ptr - 1) and returns code in U as
740
            --  Unsigned_32 value. On return Ptr is decremented by one.
741
 
742
            procedure Skip_UTF_Byte;
743
            pragma Inline (Skip_UTF_Byte);
744
            --  Checks that U is 2#10xxxxxx# and then calls Get
745
 
746
            ----------
747
            -- Getc --
748
            ----------
749
 
750
            procedure Getc is
751
            begin
752
               if Ptr <= Input'First then
753
                  Past_End;
754
               else
755
                  Ptr := Ptr - 1;
756
                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
757
               end if;
758
            end Getc;
759
 
760
            -------------------
761
            -- Skip_UTF_Byte --
762
            -------------------
763
 
764
            procedure Skip_UTF_Byte is
765
            begin
766
               if (U and 2#11000000#) = 2#10_000000# then
767
                  Getc;
768
               else
769
                  Bad;
770
               end if;
771
            end Skip_UTF_Byte;
772
 
773
         --  Start of processing for UTF-8 case
774
 
775
         begin
776
            --  16#00_0000#-16#00_007F#: 0xxxxxxx
777
 
778
            Getc;
779
 
780
            if (U and 2#10000000#) = 2#00000000# then
781
               return;
782
 
783
            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
784
 
785
            else
786
               Skip_UTF_Byte;
787
 
788
               if (U and 2#11100000#) = 2#110_00000# then
789
                  return;
790
 
791
               --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
792
 
793
               else
794
                  Skip_UTF_Byte;
795
 
796
                  if (U and 2#11110000#) = 2#1110_0000# then
797
                     return;
798
 
799
                  --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
800
                  --                           10xxxxxx
801
 
802
                  else
803
                     Skip_UTF_Byte;
804
 
805
                     if (U and 2#11111000#) = 2#11110_000# then
806
                        return;
807
 
808
                     --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
809
                     --                               10xxxxxx 10xxxxxx
810
                     --                               10xxxxxx
811
 
812
                     else
813
                        Skip_UTF_Byte;
814
 
815
                        if (U and 2#11111100#) = 2#111110_00# then
816
                           return;
817
 
818
                        --  Any other code is invalid, note that this includes:
819
 
820
                        --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
821
                        --                               10xxxxxx 10xxxxxx
822
                        --                               10xxxxxx 10xxxxxx
823
 
824
                        --  since Wide_Wide_Character does not allow codes
825
                        --  greater than 16#03FF_FFFF#
826
 
827
                        else
828
                           Bad;
829
                        end if;
830
                     end if;
831
                  end if;
832
               end if;
833
            end if;
834
         end UTF8;
835
 
836
      --  Special efficient encoding for brackets case
837
 
838
      elsif Encoding_Method = WCEM_Brackets then
839
         Brackets : declare
840
            P : Natural;
841
            S : Natural;
842
 
843
         begin
844
            --  See if we have "] at end positions
845
 
846
            if Ptr > Input'First + 1
847
              and then Input (Ptr - 1) = ']'
848
              and then Input (Ptr - 2) = '"'
849
            then
850
               P := Ptr - 2;
851
 
852
               --  Loop back looking for [" at start
853
 
854
               while P >= Ptr - 10 loop
855
                  if P <= Input'First + 1 then
856
                     Bad;
857
 
858
                  elsif Input (P - 1) = '"'
859
                    and then Input (P - 2) = '['
860
                  then
861
                     --  Found ["..."], scan forward to check it
862
 
863
                     S := P - 2;
864
                     P := S;
865
                     Next_Wide_Wide_Character (Input, P);
866
 
867
                     --  OK if at original pointer, else error
868
 
869
                     if P = Ptr then
870
                        Ptr := S;
871
                        return;
872
                     else
873
                        Bad;
874
                     end if;
875
                  end if;
876
 
877
                  P := P - 1;
878
               end loop;
879
 
880
               --  Falling through loop means more than 8 chars between the
881
               --  enclosing brackets (or simply a missing left bracket)
882
 
883
               Bad;
884
 
885
            --  Here if no bracket sequence present
886
 
887
            else
888
               if Ptr = Input'First then
889
                  Past_End;
890
               else
891
                  Ptr := Ptr - 1;
892
               end if;
893
            end if;
894
         end Brackets;
895
 
896
      --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
897
      --  go to the start of the string and skip forwards till Ptr matches.
898
 
899
      else
900
         Non_UTF8_Brackets : declare
901
            Discard : Wide_Wide_Character;
902
            PtrS    : Natural;
903
            PtrP    : Natural;
904
 
905
         begin
906
            PtrS := Input'First;
907
 
908
            if Ptr <= PtrS then
909
               Past_End;
910
            end if;
911
 
912
            loop
913
               PtrP := PtrS;
914
               Decode_Wide_Wide_Character (Input, PtrS, Discard);
915
 
916
               if PtrS = Ptr then
917
                  Ptr := PtrP;
918
                  return;
919
 
920
               elsif PtrS > Ptr then
921
                  Bad;
922
               end if;
923
            end loop;
924
 
925
         exception
926
            when Constraint_Error =>
927
               Bad;
928
         end Non_UTF8_Brackets;
929
      end if;
930
   end Prev_Wide_Wide_Character;
931
 
932
   --------------------------
933
   -- Validate_Wide_String --
934
   --------------------------
935
 
936
   function Validate_Wide_String (S : String) return Boolean is
937
      Ptr : Natural;
938
 
939
   begin
940
      Ptr := S'First;
941
      while Ptr <= S'Last loop
942
         Next_Wide_Character (S, Ptr);
943
      end loop;
944
 
945
      return True;
946
 
947
   exception
948
      when Constraint_Error =>
949
         return False;
950
   end Validate_Wide_String;
951
 
952
   -------------------------------
953
   -- Validate_Wide_Wide_String --
954
   -------------------------------
955
 
956
   function Validate_Wide_Wide_String (S : String) return Boolean is
957
      Ptr : Natural;
958
 
959
   begin
960
      Ptr := S'First;
961
      while Ptr <= S'Last loop
962
         Next_Wide_Wide_Character (S, Ptr);
963
      end loop;
964
 
965
      return True;
966
 
967
   exception
968
      when Constraint_Error =>
969
         return False;
970
   end Validate_Wide_Wide_String;
971
 
972
end GNAT.Decode_String;

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.