OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [gcc/] [ada/] [g-decstr.adb] - Blame information for rev 749

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 706 jeremybenn
------------------------------------------------------------------------------
2
--                                                                          --
3
--                         GNAT RUN-TIME COMPONENTS                         --
4
--                                                                          --
5
--                    G N A T . D E C O D E _ S T R I N G                   --
6
--                                                                          --
7
--                                 S p e c                                  --
8
--                                                                          --
9
--                     Copyright (C) 2007-2010, AdaCore                     --
10
--                                                                          --
11
-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12
-- terms of the  GNU General Public License as published  by the Free Soft- --
13
-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
14
-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15
-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16
-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
17
--                                                                          --
18
-- As a special exception under Section 7 of GPL version 3, you are granted --
19
-- additional permissions described in the GCC Runtime Library Exception,   --
20
-- version 3.1, as published by the Free Software Foundation.               --
21
--                                                                          --
22
-- You should have received a copy of the GNU General Public License and    --
23
-- a copy of the GCC Runtime Library Exception along with this program;     --
24
-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
25
-- <http://www.gnu.org/licenses/>.                                          --
26
--                                                                          --
27
-- GNAT was originally developed  by the GNAT team at  New York University. --
28
-- Extensive contributions were provided by Ada Core Technologies Inc.      --
29
--                                                                          --
30
------------------------------------------------------------------------------
31
 
32
--  This package provides a utility routine for converting from an encoded
33
--  string to a corresponding Wide_String or Wide_Wide_String value.
34
 
35
with Interfaces; use Interfaces;
36
 
37
with System.WCh_Cnv; use System.WCh_Cnv;
38
with System.WCh_Con; use System.WCh_Con;
39
 
40
package body GNAT.Decode_String is
41
 
42
   -----------------------
43
   -- Local Subprograms --
44
   -----------------------
45
 
46
   procedure Bad;
47
   pragma No_Return (Bad);
48
   --  Raise error for bad encoding
49
 
50
   procedure Past_End;
51
   pragma No_Return (Past_End);
52
   --  Raise error for off end of string
53
 
54
   ---------
55
   -- Bad --
56
   ---------
57
 
58
   procedure Bad is
59
   begin
60
      raise Constraint_Error with
61
        "bad encoding or character out of range";
62
   end Bad;
63
 
64
   ---------------------------
65
   -- Decode_Wide_Character --
66
   ---------------------------
67
 
68
   procedure Decode_Wide_Character
69
     (Input  : String;
70
      Ptr    : in out Natural;
71
      Result : out Wide_Character)
72
   is
73
      Char : Wide_Wide_Character;
74
   begin
75
      Decode_Wide_Wide_Character (Input, Ptr, Char);
76
 
77
      if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
78
         Bad;
79
      else
80
         Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
81
      end if;
82
   end Decode_Wide_Character;
83
 
84
   ------------------------
85
   -- Decode_Wide_String --
86
   ------------------------
87
 
88
   function Decode_Wide_String (S : String) return Wide_String is
89
      Result : Wide_String (1 .. S'Length);
90
      Length : Natural;
91
   begin
92
      Decode_Wide_String (S, Result, Length);
93
      return Result (1 .. Length);
94
   end Decode_Wide_String;
95
 
96
   procedure Decode_Wide_String
97
     (S      : String;
98
      Result : out Wide_String;
99
      Length : out Natural)
100
   is
101
      Ptr : Natural;
102
 
103
   begin
104
      Ptr := S'First;
105
      Length := 0;
106
      while Ptr <= S'Last loop
107
         if Length >= Result'Last then
108
            Past_End;
109
         end if;
110
 
111
         Length := Length + 1;
112
         Decode_Wide_Character (S, Ptr, Result (Length));
113
      end loop;
114
   end Decode_Wide_String;
115
 
116
   --------------------------------
117
   -- Decode_Wide_Wide_Character --
118
   --------------------------------
119
 
120
   procedure Decode_Wide_Wide_Character
121
     (Input  : String;
122
      Ptr    : in out Natural;
123
      Result : out Wide_Wide_Character)
124
   is
125
      C : Character;
126
 
127
      function In_Char return Character;
128
      pragma Inline (In_Char);
129
      --  Function to get one input character
130
 
131
      -------------
132
      -- In_Char --
133
      -------------
134
 
135
      function In_Char return Character is
136
      begin
137
         if Ptr <= Input'Last then
138
            Ptr := Ptr + 1;
139
            return Input (Ptr - 1);
140
         else
141
            Past_End;
142
         end if;
143
      end In_Char;
144
 
145
   --  Start of processing for Decode_Wide_Wide_Character
146
 
147
   begin
148
      C := In_Char;
149
 
150
      --  Special fast processing for UTF-8 case
151
 
152
      if Encoding_Method = WCEM_UTF8 then
153
         UTF8 : declare
154
            U : Unsigned_32;
155
            W : Unsigned_32;
156
 
157
            procedure Get_UTF_Byte;
158
            pragma Inline (Get_UTF_Byte);
159
            --  Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
160
            --  Reads a byte, and raises CE if the first two bits are not 10.
161
            --  Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
162
 
163
            ------------------
164
            -- Get_UTF_Byte --
165
            ------------------
166
 
167
            procedure Get_UTF_Byte is
168
            begin
169
               U := Unsigned_32 (Character'Pos (In_Char));
170
 
171
               if (U and 2#11000000#) /= 2#10_000000# then
172
                  Bad;
173
               end if;
174
 
175
               W := Shift_Left (W, 6) or (U and 2#00111111#);
176
            end Get_UTF_Byte;
177
 
178
         --  Start of processing for UTF8 case
179
 
180
         begin
181
            --  Note: for details of UTF8 encoding see RFC 3629
182
 
183
            U := Unsigned_32 (Character'Pos (C));
184
 
185
            --  16#00_0000#-16#00_007F#: 0xxxxxxx
186
 
187
            if (U and 2#10000000#) = 2#00000000# then
188
               Result := Wide_Wide_Character'Val (Character'Pos (C));
189
 
190
            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
191
 
192
            elsif (U and 2#11100000#) = 2#110_00000# then
193
               W := U and 2#00011111#;
194
               Get_UTF_Byte;
195
               Result := Wide_Wide_Character'Val (W);
196
 
197
            --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
198
 
199
            elsif (U and 2#11110000#) = 2#1110_0000# then
200
               W := U and 2#00001111#;
201
               Get_UTF_Byte;
202
               Get_UTF_Byte;
203
               Result := Wide_Wide_Character'Val (W);
204
 
205
            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
206
 
207
            elsif (U and 2#11111000#) = 2#11110_000# then
208
               W := U and 2#00000111#;
209
 
210
               for K in 1 .. 3 loop
211
                  Get_UTF_Byte;
212
               end loop;
213
 
214
               Result := Wide_Wide_Character'Val (W);
215
 
216
            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
217
            --                               10xxxxxx 10xxxxxx
218
 
219
            elsif (U and 2#11111100#) = 2#111110_00# then
220
               W := U and 2#00000011#;
221
 
222
               for K in 1 .. 4 loop
223
                  Get_UTF_Byte;
224
               end loop;
225
 
226
               Result := Wide_Wide_Character'Val (W);
227
 
228
            --  All other cases are invalid, note that this includes:
229
 
230
            --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
231
            --                               10xxxxxx 10xxxxxx 10xxxxxx
232
 
233
            --  since Wide_Wide_Character does not include code values
234
            --  greater than 16#03FF_FFFF#.
235
 
236
            else
237
               Bad;
238
            end if;
239
         end UTF8;
240
 
241
      --  All encoding functions other than UTF-8
242
 
243
      else
244
         Non_UTF8 : declare
245
            function Char_Sequence_To_UTF is
246
              new Char_Sequence_To_UTF_32 (In_Char);
247
 
248
         begin
249
            --  For brackets, must test for specific case of [ not followed by
250
            --  quotation, where we must not call Char_Sequence_To_UTF, but
251
            --  instead just return the bracket unchanged.
252
 
253
            if Encoding_Method = WCEM_Brackets
254
              and then C = '['
255
              and then (Ptr > Input'Last or else Input (Ptr) /= '"')
256
            then
257
               Result := '[';
258
 
259
            --  All other cases including [" with Brackets
260
 
261
            else
262
               Result :=
263
                 Wide_Wide_Character'Val
264
                   (Char_Sequence_To_UTF (C, Encoding_Method));
265
            end if;
266
         end Non_UTF8;
267
      end if;
268
   end Decode_Wide_Wide_Character;
269
 
270
   -----------------------------
271
   -- Decode_Wide_Wide_String --
272
   -----------------------------
273
 
274
   function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
275
      Result : Wide_Wide_String (1 .. S'Length);
276
      Length : Natural;
277
   begin
278
      Decode_Wide_Wide_String (S, Result, Length);
279
      return Result (1 .. Length);
280
   end Decode_Wide_Wide_String;
281
 
282
   procedure Decode_Wide_Wide_String
283
     (S      : String;
284
      Result : out Wide_Wide_String;
285
      Length : out Natural)
286
   is
287
      Ptr : Natural;
288
 
289
   begin
290
      Ptr := S'First;
291
      Length := 0;
292
      while Ptr <= S'Last loop
293
         if Length >= Result'Last then
294
            Past_End;
295
         end if;
296
 
297
         Length := Length + 1;
298
         Decode_Wide_Wide_Character (S, Ptr, Result (Length));
299
      end loop;
300
   end Decode_Wide_Wide_String;
301
 
302
   -------------------------
303
   -- Next_Wide_Character --
304
   -------------------------
305
 
306
   procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
307
   begin
308
      if Ptr < Input'First then
309
         Past_End;
310
      end if;
311
 
312
      --  Special efficient encoding for UTF-8 case
313
 
314
      if Encoding_Method = WCEM_UTF8 then
315
         UTF8 : declare
316
            U : Unsigned_32;
317
 
318
            procedure Getc;
319
            pragma Inline (Getc);
320
            --  Gets the character at Input (Ptr) and returns code in U as
321
            --  Unsigned_32 value. On return Ptr is bumped past the character.
322
 
323
            procedure Skip_UTF_Byte;
324
            pragma Inline (Skip_UTF_Byte);
325
            --  Skips past one encoded byte which must be 2#10xxxxxx#
326
 
327
            ----------
328
            -- Getc --
329
            ----------
330
 
331
            procedure Getc is
332
            begin
333
               if Ptr > Input'Last then
334
                  Past_End;
335
               else
336
                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
337
                  Ptr := Ptr + 1;
338
               end if;
339
            end Getc;
340
 
341
            -------------------
342
            -- Skip_UTF_Byte --
343
            -------------------
344
 
345
            procedure Skip_UTF_Byte is
346
            begin
347
               Getc;
348
 
349
               if (U and 2#11000000#) /= 2#10_000000# then
350
                  Bad;
351
               end if;
352
            end Skip_UTF_Byte;
353
 
354
         --  Start of processing for UTF-8 case
355
 
356
         begin
357
            --  16#00_0000#-16#00_007F#: 0xxxxxxx
358
 
359
            Getc;
360
 
361
            if (U and 2#10000000#) = 2#00000000# then
362
               return;
363
 
364
            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
365
 
366
            elsif (U and 2#11100000#) = 2#110_00000# then
367
               Skip_UTF_Byte;
368
 
369
            --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
370
 
371
            elsif (U and 2#11110000#) = 2#1110_0000# then
372
               Skip_UTF_Byte;
373
               Skip_UTF_Byte;
374
 
375
            --  Any other code is invalid, note that this includes:
376
 
377
            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
378
 
379
            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
380
            --                               10xxxxxx 10xxxxxx
381
 
382
            --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
383
            --                               10xxxxxx 10xxxxxx 10xxxxxx
384
 
385
            --  since Wide_Character does not allow codes > 16#FFFF#
386
 
387
            else
388
               Bad;
389
            end if;
390
         end UTF8;
391
 
392
      --  Non-UTF-8 case
393
 
394
      else
395
         declare
396
            Discard : Wide_Character;
397
         begin
398
            Decode_Wide_Character (Input, Ptr, Discard);
399
         end;
400
      end if;
401
   end Next_Wide_Character;
402
 
403
   ------------------------------
404
   -- Next_Wide_Wide_Character --
405
   ------------------------------
406
 
407
   procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
408
   begin
409
      --  Special efficient encoding for UTF-8 case
410
 
411
      if Encoding_Method = WCEM_UTF8 then
412
         UTF8 : declare
413
            U : Unsigned_32;
414
 
415
            procedure Getc;
416
            pragma Inline (Getc);
417
            --  Gets the character at Input (Ptr) and returns code in U as
418
            --  Unsigned_32 value. On return Ptr is bumped past the character.
419
 
420
            procedure Skip_UTF_Byte;
421
            pragma Inline (Skip_UTF_Byte);
422
            --  Skips past one encoded byte which must be 2#10xxxxxx#
423
 
424
            ----------
425
            -- Getc --
426
            ----------
427
 
428
            procedure Getc is
429
            begin
430
               if Ptr > Input'Last then
431
                  Past_End;
432
               else
433
                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
434
                  Ptr := Ptr + 1;
435
               end if;
436
            end Getc;
437
 
438
            -------------------
439
            -- Skip_UTF_Byte --
440
            -------------------
441
 
442
            procedure Skip_UTF_Byte is
443
            begin
444
               Getc;
445
 
446
               if (U and 2#11000000#) /= 2#10_000000# then
447
                  Bad;
448
               end if;
449
            end Skip_UTF_Byte;
450
 
451
         --  Start of processing for UTF-8 case
452
 
453
         begin
454
            if Ptr < Input'First then
455
               Past_End;
456
            end if;
457
 
458
            --  16#00_0000#-16#00_007F#: 0xxxxxxx
459
 
460
            Getc;
461
 
462
            if (U and 2#10000000#) = 2#00000000# then
463
               null;
464
 
465
            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
466
 
467
            elsif (U and 2#11100000#) = 2#110_00000# then
468
               Skip_UTF_Byte;
469
 
470
            --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
471
 
472
            elsif (U and 2#11110000#) = 2#1110_0000# then
473
               Skip_UTF_Byte;
474
               Skip_UTF_Byte;
475
 
476
            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
477
 
478
            elsif (U and 2#11111000#) = 2#11110_000# then
479
               for K in 1 .. 3 loop
480
                  Skip_UTF_Byte;
481
               end loop;
482
 
483
            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
484
            --                               10xxxxxx 10xxxxxx
485
 
486
            elsif (U and 2#11111100#) = 2#111110_00# then
487
               for K in 1 .. 4 loop
488
                  Skip_UTF_Byte;
489
               end loop;
490
 
491
            --  Any other code is invalid, note that this includes:
492
 
493
            --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
494
            --                               10xxxxxx 10xxxxxx 10xxxxxx
495
 
496
            --  since Wide_Wide_Character does not allow codes > 16#03FF_FFFF#
497
 
498
            else
499
               Bad;
500
            end if;
501
         end UTF8;
502
 
503
      --  Non-UTF-8 case
504
 
505
      else
506
         declare
507
            Discard : Wide_Wide_Character;
508
         begin
509
            Decode_Wide_Wide_Character (Input, Ptr, Discard);
510
         end;
511
      end if;
512
   end Next_Wide_Wide_Character;
513
 
514
   --------------
515
   -- Past_End --
516
   --------------
517
 
518
   procedure Past_End is
519
   begin
520
      raise Constraint_Error with "past end of string";
521
   end Past_End;
522
 
523
   -------------------------
524
   -- Prev_Wide_Character --
525
   -------------------------
526
 
527
   procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
528
   begin
529
      if Ptr > Input'Last + 1 then
530
         Past_End;
531
      end if;
532
 
533
      --  Special efficient encoding for UTF-8 case
534
 
535
      if Encoding_Method = WCEM_UTF8 then
536
         UTF8 : declare
537
            U : Unsigned_32;
538
 
539
            procedure Getc;
540
            pragma Inline (Getc);
541
            --  Gets the character at Input (Ptr - 1) and returns code in U as
542
            --  Unsigned_32 value. On return Ptr is decremented by one.
543
 
544
            procedure Skip_UTF_Byte;
545
            pragma Inline (Skip_UTF_Byte);
546
            --  Checks that U is 2#10xxxxxx# and then calls Get
547
 
548
            ----------
549
            -- Getc --
550
            ----------
551
 
552
            procedure Getc is
553
            begin
554
               if Ptr <= Input'First then
555
                  Past_End;
556
               else
557
                  Ptr := Ptr - 1;
558
                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
559
               end if;
560
            end Getc;
561
 
562
            -------------------
563
            -- Skip_UTF_Byte --
564
            -------------------
565
 
566
            procedure Skip_UTF_Byte is
567
            begin
568
               if (U and 2#11000000#) = 2#10_000000# then
569
                  Getc;
570
               else
571
                  Bad;
572
               end if;
573
            end Skip_UTF_Byte;
574
 
575
         --  Start of processing for UTF-8 case
576
 
577
         begin
578
            --  16#00_0000#-16#00_007F#: 0xxxxxxx
579
 
580
            Getc;
581
 
582
            if (U and 2#10000000#) = 2#00000000# then
583
               return;
584
 
585
            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
586
 
587
            else
588
               Skip_UTF_Byte;
589
 
590
               if (U and 2#11100000#) = 2#110_00000# then
591
                  return;
592
 
593
               --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
594
 
595
               else
596
                  Skip_UTF_Byte;
597
 
598
                  if (U and 2#11110000#) = 2#1110_0000# then
599
                     return;
600
 
601
                     --  Any other code is invalid, note that this includes:
602
 
603
                     --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
604
                     --                           10xxxxxx
605
 
606
                     --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
607
                     --                               10xxxxxx 10xxxxxx
608
                     --                               10xxxxxx
609
 
610
                     --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
611
                     --                               10xxxxxx 10xxxxxx
612
                     --                               10xxxxxx 10xxxxxx
613
 
614
                     --  since Wide_Character does not allow codes > 16#FFFF#
615
 
616
                  else
617
                     Bad;
618
                  end if;
619
               end if;
620
            end if;
621
         end UTF8;
622
 
623
      --  Special efficient encoding for brackets case
624
 
625
      elsif Encoding_Method = WCEM_Brackets then
626
         Brackets : declare
627
            P : Natural;
628
            S : Natural;
629
 
630
         begin
631
            --  See if we have "] at end positions
632
 
633
            if Ptr > Input'First + 1
634
              and then Input (Ptr - 1) = ']'
635
              and then Input (Ptr - 2) = '"'
636
            then
637
               P := Ptr - 2;
638
 
639
               --  Loop back looking for [" at start
640
 
641
               while P >= Ptr - 10 loop
642
                  if P <= Input'First + 1 then
643
                     Bad;
644
 
645
                  elsif Input (P - 1) = '"'
646
                    and then Input (P - 2) = '['
647
                  then
648
                     --  Found ["..."], scan forward to check it
649
 
650
                     S := P - 2;
651
                     P := S;
652
                     Next_Wide_Character (Input, P);
653
 
654
                     --  OK if at original pointer, else error
655
 
656
                     if P = Ptr then
657
                        Ptr := S;
658
                        return;
659
                     else
660
                        Bad;
661
                     end if;
662
                  end if;
663
 
664
                  P := P - 1;
665
               end loop;
666
 
667
               --  Falling through loop means more than 8 chars between the
668
               --  enclosing brackets (or simply a missing left bracket)
669
 
670
               Bad;
671
 
672
            --  Here if no bracket sequence present
673
 
674
            else
675
               if Ptr = Input'First then
676
                  Past_End;
677
               else
678
                  Ptr := Ptr - 1;
679
               end if;
680
            end if;
681
         end Brackets;
682
 
683
      --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
684
      --  go to the start of the string and skip forwards till Ptr matches.
685
 
686
      else
687
         Non_UTF_Brackets : declare
688
            Discard : Wide_Character;
689
            PtrS    : Natural;
690
            PtrP    : Natural;
691
 
692
         begin
693
            PtrS := Input'First;
694
 
695
            if Ptr <= PtrS then
696
               Past_End;
697
            end if;
698
 
699
            loop
700
               PtrP := PtrS;
701
               Decode_Wide_Character (Input, PtrS, Discard);
702
 
703
               if PtrS = Ptr then
704
                  Ptr := PtrP;
705
                  return;
706
 
707
               elsif PtrS > Ptr then
708
                  Bad;
709
               end if;
710
            end loop;
711
 
712
         exception
713
            when Constraint_Error =>
714
               Bad;
715
         end Non_UTF_Brackets;
716
      end if;
717
   end Prev_Wide_Character;
718
 
719
   ------------------------------
720
   -- Prev_Wide_Wide_Character --
721
   ------------------------------
722
 
723
   procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
724
   begin
725
      if Ptr > Input'Last + 1 then
726
         Past_End;
727
      end if;
728
 
729
      --  Special efficient encoding for UTF-8 case
730
 
731
      if Encoding_Method = WCEM_UTF8 then
732
         UTF8 : declare
733
            U : Unsigned_32;
734
 
735
            procedure Getc;
736
            pragma Inline (Getc);
737
            --  Gets the character at Input (Ptr - 1) and returns code in U as
738
            --  Unsigned_32 value. On return Ptr is decremented by one.
739
 
740
            procedure Skip_UTF_Byte;
741
            pragma Inline (Skip_UTF_Byte);
742
            --  Checks that U is 2#10xxxxxx# and then calls Get
743
 
744
            ----------
745
            -- Getc --
746
            ----------
747
 
748
            procedure Getc is
749
            begin
750
               if Ptr <= Input'First then
751
                  Past_End;
752
               else
753
                  Ptr := Ptr - 1;
754
                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
755
               end if;
756
            end Getc;
757
 
758
            -------------------
759
            -- Skip_UTF_Byte --
760
            -------------------
761
 
762
            procedure Skip_UTF_Byte is
763
            begin
764
               if (U and 2#11000000#) = 2#10_000000# then
765
                  Getc;
766
               else
767
                  Bad;
768
               end if;
769
            end Skip_UTF_Byte;
770
 
771
         --  Start of processing for UTF-8 case
772
 
773
         begin
774
            --  16#00_0000#-16#00_007F#: 0xxxxxxx
775
 
776
            Getc;
777
 
778
            if (U and 2#10000000#) = 2#00000000# then
779
               return;
780
 
781
            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
782
 
783
            else
784
               Skip_UTF_Byte;
785
 
786
               if (U and 2#11100000#) = 2#110_00000# then
787
                  return;
788
 
789
               --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
790
 
791
               else
792
                  Skip_UTF_Byte;
793
 
794
                  if (U and 2#11110000#) = 2#1110_0000# then
795
                     return;
796
 
797
                  --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
798
                  --                           10xxxxxx
799
 
800
                  else
801
                     Skip_UTF_Byte;
802
 
803
                     if (U and 2#11111000#) = 2#11110_000# then
804
                        return;
805
 
806
                     --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
807
                     --                               10xxxxxx 10xxxxxx
808
                     --                               10xxxxxx
809
 
810
                     else
811
                        Skip_UTF_Byte;
812
 
813
                        if (U and 2#11111100#) = 2#111110_00# then
814
                           return;
815
 
816
                        --  Any other code is invalid, note that this includes:
817
 
818
                        --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
819
                        --                               10xxxxxx 10xxxxxx
820
                        --                               10xxxxxx 10xxxxxx
821
 
822
                        --  since Wide_Wide_Character does not allow codes
823
                        --  greater than 16#03FF_FFFF#
824
 
825
                        else
826
                           Bad;
827
                        end if;
828
                     end if;
829
                  end if;
830
               end if;
831
            end if;
832
         end UTF8;
833
 
834
      --  Special efficient encoding for brackets case
835
 
836
      elsif Encoding_Method = WCEM_Brackets then
837
         Brackets : declare
838
            P : Natural;
839
            S : Natural;
840
 
841
         begin
842
            --  See if we have "] at end positions
843
 
844
            if Ptr > Input'First + 1
845
              and then Input (Ptr - 1) = ']'
846
              and then Input (Ptr - 2) = '"'
847
            then
848
               P := Ptr - 2;
849
 
850
               --  Loop back looking for [" at start
851
 
852
               while P >= Ptr - 10 loop
853
                  if P <= Input'First + 1 then
854
                     Bad;
855
 
856
                  elsif Input (P - 1) = '"'
857
                    and then Input (P - 2) = '['
858
                  then
859
                     --  Found ["..."], scan forward to check it
860
 
861
                     S := P - 2;
862
                     P := S;
863
                     Next_Wide_Wide_Character (Input, P);
864
 
865
                     --  OK if at original pointer, else error
866
 
867
                     if P = Ptr then
868
                        Ptr := S;
869
                        return;
870
                     else
871
                        Bad;
872
                     end if;
873
                  end if;
874
 
875
                  P := P - 1;
876
               end loop;
877
 
878
               --  Falling through loop means more than 8 chars between the
879
               --  enclosing brackets (or simply a missing left bracket)
880
 
881
               Bad;
882
 
883
            --  Here if no bracket sequence present
884
 
885
            else
886
               if Ptr = Input'First then
887
                  Past_End;
888
               else
889
                  Ptr := Ptr - 1;
890
               end if;
891
            end if;
892
         end Brackets;
893
 
894
      --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
895
      --  go to the start of the string and skip forwards till Ptr matches.
896
 
897
      else
898
         Non_UTF8_Brackets : declare
899
            Discard : Wide_Wide_Character;
900
            PtrS    : Natural;
901
            PtrP    : Natural;
902
 
903
         begin
904
            PtrS := Input'First;
905
 
906
            if Ptr <= PtrS then
907
               Past_End;
908
            end if;
909
 
910
            loop
911
               PtrP := PtrS;
912
               Decode_Wide_Wide_Character (Input, PtrS, Discard);
913
 
914
               if PtrS = Ptr then
915
                  Ptr := PtrP;
916
                  return;
917
 
918
               elsif PtrS > Ptr then
919
                  Bad;
920
               end if;
921
            end loop;
922
 
923
         exception
924
            when Constraint_Error =>
925
               Bad;
926
         end Non_UTF8_Brackets;
927
      end if;
928
   end Prev_Wide_Wide_Character;
929
 
930
   --------------------------
931
   -- Validate_Wide_String --
932
   --------------------------
933
 
934
   function Validate_Wide_String (S : String) return Boolean is
935
      Ptr : Natural;
936
 
937
   begin
938
      Ptr := S'First;
939
      while Ptr <= S'Last loop
940
         Next_Wide_Character (S, Ptr);
941
      end loop;
942
 
943
      return True;
944
 
945
   exception
946
      when Constraint_Error =>
947
         return False;
948
   end Validate_Wide_String;
949
 
950
   -------------------------------
951
   -- Validate_Wide_Wide_String --
952
   -------------------------------
953
 
954
   function Validate_Wide_Wide_String (S : String) return Boolean is
955
      Ptr : Natural;
956
 
957
   begin
958
      Ptr := S'First;
959
      while Ptr <= S'Last loop
960
         Next_Wide_Wide_Character (S, Ptr);
961
      end loop;
962
 
963
      return True;
964
 
965
   exception
966
      when Constraint_Error =>
967
         return False;
968
   end Validate_Wide_Wide_String;
969
 
970
end GNAT.Decode_String;

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.