1 |
706 |
jeremybenn |
------------------------------------------------------------------------------
|
2 |
|
|
-- --
|
3 |
|
|
-- GNAT RUN-TIME COMPONENTS --
|
4 |
|
|
-- --
|
5 |
|
|
-- A D A . S T R I N G S . U T F _ E N C O D I N G --
|
6 |
|
|
-- --
|
7 |
|
|
-- S p e c --
|
8 |
|
|
-- --
|
9 |
|
|
-- This specification is derived from the Ada Reference Manual for use with --
|
10 |
|
|
-- GNAT. The copyright notice above, and the license provisions that follow --
|
11 |
|
|
-- apply solely to the contents of the part following the private keyword. --
|
12 |
|
|
-- --
|
13 |
|
|
-- GNAT is free software; you can redistribute it and/or modify it under --
|
14 |
|
|
-- terms of the GNU General Public License as published by the Free Soft- --
|
15 |
|
|
-- ware Foundation; either version 3, or (at your option) any later ver- --
|
16 |
|
|
-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
|
17 |
|
|
-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
|
18 |
|
|
-- or FITNESS FOR A PARTICULAR PURPOSE. --
|
19 |
|
|
-- --
|
20 |
|
|
-- As a special exception under Section 7 of GPL version 3, you are granted --
|
21 |
|
|
-- additional permissions described in the GCC Runtime Library Exception, --
|
22 |
|
|
-- version 3.1, as published by the Free Software Foundation. --
|
23 |
|
|
-- --
|
24 |
|
|
-- You should have received a copy of the GNU General Public License and --
|
25 |
|
|
-- a copy of the GCC Runtime Library Exception along with this program; --
|
26 |
|
|
-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
|
27 |
|
|
-- <http://www.gnu.org/licenses/>. --
|
28 |
|
|
-- --
|
29 |
|
|
-- GNAT was originally developed by the GNAT team at New York University. --
|
30 |
|
|
-- Extensive contributions were provided by Ada Core Technologies Inc. --
|
31 |
|
|
-- --
|
32 |
|
|
------------------------------------------------------------------------------
|
33 |
|
|
|
34 |
|
|
-- This is one of the Ada 2012 packages defined in AI05-0137-1. It is a parent
|
35 |
|
|
-- package that contains declarations used in the child packages for handling
|
36 |
|
|
-- UTF encoded strings. Note: this package is consistent with Ada 95, and may
|
37 |
|
|
-- be used in Ada 95 or Ada 2005 mode.
|
38 |
|
|
|
39 |
|
|
with Interfaces;
|
40 |
|
|
with Unchecked_Conversion;
|
41 |
|
|
|
42 |
|
|
-- Declares the string subtypes, the Encoding_Scheme type, the BOM constants
-- and the Encoding_Error exception used for handling UTF encoded strings.
package Ada.Strings.UTF_Encoding is
|
43 |
|
|
pragma Pure (UTF_Encoding);
|
44 |
|
|
|
45 |
|
|
subtype UTF_String is String;
|
46 |
|
|
-- Used to represent a string of 8-bit values containing a sequence of
|
47 |
|
|
-- values encoded in one of three ways (UTF-8, UTF-16BE, or UTF-16LE).
|
48 |
|
|
-- Typically used in connection with a Scheme parameter indicating which
|
49 |
|
|
-- of the encodings applies. This is not strictly a String value in the
|
50 |
|
|
-- sense defined in the Ada RM, but in practice type String accommodates
|
51 |
|
|
-- all possible 256 codes, and can be used to hold any sequence of 8-bit
|
52 |
|
|
-- codes. We use String directly rather than create a new type so that
|
53 |
|
|
-- all existing facilities for manipulating type String (e.g. the child
|
54 |
|
|
-- packages of Ada.Strings) are available for manipulation of UTF_Strings.
|
55 |
|
|
|
56 |
|
|
type Encoding_Scheme is (UTF_8, UTF_16BE, UTF_16LE);
|
57 |
|
|
-- Used to specify which of three possible encodings applies to a UTF_String
|
58 |
|
|
|
59 |
|
|
subtype UTF_8_String is String;
|
60 |
|
|
-- Similar to UTF_String but specifically represents a UTF-8 encoded string
|
61 |
|
|
|
62 |
|
|
subtype UTF_16_Wide_String is Wide_String;
|
63 |
|
|
-- This is similar to UTF_8_String but is used to represent a Wide_String
|
64 |
|
|
-- value which is a sequence of 16-bit values encoded using UTF-16. Again
|
65 |
|
|
-- this is not strictly a Wide_String in the sense of the Ada RM, but the
|
66 |
|
|
-- type Wide_String can be used to represent a sequence of arbitrary 16-bit
|
67 |
|
|
-- values, and it is more convenient to use Wide_String than a new type.
|
68 |
|
|
|
69 |
|
|
Encoding_Error : exception;
|
70 |
|
|
-- This exception is raised in the following situations:
|
71 |
|
|
-- a) A UTF encoded string contains an invalid encoding sequence
|
72 |
|
|
-- b) A UTF-16BE or UTF-16LE input string has an odd length
|
73 |
|
|
-- c) An incorrect character value is present in the Input string
|
74 |
|
|
-- d) The result for a Wide_Character output exceeds 16#FFFF#
|
75 |
|
|
-- The exception message has the index value where the error occurred.
|
76 |
|
|
|
77 |
|
|
-- The BOM (BYTE_ORDER_MARK) values defined here are used at the start of
|
78 |
|
|
-- a string to indicate the encoding. The convention in this package is
|
79 |
|
|
-- that on input a correct BOM is ignored and an incorrect BOM causes an
|
80 |
|
|
-- Encoding_Error exception. On output, the output string may or may not
|
81 |
|
|
-- include a BOM depending on the setting of Output_BOM.
|
82 |
|
|
|
83 |
|
|
BOM_8 : constant UTF_8_String :=
|
84 |
|
|
Character'Val (16#EF#) &
|
85 |
|
|
Character'Val (16#BB#) &
|
86 |
|
|
Character'Val (16#BF#);
-- BOM for UTF-8: the three 8-bit codes 16#EF#, 16#BB#, 16#BF#
|
87 |
|
|
|
88 |
|
|
BOM_16BE : constant UTF_String :=
|
89 |
|
|
Character'Val (16#FE#) &
|
90 |
|
|
Character'Val (16#FF#);
-- BOM for UTF-16BE: the two 8-bit codes 16#FE#, 16#FF#
|
91 |
|
|
|
92 |
|
|
BOM_16LE : constant UTF_String :=
|
93 |
|
|
Character'Val (16#FF#) &
|
94 |
|
|
Character'Val (16#FE#);
-- BOM for UTF-16LE: the two 8-bit codes 16#FF#, 16#FE#
|
95 |
|
|
|
96 |
|
|
BOM_16 : constant UTF_16_Wide_String :=
|
97 |
|
|
(1 => Wide_Character'Val (16#FEFF#));
-- BOM for UTF-16: a single 16-bit code 16#FEFF#
|
98 |
|
|
|
99 |
|
|
function Encoding
|
100 |
|
|
(Item : UTF_String;
|
101 |
|
|
Default : Encoding_Scheme := UTF_8) return Encoding_Scheme;
|
102 |
|
|
-- This function inspects a UTF_String value to determine whether it
|
103 |
|
|
-- starts with a BOM for UTF-8, UTF-16BE, or UTF-16LE. If so, the result
|
104 |
|
|
-- is the scheme corresponding to the BOM. If no valid BOM is present
|
105 |
|
|
-- then the result is the specified Default value.
|
106 |
|
|
|
107 |
|
|
private
|
108 |
|
|
function To_Unsigned_8 is new
|
109 |
|
|
Unchecked_Conversion (Character, Interfaces.Unsigned_8);
-- Unchecked conversion from Character to Interfaces.Unsigned_8
|
110 |
|
|
|
111 |
|
|
function To_Unsigned_16 is new
|
112 |
|
|
Unchecked_Conversion (Wide_Character, Interfaces.Unsigned_16);
-- Unchecked conversion from Wide_Character to Interfaces.Unsigned_16
|
113 |
|
|
|
114 |
|
|
function To_Unsigned_32 is new
|
115 |
|
|
Unchecked_Conversion (Wide_Wide_Character, Interfaces.Unsigned_32);
-- Unchecked conversion from Wide_Wide_Character to Interfaces.Unsigned_32
|
116 |
|
|
|
117 |
|
|
subtype UTF_XE_Encoding is Encoding_Scheme range UTF_16BE .. UTF_16LE;
|
118 |
|
|
-- Subtype containing only UTF_16BE and UTF_16LE entries
|
119 |
|
|
|
120 |
|
|
-- Utility routines for converting between UTF-16 and UTF-16LE/BE
|
121 |
|
|
|
122 |
|
|
function From_UTF_16
|
123 |
|
|
(Item : UTF_16_Wide_String;
|
124 |
|
|
Output_Scheme : UTF_XE_Encoding;
|
125 |
|
|
Output_BOM : Boolean := False) return UTF_String;
|
126 |
|
|
-- The input string Item is encoded in UTF-16. The output is encoded using
|
127 |
|
|
-- Output_Scheme (which is either UTF-16LE or UTF-16BE). There are no error
|
128 |
|
|
-- cases. The output starts with BOM_16BE/LE if Output_BOM is True.
|
129 |
|
|
|
130 |
|
|
function To_UTF_16
|
131 |
|
|
(Item : UTF_String;
|
132 |
|
|
Input_Scheme : UTF_XE_Encoding;
|
133 |
|
|
Output_BOM : Boolean := False) return UTF_16_Wide_String;
|
134 |
|
|
-- The input string Item is encoded using Input_Scheme which is either
|
135 |
|
|
-- UTF-16LE or UTF-16BE. The output is the corresponding UTF_16 wide
|
136 |
|
|
-- string. Encoding_Error is raised if the length of the input is odd.
|
137 |
|
|
-- The output starts with BOM_16 if Output_BOM is True.
|
138 |
|
|
|
139 |
|
|
procedure Raise_Encoding_Error (Index : Natural);
|
140 |
|
|
pragma No_Return (Raise_Encoding_Error);
|
141 |
|
|
-- Raise Encoding_Error exception for bad encoding in input item. The
|
142 |
|
|
-- parameter Index is the index of the location in Item for the error.
|
143 |
|
|
|
144 |
|
|
end Ada.Strings.UTF_Encoding;
|