OpenCores
URL https://opencores.org/ocsvn/scarts/scarts/trunk

Subversion Repositories scarts

[/] [scarts/] [trunk/] [toolchain/] [scarts-gcc/] [gcc-4.1.1/] [libjava/] [classpath/] [gnu/] [java/] [nio/] [charset/] [UTF_8.java] - Blame information for rev 14

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 14 jlechner
/* UTF_8.java --
2
   Copyright (C) 2002, 2004, 2005  Free Software Foundation, Inc.
3
 
4
This file is part of GNU Classpath.
5
 
6
GNU Classpath is free software; you can redistribute it and/or modify
7
it under the terms of the GNU General Public License as published by
8
the Free Software Foundation; either version 2, or (at your option)
9
any later version.
10
 
11
GNU Classpath is distributed in the hope that it will be useful, but
12
WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
General Public License for more details.
15
 
16
You should have received a copy of the GNU General Public License
17
along with GNU Classpath; see the file COPYING.  If not, write to the
18
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
02110-1301 USA.
20
 
21
Linking this library statically or dynamically with other modules is
22
making a combined work based on this library.  Thus, the terms and
23
conditions of the GNU General Public License cover the whole
24
combination.
25
 
26
As a special exception, the copyright holders of this library give you
27
permission to link this library with independent modules to produce an
28
executable, regardless of the license terms of these independent
29
modules, and to copy and distribute the resulting executable under
30
terms of your choice, provided that you also meet, for each linked
31
independent module, the terms and conditions of the license of that
32
module.  An independent module is a module which is not derived from
33
or based on this library.  If you modify this library, you may extend
34
this exception to your version of the library, but you are not
35
obligated to do so.  If you do not wish to do so, delete this
36
exception statement from your version. */
37
 
38
package gnu.java.nio.charset;
39
 
40
import java.nio.ByteBuffer;
41
import java.nio.CharBuffer;
42
import java.nio.charset.Charset;
43
import java.nio.charset.CharsetDecoder;
44
import java.nio.charset.CharsetEncoder;
45
import java.nio.charset.CoderResult;
46
 
47
/**
48
 * UTF-8 charset.
49
 *
50
 * <p> UTF-8 references:
51
 * <ul>
52
 *   <li> <a href="http://ietf.org/rfc/rfc2279.txt">RFC 2279</a>
53
 *   <li> The <a href="http://www.unicode.org/unicode/standard/standard.html">
54
 *     Unicode standard</a> and
55
 *     <a href="http://www.unicode.org/versions/corrigendum1.html">
56
 *      Corrigendum</a>
57
 * </ul>
58
 *
59
 * @author Jesse Rosenstock
60
 */
61
final class UTF_8 extends Charset
62
{
63
  /**
   * Creates the charset under the canonical name "UTF-8" and registers
   * its aliases.
   */
  UTF_8 ()
  {
    // The "ibm-*", "windows-65001" and "cp1208" aliases are provided by
    // http://oss.software.ibm.com/cgi-bin/icu/convexp?s=ALL
    // For "UTF8" see
    // http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html
    super ("UTF-8",
           new String[] { "ibm-1208", "ibm-1209", "ibm-5304", "ibm-5305",
                          "windows-65001", "cp1208", "UTF8" });
  }
75
 
76
  public boolean contains (Charset cs)
77
  {
78
    return cs instanceof US_ASCII || cs instanceof ISO_8859_1
79
      || cs instanceof UTF_8 || cs instanceof UTF_16BE
80
      || cs instanceof UTF_16LE || cs instanceof UTF_16;
81
  }
82
 
83
  public CharsetDecoder newDecoder ()
84
  {
85
    return new Decoder (this);
86
  }
87
 
88
  public CharsetEncoder newEncoder ()
89
  {
90
    return new Encoder (this);
91
  }
92
 
93
  private static final class Decoder extends CharsetDecoder
94
  {
95
    // Package-private to avoid a trampoline constructor.
96
    Decoder (Charset cs)
97
    {
98
      super (cs, 1f, 1f);
99
    }
100
 
101
    protected CoderResult decodeLoop (ByteBuffer in, CharBuffer out)
102
    {
103
      // TODO: Optimize this in the case in.hasArray() / out.hasArray()
104
      int inPos = in.position();
105
      try
106
        {
107
          while (in.hasRemaining ())
108
            {
109
              char c;
110
              byte b1 = in.get ();
111
              int highNibble = ((b1 & 0xFF) >> 4) & 0xF;
112
              switch (highNibble)
113
                {
114
                  case 0: case 1: case 2: case 3:
115
                  case 4: case 5: case 6: case 7:
116
                    if (out.remaining () < 1)
117
                      return CoderResult.OVERFLOW;
118
                    out.put ((char) b1);
119
                    inPos++;
120
                    break;
121
 
122
                  case 0xC: case 0xD:
123
                    byte b2;
124
                    if (in.remaining () < 1)
125
                      return CoderResult.UNDERFLOW;
126
                    if (out.remaining () < 1)
127
                      return CoderResult.OVERFLOW;
128
                    if (!isContinuation (b2 = in.get ()))
129
                      return CoderResult.malformedForLength (1);
130
                    c = (char) (((b1 & 0x1F) << 6) | (b2 & 0x3F));
131
                    // check that we had the shortest encoding
132
                    if (c <= 0x7F)
133
                      return CoderResult.malformedForLength (2);
134
                    out.put (c);
135
                    inPos += 2;
136
                    break;
137
 
138
                  case 0xE:
139
                    byte b3;
140
                    if (in.remaining () < 2)
141
                      return CoderResult.UNDERFLOW;
142
                    if (out.remaining () < 1)
143
                      return CoderResult.OVERFLOW;
144
                    if (!isContinuation (b2 = in.get ()))
145
                      return CoderResult.malformedForLength (1);
146
                    if (!isContinuation (b3 = in.get ()))
147
                      return CoderResult.malformedForLength (1);
148
                    c = (char) (((b1 & 0x0F) << 12)
149
                                | ((b2 & 0x3F) << 6)
150
                                | (b3 & 0x3F));
151
                    // check that we had the shortest encoding
152
                    if (c <= 0x7FF)
153
                      return CoderResult.malformedForLength (3);
154
                    out.put (c);
155
                    inPos += 3;
156
                    break;
157
 
158
                  case 0xF:
159
                    byte b4;
160
                    if (in.remaining () < 3)
161
                      return CoderResult.UNDERFLOW;
162
                    if((b1&0x0F) > 4)
163
                      return CoderResult.malformedForLength (4);
164
                    if (out.remaining () < 2)
165
                      return CoderResult.OVERFLOW;
166
                    if (!isContinuation (b2 = in.get ()))
167
                      return CoderResult.malformedForLength (3);
168
                    if (!isContinuation (b3 = in.get ()))
169
                      return CoderResult.malformedForLength (2);
170
                    if (!isContinuation (b4 = in.get ()))
171
                      return CoderResult.malformedForLength (1);
172
                    int n = (((b1 & 0x3) << 18)
173
                             | ((b2 & 0x3F) << 12)
174
                             | ((b3 & 0x3F) << 6)
175
                             | (b4 & 0x3F)) - 0x10000;
176
                    char c1 = (char)(0xD800 | (n & 0xFFC00)>>10);
177
                    char c2 = (char)(0xDC00 | (n & 0x003FF));
178
                    out.put (c1);
179
                    out.put (c2);
180
                    inPos += 4;
181
                    break;
182
 
183
                  default:
184
                    return CoderResult.malformedForLength (1);
185
                }
186
            }
187
 
188
          return CoderResult.UNDERFLOW;
189
        }
190
      finally
191
        {
192
          // In case we did a get(), then encountered an error, reset the
193
          // position to before the error.  If there was no error, this
194
          // will benignly reset the position to the value it already has.
195
          in.position (inPos);
196
        }
197
    }
198
 
199
    private static boolean isContinuation (byte b)
200
    {
201
      return (b & 0xC0) == 0x80;
202
    }
203
  }
204
 
205
  private static final class Encoder extends CharsetEncoder
206
  {
207
    // Package-private to avoid a trampoline constructor.
208
    Encoder (Charset cs)
209
    {
210
      // According to
211
      // http://www-106.ibm.com/developerworks/unicode/library/utfencodingforms/index.html
212
      //   On average, English takes slightly over one unit per code point.
213
      //   Most Latin-script languages take about 1.1 bytes. Greek, Russian,
214
      //   Arabic and Hebrew take about 1.7 bytes, and most others (including
215
      //   Japanese, Chinese, Korean and Hindi) take about 3 bytes.
216
      // We assume we will be dealing with latin scripts, and use 1.1 
217
      // for averageBytesPerChar.
218
      super (cs, 1.1f, 4.0f);
219
    }
220
 
221
    protected CoderResult encodeLoop (CharBuffer in, ByteBuffer out)
222
    {
223
      int inPos = in.position();
224
      try
225
        {
226
          // TODO: Optimize this in the case in.hasArray() / out.hasArray()
227
          while (in.hasRemaining ())
228
          {
229
            int remaining = out.remaining ();
230
            char c = in.get ();
231
 
232
            // UCS-4 range (hex.)           UTF-8 octet sequence (binary)
233
            // 0000 0000-0000 007F   0xxxxxxx
234
            // 0000 0080-0000 07FF   110xxxxx 10xxxxxx
235
            // 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
236
 
237
            //        Scalar Value          UTF-16                byte 1     byte 2     byte 3     byte 4
238
            //        0000 0000 0xxx xxxx   0000 0000 0xxx xxxx   0xxx xxxx
239
            //        0000 0yyy yyxx xxxx   0000 0yyy yyxx xxxx   110y yyyy  10xx xxxx
240
            //        zzzz yyyy yyxx xxxx   zzzz yyyy yyxx xxxx   1110 zzzz  10yy yyyy  10xx xxxx
241
            // u uuuu zzzz yyyy yyxx xxxx   1101 10ww wwzz zzyy   1111 0uuu  10uu zzzz  10yy yyyy  10xx xxxx
242
            //                            + 1101 11yy yyxx xxxx
243
            // Note: uuuuu = wwww + 1
244
            if (c <= 0x7F)
245
              {
246
                if (remaining < 1)
247
                  return CoderResult.OVERFLOW;
248
                out.put ((byte) c);
249
                inPos++;
250
              }
251
            else if (c <= 0x7FF)
252
              {
253
                if (remaining < 2)
254
                  return CoderResult.OVERFLOW;
255
                out.put ((byte) (0xC0 | (c >> 6)));
256
                out.put ((byte) (0x80 | (c & 0x3F)));
257
                inPos++;
258
              }
259
            else if (0xD800 <= c && c <= 0xDFFF)
260
              {
261
                if (remaining < 4)
262
                  return CoderResult.OVERFLOW;
263
 
264
                // we got a low surrogate without a preciding high one
265
                if (c > 0xDBFF)
266
                  return CoderResult.malformedForLength (1);
267
 
268
                // high surrogates
269
                if (!in.hasRemaining ())
270
                  return CoderResult.UNDERFLOW;
271
 
272
                char d = in.get ();
273
 
274
                // make sure d is a low surrogate
275
                if (d < 0xDC00 || d > 0xDFFF)
276
                  return CoderResult.malformedForLength (1);
277
 
278
                // make the 32 bit value
279
                // int value2 = (c - 0xD800) * 0x400 + (d - 0xDC00) + 0x10000;
280
                int value = (((c & 0x3FF) << 10) | (d & 0x3FF)) + 0x10000;
281
                // assert value == value2;
282
                out.put ((byte) (0xF0 | ((value >> 18) & 0x07)));
283
                out.put ((byte) (0x80 | ((value >> 12) & 0x3F)));
284
                out.put ((byte) (0x80 | ((value >>  6) & 0x3F)));
285
                out.put ((byte) (0x80 | ((value      ) & 0x3F)));
286
                inPos += 2;
287
              }
288
            else
289
              {
290
                if (remaining < 3)
291
                  return CoderResult.OVERFLOW;
292
 
293
                out.put ((byte) (0xE0 | (c >> 12)));
294
                out.put ((byte) (0x80 | ((c >> 6) & 0x3F)));
295
                out.put ((byte) (0x80 | (c & 0x3F)));
296
                inPos++;
297
              }
298
          }
299
 
300
          return CoderResult.UNDERFLOW;
301
        }
302
      finally
303
        {
304
          // In case we did a get(), then encountered an error, reset the
305
          // position to before the error.  If there was no error, this
306
          // will benignly reset the position to the value it already has.
307
          in.position (inPos);
308
        }
309
    }
310
  }
311
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.