OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libjava/] [scripts/] [unicode-muncher.pl] - Blame information for rev 830

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 762 jeremybenn
#!/usr/bin/perl -w
2
# unicode-muncher.pl -- generate Unicode database for java.lang.Character
3
# Copyright (C) 1998, 2002, 2004  Free Software Foundation, Inc.
4
#
5
# This file is part of GNU Classpath.
6
#
7
# GNU Classpath is free software; you can redistribute it and/or modify
8
# it under the terms of the GNU General Public License as published by
9
# the Free Software Foundation; either version 2, or (at your option)
10
# any later version.
11
#
12
# GNU Classpath is distributed in the hope that it will be useful, but
13
# WITHOUT ANY WARRANTY; without even the implied warranty of
14
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
# General Public License for more details.
16
#
17
# You should have received a copy of the GNU General Public License
18
# along with GNU Classpath; see the file COPYING.  If not, write to the
19
# Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
20
# 02110-1301 USA.
21
#
22
# Linking this library statically or dynamically with other modules is
23
# making a combined work based on this library.  Thus, the terms and
24
# conditions of the GNU General Public License cover the whole
25
# combination.
26
#
27
# As a special exception, the copyright holders of this library give you
28
# permission to link this library with independent modules to produce an
29
# executable, regardless of the license terms of these independent
30
# modules, and to copy and distribute the resulting executable under
31
# terms of your choice, provided that you also meet, for each linked
32
# independent module, the terms and conditions of the license of that
33
# module.  An independent module is a module which is not derived from
34
# or based on this library.  If you modify this library, you may extend
35
# this exception to your version of the library, but you are not
36
# obligated to do so.  If you do not wish to do so, delete this
37
# exception statement from your version.
38
 
39
# Code for reading UnicodeData-3.0.0.txt and SpecialCasing-2.txt to generate
40
# the code for gnu.java.lang.CharData. The relevant files can be found here:
41
#
42
#   http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.txt
43
#   http://www.unicode.org/Public/3.0-Update/SpecialCasing-2.txt
44
#
45
# Inspired by code from Jochen Hoenicke.
46
# author Eric Blake <ebb9@email.byu.edu>
47
#
48
# Usage: ./unicode-muncher <UnicodeData.txt> <SpecialCasing> <CharData.java>
49
#   where <UnicodeData.txt> is obtained from www.unicode.org (named
50
#   UnicodeData-3.0.0.txt for Unicode version 3.0.0), <SpecialCasing>
51
#   is obtained from www.unicode.org too (named SpecialCasing-2.txt for Unicode
52
#   version 3.0.0), and <CharData.java> is the final location for the Java
53
#   interface gnu.java.lang.CharData. As of JDK 1.4, use Unicode version 3.0.0
54
#   for best results.
55
 
56
##
## Render one 16-bit value the way it must be written inside a Java String
## literal.
##
## Argument: an integer in -0x8000 .. 0xffff.  Negative inputs are mapped to
##           their unsigned 16-bit equivalent by adding 0x10000.
## Returns:  a three-digit octal escape for values below 0x20, a backslash
##           escape for '"' and '\', the character itself for the other
##           values below 0x7f, and a \uXXXX escape for everything else.
## Dies:     with "Out of range: ..." when the argument is outside the range.
##
sub javaChar($) {
    my $unit = shift;

    if ($unit < -0x8000 || $unit > 0xffff) {
        die "Out of range: $unit\n";
    }
    # Fold signed inputs onto the unsigned 16-bit range.
    $unit += 0x10000 if $unit < 0;

    if ($unit < 0x20) {
        # Octal escapes are shorter than \uXXXX for control characters.
        return sprintf("\\%03o", $unit);
    } elsif ($unit == 0x22) {
        return '\\"';
    } elsif ($unit == 0x5c) {
        return '\\\\';
    } elsif ($unit < 0x7f) {
        return chr($unit);
    }
    return sprintf("\\u%04x", $unit);
}
70
 
71
##
72
## Convert the text UnicodeData file from www.unicode.org into a Java
73
## interface with string constants holding the compressed information.
74
##
75
# General-category codes.  The position of a code in this list is the type
# number stored for a character, so the order matters.  "SKIPPED" occupies
# index 17 so that the codes after it keep their positions (presumably a
# type value that is deliberately unused -- confirm against
# java.lang.Character).
my @TYPECODES = (
    qw(Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf),
    qw(SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf),
);

# Bidirectional-class codes; again the list position is the stored value.
my @DIRCODES = (
    qw(L R AL EN ES ET AN CS NSM BN B S WS ON),
    qw(LRE LRO RLE RLO PDF),
);

# Flag bits OR-ed into the type number of a character.
my $NOBREAK_FLAG  = 1 << 5;
my $MIRRORED_FLAG = 1 << 6;

# State shared by the parsing and generation stages below.
my (%special, @info);           # multi-char uppercase map; per-char records
my $titlecase = '';             # packed (char, titlecase char) pairs
my ($count, $range) = (0, 0);   # progress counter; start of an open range

if (@ARGV != 3) {
    die "Usage: $0 <UnicodeData.txt> <SpecialCasing.txt> <CharData.java>";
}
$| = 1;   # unbuffered stdout so the progress dots appear immediately
print "GNU Classpath Unicode Attribute Database Generator 2.1\n",
      "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
93
 
94
# Stage 0: Parse the special casing file
#
# Fills %special with   code point => [ uppercase expansion code points ]
# for every record whose uppercase mapping consists of several characters.
print "Parsing special casing file\n";
# Three-argument open with a lexical handle: the file name is used exactly
# as given, so leading/trailing blanks or characters such as "<", ">" and
# "|" in the name can never be interpreted as an open mode.
open(my $special_fh, '<', $ARGV[1])
    or die "Can't open special casing file: $!\n";
while (<$special_fh>) {
    next if /^\#/;
    my ($ch, undef, undef, $upper) = split / *; */;

    # This grabs only the special casing for multi-char uppercase. Note that
    # there are no multi-char lowercase, and that Sun ignores multi-char
    # titlecase rules. This script omits 3 special cases in Unicode 3.0.0,
    # which must be hardcoded in java.lang.String:
    #  \u03a3 (Sun ignores this special case)
    #  \u0049 - lowercases to \u0131, but only in Turkish locale
    #  \u0069 - uppercases to \u0130, but only in Turkish locale
    #
    # A space inside the uppercase field means it lists more than one code
    # point; blank or short lines leave $upper undefined and are skipped.
    next unless defined $upper and $upper =~ / /;
    $special{hex $ch} = [map {hex} split ' ', $upper];
}

close $special_fh;
113
 
114
# Stage 1: Parse the attribute file
#
# Reads the UnicodeData file one record per line and stores, for every code
# point up to 0xffff, a packed five-field record in @info:
#   type (with no-break / mirrored flags), numeric value, uppercase delta,
#   lowercase delta, and direction << 2 plus the number of extra characters
#   in a multi-character uppercase expansion.
# Characters whose titlecase differs from their uppercase are appended to
# $titlecase as (character, titlecase character) pairs.
print "Parsing attributes file";
# Three-argument open with a lexical handle so the file name is taken
# literally rather than being scanned for mode characters.
open(my $unicode_fh, '<', $ARGV[0])
    or die "Can't open Unicode attribute file: $!\n";
while (<$unicode_fh>) {
    print "." unless $count++ % 1000;
    chomp;
    s/\r//g;
    # A blank line carries no record; skip it instead of dying below with
    # "Unknown type".
    next unless /\S/;

    # The negative limit keeps trailing empty fields.  Without it the case
    # mapping columns come back undefined for most characters, and the
    # string comparison on $title below warns under -w.
    my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef, $numeric,
        $mirrored, undef, undef, $upcase, $lowcase, $title)
        = split /;/, $_, -1;
    $ch = hex($ch);
    next if $ch > 0xffff; # Ignore surrogate pairs, since Java does

    my ($type, $numValue, $upperchar, $lowerchar, $direction);

    # The type number is the position of the category in @TYPECODES.
    $type = 0;
    while ($category !~ /^$TYPECODES[$type]$/) {
        if (++$type == @TYPECODES) {
            die "$ch: Unknown type: $category";
        }
    }
    $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
    $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);

    if ($numeric =~ /^[0-9]+$/) {
        $numValue = $numeric;
        die "numValue too big: $ch, $numValue\n" if $numValue >= 0x7fff;
    } elsif ($numeric eq "") {
        # Latin letters A-Z and a-z, in both their ASCII and fullwidth
        # forms, get the values 10 .. 35.
        if ($ch >= 0x0041 && $ch <= 0x005a) {
            $numValue = $ch - 0x0037;
        } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
            $numValue = $ch - 0x0057;
        } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
            $numValue = $ch - 0xff17;
        } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
            $numValue = $ch - 0xff37;
        } else {
            $numValue = -1;     # no numeric value defined
        }
    } else {
        $numValue = -2;         # defined, but not a non-negative integer
    }

    # Case mappings are stored as differences from the character itself.
    $upperchar = $upcase ? hex($upcase) - $ch : 0;
    $lowerchar = $lowcase ? hex($lowcase) - $ch : 0;
    if ($title ne $upcase) {
        my $titlechar = $title ? hex($title) : $ch;
        $titlecase .= pack("n2", $ch, $titlechar);
    }

    # The direction number is the position in @DIRCODES, or -1 if unknown.
    $direction = 0;
    while ($bidir !~ /^$DIRCODES[$direction]$/) {
        if (++$direction == @DIRCODES) {
            $direction = -1;
            last;
        }
    }
    # The low two bits hold the count of additional characters produced by
    # a multi-character uppercase expansion.
    $direction <<= 2;
    $direction += $#{$special{$ch}} if defined $special{$ch};

    my $record = pack("n5", $type, $numValue, $upperchar, $lowerchar,
                      $direction);
    if ($range) {
        # A "<..., First>" record must be followed by its "<..., Last>"
        # record; every code point strictly in between shares the
        # attributes of the closing record.
        die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
        for my $member ($range + 1 .. $ch - 1) {
            $info[$member] = $record;
        }
        $range = 0;
    } elsif ($name =~ /First>$/) {
        $range = $ch;
    }
    $info[$ch] = $record;
}
close $unicode_fh;
188
 
189
# Stage 2: Compress the data structures
#
# Builds two things from @info:
#   @charinfo - the list of distinct attribute tuples
#               [ numeric value, upper delta, lower delta, direction ]
#   $info     - one 16-bit big-endian word per character, holding the index
#               of its tuple in the high nine bits and its type/flags in
#               the low seven bits.
print "\nCompressing data structures";
$count = 0;
my $info = '';
my (%charhash, @charinfo);

for my $ch (0 .. 0xffff) {
    print "." if $count++ % 0x1000 == 0;

    # Code points that never appeared in the attribute file get a default
    # record.
    my $record = $info[$ch];
    $record = $info[$ch] = pack("n5", 0, -1, 0, 0, -4)
        unless defined $record;

    my ($type, @attrs) = unpack("n5", $record);
    unless (exists $charhash{$record}) {
        push @charinfo, [ @attrs ];
        $charhash{$record} = $#charinfo;
    }
    $info .= pack("n", ($charhash{$record} << 7) | $type);
}

my $charlen = scalar @charinfo;
my ($bestshift, $bestblkstr);
my $bestest = 1000000;
# Only nine bits are available for the tuple index.
if ($charlen > 512) {
    die "Too many unique character entries: $charlen\n";
}
print "\nUnique character entries: $charlen\n";
214
 
215
# Try every block size from 2**3 to 2**8 characters.  For each size the
# distinct blocks of $info are collected and then overlapped wherever the
# end of one block equals the start of another.  The size with the smallest
# estimate (merged data plus one 16-bit entry per block) is remembered in
# $bestshift / $bestblkstr / $bestest.
for my $shift (3 .. 8) {
    my $blksize = 1 << $shift;
    my (%seen, @chunks);
    print "shift: $shift";

    # Distinct blocks, in order of first appearance.
    for (my $pos = 0; $pos < 0x10000; $pos += $blksize) {
        my $chunk = substr($info, 2 * $pos, 2 * $blksize);
        next if exists $seen{$chunk};
        push @chunks, $chunk;
        $seen{$chunk} = $#chunks;
    }
    my $blocklen = @chunks * $blksize;
    printf " before %5d", $blocklen;

    # Pack the blocks as tightly as possible by finding matching heads and
    # tails, trying the longest overlap first.
    for (my $overlap = $blksize - 1; $overlap > 0; $overlap--) {
        my $bytes = 2 * $overlap;   # two bytes per character

        # Map every tail of the current overlap length to the list of
        # blocks that end with it.
        my %tails;
        for my $idx (0 .. $#chunks) {
            next unless defined $chunks[$idx];
            my $tail = substr($chunks[$idx], length($chunks[$idx]) - $bytes);
            push @{ $tails{$tail} }, $idx;
        }

        # Tails are known; now look at the heads and merge.
      BLOCK:
        for my $idx (0 .. $#chunks) {
            next unless defined $chunks[$idx];
            my $tomerge = $idx;
            while (1) {
                my $head  = substr($chunks[$tomerge], 0, $bytes);
                my $entry = $tails{$head};
                next BLOCK unless defined $entry;

                my $other = shift @{$entry};
                if ($other == $tomerge) {
                    # A block cannot be merged with itself: put it back at
                    # the end and take the next candidate, if there is one.
                    push @{$entry}, $other;
                    next BLOCK if @{$entry} == 1;
                    $other = shift @{$entry};
                }
                delete $tails{$head} unless @{$entry};

                # A match was found: append the non-overlapping rest of
                # $tomerge to $other.
                my $merge = $chunks[$other] . substr($chunks[$tomerge], $bytes);
                $blocklen -= $overlap;

                if ($other < $tomerge) {
                    $chunks[$tomerge] = undef;
                    $chunks[$other]   = $merge;
                    my $tail = substr($merge, length($merge) - $bytes);
                    $tails{$tail} = [ map { $_ == $tomerge ? $other : $_ }
                                      @{$tails{$tail}} ];
                    next BLOCK;
                }
                $chunks[$tomerge] = $merge;
                $chunks[$other]   = undef;
            }
        }
    }

    my $blockstr = join '', grep { defined $_ } @chunks;

    die "Unexpected $blocklen" if length($blockstr) != 2 * $blocklen;
    my $estimate = 2 * $blocklen + (0x20000 >> $shift);

    printf " after merge %5d: %6d bytes\n", $blocklen, $estimate;
    next unless $estimate < $bestest;
    ($bestest, $bestshift, $bestblkstr) = ($estimate, $shift, $blockstr);
}
307
 
308
# Locate every block of the winning size inside the merged data string.
# Each @blocks entry is the block's character index in the data minus the
# first character of the block, reduced to 16 bits.
my $blksize = 1 << $bestshift;
my @blocks;
for my $blkno (0 .. (0x10000 >> $bestshift) - 1) {
    my $first  = $blkno * $blksize;
    my $needle = substr($info, 2 * $first, 2 * $blksize);

    # Only matches on an even byte offset are aligned to a character; an
    # odd result (which includes the "no match" value -1) means keep
    # searching or give up.
    my $at = index($bestblkstr, $needle);
    while ($at & 1) {
        die "not found: $first" if $at == -1;
        $at = index($bestblkstr, $needle, $at + 1);
    }
    push @blocks, ($at / 2 - $first) & 0xffff;
}
319
 
320
# Phase 3: Generate the file
#
# Writes the Java interface gnu.java.lang.CharData to $ARGV[2].  Every table
# is emitted as a concatenation of String literals, one literal per row.
#
# The two size checks refuse tables that might be too long for a single
# String constant (presumably the 65535-byte limit on UTF-8 encoded
# constants in a class file, at up to three bytes per character -- confirm).
die "UTF-8 limit of blocks may be exceeded: " . scalar(@blocks) . "\n"
    if @blocks > 0xffff / 3;
die "UTF-8 limit of data may be exceeded: " . length($bestblkstr) . "\n"
    if length($bestblkstr) > 0xffff / 3;
{
    print "Generating $ARGV[2] with shift of $bestshift";

    # Three-argument open with a lexical handle so the output name is used
    # literally and cannot be read as an open mode.
    open(my $out, '>', $ARGV[2])
        or die "Failed creating output file: $!\n";

    # Emit a list of 16-bit values as the initializer of a Java String
    # constant: "    = " before the first row, "\n    + " before each later
    # row, and at most $per_row values per row.  Prints nothing at all for
    # an empty list.
    my $emit_string = sub {
        my ($per_row, @units) = @_;
        for (my $first = 0; $first < @units; $first += $per_row) {
            my $last = $first + $per_row - 1;
            $last = $#units if $last > $#units;
            print {$out} $first ? "\n    + \"" : "    = \"";
            print {$out} javaChar($_) for @units[$first .. $last];
            print {$out} "\"";
        }
    };

    print {$out} <<EOF;
/* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
   Copyright (C) 2002 Free Software Foundation, Inc.
   *** This file is generated by scripts/unicode-muncher.pl ***

This file is part of GNU Classpath.

GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING.  If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA.

Linking this library statically or dynamically with other modules is
making a combined work based on this library.  Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.

As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module.  An independent module is a module which is not derived from
or based on this library.  If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so.  If you do not wish to do so, delete this
exception statement from your version. */

package gnu.java.lang;

/**
 * This contains the info about the unicode characters, that
 * java.lang.Character needs.  It is generated automatically from
 * <code>$ARGV[0]</code> and
 * <code>$ARGV[1]</code>, by some
 * perl scripts. These Unicode definition files can be found on the
 * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
 * JDK 1.4 uses Unicode version 3.0.0.
 *
 * The data is stored as string constants, but Character will convert these
 * Strings to their respective <code>char[]</code> components.  The field
 * <code>BLOCKS</code> stores the offset of a block of 2<sup>SHIFT</sup>
 * characters within <code>DATA</code>.  The DATA field, in turn, stores
 * information about each character in the low order bits, and an offset
 * into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
 * <code>NUM_VALUE</code>, and <code>DIRECTION</code>.  Notice that the
 * attribute tables are much smaller than 0xffff entries; as many characters
 * in Unicode share common attributes.  The DIRECTION table also contains
 * a field for detecting characters with multi-character uppercase expansions.
 * Next, there is a listing for <code>TITLE</code> exceptions (most characters
 * just have the same title case as upper case).  Finally, there are two
 * tables for multi-character capitalization, <code>UPPER_SPECIAL</code>
 * which lists the characters which are special cased, and
 * <code>UPPER_EXPAND</code>, which lists their expansion.
 *
 * \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
 *         Eric Blake)
 * \@see Character
 * \@see String
 */
public interface CharData
{
  /**
   * The Unicode definition file that was parsed to build this database.
   */
  String SOURCE = \"$ARGV[0]\";

  /**
   * The character shift amount to look up the block offset. In other words,
   * <code>(char) (BLOCKS.value[ch >> SHIFT] + ch)</code> is the index where
   * <code>ch</code> is described in <code>DATA</code>.
   */
  int SHIFT = $bestshift;

  /**
   * The mapping of character blocks to their location in <code>DATA</code>.
   * Each entry has been adjusted so that the 16-bit sum with the desired
   * character gives the actual index into <code>DATA</code>.
   */
  String BLOCKS
EOF

    $emit_string->(11, @blocks);

    print {$out} <<EOF;
;

  /**
   * Information about each character.  The low order 5 bits form the
   * character type, the next bit is a flag for non-breaking spaces, and the
   * next bit is a flag for mirrored directionality.  The high order 9 bits
   * form the offset into the attribute tables.  Note that this limits the
   * number of unique character attributes to 512, which is not a problem
   * as of Unicode version 3.2.0, but may soon become one.
   */
  String DATA
EOF

    $emit_string->(11, unpack("n*", $bestblkstr));

    print {$out} <<EOF;
;

  /**
   * This is the attribute table for computing the numeric value of a
   * character.  The value is -1 if Unicode does not define a value, -2
   * if the value is not a positive integer, otherwise it is the value.
   * Note that this is a signed value, but stored as an unsigned char
   * since this is a String literal.
   */
  String NUM_VALUE
EOF

    $emit_string->(11, map { $_->[0] } @charinfo);

    print {$out} <<EOF;
;

  /**
   * This is the attribute table for computing the single-character uppercase
   * representation of a character.  The value is the signed difference
   * between the character and its uppercase version.  Note that this is
   * stored as an unsigned char since this is a String literal.  When
   * capitalizing a String, you must first check if a multi-character uppercase
   * sequence exists before using this character.
   */
  String UPPER
EOF

    $emit_string->(11, map { $_->[1] } @charinfo);

    print {$out} <<EOF;
;

  /**
   * This is the attribute table for computing the lowercase representation
   * of a character.  The value is the signed difference between the
   * character and its lowercase version.  Note that this is stored as an
   * unsigned char since this is a String literal.
   */
  String LOWER
EOF

    $emit_string->(13, map { $_->[2] } @charinfo);

    print {$out} <<EOF;
;

  /**
   * This is the attribute table for computing the directionality class
   * of a character, as well as a marker of characters with a multi-character
   * capitalization.  The direction is taken by performing a signed shift
   * right by 2 (where a result of -1 means an unknown direction, such as
   * for undefined characters). The lower 2 bits form a count of the
   * additional characters that will be added to a String when performing
   * multi-character uppercase expansion. This count is also used, along with
   * the offset in UPPER_SPECIAL, to determine how much of UPPER_EXPAND to use
   * when performing the case conversion. Note that this information is stored
   * as an unsigned char since this is a String literal.
   */
  String DIRECTION
EOF

    $emit_string->(17, map { $_->[3] } @charinfo);

    print {$out} <<EOF;
;

  /**
   * This is the listing of titlecase special cases (all other characters
   * can use <code>UPPER</code> to determine their titlecase).  The listing
   * is a sorted sequence of character pairs; converting the first character
   * of the pair to titlecase produces the second character.
   */
  String TITLE
EOF

    $emit_string->(11, unpack("n*", $titlecase));

    print {$out} <<EOF;
;

  /**
   * This is a listing of characters with multi-character uppercase sequences.
   * A character appears in this list exactly when it has a non-zero entry
   * in the low-order 2-bit field of DIRECTION.  The listing is a sorted
   * sequence of pairs (hence a binary search on the even elements is an
   * efficient way to lookup a character). The first element of a pair is the
   * character with the expansion, and the second is the index into
   * UPPER_EXPAND where the expansion begins. Use the 2-bit field of
   * DIRECTION to determine where the expansion ends.
   */
  String UPPER_SPECIAL
EOF

    # Build the (character, offset) pairs in code point order while
    # accumulating the expansions they point into.
    my @special_pairs;
    my $expansion = "";
    my $offset = 0;
    for my $ch (sort { $a <=> $b } keys %special) {
        push @special_pairs, $ch, $offset;
        $offset    += @{$special{$ch}};
        $expansion .= pack "n*", @{$special{$ch}};
    }
    # Five pairs per row, i.e. ten values.
    $emit_string->(10, @special_pairs);

    print {$out} <<EOF;
;

  /**
   * This is the listing of special case multi-character uppercase sequences.
   * Characters listed in UPPER_SPECIAL index into this table to find their
   * uppercase expansion. Remember that you must also perform special-casing
   * on two single-character sequences in the Turkish locale, which are not
   * covered here in CharData.
   */
  String UPPER_EXPAND
EOF

    $emit_string->(11, unpack("n*", $expansion));

    print {$out} ";\n}\n";
    # Buffered write errors (for example a full disk) only surface when the
    # handle is closed, so the result of close must be checked.
    close $out or die "Failed writing output file: $!\n";
}
print "\nDone.\n";

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.