| 1 | 14 | jlechner | #!/usr/bin/perl -w
 | 
      
         | 2 |  |  | # unicode-muncher.pl -- generate Unicode database for java.lang.Character
 | 
      
         | 3 |  |  | # Copyright (C) 1998, 2002, 2004  Free Software Foundation, Inc.
 | 
      
         | 4 |  |  | #
 | 
      
         | 5 |  |  | # This file is part of GNU Classpath.
 | 
      
         | 6 |  |  | #
 | 
      
         | 7 |  |  | # GNU Classpath is free software; you can redistribute it and/or modify
 | 
      
         | 8 |  |  | # it under the terms of the GNU General Public License as published by
 | 
      
         | 9 |  |  | # the Free Software Foundation; either version 2, or (at your option)
 | 
      
         | 10 |  |  | # any later version.
 | 
      
         | 11 |  |  | #
 | 
      
         | 12 |  |  | # GNU Classpath is distributed in the hope that it will be useful, but
 | 
      
         | 13 |  |  | # WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
      
         | 14 |  |  | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | 
      
         | 15 |  |  | # General Public License for more details.
 | 
      
         | 16 |  |  | #
 | 
      
         | 17 |  |  | # You should have received a copy of the GNU General Public License
 | 
      
         | 18 |  |  | # along with GNU Classpath; see the file COPYING.  If not, write to the
 | 
      
         | 19 |  |  | # Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 | 
      
         | 20 |  |  | # 02110-1301 USA.
 | 
      
         | 21 |  |  | #
 | 
      
         | 22 |  |  | # Linking this library statically or dynamically with other modules is
 | 
      
         | 23 |  |  | # making a combined work based on this library.  Thus, the terms and
 | 
      
         | 24 |  |  | # conditions of the GNU General Public License cover the whole
 | 
      
         | 25 |  |  | # combination.
 | 
      
         | 26 |  |  | #
 | 
      
         | 27 |  |  | # As a special exception, the copyright holders of this library give you
 | 
      
         | 28 |  |  | # permission to link this library with independent modules to produce an
 | 
      
         | 29 |  |  | # executable, regardless of the license terms of these independent
 | 
      
         | 30 |  |  | # modules, and to copy and distribute the resulting executable under
 | 
      
         | 31 |  |  | # terms of your choice, provided that you also meet, for each linked
 | 
      
         | 32 |  |  | # independent module, the terms and conditions of the license of that
 | 
      
         | 33 |  |  | # module.  An independent module is a module which is not derived from
 | 
      
         | 34 |  |  | # or based on this library.  If you modify this library, you may extend
 | 
      
         | 35 |  |  | # this exception to your version of the library, but you are not
 | 
      
         | 36 |  |  | # obligated to do so.  If you do not wish to do so, delete this
 | 
      
         | 37 |  |  | # exception statement from your version.
 | 
      
         | 38 |  |  |  
 | 
      
         | 39 |  |  | # Code for reading UnicodeData-3.0.0.txt and SpecialCasing-2.txt to generate
 | 
      
         | 40 |  |  | # the code for gnu.java.lang.CharData. The relevant files can be found here:
 | 
      
         | 41 |  |  | #
 | 
      
         | 42 |  |  | #   http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.txt
 | 
      
         | 43 |  |  | #   http://www.unicode.org/Public/3.0-Update/SpecialCasing-2.txt
 | 
      
         | 44 |  |  | #
 | 
      
         | 45 |  |  | # Inspired by code from Jochen Hoenicke.
 | 
      
         | 46 |  |  | # author Eric Blake <ebb9@email.byu.edu>
 | 
      
         | 47 |  |  | #
 | 
      
         | 48 |  |  | # Usage: ./unicode-muncher <UnicodeData.txt> <SpecialCasing> <CharData.java>
 | 
      
         | 49 |  |  | #   where <UnicodeData.txt> is obtained from www.unicode.org (named
 | 
      
         | 50 |  |  | #   UnicodeData-3.0.0.txt for Unicode version 3.0.0), <SpecialCasing>
 | 
      
         | 51 |  |  | #   is obtained from www.unicode.org too (named SpecialCasing-2.txt for Unicode
 | 
      
         | 52 |  |  | #   version 3.0.0), and <CharData.java> is the final location for the Java
 | 
      
         | 53 |  |  | #   interface gnu.java.lang.CharData. As of JDK 1.4, use Unicode version 3.0.0
 | 
      
         | 54 |  |  | #   for best results.
 | 
      
         | 55 |  |  |  
 | 
      
         ##
         ## Render one 16-bit value as the text of a single character inside a
         ## Java source String literal.
         ##
         ## Accepts values from -0x8000 through 0xffff and dies with an
         ## "Out of range" message for anything else.  Negative values are folded
         ## into the unsigned range by adding 0x10000.  The result is:
         ##   - a three-digit octal escape for values below 0x20,
         ##   - a backslash-escaped '"' or '\' for 0x22 and 0x5c,
         ##   - the raw ASCII byte for the remaining values below 0x7f,
         ##   - a \uXXXX escape (lowercase hex) for everything else.
         ##
         sub javaChar($) {
             my $code = shift;

             if ($code < -0x8000 || $code > 0xffff) {
                 die "Out of range: $code\n";
             }

             # Signed 16-bit inputs wrap around into 0x8000 .. 0xffff.
             if ($code < 0) {
                 $code += 0x10000;
             }

             if ($code < 0x20) {
                 # Control characters: octal escape.
                 return sprintf("\\%03o", $code);
             }
             elsif ($code == 0x22) {
                 # Double quote would terminate the literal.
                 return "\\\"";
             }
             elsif ($code == 0x5c) {
                 # Backslash must itself be escaped.
                 return "\\\\";
             }
             elsif ($code < 0x7f) {
                 # Printable ASCII is shortest when written as itself.
                 return pack("C", $code);
             }

             return sprintf("\\u%04x", $code);
         }
 | 
      
         | 70 |  |  |  
 | 
      
         ##
         ## Main script: turn the UnicodeData text file from www.unicode.org into
         ## a Java interface whose string constants hold the compressed tables.
         ##

         # General category codes.  A character's type number is the position of
         # its category in this list (the attribute parser searches the list in
         # order); "SKIPPED" only occupies slot 17 so the following codes keep
         # their positions.
         my @TYPECODES = (
             qw(Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf),
             qw(SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf),
         );

         # Bidirectional category codes; again the list position is the number
         # that gets stored.
         my @DIRCODES = (
             qw(L R AL EN ES ET AN CS NSM BN B S WS ON),
             qw(LRE LRO RLE RLO PDF),
         );

         # Flag bits OR-ed into the type number by the attribute parser.
         my ($NOBREAK_FLAG, $MIRRORED_FLAG) = (32, 64);

         # State shared by the parsing and compression stages.
         my %special;            # code point => [ multi-char uppercase expansion ]
         my @info;               # code point => packed attribute record
         my $titlecase = "";     # packed (char, titlecase char) exception pairs
         my ($count, $range) = (0, 0);   # progress counter; start of an open range

         if (@ARGV != 3) {
             die "Usage: $0 <UnicodeData.txt> <SpecialCasing.txt> <CharData.java>";
         }

         # Unbuffer STDOUT so the progress dots appear as they are printed.
         $| = 1;
         print "GNU Classpath Unicode Attribute Database Generator 2.1\n",
               "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
 | 
      
         | 93 |  |  |  
 | 
      
         # Stage 0: Parse the special casing file
         #
         # Reads the file named by $ARGV[1] and fills %special with
         #   code point => [ code points of its multi-character uppercase form ].
         # Comment lines and lines whose uppercase field is a single character
         # are ignored.
         print "Parsing special casing file\n";

         # Three-argument open with a lexical handle: the file name is taken
         # literally, so characters such as '>', '|' or surrounding blanks in
         # the name can no longer be interpreted as an open mode.
         open(my $special_fh, '<', $ARGV[1])
             or die "Can't open special casing file: $!\n";

         while (my $line = <$special_fh>) {
             next if $line =~ /^\#/;

             # Fields are ';'-separated with optional blanks around the
             # separator; the first is the code point and the fourth is the
             # uppercase mapping.
             my ($ch, undef, undef, $upper) = split / *; */, $line;

             # This grabs only the special casing for multi-char uppercase. Note
             # that there are no multi-char lowercase, and that Sun ignores
             # multi-char titlecase rules. This script omits 3 special cases in
             # Unicode 3.0.0, which must be hardcoded in java.lang.String:
             #  \u03a3 (Sun ignores this special case)
             #  \u0049 - lowercases to \u0131, but only in Turkish locale
             #  \u0069 - uppercases to \u0130, but only in Turkish locale
             next unless defined $upper and $upper =~ / /;

             $special{hex $ch} = [map {hex} split ' ', $upper];
         }

         close $special_fh;
 | 
      
         | 113 |  |  |  
 | 
      
         # Stage 1: Parse the attribute file
         #
         # Reads the file named by $ARGV[0] line by line and stores, for every
         # code point up to 0xffff, a packed record of five 16-bit words in
         # @info:
         #   type       index into @TYPECODES, plus $NOBREAK_FLAG / $MIRRORED_FLAG
         #   numValue   numeric value, -1 if none, -2 if not a plain integer
         #   upperchar  offset from the character to its uppercase form (0 = none)
         #   lowerchar  offset from the character to its lowercase form (0 = none)
         #   direction  (index into @DIRCODES, or -1) << 2, plus the number of
         #              extra characters in a multi-char uppercase expansion
         # Title case mappings that differ from the uppercase mapping are
         # appended to $titlecase as (char, titlecase char) pairs.
         print "Parsing attributes file";

         # Three-argument open with a lexical handle, so the file name is never
         # interpreted as an open mode.
         open(my $unicode_fh, '<', $ARGV[0])
             or die "Can't open Unicode attribute file: $!\n";

         # Code => list position, built once instead of scanning the lists with
         # a regex for every input line.  Iterating in reverse keeps the first
         # position should a code ever be listed twice.
         my %typecode_index;
         $typecode_index{$TYPECODES[$_]} = $_ for reverse 0 .. $#TYPECODES;
         my %dircode_index;
         $dircode_index{$DIRCODES[$_]} = $_ for reverse 0 .. $#DIRCODES;

         while (my $line = <$unicode_fh>) {
             print "." unless $count++ % 1000;
             chomp $line;
             $line =~ s/\r//g;
             next unless $line =~ /\S/;    # tolerate blank lines

             # The -1 limit keeps trailing empty fields, so the case mapping
             # fields are '' rather than undef when a line ends in ";;;".
             my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef,
                 $numeric, $mirrored, undef, undef, $upcase, $lowcase, $title)
                 = split /;/, $line, -1;
             $ch = hex($ch);
             # Ignore code points that would need surrogate pairs, since Java does
             next if $ch > 0xffff;

             my $type = $typecode_index{$category};
             die "$ch: Unknown type: $category" unless defined $type;
             $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
             $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);

             my $numValue;
             if ($numeric =~ /^[0-9]+$/) {
                 $numValue = $numeric;
                 die "numValue too big: $ch, $numValue\n" if $numValue >= 0x7fff;
             } elsif ($numeric eq "") {
                 # Letters without a numeric value still get 10 .. 35, for the
                 # ASCII and fullwidth 'A'-'Z' and 'a'-'z' ranges.
                 if ($ch >= 0x0041 && $ch <= 0x005a) {
                     $numValue = $ch - 0x0037;
                 } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
                     $numValue = $ch - 0x0057;
                 } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
                     $numValue = $ch - 0xff17;
                 } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
                     $numValue = $ch - 0xff37;
                 } else {
                     $numValue = -1;
                 }
             } else {
                 # A numeric field that is not a plain run of digits.
                 $numValue = -2;
             }

             my $upperchar = $upcase ? hex($upcase) - $ch : 0;
             my $lowerchar = $lowcase ? hex($lowcase) - $ch : 0;
             if ($title ne $upcase) {
                 # No title mapping at all means the character is its own
                 # title case.
                 my $titlechar = $title ? hex($title) : $ch;
                 $titlecase .= pack("n2", $ch, $titlechar);
             }

             # Unknown bidi codes are recorded as -1 (before shifting).
             my $direction = exists $dircode_index{$bidir}
                 ? $dircode_index{$bidir}
                 : -1;
             $direction <<= 2;
             # Low two bits: expansion length minus one, from Stage 0.
             $direction += $#{$special{$ch}} if defined $special{$ch};

             my $record = pack("n5", $type, $numValue, $upperchar, $lowerchar,
                               $direction);

             if ($range) {
                 # The previous line opened a "<..., First>" range; this line
                 # must close it, and every code point strictly in between gets
                 # this line's record.
                 die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
                 for my $between ($range + 1 .. $ch - 1) {
                     $info[$between] = $record;
                 }
                 $range = 0;
             } elsif ($name =~ /First>$/) {
                 $range = $ch;
             }
             $info[$ch] = $record;
         }
         close $unicode_fh;
 | 
      
         | 188 |  |  |  
 | 
      
         # Stage 2: Compress the data structures
         #
         # Step 1 replaces every per-character record in @info by one 16-bit
         # word, (index into @charinfo << 7) | type, and concatenates those words
         # into the string $info (two bytes per code point).  Step 2 tries block
         # sizes of 2**3 .. 2**8 code points, deduplicates identical blocks,
         # overlaps blocks whose head matches another block's tail, and keeps
         # the shift with the smallest estimated size.  Step 3 computes, for the
         # winning shift, where each block of $info lives inside the merged
         # string.
         print "\nCompressing data structures";
         $count = 0;
         my $info = "";          # starts as an empty string, appended to below
         my %charhash = ();      # packed record => index into @charinfo
         my @charinfo = ();      # [ numValue, upper, lower, direction ] per entry

         for my $ch (0 .. 0xffff) {
             print "." unless $count++ % 0x1000;
             # Code points the attribute file never mentioned: type 0, no
             # numeric value, no case mappings, direction -1 << 2.
             $info[$ch] = pack("n5", 0, -1, 0, 0, -4) unless defined $info[$ch];

             my ($type, $numVal, $upper, $lower, $direction) = unpack("n5", $info[$ch]);
             # The lookup key is the whole record, type word included, so two
             # records differing only in type get separate @charinfo entries.
             if (! exists $charhash{$info[$ch]}) {
                 push @charinfo, [ $numVal, $upper, $lower, $direction ];
                 $charhash{$info[$ch]} = $#charinfo;
             }
             $info .= pack("n", ($charhash{$info[$ch]} << 7) | $type);
         }

         my $charlen = @charinfo;
         my $bestshift;
         my $bestest = 1000000;
         my $bestblkstr;
         # The index is shifted left by 7 inside a 16-bit word, which leaves
         # 9 bits for it.
         die "Too many unique character entries: $charlen\n" if $charlen > 512;
         print "\nUnique character entries: $charlen\n";

         for my $i (3 .. 8) {
             my $blksize = 1 << $i;
             my %blocks = ();
             my @blkarray = ();
             my ($j, $k);
             print "shift: $i";

             # Collect the distinct blocks of $blksize code points, in order of
             # first appearance.
             for ($j = 0; $j < 0x10000; $j += $blksize) {
                 my $blkkey = substr $info, 2 * $j, 2 * $blksize;
                 if (! exists $blocks{$blkkey}) {
                     push @blkarray, $blkkey;
                     $blocks{$blkkey} = $#blkarray;
                 }
             }
             my $blknum = @blkarray;
             my $blocklen = $blknum * $blksize;
             printf " before %5d", $blocklen;

             # Now we try to pack the blkarray as tight as possible by finding
             # matching heads and tails.  $j is the overlap length in code
             # points, tried from the longest possible down to 1.
             for ($j = $blksize - 1; $j > 0; $j--) {
                 # tail string of length $j => list of block indices ending in it
                 my %tails = ();
                 for $k (0 .. $#blkarray) {
                     next unless defined $blkarray[$k];
                     my $len = length $blkarray[$k];
                     my $tail = substr $blkarray[$k], $len - $j * 2;
                     if (exists $tails{$tail}) {
                         push @{$tails{$tail}}, $k;
                     } else {
                         $tails{$tail} = [ $k ];
                     }
                 }

                 # tails are calculated, now calculate the heads and merge.
               BLOCK:
                 for $k (0 .. $#blkarray) {
                     next unless defined $blkarray[$k];
                     my $tomerge = $k;
                     while (1) {
                         my $head = substr($blkarray[$tomerge], 0, $j * 2);
                         my $entry = $tails{$head};
                         next BLOCK unless defined $entry;

                         my $other = shift @{$entry};
                         if ($other == $tomerge) {
                             # A block cannot be merged with itself: take the
                             # next candidate if there is one, otherwise put it
                             # back and give up on this block.
                             if (@{$entry}) {
                                 push @{$entry}, $other;
                                 $other = shift @{$entry};
                             } else {
                                 push @{$entry}, $other;
                                 next BLOCK;
                             }
                         }
                         if (@{$entry} == 0) {
                             delete $tails{$head};
                         }

                         # a match was found: $other followed by the part of
                         # $tomerge that does not overlap.
                         my $merge = $blkarray[$other]
                             . substr($blkarray[$tomerge], $j * 2);
                         $blocklen -= $j;
                         $blknum--;

                         if ($other < $tomerge) {
                             # Keep the result in the lower slot and re-point
                             # the tail entry of $tomerge at it.
                             $blkarray[$tomerge] = undef;
                             $blkarray[$other] = $merge;
                             my $len = length $merge;
                             my $tail = substr $merge, $len - $j * 2;
                             $tails{$tail} = [ map { $_ == $tomerge ? $other : $_ }
                                               @{$tails{$tail}} ];
                             next BLOCK;
                         }
                         # Otherwise the result stays in $tomerge, whose head has
                         # changed, so the loop tries to extend it further.
                         $blkarray[$tomerge] = $merge;
                         $blkarray[$other] = undef;
                     }
                 }
             }
             my $blockstr;
             for $k (0 .. $#blkarray) {
                 $blockstr .= $blkarray[$k] if defined $blkarray[$k];
             }

             # Sanity check: the running length must match what was built.
             die "Unexpected $blocklen" if length($blockstr) != 2 * $blocklen;
             # Two bytes per data word plus two bytes per block table entry.
             my $estimate = 2 * $blocklen + (0x20000 >> $i);

             printf " after merge %5d: %6d bytes\n", $blocklen, $estimate;
             if ($estimate < $bestest) {
                 $bestest = $estimate;
                 $bestshift = $i;
                 $bestblkstr = $blockstr;
             }
         }

         # For the winning shift, find each block of $info inside the merged
         # string and store its offset relative to the block's first code point.
         my @blocks;
         my $blksize = 1 << $bestshift;
         for (my $j = 0; $j < 0x10000; $j += $blksize) {
             my $blkkey = substr $info, 2 * $j, 2 * $blksize;
             my $index = index $bestblkstr, $blkkey;
             # Only even byte offsets line up with 16-bit words; -1 (not found)
             # is odd as well and is caught inside the loop.
             while ($index & 1) {
                 die "not found: $j" if $index == -1;
                 $index = index $bestblkstr, $blkkey, $index + 1;
             }
             push @blocks, ($index / 2 - $j) & 0xffff;
         }
 | 
      
         | 319 |  |  |  
 | 
      
         # Phase 3: Generate the file
         #
         # Before writing anything, refuse to continue when either table is
         # larger than 0xffff / 3.
         # NOTE(review): the messages suggest this guards a 65535-byte limit on
         # UTF-8 encoded string constants at up to 3 bytes per character --
         # confirm.  length($bestblkstr) is compared as-is, i.e. at two bytes
         # per packed data word.
         if (@blocks > 0xffff / 3) {
             die "UTF-8 limit of blocks may be exceeded: " . scalar(@blocks) . "\n";
         }
         if (length($bestblkstr) > 0xffff / 3) {
             die "UTF-8 limit of data may be exceeded: " . length($bestblkstr) . "\n";
         }
 | 
      
         | 325 |  |  | {
 | 
      
         | 326 |  |  |     print "Generating $ARGV[2] with shift of $bestshift";
 | 
      
         | 327 |  |  |     my ($i, $j);
 | 
      
         | 328 |  |  |  
 | 
      
         | 329 |  |  |     open OUTPUT, "> $ARGV[2]" or die "Failed creating output file: $!\n";
 | 
      
         | 330 |  |  |     print OUTPUT <<EOF;
 | 
      
         | 331 |  |  | /* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
 | 
      
         | 332 |  |  |    Copyright (C) 2002 Free Software Foundation, Inc.
 | 
      
         | 333 |  |  |    *** This file is generated by scripts/unicode-muncher.pl ***
 | 
      
         | 334 |  |  |  
 | 
      
         | 335 |  |  | This file is part of GNU Classpath.
 | 
      
         | 336 |  |  |  
 | 
      
         | 337 |  |  | GNU Classpath is free software; you can redistribute it and/or modify
 | 
      
         | 338 |  |  | it under the terms of the GNU General Public License as published by
 | 
      
         | 339 |  |  | the Free Software Foundation; either version 2, or (at your option)
 | 
      
         | 340 |  |  | any later version.
 | 
      
         | 341 |  |  |  
 | 
      
         | 342 |  |  | GNU Classpath is distributed in the hope that it will be useful, but
 | 
      
         | 343 |  |  | WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
      
         | 344 |  |  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | 
      
         | 345 |  |  | General Public License for more details.
 | 
      
         | 346 |  |  |  
 | 
      
         | 347 |  |  | You should have received a copy of the GNU General Public License
 | 
      
         | 348 |  |  | along with GNU Classpath; see the file COPYING.  If not, write to the
 | 
      
         | 349 |  |  | Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 | 
      
         | 350 |  |  | 02110-1301 USA.
 | 
      
         | 351 |  |  |  
 | 
      
         | 352 |  |  | Linking this library statically or dynamically with other modules is
 | 
      
         | 353 |  |  | making a combined work based on this library.  Thus, the terms and
 | 
      
         | 354 |  |  | conditions of the GNU General Public License cover the whole
 | 
      
         | 355 |  |  | combination.
 | 
      
         | 356 |  |  |  
 | 
      
         | 357 |  |  | As a special exception, the copyright holders of this library give you
 | 
      
         | 358 |  |  | permission to link this library with independent modules to produce an
 | 
      
         | 359 |  |  | executable, regardless of the license terms of these independent
 | 
      
         | 360 |  |  | modules, and to copy and distribute the resulting executable under
 | 
      
         | 361 |  |  | terms of your choice, provided that you also meet, for each linked
 | 
      
         | 362 |  |  | independent module, the terms and conditions of the license of that
 | 
      
         | 363 |  |  | module.  An independent module is a module which is not derived from
 | 
      
         | 364 |  |  | or based on this library.  If you modify this library, you may extend
 | 
      
         | 365 |  |  | this exception to your version of the library, but you are not
 | 
      
         | 366 |  |  | obligated to do so.  If you do not wish to do so, delete this
 | 
      
         | 367 |  |  | exception statement from your version. */
 | 
      
         | 368 |  |  |  
 | 
      
         | 369 |  |  | package gnu.java.lang;
 | 
      
         | 370 |  |  |  
 | 
      
         | 371 |  |  | /**
 | 
      
         | 372 |  |  |  * This contains the info about the unicode characters, that
 | 
      
         | 373 |  |  |  * java.lang.Character needs.  It is generated automatically from
 | 
      
         | 374 |  |  |  * <code>$ARGV[0]</code> and
 | 
      
         | 375 |  |  |  * <code>$ARGV[1]</code>, by some
 | 
      
         | 376 |  |  |  * perl scripts. These Unicode definition files can be found on the
 | 
      
         | 377 |  |  |  * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
 | 
      
         | 378 |  |  |  * JDK 1.4 uses Unicode version 3.0.0.
 | 
      
         | 379 |  |  |  *
 | 
      
         | 380 |  |  |  * The data is stored as string constants, but Character will convert these
 | 
      
         | 381 |  |  |  * Strings to their respective <code>char[]</code> components.  The field
 | 
      
         | 382 |  |  |  * <code>BLOCKS</code> stores the offset of a block of 2<sup>SHIFT</sup>
 | 
      
         | 383 |  |  |  * characters within <code>DATA</code>.  The DATA field, in turn, stores
 | 
      
         | 384 |  |  |  * information about each character in the low order bits, and an offset
 | 
      
         | 385 |  |  |  * into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
 | 
      
         | 386 |  |  |  * <code>NUM_VALUE</code>, and <code>DIRECTION</code>.  Notice that the
 | 
      
         | 387 |  |  |  * attribute tables are much smaller than 0xffff entries; as many characters
 | 
      
         | 388 |  |  |  * in Unicode share common attributes.  The DIRECTION table also contains
 | 
      
         | 389 |  |  |  * a field for detecting characters with multi-character uppercase expansions.
 | 
      
         | 390 |  |  |  * Next, there is a listing for <code>TITLE</code> exceptions (most characters
 | 
      
         | 391 |  |  |  * just have the same title case as upper case).  Finally, there are two
 | 
      
         | 392 |  |  |  * tables for multi-character capitalization, <code>UPPER_SPECIAL</code>
 | 
      
         | 393 |  |  |  * which lists the characters which are special cased, and
 | 
      
         | 394 |  |  |  * <code>UPPER_EXPAND</code>, which lists their expansion.
 | 
      
         | 395 |  |  |  *
 | 
      
         | 396 |  |  |  * \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
 | 
      
         | 397 |  |  |  *         Eric Blake)
 | 
      
         | 398 |  |  |  * \@see Character
 | 
      
         | 399 |  |  |  * \@see String
 | 
      
         | 400 |  |  |  */
 | 
      
         | 401 |  |  | public interface CharData
 | 
      
         | 402 |  |  | {
 | 
      
         | 403 |  |  |   /**
 | 
      
         | 404 |  |  |    * The Unicode definition file that was parsed to build this database.
 | 
      
         | 405 |  |  |    */
 | 
      
         | 406 |  |  |   String SOURCE = \"$ARGV[0]\";
 | 
      
         | 407 |  |  |  
 | 
      
         | 408 |  |  |   /**
 | 
      
         | 409 |  |  |    * The character shift amount to look up the block offset. In other words,
 | 
      
         | 410 |  |  |    * <code>(char) (BLOCKS.value[ch >> SHIFT] + ch)</code> is the index where
 | 
      
         | 411 |  |  |    * <code>ch</code> is described in <code>DATA</code>.
 | 
      
         | 412 |  |  |    */
 | 
      
         | 413 |  |  |   int SHIFT = $bestshift;
 | 
      
         | 414 |  |  |  
 | 
      
         | 415 |  |  |   /**
 | 
      
         | 416 |  |  |    * The mapping of character blocks to their location in <code>DATA</code>.
 | 
      
         | 417 |  |  |    * Each entry has been adjusted so that the 16-bit sum with the desired
 | 
      
         | 418 |  |  |    * character gives the actual index into <code>DATA</code>.
 | 
      
         | 419 |  |  |    */
 | 
      
         | 420 |  |  |   String BLOCKS
 | 
      
         | 421 |  |  | EOF
 | 
      
         | 422 |  |  |  
 | 
      
         | 423 |  |  |     for ($i = 0; $i < @blocks / 11; $i++) {
 | 
      
         | 424 |  |  |         print OUTPUT $i ? "\n    + \"" : "    = \"";
 | 
      
         | 425 |  |  |         for $j (0 .. 10) {
 | 
      
         | 426 |  |  |             last if @blocks <= $i * 11 + $j;
 | 
      
         | 427 |  |  |             my $val = $blocks[$i * 11 + $j];
 | 
      
         | 428 |  |  |             print OUTPUT javaChar($val);
 | 
      
         | 429 |  |  |         }
 | 
      
         | 430 |  |  |         print OUTPUT "\"";
 | 
      
         | 431 |  |  |     }
 | 
      
         | 432 |  |  |  
 | 
      
         | 433 |  |  |     print OUTPUT <<EOF;
 | 
      
         | 434 |  |  | ;
 | 
      
         | 435 |  |  |  
 | 
      
         | 436 |  |  |   /**
 | 
      
         | 437 |  |  |    * Information about each character.  The low order 5 bits form the
 | 
      
         | 438 |  |  |    * character type, the next bit is a flag for non-breaking spaces, and the
 | 
      
         | 439 |  |  |    * next bit is a flag for mirrored directionality.  The high order 9 bits
 | 
      
         | 440 |  |  |    * form the offset into the attribute tables.  Note that this limits the
 | 
      
         | 441 |  |  |    * number of unique character attributes to 512, which is not a problem
 | 
      
         | 442 |  |  |    * as of Unicode version 3.2.0, but may soon become one.
 | 
      
         | 443 |  |  |    */
 | 
      
         | 444 |  |  |   String DATA
 | 
      
         | 445 |  |  | EOF
 | 
      
         | 446 |  |  |  
 | 
      
         # $bestblkstr is consumed two bytes at a time, so it holds
         # length / 2 entries.
         my $len = length($bestblkstr) / 2;

         # Write the DATA table as a chain of Java string literals with at
         # most 11 entries per literal.
         for (my $base = 0; $base < $len; $base += 11) {
             print OUTPUT $base ? "\n    + \"" : "    = \"";
             for my $slot ($base .. $base + 10) {
                 last if $slot >= $len;
                 # Decode one big-endian unsigned 16-bit value.
                 my $word = unpack "n", substr($bestblkstr, $slot * 2, 2);
                 print OUTPUT javaChar($word);
             }
             print OUTPUT "\"";
         }
 | 
      
         | 457 |  |  |  
 | 
      
         | 458 |  |  |     print OUTPUT <<EOF;
 | 
      
         | 459 |  |  | ;
 | 
      
         | 460 |  |  |  
 | 
      
         | 461 |  |  |   /**
 | 
      
         | 462 |  |  |    * This is the attribute table for computing the numeric value of a
 | 
      
         | 463 |  |  |    * character.  The value is -1 if Unicode does not define a value, -2
 | 
      
         | 464 |  |  |    * if the value is not a positive integer, otherwise it is the value.
 | 
      
         | 465 |  |  |    * Note that this is a signed value, but stored as an unsigned char
 | 
      
         | 466 |  |  |    * since this is a String literal.
 | 
      
         | 467 |  |  |    */
 | 
      
         | 468 |  |  |   String NUM_VALUE
 | 
      
         | 469 |  |  | EOF
 | 
      
         | 470 |  |  |  
 | 
      
         # Write field 0 of every @charinfo record as a chain of Java string
         # literals, 11 records per literal.
         $len = @charinfo;
         for my $start (grep { $_ % 11 == 0 } 0 .. $len - 1) {
             print OUTPUT $start == 0 ? "    = \"" : "\n    + \"";

             # Clamp the chunk so the last literal stops at the final record.
             my $stop = $start + 10;
             $stop = $len - 1 if $stop >= $len;

             for my $idx ($start .. $stop) {
                 my $num = $charinfo[$idx][0];
                 print OUTPUT javaChar($num);
             }
             print OUTPUT "\"";
         }
 | 
      
         | 481 |  |  |  
 | 
      
         | 482 |  |  |     print OUTPUT <<EOF;
 | 
      
         | 483 |  |  | ;
 | 
      
         | 484 |  |  |  
 | 
      
         | 485 |  |  |   /**
 | 
      
         | 486 |  |  |    * This is the attribute table for computing the single-character uppercase
 | 
      
         | 487 |  |  |    * representation of a character.  The value is the signed difference
 | 
      
         | 488 |  |  |    * between the character and its uppercase version.  Note that this is
 | 
      
         | 489 |  |  |    * stored as an unsigned char since this is a String literal.  When
 | 
      
         | 490 |  |  |    * capitalizing a String, you must first check if a multi-character uppercase
 | 
      
         | 491 |  |  |    * sequence exists before using this character.
 | 
      
         | 492 |  |  |    */
 | 
      
         | 493 |  |  |   String UPPER
 | 
      
         | 494 |  |  | EOF
 | 
      
         | 495 |  |  |  
 | 
      
         # Write field 1 of every @charinfo record as a chain of Java string
         # literals: one literal per row of up to 11 records.
         $len = @charinfo;
         for (my $row = 0; $row * 11 < $len; ++$row) {
             my $lead = $row ? "\n    + \"" : "    = \"";
             print OUTPUT $lead;
             COLUMN:
             for my $col (0 .. 10) {
                 my $idx = $row * 11 + $col;
                 last COLUMN if $idx >= $len;    # ran off the end of the table
                 my $delta = $charinfo[$idx][1];
                 print OUTPUT javaChar($delta);
             }
             print OUTPUT "\"";
         }
 | 
      
         | 506 |  |  |  
 | 
      
         | 507 |  |  |     print OUTPUT <<EOF;
 | 
      
         | 508 |  |  | ;
 | 
      
         | 509 |  |  |  
 | 
      
         | 510 |  |  |   /**
 | 
      
         | 511 |  |  |    * This is the attribute table for computing the lowercase representation
 | 
      
         | 512 |  |  |    * of a character.  The value is the signed difference between the
 | 
      
         | 513 |  |  |    * character and its lowercase version.  Note that this is stored as an
 | 
      
         | 514 |  |  |    * unsigned char since this is a String literal.
 | 
      
         | 515 |  |  |    */
 | 
      
         | 516 |  |  |   String LOWER
 | 
      
         | 517 |  |  | EOF
 | 
      
         | 518 |  |  |  
 | 
      
         # Write field 2 of every @charinfo record as a chain of Java string
         # literals, 13 records per literal.
         $len = @charinfo;
         for (my $first = 0; $first < $len; $first += 13) {
             print OUTPUT $first ? "\n    + \"" : "    = \"";

             # One past the last record of this chunk, capped at the table size.
             my $past = $first + 13;
             $past = $len if $past > $len;

             for (my $idx = $first; $idx < $past; $idx++) {
                 my $delta = $charinfo[$idx][2];
                 print OUTPUT javaChar($delta);
             }
             print OUTPUT "\"";
         }
 | 
      
         | 529 |  |  |  
 | 
      
         | 530 |  |  |     print OUTPUT <<EOF;
 | 
      
         | 531 |  |  | ;
 | 
      
         | 532 |  |  |  
 | 
      
         | 533 |  |  |   /**
 | 
      
         | 534 |  |  |    * This is the attribute table for computing the directionality class
 | 
      
         | 535 |  |  |    * of a character, as well as a marker of characters with a multi-character
 | 
      
         | 536 |  |  |    * capitalization.  The direction is taken by performing a signed shift
 | 
      
         | 537 |  |  |    * right by 2 (where a result of -1 means an unknown direction, such as
 | 
      
         | 538 |  |  |    * for undefined characters). The lower 2 bits form a count of the
 | 
      
         | 539 |  |  |    * additional characters that will be added to a String when performing
 | 
      
         | 540 |  |  |    * multi-character uppercase expansion. This count is also used, along with
 | 
      
         | 541 |  |  |    * the offset in UPPER_SPECIAL, to determine how much of UPPER_EXPAND to use
 | 
      
         | 542 |  |  |    * when performing the case conversion. Note that this information is stored
 | 
      
         | 543 |  |  |    * as an unsigned char since this is a String literal.
 | 
      
         | 544 |  |  |    */
 | 
      
         | 545 |  |  |   String DIRECTION
 | 
      
         | 546 |  |  | EOF
 | 
      
         | 547 |  |  |  
 | 
      
         # Write field 3 of every @charinfo record as a chain of Java string
         # literals, 17 records per literal.
         $len = @charinfo;
         for (my $row = 0; $row * 17 < $len; ++$row) {
             if ($row) {
                 print OUTPUT "\n    + \"";
             }
             else {
                 print OUTPUT "    = \"";
             }
             for my $col (0 .. 16) {
                 my $idx = $row * 17 + $col;
                 last unless $idx < $len;
                 my $dir = $charinfo[$idx][3];
                 print OUTPUT javaChar($dir);
             }
             print OUTPUT "\"";
         }
 | 
      
         | 558 |  |  |  
 | 
      
         | 559 |  |  |     print OUTPUT <<EOF;
 | 
      
         | 560 |  |  | ;
 | 
      
         | 561 |  |  |  
 | 
      
         | 562 |  |  |   /**
 | 
      
         | 563 |  |  |    * This is the listing of titlecase special cases (all other characters
 | 
      
         | 564 |  |  |    * can use <code>UPPER</code> to determine their titlecase).  The listing
 | 
      
         | 565 |  |  |    * is a sorted sequence of character pairs; converting the first character
 | 
      
         | 566 |  |  |    * of the pair to titlecase produces the second character.
 | 
      
         | 567 |  |  |    */
 | 
      
         | 568 |  |  |   String TITLE
 | 
      
         | 569 |  |  | EOF
 | 
      
         | 570 |  |  |  
 | 
      
         # $titlecase is consumed two bytes at a time; write its entries as a
         # chain of Java string literals, 11 entries per literal.
         $len = length($titlecase) / 2;
         for (my $base = 0; $base < $len; $base += 11) {
             print OUTPUT $base ? "\n    + \"" : "    = \"";
             for my $slot ($base .. $base + 10) {
                 last unless $slot < $len;
                 my $byte_pos = 2 * $slot;
                 # Decode one big-endian unsigned 16-bit value.
                 my $word = unpack "n", substr($titlecase, $byte_pos, 2);
                 print OUTPUT javaChar($word);
             }
             print OUTPUT "\"";
         }
 | 
      
         | 581 |  |  |  
 | 
      
         | 582 |  |  |     print OUTPUT <<EOF;
 | 
      
         | 583 |  |  | ;
 | 
      
         | 584 |  |  |  
 | 
      
         | 585 |  |  |   /**
 | 
      
         | 586 |  |  |    * This is a listing of characters with multi-character uppercase sequences.
 | 
      
         | 587 |  |  |    * A character appears in this list exactly when it has a non-zero entry
 | 
      
         | 588 |  |  |    * in the low-order 2-bit field of DIRECTION.  The listing is a sorted
 | 
      
         | 589 |  |  |    * sequence of pairs (hence a binary search on the even elements is an
 | 
      
         | 590 |  |  |    * efficient way to lookup a character). The first element of a pair is the
 | 
      
         | 591 |  |  |    * character with the expansion, and the second is the index into
 | 
      
         | 592 |  |  |    * UPPER_EXPAND where the expansion begins. Use the 2-bit field of
 | 
      
         | 593 |  |  |    * DIRECTION to determine where the expansion ends.
 | 
      
         | 594 |  |  |    */
 | 
      
         | 595 |  |  |   String UPPER_SPECIAL
 | 
      
         | 596 |  |  | EOF
 | 
      
         | 597 |  |  |  
 | 
      
         # Keys of %special in ascending numeric order.  For each key, write the
         # key followed by the running offset of its sequence, five pairs per
         # Java string literal.  The sequences themselves are accumulated into
         # $expansion as packed big-endian 16-bit values.
         my @list = sort {$a <=> $b} keys %special;
         my $expansion = "";
         my $offset = 0;
         $len = @list;
         for (my $first = 0; $first < $len; $first += 5) {
             print OUTPUT $first ? "\n    + \"" : "    = \"";
             for my $idx ($first .. $first + 4) {
                 last if $idx >= $len;
                 my $ch = $list[$idx];
                 print OUTPUT javaChar($ch);
                 print OUTPUT javaChar($offset);

                 # Advance the offset by the length of this key's sequence
                 # and append the sequence to the packed expansion string.
                 my $sequence = $special{$ch};
                 $offset += @$sequence;
                 $expansion .= pack "n*", @$sequence;
             }
             print OUTPUT "\"";
         }
 | 
      
         | 614 |  |  |  
 | 
      
         | 615 |  |  |     print OUTPUT <<EOF;
 | 
      
         | 616 |  |  | ;
 | 
      
         | 617 |  |  |  
 | 
      
         | 618 |  |  |   /**
 | 
      
         | 619 |  |  |    * This is the listing of special case multi-character uppercase sequences.
 | 
      
         | 620 |  |  |    * Characters listed in UPPER_SPECIAL index into this table to find their
 | 
      
         | 621 |  |  |    * uppercase expansion. Remember that you must also perform special-casing
 | 
      
         | 622 |  |  |    * on two single-character sequences in the Turkish locale, which are not
 | 
      
         | 623 |  |  |    * covered here in CharData.
 | 
      
         | 624 |  |  |    */
 | 
      
         | 625 |  |  |   String UPPER_EXPAND
 | 
      
         | 626 |  |  | EOF
 | 
      
         | 627 |  |  |  
 | 
      
         # $expansion is consumed two bytes at a time; write its entries as a
         # chain of Java string literals, 11 entries per literal.
         $len = length($expansion) / 2;
         for (my $row = 0; $row * 11 < $len; ++$row) {
             print OUTPUT $row ? "\n    + \"" : "    = \"";
             for my $col (0 .. 10) {
                 my $slot = $row * 11 + $col;
                 last if $slot >= $len;
                 # Decode one big-endian unsigned 16-bit value.
                 my $word = unpack "n", substr($expansion, 2 * $slot, 2);
                 print OUTPUT javaChar($word);
             }
             print OUTPUT "\"";
         }
 | 
      
         | 638 |  |  |  
 | 
      
         # Terminate the last string literal and emit the closing brace of the
         # generated Java source.
         print OUTPUT ";\n}\n";

         # Buffered write errors (for example a full disk) are only reported
         # when the handle is closed, so a failed close must not pass silently:
         # the generated file would be truncated.
         close OUTPUT or die "Error closing output file: $!";
 | 
      
         | 641 |  |  | }
 | 
      
         # Report completion on the currently selected output handle
         # (STDOUT unless select() was called earlier in the script).
         print "\nDone.\n";
 |