URL
https://opencores.org/ocsvn/openrisc/openrisc/trunk
Subversion Repositories openrisc
[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libjava/] [scripts/] [unicode-decomp.pl] - Rev 841
Go to most recent revision | Compare with Previous | Blame | View Log
#!/usr/bin/perl -w
# unicode-decomp.pl - script to generate database for java.text.Collator
# Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
#
# This file is part of libjava.
#
# This software is copyrighted work licensed under the terms of the
# Libjava License. Please consult the file "LIBJAVA_LICENSE" for
# details.

# Code for reading UnicodeData.txt and generating the character
# decomposition tables in java-chardecomp.h. For now, the relevant
# Unicode definition files are found in libjava/gnu/gcj/convert/.
#
# Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <decomp.h>
#   where <UnicodeData.txt> is obtained from www.unicode.org (named
#   UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <decomp.h>
#   is the final location of include/java-chardecomp.h.
#   As of JDK 1.4, use Unicode version 3.0.0 for best results.
#
# If this exits with nonzero status, then you must investigate the
# cause of the problem.
# Diagnostics and other information to stderr.
# With -n, the files are not created (output goes to /dev/null and
# <decomp.h> may be omitted), but all processing still occurs.

use strict;
use warnings;

# These map characters to their decompositions. Keys are numeric code
# points; values are the decomposition packed as 16-bit big-endian
# units (pack "n*").
my %canonical_decomposition = ();
my %full_decomposition = ();

# Handle `-n' and validate the argument count. The count is checked
# before the output slot is filled in, so that a lone `-n' is reported
# as a usage error instead of padding @ARGV with an undefined input name.
my $dry_run = 0;
if (@ARGV && $ARGV[0] eq '-n') {
    shift @ARGV;
    $dry_run = 1;
}

die "Usage: $0 [-n] <UnicodeData.txt> <java-chardecomp.h>\n"
    unless @ARGV == 2 || ($dry_run && @ARGV == 1);

# The generation phase writes to $ARGV[1]; redirect it for a dry run.
$ARGV[1] = '/dev/null' if $dry_run;

open(my $unicode_fh, '<', $ARGV[0])
    or die "Can't open Unicode attribute file: $!\n";

# Encode one code point as UTF-16 code units: the value itself up to
# U+FFFF, a surrogate pair above that. Without this, pack "n" would
# silently wrap larger values to their low 16 bits.
# NOTE(review): assumes the consumer of the table treats each 16-bit
# value as a UTF-16 unit (the key type is jchar) -- confirm.
sub utf16_code_units {
    my ($code_point) = @_;
    return ($code_point) if $code_point <= 0xFFFF;
    my $offset = $code_point - 0x10000;
    return (0xD800 + ($offset >> 10), 0xDC00 + ($offset & 0x3FF));
}

# Process the Unicode file.
$| = 1;    # autoflush the currently selected output handle
my $count = 0;
print STDERR "Parsing attributes file";
while (my $line = <$unicode_fh>) {
    # One progress dot per 1000 input lines.
    print STDERR "." unless $count++ % 1000;
    chomp $line;
    $line =~ s/\r//g;

    # Field 0 is the code point (hex); field 5 is the decomposition.
    my ($ch, $decomp) = (split /;/, $line)[0, 5];

    # Skip lines without a hexadecimal code point rather than letting
    # hex() turn them into an entry for character 0.
    unless (defined $ch && $ch =~ /\A[0-9A-Fa-f]+\z/) {
        warn "\n$ARGV[0]:$.: skipping malformed line\n" if $line =~ /\S/;
        next;
    }
    next unless defined $decomp && $decomp ne '';

    $ch = hex $ch;

    # A leading <tag> marks a non-canonical mapping; such entries go
    # into the "full" table. The tag itself is not stored.
    my $is_full = 0;
    my @units = ();
    for my $field (split ' ', $decomp) {
        if ($field =~ /\A<.*>\z/) {
            $is_full = 1;
            next;
        }
        push @units, utf16_code_units(hex $field);
    }

    my $packed = pack "n*", @units;
    if ($is_full) {
        $full_decomposition{$ch} = $packed;
    }
    else {
        $canonical_decomposition{$ch} = $packed;
    }
}
close($unicode_fh)
    or die "Can't close Unicode attribute file: $!\n";

# Now generate decomposition tables.
# Output handle for the generated header. It is a file-scoped lexical
# so that the table-writing subroutines below can print to it.
my $decomp_fh;
open($decomp_fh, '>', $ARGV[1])
    or die "Can't open output file: $!\n";
print STDERR "\nGenerating tables\n";

print {$decomp_fh} <<EOF;
// java-chardecomp.h - Decomposition character tables -*- c++ -*-

#ifndef __JAVA_CHARDECOMP_H__
#define __JAVA_CHARDECOMP_H__

// These tables are automatically generated by the $0
// script. DO NOT EDIT the tables. Instead, fix the script
// and run it again.

// This file should only be included by natCollator.cc

struct decomp_entry
{
  jchar key;
  const char *value;
};

EOF

write_decompositions();

print {$decomp_fh} "#endif /* __JAVA_CHARDECOMP_H__ */\n";

# Buffered write errors (e.g. a full disk) only surface when the handle
# is closed, so a failed close must produce a nonzero exit status.
close($decomp_fh)
    or die "Can't close output file: $!\n";

print STDERR "Done\n";
exit;

# Write a single decomposition table to the output header.
#
#   $name     - prefix of the generated array, which is emitted as
#               "${name}_decomposition".
#   $is_canon - accepted so existing call sites keep working; unused.
#   %table    - maps a character code to its decomposition, stored as
#               a string of 16-bit big-endian values (pack "n*").
#
# Only keys in the range 0 .. 0xffff are written, in ascending order.
sub write_single_decomposition {
    my ($name, $is_canon, %table) = @_;

    print {$decomp_fh}
        "static const decomp_entry ${name}_decomposition[] =\n{\n";

    # We represent the expansion as a series of bytes, two per 16-bit
    # value with the high byte first. This is ugly, but relatively
    # space-efficient. Most expansions are short, but there are a
    # few that are very long (e.g. \uFDFA). This means that if we
    # chose a fixed-space representation we would waste a lot of
    # space.
    # NOTE(review): the original comment described the series as
    # "terminated with a double nul", but no terminator bytes are
    # written here; only the C string literal's implicit trailing NUL
    # follows the data. Confirm how natCollator.cc finds the end.
    my @entries;
    for my $key (grep { defined $table{$_} } 0 .. 0xffff) {
        my $bytes = join '',
            map { sprintf "\\x%02x\\x%02x", $_ >> 8, $_ & 0xff }
            unpack "n*", $table{$key};
        push @entries, sprintf " { 0x%04x, \"%s\" }", $key, $bytes;
    }

    # Entries are separated by ",\n"; the last one has no trailing comma.
    print {$decomp_fh} join(",\n", @entries), "\n};\n\n";
}

# Write both decomposition tables: canonical first, then full.
sub write_decompositions {
    write_single_decomposition('canonical', 1, %canonical_decomposition);
    write_single_decomposition('full', 0, %full_decomposition);
}
Go to most recent revision | Compare with Previous | Blame | View Log