#!/opt/bin/perl # the generated tables mostly have NOT been checked so far! use v5.8.2; use utf8; use Encode; use Encode::JP; use Encode::CN; use Encode::KR; use Encode::HanExtra; use Encode::JIS2K; my $gen; $TO_UNICODE = 0; # also generate to_unicode tables sub linear { my ($a, $l, $h, $b) = @_; for ($l .. $h) { return unless defined $a->[$_] && $a->[$_] == $_ + ($b - $l); } 1; } sub wrap { my $res = " "; $res .= "$1\n " while $_[0] =~ /\G(.{90}\S*?)\s+/gc; $res .= "$1" if $_[0] =~ /\G(.*)$/; $res; } my $last_tab_full; # hack sub gentab { my ($enc, $l, $h, $f, $rep) = @_; $last_tab_full = 0; " = {\n" . (wrap join ", ", map +(sprintf "0x$f", defined $enc->[$_] ? $enc->[$_] : $last_tab_full++ * 0 + $rep ), $l..$h) . "\n};\n"; } sub gen { my ($enc, $base, $ch) = @_; my ($fun, $tab); my (@t1, @t2); for (255, 159, 127, 126, 125) { if (linear $enc, 0, $_, 0) { undef $enc->[$_] for 0..$_; $fun .= sprintf "if (· <= 0x%04x) return ·;\n", $_; } } for (126, 127, 128) { if (linear $enc, $_, 159, $_) { undef $enc->[$_] for $_..159; $fun .= sprintf "if (0x%04x <= · && · <= 0x%04x) return ·;\n", $_, 159; } } for (126, 127) { next unless defined $enc->[$_]; $fun .= sprintf "if (· == 0x%04x) return 0x%04x;\n", $_, delete $enc->[$_]; } my @map; my @map2; for (0 .. $#$enc) { if (defined $enc->[$_]) { $map[$_] = $enc->[$_]; $map2[$_ >> 8]++; } } for my $p (0..255) { if ($map2[$p]) { my $b = $p << 8; my ($l, $h); for my $i (0..255) { $l = $i, last if defined $map[$b + $i]; } for my $i (0..255) { $h = 255 - $i, last if defined $map[$b + 255 - $i]; } if ($map2[$p] <= 5) { for ($l .. $h) { next unless defined $enc->[$b + $_]; $fun .= sprintf "if (· == 0x%04x) return 0x%04x;\n", $b + $_, $enc->[$b + $_]; } } elsif (linear $enc, $b + $l, $b + $h, $enc->[$b + $l]) { my $dif = $enc->[$b + $l] - ($b + $l); $dif = $dif < 0 ? sprintf "- 0x%04x", -$dif : sprintf "+ 0x%04x", $dif; $fun .= sprintf "if (· <= 0x%04x && 0x%04x <= ·) return · %s;\n", $b + $l, $b + $h, $dif; } elsif ($map2[$p] <= 5) { # defunct $fun .= "switch (·)\n {\n"; for ($l .. $h) { next unless defined $enc->[$b + $_]; $fun .= sprintf " case 0x%04x: return 0x%04x;\n", $b + $_, $enc->[$b + $_]; } $fun .= " }\n"; } else { my ($i, $t, $f) = (0, "uint8_t", "%02x"); for ($l .. $h) { if ($enc->[$b + $_] > 255) { ($i, $t, $f) = (1, "uint16_t", "%04x"); last; } } $i ? push @t2, [$p, $l, $h, $t, $f] : push @t1, [$b, $l, $h, $t, $f]; } } } if (@t2 <= 3) { push @t1, @t2; @t2 = (); } for (@t1) { my ($b, $l, $h, $t, $f) = @$_; my $rep = $b + $l == 0 ? 128 : 0; $tab .= "static const $t $base\_$b\[]" . gentab $enc, $b + $l, $b + $h, $f; $fun .= sprintf "if (0x%04x <= · && · <= 0x%04x)\n" . " return %s$base\_$b\[· - 0x%04x];\n", $b + $l, $b + $h, ($last_tab_full ? sprintf "$base\_$b\[· - 0x%04x] == $rep ? NOCHAR : ", $b + $l : ""), $b + $l; } if (@t2) { my ($min, $max) = (255, 0); my ($l, $h) = (255, 0); for (@t2) { my ($p, $L, $H, $t, $f) = @$_; $max = $p if $p > $max; $min = $p if $p < $min; $l = $L if $L < $l; $h = $H if $H > $h; } $fun .= "uint8_t l = ·;\n"; $fun .= "uint16_t h = · >> 8;\n"; if ($h - $l < 200) { my $d = $h - $l + 1; my @enc; for $p ($min .. $max) { for $i ($l .. $h) { $enc[($p - $min) * $d + $i - $l] = $enc->[$p * 256 + $i]; } } $tab .= "static const uint16_t $base\_m[]" . gentab \@enc, 0, $#enc, "%04x"; if ($last_tab_full) { $fun .= sprintf "if (0x%02x <= h && h <= 0x%02x\n" . " && 0x%02x <= l && l <= 0x%02x)\n" . " return $base\_m\[h * 0x%02x + l - 0x%04x]\n" . " ? $base\_m\[h * 0x%02x + l - 0x%04x]\n" . " : NOCHAR;\n", $min, $max, $l, $h, ($d, $min * $d + $l) x 2; } else { $fun .= sprintf "if (0x%02x <= h && h <= 0x%02x\n" . " && 0x%02x <= l && l <= 0x%02x)\n" . " return $base\_m\[h * 0x%02x + l - 0x%04x];\n", $min, $max, $l, $h, $d, $min * $d + $l; } } else { my @tab = (0) x ($max - $min); for (@t2) { my ($p, undef, undef, $t, $f) = @$_; $tab .= "static const $t $base\_$p\[]" . gentab $enc, $p * 256 + $l, $p * 256 + $h, $f, 0; $tab[$p - $min] = "$base\_$p"; } $tab .= "const uint16_t *$base\_i[] = {\n" . (wrap join ", ", @tab) . "\n};\n\n"; $fun .= sprintf "if (0x%02x <= h && h <= 0x%02x\n" . " && 0x%02x <= l && l <= 0x%02x\n" . " && $base\_i[h - 0x%02x])\n" . " return $base\_i\[h - 0x%02x][l - 0x%02x]\n" . " ? $base\_i\[h - 0x%02x][l - 0x%02x]\n" . " : NOCHAR;\n", $min, $max, $l, $h, $min, ($min, $l) x 2; } } $fun .= "return NOCHAR;\n"; $fun =~ s/·/$ch/g; ($tab, $fun); } while () { my ($group, $base, $cs, $type) = split /\s+/; my @enc1; my @enc2; for (0 .. 65535) { my $enc = encode $cs, (chr $_), Encode::FB_QUIET; if (length $enc) { my $code = hex unpack "H*", $enc; $enc1[$_] = $code; $enc2[$code] = $_ unless defined $enc2[$code]; } } my ($tab1, $fun1); my ($tab2, $fun2); ($tab1, $fun1) = gen \@enc1, "$base\_f", "unicode"; ($tab2, $fun2) = gen \@enc2, "$base\_t", "enc" if $TO_UNICODE; $fun1 =~ s/^/ /gm; $fun2 =~ s/^/ /gm; print "$base\n"; open OUT, ">table/$base.h" or die; print OUT <