mirror of
https://bitbucket.org/smil3y/kde-l10n.git
synced 2025-02-23 18:42:54 +00:00
328 lines
9.2 KiB
Perl
Executable file
328 lines
9.2 KiB
Perl
Executable file
#!/usr/bin/env perl
|
||
# Resolve Serbian hybridized Cyrillic Ijekavian/Ekavian text.
|
||
#
|
||
# Hybridized Serbian Cyrillic text may contain alternatives directives
|
||
# by script (~@) and by dialect (~#):
|
||
#
|
||
# Поређано ~@/азбучним/abecednim/ редоследом.
|
||
# Можда и ~#/смеју/смију/ да се појаве.
|
||
#
|
||
# which are resolved into one of the alternatives depending on target
|
||
# dialect and script combination.
|
||
#
|
||
# Alternatives directives by script are needed only when
|
||
# direct Cyrillic to Latin transliteration is not sufficient;
|
||
# for Latin combinations, text outside of alternatives by script
|
||
# is automatically transliterated.
|
||
#
|
||
# Alternatives by dialect should be rare, as dialect hybridization is normally
|
||
# performed by inserting jat-reflex ticks (›, ‹, ◃, ▹) into Ijekavian text:
|
||
#
|
||
# Пром›јене ће одмах бити заб‹иљежене.
|
||
#
|
||
# Text with jat-reflex ticks is resolved to clean Ijekavian by simply
|
||
# removing the marks, and to Ekavian by applying a mapping table.
|
||
#
|
||
# Text is input through standard output and output to standard output.
|
||
# Input text must be UTF-8 encoded, and output is UTF-8 as well.
|
||
#
|
||
# Chusslove Illich <caslav.ilic@gmx.net>
|
||
|
||
use strict;
|
||
use warnings;
|
||
use utf8;
|
||
|
||
binmode(STDIN, ":utf8");
|
||
binmode(STDOUT, ":utf8");
|
||
|
||
$0 =~ s/.*\///;
|
||
sub error { die "$0: @_\n"; }
|
||
sub warning { warn "$0: @_\n"; }
|
||
|
||
sub show_usage
|
||
{
|
||
die "Usage: $0 [ec|el|ic|il]\n";
|
||
}
|
||
|
||
# Resolve alternatives directives in text,
|
||
# given the alternative head, selected alternative (1-based)
|
||
# and total number of alternatives per directive.
|
||
sub resalts
|
||
{
|
||
my ($text, $althead, $select, $total) = @_;
|
||
|
||
my $althlen = length($althead);
|
||
|
||
my $rtext;
|
||
my $malformed = 0;
|
||
my $p = -1;
|
||
my $pp;
|
||
my $errtext;
|
||
while (1) {
|
||
$pp = $p + 1;
|
||
$p = index($text, $althead, $pp);
|
||
if ($p < 0) {
|
||
$rtext .= substr($text, $pp);
|
||
last;
|
||
}
|
||
my $ps = $p;
|
||
|
||
# Append segment prior to alternatives directive to the result.
|
||
$rtext .= substr($text, $pp, $p - $pp);
|
||
$errtext = substr($text, $p, $p + 30); # text segment for error report
|
||
|
||
# Must have at least 2 characters after the head.
|
||
if (length($text) < $p + $althlen + 2) {
|
||
$malformed = 1;
|
||
last;
|
||
}
|
||
|
||
# Read the separating character.
|
||
$p += $althlen;
|
||
my $sep = substr($text, $p, 1);
|
||
|
||
# Parse requested number of inserts,
|
||
# choose the one with matching index for the result.
|
||
my @alts;
|
||
for (my $i = 0; $i < $total; ++$i) {
|
||
$pp = $p + 1;
|
||
$p = index($text, $sep, $pp);
|
||
# Must have exactly the given total number of alternatives.
|
||
if ($p < 0) {
|
||
$malformed = 1;
|
||
last;
|
||
}
|
||
push(@alts, substr($text, $pp, $p - $pp));
|
||
}
|
||
last if $malformed;
|
||
|
||
# Replace the alternative if admissible, or leave directive untouched.
|
||
my $isel = $select - 1;
|
||
if ($isel < @alts) {
|
||
$rtext .= $alts[$isel];
|
||
} else {
|
||
$rtext .= substr($text, $ps, $p + 1 - $ps);
|
||
}
|
||
}
|
||
|
||
if ($malformed) {
|
||
$rtext = $text;
|
||
warning("Malformed alternatives directive at '$errtext', skipped.");
|
||
}
|
||
|
||
return $rtext;
|
||
}
|
||
|
||
# Transliteration table Serbian Cyrillic->Latin.
|
||
my %map_ctol = (
|
||
'а' => 'a', 'б' => 'b', 'в' => 'v', 'г' => 'g', 'д' => 'd', 'ђ' => 'đ',
|
||
'е' => 'e', 'ж' => 'ž', 'з' => 'z', 'и' => 'i', 'ј' => 'j', 'к' => 'k',
|
||
'л' => 'l', 'љ' => 'lj','м' => 'm', 'н' => 'n', 'њ' => 'nj','о' => 'o',
|
||
'п' => 'p', 'р' => 'r', 'с' => 's', 'т' => 't', 'ћ' => 'ć', 'у' => 'u',
|
||
'ф' => 'f', 'х' => 'h', 'ц' => 'c', 'ч' => 'č', 'џ' => 'dž','ш' => 'š',
|
||
'А' => 'A', 'Б' => 'B', 'В' => 'V', 'Г' => 'G', 'Д' => 'D', 'Ђ' => 'Đ',
|
||
'Е' => 'E', 'Ж' => 'Ž', 'З' => 'Z', 'И' => 'I', 'Ј' => 'J', 'К' => 'K',
|
||
'Л' => 'L', 'Љ' => 'Lj','М' => 'M', 'Н' => 'N', 'Њ' => 'Nj','О' => 'O',
|
||
'П' => 'P', 'Р' => 'R', 'С' => 'S', 'Т' => 'T', 'Ћ' => 'Ć', 'У' => 'U',
|
||
'Ф' => 'F', 'Х' => 'H', 'Ц' => 'C', 'Ч' => 'Č', 'Џ' => 'Dž','Ш' => 'Š',
|
||
# accented NFC:
|
||
'ѐ' => 'è', 'ѝ' => 'ì', 'ӣ' => 'ī', 'ӯ' => 'ū',
|
||
'Ѐ' => 'È', 'Ѝ' => 'Ì', 'Ӣ' => 'Ī', 'Ӯ' => 'Ū',
|
||
# frequent accented from NFD to NFC (keys now 2-char):
|
||
'а̂' => 'â', 'о̂' => 'ô', 'а̑' => 'ȃ', 'о̑' => 'ȏ',
|
||
);
|
||
|
||
# Transliterate Cyrillic text to Latin.
|
||
sub ctol
|
||
{
|
||
my ($text) = @_;
|
||
my $tlen = length($text);
|
||
my $ntext = "";
|
||
for (my $i = 0; $i < $tlen; ++$i) {
|
||
my $c = substr($text, $i, 1);
|
||
my $c2 = substr($text, $i, 2);
|
||
my $r = ($map_ctol{$c2} or $map_ctol{$c});
|
||
if ($r) {
|
||
my $cp = $i + 1 < $tlen ? substr($text, $i + 1, 1) : "";
|
||
my $cn = $i > 0 ? substr($text, $i - 1, 1) : "";
|
||
if ( length($r) > 1 and $c =~ /[[:upper:]]/
|
||
and ($cn =~ /[[:upper:]]/ or $cp =~ /[[:upper:]]/))
|
||
{
|
||
$ntext .= uc($r);
|
||
} else {
|
||
$ntext .= $r;
|
||
}
|
||
} else {
|
||
$ntext .= $c;
|
||
}
|
||
}
|
||
return $ntext;
|
||
}
|
||
|
||
# Resolve hybrid Cyrillic/Latin text into clean Cyrillic.
|
||
sub hctoc
|
||
{
|
||
my ($text) = @_;
|
||
my $ntext;
|
||
$ntext = resalts($text, '~@', 1, 2);
|
||
return $ntext;
|
||
}
|
||
|
||
# Resolve hybrid Cyrillic/Latin text into clean Latin.
|
||
sub hctol
|
||
{
|
||
my ($text) = @_;
|
||
my $ntext;
|
||
$ntext = ctol($text); # FIXME: Do not convert inside alt directives.
|
||
$ntext = resalts($ntext, '~@', 2, 2);
|
||
return $ntext;
|
||
}
|
||
|
||
# Ijekavian to Ekavian map (Latin script and letter cases derived afterwards).
|
||
my @reflex_spec = (
|
||
['›', {
|
||
'ије' => 'е',
|
||
'је' => 'е',
|
||
}],
|
||
['‹', {
|
||
'иј' => 'еј',
|
||
'иљ' => 'ел',
|
||
'ио' => 'ео',
|
||
'ље' => 'ле',
|
||
'ње' => 'не',
|
||
}],
|
||
['▹', {
|
||
'ије' => 'и',
|
||
'је' => 'и',
|
||
}],
|
||
['◃', {
|
||
'ијел' => 'ео',
|
||
'ијен' => 'ењ',
|
||
'ил' => 'ел',
|
||
'ит' => 'ет',
|
||
'јел' => 'ео',
|
||
'тн' => 'тњ',
|
||
'шње' => 'сне',
|
||
}],
|
||
);
|
||
|
||
# Derive data for dehybridization.
|
||
my @reflex_spec_dehyb;
|
||
for my $refgrp (@reflex_spec) {
|
||
my $tick = $refgrp->[0];
|
||
my $refmap = $refgrp->[1];
|
||
# Derive Latin mappings (must be fully done before different cases).
|
||
for my $ijkfrm (keys %{$refmap}) {
|
||
my $ekvfrm = $refmap->{$ijkfrm};
|
||
$refmap->{ctol($ijkfrm)} = ctol($ekvfrm);
|
||
}
|
||
# Derive mappings with different cases.
|
||
for my $ijkfrm (keys %{$refmap}) {
|
||
my $ekvfrm = $refmap->{$ijkfrm};
|
||
$refmap->{ucfirst($ijkfrm)} = ucfirst($ekvfrm);
|
||
$refmap->{uc($ijkfrm)} = uc($ekvfrm);
|
||
}
|
||
# Compute minimum and maximum reflex lengths.
|
||
my $reflen_min = 0;
|
||
my $reflen_max = 0;
|
||
for my $ijkfrm (keys %{$refmap}) {
|
||
my $reflen = length($ijkfrm);
|
||
$reflen_max = $reflen if $reflen_max < $reflen;
|
||
$reflen_min = $reflen if $reflen_min > $reflen;
|
||
}
|
||
# Derivation for current group done.
|
||
push @reflex_spec_dehyb, [$tick, $refmap, $reflen_min, $reflen_max];
|
||
}
|
||
|
||
|
||
# Resolve hybrid Ijekavian text into clean Ekavian.
|
||
sub hitoe
|
||
{
|
||
my ($text) = @_;
|
||
return hito_w($text, 0);
|
||
}
|
||
|
||
# Resolve hybrid Ijekavian text into clean Ijekavian.
|
||
sub hitoi
|
||
{
|
||
my ($text) = @_;
|
||
return hito_w($text, 1);
|
||
}
|
||
|
||
sub hito_w
|
||
{
|
||
my ($text, $toijek) = @_;
|
||
|
||
for my $refgrp (@reflex_spec_dehyb) {
|
||
$text = hito_w_simple($text, @{$refgrp}, $toijek);
|
||
}
|
||
$text = resalts($text, '~#', (!$toijek? 1 : 2), 2);
|
||
|
||
return $text;
|
||
}
|
||
|
||
sub hito_w_simple
|
||
{
|
||
my ($text, $tick, $refmap, $reflen_min, $reflen_max, $toijek) = @_;
|
||
|
||
my $ntext;
|
||
my $p = 0;
|
||
while (1) {
|
||
my $pp = $p;
|
||
$p = index($text, $tick, $p);
|
||
if ($p < 0) {
|
||
$ntext .= substr($text, $pp);
|
||
last;
|
||
}
|
||
$ntext .= substr($text, $pp, $p - $pp);
|
||
$pp = $p;
|
||
$p += length($tick);
|
||
if ($p >= length($text) or substr($text, $p, 1) !~ /\w/) {
|
||
$ntext .= $tick;
|
||
next;
|
||
}
|
||
|
||
my $reflen = $reflen_min;
|
||
my ($ijkfrm, $ekvfrm);
|
||
while ($reflen <= $reflen_max and !$ekvfrm) {
|
||
$ijkfrm = substr($text, $p, $reflen);
|
||
$ekvfrm = $refmap->{$ijkfrm};
|
||
$reflen += 1;
|
||
}
|
||
|
||
if ($ekvfrm) {
|
||
$ntext .= (!$toijek ? $ekvfrm : $ijkfrm);
|
||
$p += length($ijkfrm);
|
||
} else {
|
||
$ntext .= $tick;
|
||
my $dtext = substr($text, $pp, 20);
|
||
warning("Unknown jat-reflex starting from '$dtext'.");
|
||
}
|
||
}
|
||
return $ntext;
|
||
}
|
||
|
||
|
||
sub main
|
||
{
|
||
@ARGV == 1 or show_usage();
|
||
my $dstarget = shift @ARGV;
|
||
$dstarget =~ /^(ec|el|ic|il)$/ or show_usage();
|
||
|
||
my $resf;
|
||
if ($dstarget eq "ec") {
|
||
$resf = sub { return hitoe(hctoc($_[0])); }
|
||
} elsif ($dstarget eq "el") {
|
||
$resf = sub { return hitoe(hctol($_[0])); }
|
||
} elsif ($dstarget eq "ic") {
|
||
$resf = sub { return hitoi(hctoc($_[0])); }
|
||
} else {
|
||
$resf = sub { return hitoi(hctol($_[0])); }
|
||
}
|
||
|
||
while (<STDIN>) {
|
||
print $resf->($_);
|
||
}
|
||
}
|
||
|
||
main();
|