#!/usr/bin/perl
# pctran2stardict.pl v. 1.0.0 - 2007-12-13 (jose1711 -at- gmail -dot- com)
#
# perl script to convert PC Translator dictionary (LangSoft) to a format
# processable by StarDict (stardict.sf.net)
#
# ! ! ! I M P O R T A N T ! ! !
# To use this software you must be legal owner of the license for the dictionary!
# Pre pouzitie tohto skriptu musite vlastnit licenciu ku slovniku!
# ! ! ! I M P O R T A N T ! ! !
#
# DO *NOT* PUBLISH FILES GENERATED BY THIS SCRIPT ! ! !
#
# Usage:
# do a backup of database using dictionary manager and then run
# zcat *5 >ancs.txt
# after that you are ready to use this script
#
# credits:
#   - Belisarivs (testing, ideas)
#
# History (in Slovak):
# 0.1 - uvodna verzia: fungujuca konverzia anglicko-ceskeho slovniku
# 0.2 - precisteny (rozumnejsie napisany) skript
#       ifo subory su vygenerovane automaticky (uzivatel ich nemusi editovat rucne)
# 0.3 - spolu s prekladom je konvertovany aj typ slova
#       cistejsi kod
# 0.4 - pridany subor CHANGES ;-)
#       pridana podpora pre slovensku a polsku abecedu
#       opravena chyba s nezobrazovanim niektorych prekladov
# 0.5 - podpora pre latincinu
# 0.6 - command-line parameter: kod jazyka (odpada nutnost upravy skriptu)
#       opravena chyba s nezobrazovanim slov obsahujucich ampersand (&)
#       mensie upravy
# 0.6.1 - oprava pismena x v polskej abecede 
# 0.6.2 - oprava mazania anglickej slovnej zasoby pred konverziou
# 0.7 "120 minutes delayed train"
#       - pridany nemecky slovnik 
# 0.7.1 - pridane nemecke cleny (jun 2006)
# 1.0.0 - prepisane do perlu (december 2007)
#       - citatelnejsi a spravovatelnejsi kod
#       - nezavislost na nastaveni locales
#       - lahke pridavanie slovnikov v dalsich verziach
#       - multiplatformnost (netestovane)
#       - obsah suboru CHANGES je sucastou skriptu
#       - opravenych viacero malych i zavaznejsich chyb (N->H v nemeckom slovniku)
#       - nutnost potvrdenia legalneho vlastnictva PC Translatoru pred konverziou
#       - pridana podpora pre taliansky, francuzsky, portugalsky, spanielsky, holandsky,
#         madarsky a svedsky slovnik
#       - jednoduchy ukazatel priebehu konverzie
# #########################################################################################

use Encode;
use FileHandle;

# open(ABC,"<abc");
# binmode ABC,":utf8";
# while (<ABC>) {
#  	print unpack("U",$_);
# }
# exit;

# Open the uncompressed PC Translator backup. The name is fixed: the usage
# notes at the top of this file tell the user to create ancs.txt with zcat.
# Three-argument open keeps the mode apart from the file name, and $! is
# appended so that e.g. a permission problem is not reported as "not found".
open(INPUT, "<", "ancs.txt") or die ("File ancs.txt not found: $!");

# Size in bytes; the conversion loop uses it for the progress percentage.
$input_file_size = -s INPUT;

# A size of zero means there is nothing to convert, so stop right away.
unless ($input_file_size) {
	die ("File ancs.txt has zero (0) size!");
}

# Empty table: decode_word("type", ...) finds no mapping in it, so word-type
# strings are copied through character by character without any change.
%type_dict=();

# Character table selected by the language code "pr"; it is also copied to
# %sw_dict and %du_dict once all tables are defined.
# Keys are the numeric character values found in ancs.txt, values are the
# Unicode code points decode_word() emits for them. Characters without an
# entry are passed through unchanged.
# Lower-case letters start at 139 (139 -> a), upper-case letters are in the
# 72..129 range.
# NOTE(review): 185 sits between r (184) and s (187) yet maps to 224
# (a with grave); %lt_dict maps the same key to 341 (r with acute) --
# TODO confirm against real dictionary data.
%pr_dict=(
139=>97,
140=>225,
141=>228,
142=>226,
143=>229,
144=>227,
145=>261,
146=>98,
147=>99,
149=>263,
150=>231,
151=>100,
154=>101,
156=>233,
158=>234,
159=>102,
160=>103,
161=>104,
163=>105,
164=>237,
166=>106,
167=>107,
168=>108,
171=>322,
172=>109,
173=>110,
175=>324,
176=>111,
177=>243,
178=>244,
179=>246,
180=>245,
182=>112,
183=>113,
184=>114,
185=>224,
187=>115,
189=>347,
191=>116,
193=>117,
194=>250,
196=>252,
198=>118,
199=>119,
200=>120,
201=>121,
203=>122,
205=>378,
206=>380,
72=>66,
73=>67,
77=>68,
80=>69,
85=>70,
86=>71,
87=>72,
89=>73,
92=>74,
93=>75,
94=>76,
97=>321,
98=>77,
99=>78,
102=>79,
108=>80,
110=>82,
113=>83,
117=>84,
119=>85,
124=>86,
125=>87,
126=>88,
129=>90
);

# Character table selected by the language code "it"; it is also copied to
# %es_dict and %fr_dict once all tables are defined.
# Keys are the numeric character values found in ancs.txt, values are the
# Unicode code points decode_word() emits for them. Characters without an
# entry are passed through unchanged.
# Lower-case letters start at 127 (127 -> a), upper-case letters are in the
# 65..125 range.
# NOTE(review): three values occur twice -- 128 and 130 both give 224,
# 160 and 165 both give 242, 87 and 91 both give 74. The neighbouring groups
# follow a base/acute/circumflex/grave/diaeresis order (see 139..143 and
# 148..152), which suggests 128 => 225 and 87 => 205 may have been intended
# -- TODO confirm against real dictionary data before changing anything.
%it_dict=(
39=>39,
127=>97,
128=>224,
129=>226,
130=>224,
131=>228,
133=>98,
134=>99,
136=>231,
137=>100,
139=>101,
140=>233,
141=>234,
142=>232,
143=>235,
145=>102,
146=>103,
147=>104,
148=>105,
149=>237,
150=>238,
151=>236,
152=>239,
153=>106,
154=>107,
155=>108,
158=>109,
159=>110,
160=>242,
161=>241,
162=>111,
163=>243,
164=>244,
165=>242,
166=>246,
167=>112,
168=>113,
169=>114,
172=>115,
175=>116,
177=>117,
178=>250,
179=>251,
180=>249,
181=>252,
183=>118,
184=>119,
185=>120,
186=>121,
187=>253,
189=>122,
195=>161,
196=>191,
65=>65,
67=>194,
71=>66,
72=>67,
75=>68,
77=>69,
78=>201,
83=>70,
84=>71,
85=>72,
86=>73,
87=>74,
91=>74,
92=>75,
93=>76,
96=>77,
97=>78,
100=>79,
105=>80,
106=>81,
107=>82,
110=>83,
112=>84,
114=>85,
118=>220,
120=>86,
121=>87,
122=>88,
123=>89,
125=>90
);

# Character table selected by the language code "de".
# Keys are the numeric character values found in ancs.txt, values are the
# Unicode code points decode_word() emits for them. Characters without an
# entry are passed through unchanged.
# Lower-case letters start at 119 (119 -> a), upper-case letters are in the
# 66..111 range. The last three entries produce non-letters: "{" (123),
# a line feed (10) and "~" (126).
%de_dict=(119=>97,
120=>228,
121=>225,
122=>98,
123=>99,
124=>231,
125=>100,
127=>101,
128=>233,
130=>102,
131=>103,
132=>104,
133=>105,
134=>237,
135=>106,
136=>107,
137=>108,
140=>109,
141=>110,
142=>328,
143=>111,
144=>246,
145=>243,
147=>112,
148=>113,
149=>114,
151=>345,
152=>115,
153=>223,
154=>353,
155=>116,
157=>117,
158=>252,
161=>118,
162=>119,
163=>120,
164=>121,
165=>253,
166=>122,
167=>382,
232=>269,
66=>196,
68=>66,
69=>67,
71=>68,
73=>69,
76=>70,
77=>71,
78=>72,
79=>73,
81=>74,
82=>75,
83=>76,
86=>77,
87=>78,
89=>79,
90=>214,
93=>80,
94=>81,
95=>82,
98=>83,
100=>84,
102=>85,
103=>220,
104=>218,
106=>86,
107=>87,
108=>88,
109=>89,
111=>90,
194=>123,
168=>10,
171=>126,
);

# Character table selected by the language code "pl".
# Keys are the numeric character values found in ancs.txt, values are the
# Unicode code points decode_word() emits for them. Characters without an
# entry are passed through unchanged.
# Lower-case letters start at 139 (139 -> a), upper-case letters are in the
# 72..129 range.
%pl_dict=(
139=>97,
145=>261,
146=>98,
147=>99,
149=>263,
151=>100,
154=>101,
158=>281,
159=>102,
160=>103,
161=>104,
163=>105,
166=>106,
167=>107,
168=>108,
171=>322,
172=>109,
173=>110,
175=>324,
176=>111,
177=>243,
182=>112,
184=>114,
187=>115,
189=>347,
191=>116,
193=>117,
198=>118,
199=>119,
200=>120,
201=>121,
203=>122,
205=>378,
206=>380,
72=>66,
73=>67,
77=>68,
80=>69,
85=>70,
86=>71,
87=>72,
89=>73,
92=>74,
93=>75,
94=>76,
97=>321,
98=>77,
99=>78,
102=>79,
108=>80,
110=>82,
113=>83,
117=>84,
119=>85,
124=>86,
125=>87,
129=>90
);

# Character table selected by the language code "lt" (presumably Latin, going
# by the history notes at the top of the file -- TODO confirm).
# Keys are the numeric character values found in ancs.txt, values are the
# Unicode code points decode_word() emits for them. Characters without an
# entry are passed through unchanged.
# Lower-case letters start at 139 (139 -> a), upper-case letters are in the
# 72..129 range.
%lt_dict=(
139=>97,
140=>225,
146=>98,
147=>99,
151=>100,
154=>101,
156=>233,
159=>102,
160=>103,
161=>104,
163=>105,
164=>237,
166=>106,
167=>107,
168=>108,
172=>109,
173=>110,
176=>111,
177=>243,
179=>246,
182=>112,
183=>113,
184=>114,
185=>341,
187=>115,
191=>116,
193=>117,
194=>250,
196=>252,
198=>118,
199=>119,
200=>120,
201=>121,
202=>253,
203=>122,
72=>66,
73=>67,
80=>69,
86=>71,
87=>72,
89=>73,
93=>75,
94=>76,
98=>77,
102=>79,
113=>83,
119=>85,
124=>86,
125=>87,
126=>88,
129=>90
);

# Character table selected by the language code "en". Only a handful of
# accented letters are remapped; every other character is passed through
# unchanged by decode_word().
# In order: rcaron, ecaron, ccaron, scaron, uring, nacute, udiaeresis
%en_dict=(
248=>345,
236=>283,
232=>269,
154=>353,
249=>367,
241=>324,
129=>252
);

# Character table for the Czech side of every dictionary: the conversion
# loop always decodes the right-hand word with "cs". The table is also copied
# to %sk_dict once all tables are defined.
# Keys are the numeric character values found in ancs.txt, values are the
# Unicode code points decode_word() emits for them. Characters without an
# entry are passed through unchanged.
# Lower-case letters start at 120 (120 -> a), upper-case letters are in the
# 65..113 range. There is deliberately no entry for 134 and 79: decode_word()
# expands those two values to the digraphs "ch" and "Ch" itself.
%cs_dict=(
120=>97,
121=>225,
122=>228,
123=>98,
124=>99,
125=>269,
126=>100,
127=>271,
128=>101,
129=>233,
130=>283,
131=>102,
132=>103,
133=>104,
135=>105,
136=>237,
137=>106,
138=>107,
139=>108,
140=>314,
141=>318,
142=>109,
143=>110,
144=>328,
145=>111,
146=>243,
147=>244,
149=>112,
150=>113,
151=>114,
152=>341,
153=>345,
154=>115,
155=>353,
156=>116,
157=>357,
158=>117,
159=>250,
160=>367,
161=>252,
162=>118,
163=>119,
164=>120,
165=>121,
166=>253,
167=>122,
168=>382,
172=>126,
230=>956,
65=>65,
66=>193,
68=>66,
69=>67,
70=>268,
71=>68,
72=>270,
73=>69,
74=>201,
75=>282,
76=>70,
77=>71,
78=>72,
80=>73,
81=>205,
82=>74,
83=>75,
84=>76,
87=>77,
88=>78,
90=>79,
94=>80,
95=>81,
96=>82,
98=>344,
99=>83,
100=>352,
101=>84,
103=>85,
104=>218,
107=>86,
108=>87,
109=>88,
110=>89,
111=>221,
112=>90,
113=>381
);

# Character table selected by the language code "hu".
# Keys are the numeric character values found in ancs.txt, values are the
# Unicode code points decode_word() emits for them. Characters without an
# entry are passed through unchanged.
# The entries equal those of %pr_dict except that 180 gives 337
# (o with double acute) and 197 => 369 (u with double acute) is added.
# NOTE(review): 185 sits between r (184) and s (187) yet maps to 224
# (a with grave); %lt_dict maps the same key to 341 (r with acute) --
# TODO confirm against real dictionary data.
%hu_dict=(
139=>97,
140=>225,
141=>228,
142=>226,
143=>229,
144=>227,
145=>261,
146=>98,
147=>99,
149=>263,
150=>231,
151=>100,
154=>101,
156=>233,
158=>234,
159=>102,
160=>103,
161=>104,
163=>105,
164=>237,
166=>106,
167=>107,
168=>108,
171=>322,
172=>109,
173=>110,
175=>324,
176=>111,
177=>243,
178=>244,
179=>246,
180=>337,
182=>112,
183=>113,
184=>114,
185=>224,
187=>115,
189=>347,
191=>116,
193=>117,
194=>250,
196=>252,
197=>369,
198=>118,
199=>119,
200=>120,
201=>121,
203=>122,
205=>378,
206=>380,
72=>66,
73=>67,
77=>68,
80=>69,
85=>70,
86=>71,
87=>72,
89=>73,
92=>74,
93=>75,
94=>76,
97=>321,
98=>77,
99=>78,
102=>79,
108=>80,
110=>82,
113=>83,
117=>84,
119=>85,
124=>86,
125=>87,
126=>88,
129=>90
);

# Language codes without a table of their own receive a copy of an existing
# one: "sk" uses the "cs" table, "es" and "fr" use the "it" table, "du" and
# "sw" use the "pr" table.
%sk_dict = %cs_dict;

%es_dict = %it_dict;
%fr_dict = %it_dict;

%du_dict = %pr_dict;
%sw_dict = %pr_dict;

# Exactly one argument is accepted: the code of the language whose character
# table is used for the left-hand words (the right-hand side is always "cs").
# \A and \z are used instead of ^ and $ so that an argument with a trailing
# newline cannot slip through into the output file names.
unless (@ARGV == 1 && $ARGV[0] =~ /\A(de|du|en|es|fr|hu|it|lt|pl|pr|sk|sw)\z/) {
	print "The correct syntax is: $0 de|du|en|es|fr|hu|it|lt|pl|pr|sk|sw\n";
	# Non-zero status: a usage error is not a successful run.
	exit 1;
}

$language = $ARGV[0];

# Refuse to overwrite the results of an earlier run.
foreach my $existing_output ("pc_translator-${language}-cs", "pc_translator-cs-${language}") {
	if (-f $existing_output) {
		print "Output file ($existing_output) already exist - please fix this\n";
		exit 1;
	}
}

# The conversion only starts after the literal answer "yes".
print "Are you the LEGAL owner of PC Translator? Answer yes or no\n";
$answer = <STDIN>;
# End-of-file on STDIN (e.g. input redirected from an empty file) counts as "no".
$answer = "" unless defined $answer;
chomp ($answer);

unless ($answer eq "yes") {die ("Exiting")};

print "Conversion ($language) in progress";

# Both tab-separated output files are created up front. A failed open used to
# go unnoticed and every later print was silently lost; now it is fatal.
open (OUTPUT1, ">", "pc_translator-${language}-cs")
	or die ("Cannot create pc_translator-${language}-cs: $!");
open (OUTPUT2, ">", "pc_translator-cs-${language}")
	or die ("Cannot create pc_translator-cs-${language}: $!");

# decode_word($dictionary, $text)
#
# Translate a string from the PC Translator character encoding to Unicode.
#
#   $dictionary - language code; selects the global table %<code>_dict
#                 (for example "cs" selects %cs_dict)
#   $text       - raw string as read from ancs.txt
#
# Returns the decoded string ("" for empty or undefined input). Every
# character is looked up by its numeric value; a character without a table
# entry is copied unchanged. With the "cs" table the values 134 and 79 are
# expanded to the digraphs "ch" and "Ch".
sub decode_word{
	my ($dictionary, $text) = @_;
	$text = "" unless defined $text;

	# The table is chosen by name at run time, hence the symbolic reference.
	no strict 'refs';
	my $table = \%{"${dictionary}_dict"};

	my $word_out = "";
	foreach my $letter (split(//, $text)) {
		# transform character to number
		my $letter_value = ord($letter);

		if ($table->{$letter_value}) {
			# use dictionary value
			$word_out .= chr($table->{$letter_value});
		}
		elsif ($dictionary eq "cs" && ($letter_value == 134 || $letter_value == 79)) {
			# 134 is ch, 79 is Ch. The comparison must be exact: the former
			# pattern match /134|79/ also fired for any value that merely
			# contains those digits (179, 279, ...) and turned it into "Ch".
			$word_out .= ($letter_value == 134) ? "ch" : "Ch";
		}
		else {
			# not present in the dictionary - use the original value
			$word_out .= $letter;
		}
	}
	return $word_out;
}

# return_code([$text])
#
# Debugging helper: list the numeric character values of a raw string as a
# comma-terminated sequence such as "97,98,". Scanning stops in front of the
# first character with value 1, one of the two field separators the
# conversion loop splits on.
#
#   $text - string to dump; when omitted, the current line in $_ is used
#
# Returns the list as a string ("" for empty input).
sub return_code{
	# The argument used to be ignored and $_ was always read; honour it now
	# and keep $_ only as the fallback for calls without arguments.
	my $text = @_ ? $_[0] : $_;
	$text = "" unless defined $text;

	my $codes = "";
	foreach my $letter (split(//, $text)) {
		# transform character to number
		my $letter_value = ord($letter);
		last if $letter_value == 1;
		$codes .= $letter_value . ",";
	}
	return $codes;
}

# Decoded text can contain characters above 255, so every handle written to
# below gets the :utf8 output layer.
binmode STDOUT,":utf8";
binmode OUTPUT1,":utf8";
binmode OUTPUT2,":utf8";

# Unbuffered STDOUT so that the progress indicator shows up immediately.
# Setting this once is enough; it used to be repeated for every input line.
STDOUT->autoflush(1);

my $line_number        = 0;   # lines read so far
my $bytes_since_report = 0;   # input consumed since the last progress mark
my $previous_position  = 0;   # file offset after the previously converted line
my $decoded_left_word;        # kept across iterations, see below

while (my $line = <INPUT>) {
	my $position = tell INPUT;

	# do not process the first two lines
	$line_number++;
	next if $line_number < 3;

	# Fields are separated by the control characters 0x01 and 0x02. The first
	# four are used: left-hand word, right-hand (cs) word, word type and
	# article (the article is only printed for the german/french dict).
	# NOTE(review): the line is not chomped, so this relies on the newline
	# ending up in a later, unused field -- TODO confirm against the format.
	my ($left_word, $right_word, $type, $article) = split /\x01|\x02/, $line;
	$type    = "" unless defined $type;
	$article = "" unless defined $article;
	$type =~ s/&/X/g;

	# An empty first field means "same left-hand word as the previous line",
	# so the previously decoded value is kept. The explicit comparison makes
	# sure a word consisting of "0" is not mistaken for an empty field.
	if (defined $left_word && $left_word ne "") {
		$decoded_left_word = decode_word($language, $left_word);
	}

	# The ampersand is escaped on every line; it used to be skipped for lines
	# that reuse the previous left-hand word.
	my $decoded_right_word = decode_word("cs", $right_word);
	$decoded_right_word =~ s/&/&amp;/g;

	# Optional annotations appended to both output lines.
	my $annotation = "";

	if ($language eq "de") {
		# The pattern used to be the string "/r|e|s/", whose slashes became
		# part of the regex, so the articles "r" and "s" never matched.
		if ($article =~ /r|e|s/) {
			$article =~ s/ *$//;
			$annotation .= " <small>(" . $article . ")</small>";
		}
	}

	if ($language eq "fr") {
		if ($article =~ /f|m|n/) {
			$annotation .= " <small>(" . $article . ")</small>";
		}
	}

	if ($type ne "" && $type ne " " && $type ne "/") {
		$annotation .= " <small>(" . decode_word("type", $type) . ")</small>";
	}

	# One tab-separated line per direction.
	print OUTPUT1 $decoded_left_word . "\t" . $decoded_right_word . $annotation . "\n";
	print OUTPUT2 $decoded_right_word . "\t" . $decoded_left_word . $annotation . "\n";

	# here is our progress indicator ;-)
	# A mark is printed whenever more than a tenth of the file has been
	# consumed since the previous mark.
	$bytes_since_report += $position - $previous_position;
	if ($bytes_since_report > $input_file_size/10) {
		print STDOUT "...";
		printf ("%d %%", $position/$input_file_size*100);
		$bytes_since_report = 0;
	}

	$previous_position = $position;
}

close INPUT;

# Buffered write errors (for example a full disk) only surface when the handle
# is closed, so closing the output files is checked.
close OUTPUT1 or die ("Cannot finish writing pc_translator-${language}-cs: $!");
close OUTPUT2 or die ("Cannot finish writing pc_translator-cs-${language}: $!");

print "...Done\n";
print "Running tabfile utility to convert to StarDict format files...\n";

# Convert both files. For each one the known locations of tabfile are tried
# in turn; system() returns 0 on success, and the list form passes the file
# name straight to the program without going through a shell.
foreach my $tab_file ("pc_translator-${language}-cs", "pc_translator-cs-${language}") {
	my $converted = 0;
	foreach my $tabfile_program ("/usr/lib/stardict-tools/tabfile", "tabfile", "./tabfile") {
		if (system($tabfile_program, $tab_file) == 0) {
			$converted = 1;
			last;
		}
	}
	unless ($converted) {
		die ("Tabfile utility could not be found in \$PATH or current directory. Install it (stardict-tools) or edit \$PATH.");
	}
}

print "Change line sametypesequence=m in .ifo files to sametypesequence=g and move files with extension .ifo, .dict and .idx to /usr/share/stardict/dic/ (system-wide dictionary) or  ~/.stardict/dic/ (user-specific) and restart stardict.\n";
