#!/bin/sh
# mkdict-cedict: build dictd format dictionaries from CC-CEDICT
# Copyright (C) 2024 Runciter <runciter@whispers-vpn.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

# Letters accepted between a tone-carrying vowel and its tone digit when a
# substitution row is marked LAST=yes (used by the tone-rule loop below).
CONSONS='bcdfghjklmnpqrstwxyz'

# Origin URL and short description; both are handed to dictfmt
# (options -u and -s) for every generated dictionary.
URL='https://www.mdbg.net/chinese/dictionary?page=cc-cedict'
SHORT='CC-CEDICT: community maintained Chinese-English dictionary'

# Names of the two awk arrays the generated gawk program splits each
# headword into: slot 0 receives the characters, slot 1 the pinyin forms.
# NOTE: indexed arrays are a bash feature, so despite the #!/bin/sh line
# this script needs to be run by bash.
SPLITARRAY=([0]=zi [1]=piny)
# Scratch index counter, left on the last slot that was filled.
I=$(( ${#SPLITARRAY[@]} - 1 ))

# One dictionary ("flavor") is generated per column below.
#
#   FLAVOR            identifier used to name the awk variables and arrays
#   FLAVORINFO        word put in front of $SHORT in the dictfmt -s text
#   FLAVORAPPEND      suffix added to the "cedict" output base name
#   FLAVORSPLITTER    index into SPLITARRAY: which awk array holds the headword
#   FLAVORSPLITINDEX  element of that awk array used as the headword
#
#                 simplified  traditional  accented  numbered  bare
FLAVOR=(          smpl        trad         acc       numb      bare )
FLAVORINFO=(      simplified  traditional  pinyin    numbered  bare )
FLAVORAPPEND=(    -smpl       -trad        -pinyin   -numb     -bare )
FLAVORSPLITTER=(  0           0            1         1         1    )
FLAVORSPLITINDEX=(2           1            1         2         3    )
# Scratch index counter, left on the last flavor slot that was filled.
I=$(( ${#FLAVOR[@]} - 1 ))

# Start of the sed program kept in PINY, one rule per argument:
#   1-3. loop (label "v") rewriting every "u:" inside a bracketed pinyin
#        field to "v", one occurrence per pass;
#   4.   copy the first bracketed field twice more, giving
#        "[p] {p} (p) " so that later rules can edit each copy separately.
PINY=$(printf '%s\n' \
	': v' \
	's/^\(.\+\)\[\([[:alnum:][:space:]·,:]*\)u:\([[:alnum:][:space:]·,:]*\)\]/\1[\2v\3]/' \
	't v' \
	's/^\([^[]\+\)\[\([[:alnum:][:space:]·,]*\)\]/\1[\2] {\2} (\2) /')

# Tone-mark table, one row per (letters, tone) pair:
#
#   SUBFROM  letters of a numbered-pinyin syllable that receive the mark
#   SUBTO    what they become in the accented spelling
#   TONE     tone digit the row applies to
#   LAST     yes: only letters from $CONSONS may stand between SUBFROM and
#                 the digit
#            no:  any letters may stand there
#
# The generated rules run in table order and each one consumes the tone
# digit, so the first row that matches a syllable wins.  That is why the
# rows are ordered a, e, ou before i, o, u, v.  Two groups of rows handle
# spellings the lower-case single-letter rows get wrong:
#   * A, E, Ou, O: capitalised syllables such as "Ao4" or "E2"; without
#     them "Ao4" falls through to the "o" rows and becomes "Aò".
#   * ve: "lve4"/"nve4" (from "lu:e4"); the mark goes on the e, but the
#     v must still be turned into ü, which the plain "e" rows do not do.
I=0
while read -r FROM TO DIGIT FINAL
do
	SUBFROM[$I]=$FROM SUBTO[$I]=$TO TONE[$I]=$DIGIT LAST[$I]=$FINAL
	I=$((I+1))
done <<'EOF'
a	ā	1	no
a	á	2	no
a	ǎ	3	no
a	à	4	no
a	a	5	no
A	Ā	1	no
A	Á	2	no
A	Ǎ	3	no
A	À	4	no
A	A	5	no
ve	üē	1	no
ve	üé	2	no
ve	üě	3	no
ve	üè	4	no
ve	üe	5	no
e	ē	1	no
e	é	2	no
e	ě	3	no
e	è	4	no
e	e	5	no
E	Ē	1	no
E	É	2	no
E	Ě	3	no
E	È	4	no
E	E	5	no
ou	ōu	1	no
ou	óu	2	no
ou	ǒu	3	no
ou	òu	4	no
ou	ou	5	no
Ou	Ōu	1	no
Ou	Óu	2	no
Ou	Ǒu	3	no
Ou	Òu	4	no
Ou	Ou	5	no
i	ī	1	yes
i	í	2	yes
i	ǐ	3	yes
i	ì	4	yes
i	i	5	yes
o	ō	1	yes
o	ó	2	yes
o	ǒ	3	yes
o	ò	4	yes
o	o	5	yes
O	Ō	1	yes
O	Ó	2	yes
O	Ǒ	3	yes
O	Ò	4	yes
O	O	5	yes
u	ū	1	yes
u	ú	2	yes
u	ǔ	3	yes
u	ù	4	yes
u	u	5	yes
v	ǖ	1	yes
v	ǘ	2	yes
v	ǚ	3	yes
v	ǜ	4	yes
v	ü	5	yes
r	'r	5	yes
EOF

# append_tone_rules
#
# Append to PINY one sed loop per row of the SUBFROM/SUBTO/TONE/LAST
# tables.  Each loop is labelled with the row's SUBTO and repeats two
# substitutions until neither matches:
#   1. in the headword, replace SUBFROM..TONE by SUBTO.. inside "[...]"
#      and, in the same step, drop the tone digit of the corresponding
#      syllable inside "(...)"; the "{...}" copy is left untouched;
#   2. do the "[...]" replacement for a bracketed pinyin field that
#      follows a "/" (i.e. one appearing inside the definitions).
#
# Globals read:    SUBFROM SUBTO TONE LAST CONSONS
# Globals written: PINY
#
# NOTE(review): for multi-syllable fields the patterns have to skip over
# letters that are already accented, which [[:alnum:]] presumably only
# does in a UTF-8 locale -- confirm before running under LC_ALL=C.
append_tone_rules() {
	local ind from to tone tail
	# Characters that make up a pinyin field besides the syllable at hand.
	local cls='[[:alnum:][:space:]·,]*'

	for ind in "${!TONE[@]}"
	do
		from=${SUBFROM[$ind]}
		to=${SUBTO[$ind]}
		tone=${TONE[$ind]}
		# Letters allowed between SUBFROM and the tone digit.
		if [ "${LAST[$ind]}" = yes ]
		then
			tail=$CONSONS
		else
			tail='[:alpha:]'
		fi
		PINY="${PINY}
: ${to}
s/^\([^[]\+\[${cls}[[:alpha:]]*\)\(${from}\)\([${tail}]*\)${tone}\(${cls}\][[:space:]]{${cls}}[[:space:]](${cls}[[:alpha:]]*${from}[${tail}]*\)${tone}\(${cls})\)/\1${to}\3\4\5/
s/^\([^[]\+\[.*\].*{.*}.*(.*).*\/.*\[${cls}[[:alpha:]]*\)\(${from}\)\([${tail}]*\)${tone}\(${cls}\].*$\)/\1${to}\3\4/
t ${to}"
	done
}

append_tone_rules

# Build the gawk program text in AWK.
#
# The program reads the output of the sed stage with "/" as field
# separator.  For every flavor listed in the FLAVOR* tables it keeps one
# awk array of entries keyed by headword and one dictfmt command line;
# at the end each array is written to its dictfmt process.  Text in
# single quotes below is literal awk code; the per-flavor statements are
# generated from the tables.
AWK='function adddefs(entries, entries_ind, new, equivs, defs,    i)
{
	entries[new] = entries[new] "\n\n" equivs "\n"
	# defs is indexed 1, 2, 3, ... by the caller; walk it in that order.
	# A plain for-in loop visits elements in an unspecified order, which
	# would number the definitions out of sequence.
	for (i = 1; i in defs; i++)
	{
		entries_ind[new]++
		entries[new] = entries[new] "\n  " entries_ind[new] ". " defs[i]
	}
}
function arrayprint(array, out,    word)
{
	for (word in array)
	{
		print "_____\n\n" word array[word] |& out
	}
}

BEGIN {
	FS="/"'

# One dictfmt command line per flavor, stored in the awk variable
# out<flavor>.  The URL is double-quoted inside the command so that the
# "?" it contains is not seen as a glob character by the shell gawk uses
# to start dictfmt.
for IND in "${!FLAVOR[@]}"
do
	AWK=$AWK'
	out'${FLAVOR[$IND]}' = "dictfmt -c5 -u \"'"$URL"'\" -s \"'"${FLAVORINFO[$IND]} $SHORT"'\" --allchars --utf8 cedict'${FLAVORAPPEND[$IND]}'"'
done
AWK=$AWK'
}

/^# / {'
# Lines starting with "# " are forwarded to every dictfmt process without
# the two-character prefix.  The length argument also leaves out the last
# character of the line.
# NOTE(review): presumably that last character is a carriage return --
# confirm against the CC-CEDICT download.
for IND in "${!FLAVOR[@]}"
do
	AWK=$AWK'
	print substr($0, 3, length() -3) |& out'${FLAVOR[$IND]}
done
AWK=$AWK'
}

/^#! / {'
# Same for "#! " lines, with a three-character prefix.
for IND in "${!FLAVOR[@]}"
do
	AWK=$AWK'
	print substr($0, 4, length() -4) |& out'${FLAVOR[$IND]}
done
AWK=$AWK'
}

/^[^#]/ {
	split($1, defined, /\[/)
	split(defined[1], '${SPLITARRAY[0]}', / /)
	split (defined[2], '${SPLITARRAY[1]}', /\] \{|\} \(|\)/)'

# new<flavor> is the headword of the current line for that flavor.  SEQ
# collects the awk expression joining all of them with " | "; it becomes
# the "equivs" line shown at the top of each entry.
SEQ=
SEP=
for IND in "${!FLAVOR[@]}"
do
	AWK=$AWK'
	new'${FLAVOR[$IND]}' = '${SPLITARRAY[${FLAVORSPLITTER[$IND]}]}'['${FLAVORSPLITINDEX[$IND]}']'
	SEQ=$SEQ$SEP'new'${FLAVOR[$IND]}
	SEP=' " | " '
done
AWK=$AWK'
	equivs = '"$SEQ"'
	delete defs
	for (i = 2 ; i < NF ; i++)
	{
		defs[i-1] = $i
	}'

# Headwords are stripped of white space and commas only after equivs has
# been built, so equivs keeps the original spelling.
# NOTE(review): the gsub pattern lists "," twice; one of them may have
# been meant to be a different character -- confirm.
for IND in "${!FLAVOR[@]}"
do
	AWK=$AWK'
	gsub(/[[:space:]]|,|,/, "", new'${FLAVOR[$IND]}')
	adddefs('${FLAVOR[$IND]}', '${FLAVOR[$IND]}'_ind, new'${FLAVOR[$IND]}', equivs, defs)'
done
AWK=$AWK'
}

END {'
for IND in "${!FLAVOR[@]}"
do
	AWK=$AWK'
	arrayprint('${FLAVOR[$IND]}', out'${FLAVOR[$IND]}')
	close(out'${FLAVOR[$IND]}')'
done
AWK=$AWK'
}'

# Usage: mkdict-cedict [CEDICT.gz]
#
# Decompress the gzip-compressed CC-CEDICT file named by the first
# argument, annotate the pinyin with the sed program in PINY and let the
# gawk program in AWK run dictfmt for every flavor.
# "$1" is quoted so that paths containing spaces or glob characters reach
# cat as a single word; cat is kept so that the compressed data is read
# from standard input when no file is given.
cat -- ${1:+"$1"} | gunzip - | sed "$PINY" | gawk -- "$AWK"