diff options
author | Runciter | 2024-10-24 23:43:52 +0800 |
---|---|---|
committer | Runciter | 2024-10-24 23:43:52 +0800 |
commit | 1bb276d2340eff428ff397dbd493646d880388c4 (patch) | |
tree | 2969506db461ce4cff264b1f13437004a08cc2e0 /mkdict-cedict | |
download | cc-cedict-02ee4b2ba70c591810aecd66ff48634ed324719a.tar.gz |
A AUTHORS
A COPYING
A ChangeLog
A INSTALL
A Makefile.am
A Makefile.in
A NEWS
A README
A aclocal.m4
A configure
A configure.ac
A install-sh
A missing
A mkdict-cedict
A raw/cedict_1_0_ts_utf-8_mdbg.txt.gz
Diffstat (limited to 'mkdict-cedict')
-rwxr-xr-x | mkdict-cedict | 262 |
1 files changed, 262 insertions, 0 deletions
diff --git a/mkdict-cedict b/mkdict-cedict
new file mode 100755
index 0000000..f4201ec
--- /dev/null
+++ b/mkdict-cedict
@@ -0,0 +1,262 @@
+#!/usr/bin/env bash
+# mkdict-cedict: build dictd format dictionaries from CC-CEDICT
+# Copyright (C) 2024 Runciter <runciter@whispers-vpn.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+# Usage: mkdict-cedict [CEDICT.txt.gz]
+#
+# Decompresses the file named by the first argument (standard input when
+# no argument is given), rewrites the pinyin of every line with sed, and
+# hands the result to a gawk program that pipes one dictionary per
+# "flavor" into dictfmt.  Relies on bash arrays, on the \+ regex
+# operator of GNU sed, and on the |& coprocess operator of gawk.
+
+# Letters that may stand between a substituted vowel and its tone digit
+# in the tone-table rows flagged LAST=yes.
+CONSONS=bcdfghjklmnpqrstwxyz
+
+# Passed to dictfmt as its -u and -s arguments.
+URL=https://www.mdbg.net/chinese/dictionary?page=cc-cedict
+SHORT="CC-CEDICT: community maintained Chinese-English dictionary"
+
+# Names of the awk arrays filled from the part of a line before the first
+# "/": SPLITARRAY[0] gets the space-separated words in front of "[",
+# SPLITARRAY[1] the pieces of the "[..] {..} (..)" group built by sed.
+I=0
+SPLITARRAY[$I]=zi
+I=$((I+1))
+SPLITARRAY[$I]=piny
+
+# One dictionary is generated per flavor:
+#   FLAVOR            suffix used for the awk variable and array names
+#   FLAVORINFO        first word of the dictfmt -s description
+#   FLAVORAPPEND      suffix of the dictfmt database name (cedict + suffix)
+#   FLAVORSPLITTER    index into SPLITARRAY selecting the awk array
+#   FLAVORSPLITINDEX  element of that awk array used as the headword
+I=0
+FLAVOR[$I]=smpl
+FLAVORINFO[$I]="simplified"
+FLAVORAPPEND[$I]=-smpl
+FLAVORSPLITTER[$I]=0
+FLAVORSPLITINDEX[$I]=2
+I=$((I+1))
+FLAVOR[$I]=trad
+FLAVORINFO[$I]="traditional"
+FLAVORAPPEND[$I]=-trad
+FLAVORSPLITTER[$I]=0
+FLAVORSPLITINDEX[$I]=1
+I=$((I+1))
+FLAVOR[$I]=acc
+FLAVORINFO[$I]=pinyin
+FLAVORAPPEND[$I]=-pinyin
+FLAVORSPLITTER[$I]=1
+FLAVORSPLITINDEX[$I]=1
+I=$((I+1))
+FLAVOR[$I]=numb
+FLAVORINFO[$I]="numbered"
+FLAVORAPPEND[$I]=-numb
+FLAVORSPLITTER[$I]=1
+FLAVORSPLITINDEX[$I]=2
+I=$((I+1))
+FLAVOR[$I]=bare
+FLAVORINFO[$I]="bare"
+FLAVORAPPEND[$I]=-bare
+FLAVORSPLITTER[$I]=1
+FLAVORSPLITINDEX[$I]=3
+
+# sed script, step 1: inside the first "[...]" of a line, replace "u:" by
+# "v", looping through label "v" until no "u:" is left.
+PINY=": v
+s/^\(.\+\)\[\([[:alnum:][:space:]·,:]*\)u:\([[:alnum:][:space:]·,:]*\)\]/\1[\2v\3]/
+t v"
+
+# Step 2: repeat the bracketed text twice, giving "[x] {x} (x) ".
+PINY="${PINY}
+s/^\([^[]\+\)\[\([[:alnum:][:space:]·,]*\)\]/\1[\2] {\2} (\2) /"
+
+# Tone table.  Each row names the letters to replace (SUBFROM), their
+# replacement (SUBTO), the tone digit that selects the row (TONE), and
+# whether only CONSONS (LAST=yes) or any letter (LAST=no) may stand
+# between those letters and the digit.
+I=0
+SUBFROM[$I]=a SUBTO[$I]=ā TONE[$I]=1 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=a SUBTO[$I]=á TONE[$I]=2 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=a SUBTO[$I]=ǎ TONE[$I]=3 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=a SUBTO[$I]=à TONE[$I]=4 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=a SUBTO[$I]=a TONE[$I]=5 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=e SUBTO[$I]=ē TONE[$I]=1 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=e SUBTO[$I]=é TONE[$I]=2 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=e SUBTO[$I]=ě TONE[$I]=3 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=e SUBTO[$I]=è TONE[$I]=4 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=e SUBTO[$I]=e TONE[$I]=5 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=ou SUBTO[$I]=ōu TONE[$I]=1 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=ou SUBTO[$I]=óu TONE[$I]=2 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=ou SUBTO[$I]=ǒu TONE[$I]=3 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=ou SUBTO[$I]=òu TONE[$I]=4 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=ou SUBTO[$I]=ou TONE[$I]=5 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=i SUBTO[$I]=ī TONE[$I]=1 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=i SUBTO[$I]=í TONE[$I]=2 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=i SUBTO[$I]=ǐ TONE[$I]=3 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=i SUBTO[$I]=ì TONE[$I]=4 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=i SUBTO[$I]=i TONE[$I]=5 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=o SUBTO[$I]=ō TONE[$I]=1 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=o SUBTO[$I]=ó TONE[$I]=2 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=o SUBTO[$I]=ǒ TONE[$I]=3 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=o SUBTO[$I]=ò TONE[$I]=4 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=o SUBTO[$I]=o TONE[$I]=5 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=u SUBTO[$I]=ū TONE[$I]=1 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=u SUBTO[$I]=ú TONE[$I]=2 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=u SUBTO[$I]=ǔ TONE[$I]=3 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=u SUBTO[$I]=ù TONE[$I]=4 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=u SUBTO[$I]=u TONE[$I]=5 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=v SUBTO[$I]=ǖ TONE[$I]=1 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=v SUBTO[$I]=ǘ TONE[$I]=2 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=v SUBTO[$I]=ǚ TONE[$I]=3 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=v SUBTO[$I]=ǜ TONE[$I]=4 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=v SUBTO[$I]=ü TONE[$I]=5 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=r SUBTO[$I]="'r" TONE[$I]=5 LAST[$I]=yes I=$((I+1))
+
+# Step 3: for each row, loop (label named after SUBTO) over two
+# substitutions.  The first puts SUBTO in place of SUBFROM and removes the
+# tone digit inside "[...]", and removes the matching digit inside
+# "(...)"; "{...}" keeps its digits.  The second does the "[...]"
+# replacement for a bracket group that follows a "/".
+for IND in "${!TONE[@]}"
+do
+    # Class contents for the letters allowed before the tone digit.
+    if [ "${LAST[$IND]}" = yes ]
+    then
+        AFTER=$CONSONS
+    else
+        AFTER='[:alpha:]'
+    fi
+    PINY="${PINY}
+: ${SUBTO[$IND]}
+s/^\([^[]\+\[[[:alnum:][:space:]·,]*[[:alpha:]]*\)\(${SUBFROM[$IND]}\)\([${AFTER}]*\)${TONE[$IND]}\([[:alnum:][:space:]·,]*\][[:space:]]{[[:alnum:][:space:]·,]*}[[:space:]]([[:alnum:][:space:]·,]*[[:alpha:]]*${SUBFROM[$IND]}[${AFTER}]*\)${TONE[$IND]}\([[:alnum:][:space:]·,]*)\)/\1${SUBTO[$IND]}\3\4\5/
+s/^\([^[]\+\[.*\].*{.*}.*(.*).*\/.*\[[[:alnum:][:space:]·,]*[[:alpha:]]*\)\(${SUBFROM[$IND]}\)\([${AFTER}]*\)${TONE[$IND]}\([[:alnum:][:space:]·,]*\].*$\)/\1${SUBTO[$IND]}\3\4/
+t ${SUBTO[$IND]}"
+done
+
+# gawk program, assembled piecewise so that each flavor gets its own
+# variables (e.g. outsmpl, newsmpl) and arrays (e.g. smpl, smpl_ind).
+#   adddefs    appends the equivalents line and the numbered definitions
+#              of one source line to the entry stored under key "new"
+#   arrayprint sends every stored entry to the dictfmt coprocess "out"
+AWK='function adddefs(entries, entries_ind, new, equivs, defs,    i)
+{
+    entries[new] = entries[new] "\n\n" equivs "\n"
+    # defs is indexed 1..n; count upwards, because "for (i in defs)"
+    # visits the keys in an unspecified order.
+    for (i = 1; (i in defs); i++)
+    {
+        entries_ind[new]++
+        entries[new] = entries[new] "\n " entries_ind[new] ". " defs[i]
+    }
+}
+function arrayprint(array, out,    word)
+{
+    for (word in array)
+    {
+        print "_____\n\n" word array[word] |& out
+    }
+}
+
+BEGIN {
+    FS="/"'
+# Command line of the dictfmt coprocess of each flavor.
+for IND in "${!FLAVOR[@]}"
+do
+    AWK=$AWK'
+    out'${FLAVOR[$IND]}' = "dictfmt -c5 -u '"$URL"' -s \"'"${FLAVORINFO[$IND]} $SHORT"'\" --allchars --utf8 cedict'${FLAVORAPPEND[$IND]}'"'
+done
+AWK=$AWK'
+}
+
+# Lines starting with "# ": send them, without the first two characters
+# and the last one, to every dictfmt.
+/^# / {'
+for IND in "${!FLAVOR[@]}"
+do
+    AWK=$AWK'
+    print substr($0, 3, length() -3) |& out'${FLAVOR[$IND]}
+done
+AWK=$AWK'
+}
+
+# Lines starting with "#! ": send them, without the first three
+# characters and the last one, to every dictfmt.
+/^#! / {'
+for IND in "${!FLAVOR[@]}"
+do
+    AWK=$AWK'
+    print substr($0, 4, length() -4) |& out'${FLAVOR[$IND]}
+done
+AWK=$AWK'
+}
+
+# Every other non-empty line: split the text before the first "/" into
+# headwords and pinyin forms, and collect fields 2..NF-1 as definitions.
+/^[^#]/ {
+    split($1, defined, /\[/)
+    split(defined[1], '${SPLITARRAY[0]}', / /)
+    split (defined[2], '${SPLITARRAY[1]}', /\] \{|\} \(|\)/)'
+for IND in "${!FLAVOR[@]}"
+do
+    AWK=$AWK'
+    new'${FLAVOR[$IND]}' = '${SPLITARRAY[${FLAVORSPLITTER[$IND]}]}'['${FLAVORSPLITINDEX[$IND]}']
+    '
+done
+AWK=$AWK'equivs = '
+# Join the per-flavor headword variables with " | "; the last six
+# characters (the separator after the final name) are cut off again.
+SEQ=
+for IND in "${!FLAVOR[@]}"
+do
+    SEQ=$SEQ'new'${FLAVOR[$IND]}' " | " '
+done
+SEQ="${SEQ:0:-6}"
+AWK=$AWK$SEQ
+AWK=$AWK'
+    delete defs
+    for (i = 2 ; i < NF ; i++)
+    {
+        defs[i-1] = $i
+    }'
+# NOTE(review): the two "," alternatives in the gsub pattern below are
+# identical as they appear here; one of them may originally have been a
+# different comma character -- confirm against the repository.
+for IND in "${!FLAVOR[@]}"
+do
+    AWK=$AWK'
+    gsub(/[[:space:]]|,|,/, "", new'${FLAVOR[$IND]}')
+    adddefs('${FLAVOR[$IND]}', '${FLAVOR[$IND]}'_ind, new'${FLAVOR[$IND]}', equivs, defs)'
+done
+AWK=$AWK'
+}
+
+# After the last line: write out every flavor and close its dictfmt.
+END {'
+for IND in "${!FLAVOR[@]}"
+do
+    AWK=$AWK'
+    arrayprint('${FLAVOR[$IND]}', out'${FLAVOR[$IND]}')
+    close(out'${FLAVOR[$IND]}')'
+done
+AWK=$AWK'
+}'
+
+# "${1:--}" names the first argument, or "-" (standard input for cat)
+# when none was given.
+cat -- "${1:--}" | gunzip - | sed "$PINY" | gawk -- "$AWK"