summaryrefslogtreecommitdiff
path: root/mkdict-cedict
diff options
context:
space:
mode:
authorRunciter2024-10-24 23:43:52 +0800
committerRunciter2024-10-24 23:43:52 +0800
commit1bb276d2340eff428ff397dbd493646d880388c4 (patch)
tree2969506db461ce4cff264b1f13437004a08cc2e0 /mkdict-cedict
downloadcc-cedict-0.1.tar.gz
A AUTHORS A COPYING A ChangeLog A INSTALL A Makefile.am A Makefile.in A NEWS A README A aclocal.m4 A configure A configure.ac A install-sh A missing A mkdict-cedict A raw/cedict_1_0_ts_utf-8_mdbg.txt.gz
Diffstat (limited to 'mkdict-cedict')
-rwxr-xr-xmkdict-cedict201
1 files changed, 201 insertions, 0 deletions
diff --git a/mkdict-cedict b/mkdict-cedict
new file mode 100755
index 0000000..f4201ec
--- /dev/null
+++ b/mkdict-cedict
@@ -0,0 +1,201 @@
+#!/usr/bin/env bash
+# mkdict-cedict: build dictd format dictionaries from CC-CEDICT
+# Copyright (C) 2024 Runciter <runciter@whispers-vpn.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+# Letters allowed between a toned vowel and the tone digit in the rules
+# whose LAST flag is "yes".
+CONSONS=bcdfghjklmnpqrstwxyz
+
+# Source URL and short description handed to dictfmt (-u and -s).
+URL='https://www.mdbg.net/chinese/dictionary?page=cc-cedict'
+SHORT='CC-CEDICT: community maintained Chinese-English dictionary'
+
+# Names of the awk arrays that receive the split headword fields.
+SPLITARRAY=(zi piny)
+
+# One column per generated dictionary:
+#   FLAVOR            tag used to build awk identifiers (out<tag>, new<tag>, <tag>)
+#   FLAVORINFO        word placed in front of $SHORT in the description
+#   FLAVORAPPEND      suffix added to the "cedict" database base name
+#   FLAVORSPLITTER    index into SPLITARRAY selecting the awk array to read
+#   FLAVORSPLITINDEX  element of that awk array used as the headword
+FLAVOR=(           smpl        trad         acc      numb      bare )
+FLAVORINFO=(       simplified  traditional  pinyin   numbered  bare )
+FLAVORAPPEND=(     -smpl       -trad        -pinyin  -numb     -bare )
+FLAVORSPLITTER=(   0           0            1        1         1 )
+FLAVORSPLITINDEX=( 2           1            1        2         3 )
+
+# First sed stage: inside a square-bracketed group, rewrite "u:" as "v".
+# "t v" jumps back to label "v" for as long as a substitution was made.
+PINY=": v
+s/^\(.\+\)\[\([[:alnum:][:space:]·,:]*\)u:\([[:alnum:][:space:]·,:]*\)\]/\1[\2v\3]/
+t v"
+
+# Second stage: right after the first [...] group of the line, append two
+# copies of its contents, giving "[x] {x} (x) ".
+PINY="${PINY}
+s/^\([^[]\+\)\[\([[:alnum:][:space:]·,]*\)\]/\1[\2] {\2} (\2) /"
+
+# Tone-mark substitution table, one row per generated sed rule:
+#   SUBFROM  plain letter(s) searched for in a numbered syllable
+#   SUBTO    replacement text (also used as the sed label of the rule)
+#   TONE     tone digit that must follow the syllable
+#   LAST     "yes": only $CONSONS letters may sit between SUBFROM and the
+#            digit; "no": any alphabetic characters may
+# Rows are turned into rules in index order by the loop that follows, so the
+# a / e / ou rows run before the single i, o, u, v rows.
+# Tone 5 rows only drop the digit, except v (becomes ü) and r (becomes 'r).
+I=0
+SUBFROM[$I]=a SUBTO[$I]=ā TONE[$I]=1 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=a SUBTO[$I]=á TONE[$I]=2 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=a SUBTO[$I]=ǎ TONE[$I]=3 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=a SUBTO[$I]=à TONE[$I]=4 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=a SUBTO[$I]=a TONE[$I]=5 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=e SUBTO[$I]=ē TONE[$I]=1 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=e SUBTO[$I]=é TONE[$I]=2 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=e SUBTO[$I]=ě TONE[$I]=3 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=e SUBTO[$I]=è TONE[$I]=4 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=e SUBTO[$I]=e TONE[$I]=5 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=ou SUBTO[$I]=ōu TONE[$I]=1 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=ou SUBTO[$I]=óu TONE[$I]=2 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=ou SUBTO[$I]=ǒu TONE[$I]=3 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=ou SUBTO[$I]=òu TONE[$I]=4 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=ou SUBTO[$I]=ou TONE[$I]=5 LAST[$I]=no I=$((I+1))
+SUBFROM[$I]=i SUBTO[$I]=ī TONE[$I]=1 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=i SUBTO[$I]=í TONE[$I]=2 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=i SUBTO[$I]=ǐ TONE[$I]=3 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=i SUBTO[$I]=ì TONE[$I]=4 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=i SUBTO[$I]=i TONE[$I]=5 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=o SUBTO[$I]=ō TONE[$I]=1 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=o SUBTO[$I]=ó TONE[$I]=2 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=o SUBTO[$I]=ǒ TONE[$I]=3 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=o SUBTO[$I]=ò TONE[$I]=4 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=o SUBTO[$I]=o TONE[$I]=5 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=u SUBTO[$I]=ū TONE[$I]=1 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=u SUBTO[$I]=ú TONE[$I]=2 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=u SUBTO[$I]=ǔ TONE[$I]=3 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=u SUBTO[$I]=ù TONE[$I]=4 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=u SUBTO[$I]=u TONE[$I]=5 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=v SUBTO[$I]=ǖ TONE[$I]=1 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=v SUBTO[$I]=ǘ TONE[$I]=2 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=v SUBTO[$I]=ǚ TONE[$I]=3 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=v SUBTO[$I]=ǜ TONE[$I]=4 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=v SUBTO[$I]=ü TONE[$I]=5 LAST[$I]=yes I=$((I+1))
+SUBFROM[$I]=r SUBTO[$I]="'r" TONE[$I]=5 LAST[$I]=yes I=$((I+1))
+
+# Append one looping sed rule per row of the tone table.  Each rule:
+#   1. in the first [...] group, replaces SUBFROM + trailing letters + TONE
+#      digit by SUBTO + the same letters, and drops the tone digit of a
+#      syllable of the same shape in the (...) copy; the {...} copy is kept;
+#   2. applies the same replacement inside a [...] group located after the
+#      first "/" of the line;
+#   3. branches back to its own label while either substitution succeeds.
+for IND in "${!TONE[@]}"
+do
+    # Bracket-expression contents for the letters allowed between SUBFROM
+    # and the tone digit; computed once per row in the shell itself.
+    if [ "${LAST[$IND]}" = yes ]
+    then
+        TAIL=$CONSONS
+    else
+        TAIL='[:alpha:]'
+    fi
+    PINY="${PINY}
+: ${SUBTO[$IND]}
+s/^\([^[]\+\[[[:alnum:][:space:]·,]*[[:alpha:]]*\)\(${SUBFROM[$IND]}\)\([${TAIL}]*\)${TONE[$IND]}\([[:alnum:][:space:]·,]*\][[:space:]]{[[:alnum:][:space:]·,]*}[[:space:]]([[:alnum:][:space:]·,]*[[:alpha:]]*${SUBFROM[$IND]}[${TAIL}]*\)${TONE[$IND]}\([[:alnum:][:space:]·,]*)\)/\1${SUBTO[$IND]}\3\4\5/
+s/^\([^[]\+\[.*\].*{.*}.*(.*).*\/.*\[[[:alnum:][:space:]·,]*[[:alpha:]]*\)\(${SUBFROM[$IND]}\)\([${TAIL}]*\)${TONE[$IND]}\([[:alnum:][:space:]·,]*\].*$\)/\1${SUBTO[$IND]}\3\4/
+t ${SUBTO[$IND]}"
+done
+
+# Start of the generated gawk program.
+#   adddefs(entries, entries_ind, new, equivs, defs)
+#       appends to entries[new] a blank line, the equivs line and every
+#       element of defs, numbered with the running counter entries_ind[new].
+#   arrayprint(array, out)
+#       sends every element of array to the coprocess out, each one preceded
+#       by a "_____" separator line and its key.
+# The BEGIN block sets "/" as field separator and stores one dictfmt command
+# line per flavor in the awk variable out<flavor>.
+AWK='function adddefs(entries, entries_ind, new, equivs, defs,    i)
+{
+    entries[new] = entries[new] "\n\n" equivs "\n"
+    # Walk defs by ascending index: "for (i in defs)" has no defined order,
+    # which could number the definitions out of sequence.
+    for (i = 1; i in defs; i++)
+    {
+        entries_ind[new]++
+        entries[new] = entries[new] "\n " entries_ind[new] ". " defs[i]
+    }
+}
+function arrayprint(array, out,    word)
+{
+
+    for (word in array)
+    {
+        print "_____\n\n" word array[word] |& out
+    }
+}
+
+BEGIN {
+    FS="/"'
+for IND in "${!FLAVOR[@]}"
+do
+    AWK=$AWK'
+    out'${FLAVOR[$IND]}' = "dictfmt -c5 -u '"$URL"' -s \"'"${FLAVORINFO[$IND]} $SHORT"'\" --allchars --utf8 cedict'${FLAVORAPPEND[$IND]}'"'
+done
+# Close BEGIN and open the rule for input lines starting with "# ".
+AWK=$AWK'
+}
+
+/^# / {'
+# Send the comment text to every dictfmt coprocess: substr() starts after
+# the two-character prefix and stops one character before the end of line.
+for IND in ${!FLAVOR[@]}
+do
+ AWK=$AWK'
+ print substr($0, 3, length() -3) |& out'${FLAVOR[$IND]}
+done
+# Same treatment for lines starting with "#! " (three-character prefix).
+AWK=$AWK'
+}
+
+/^#! / {'
+for IND in ${!FLAVOR[@]}
+do
+ AWK=$AWK'
+ print substr($0, 4, length() -4) |& out'${FLAVOR[$IND]}
+done
+# Close the "#! " rule and open the rule for dictionary entries (lines not
+# starting with "#").  $1 is the text before the first "/": the part in
+# front of "[" is split on spaces into the first SPLITARRAY array, the rest
+# is split on "] {", "} (" and ")" into the second one.
+AWK=$AWK'
+}
+
+/^[^#]/ {
+    split($1, defined, /\[/)
+    split(defined[1], '${SPLITARRAY[0]}', / /)
+    split (defined[2], '${SPLITARRAY[1]}', /\] \{|\} \(|\)/)'
+# Pick each flavor headword from the array and element configured in
+# FLAVORSPLITTER / FLAVORSPLITINDEX.
+for IND in "${!FLAVOR[@]}"
+do
+    AWK=$AWK'
+    new'${FLAVOR[$IND]}' = '${SPLITARRAY[${FLAVORSPLITTER[$IND]}]}'['${FLAVORSPLITINDEX[$IND]}']
+    '
+done
+AWK=$AWK'equivs = '
+# Build the awk expression joining all headwords with " | ".  SEQ starts
+# empty so that a value inherited from the environment cannot leak into the
+# generated program.
+SEQ=
+for IND in "${!FLAVOR[@]}"
+do
+    SEQ=$SEQ'new'${FLAVOR[$IND]}' " | " '
+done
+# Drop the separator left after the last headword.
+SEQ="${SEQ:0:-6}"
+AWK=$AWK$SEQ
+# Fields 2 .. NF-1 are the definitions; the field after the last "/" is
+# ignored.
+AWK=$AWK'
+    delete defs
+    for (i = 2 ; i < NF ; i++)
+    {
+        defs[i-1] = $i
+    }'
+# Strip whitespace and commas from each headword (after equivs was built),
+# then record the entry under it.
+# NOTE(review): the two comma alternatives in the gsub pattern look
+# identical here; one may originally have been another character - confirm.
+for IND in "${!FLAVOR[@]}"
+do
+    AWK=$AWK'
+    gsub(/[[:space:]]|,|,/, "", new'${FLAVOR[$IND]}')
+    adddefs('${FLAVOR[$IND]}', '${FLAVOR[$IND]}'_ind, new'${FLAVOR[$IND]}', equivs, defs)'
+done
+# Close the entry rule.  At end of input, send the accumulated entries of
+# each flavor to its dictfmt coprocess, then close that coprocess.
+AWK=$AWK'
+}
+
+END {'
+for IND in ${!FLAVOR[@]}
+do
+ AWK=$AWK'
+ arrayprint('${FLAVOR[$IND]}', out'${FLAVOR[$IND]}')
+ close(out'${FLAVOR[$IND]}')'
+done
+# Close the END block; the awk program is now complete.
+AWK=$AWK'
+}'
+
+# Decompress the CC-CEDICT archive named by $1 (standard input when no
+# argument is given), rewrite the pinyin with sed, then let gawk feed dictfmt.
+# The file is opened by the shell so that names containing spaces or
+# starting with "-" are handled.
+gunzip -c < "${1:-/dev/stdin}" | sed "$PINY" | gawk -- "$AWK"