#!/usr/bin/env bash
# mkdict-cedict: build dictd format dictionaries from CC-CEDICT
# Copyright (C) 2024 Runciter
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# Usage: mkdict-cedict [CEDICT.gz]
#
# Decompresses the gzipped CC-CEDICT file given as $1 (standard input when
# $1 is missing, empty or "-"), rewrites every entry with a generated sed
# program and feeds the result to a generated gawk program.  The gawk
# program starts one `dictfmt -c5` process per flavor, using the base names
# cedict-smpl, cedict-trad, cedict-pinyin, cedict-numb and cedict-bare.
#
# Requirements visible from the code: bash (arrays), GNU sed (`\+` in basic
# regular expressions), gawk (`|&` two-way pipes) and dictfmt.
# NOTE(review): the sed character classes are expected to match accented
# letters, which presumably needs a UTF-8 locale -- confirm.

# Stop on errors and unset variables; make a failing gunzip/sed stage fail
# the whole pipeline instead of being masked by gawk's exit status.
set -euo pipefail

# Consonants allowed between a tone-marked vowel and the tone digit when the
# vowel has to be the last vowel of its syllable (LAST=yes below).
CONSONS=bcdfghjklmnpqrstwxyz
URL='https://www.mdbg.net/chinese/dictionary?page=cc-cedict'
SHORT="CC-CEDICT: community maintained Chinese-English dictionary"

# Names of the two awk arrays an entry head is split into:
#   zi   - the space-separated words before "["
#   piny - the three pinyin renderings produced by the sed program
SPLITARRAY=(zi piny)

# One column per generated dictionary.
#   FLAVOR           - suffix used for the awk variable names
#   FLAVORINFO       - word prepended to $SHORT in the dictfmt -s description
#   FLAVORAPPEND     - suffix of the dictfmt base name (cedict<suffix>)
#   FLAVORSPLITTER   - index into SPLITARRAY selecting the source awk array
#   FLAVORSPLITINDEX - element of that awk array used as the headword
FLAVOR=(smpl trad acc numb bare)
FLAVORINFO=(simplified traditional pinyin numbered bare)
FLAVORAPPEND=(-smpl -trad -pinyin -numb -bare)
FLAVORSPLITTER=(0 0 1 1 1)
FLAVORSPLITINDEX=(2 1 1 2 3)

# Tone substitution table, filled in by the helpers below.
#   SUBFROM - plain letter(s) to look for
#   SUBTO   - replacement written into the accented rendering
#   TONE    - tone digit that has to follow the syllable
#   LAST    - "yes": only consonants from $CONSONS may stand between SUBFROM
#             and the tone digit; "no": any letters may
SUBFROM=()
SUBTO=()
TONE=()
LAST=()

# add_tone_rule FROM TO TONE LAST: append one row to the substitution table.
add_tone_rule() {
  SUBFROM+=("$1")
  SUBTO+=("$2")
  TONE+=("$3")
  LAST+=("$4")
}

# add_vowel FROM LAST TO1 TO2 TO3 TO4 TO5: append the rows for tones 1 to 5
# of one vowel, in that order.
add_vowel() {
  local from=$1 last=$2 tone=0 marked
  shift 2
  for marked in "$@"; do
    tone=$((tone + 1))
    add_tone_rule "$from" "$marked" "$tone" "$last"
  done
}

# Row order matters: the sed rules are emitted, and therefore tried, in
# exactly this order.
add_vowel a no ā á ǎ à a
add_vowel e no ē é ě è e
add_vowel ou no ōu óu ǒu òu ou
add_vowel i yes ī í ǐ ì i
add_vowel o yes ō ó ǒ ò o
add_vowel u yes ū ú ǔ ù u
add_vowel v yes ǖ ǘ ǚ ǜ ü
add_tone_rule r "'r" 5 yes

# sed program, part 1:
#   - loop: rewrite "u:" as "v" inside bracketed pinyin until none is left
#   - duplicate the first bracketed pinyin: "[p]" becomes "[p] {p} (p) "
PINY=": v
s/^\(.\+\)\[\([[:alnum:][:space:]·,:]*\)u:\([[:alnum:][:space:]·,:]*\)\]/\1[\2v\3]/
t v
s/^\([^[]\+\)\[\([[:alnum:][:space:]·,]*\)\]/\1[\2] {\2} (\2) /"

# sed program, part 2: one labelled loop per table row.
#   1st s command: in the leading "[...]" replace SUBFROM+tone digit by
#                  SUBTO, and drop the same tone digit inside "(...)";
#                  "{...}" keeps the numbered form.
#   2nd s command: apply SUBTO to bracketed pinyin that appears after the
#                  first "/" (i.e. inside the definitions).
for IND in "${!TONE[@]}"; do
  FROM=${SUBFROM[$IND]}
  TO=${SUBTO[$IND]}
  DIGIT=${TONE[$IND]}
  # Contents of the bracket expression matching what may follow SUBFROM.
  if [[ ${LAST[$IND]} == yes ]]; then
    TRAIL=$CONSONS
  else
    TRAIL='[:alpha:]'
  fi
  PINY+="
: ${TO}
s/^\([^[]\+\[[[:alnum:][:space:]·,]*[[:alpha:]]*\)\(${FROM}\)\([${TRAIL}]*\)${DIGIT}\([[:alnum:][:space:]·,]*\][[:space:]]{[[:alnum:][:space:]·,]*}[[:space:]]([[:alnum:][:space:]·,]*[[:alpha:]]*${FROM}[${TRAIL}]*\)${DIGIT}\([[:alnum:][:space:]·,]*)\)/\1${TO}\3\4\5/
s/^\([^[]\+\[.*\].*{.*}.*(.*).*\/.*\[[[:alnum:][:space:]·,]*[[:alpha:]]*\)\(${FROM}\)\([${TRAIL}]*\)${DIGIT}\([[:alnum:][:space:]·,]*\].*$\)/\1${TO}\3\4/
t ${TO}"
done

# Per-flavor fragments of the gawk program, collected in a single pass.
AWK_BEGIN=   # opens the dictfmt command string of each flavor
AWK_COMMENT= # forwards "# " header lines to every dictfmt
AWK_META=    # forwards "#! " header lines to every dictfmt
AWK_PICK=    # picks the headword of each flavor
AWK_STORE=   # normalises the headword and records the definitions
AWK_END=     # dumps the collected entries and closes the pipes
SEQ=         # awk expression joining all headwords with " | "

# NOTE(review): in the gsub() regex below the second "," alternative is
# redundant as written; it may originally have been a full-width comma
# that was lost in transcription -- confirm against upstream.
for IND in "${!FLAVOR[@]}"; do
  NAME=${FLAVOR[$IND]}

  # The URL is double-quoted inside the command because it contains "?",
  # which the shell running dictfmt would otherwise treat as a glob.
  printf -v FRAG '  out%s = "dictfmt -c5 -u \\"%s\\" -s \\"%s %s\\" --allchars --utf8 cedict%s"\n' \
    "$NAME" "$URL" "${FLAVORINFO[$IND]}" "$SHORT" "${FLAVORAPPEND[$IND]}"
  AWK_BEGIN+=$FRAG

  printf -v FRAG '  print substr($0, 3, length() -3) |& out%s\n' "$NAME"
  AWK_COMMENT+=$FRAG

  printf -v FRAG '  print substr($0, 4, length() -4) |& out%s\n' "$NAME"
  AWK_META+=$FRAG

  printf -v FRAG '  new%s = %s[%s]\n' \
    "$NAME" "${SPLITARRAY[${FLAVORSPLITTER[$IND]}]}" "${FLAVORSPLITINDEX[$IND]}"
  AWK_PICK+=$FRAG

  SEQ+="new${NAME} \" | \" "

  printf -v FRAG '  gsub(/[[:space:]]|,|,/, "", new%s)\n  adddefs(%s, %s_ind, new%s, equivs, defs)\n' \
    "$NAME" "$NAME" "$NAME" "$NAME"
  AWK_STORE+=$FRAG

  printf -v FRAG '  arrayprint(%s, out%s)\n  close(out%s)\n' "$NAME" "$NAME" "$NAME"
  AWK_END+=$FRAG
done
# Drop the separator that follows the last headword.
SEQ=${SEQ%' " | " '}

# gawk program.  Fields are separated by "/", so $1 is the entry head and
# $2 .. $(NF-1) are the definitions.  adddefs() walks defs by ascending
# index: a plain `for (i in defs)` has no defined order and could number
# the definitions out of sequence.
AWK='function adddefs(entries, entries_ind, new, equivs, defs,    i) {
  entries[new] = entries[new] "\n\n" equivs "\n"
  for (i = 1; (i in defs); i++) {
    entries_ind[new]++
    entries[new] = entries[new] "\n " entries_ind[new] ". " defs[i]
  }
}
function arrayprint(array, out,    word) {
  for (word in array) {
    print "_____\n\n" word array[word] |& out
  }
}
BEGIN {
  FS="/"
'"$AWK_BEGIN"'}
/^# / {
'"$AWK_COMMENT"'}
/^#! / {
'"$AWK_META"'}
/^[^#]/ {
  split($1, defined, /\[/)
  split(defined[1], '"${SPLITARRAY[0]}"', / /)
  split(defined[2], '"${SPLITARRAY[1]}"', /\] \{|\} \(|\)/)
'"$AWK_PICK"'  equivs = '"$SEQ"'
  delete defs
  for (i = 2 ; i < NF ; i++) {
    defs[i-1] = $i
  }
'"$AWK_STORE"'}
END {
'"$AWK_END"'}'

# Read the named file, or standard input when no usable name was given.
INPUT=${1:--}
if [[ $INPUT == - ]]; then
  INPUT=/dev/stdin
fi

gunzip -c <"$INPUT" | sed "$PINY" | gawk -- "$AWK"