[TUHS] Canonical Historic Character Encoding Conversion?
Steffen Nurpmeso via TUHS
tuhs at tuhs.org
Thu Nov 20 01:28:01 AEST 2025
Ah. Oh. TUHS strips attachments.
Here inline:
#!/bin/sh -
#@ Update character-sets from IANA.
#@ Adapted from a smart awk script written by Gaetan Bisson, and adjusted.
# Location of the IANA character-sets registry (XML).
url_cs='https://www.iana.org/assignments/character-sets/character-sets.xml'

# Settings; each one keeps a non-empty value that is already set by the caller.
FMT=${FMT:-txt}      # output format: txt, c or list
FETCH=${FETCH:-}     # non-empty: download the registry before processing
DBG=${DBG:-}         # non-empty: emit additional comments (FMT=c)
MIME=${MIME:-}       # non-empty: only preferred MIME name plus normalizations (FMT=c)

# Programs to use.
awk=${awk:-awk}
curl=${curl:-curl}

# Creation timestamp written into the generated header.
datetime=$(date +'%FT%T%z')
# download: fetch ${url_cs} with ${curl} into ./character-sets.xml.
#
# Runs in a subshell.  Prints the exit status of the transfer and a reminder,
# then returns the status of ${curl} itself (not that of the trailing echo), so
# that "download || exit" in the caller really stops on a failed transfer.
download() (
	# ${curl} stays unquoted so that it may carry extra arguments.
	${curl} -v -o character-sets.xml "${url_cs}"
	rc=${?}
	echo download ok: ${rc}
	echo remember to adjust ASCII alias
	exit ${rc}
)
# process: convert ./character-sets.xml into the format selected by FMT and
# write the result to standard output.
#
# Reads the shell variables awk, FMT, MIME, DBG, url_cs and datetime.
#   FMT=txt   "Name: / MIBenum: / MIME: / Alias:" paragraphs, one per record
#   FMT=c     C source: one string array per record plus the a_iconv_db table
#   FMT=list  the bare name and aliases, one per line
# Exits the script with status 30 when awk fails (bogus input, unknown FMT).
#
# All registry data is handed to printf as "%s" arguments and never as part of
# the format string, so a "%" inside a name or alias is emitted literally.
process() {
	${awk} -F "[<>]" -v FMT="${FMT}" -v MIME="${MIME}" -v DBG="${DBG}" -v URL="${url_cs}" -v DT="${datetime}" '
		# Report the current input line as malformed and stop with status 1.
		function err(){
			print "Bogus: " FILENAME " at line " FNR ": " $0 > "/dev/stderr"
			exit 1
		}

		# Normalize algorithm: lowercase, turn punctuation into spaces, squeeze
		# whitespace runs, then put a space at every letter/digit boundary.
		# o and rest are function-local scratch variables.
		function c_norm(n,    o, rest){
			n = tolower(n)
			gsub("[[:punct:]]", " ", n)
			gsub("[[:space:]]+", " ", n)
			for(;;){
				o = match(n, "[[:lower:]][[:digit:]]")
				if(o == 0){
					o = match(n, "[[:digit:]][[:lower:]]")
					if(o == 0)
						break
				}
				# o is the last character before the boundary.
				rest = substr(n, o + 1)
				n = substr(n, 1, o) " " rest
			}
			return n
		}

		BEGIN {header_dumped = parse = hot = datno = 0}

		# Only lines between the first <registry and the first </registry count.
		/<registry/ {parse = 1}
		/<\/registry/ {exit}

		# A new record: reset the per-record state; records must not nest.
		/<record/{
			if(!parse) next
			if(hot) err()
			hot = 1
			n = mib = p = ""
			acnt = 0
		}

		# With -F "[<>]" the element content of "<tag>text</tag>" is field 3.
		/<name/ {if(!parse) next; if(!hot) err(); if(n) err(); n = $3}
		/<value/ {if(!parse) next; if(!hot) err(); if(mib) err(); mib = $3}
		/<alias/ {if(!parse) next; if(!hot) err(); aa[++acnt] = $3}
		/<preferred_alias/ {if(!parse) next; if(!hot) err(); if(p) err(); p = $3}

		# End of record: emit it in the requested format.
		/<\/record/{
			if(!parse) next; if(!hot) err()
			hot = 0
			if(!n) err()
			# Without a preferred alias the name itself is the MIME name.
			if(!p)
				p = n
			p = tolower(p)

			if(FMT == "txt"){
				if(!header_dumped++){
					print "# character-sets.txt, created " DT
					print "# Source: " URL
					print ""
				}
				print "Name: " n
				if(mib)
					print "MIBenum: " mib
				print "MIME: " p
				if(acnt > 0){
					for(i = 1; i <= acnt; ++i)
						print "Alias: " aa[i]
				}
				print ""
			}else if(FMT == "c"){
				if(!header_dumped++){
					print "/* IANA character-sets data, created " DT " */"
					print "/* Source: " URL " */"
					if(DBG)
						print ""
					print "static char const"
				}
				if(!mib)
					mib = "U16_MAX"
				++datno
				cdat[datno "mib"] = mib
				cdat[datno "name"] = p
				cdat[datno "mime_off"] = 0 # for now
				if(datno > 1)
					printf ","
				printf " * const a_iconv_cs_%d[] = {", datno
				if(DBG)
					printf "\n\t"

				# First entry is always the lowercased MIME name.
				printf "\"%s\"", p
				if(MIME){
					cnt = 0
					cdat[datno "mime_off"] = "U8_MAX";
				}else{
					printf ",\"%s\"", n
					cnt = 1
				}
				if(!MIME && acnt > 0){
					for(i = 1; i <= acnt; ++i){
						if(tolower(aa[i]) == p)
							cdat[datno "mime_off"] = cnt
						printf ",\"%s\"", aa[i]
						++cnt
					}
				}
				cdat[datno "cnt"] = cnt
				if(DBG)
					printf "\n\t"

				# Collect the distinct normalized forms of name and aliases.
				# (n is reused as scratch here; the next <record resets it.)
				cnt = 1
				cnorm[1] = c_norm(n)
				if(acnt > 0)
					for(i = 1; i <= acnt; ++i){
						n = c_norm(aa[i])
						for(j = 1;; ++j){
							if(cnorm[j] == n)
								break
							if(j == cnt){
								cnorm[++cnt] = n
								break;
							}
						}
					}
				cdat[datno "norm_cnt"] = cnt
				for(i = 1; i <= cnt; ++i)
					printf ",\"%s\"", cnorm[i]

				if(DBG){
					printf "\n\t/* mib=%s name=%s", cdat[datno "mib"], cdat[datno "name"]
					printf " cnt=%s mime_off=%s", cdat[datno "cnt"], cdat[datno "mime_off"]
					printf " norm_cnt=%s */\n", cdat[datno "norm_cnt"]
				}
				printf "}\n"
			}else if(FMT == "list"){
				print n
				if(acnt > 0){
					for(i = 1; i <= acnt; ++i)
						print aa[i]
				}
			}else{
				print "unknown FMT: " FMT > "/dev/stderr"
				exit 64
			}
		}

		# FMT=c: terminate the array declarations and emit the lookup table.
		END{
			if(FMT == "c"){
				printf ";\n"
				if(DBG){
					print ""
				}
				print "static struct a_iconv_cs const a_iconv_db[] = {"
				for(i = 1; i <= datno; ++i){
					if(DBG)
						printf "\t"
					printf "{%s,%s,%s", cdat[i "mib"], cdat[i "cnt"], cdat[i "mime_off"]
					printf ",%s,{0,},a_iconv_cs_%d},", cdat[i "norm_cnt"], i
					if(DBG)
						printf " /* %s */", cdat[i "name"]
					printf "\n"
				}
				print "};"
			}
		}
	' < character-sets.xml
	[ ${?} -eq 0 ] || exit 30
}
# Entry point: refresh the registry first when FETCH is non-empty (giving up
# with the status of a failed download), then run the conversion.
case "${FETCH}" in
'') ;;
*) download || exit ${?} ;;
esac
process
# s-itt-mode
--steffen
|
|Der Kragenbaer, The moon bear,
|der holt sich munter he cheerfully and one by one
|einen nach dem anderen runter wa.ks himself off
|(By Robert Gernhardt)
More information about the TUHS
mailing list