#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
将《dict_revised_2015_20250627.xlsx》转换为 Goldendict/MDX 的源 txt。

支持命令行：
  --group {head,head+pron,none}
      head       : 按字詞名合并（默认）
      head+pron  : 仅按多音序号(>0)合并；多音序号为 0/空 的行各自独立
      none       : 不合并，逐行一个词条
  --stats        : 打印统计信息
  --input/-i     : 输入 xlsx 文件（默认：dict_revised_2015_20250627.xlsx）
  --output/-o    : 输出 txt 文件（默认：dict_revised_2015_20250627.mdx.txt）
  --encoding/-e  : 输出编码（默认：utf-8；可选 utf-16le 等）

HTML 结构使用 class：edugycd,index,radical,bopomo,hanyu,alias,code,article,pron,accent,mean,syn,ant,polyref,variant,imgchar
"""

import argparse
import re
from pathlib import Path
import pandas as pd

# Mapping: polyphone ordinal (多音排序) -> Chinese numeral used in "(一)(二)…" labels.
CN_NUM = {1: "一", 2: "二", 3: "三", 4: "四", 5: "五", 6: "六"}

# Variant-type code -> display label (變體類型: 變/又音/語音/讀音).
VARIANT_TYPE = {
    "1": "變",
    "2": "又音",
    "3": "語音",
    "4": "讀音",
    # Also accept the Chinese labels themselves as keys (identity mapping).
    "變": "變",
    "又音": "又音",
    "語音": "語音",
    "讀音": "讀音",
}

# Curated alt-text for specific image placeholders used by the dictionary.
IMG_ALT_MAP = {
    "9868._104_0.gif": "未收錄篆字",
    "9b46._104_0.gif": "e^x",
    "9a73._104_0.gif": "x^2",
    "975d._104_0.gif": "a^π",
}

# Generic placeholder patterns: bare 4-hex-digit ".gif" names, and
# entity-style "&xxxx_.png;" references.
RE_IMG_GENERIC_GIF = re.compile(r"\b([0-9a-fA-F]{4}\._104_0\.gif)\b")
RE_IMG_ENTITY_PNG = re.compile(r"&([0-9a-fA-F]{4})_\.png;")

def replace_inline_images(text: str) -> str:
    """Replace inline image placeholders in *text* with <img> tags.

    Handles two placeholder forms:
      * ``XXXX._104_0.gif`` file names (4 hex digits) -> <img> whose alt text
        comes from IMG_ALT_MAP when available, else the hex prefix.
      * ``&XXXX_.png;`` entity-style references -> <img src="xxxx_.png">.

    Returns "" for falsy input.
    """
    if not text:
        return ""

    # Single pass over gif names. (A previous version first substituted the
    # IMG_ALT_MAP names explicitly and THEN ran the generic regex, which
    # re-matched the same file name inside the freshly inserted src="..."
    # attribute and produced nested, broken <img> tags. The generic
    # replacement already consults IMG_ALT_MAP, so one pass is sufficient
    # and safe.)
    def repl_generic(m):
        fname = m.group(1)
        alt = IMG_ALT_MAP.get(fname, fname.split("._")[0])
        return f'<img src="{fname}" class="imgchar" alt="{alt}" />'
    text = RE_IMG_GENERIC_GIF.sub(repl_generic, text)

    # &3ae4_.png; -> <img src="3ae4_.png">
    def repl_entity_png(m):
        hex4 = m.group(1).lower()
        fname = f"{hex4}_.png"
        return f'<img src="{fname}" class="imgchar" alt="{hex4}" />'
    text = RE_IMG_ENTITY_PNG.sub(repl_entity_png, text)
    return text

def clean_text(s):
    """Normalize a raw cell value into HTML-ready text.

    Converts to str, unifies line endings, strips one surrounding pair of
    quotes ("..." or 「...」), substitutes inline image placeholders, and turns
    newlines into <br /> tags. None becomes "".
    """
    if s is None:
        return ""
    text = str(s).replace("\r\n", "\n").replace("\r", "\n").strip()
    # Drop exactly one pair of enclosing quotes, if present.
    quoted = len(text) >= 2 and (
        (text[0] == '"' and text[-1] == '"')
        or (text[0] == "「" and text[-1] == "」")
    )
    if quoted:
        text = text[1:-1].strip()
    text = replace_inline_images(text)
    return text.replace("\n", "<br />")

def join_nonempty(parts, sep="　"):
    """Join the truthy items of *parts* with *sep* (default: fullwidth space)."""
    kept = (p for p in parts if p)
    return sep.join(kept)

def build_radical_line(row):
    """Build the radical/stroke-count display line for a single-character entry.

    Reads 部首字 (radical), 總筆畫數 (total stroke count) and 部首外筆畫數
    (strokes outside the radical) from *row*. Returns "" when all three are
    empty; otherwise a space-joined line of the non-empty pieces.
    """
    radical = str(row.get("部首字", "") or "").strip()
    total = str(row.get("總筆畫數", "") or "").strip()
    out = str(row.get("部首外筆畫數", "") or "").strip()
    if not radical and not total and not out:
        return ""
    pieces = []
    if radical:
        pieces.append(f"部首：{radical}")
    if out:
        # Conventional rendering is "部外 N 畫" (strokes beyond the radical);
        # the previous "部- N 畫" appears to be a garbled form of 部外.
        pieces.append(f"部外 {out} 畫")
    if total:
        pieces.append(f"共 {total} 畫")
    return " ".join(pieces)

def safe_int(x, default=0):
    """Parse *x* as an int, returning *default* for empty/None/unparseable values."""
    try:
        if x is None or x == "":
            return default
        return int(str(x).strip())
    except (ValueError, TypeError):
        # Narrowed from a bare ``except:`` which also swallowed
        # KeyboardInterrupt/SystemExit and hid genuine bugs.
        return default

def make_bopomo_and_pinyin_block(rows):
    """Build the combined zhuyin / pinyin display lines for one entry.

    Primary readings (注音一式 / 漢語拼音) and variant readings (變體注音 /
    變體漢語拼音) are merged into one line each, de-duplicated in first-seen
    order. "(一)(二)…" prefixes are added to primary readings only when the
    entry carries two or more distinct polyphone ordinals (> 0); variant
    readings are never prefixed. Returns (bopomo_html, hanyu_html); either
    may be "".
    """
    def order(r):
        n = safe_int(r.get("多音排序"), 0)
        return (0 if n > 0 else 1, n)
    ordered = sorted(rows, key=order)

    # Enumerate only when at least two distinct positive ordinals exist.
    distinct_ordinals = {
        safe_int(r.get("多音排序"), 0)
        for r in ordered
        if safe_int(r.get("多音排序"), 0) > 0
    }
    need_enum = len(distinct_ordinals) >= 2

    zhuyin, pinyin = [], []
    seen_z, seen_p = set(), set()

    def add(value, bucket, seen, prefix=""):
        # Append once per distinct raw value, keeping first-seen order.
        if value and value not in seen:
            bucket.append(prefix + value)
            seen.add(value)

    for r in ordered:
        n = safe_int(r.get("多音排序"), 0)
        prefix = f"({CN_NUM.get(n, str(n))}) " if (need_enum and n > 0) else ""
        add(str(r.get("注音一式", "") or "").strip(), zhuyin, seen_z, prefix)
        add(str(r.get("漢語拼音", "") or "").strip(), pinyin, seen_p, prefix)
        # Variant readings are supplementary and never enumerated.
        add(str(r.get("變體注音", "") or "").strip(), zhuyin, seen_z)
        add(str(r.get("變體漢語拼音", "") or "").strip(), pinyin, seen_p)

    bopomo_line = f'<span class="bopomo">注音一式：{join_nonempty(zhuyin)}</span>' if zhuyin else ""
    hanyu_line = f'<span class="hanyu">漢語拼音：{join_nonempty(pinyin)}</span>' if pinyin else ""
    return bopomo_line, hanyu_line

def build_defs_block(rows):
    """Render the 釋義 (definitions) section for one entry.

    When the entry has two or more distinct polyphone ordinals (> 0), each
    definition block is headed by "(一)(二)…" plus its zhuyin. Otherwise the
    zhuyin header appears only when the entry spans several rows. A
    variant-type label (變/又音/語音/讀音) is shown whenever present.
    Returns "" when there is nothing to render.
    """
    def order(r):
        n = safe_int(r.get("多音排序"), 0)
        return (0 if n > 0 else 1, n)
    ordered = sorted(rows, key=order)

    distinct = {
        safe_int(r.get("多音排序"), 0)
        for r in ordered
        if safe_int(r.get("多音排序"), 0) > 0
    }
    need_enum = len(distinct) >= 2
    multiple = len(ordered) > 1

    chunks = []
    for r in ordered:
        meaning = clean_text(r.get("釋義", ""))
        zhuyin = str(r.get("注音一式", "") or "").strip()
        n = safe_int(r.get("多音排序"), 0)
        raw_type = str(r.get("變體類型 1:變 2:又音 3:語音 4:讀音", "") or "").strip()
        label = VARIANT_TYPE.get(raw_type, raw_type)

        header = []
        if need_enum and n > 0:
            num = CN_NUM.get(n, str(n))
            if zhuyin:
                header.append(f'<span class="pron">({num}) {zhuyin}</span>')
            else:
                header.append(f'<span class="pron">({num})</span>')
        elif multiple and zhuyin:
            header.append(f'<span class="pron">{zhuyin}</span>')
        if label:
            header.append(f'<span class="accent">{label}</span>')

        if header:
            chunks.append(" ".join(header) + "<br />")
        if meaning:
            chunks.append(f'<span class="mean">{meaning}</span>')

    if not chunks:
        return ""
    return '<span class="article">釋義：<br />' + "<br />".join(chunks) + "</span>"

def build_entry_html(rows):
    """Generate the HTML body for one entry (wrapped in the edugycd container).

    *rows* are all source rows merged into this entry; row 0 supplies the
    headword and (for single characters) the radical/stroke line.
    Multi-valued fields (aliases, codes, synonyms, antonyms, polyphone
    cross-references, variant characters) are de-duplicated while preserving
    first-occurrence order so the generated file is stable across runs.
    """
    def _uniq(field):
        # Ordered de-duplication. The previous list({...}) relied on set
        # iteration order, which varies with string hash randomization and
        # made the generated dictionary non-deterministic between runs.
        values = (str(r.get(field, "") or "").strip() for r in rows)
        return list(dict.fromkeys(v for v in values if v))

    r0 = rows[0]
    head = str(r0.get("字詞名", "") or "").strip()
    head_for_display = replace_inline_images(head)

    lines = [f'<span class="index">{head_for_display}</span>']

    # Radical/stroke info only applies to single-character entries (字數 == 1).
    if safe_int(r0.get("字數"), 0) == 1:
        radical_line = build_radical_line(r0)
        if radical_line:
            lines.append(f'<span class="radical">{radical_line}</span>')

    # Merged zhuyin / pinyin lines.
    bopomo_line, hanyu_line = make_bopomo_and_pinyin_block(rows)
    if bopomo_line:
        lines.append(bopomo_line)
    if hanyu_line:
        lines.append(hanyu_line)

    # Aliases / entry codes (merged, de-duplicated).
    alias_set = _uniq("辭條別名")
    code_set = _uniq("字詞號")
    if alias_set:
        lines.append(f'<span class="alias">辭條別名：{"；".join(alias_set)}</span>')
    if code_set:
        lines.append(f'<span class="code">編碼：{"；".join(code_set)}</span>')

    # Definitions.
    defs_block = build_defs_block(rows)
    if defs_block:
        lines.append(defs_block)

    # Synonyms / antonyms.
    sim = _uniq("相似詞")
    ant = _uniq("相反詞")
    if sim:
        lines.append(f'<span class="syn">相似詞：{"；".join(sim)}</span>')
    if ant:
        lines.append(f'<span class="ant">相反詞：{"；".join(ant)}</span>')

    # Polyphone cross-references / variant characters.
    polyref = _uniq("多音參見訊息")
    if polyref:
        # Cross-references may contain newlines and image placeholders; clean each.
        lines.append(f'<span class="polyref">多音參見：{"；".join(clean_text(x) for x in polyref)}</span>')

    variants = _uniq("異體字")
    if variants:
        lines.append(f'<span class="variant">異體字：{"；".join(variants)}</span>')

    return '<div class="edugycd">' + "<br />".join(lines) + "</div>"

def group_rows(df: pd.DataFrame, mode: str):
    """Group dataframe rows into entries, preserving source order.

    Returns a list of (headword, [row, ...]) pairs.
      * "head" (default): merge every row sharing the same 字詞名.
      * "head+pron"     : merge only rows with the same positive 多音排序;
                          rows whose ordinal is 0/empty stay one-per-entry.
      * "none"          : one entry per row.
    Rows with an empty headword are skipped in every mode.
    """
    if mode not in ("none", "head+pron"):
        # "head" (and any unrecognized mode): merge purely by headword.
        by_head = {}
        head_order = []
        for _, row in df.iterrows():
            head = str(row.get("字詞名", "") or "").strip()
            if not head:
                continue
            if head not in by_head:
                by_head[head] = []
                head_order.append(head)
            by_head[head].append(row)
        return [(h, by_head[h]) for h in head_order]

    buckets = {}
    key_order = []
    for idx, row in df.iterrows():
        head = str(row.get("字詞名", "") or "").strip()
        if not head:
            continue
        if mode == "head+pron":
            n = safe_int(row.get("多音排序"), 0)
            # Positive ordinals merge; everything else stays per-row.
            key = (head, f"N{n}") if n > 0 else (head, f"R{idx}")
        else:  # "none": every row is its own entry
            key = (head, f"R{idx}")
        if key not in buckets:
            buckets[key] = []
            key_order.append(key)
        buckets[key].append(row)
    return [(k[0], buckets[k]) for k in key_order]

def write_mdx_txt(grouped, out_path: Path, encoding="utf-8"):
    """Write the MDX source txt. Each entry is emitted as::

        </>
        headword (plain text)
        `1`<div class="edugycd">...</div>
    """
    with out_path.open("w", encoding=encoding, newline="\n") as fh:
        for head_plain, rows in grouped:
            entry_html = build_entry_html(rows)
            fh.write(f"</>\n{head_plain}\n`1`{entry_html}\n")

def compute_stats(df: pd.DataFrame):
    """Compute summary statistics for the source table.

    Returns (stats, top_merged): *stats* maps Chinese label -> count, and
    *top_merged* is a Series of the (up to 20) headwords merged from the most
    rows (count > 1).
    """
    head = df["字詞名"].fillna("").astype(str).str.strip()
    total_rows = len(df)
    blank_rows = (head == "").sum()
    valid_rows = total_rows - blank_rows
    unique_heads = head[head != ""].nunique()

    # Entry count under "head+pron": per headword, one entry per distinct
    # positive 多音排序 plus one per row whose ordinal is 0/empty/non-numeric.
    entries_head_pron = 0
    for _, sub in df[head != ""].groupby(head):
        ordinals = []
        for raw in sub["多音排序"].fillna("").astype(str):
            raw = raw.strip()
            try:
                n = int(raw) if raw else 0
            except ValueError:
                # Non-numeric ordinal -> treat as "no polyphone index".
                # (Narrowed from a bare ``except:`` which hid real errors.)
                n = 0
            ordinals.append(n)
        ordinals = pd.Series(ordinals, index=sub.index)
        pos_unique = len(set(ordinals[ordinals > 0].tolist()))
        zeros = (ordinals <= 0).sum()
        entries_head_pron += pos_unique + zeros

    stats = {
        "總行數": total_rows,
        "空白頭詞行數": int(blank_rows),
        "有效行數(有字詞名)": int(valid_rows),
        "唯一頭詞數(=head模式詞條數)": int(unique_heads),
        "head+pron模式詞條數": int(entries_head_pron),
        "none模式詞條數(逐行)": int(valid_rows),
    }

    # Headwords merged from the most rows (top 20).
    vc = head[head != ""].value_counts()
    top_merged = vc[vc > 1].head(20)

    return stats, top_merged

def main():
    """CLI entry point: read the xlsx, group rows, write the MDX source txt."""
    parser = argparse.ArgumentParser(description="Build MDX txt from dict_revised_2015_20250627.xlsx")
    parser.add_argument("--group", choices=["head", "head+pron", "none"], default="head", help="分组方式")
    parser.add_argument("--stats", action="store_true", help="打印统计信息")
    parser.add_argument("-i", "--input", default="dict_revised_2015_20250627.xlsx", help="输入 xlsx 路径")
    parser.add_argument("-o", "--output", default="dict_revised_2015_20250627.mdx.txt", help="输出 txt 路径")
    parser.add_argument("-e", "--encoding", default="utf-8", help="输出编码（utf-8/utf-16le 等）")
    args = parser.parse_args()

    in_xlsx = Path(args.input)
    out_txt = Path(args.output)
    if not in_xlsx.exists():
        raise FileNotFoundError(f"未找到 {in_xlsx}，请确认文件位于当前目录或用 --input 指定。")

    # dtype=str keeps leading zeros in code columns intact.
    df = pd.read_excel(in_xlsx, dtype=str, engine="openpyxl").fillna("")
    df.columns = [c.strip() for c in df.columns]  # normalize header whitespace

    grouped = group_rows(df, args.group)
    write_mdx_txt(grouped, out_txt, encoding=args.encoding)
    print(f"已生成：{out_txt}（條目數：{len(grouped)}，模式：{args.group}，編碼：{args.encoding}）")

    if args.stats:
        stats, top_merged = compute_stats(df)
        print("\n=== 數據統計 ===")
        for label, value in stats.items():
            print(f"{label}: {value}")
        if not top_merged.empty:
            print("\n合併最多的頭詞（前 20）：")
            for name, cnt in top_merged.items():
                print(f"{name}: {cnt}")

if __name__ == "__main__":
    main()