#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
将《dict_revised_2015_20250627.xlsx》转换为 Goldendict/MDX 的源 txt。

支持命令行：
  --group {head,head+pron,none}
      head       : 按字詞名合并（默认）
      head+pron  : 仅按多音序号(>0)合并；多音序号为 0/空 的行各自独立
      none       : 不合并，逐行一个词条
  --stats        : 打印统计信息
  --input/-i     : 输入 xlsx 文件（默认：dict_revised_2015_20250627.xlsx）
  --output/-o    : 输出 txt 文件（默认：dict_revised_2015_20250627.mdx.txt）
  --encoding/-e  : 输出编码（默认：utf-8；可选 utf-16le 等）

HTML 结构使用 class：edugycd,index,radical,bopomo,hanyu,alias,code,article,pron,accent,mean,syn,ant,polyref,variant,imgchar
"""

import argparse
import re
from pathlib import Path
import pandas as pd

# Mapping: polyphone ordinal (多音排序) -> Chinese numeral used in "(一)(二)…" labels.
CN_NUM = {1: "一", 2: "二", 3: "三", 4: "四", 5: "五", 6: "六"}

# Variant-type code -> display label (變體類型: 變/又音/語音/讀音).
VARIANT_TYPE = {
    "1": "變",
    "2": "又音",
    "3": "語音",
    "4": "讀音",
    # Also accept the Chinese labels themselves as keys (identity mapping).
    "變": "變",
    "又音": "又音",
    "語音": "語音",
    "讀音": "讀音",
}

# Curated alt-text for specific image placeholders used by the dictionary.
IMG_ALT_MAP = {
    "9868._104_0.gif": "未收錄篆字",
    "9b46._104_0.gif": "e^x",
    "9a73._104_0.gif": "x^2",
    "975d._104_0.gif": "a^π",
}

# Generic placeholder patterns: bare 4-hex-digit ".gif" names, and
# entity-style "&xxxx_.png;" references.
RE_IMG_GENERIC_GIF = re.compile(r"\b([0-9a-fA-F]{4}\._104_0\.gif)\b")
RE_IMG_ENTITY_PNG = re.compile(r"&([0-9a-fA-F]{4})_\.png;")

def replace_inline_images(text: str) -> str:
    """Replace inline image placeholders in *text* with <img> tags.

    Handles two placeholder forms:
      * ``XXXX._104_0.gif`` file names (4 hex digits) -> <img> whose alt text
        comes from IMG_ALT_MAP when available, else the hex prefix.
      * ``&XXXX_.png;`` entity-style references -> <img src="xxxx_.png">.

    Returns "" for falsy input.
    """
    if not text:
        return ""

    # Single pass over gif names. (A previous version first substituted the
    # IMG_ALT_MAP names explicitly and THEN ran the generic regex, which
    # re-matched the same file name inside the freshly inserted src="..."
    # attribute and produced nested, broken <img> tags. The generic
    # replacement already consults IMG_ALT_MAP, so one pass is sufficient
    # and safe.)
    def repl_generic(m):
        fname = m.group(1)
        alt = IMG_ALT_MAP.get(fname, fname.split("._")[0])
        return f'<img src="{fname}" class="imgchar" alt="{alt}" />'
    text = RE_IMG_GENERIC_GIF.sub(repl_generic, text)

    # &3ae4_.png; -> <img src="3ae4_.png">
    def repl_entity_png(m):
        hex4 = m.group(1).lower()
        fname = f"{hex4}_.png"
        return f'<img src="{fname}" class="imgchar" alt="{hex4}" />'
    text = RE_IMG_ENTITY_PNG.sub(repl_entity_png, text)
    return text

def clean_text(s):
    """Normalize a raw cell value into HTML-ready text.

    Converts to str, unifies line endings, strips one surrounding pair of
    quotes ("..." or 「...」), substitutes inline image placeholders, and turns
    newlines into <br /> tags. None becomes "".
    """
    if s is None:
        return ""
    text = str(s).replace("\r\n", "\n").replace("\r", "\n").strip()
    # Drop exactly one pair of enclosing quotes, if present.
    quoted = len(text) >= 2 and (
        (text[0] == '"' and text[-1] == '"')
        or (text[0] == "「" and text[-1] == "」")
    )
    if quoted:
        text = text[1:-1].strip()
    text = replace_inline_images(text)
    return text.replace("\n", "<br />")

def join_nonempty(parts, sep="　"):
    """Join the truthy items of *parts* with *sep* (default: fullwidth space)."""
    kept = (p for p in parts if p)
    return sep.join(kept)

def build_radical_line(row):
    """Build the radical/stroke-count display line for a single-character entry.

    Reads 部首字 (radical), 總筆畫數 (total stroke count) and 部首外筆畫數
    (strokes outside the radical) from *row*. Returns "" when all three are
    empty; otherwise a space-joined line of the non-empty pieces.
    """
    radical = str(row.get("部首字", "") or "").strip()
    total = str(row.get("總筆畫數", "") or "").strip()
    out = str(row.get("部首外筆畫數", "") or "").strip()
    if not radical and not total and not out:
        return ""
    pieces = []
    if radical:
        pieces.append(f"部首：{radical}")
    if out:
        # Conventional rendering is "部外 N 畫" (strokes beyond the radical);
        # the previous "部- N 畫" appears to be a garbled form of 部外.
        pieces.append(f"部外 {out} 畫")
    if total:
        pieces.append(f"共 {total} 畫")
    return " ".join(pieces)

def safe_int(x, default=0):
    """Parse *x* as an int, returning *default* for empty/None/unparseable values."""
    try:
        if x is None or x == "":
            return default
        return int(str(x).strip())
    except (ValueError, TypeError):
        # Narrowed from a bare ``except:`` which also swallowed
        # KeyboardInterrupt/SystemExit and hid genuine bugs.
        return default

def make_bopomo_and_pinyin_block(rows):
    """Build the combined zhuyin / pinyin display lines for one entry.

    Primary readings (注音一式 / 漢語拼音) and variant readings (變體注音 /
    變體漢語拼音) are merged into one line each, de-duplicated in first-seen
    order. "(一)(二)…" prefixes are added to primary readings only when the
    entry carries two or more distinct polyphone ordinals (> 0); variant
    readings are never prefixed. Returns (bopomo_html, hanyu_html); either
    may be "".
    """
    def order(r):
        n = safe_int(r.get("多音排序"), 0)
        return (0 if n > 0 else 1, n)
    ordered = sorted(rows, key=order)

    # Enumerate only when at least two distinct positive ordinals exist.
    distinct_ordinals = {
        safe_int(r.get("多音排序"), 0)
        for r in ordered
        if safe_int(r.get("多音排序"), 0) > 0
    }
    need_enum = len(distinct_ordinals) >= 2

    zhuyin, pinyin = [], []
    seen_z, seen_p = set(), set()

    def add(value, bucket, seen, prefix=""):
        # Append once per distinct raw value, keeping first-seen order.
        if value and value not in seen:
            bucket.append(prefix + value)
            seen.add(value)

    for r in ordered:
        n = safe_int(r.get("多音排序"), 0)
        prefix = f"({CN_NUM.get(n, str(n))}) " if (need_enum and n > 0) else ""
        add(str(r.get("注音一式", "") or "").strip(), zhuyin, seen_z, prefix)
        add(str(r.get("漢語拼音", "") or "").strip(), pinyin, seen_p, prefix)
        # Variant readings are supplementary and never enumerated.
        add(str(r.get("變體注音", "") or "").strip(), zhuyin, seen_z)
        add(str(r.get("變體漢語拼音", "") or "").strip(), pinyin, seen_p)

    bopomo_line = f'<span class="bopomo">注音一式：{join_nonempty(zhuyin)}</span>' if zhuyin else ""
    hanyu_line = f'<span class="hanyu">漢語拼音：{join_nonempty(pinyin)}</span>' if pinyin else ""
    return bopomo_line, hanyu_line

def build_defs_block(rows):
    """Render the 釋義 (definitions) section for one entry.

    When the entry has two or more distinct polyphone ordinals (> 0), each
    definition block is headed by "(一)(二)…" plus its zhuyin. Otherwise the
    zhuyin header appears only when the entry spans several rows. A
    variant-type label (變/又音/語音/讀音) is shown whenever present.
    Returns "" when there is nothing to render.
    """
    def order(r):
        n = safe_int(r.get("多音排序"), 0)
        return (0 if n > 0 else 1, n)
    ordered = sorted(rows, key=order)

    distinct = {
        safe_int(r.get("多音排序"), 0)
        for r in ordered
        if safe_int(r.get("多音排序"), 0) > 0
    }
    need_enum = len(distinct) >= 2
    multiple = len(ordered) > 1

    chunks = []
    for r in ordered:
        meaning = clean_text(r.get("釋義", ""))
        zhuyin = str(r.get("注音一式", "") or "").strip()
        n = safe_int(r.get("多音排序"), 0)
        raw_type = str(r.get("變體類型 1:變 2:又音 3:語音 4:讀音", "") or "").strip()
        label = VARIANT_TYPE.get(raw_type, raw_type)

        header = []
        if need_enum and n > 0:
            num = CN_NUM.get(n, str(n))
            if zhuyin:
                header.append(f'<span class="pron">({num}) {zhuyin}</span>')
            else:
                header.append(f'<span class="pron">({num})</span>')
        elif multiple and zhuyin:
            header.append(f'<span class="pron">{zhuyin}</span>')
        if label:
            header.append(f'<span class="accent">{label}</span>')

        if header:
            chunks.append(" ".join(header) + "<br />")
        if meaning:
            chunks.append(f'<span class="mean">{meaning}</span>')

    if not chunks:
        return ""
    return '<span class="article">釋義：<br />' + "<br />".join(chunks) + "</span>"

def build_entry_html(rows):
    """Generate the HTML body for one entry (wrapped in the edugycd container).

    *rows* are all source rows merged into this entry; row 0 supplies the
    headword and (for single characters) the radical/stroke line.
    Multi-valued fields (aliases, codes, synonyms, antonyms, polyphone
    cross-references, variant characters) are de-duplicated while preserving
    first-occurrence order so the generated file is stable across runs.
    """
    def _uniq(field):
        # Ordered de-duplication. The previous list({...}) relied on set
        # iteration order, which varies with string hash randomization and
        # made the generated dictionary non-deterministic between runs.
        values = (str(r.get(field, "") or "").strip() for r in rows)
        return list(dict.fromkeys(v for v in values if v))

    r0 = rows[0]
    head = str(r0.get("字詞名", "") or "").strip()
    head_for_display = replace_inline_images(head)

    lines = [f'<span class="index">{head_for_display}</span>']

    # Radical/stroke info only applies to single-character entries (字數 == 1).
    if safe_int(r0.get("字數"), 0) == 1:
        radical_line = build_radical_line(r0)
        if radical_line:
            lines.append(f'<span class="radical">{radical_line}</span>')

    # Merged zhuyin / pinyin lines.
    bopomo_line, hanyu_line = make_bopomo_and_pinyin_block(rows)
    if bopomo_line:
        lines.append(bopomo_line)
    if hanyu_line:
        lines.append(hanyu_line)

    # Aliases / entry codes (merged, de-duplicated).
    alias_set = _uniq("辭條別名")
    code_set = _uniq("字詞號")
    if alias_set:
        lines.append(f'<span class="alias">辭條別名：{"；".join(alias_set)}</span>')
    if code_set:
        lines.append(f'<span class="code">編碼：{"；".join(code_set)}</span>')

    # Definitions.
    defs_block = build_defs_block(rows)
    if defs_block:
        lines.append(defs_block)

    # Synonyms / antonyms.
    sim = _uniq("相似詞")
    ant = _uniq("相反詞")
    if sim:
        lines.append(f'<span class="syn">相似詞：{"；".join(sim)}</span>')
    if ant:
        lines.append(f'<span class="ant">相反詞：{"；".join(ant)}</span>')

    # Polyphone cross-references / variant characters.
    polyref = _uniq("多音參見訊息")
    if polyref:
        # Cross-references may contain newlines and image placeholders; clean each.
        lines.append(f'<span class="polyref">多音參見：{"；".join(clean_text(x) for x in polyref)}</span>')

    variants = _uniq("異體字")
    if variants:
        lines.append(f'<span class="variant">異體字：{"；".join(variants)}</span>')

    return '<div class="edugycd">' + "<br />".join(lines) + "</div>"

def group_rows(df: pd.DataFrame, mode: str):
    """Group dataframe rows into entries, preserving source order.

    Returns a list of (headword, [row, ...]) pairs.
      * "head" (default): merge every row sharing the same 字詞名.
      * "head+pron"     : merge only rows with the same positive 多音排序;
                          rows whose ordinal is 0/empty stay one-per-entry.
      * "none"          : one entry per row.
    Rows with an empty headword are skipped in every mode.
    """
    if mode not in ("none", "head+pron"):
        # "head" (and any unrecognized mode): merge purely by headword.
        by_head = {}
        head_order = []
        for _, row in df.iterrows():
            head = str(row.get("字詞名", "") or "").strip()
            if not head:
                continue
            if head not in by_head:
                by_head[head] = []
                head_order.append(head)
            by_head[head].append(row)
        return [(h, by_head[h]) for h in head_order]

    buckets = {}
    key_order = []
    for idx, row in df.iterrows():
        head = str(row.get("字詞名", "") or "").strip()
        if not head:
            continue
        if mode == "head+pron":
            n = safe_int(row.get("多音排序"), 0)
            # Positive ordinals merge; everything else stays per-row.
            key = (head, f"N{n}") if n > 0 else (head, f"R{idx}")
        else:  # "none": every row is its own entry
            key = (head, f"R{idx}")
        if key not in buckets:
            buckets[key] = []
            key_order.append(key)
        buckets[key].append(row)
    return [(k[0], buckets[k]) for k in key_order]

def write_mdx_txt(grouped, out_path: Path, encoding="utf-8"):
    """Write the MDX source txt. Each entry is emitted as::

        </>
        headword (plain text)
        `1`<div class="edugycd">...</div>
    """
    with out_path.open("w", encoding=encoding, newline="\n") as fh:
        for head_plain, rows in grouped:
            entry_html = build_entry_html(rows)
            fh.write(f"</>\n{head_plain}\n`1`{entry_html}\n")

def compute_stats(df: pd.DataFrame):
    """Compute summary statistics for the source table.

    Returns (stats, top_merged): *stats* maps Chinese label -> count, and
    *top_merged* is a Series of the (up to 20) headwords merged from the most
    rows (count > 1).
    """
    head = df["字詞名"].fillna("").astype(str).str.strip()
    total_rows = len(df)
    blank_rows = (head == "").sum()
    valid_rows = total_rows - blank_rows
    unique_heads = head[head != ""].nunique()

    # Entry count under "head+pron": per headword, one entry per distinct
    # positive 多音排序 plus one per row whose ordinal is 0/empty/non-numeric.
    entries_head_pron = 0
    for _, sub in df[head != ""].groupby(head):
        ordinals = []
        for raw in sub["多音排序"].fillna("").astype(str):
            raw = raw.strip()
            try:
                n = int(raw) if raw else 0
            except ValueError:
                # Non-numeric ordinal -> treat as "no polyphone index".
                # (Narrowed from a bare ``except:`` which hid real errors.)
                n = 0
            ordinals.append(n)
        ordinals = pd.Series(ordinals, index=sub.index)
        pos_unique = len(set(ordinals[ordinals > 0].tolist()))
        zeros = (ordinals <= 0).sum()
        entries_head_pron += pos_unique + zeros

    stats = {
        "總行數": total_rows,
        "空白頭詞行數": int(blank_rows),
        "有效行數(有字詞名)": int(valid_rows),
        "唯一頭詞數(=head模式詞條數)": int(unique_heads),
        "head+pron模式詞條數": int(entries_head_pron),
        "none模式詞條數(逐行)": int(valid_rows),
    }

    # Headwords merged from the most rows (top 20).
    vc = head[head != ""].value_counts()
    top_merged = vc[vc > 1].head(20)

    return stats, top_merged

def main():
    """CLI entry point: read the xlsx, group rows, write the MDX source txt."""
    parser = argparse.ArgumentParser(description="Build MDX txt from dict_revised_2015_20250627.xlsx")
    parser.add_argument("--group", choices=["head", "head+pron", "none"], default="head", help="分组方式")
    parser.add_argument("--stats", action="store_true", help="打印统计信息")
    parser.add_argument("-i", "--input", default="dict_revised_2015_20250627.xlsx", help="输入 xlsx 路径")
    parser.add_argument("-o", "--output", default="dict_revised_2015_20250627.mdx.txt", help="输出 txt 路径")
    parser.add_argument("-e", "--encoding", default="utf-8", help="输出编码（utf-8/utf-16le 等）")
    args = parser.parse_args()

    in_xlsx = Path(args.input)
    out_txt = Path(args.output)
    if not in_xlsx.exists():
        raise FileNotFoundError(f"未找到 {in_xlsx}，请确认文件位于当前目录或用 --input 指定。")

    # dtype=str keeps leading zeros in code columns intact.
    df = pd.read_excel(in_xlsx, dtype=str, engine="openpyxl").fillna("")
    df.columns = [c.strip() for c in df.columns]  # normalize header whitespace

    grouped = group_rows(df, args.group)
    write_mdx_txt(grouped, out_txt, encoding=args.encoding)
    print(f"已生成：{out_txt}（條目數：{len(grouped)}，模式：{args.group}，編碼：{args.encoding}）")

    if args.stats:
        stats, top_merged = compute_stats(df)
        print("\n=== 數據統計 ===")
        for label, value in stats.items():
            print(f"{label}: {value}")
        if not top_merged.empty:
            print("\n合併最多的頭詞（前 20）：")
            for name, cnt in top_merged.items():
                print(f"{name}: {cnt}")

if __name__ == "__main__":
    main()