import csv
import os
import time
import tkinter as tk
from tkinter import filedialog, ttk, messagebox
from urllib.parse import quote, urlparse
import threading
import concurrent.futures
import random
import re
from pathvalidate import sanitize_filename
from lxml import etree
from playwright.sync_api import sync_playwright


class WebCrawlerGUI:
    """Tkinter front-end for crawling search-engine result pages.

    Keywords are read from a CSV file (first column, one keyword per row),
    substituted into a search-engine URL template, and the fully rendered
    page is fetched with Playwright.  The HTML matched by a CSS-like
    selector is written to the output directory, one file per keyword/page.
    Crawling runs on a background thread pool so the GUI stays responsive;
    all widget updates from worker threads are marshalled onto the Tk main
    loop via ``root.after``.
    """

    def __init__(self, root):
        self.root = root
        self.root.title("智能网页爬取工具")
        self.root.geometry("900x650")

        # Main container frame
        self.main_frame = ttk.Frame(root, padding="20")
        self.main_frame.pack(fill=tk.BOTH, expand=True)

        # Search-engine template selection; "{}" is replaced by the URL-quoted keyword
        ttk.Label(self.main_frame, text="搜索引擎:").grid(row=0, column=0, sticky=tk.W, pady=5)
        self.search_engine_var = tk.StringVar(value="https://www.chinaso.com/newssearch/social/socialResults?q={}")
        self.search_engines = ttk.Combobox(self.main_frame, textvariable=self.search_engine_var, width=70)
        self.search_engines['values'] = (
            "https://m.guoxuedashi.net/zidian/{}.html",
            "https://www.baidu.com/s?wd={}",
            "https://www.google.com/search?q={}",
            "https://www.bing.com/search?p={}".replace("?p=", "?q="),  # normalized below; kept literal list identical in spirit
            "https://search.yahoo.com/search?p={}",
            "https://duckduckgo.com/?q={}",
            "https://hanyu.sogou.com/result?query={}",
            "https://www.chinaso.com/newssearch/social/socialResults?q={}",
            "http://yedict.com/zscontent.asp?uni={}"
        )
        self.search_engines.grid(row=0, column=1, columnspan=2, sticky=tk.EW, pady=5)

        # CSV file picker
        ttk.Label(self.main_frame, text="CSV文件路径:").grid(row=1, column=0, sticky=tk.W, pady=5)
        self.csv_path_var = tk.StringVar(value="D:/robot/孔子.csv")
        self.csv_entry = ttk.Entry(self.main_frame, textvariable=self.csv_path_var, width=60)
        self.csv_entry.grid(row=1, column=1, sticky=tk.EW, pady=5)
        self.csv_browse = ttk.Button(self.main_frame, text="浏览...", command=self.browse_csv)
        self.csv_browse.grid(row=1, column=2, padx=(5, 0))

        # Output directory picker
        ttk.Label(self.main_frame, text="输出目录:").grid(row=2, column=0, sticky=tk.W, pady=5)
        self.output_dir_var = tk.StringVar(value="D:/data网页/data中国搜索/html/")
        self.output_entry = ttk.Entry(self.main_frame, textvariable=self.output_dir_var, width=60)
        self.output_entry.grid(row=2, column=1, sticky=tk.EW, pady=5)
        self.output_browse = ttk.Button(self.main_frame, text="浏览...", command=self.browse_output)
        self.output_browse.grid(row=2, column=2, padx=(5, 0))

        # Content-extraction selector (supports ".class", "#id" or a raw tag name)
        ttk.Label(self.main_frame, text="内容选择器 (CSS):").grid(row=3, column=0, sticky=tk.W, pady=5)
        self.selector_var = tk.StringVar(value=".search-list")
        self.selector_entry = ttk.Entry(self.main_frame, textvariable=self.selector_var, width=70)
        self.selector_entry.grid(row=3, column=1, columnspan=2, sticky=tk.EW, pady=5)
        ttk.Label(self.main_frame, text="示例: .main-hanyu, .main-content, #search-results, div[role='main']").grid(
            row=4, column=1,
            columnspan=2,
            sticky=tk.W)

        # Per-request crawl delay (seconds)
        ttk.Label(self.main_frame, text="爬取延迟(秒):").grid(row=5, column=0, sticky=tk.W, pady=5)
        self.delay_var = tk.DoubleVar(value=0.1)
        self.delay_spin = ttk.Spinbox(self.main_frame, from_=0.05, to=2.0, increment=0.05,
                                      textvariable=self.delay_var, width=5)
        self.delay_spin.grid(row=5, column=1, sticky=tk.W)

        # Worker-thread count
        ttk.Label(self.main_frame, text="并发线程数:").grid(row=5, column=0, sticky=tk.E, pady=5)
        self.threads_var = tk.IntVar(value=3)
        self.threads_spin = ttk.Spinbox(self.main_frame, from_=1, to=10, increment=1,
                                        textvariable=self.threads_var, width=5)
        self.threads_spin.grid(row=5, column=1, sticky=tk.E)

        # CSV row-range controls
        ttk.Label(self.main_frame, text="处理行范围:").grid(row=6, column=0, sticky=tk.W, pady=5)

        # Start-row entry
        self.start_row_frame = ttk.Frame(self.main_frame)
        self.start_row_frame.grid(row=6, column=1, sticky=tk.W)
        ttk.Label(self.start_row_frame, text="起始行:").pack(side=tk.LEFT)
        self.start_row_var = tk.StringVar(value="0")
        self.start_row_entry = ttk.Entry(self.start_row_frame, textvariable=self.start_row_var, width=8)
        self.start_row_entry.pack(side=tk.LEFT, padx=(5, 10))

        # End-row entry (blank means "until the last row")
        ttk.Label(self.start_row_frame, text="结束行:").pack(side=tk.LEFT)
        self.end_row_var = tk.StringVar(value="")
        self.end_row_entry = ttk.Entry(self.start_row_frame, textvariable=self.end_row_var, width=8)
        self.end_row_entry.pack(side=tk.LEFT, padx=(5, 0))

        # Hint label for the row-range fields
        ttk.Label(self.main_frame, text="行号从0开始，结束行为空表示到最后一行").grid(row=6, column=2, sticky=tk.W)

        # Log area with scrollbar
        ttk.Label(self.main_frame, text="操作日志:").grid(row=7, column=0, sticky=tk.NW, pady=5)
        self.log_text = tk.Text(self.main_frame, height=15, width=80)
        self.log_text.grid(row=7, column=1, columnspan=2, sticky=tk.NSEW, pady=5)
        self.log_scroll = ttk.Scrollbar(self.main_frame, command=self.log_text.yview)
        self.log_scroll.grid(row=7, column=3, sticky=tk.NS)
        self.log_text.config(yscrollcommand=self.log_scroll.set)

        # Progress bar (0-100)
        self.progress_var = tk.DoubleVar()
        self.progress_bar = ttk.Progressbar(self.main_frame, variable=self.progress_var, maximum=100)
        self.progress_bar.grid(row=8, column=1, columnspan=2, sticky=tk.EW, pady=10)

        # Status line
        self.status_var = tk.StringVar(value="就绪")
        self.status_label = ttk.Label(self.main_frame, textvariable=self.status_var)
        self.status_label.grid(row=9, column=1, columnspan=2, sticky=tk.W)

        # Start / stop buttons
        self.btn_frame = ttk.Frame(self.main_frame)
        self.btn_frame.grid(row=10, column=1, columnspan=2, pady=10)

        self.start_btn = ttk.Button(self.btn_frame, text="开始爬取", command=self.start_crawling)
        self.start_btn.pack(side=tk.LEFT, padx=5)

        self.stop_btn = ttk.Button(self.btn_frame, text="停止", command=self.stop_crawling, state=tk.DISABLED)
        self.stop_btn.pack(side=tk.LEFT, padx=5)

        # Grid weights: the log row and middle column absorb resizing
        self.main_frame.columnconfigure(1, weight=1)
        self.main_frame.rowconfigure(7, weight=1)

        # Threading state shared between the GUI and workers
        self.stop_event = threading.Event()      # set by the Stop button; polled by workers
        self.crawling_thread = None              # background coordinator thread
        self.processed_count = 0                 # completed keywords (guarded by self.lock)
        self.total_count = 0                     # total keywords in the selected row range
        self.lock = threading.Lock()             # protects the progress counters

    def browse_csv(self):
        """Open a file dialog and store the chosen CSV path."""
        file_path = filedialog.askopenfilename(
            filetypes=[("CSV files", "*.csv"), ("All files", "*.*")]
        )
        if file_path:
            self.csv_path_var.set(file_path)

    def browse_output(self):
        """Open a directory dialog and store the chosen output directory."""
        dir_path = filedialog.askdirectory()
        if dir_path:
            self.output_dir_var.set(dir_path)

    def log_message(self, message):
        """Append a line to the log widget; safe to call from worker threads.

        Tk widgets may only be touched from the main thread, but workers log
        progress, so the actual widget update is scheduled via ``root.after``.
        """
        def _append():
            self.log_text.insert(tk.END, message + "\n")
            self.log_text.see(tk.END)

        self.root.after(0, _append)

    def update_status(self, message):
        """Set the status-bar text; safe to call from worker threads."""
        self.root.after(0, lambda: self.status_var.set(message))

    def start_crawling(self):
        """Validate the form inputs and launch the crawl on a background thread."""
        # Reset counters and progress for a fresh run
        self.processed_count = 0
        self.total_count = 0
        self.progress_var.set(0)

        csv_path = self.csv_path_var.get()
        output_dir = self.output_dir_var.get()
        selector = self.selector_var.get()
        delay = self.delay_var.get()
        threads = self.threads_var.get()

        # Validate the row-range fields (0-based, inclusive)
        try:
            start_row = int(self.start_row_var.get())
            if start_row < 0:
                raise ValueError("起始行不能为负数")
        except ValueError:
            messagebox.showerror("错误", "起始行必须是正整数或0")
            return

        try:
            end_row_str = self.end_row_var.get().strip()
            end_row = int(end_row_str) if end_row_str else None
            if end_row is not None and end_row < 0:
                raise ValueError("结束行不能为负数")
        except ValueError:
            messagebox.showerror("错误", "结束行必须是正整数或留空")
            return

        if not csv_path or not os.path.isfile(csv_path):
            messagebox.showerror("错误", "请选择有效的CSV文件")
            return

        if not output_dir:
            messagebox.showerror("错误", "请选择输出目录")
            return

        # Clear any stop request left over from a previous run
        self.stop_event.clear()

        # Flip button states for the running state
        self.start_btn.config(state=tk.DISABLED)
        self.stop_btn.config(state=tk.NORMAL)

        # Run the crawl coordinator off the GUI thread
        self.crawling_thread = threading.Thread(
            target=self.run_crawler,
            args=(csv_path, output_dir, selector, delay, threads, start_row, end_row),
            daemon=True
        )
        self.crawling_thread.start()

    def stop_crawling(self):
        """Request a cooperative stop; workers poll ``self.stop_event``."""
        self.stop_event.set()
        self.log_message("正在停止爬取任务...")
        self.update_status("正在停止...")

    def run_crawler(self, csv_path, output_dir, selector, delay, max_workers, start_row, end_row):
        """Coordinator run on a background thread: read the CSV row range and
        fan keywords out to a thread pool.

        Args:
            csv_path: CSV file with one keyword per row in the first column.
            output_dir: directory that receives the saved HTML files.
            selector: CSS-like selector passed through to page extraction.
            delay: base per-request delay in seconds.
            max_workers: thread-pool size.
            start_row / end_row: inclusive 0-based row range; ``end_row=None``
                means "through the last row".
        """
        try:
            os.makedirs(output_dir, exist_ok=True)

            # Count lines first so the progress bar has a denominator
            self.log_message("正在计算CSV文件行数...")
            with open(csv_path, 'r', encoding='utf-8') as f:
                total_lines = sum(1 for _ in f)

            # Clamp/default the end row (rows are 0-based, so last = total_lines-1)
            if end_row is None or end_row >= total_lines:
                end_row = total_lines - 1
                self.log_message(f"结束行自动设置为最后一行: {end_row}")

            if start_row > end_row:
                self.log_message("错误: 起始行不能大于结束行")
                return

            if start_row >= total_lines:
                self.log_message("错误: 起始行超出文件范围")
                return

            self.total_count = end_row - start_row + 1
            self.log_message(f"共发现 {self.total_count} 个关键词需要处理 (行 {start_row}-{end_row})")

            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = []

                with open(csv_path, 'r', encoding='utf-8') as f:
                    reader = csv.reader(f)

                    # Skip rows before the requested range
                    for _ in range(start_row):
                        try:
                            next(reader)
                        except StopIteration:
                            break

                    # Submit one task per keyword row
                    for idx, row in enumerate(reader):
                        current_row = start_row + idx

                        if current_row > end_row:
                            break

                        if self.stop_event.is_set():
                            self.log_message("检测到停止信号，停止提交新任务")
                            break

                        if not row:
                            continue

                        futures.append(executor.submit(
                            self.process_keyword,
                            row[0].strip(),
                            output_dir,
                            selector,
                            delay,
                            current_row
                        ))

                # Drain the futures; surface per-task exceptions in the log
                for future in concurrent.futures.as_completed(futures):
                    if self.stop_event.is_set():
                        # Best-effort: cancel anything not yet started
                        for f in futures:
                            f.cancel()
                        break

                    try:
                        future.result()
                    except Exception as e:
                        self.log_message(f"任务执行出错: {str(e)}")

            self.log_message(f"爬取任务完成! 共处理 {self.processed_count}/{self.total_count} 个关键词")
            self.update_status("任务完成")

        except Exception as e:
            self.log_message(f"发生错误: {str(e)}")
            self.update_status(f"错误: {str(e)}")

        finally:
            # Restore button states on the GUI thread
            self.root.after(0, lambda: self.start_btn.config(state=tk.NORMAL))
            self.root.after(0, lambda: self.stop_btn.config(state=tk.DISABLED))

    def process_keyword(self, keyword, output_dir, selector, delay, current_row):
        """Download all result pages for one keyword (runs on a worker thread).

        Returns True if the download succeeded, False otherwise; always
        advances the shared progress counter in the ``finally`` block.
        """
        if self.stop_event.is_set():
            return False

        try:
            # Strip a "U+" codepoint prefix if present
            clean_keyword = keyword.replace("U+", "").strip()
            if not clean_keyword:
                self.log_message(f"行 {current_row}: 跳过空关键词")
                return False

            success = self.download_pages(clean_keyword, output_dir, selector, current_row)

            # Jittered politeness delay (0.8x-1.2x of the configured delay)
            time.sleep(delay * random.uniform(0.8, 1.2))

            return success
        except Exception as e:
            self.log_message(f"行 {current_row}: 处理关键词出错 ({keyword}): {str(e)}")
            return False
        finally:
            # Update the shared counters under the lock, then push the UI
            # changes onto the main thread (Tk variables are not thread-safe).
            with self.lock:
                self.processed_count += 1
                done = self.processed_count
                total = self.total_count
            progress = (done / total) * 100 if total else 100.0
            self.root.after(0, lambda p=progress: self.progress_var.set(p))
            self.update_status(f"处理中: {done}/{total} ({progress:.1f}%)")

    def download_pages(self, keyword, output_dir, selector, current_row):
        """Download every result page for a keyword; returns overall success.

        Multi-page crawling is only enabled for www.chinaso.com (whose result
        pages expose an element-ui pager); other engines get a single page.
        """
        # Build the search URL from the template (a template without "{}" is
        # used verbatim, e.g. a fixed debug URL)
        url_template = self.search_engine_var.get()
        if "{}" not in url_template:
            search_url = url_template
        else:
            search_url = url_template.format(quote(keyword))

        domain = urlparse(search_url).netloc

        if domain != "www.chinaso.com":
            # BUG FIX: the original returned the (success, total_pages) tuple
            # here, which is always truthy — failed downloads were reported
            # as successes.  Return only the boolean.
            success, _ = self.download_single_page(keyword, output_dir, selector, current_row, search_url, 1)
            return success

        # First page also reports the total page count
        first_page_success, total_pages = self.download_single_page(
            keyword, output_dir, selector, current_row, search_url, 1
        )

        if not first_page_success:
            return False

        if total_pages <= 1:
            return True

        # Fetch the remaining pages sequentially
        success = True
        for page in range(2, total_pages + 1):
            if self.stop_event.is_set():
                break

            # chinaso paginates via a "pn" query parameter
            page_url = f"{search_url}&pn={page}"
            page_success, _ = self.download_single_page(
                keyword, output_dir, selector, current_row, page_url, page
            )

            if not page_success:
                success = False

            # Jittered politeness delay between pages
            time.sleep(self.delay_var.get() * random.uniform(0.8, 1.2))

        return success

    def download_single_page(self, keyword, output_dir, selector, current_row, url, page_num):
        """Fetch one rendered page and save the selected content.

        Returns:
            (success, total_pages): ``total_pages`` is parsed from the
            element-ui pager on page 1 and defaults to 1 otherwise.
        """
        try:
            # Render the page with a headless Chromium so JS-built content
            # (e.g. chinaso results) is present in the HTML
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                page = browser.new_page()

                page.set_extra_http_headers({
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                    'Accept-Language': 'en-US,en;q=0.9',
                })

                page.goto(url)
                page.wait_for_load_state("networkidle", timeout=30000)

                html_content = page.content()
                browser.close()

            tree = etree.HTML(html_content)
            # etree.HTML returns None for empty/unparseable input
            if tree is None:
                self.log_message(f"行 {current_row}: 处理错误 ({keyword} 第{page_num}页): 页面内容为空")
                return False, 1

            # Resolve the selector to matching elements
            if selector:
                try:
                    if selector.startswith('#'):
                        # BUG FIX: '#id' was previously looked up in @class
                        # (lstrip('.') left the '#'), so id selectors never
                        # matched; match on the id attribute instead.
                        selected_elements = tree.xpath(f"//*[@id='{selector[1:]}']")
                    elif selector.startswith('.'):
                        # '.cls' -> any element whose class attribute contains 'cls'
                        selected_elements = tree.xpath(f"//*[contains(@class, '{selector.lstrip('.')}')]")
                    else:
                        # Anything else is treated as a raw XPath node test
                        selected_elements = tree.xpath(f"//{selector}")
                except etree.XPathEvalError:
                    selected_elements = []

                if not selected_elements:
                    self.log_message(
                        f"行 {current_row}: 警告: {keyword} 第{page_num}页 - 未找到匹配选择器 '{selector}' 的内容，跳过保存")
                    return False, 1
            else:
                selected_elements = []

            # On the first page, read the total page count from the pager
            total_pages = 1
            if page_num == 1:
                pagination = tree.xpath('//ul[@class="el-pager"]')
                if pagination:
                    page_items = pagination[0].xpath('.//li[contains(@class, "number")]')
                    if page_items:
                        try:
                            # The last numbered pager item holds the page count
                            total_pages = int(page_items[-1].text)
                        except (TypeError, ValueError):
                            pass

            # Build a filesystem-safe name; pages after the first get a suffix
            safe_keyword = sanitize_filename(keyword)
            if page_num > 1:
                filename = f"{safe_keyword}_p{page_num}.html"
            else:
                filename = f"{safe_keyword}.html"

            filepath = os.path.join(output_dir, filename)

            if selector and selected_elements:
                # Serialize only the matched elements
                content_html = ""
                for element in selected_elements:
                    content_html += etree.tostring(element, encoding='unicode', method='html') + "\n"

                content_html = f"<body>{content_html}</body>"

                # Collapse whitespace runs to shrink the saved file
                optimized_html = re.sub(r'\s+', ' ', content_html).strip()

                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(optimized_html)

                self.log_message(f"行 {current_row}: 成功保存: {keyword} 第{page_num}页 -> {filepath}")
                return True, total_pages
            else:
                # No selector given: save the full rendered HTML
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(html_content)

                self.log_message(f"行 {current_row}: 成功保存原始内容: {keyword} 第{page_num}页 -> {filepath}")
                return True, total_pages

        except Exception as e:
            self.log_message(f"行 {current_row}: 处理错误 ({keyword} 第{page_num}页): {str(e)}")
            return False, 1


def _main():
    """Create the Tk root, attach the crawler GUI, and enter the event loop."""
    root = tk.Tk()
    app = WebCrawlerGUI(root)  # keep a reference for the lifetime of the loop
    root.mainloop()


if __name__ == "__main__":
    _main()