用真实浏览器渲染的方式爬取动态网站,被识别为爬虫的概率应该会低一些吧?我试了一下,为什么总是报 Timeout(超时)?
address.txt内容如下:
como
que
de una
niña
bebe
¿O me equivoco?
Python 源代码:
# -*- coding: utf-8 -*-
import os.path
import time
from os import path
from urllib.parse import quote

from playwright.sync_api import sync_playwright
from playwright.sync_api import TimeoutError
# Fetch the spanishdict.com translation page for every query listed in
# address.txt (one query per line), save the rendered HTML next to the
# script as "<query>.html", and keep a full-page screenshot of the most
# recently fetched page. Queries whose HTML file already exists are skipped,
# so the script can be re-run to retry the ones that timed out.
with sync_playwright() as p:
    browser = p.chromium.launch()
    # Emulate a HiDPI screen; device_scale_factor is the zoom ratio.
    # 1 is enough for plain scraping, 2 gives sharper images when the
    # screenshots are used to build a picture dictionary.
    context = browser.new_context(device_scale_factor=2)
    page = context.new_page()
    # Use a 50 s navigation timeout (the default is 30 s). The value never
    # changes, so it is set once instead of on every iteration.
    page.set_default_navigation_timeout(50000)
    # The word list contains non-ASCII text, so decode it explicitly rather
    # than with the locale default. Assumes address.txt is saved as UTF-8;
    # "utf-8-sig" also tolerates a leading BOM.
    with open("address.txt", encoding="utf-8-sig") as words:
        for i, line in enumerate(words):
            # Strip BEFORE deriving the filename: the raw line still carries
            # its trailing newline, which would otherwise end up inside the
            # filename and defeat the "already downloaded" check.
            line = line.strip()
            if not line:
                # Blank line: nothing to look up.
                continue
            filename = line + ".html"  # output file name
            # NOTE(review): queries containing characters that are illegal in
            # file names on some platforms (e.g. '?' on Windows) will still
            # fail at open() below - decide on a sanitising scheme if needed.
            # Skip queries that were already saved by an earlier run.
            if path.exists(filename):
                continue
            try:
                # Percent-encode the query so spaces, '?', '/', and non-ASCII
                # characters stay part of the URL path.
                page.goto('https://www.spanishdict.com/translate/' + quote(line, safe=''))
            except TimeoutError:
                # Report the timeout and move on; the query is retried on the
                # next run because no file was written for it.
                print('current: ', i, line, '[timeout]')
                continue
            # Wait 2 seconds so dynamically rendered content has time to load.
            time.sleep(2)
            # Read the rendered page markup.
            content = page.content()
            # Progress report: index, query, and size of the response.
            print('current: ', i, line, len(content))
            # Save the page; write UTF-8 explicitly so non-ASCII markup does
            # not fail on platforms whose default encoding is not UTF-8.
            with open(filename, "w", encoding="utf-8") as f:
                f.write(content)
            # Save a screenshot for a quick visual check (overwritten each time).
            page.screenshot(path="screenshot.png", full_page=True)
            # Optionally screenshot a single element by selector; useful for
            # building a picture dictionary when the page text is obfuscated.
            # elem = page.query_selector(".mtb")
            # elem.screenshot(path="mtb.png")
    browser.close()