Playwright 绕过反爬虫检测
爬虫检测网站
Antibot: https://bot.sannysoft.com/
Playwright: https://playwright.net.cn/python/
正常浏览器展示如下:

使用 playwright 打开时,展示如下:
playwright cr https://bot.sannysoft.com/

可见默认情况下使用 playwright 时 WebDriver 那一栏无法通过检查。
绕过方式一:Stealth 插件
pip install playwright-stealth
测试代码如下:
import asyncio
from playwright.async_api import async_playwright
from playwright_stealth import Stealth
async def main():
    """Open the bot-detection test page in Chromium launched via playwright-stealth.

    Launches a headed Chromium through a local SOCKS5 proxy, creates a
    context with a fixed locale/timezone/geolocation/viewport/user agent,
    navigates to https://bot.sannysoft.com/, prints the page title and keeps
    the window open for 10 seconds so the result table can be inspected.
    The browser is always closed, even when a step raises.
    """
    async with Stealth().use_async(async_playwright()) as p:  # most common usage
        browser = await p.chromium.launch(
            headless=False,
            proxy={"server": "socks5://127.0.0.1:7890"},
            ignore_default_args=["--enable-automation"],  # drop the automation marker
            args=[
                # Disable the AutomationControlled Blink feature to hide automation traces.
                "--disable-blink-features=AutomationControlled",
                # Ignore TLS certificate errors so self-signed sites can be opened.
                "--ignore-certificate-errors",
                # Disable GPU hardware acceleration.
                "--disable-gpu",
                # Do not use /dev/shm; avoids crashes in containers or low-memory hosts.
                "--disable-dev-shm-usage",
                # Intended to stop WebRTC from leaking the local IP.
                # NOTE(review): confirm this switch is actually recognized by Chromium.
                "--disable-webrtc-ip-handling",
                # Force WebRTC to use only proxied UDP so the real IP is not exposed.
                "--force-webrtc-ip-handling-policy=disable_non_proxied_udp",
            ],
        )
        try:
            context = await browser.new_context(
                locale="zh-CN",
                timezone_id="Asia/Shanghai",
                geolocation={"longitude": 116.4074, "latitude": 39.9042},
                permissions=["geolocation"],
                ignore_https_errors=True,
                java_script_enabled=True,
                accept_downloads=False,
                viewport={"width": 1920, "height": 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            )
            page = await context.new_page()
            await page.goto("https://bot.sannysoft.com/")
            print(await page.title())
            # Continue interacting with the page here.
            await page.wait_for_timeout(10000)  # milliseconds
        finally:
            # Close the browser even if navigation or a later step failed.
            await browser.close()


asyncio.run(main())
绕过方式二:使用当前浏览器
先完全退出浏览器,再以远程调试(debug)模式启动:
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --remote-debugging-port=9222 --user-data-dir=/tmp/test/
编写代码连接:
import asyncio
from playwright.async_api import async_playwright
async def main():
    """Attach to an already-running Chrome over CDP and open the test page.

    Connects to the DevTools endpoint at http://localhost:9222, creates a new
    browser context and page, navigates to https://bot.sannysoft.com/, prints
    the page title and waits 10 seconds. The connection is always closed,
    even when a step raises.
    """
    async with async_playwright() as p:
        # Connect to the browser that was started with --remote-debugging-port=9222.
        browser = await p.chromium.connect_over_cdp("http://localhost:9222")
        try:
            context = await browser.new_context()
            page = await context.new_page()
            await page.goto("https://bot.sannysoft.com/")
            print(await page.title())
            # Continue interacting with the page here.
            await page.wait_for_timeout(10000)  # milliseconds
        finally:
            # NOTE(review): for a CDP-attached browser this is expected to
            # disconnect rather than quit Chrome - confirm in the Playwright docs.
            await browser.close()


asyncio.run(main())
扩展
既能绕过反爬检测,又可以保持网站登录状态
import asyncio
from playwright.async_api import async_playwright
from playwright_stealth import Stealth
import os
storagePath = "/Users/d4m1ts/.playwright/playwright_state.json"  # session storage file path
# Seed an empty JSON object on first run so the file passed as
# storage_state to new_context() already exists.
if not os.path.exists(storagePath):
    # Create the parent directory first; open(..., "w") raises
    # FileNotFoundError when the directory is missing.
    os.makedirs(os.path.dirname(storagePath), exist_ok=True)
    with open(storagePath, "w", encoding="utf-8") as f:
        f.write("{}")
async def main():
    """Open a site in stealth Chromium while reusing and re-saving the login state.

    Launches a headed Chromium via playwright-stealth through a local SOCKS5
    proxy, creates a context pre-loaded from the file at ``storagePath``,
    navigates to https://markdown.gm7.org/, prints the page title and writes
    the context's storage state back to the same file. The state is saved
    only when the preceding steps succeed; the browser is always closed.
    """
    async with Stealth().use_async(async_playwright()) as p:  # most common usage
        browser = await p.chromium.launch(
            headless=False,
            proxy={"server": "socks5://127.0.0.1:7890"},
            ignore_default_args=["--enable-automation"],  # drop the automation marker
            args=[
                # Disable the AutomationControlled Blink feature to hide automation traces.
                "--disable-blink-features=AutomationControlled",
                # Ignore TLS certificate errors so self-signed sites can be opened.
                "--ignore-certificate-errors",
                # Disable GPU hardware acceleration.
                "--disable-gpu",
                # Do not use /dev/shm; avoids crashes in containers or low-memory hosts.
                "--disable-dev-shm-usage",
                # Intended to stop WebRTC from leaking the local IP.
                # NOTE(review): confirm this switch is actually recognized by Chromium.
                "--disable-webrtc-ip-handling",
                # Force WebRTC to use only proxied UDP so the real IP is not exposed.
                "--force-webrtc-ip-handling-policy=disable_non_proxied_udp",
            ],
        )
        try:
            context = await browser.new_context(
                storage_state=storagePath,  # restore the previously saved session
                locale="zh-CN",
                timezone_id="Asia/Shanghai",
                geolocation={"longitude": 116.4074, "latitude": 39.9042},
                permissions=["geolocation"],
                ignore_https_errors=True,
                java_script_enabled=True,
                accept_downloads=False,
                viewport={"width": 1920, "height": 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            )
            context.set_default_timeout(240000)  # default timeout for this context: 240 s
            page = await context.new_page()
            await page.goto("https://markdown.gm7.org/")
            print(await page.title())
            # Persist the session state back to the file.
            await context.storage_state(path=storagePath)
        finally:
            # Close the browser even if navigation or saving the state failed.
            await browser.close()


asyncio.run(main())