python的pyppeteer的爬虫demo
作者:互联网
目标网站是药物临床试验登记平台
思路:先尝试用 requests 库直接获取该网站的 response,失败——网站返回 202,需要先破解 JS;再尝试用 ChromeDriver 驱动浏览器获取网页数据,也失败——被识别为恶意爬虫;最后改用 pyppeteer 库,成功。
贴代码:
import asyncio
import os
import random
import re

from pyppeteer import launch
from pyppeteer.network_manager import Response
class PyppeteerScript(object):
    """Crawl the drug clinical-trial registry with a pyppeteer-driven Chromium.

    The crawler opens ``base_url``, then repeatedly calls the page's own
    ``gotopage(n)`` JavaScript function for every index from
    ``current_page`` to ``page_limit`` (inclusive) and stores the rendered
    HTML of each page under ``output_dir``.
    """

    # Directory the HTML snapshots are written to; created on demand.
    output_dir = "./html"

    def __init__(self):
        self.base_url = "http://www.chinadrugtrials.org.cn/clinicaltrials.searchlistdetail.dhtml"
        # Index of the first record to crawl.
        self.current_page = 13480
        # Last record index to crawl (inclusive); crawling stops once passed.
        self.page_limit = 13483
        self.config = {
            # Headless switch; False shows the browser window.
            "headless": False,
            # Path to a local Chromium binary. Mirror for downloads:
            # https://npm.taobao.org/mirrors/chromium-browser-snapshots/
            "executablePath": "/Users/xxx/Downloads/chromium/Chromium.app/Contents/MacOS/Chromium",
            # Delay added after each browser operation, to make detection
            # less likely.
            "slowMo": 5,
            # Chromium command-line switches.
            "args": [
                # Hide the "controlled by automated software" info bar.
                "--disable-infobars",
                # Disable the sandbox.
                "--no-sandbox",
                # Proxy. NOTE: switching proxies requires relaunching the
                # browser through launch().
                # "--proxy-server=http://ip:port",
            ],
        }

    async def intercept_response(self, res: "Response"):
        """Print the body text of an intercepted network response.

        Intended as a ``page.on('response', ...)`` handler; it is not
        registered by ``run`` at the moment.
        """
        json_text = await res.text()
        print(json_text)

    @staticmethod
    def _safe_filename(name, fallback="untitled"):
        """Return *name* with characters that are unsafe in file names replaced.

        Path separators, reserved punctuation and control characters become
        ``_``; surrounding spaces and dots are trimmed. *fallback* is returned
        when nothing usable remains (empty or ``None`` input included).
        """
        cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name or "").strip(" .")
        return cleaned or fallback

    def __save_html(self, content, name):
        """Write *content* to ``<output_dir>/<name>.html`` as UTF-8.

        The output directory is created if missing and *name* is sanitised
        first, so an arbitrary page title cannot escape the directory or
        produce an invalid path.
        """
        os.makedirs(self.output_dir, exist_ok=True)
        file_name = "{}.html".format(self._safe_filename(name))
        with open(os.path.join(self.output_dir, file_name), "w", encoding="utf-8") as f:
            f.write(content)

    async def run(self):
        """Entry point: launch the browser, walk the pages and save each one.

        The browser is always closed, even when a step fails part-way, so no
        Chromium process is left behind.
        """
        browser = await launch(**self.config)
        try:
            page = await browser.newPage()
            await page.setViewport({'width': 1920, 'height': 1080})
            await page.setUserAgent(
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
            )
            # Script injected before every document loads; it overrides the
            # navigator properties that give away an automated browser.
            js_text = """
            () =>{
                Object.defineProperties(navigator,{ webdriver:{ get: () => false } });
                window.navigator.chrome = { runtime: {}, };
                Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
                Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5,6], });
            }
            """
            await page.evaluateOnNewDocument(js_text)
            await page.goto(url=self.base_url)
            # Give the page time to finish loading (seconds).
            await asyncio.sleep(5)
            while self.current_page <= self.page_limit:
                page_index = self.current_page
                # Call the site's own paging function.
                turn_page_js = "gotopage({})".format(page_index)
                await page.evaluate(turn_page_js)
                # Random pause so requests are not evenly spaced.
                await asyncio.sleep(random.randint(2, 4))
                self.current_page += 1
                # Grab the rendered page source and store it under its title.
                page_text = await page.content()
                page_title = await page.title()
                # NOTE(review): pages that share a title overwrite each other
                # - confirm titles are unique per record.
                self.__save_html(
                    content=page_text,
                    name=page_title or "page_{}".format(page_index),
                )
            # NOTE: the site issues no XHR requests here, so there is no
            # response carrying JSON to intercept; the handler stays disabled.
            # await page.setRequestInterception(True)
            # page.on('response', self.intercept_response)
        finally:
            await browser.close()
if __name__ == '__main__':
    # asyncio.run() creates a fresh event loop, drives the crawl to completion
    # and closes the loop afterwards. Calling asyncio.get_event_loop() with no
    # running loop is deprecated in recent Python releases.
    asyncio.run(PyppeteerScript().run())
PS:该代码仅供学习交流,请勿用于商业用途
标签:__,await,python,demo,self,pyppeteer,js,text,page 来源: https://blog.csdn.net/weixin_50385593/article/details/117634794