Python——使用 bs4 爬取《三国演义》小说
作者:互联网
目录
一、源码
import requests
from bs4 import BeautifulSoup
# Scrape every chapter linked from the index page and append
# "<title>:<content>" for each one to ./sanguo.txt.

# Send a browser-like User-Agent header with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
}
url = 'https://sanguo.5000yan.com/'
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

index_response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
# Fail loudly on 4xx/5xx instead of parsing an error page as the index.
index_response.raise_for_status()
page_text = index_response.content

# 1. Build a BeautifulSoup object from the raw page bytes.
soup = BeautifulSoup(page_text, 'lxml')

# 2. Locate the chapter list entries; each <li> is expected to hold one <a>
#    carrying the chapter title and the detail-page URL.
li_list = soup.select('.sidamingzhu-list-mulu>ul>li')

# The context manager guarantees the file is flushed and closed even if a
# request or parse step raises part-way through the loop.
with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
    for li in li_list:
        link = li.a
        # Skip list items that have no usable link rather than crashing.
        if link is None or not link.get('href'):
            continue

        # Chapter title: .string is None when the <a> has nested tags,
        # so fall back to the concatenated text of the anchor.
        title = link.string or link.get_text()
        # Chapter URL.
        detail_url = link['href']

        # Request the detail page and parse the chapter body.
        detail_response = requests.get(
            url=detail_url, headers=headers, timeout=REQUEST_TIMEOUT
        )
        detail_response.raise_for_status()
        detail_page_text = detail_response.content
        detail_soup = BeautifulSoup(detail_page_text, 'lxml')
        div_tag = detail_soup.find('div', class_='grap')
        # find() returns None when the content container is absent.
        if div_tag is None:
            print(title, '未找到章节内容,已跳过')
            continue

        # Extracted chapter text.
        content = div_tag.text
        fp.write(title + ':' + content)
        print(title, '爬取成功!!!')
二、示例图片
三、bs4数据解析过程:
01、章节标题
解析数据的代码:
# Select every chapter entry (<li>) from the table of contents.
li_list = soup.select('.sidamingzhu-list-mulu>ul>li')
# Walk the entries one by one, reading the title and URL from each link.
for li in li_list:
    link = li.a
    # Chapter title (text of the anchor).
    title = link.string
    # Chapter detail-page URL (the anchor's href attribute).
    detail_url = link['href']
02、章节内容
解析数据的代码:
# Fetch the chapter detail page and keep its raw bytes.
detail_response = requests.get(url=detail_url, headers=headers)
detail_page_text = detail_response.content
# Build a BeautifulSoup object for the detail page.
detail_soup = BeautifulSoup(detail_page_text, 'lxml')
# Locate the <div class="grap"> element that holds the chapter body.
div_tag = detail_soup.find('div', class_='grap')
# Extract the chapter text and write "<title>:<content>" to the output file.
content = div_tag.get_text()
fp.write(title + ':' + content)
标签:章节,Python,text,detail,li,爬取,bs4,url,解析 来源: https://blog.csdn.net/m0_65592409/article/details/123101573