古诗文网 (gushiwen.org) Scraper
0x00 Code
# coding: utf-8
import requests
import re


def parse_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    text = response.text
    # re.DOTALL lets "." also match newlines; the "?" in "(.*?)" makes the capture non-greedy
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # capture the text of the second <a> inside <p class="source">
    authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # the capture group (.*?) pulls out the poem body between the tags
    content_tags = re.findall(r'<div class="contson".*?>(.*?)</div>', text, re.DOTALL)
    contents = []
    for content in content_tags:
        # strip any tags left inside the body, e.g. <br />
        x = re.sub(r'<.*?>', "", content)
        contents.append(x.strip())
    # collect the per-poem dictionaries in a list
    poems = []
    # zip lines the titles, dynasties, authors and contents up one to one
    for value in zip(titles, dynasties, authors, contents):
        title, dynasty, author, content = value  # unpack the tuple
        # wrap each poem in a dictionary
        poem = {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content
        }
        poems.append(poem)
    for poem in poems:
        print(poem)
        print('=' * 40)


def main():
    # pages 1 to 10 of the listing
    for x in range(1, 11):
        url = "https://www.gushiwen.org/default_%s.aspx" % x
        parse_page(url)


if __name__ == "__main__":
    main()
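The regex calls above lean on two details: re.DOTALL (so "." also matches newlines, letting one pattern span several lines of HTML) and the non-greedy "(.*?)" (so the capture stops at the first closing tag rather than the last one). A minimal standalone sketch of the difference, using a made-up HTML snippet rather than the real gushiwen.org markup:

import re

# hypothetical snippet, not actual gushiwen.org HTML
html = '<div class="cont">\n<b>静夜思</b></div><div class="cont"><b>春晓</b></div>'

# without re.DOTALL, "." stops at the newline, so the first title is missed
print(re.findall(r'<div class="cont">.*?<b>(.*?)</b>', html))             # ['春晓']

# with re.DOTALL, both titles are captured
print(re.findall(r'<div class="cont">.*?<b>(.*?)</b>', html, re.DOTALL))  # ['静夜思', '春晓']

# a greedy "(.*)" would run on to the last </b> and swallow the markup in between
print(re.findall(r'<b>(.*)</b>', html, re.DOTALL))                        # ['静夜思</b></div><div class="cont"><b>春晓']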
0x01 Result
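Judging from the print loop in parse_page, each poem comes out as one dictionary followed by a 40-character separator line, roughly in this shape (placeholder values, not real scraped data):

{'title': '...', 'dynasty': '...', 'author': '...', 'content': '...'}
========================================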
Source: https://www.cnblogs.com/wangtanzhi/p/12416397.html