20 古诗文网站诗文爬取(正则方法)
作者:互联网
"""Scraper for the gushiwen.org classical-poetry site (regex-based)."""

import re
import requests

# re.DOTALL makes "." match newlines as well; the markup for each field is
# spread over several lines of HTML, so the patterns must be able to cross
# line boundaries.
_TITLE_RE = re.compile(r'<div\sclass="cont">.*?<b>(.*?)</b>', re.DOTALL)
_DYNASTY_RE = re.compile(r'<p class="source">.*?<a.*?>(.*?)</a>', re.DOTALL)
_AUTHOR_RE = re.compile(
    r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', re.DOTALL
)
_CONTENT_RE = re.compile(r'<div class="contson" .*?>(.*?)</div>', re.DOTALL)
# Matches a single tag at a time. A greedy '<.*>' would swallow everything
# between the first '<' and the last '>' on a line, including verse text
# sitting between two tags such as '<br />'.
_TAG_RE = re.compile(r'<[^>]*>')


def _extract_poems(text):
    """Pull poem records out of the HTML source of one listing page.

    Args:
        text: HTML of a listing page as a string.

    Returns:
        A list of dicts with the keys 'title', 'dynasty', 'author' and
        'content', in page order.
    """
    titles = _TITLE_RE.findall(text)
    dynasties = _DYNASTY_RE.findall(text)
    authors = _AUTHOR_RE.findall(text)
    contents = [
        _TAG_RE.sub("", fragment).strip()
        for fragment in _CONTENT_RE.findall(text)
    ]

    # NOTE: the four lists are matched up purely by position and zip() stops
    # at the shortest one, so an entry that lacks one of the fields shifts or
    # drops the records that follow it.
    return [
        {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content,
        }
        for title, dynasty, author, content
        in zip(titles, dynasties, authors, contents)
    ]


def parse_page(url, timeout=10):
    """Download one listing page, print every poem found and return them.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of poem dicts extracted from the page.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0',
    }

    # headers must be passed by keyword: the second positional parameter of
    # requests.get() is ``params``, which would put the User-Agent into the
    # query string instead of sending it as a request header.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    poems = _extract_poems(response.text)

    # Print the poem records.
    for poem in poems:
        print(poem)
    return poems


def main():
    """Scrape listing pages 1 through 10 and print the poems on each."""
    url = "https://www.gushiwen.org/default_{}.aspx"
    for page in range(1, 11):
        page_url = url.format(page)
        try:
            parse_page(page_url)
        except requests.RequestException as err:
            # One unreachable page should not abort the remaining pages.
            print("failed to fetch {}: {}".format(page_url, err))


if __name__ == '__main__':
    main()
标签:DOTALL,content,20,text,爬取,re,古诗文,__,print 来源: https://www.cnblogs.com/sruzzg/p/13128526.html