爬取四大名著
作者:互联网
"""Scraper for the Shicimingju site (诗词名句网, www.shicimingju.com).

Milestones listed by the original author:

1. Scrape one fixed book.
2. Scrape the book title.
3. Scrape the chapter list of that book.
4. Generalise to the chapter list of any book.
5. Add exception handling.
6. Scrape any complete book.

Running the script asks for a book name, downloads the book's index page and
every chapter page, and writes the title, the chapter list and the chapter
text to ``book.txt`` in the current working directory.
"""
import re
from html import unescape

# Output file and scratch files; all live in the current working directory.
BOOK_FILE = "book.txt"
INDEX_HTML_FILE = "demo.txt"
CHAPTER_HTML_FILE = "bookHtml.txt"

# Seconds to wait for the server before giving up on a page.
REQUEST_TIMEOUT = 10

# Patterns are compiled once here instead of once per scanned line.
_TITLE_RE = re.compile(r'<title>(《.{0,10}?》)')
_CHAPTER_NAME_RE = re.compile(r'第[^<]*')
_HEADING_RE = re.compile(r'<h1>(.+?)</h1>')
# NOTE(review): body paragraphs are assumed to start with indentation, either
# literal whitespace or "&nbsp;" entities (the original extraction keyed on
# "&nbsp;") -- confirm against the live chapter pages.
_PARAGRAPH_RE = re.compile(r'<p>((?:&nbsp;|\s).+?)</p>')
_TAG_RE = re.compile(r'<[^>]*>')


def _savePage(path, page):
    """Write *page* to *path* as UTF-8; return True on success, False on error."""
    try:
        with open(path, 'w', encoding='utf-8') as out:
            out.write(page)
    except OSError:
        print("FILE OPERATION ERROR")
        return False
    return True


def _plainText(fragment):
    """Return *fragment* with tags removed, entities decoded and edges trimmed."""
    text = unescape(_TAG_RE.sub('', fragment))
    # "&nbsp;" decodes to U+00A0; store ordinary spaces in the output file.
    return text.replace('\xa0', ' ').strip()


def _chapterLines(line):
    """Return the heading texts, then the paragraph texts, found in one HTML line."""
    fragments = _HEADING_RE.findall(line) + _PARAGRAPH_RE.findall(line)
    texts = (_plainText(fragment) for fragment in fragments)
    return [text for text in texts if text]


def bookSpider(oldurl, bookName):
    """Download one whole book into ``book.txt``.

    Args:
        oldurl: URL of the book without the ``.html`` suffix; chapter pages
            are fetched from ``oldurl/<n>.html``.
        bookName: Book identifier as it appears in the site's chapter links.

    The index page is saved to ``demo.txt`` and then parsed for the title and
    the chapter list. Nothing is parsed when the index page cannot be
    downloaded or saved.
    """
    page = loadPage(oldurl + ".html")
    if page is None:
        # Without a fresh index page the scratch file would be stale or missing.
        return
    if not _savePage(INDEX_HTML_FILE, page):
        return
    findTitle(INDEX_HTML_FILE, bookName)
    cnt = findTileOfPages(INDEX_HTML_FILE, bookName)
    getWholeBook(oldurl, bookName, cnt)


def findTitle(filename, bookName):
    """Print the book title found in *filename* and start ``book.txt`` with it.

    Args:
        filename: Path of the saved index page (UTF-8 HTML).
        bookName: Unused; kept so existing callers keep working.

    Every line containing ``<title>《...》`` contributes one ``《...》`` line to
    the output. ``book.txt`` is opened in write mode, so earlier content is
    discarded. On a file error a message is printed and the function returns.
    """
    try:
        with open(filename, encoding='utf-8') as page, \
                open(BOOK_FILE, 'w', encoding='utf-8') as book:
            for line in page:
                match = _TITLE_RE.search(line)
                if match:
                    # Use the matched text itself, not the Match object's repr.
                    title = match.group(1)
                    print("书名:" + title)
                    book.write(title + '\n')
    except OSError:
        print("FILE OPERATION ERROR")


def findTileOfPages(filename, bookName):
    """Append the chapter list to ``book.txt`` and return the chapter count.

    Args:
        filename: Path of the saved index page (UTF-8 HTML).
        bookName: Book identifier used in the ``/book/<bookName>/<n>.html``
            links; it is matched literally.

    Returns:
        The number of chapter links found, or 0 when a file cannot be opened.

    A link counts only when its anchor text is 10 to 40 characters long. The
    chapter name written out is the part of the anchor text starting at "第";
    links without "第" are counted but produce an empty line.
    """
    # re.escape keeps regex metacharacters in bookName from changing the match.
    link = re.compile(
        r'<li><a href="/book/' + re.escape(bookName)
        + r'/\d+\.html">(.{10,40})</a></li>'
    )
    cnt = 0
    try:
        with open(filename, encoding='utf-8') as page, \
                open(BOOK_FILE, 'a', encoding='utf-8') as book:
            book.write("目录:\n")
            for line in page:
                for anchorText in link.findall(line):
                    cnt += 1
                    name = ''.join(_CHAPTER_NAME_RE.findall(anchorText))
                    print(name)
                    book.write(name + '\n')
    except OSError:
        print("FILE OPERATION ERROR")
    return cnt


def getWholeBook(url, bookName, cnt):
    """Download chapters 1..cnt and append their text to ``book.txt``.

    Args:
        url: URL of the book without the ``.html`` suffix.
        bookName: Unused; kept so existing callers keep working.
        cnt: Number of chapters to fetch.

    Each chapter page is saved to ``bookHtml.txt`` and read back line by line;
    ``<h1>`` headings and indented ``<p>`` paragraphs are written as plain
    text, one per line. A chapter that cannot be downloaded or saved is
    skipped, so the previous chapter's scratch file is never parsed twice.
    """
    print("正在下载全本书,请稍后...")
    for chapter in range(1, cnt + 1):
        newUrl = url + '/' + str(chapter) + ".html"
        print(newUrl)
        page = loadPage(newUrl)
        if page is None or not _savePage(CHAPTER_HTML_FILE, page):
            continue
        try:
            with open(CHAPTER_HTML_FILE, encoding='utf-8') as source, \
                    open(BOOK_FILE, 'a', encoding='utf-8') as book:
                for line in source:
                    book.writelines(
                        text + '\n' for text in _chapterLines(line)
                    )
        except OSError:
            print("FILE OPERATION ERROR")


def loadPage(url):
    """Fetch *url* and return the body decoded as UTF-8, or None on failure.

    A failure is a network error, a timeout, a non-success HTTP status or a
    body that is not valid UTF-8; "PAGE LOAD ERROR" is printed in that case.
    """
    # Third-party dependency, imported here so the parsing functions above
    # can be used without it being installed.
    import requests

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/73.0.3683.86 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
        # Do not parse an error page as if it were book content.
        response.raise_for_status()
        return response.content.decode('utf-8')
    except (requests.RequestException, UnicodeDecodeError):
        print("PAGE LOAD ERROR")
        return None


if __name__ == "__main__":
    bookName = input("请输入想看的书名:(全拼)")
    url = "http://www.shicimingju.com/book/" + bookName
    bookSpider(url, bookName)
标签:bookName,爬取,re,flag,book,print,line,四大名著 来源: https://www.cnblogs.com/TheSilverMoon/p/11143203.html