
Scraping novels from 80小说 (txt80.com)


import os
import random
import re
import time

import lxml.etree
import requests
import faker

fake = faker.Faker()

# Pre-generate a pool of ten fake User-Agent strings to choose from
uaList = [fake.user_agent() for _ in range(10)]

headers = {
    "User-Agent": random.choice(uaList)  # chosen once, at import time
}
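Since random.choice runs once here, every request ends up reusing the same User-Agent, which defeats the point of building a pool. If per-request rotation is the intent, a helper along these lines would do it (random_headers is my own name, not from the original script):

def random_headers():
    # Return a fresh header dict with a randomly picked User-Agent.
    return {"User-Agent": random.choice(uaList)}

# Usage: requests.get(url, headers=random_headers())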


def request_view(response):
    # Debug helper: save the fetched HTML with a <base> tag injected so that
    # relative links resolve against the original site, then open it in a browser.
    import webbrowser
    base_tag = ('<head><base href="%s">' % response.url).encode()
    content = response.content.replace(b"<head>", base_tag)
    with open('tmp.html', 'wb') as tmp_html:
        tmp_html.write(content)
    webbrowser.open_new_tab('tmp.html')
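request_view is never called in the script itself; it is there for inspecting what the crawler actually received when an XPath comes back empty. A typical (illustrative) call while debugging:

# resp = requests.get("http://www.txt80.com/B.html", headers=headers)
# request_view(resp)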


class Crawl:
    def __init__(self, host, headers):
        self.host = host
        self.headers = headers

    def get_content(self, url):
        # Fetch a URL and return its raw bytes, or None on a non-200 response.
        # Bug fix: headers must be passed by keyword; positionally it becomes params.
        resp = requests.get(url, headers=self.headers)
        if resp.status_code != 200:
            print("crawl url error " + url + " " + str(resp.status_code))
            return None
        return resp.content

    def get_novel_list(self, content):
        # Pull every novel link off an index page and visit its detail page.
        html = lxml.etree.HTML(content)
        items = html.xpath('//div[@class="searchlist_l_box"]/ul//li')
        for li in items:
            for href in li.xpath("./a/@href"):
                self.get_download_url(self.join_url(href))

    def join_url(self, url):
        # Site links are root-relative, so prefix the scheme and host.
        return "http://" + self.host + url

    def get_download_url(self, detail_url):
        # A detail page carries the novel title and a link to its download page.
        content = self.get_content(detail_url)
        if content is None:
            return
        html = lxml.etree.HTML(content)
        title = html.xpath('//dd[@class="bt"]/h2/text()')
        download_url = html.xpath('//div[@class="downlinks"]//a/@href')
        if len(title) == 1 and len(download_url) >= 1:
            self.download_url(self.join_url(download_url[0]), title[0])

    def download_url(self, url, title):
        # Strip characters that are illegal in Windows file names
        # (the original pattern only caught the full-width “, so plain " is added too).
        title = re.sub(r'[?\\*|"“<>:/]', '', title)
        content = self.get_content(url)
        if content is None:
            return
        html = lxml.etree.HTML(content)
        txt_url = html.xpath('//div[@class="downlist"][1]/li/strong/a/@href')
        if len(txt_url) == 1:
            self.download_txt(txt_url[0], title)

    def download_txt(self, url, title):
        content = self.get_content(url)
        if content is None:
            return
        path = "E:\\xiaoshuo"
        if not os.path.exists(path):
            os.makedirs(path)
        file = os.path.join(path, title)
        with open(file, "wb") as f:
            f.write(content)
        print("download success " + title)
        time.sleep(1)  # be polite: pause between downloads

    def start(self):
        # Index pages are alphabetical: /B.html ... /Z.html (the post starts at B).
        list_code = ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
                     'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
        for code in list_code:
            url = "http://" + self.host + "/" + code + ".html"
            content = self.get_content(url)
            if content is not None:
                print("crawl url success: " + url)
                self.get_novel_list(content)

if __name__ == "__main__":
    host = "www.txt80.com"
    crawl = Crawl(host, headers)
    try:
        crawl.start()
    except Exception as e:
        print(str(e))
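get_content gives up after a single failed request, which on a long crawl means one hiccup skips a whole index page. A minimal retry sketch, reusing the requests and time imports above (get_with_retry is a hypothetical helper, not part of the original post):

def get_with_retry(url, headers, retries=3, backoff=2.0):
    # Retry transient errors and non-200 responses with a growing delay.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            if resp.status_code == 200:
                return resp.content
        except requests.RequestException:
            pass
        time.sleep(backoff * (attempt + 1))
    return None

Swapping it in for the requests.get call inside Crawl.get_content would be a one-line change.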

  

Source: https://www.cnblogs.com/brady-wang/p/15611046.html