
Scraping novels from 80小说 (txt80.com)


import os
import random
import re
import time

import lxml.etree
import requests
import faker

fake = faker.Faker()

# Pre-generate a pool of ten fake User-Agent strings to choose from
uaList = [fake.user_agent() for _ in range(10)]

headers = {
    "User-Agent": random.choice(uaList)  # chosen once, at import time
}
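Since random.choice runs once here, every request ends up reusing the same User-Agent, which defeats the point of building a pool. If per-request rotation is the intent, a helper along these lines would do it (random_headers is my own name, not from the original script):

def random_headers():
    # Return a fresh header dict with a randomly picked User-Agent.
    return {"User-Agent": random.choice(uaList)}

# Usage: requests.get(url, headers=random_headers())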


def request_view(response):
    # Debug helper: save the fetched HTML with a <base> tag injected so that
    # relative links resolve against the original site, then open it in a browser.
    import webbrowser
    base_tag = ('<head><base href="%s">' % response.url).encode()
    content = response.content.replace(b"<head>", base_tag)
    with open('tmp.html', 'wb') as tmp_html:
        tmp_html.write(content)
    webbrowser.open_new_tab('tmp.html')
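request_view is never called in the script itself; it is there for inspecting what the crawler actually received when an XPath comes back empty. A typical (illustrative) call while debugging:

# resp = requests.get("http://www.txt80.com/B.html", headers=headers)
# request_view(resp)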


class Crawl:
    def __init__(self, host, headers):
        self.host = host
        self.headers = headers

    def get_content(self, url):
        # Fetch a URL and return its raw bytes, or None on a non-200 response.
        # Bug fix: headers must be passed by keyword; positionally it becomes params.
        resp = requests.get(url, headers=self.headers)
        if resp.status_code != 200:
            print("crawl url error " + url + " " + str(resp.status_code))
            return None
        return resp.content

    def get_novel_list(self, content):
        # Pull every novel link off an index page and visit its detail page.
        html = lxml.etree.HTML(content)
        items = html.xpath('//div[@class="searchlist_l_box"]/ul//li')
        for li in items:
            for href in li.xpath("./a/@href"):
                self.get_download_url(self.join_url(href))

    def join_url(self, url):
        # Site links are root-relative, so prefix the scheme and host.
        return "http://" + self.host + url

    def get_download_url(self, detail_url):
        # A detail page carries the novel title and a link to its download page.
        content = self.get_content(detail_url)
        if content is None:
            return
        html = lxml.etree.HTML(content)
        title = html.xpath('//dd[@class="bt"]/h2/text()')
        download_url = html.xpath('//div[@class="downlinks"]//a/@href')
        if len(title) == 1 and len(download_url) >= 1:
            self.download_url(self.join_url(download_url[0]), title[0])

    def download_url(self, url, title):
        # Strip characters that are illegal in Windows file names
        # (the original pattern only caught the full-width “, so plain " is added too).
        title = re.sub(r'[?\\*|"“<>:/]', '', title)
        content = self.get_content(url)
        if content is None:
            return
        html = lxml.etree.HTML(content)
        txt_url = html.xpath('//div[@class="downlist"][1]/li/strong/a/@href')
        if len(txt_url) == 1:
            self.download_txt(txt_url[0], title)

    def download_txt(self, url, title):
        content = self.get_content(url)
        if content is None:
            return
        path = "E:\\xiaoshuo"
        if not os.path.exists(path):
            os.makedirs(path)
        file = os.path.join(path, title)
        with open(file, "wb") as f:
            f.write(content)
        print("download success " + title)
        time.sleep(1)  # be polite: pause between downloads

    def start(self):
        # Index pages are alphabetical: /B.html ... /Z.html (the post starts at B).
        list_code = ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
                     'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
        for code in list_code:
            url = "http://" + self.host + "/" + code + ".html"
            content = self.get_content(url)
            if content is not None:
                print("crawl url success: " + url)
                self.get_novel_list(content)

if __name__ == "__main__":
    host = "www.txt80.com"
    crawl = Crawl(host, headers)
    try:
        crawl.start()
    except Exception as e:
        print(str(e))
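get_content gives up after a single failed request, which on a long crawl means one hiccup skips a whole index page. A minimal retry sketch, reusing the requests and time imports above (get_with_retry is a hypothetical helper, not part of the original post):

def get_with_retry(url, headers, retries=3, backoff=2.0):
    # Retry transient errors and non-200 responses with a growing delay.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            if resp.status_code == 200:
                return resp.content
        except requests.RequestException:
            pass
        time.sleep(backoff * (attempt + 1))
    return None

Swapping it in for the requests.get call inside Crawl.get_content would be a one-line change.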

  

Source: https://www.cnblogs.com/brady-wang/p/15611046.html