首页 > 其他分享 > 根据名称搜索小说并下载到本地【全书小说网】
根据名称搜索小说并下载到本地【全书小说网】

2021-10-21 15:34:24 作者：互联网
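"""
Fetch the full text of any book on the Quanshu novel site. Each chapter is saved as its own
.txt file (if a book has too many chapters, fetching only the first 5 is acceptable), and all
of these files are placed in a folder named after the book.
"""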
import os
import re
from urllib.parse import quote, urljoin

import requests


class QuanShu:
    """Search quanshuxs.com for a novel by name and download its chapters.

    Every chapter is written to its own ``.txt`` file inside a directory named
    after the book (relative to the current working directory).

    Attributes:
        host: Base URL of the site; search-result links are resolved against it.
        url: Search URL built from the name passed to the constructor.
        title: Book title used as the output directory name. Empty until
            ``get_works_html`` has parsed a book page.
    """

    # Seconds to wait on each HTTP request before giving up.
    REQUEST_TIMEOUT = 15

    def __init__(self, name=''):
        """Build the search URL for ``name``.

        Args:
            name: Novel title (or part of it) to search for.

        Raises:
            UnicodeEncodeError: If ``name`` cannot be encoded as GB2312 or GBK.
        """
        # The keyword is sent as percent-encoded GB2312 bytes. Fall back to GBK
        # for characters GB2312 lacks instead of failing on them.
        try:
            raw_name = name.encode('gb2312')
        except UnicodeEncodeError:
            raw_name = name.encode('gbk')
        keyword = quote(raw_name)
        self.host = 'http://www.quanshuxs.com/'
        self.url = f'http://www.quanshuxs.com/search.asp?key={keyword}&x=0&y=0'
        self.title = ''

    def _fetch_html(self, url):
        """Download ``url`` and return its text, decoded with the detected charset.

        Raises:
            requests.RequestException: On network errors, timeouts or an HTTP
                error status.
        """
        resp = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
        return resp.text

    @staticmethod
    def _first_match(pattern, text, what):
        """Return the first ``re.findall`` hit of ``pattern`` in ``text``.

        Raises:
            IndexError: If nothing matches; ``what`` names the missing piece.
        """
        matches = re.findall(pattern, text, re.S)
        if not matches:
            raise IndexError(f'could not find {what} in the downloaded page')
        return matches[0]

    @staticmethod
    def _safe_filename(name):
        """Return ``name`` with characters that are illegal in file names replaced."""
        cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', name).strip()
        return cleaned or 'untitled'

    def get_search_html(self):
        """Run the search, list the hits, and download the one the user picks.

        Prints one numbered line per search result, prompts on stdin for the
        number to download and hands the chosen book URL to ``get_works_html``.
        Returns without prompting when the search produced no usable result.
        """
        html = self._fetch_html(self.url)
        tables = re.findall(
            r'<table cellspacing="0" cellpadding="0" width="962" border="0" align="center" class="m9">(.*?)</table>',
            html, re.S)
        url_list = []
        for table in tables:
            # [^"]* keeps the capture inside a single href attribute; a greedy
            # ".*" with re.S could run across several anchors.
            works_url = re.findall(r'<a href="([^"]*)" target="_blank">', table)
            if not works_url:
                continue
            works_info = re.findall(r'<a href=".*?">(.*?)</a>', table, re.S)
            works_status = re.findall(r'状态: </font>\r\n(.*?)&nbsp;\|', table, re.S)
            # Pad so a result with fewer anchors than expected is still listed.
            works_name, new_chapter, works_author, works_type = (works_info + [''] * 4)[:4]
            works_name = str(works_name).replace("<font color='red'>", "").replace("</font>", "")
            works_status = works_status[0] if works_status else ''
            # The printed number is the position in url_list, so it always
            # matches what the user can select below.
            index = len(url_list)
            url_list.append(urljoin(self.host, works_url[0]))
            print(
                f'序号:{index:3}作品名称:{works_name} 最新章节:{new_chapter} 作者: {works_author} 类型: {works_type} 状态: {works_status}',
                end='\n\n')
        if not url_list:
            print("没有搜索到对应作品！")
            return
        while True:
            url_num = input("请选择需要下载的作品序号(enter):")
            try:
                choice = int(url_num)
            except ValueError:
                choice = -1
            if 0 <= choice < len(url_list):
                break
            print("序号无效，请重新输入！")
        self.get_works_html(url_list[choice])

    def get_works_html(self, url):
        """Open a book page, create the output directory, and download all chapters.

        Args:
            url: Absolute URL of the book's table-of-contents page.

        Raises:
            IndexError: If the page does not contain the expected markup.
        """
        html = self._fetch_html(url)
        table = self._first_match(r'class="mread">(.*?)</table>', html, 'the chapter table')
        title = self._first_match(r'<font color="#7B352B">(.*?)全文阅读</font>', table, 'the book title')
        self.title = self._safe_filename(title)
        os.makedirs(self.title, exist_ok=True)
        first_href, _first_name = self._first_match(
            r'<div class="bai"><a href="(.*?)">(.*?)</a>', table, 'the first chapter link')
        # Chapter links may be relative to the book page; resolve before fetching.
        self.get_chapter_html(urljoin(url, first_href))

    # Download a chapter and every chapter that follows it.
    def get_chapter_html(self, url):
        """Download the chapter at ``url`` and follow the "next chapter" links.

        Each chapter is saved as ``<title>/<chapter name>.txt`` in UTF-8. The
        walk is iterative, so long books do not exhaust the recursion limit, and
        it stops if a "next" link leads back to a page already downloaded.

        Args:
            url: Absolute URL of the first chapter to download.

        Raises:
            IndexError: If a chapter page lacks the expected title or text markup.
        """
        if self.title:
            os.makedirs(self.title, exist_ok=True)
        seen = set()
        while url and url not in seen:
            seen.add(url)
            html = self._fetch_html(url)
            chapter_name = self._first_match(r"<strong>(.*?)</strong>", html, 'the chapter title')
            print(f'正在下载---->{chapter_name}')
            next_links = re.findall(
                r"<a href='(.*?)'><font color='#7B352B'>下一章</font></a>　\( → \)", html, re.S)
            raw = self._first_match(r'<td colspan="2" class="content">(.*?)</td>', html, 'the chapter text')
            content = raw.replace('<br><br>', '\n').replace('<img src="image/', '').replace('.jpg">', ' ').replace(
                '&mdash;', '—')
            path = os.path.join(self.title, f'{self._safe_filename(chapter_name)}.txt')
            # Explicit UTF-8 so the result does not depend on the platform default.
            with open(path, 'w', encoding='utf-8') as f:
                f.write("　　")
                f.write(content)
            print(f'已下载---->{chapter_name},{url}')
            url = urljoin(url, next_links[0]) if next_links else None


if __name__ == '__main__':
    # get_search_html() lists the hits, asks which one to download and then
    # calls get_works_html(url) itself. The extra bare get_works_html() call
    # that used to follow lacked its required url argument and raised
    # TypeError once the download had finished.
    name = input("请输入小说名称:")
    quanshu = QuanShu(name)
    quanshu.get_search_html()
标签：name,url,self,re,html,搜索,小说网,works,全书
来源： https://www.cnblogs.com/greensunit/p/15433582.html