爬取笔趣阁文章
作者:互联网
分享一个爬虫的部分代码
import requests
import time
from lxml import etree
def get_session():
    """Create a ``requests`` session that identifies itself as a desktop browser.

    Returns:
        A new session object whose default headers carry the Chrome-on-Windows
        User-Agent string set below.
    """
    browser_session = requests.session()
    browser_session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    })
    return browser_session
def get_html(url):
    """Download *url* and return the response with its encoding set to UTF-8.

    The request is tried at most twice. The first successful response is
    returned immediately, so a page is never downloaded more often than
    needed; a failed attempt is retried after a short pause.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response object from ``session.get``; its ``encoding`` attribute
        is forced to ``'utf-8'`` so that ``.text`` is decoded as UTF-8.

    Raises:
        requests.RequestException: If every attempt fails (connection error,
            timeout, ...). The exception of the last attempt is propagated.
    """
    max_attempts = 2
    # One session is enough for all attempts; no need to rebuild it per try.
    session = get_session()
    for attempt in range(1, max_attempts + 1):
        try:
            res = session.get(url, timeout=5)
        except requests.RequestException:
            if attempt == max_attempts:
                raise
            time.sleep(0.2)  # brief back-off before the retry
            continue
        res.encoding = 'utf-8'
        time.sleep(0.2)  # small delay so consecutive calls do not hammer the site
        return res
def get_conxpath(url, i_xpath):
    """Return whatever the XPath expression *i_xpath* selects on the page at *url*.

    Args:
        url: Address of the page to download (fetched through ``get_html``).
        i_xpath: XPath expression evaluated against the parsed HTML.

    Returns:
        The result of evaluating the XPath on the parsed document.
    """
    response = get_html(url)
    time.sleep(0.3)
    document = etree.HTML(response.text)
    return document.xpath(i_xpath)
def write_ap_file(thing, path):
    """Append the string *thing* to the UTF-8 text file at *path*.

    The file is opened in ``'a+'`` mode, so it is created when it does not
    exist yet and existing content is kept.
    """
    with open(path, mode='a+', encoding='utf-8') as out_file:
        out_file.write(thing)
def running():
    """Download every chapter linked from the book's index page into one file.

    The chapter links are read from the index page, then each chapter page is
    fetched once, its title and body text are extracted, and the result is
    appended to the output file. Progress is printed after each chapter as
    ``"<done> in <total>"``.

    Raises:
        Whatever ``get_html`` / ``get_conxpath`` / ``write_ap_file`` raise;
        no errors are handled here.
    """
    site_root = 'http://www.xbiquge.la'
    tiny_list_url = site_root + '/0/119/'
    # Raw string: the backslash is a path separator, not the start of an escape.
    output_path = r'G:\儒道至圣.TXT'

    tiny_lists = get_conxpath(tiny_list_url, '//*[@id="list"]/dl/dd/a/@href')
    total = len(tiny_lists)

    # Iterating the list directly avoids indexing one past its end.
    for m, href in enumerate(tiny_lists, start=1):
        tiny_url = site_root + href

        # Fetch and parse the chapter page once; both the title and the body
        # are read from the same document instead of downloading it twice.
        page = etree.HTML(get_html(tiny_url).text)
        time.sleep(0.3)  # keep a pause between chapters
        title = page.xpath('//*[@id="wrapper"]/div/div/div/h1/text()')
        docs = page.xpath('//*[@id="content"]/text()')

        # str.replace returns a new string, so its result has to be used.
        body = ''.join(doc.replace('\r', '\n') for doc in docs)
        end_text = '\n\n' + ''.join(title) + '\n\n' + body

        write_ap_file(end_text, output_path)
        print('{} in {}'.format(m, total))
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    running()
因为应对各种错误的代码过于繁杂,这里就不展示了^_^
[申明:禁止商业用途]
Herry_G 发布了 3 篇原创文章 · 获赞 0 · 访问量 92 · 私信 · 关注
标签:return, get, url, res, 爬取, session, 文章, 笔趣, def
来源:https://blog.csdn.net/herry_g/article/details/104094407