# 爬取300首古诗 — crawl 300 classical Chinese (Tang) poems from so.gushiwen.cn
# 作者: 互联网 (author: collected from the internet)
import os
import time
from multiprocessing import Pool

import requests
from lxml import etree
def zxc(url, headers=None):
    """Fetch one poem page and append its title, author and verse lines to lp/<title>.txt.

    :param url: full URL of a single poem page on so.gushiwen.cn
    :param headers: optional dict of HTTP headers (e.g. User-Agent) for the request
    """
    # Bug fix: the original took no parameters (and hard-coded one poem's URL),
    # but the pool submitted args=(url, headers); apply_async swallowed the
    # resulting TypeError silently, so no poem was ever written.
    page = requests.get(url, headers=headers).text
    tree = etree.HTML(page)
    # Poem title (xpath of the heading)
    title = tree.xpath('//*[@id="sonsyuanwen"]/div[1]/h1/text()')
    # Author name (xpath of the author link)
    author = tree.xpath('//*[@id="sonsyuanwen"]/div[1]/p/a[1]/text()')
    # Verse text nodes
    verses = tree.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/div[2]/text()')
    if not title:
        # Layout changed or the request was blocked — nothing usable to write.
        return
    # Original crashed with FileNotFoundError when lp/ did not exist.
    os.makedirs('lp', exist_ok=True)
    with open('lp/' + str(title[0]) + '.txt', 'a+', encoding='utf-8') as fh:
        fh.write('{}\n'.format(str(title[0])))
        if author:  # guard: some pages may lack an author link
            fh.write('{}\n'.format(str(author[0])))
        for line in verses:
            # Strip padding spaces inside the verse text before writing.
            fh.write('{}\n'.format(str(line).replace(' ', '')))
# 用多进程 — fan the ~300 poem downloads out over a process pool.
if __name__ == '__main__':
    start = time.time()
    pool = Pool()
    hercx = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.25'
    }
    # Fetch the Tang-poem index page that links to all ~300 poems.
    index_html = requests.get('https://so.gushiwen.cn/gushi/tangshi.aspx/', headers=hercx).text
    tree = etree.HTML(index_html)
    # 爬取300首古诗: hrefs of every poem page on the index.
    hrefs = tree.xpath('//*[@id="html"]/body/div/div/div/div/span/a/@href')
    base = 'https://so.gushiwen.cn'
    for href in hrefs:
        pool.apply_async(zxc, args=(base + str(href), hercx))
    pool.close()
    pool.join()
    # Bug fix: the original captured the end timestamp BEFORE pool.join(),
    # so the printed duration excluded essentially all of the download work
    # (it also reused the variable holding the index HTML for the timestamp).
    print('{}秒'.format(str(time.time() - start)))
# 标签 (tags): xpath, 300, text, qwe, 爬取, str, 古诗, div
# 来源 (source): https://www.cnblogs.com/xxh12/p/16673878.html