# 爬取300首古诗 — crawl 300 classical Chinese (Tang) poems from so.gushiwen.cn
# 作者: 互联网 (author: collected from the internet)
import os
import time
from multiprocessing import Pool

import requests
from lxml import etree
def zxc(url, headers=None):
    """Fetch one poem page and append its title, author and verse lines to lp/<title>.txt.

    :param url: full URL of a single poem page on so.gushiwen.cn
    :param headers: optional dict of HTTP headers (e.g. User-Agent) for the request
    """
    # Bug fix: the original took no parameters (and hard-coded one poem's URL),
    # but the pool submitted args=(url, headers); apply_async swallowed the
    # resulting TypeError silently, so no poem was ever written.
    page = requests.get(url, headers=headers).text
    tree = etree.HTML(page)
    # Poem title (xpath of the heading)
    title = tree.xpath('//*[@id="sonsyuanwen"]/div[1]/h1/text()')
    # Author name (xpath of the author link)
    author = tree.xpath('//*[@id="sonsyuanwen"]/div[1]/p/a[1]/text()')
    # Verse text nodes
    verses = tree.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/div[2]/text()')
    if not title:
        # Layout changed or the request was blocked — nothing usable to write.
        return
    # Original crashed with FileNotFoundError when lp/ did not exist.
    os.makedirs('lp', exist_ok=True)
    with open('lp/' + str(title[0]) + '.txt', 'a+', encoding='utf-8') as fh:
        fh.write('{}\n'.format(str(title[0])))
        if author:  # guard: some pages may lack an author link
            fh.write('{}\n'.format(str(author[0])))
        for line in verses:
            # Strip padding spaces inside the verse text before writing.
            fh.write('{}\n'.format(str(line).replace(' ', '')))
# 用多进程 — fan the ~300 poem downloads out over a process pool.
if __name__ == '__main__':
    start = time.time()
    pool = Pool()
    hercx = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.25'
    }
    # Fetch the Tang-poem index page that links to all ~300 poems.
    index_html = requests.get('https://so.gushiwen.cn/gushi/tangshi.aspx/', headers=hercx).text
    tree = etree.HTML(index_html)
    # 爬取300首古诗: hrefs of every poem page on the index.
    hrefs = tree.xpath('//*[@id="html"]/body/div/div/div/div/span/a/@href')
    base = 'https://so.gushiwen.cn'
    for href in hrefs:
        pool.apply_async(zxc, args=(base + str(href), hercx))
    pool.close()
    pool.join()
    # Bug fix: the original captured the end timestamp BEFORE pool.join(),
    # so the printed duration excluded essentially all of the download work
    # (it also reused the variable holding the index HTML for the timestamp).
    print('{}秒'.format(str(time.time() - start)))
# 标签 (tags): xpath, 300, text, qwe, 爬取, str, 古诗, div
# 来源 (source): https://www.cnblogs.com/xxh12/p/16673878.html