其他分享
首页 > 其他分享 > 爬虫实战-政务新闻网频道信息

爬虫实战-政务新闻网频道信息

作者:互联网

爬虫实战-政务新闻网频道信息

文章目录


前言


提示:以下是本篇文章正文内容,下面案例可供参考

一、代码展示

import csv
import os
import time
from lxml import etree

import  requests
from  lxml  import  *

#网址分析:
#1.https://gov.rednet.cn/channel/8463.html
#2.https://gov.rednet.cn/channel/8463_2.html
def getHtml(getNum, url, headers):
    """Fetch the first *getNum* pages of the channel listing and return their HTML.

    Args:
        getNum: number of listing pages to download.
        url: URL of the first page (".../8463.html"); later pages are derived
            by inserting "_<page>" before the ".html" suffix
            (page 2 is ".../8463_2.html", and so on).
        headers: HTTP headers (Cookie / User-Agent) passed to requests.

    Returns:
        list[str]: decoded UTF-8 HTML text of each fetched page, in order.
    """
    base = url[:-len(".html")]  # strip the suffix ONCE so every page derives from the original URL
    list_html = []  # local, so repeated calls no longer accumulate pages in a module-level global
    for i in range(getNum):
        # Bug fix: the original mutated `url` in place each iteration, so page 3
        # became ".../8463_2_3.html"; always rebuild from the untouched base.
        page_url = url if i == 0 else "{}_{}.html".format(base, i + 1)
        html_rq = requests.get(url=page_url, headers=headers).content.decode("utf-8")
        print("--------正在爬取信息{}-----".format(page_url))
        time.sleep(1)  # throttle: one request per second, be polite to the server
        list_html.append(html_rq)
    return list_html

def getInfos(list_html, headers):
    """Parse every listing page, fetch each article's detail page, collect rows.

    Args:
        list_html: list of channel listing-page HTML strings (from getHtml).
        headers: HTTP headers forwarded to the detail-page requests.

    Returns:
        list[list[str]]: one row per article:
            [title, source, author, editor, publish time, body text].
    """
    article = []
    for page_html in list_html:
        page = etree.HTML(page_html)
        lis = page.xpath('//div[@id="div_newsList"]/ul/li')
        for li in lis:
            title = li.xpath('.//a/span/text()')[0]     # 标题 (title)
            pub_time = li.xpath('.//a/span/text()')[1]  # 时间 (publish time); renamed — `time` shadowed the time module
            link = li.xpath('.//a/@href')[0]            # 链接 (article URL)
            print(title, pub_time, link)

            # Bug fix: in the original this detail fetch sat OUTSIDE the
            # per-article loop, so only the LAST article of each page was
            # actually scraped and saved. It now runs once per article.
            details_html = requests.get(url=link, headers=headers).content.decode("utf-8")
            details = etree.HTML(details_html)

            meta = details.xpath('//div[@class="m_b_25"]')[0]  # renamed from `all` (shadowed builtin)
            # Each <span> reads "来源:xxx" / "作者:xxx" / "编辑:xxx"; [3:] drops the 3-char label
            source = meta.xpath('./span')[0].xpath('./text()')[0][3:]  # 来源 (source)
            author = meta.xpath('./span')[1].xpath('./text()')[0][3:]  # 作者 (author)
            editor = meta.xpath('./span')[2].xpath('./text()')[0][3:]  # 编辑 (editor)

            content = details.xpath('//ct/p/text()')  # 文章内容 (body paragraphs)

            article.append([title, source, author, editor, pub_time, "".join(content)])

    print("-----------爬取完成!---------")
    return article

def saveDate(article):
    """Write the scraped article rows to news/red.csv ('#'-delimited, UTF-8).

    Args:
        article: list of rows, each
            [title, source, author, editor, publish time, content].
    """
    path = './news/'
    if not os.path.exists(path):
        os.mkdir(path)

    with open("news/red.csv", "w", newline="", encoding="utf-8") as f:
        # '#' delimiter: article bodies routinely contain commas
        writer = csv.writer(f, delimiter="#")
        headers = ["新闻标题", "来源", "作者", "编辑", "发布时间", "文章内容"]
        writer.writerow(headers)
        # Bug fix: the original called writerow(article), dumping the whole
        # list-of-rows onto a single CSV line; writerows emits one line per article.
        writer.writerows(article)

def main():
    """Entry point: crawl two channel pages, parse the articles, save to CSV."""
    headers = {
        "Cookie":"wdcid=146010ccacd153d5; Hm_lvt_8d7a87037f266a8541a112ab5972e9a6=1639304907,1639304914,1639529992; Hm_lvt_aaecf8414f59c3fb0127932014cf53c7=1639305556,1639530097; wdses=7f6dc3a04fb6cf25; wdlast=1639530135; Hm_lpvt_aaecf8414f59c3fb0127932014cf53c7=1639530135; Hm_lpvt_8d7a87037f266a8541a112ab5972e9a6=1639530135",
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
    }
    start_url = "https://gov.rednet.cn/channel/8463.html"
    # Fetch the listing pages, then scrape and persist every article found.
    pages = getHtml(getNum=2, url=start_url, headers=headers)
    saveDate(getInfos(pages, headers))

if __name__ == '__main__':
    main()

标签:xpath,频道,url,text,html,爬虫,headers,新闻网,article
来源: https://blog.csdn.net/weixin_55008828/article/details/121949732