爬虫实战-政务新闻网频道信息
作者:互联网
爬虫实战-政务新闻网频道信息
文章目录
前言
提示:以下是本篇文章正文内容,下面案例可供参考
一、代码展示
import csv
import os
import time
from lxml import etree
import requests
from lxml import *
# URL pattern of the channel listing pages:
#   page 1: https://gov.rednet.cn/channel/8463.html
#   page 2: https://gov.rednet.cn/channel/8463_2.html
# Kept as a module-level name for backward compatibility only; getHtml
# collects pages in a fresh local list and does not append to this one.
list_html = []


def getHtml(getNum, url, headers):
    """Download the first ``getNum`` listing pages of a channel.

    Page 1 is fetched from ``url`` itself; page ``k`` (k >= 2) is fetched
    from ``url`` with ``_k`` inserted before its last five characters,
    which are assumed to be the ``.html`` suffix.

    Args:
        getNum: Number of listing pages to download.
        url: Address of the first listing page.
        headers: HTTP headers passed to every request.

    Returns:
        A new list with the decoded (UTF-8) HTML text of each page, in
        page order.
    """
    # Derive every page URL from the original address. Rewriting the
    # previous page's URL would yield ".../8463_2_3.html" for page 3.
    base = url[:-5]
    pages = []
    for page_no in range(1, getNum + 1):
        if page_no == 1:
            page_url = url
        else:
            page_url = "{}_{}.html".format(base, page_no)
        html_rq = requests.get(url=page_url, headers=headers).content.decode("utf-8")
        print("--------正在爬取信息{}-----".format(page_url))
        # Pause between requests to avoid hammering the server.
        time.sleep(1)
        pages.append(html_rq)
    return pages
def _meta_field(spans, index):
    """Return the text of ``spans[index]`` without its first 3 characters, or ""."""
    if index >= len(spans):
        return ""
    texts = spans[index].xpath('./text()')
    # The first three characters are dropped, presumably a label prefix
    # such as "来源:" -- confirm against the live page markup.
    return texts[0][3:] if texts else ""


def getInfos(list_html, headers):
    """Extract article records from downloaded listing pages.

    For every ``<li>`` under ``div#div_newsList`` the title, publish time
    and link are read from the listing, then the linked article page is
    downloaded to read source, author, editor and body text.

    Args:
        list_html: Iterable of listing-page HTML strings.
        headers: HTTP headers passed to each article request.

    Returns:
        A list of records, each
        ``[title, source, author, editor, publish_time, content]``.
        Listing entries lacking a title, time or link are skipped; detail
        fields missing from an article page are stored as empty strings.
    """
    article = []
    for page_html in list_html:
        page_tree = etree.HTML(page_html)
        for li in page_tree.xpath('//div[@id="div_newsList"]/ul/li'):
            # Evaluate the span query once; it yields title then time.
            span_texts = li.xpath('.//a/span/text()')
            hrefs = li.xpath('.//a/@href')
            if len(span_texts) < 2 or not hrefs:
                # Incomplete entry: skip it instead of aborting the crawl.
                continue
            title = span_texts[0]
            # Named pub_time so the imported ``time`` module is not shadowed.
            pub_time = span_texts[1]
            link = hrefs[0]
            print(title, pub_time, link)

            detail_text = requests.get(url=link, headers=headers).content.decode("utf-8")
            detail_tree = etree.HTML(detail_text)
            meta_nodes = detail_tree.xpath('//div[@class="m_b_25"]')
            meta_spans = meta_nodes[0].xpath('./span') if meta_nodes else []
            source = _meta_field(meta_spans, 0)
            author = _meta_field(meta_spans, 1)
            editor = _meta_field(meta_spans, 2)
            content = detail_tree.xpath('//ct/p/text()')
            article.append([title, source, author, editor, pub_time, "".join(content)])
    print("-----------爬取完成!---------")
    return article
def saveDate(article):
    """Write the scraped article records to ``news/red.csv``.

    The ``news`` directory is created under the current working directory
    if it does not exist. The file is overwritten, encoded as UTF-8 and
    uses ``#`` as the field delimiter. A header row is written first,
    followed by one row per record.

    Args:
        article: Iterable of records, each a sequence of
            ``[title, source, author, editor, publish_time, content]``.
    """
    path = './news/'
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, "red.csv"), "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter="#")
        header_row = ["新闻标题", "来源", "作者", "编辑", "发布时间", "文章内容"]
        writer.writerow(header_row)
        # writerows emits one CSV row per record; writerow would write the
        # whole list of records as a single row.
        writer.writerows(article)
def main():
    """Crawl two listing pages of channel 8463 and save the articles as CSV.

    Calls getHtml for the listing pages, getInfos for the per-article
    details, and saveDate to write the result, using one fixed set of
    request headers throughout.
    """
    request_headers = {
        "Cookie": "wdcid=146010ccacd153d5; Hm_lvt_8d7a87037f266a8541a112ab5972e9a6=1639304907,1639304914,1639529992; Hm_lvt_aaecf8414f59c3fb0127932014cf53c7=1639305556,1639530097; wdses=7f6dc3a04fb6cf25; wdlast=1639530135; Hm_lpvt_aaecf8414f59c3fb0127932014cf53c7=1639530135; Hm_lpvt_8d7a87037f266a8541a112ab5972e9a6=1639530135",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36",
    }
    start_url = "https://gov.rednet.cn/channel/8463.html"

    listing_pages = getHtml(getNum=2, url=start_url, headers=request_headers)
    records = getInfos(listing_pages, request_headers)
    saveDate(records)


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
标签:xpath,频道,url,text,html,爬虫,headers,新闻网,article 来源: https://blog.csdn.net/weixin_55008828/article/details/121949732