爬取河北省招投标公共服务平台招标信息
作者:互联网
import requests
from lxml import etree
# Endpoint of the tender-announcement listing.
url = 'http://121.28.195.124:9001/tender/xxgk/zbgg.do'

# Browser-like User-Agent header sent with every request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/83.0.4103.61 Safari/537.36'
    ),
}

# Form fields posted to the listing endpoint. 'page' is only a placeholder
# here; the paging loop supplies the real page number for each request.
data = dict(
    page='2',
    TimeStr='',
    allDq='reset2',
    allHy='reset1',
    AllPtName='',
    KeyStr='',
    KeyType='',
    ggname='',
)
# Create the Excel workbook that collects the matching notices.
from openpyxl import Workbook

wb = Workbook()
# Add the results sheet at index 0, then give it its final title.
wb1 = wb.create_sheet(title='index', index=0)
wb1.title = '投标信息'
# Walk listing pages 1-9, keep the notices whose industry tag equals
# TARGET_INDUSTRY, print them, append them to the results sheet and save the
# workbook once at the end.
BASE_URL = 'http://121.28.195.124:9001'
TARGET_INDUSTRY = '[软件和信息技术服务业]'
# Characters removed from the region tag before it is stored
# (presumably the page renders it as "[...]" -- confirm against live markup).
_TAG_NOISE = str.maketrans('', '', "[]'")

for page in range(1, 10):
    # Build a per-request payload so the shared ``data`` template is not mutated.
    payload = dict(data, page=page)
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.post(url=url, data=payload, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # One unreachable page must not discard the rows already collected.
        print('page', page, 'request failed:', exc)
        continue

    tree = etree.HTML(response.text)
    for notice in tree.xpath('.//div[@class="publicont"]'):
        title_parts = notice.xpath('./div[1]//a/text()')  # title text nodes
        hrefs = notice.xpath('./div[1]//a/@href')  # detail-page link(s)
        dates = notice.xpath('./div[1]//span[@class="span_o"]/text()')  # publish time
        tags = notice.xpath('.//p[@class="p_tw"]//span[@class="span_on"]/text()')

        # Skip entries missing a date or tags instead of failing with IndexError.
        if not dates or not tags:
            continue

        # The last tag is the industry; strip whitespace so the comparison
        # is not defeated by padding around the text node.
        industry = tags[-1].strip()
        if industry != TARGET_INDUSTRY:
            continue

        # Store real strings rather than the repr of the XPath result lists.
        title = ''.join(part.strip() for part in title_parts)
        link = BASE_URL + (hrefs[0] if hrefs else '')
        pub_time = dates[0].strip()
        area = tags[0].strip().translate(_TAG_NOISE)  # first tag is the region

        print(title, area, industry, pub_time, link)
        wb1.append([title, area, pub_time, link])

# Write the collected rows to disk once, after all pages were processed.
wb.save('1.xlsx')
标签:title,招投标,text,replace,爬取,url,公共服务,str,div
来源: https://www.cnblogs.com/robertx/p/13049714.html