其他分享
首页 > 其他分享 > 爬取河北省招投标公共服务平台招标信息

爬取河北省招投标公共服务平台招标信息

作者:互联网

import requests
from lxml import etree

# Endpoint of the tender-announcement listing that the script POSTs to.
url = 'http://121.28.195.124:9001/tender/xxgk/zbgg.do'

# session = requests.session()

# Request headers: a desktop-browser User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/83.0.4103.61 Safari/537.36'
    ),
}

# Form fields sent as the POST body. 'page' is only an initial value; the
# scraping loop further down overwrites it for every request.
# NOTE(review): 'allDq' / 'allHy' look like region / industry filters with
# "reset" meaning "no filter" -- confirm against the site's search form.
data = dict(
    page='2',
    TimeStr='',
    allDq='reset2',
    allHy='reset1',
    AllPtName='',
    KeyStr='',
    KeyType='',
    ggname='',
)

# session.post(url=url,headers=headers)

# Create the Excel workbook, scrape listing pages 1-9 and store every
# announcement of the target industry as one row: title, area, date, link.
from urllib.parse import urljoin

from openpyxl import Workbook

# Industry label an announcement must carry to be kept. The brackets are part
# of the compared text, exactly as in the original comparison.
TARGET_INDUSTRY = '[软件和信息技术服务业]'
# Seconds to wait for the server before giving up on a page; without a
# timeout a stalled connection would hang the script forever.
REQUEST_TIMEOUT = 30
# Characters removed from the area label (same set the original stripped).
_AREA_STRIP = str.maketrans('', '', "[]'")

wb = Workbook()
wb1 = wb.create_sheet('index', 0)
wb1.title = '投标信息'

try:
    for i in range(1, 10):
        data['page'] = i
        # Send the request; a failing page is reported and skipped so the
        # rows collected from the other pages are not lost.
        try:
            response = requests.post(url=url, data=data, headers=headers,
                                     timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            print('page', i, 'request failed:', exc)
            continue

        page_text = response.text
        if not page_text.strip():
            # Nothing to parse on this page.
            continue
        tree = etree.HTML(page_text)
        if tree is None:
            continue

        for d in tree.xpath('.//div[@class="publicont"]'):
            # Title: join the anchor's text nodes into plain text instead of
            # writing the list repr ("['...']") into the sheet.
            title = ''.join(t.strip() for t in d.xpath('./div[1]//a/text()'))

            # Link: resolve the first href against the listing URL.
            hrefs = d.xpath('./div[1]//a/@href')
            s_url = urljoin(url, hrefs[0].strip()) if hrefs else ''

            # Publication date (empty when the span is missing).
            dates = d.xpath('./div[1]//span[@class="span_o"]/text()')
            publish_time = dates[0].strip() if dates else ''

            info = d.xpath('.//p[@class="p_tw"]//span[@class="span_on"]/text()')
            if not info:
                # Without the info spans the industry cannot be determined.
                continue
            area = info[0].strip().translate(_AREA_STRIP)  # announcement area
            hangye = info[-1].strip()  # industry label

            if hangye == TARGET_INDUSTRY:
                print(title, area, hangye, publish_time, s_url)
                wb1.append([title, area, publish_time, s_url])
finally:
    # Always write what has been collected, even if a later page blew up.
    wb.save('1.xlsx')

标签:title,招投标,text,replace,爬取,url,公共服务,str,div
来源: https://www.cnblogs.com/robertx/p/13049714.html