其他分享
首页 > 其他分享 > 爬虫 -- 中国货币网债券财务报告

爬虫 -- 中国货币网债券财务报告

作者:互联网

使用 Python 批量下载中国货币网的债券财务报告,其中涉及不少 Python 爬虫技巧,值得学习。

推文代码不能正常下载,可参考下面这个。

from bs4 import BeautifulSoup
import os
import bs4
import requests

def getHtml(url, timeout=30):
    """Fetch *url* with a browser-like User-Agent and return the body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests.get``.

    Returns:
        The response body decoded with the encoding guessed from its content
        (``apparent_encoding``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    # Built from adjacent literals so that no source-code indentation ends up
    # inside the header value (a backslash continuation inside a string keeps
    # the next line's leading spaces).
    headers = {
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
        )
    }
    # Without a timeout a stalled connection would block the script forever.
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()
    # Use the content-based guess rather than the declared charset.
    res.encoding = res.apparent_encoding
    return res.text

def get_file(title, sub_dir=r'D:\Desktop\公告'):
    """Return the path at which the PDF named *title* should be saved.

    The target directory is created (including missing parents) when it does
    not exist yet.

    Args:
        title: File name without extension.
        sub_dir: Directory that receives the downloaded reports. The default
            is the folder this script has always written to.

    Returns:
        ``sub_dir`` joined with ``f'{title}.PDF'``.
    """
    # The default is a raw string: '\D' and '\公' are not valid escape
    # sequences and only work in a normal literal by accident (with a
    # warning on recent Python versions).
    # os.makedirs is the real API (os.makedir does not exist); exist_ok makes
    # the call safe when the folder is already there.
    os.makedirs(sub_dir, exist_ok=True)
    return os.path.join(sub_dir, f'{title}.PDF')

# Number of listing records requested from the financial-report endpoint.
nums = 10
url = f'https://www.chinamoney.com.cn/ags/ms/cm-u-notice-issue/financeRepo?year=&type=&orgName=&pageSize={nums}&pageNo=1&inextp=3%2C5&limit=1&'

# Browser-like User-Agent and a timeout for the requests issued directly by
# this script, so they neither go out with the default python-requests agent
# nor hang forever on a stalled connection.
request_headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
    )
}
request_timeout = 30  # seconds

r = requests.get(url, headers=request_headers, timeout=request_timeout)
r.raise_for_status()
r.encoding = r.apparent_encoding  # avoid mojibake in the response text
# Named `payload` rather than `json` so the stdlib module name is not shadowed.
payload = r.json()
# Listing entries; each one is read below for title, releaseDate and draftPath.
records = payload['records']

# Base URL that the attachment download path is appended to (constant).
main_url = 'https://www.chinamoney.com.cn/dqs/cm-s-notice-query/'

# One [title, releaseDate, child_url] row per listing entry.
items = []
for d in records:
    title = d['title']
    releaseDate = d['releaseDate']
    draftPath = d['draftPath']
    # Strip a leading slash so the joined URL never contains '//' in its path.
    child_url = 'https://www.chinamoney.com.cn/' + draftPath.lstrip('/')

    items.append([title, releaseDate, child_url])

    html = getHtml(child_url)
    soup = BeautifulSoup(html, 'html.parser')
    notice_info = soup.find('div', class_='article-a-attach-body')
    if notice_info is None:
        # Detail page without an attachment list: nothing to download.
        print(f'no attachment list found: {child_url}')
        continue

    for i in notice_info.select('li'):
        link = i.a
        onclick = link.get('onclick') if link is not None else None
        if not onclick:
            continue

        # The download path is taken from the quoted string that follows the
        # first '+' in the onclick handler.
        try:
            temp = onclick.split('+')[1].split('\'')[1]
        except IndexError:
            print(f'unexpected onclick format, skipped: {onclick}')
            continue

        # Plain concatenation: os.path.join is for filesystem paths and its
        # result for URLs depends on the platform.
        url_end = main_url + temp
        print(url_end)

        # Download the PDF; skip this attachment on failure instead of saving
        # an error page under a .PDF name.
        try:
            url_end_r = requests.get(
                url_end, headers=request_headers, timeout=request_timeout
            )
            url_end_r.raise_for_status()
        except requests.RequestException as exc:
            print(f'download failed: {url_end} ({exc})')
            continue

        # Assumes the link text carries the file name on its second-to-last
        # line (TODO confirm against the page markup); falls back to the last
        # line, then to the listing title, when that is not the case.
        text_lines = link.text.split('\n')
        name_line = text_lines[-2] if len(text_lines) >= 2 else text_lines[-1]
        # Separate name so the listing `title` above is not overwritten.
        file_title = name_line.split('.')[0].strip() or title

        filepath = get_file(file_title)
        with open(filepath, 'wb') as f:
            f.write(url_end_r.content)

标签:end,get,--,res,title,爬虫,url,dir,财务报告
来源: https://www.cnblogs.com/RankFan/p/16403573.html