爬虫 -- 中国货币网债券财务报告
作者:互联网
使用 Python 批量下载中国货币网的债券财务报告,其中涉及很多 Python 爬虫的知识点,值得学习。
原推文中的代码不能正常下载,可参考下面这份代码。
from bs4 import BeautifulSoup
import os
import bs4
import requests
def getHtml(url, timeout=30):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Send a browser-like User-Agent header with the request.
    headers = {
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
        )
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()
    # Decode with the encoding detected from the content instead of the
    # one the response declared.
    res.encoding = res.apparent_encoding
    return res.text
def get_file(title, sub_dir=r'D:\Desktop\公告'):
    """Return the path at which the PDF named ``title`` should be saved.

    The target directory is created (including missing parents) when it
    does not exist yet.

    Args:
        title: File name without extension; ``.PDF`` is appended.
        sub_dir: Directory that receives the downloaded reports.

    Returns:
        ``sub_dir`` joined with ``<title>.PDF``.
    """
    filename = f'{title}.PDF'
    # exist_ok avoids a race between an existence check and the creation.
    os.makedirs(sub_dir, exist_ok=True)
    return os.path.join(sub_dir, filename)
# Number of records requested from the listing endpoint (pageSize).
nums = 10
url = f'https://www.chinamoney.com.cn/ags/ms/cm-u-notice-issue/financeRepo?year=&type=&orgName=&pageSize={nums}&pageNo=1&inextp=3%2C5&limit=1&'
r = requests.get(url, timeout=30)
r.raise_for_status()
r.encoding = r.apparent_encoding  # avoid garbled text in the response
payload = r.json()

# Collected [title, releaseDate, child_url] entries, one per record.
records = payload['records']
items = []

# Base address that the attachment download paths are appended to.
main_url = 'https://www.chinamoney.com.cn/dqs/cm-s-notice-query/'

for d in records:
    title = d['title']
    releaseDate = d['releaseDate']
    draftPath = d['draftPath']
    child_url = 'https://www.chinamoney.com.cn/' + draftPath
    item = [title, releaseDate, child_url]
    items.append(item)

    # Fetch the notice page and locate its attachment list.
    html = getHtml(child_url)
    soup = BeautifulSoup(html, 'html.parser')
    notice_info = soup.find('div', class_='article-a-attach-body')
    if notice_info is None:
        # No attachment container on this page; nothing to download.
        continue

    for i in notice_info.select('li'):
        if not isinstance(i, bs4.element.Tag):
            continue
        link = i.a
        if link is None or not link.get('onclick'):
            # List entry without a download handler; skip it.
            continue

        # The download path is the first single-quoted string found after
        # the first '+' in the onclick handler.
        temp = link['onclick'].split('+')[1].split('\'')[1]
        # Plain concatenation: os.path.join is for filesystem paths and is
        # platform-dependent, so it must not be used to build URLs.
        url_end = main_url + temp.lstrip('/')
        print(url_end)
        url_end_r = requests.get(url_end, timeout=30)
        # Do not store an HTTP error page under a .PDF name.
        url_end_r.raise_for_status()

        # Download the PDF: the file name is taken from the second-to-last
        # line of the link text, up to its first '.'.
        title = link.text.split('\n')[-2].split('.')[0]
        filepath = get_file(title)
        with open(filepath, 'wb') as f:
            f.write(url_end_r.content)
标签:end,get,--,res,title,爬虫,url,dir,财务报告 来源: https://www.cnblogs.com/RankFan/p/16403573.html