使用Python抓取厦门房地产网签数据
作者:互联网
有了链家的房源数据之后,另一个重要的需求是获取每天的成交数据。这些数据发布在厦门网上房地产网站(http://fdc.zfj.xm.gov.cn/Home/Index)上。该网站有三组数据很有用,分别是“一手房情况”、“一手房网签情况”和“二手房情况”。研究网页代码后,发现 img 标签的 src 属性里没有完整链接,所以一开始只能用最笨的办法——把整个网页截屏,见上一篇“使用python进行截图”。
今天继续研究,找到了完整的链接路径,这下好办了。可是用 Edge 打开该链接是乱码,试了 Firefox 和 Chrome 也都是乱码,只有 IE 可以正常显示。于是思路如下:先用 requests 抓取首页,用 BeautifulSoup 找到三张图片的 src,拼上域名得到完整地址,再把图片内容下载下来,按日期保存为 png 文件。
import sys
from datetime import date
from pathlib import Path
from time import sleep
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Request headers passed to every requests.get() call in this script; the
# User-Agent string imitates a desktop browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66'
    ),
}
def get_html():
    """Fetch the portal home page and return it parsed.

    Returns:
        BeautifulSoup: the document at http://fdc.zfj.xm.gov.cn/Home/Index,
        parsed with the built-in ``html.parser``.

    Raises:
        requests.RequestException: if the connection fails, the request
            times out, or the server answers with a 4xx/5xx status.
    """
    response = requests.get(
        'http://fdc.zfj.xm.gov.cn/Home/Index',
        headers=headers,
        # Without a timeout requests waits indefinitely on a stalled server.
        timeout=30,
    )
    # Fail here rather than parsing an error page as if it were the home page.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
# print(get_html())
# Page the image ``src`` values are resolved against (the page get_html() loads).
_INDEX_URL = 'http://fdc.zfj.xm.gov.cn/Home/Index'


def _get_image_address(panel_class):
    """Return the absolute URL of the ``<img>`` inside one home-page panel.

    The home page is fetched via ``get_html()`` and then walked through the
    same chain of nested elements for every panel; only the panel's own
    ``class`` value differs.

    Args:
        panel_class: exact ``class`` attribute value of the panel ``<div>``,
            e.g. ``'onehand onetwo'``.

    Returns:
        str: the image's ``src`` resolved against the home-page URL.

    Raises:
        ValueError: if any element of the chain, or the ``src`` attribute,
            is missing from the page.
    """
    steps = (
        ('div', {'class': 'container'}),
        ('div', {'style': 'width:1000px;margin:0 auto;'}),
        ('div', {'class': 'main'}),
        ('div', {'class': 'main_turnover clearfix'}),
        ('div', {'class': panel_class}),
        ('div', {'class': 'imgcontainer'}),
        ('img', {}),
    )
    node = get_html()
    for tag, attrs in steps:
        node = node.find(tag, attrs)
        if node is None:
            # find() returns None on a miss; report which step failed instead
            # of letting the next .find() die on a NoneType attribute error.
            raise ValueError(
                'element <{} {}> not found on the home page'.format(tag, attrs)
            )
    src = node.get('src')
    if not src:
        raise ValueError(
            'image in panel {!r} has no src attribute'.format(panel_class)
        )
    # For a root-relative src this equals 'http://fdc.zfj.xm.gov.cn' + src;
    # urljoin additionally copes with relative or already-absolute values.
    return urljoin(_INDEX_URL, src)


def get_ysf_address():
    """Return the image URL of the ``onehand onetwo`` panel."""
    return _get_image_address('onehand onetwo')


def get_ysfwq_address():
    """Return the image URL of the ``netsign onetwo`` panel."""
    return _get_image_address('netsign onetwo')


def get_esf_address():
    """Return the image URL of the ``secondhand onetwo`` panel."""
    return _get_image_address('secondhand onetwo')
def get_picture(save_dir="C:\\data\\网签备份\\", delay=15):
    """Download the three home-page images and save them as dated PNG files.

    Each file is named ``<today's date><characters 30-34 of the image URL>.png``
    and written into ``save_dir``.

    Args:
        save_dir: directory the files are written to; created if missing.
        delay: seconds to wait between two consecutive image downloads.

    Raises:
        requests.RequestException: if a download fails, times out, or the
            server answers with a 4xx/5xx status.
        OSError: if the directory cannot be created or a file cannot be
            written.
    """
    target_dir = Path(save_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    today = date.today()
    addresses = (get_esf_address(), get_ysf_address(), get_ysfwq_address())
    for index, address in enumerate(addresses):
        if index:
            # Pause between downloads only; nothing follows the last one.
            sleep(delay)
        response = requests.get(address, headers=headers, timeout=30)
        # Do not store an HTML error body under a .png name.
        response.raise_for_status()
        # The URL slice keeps the three files apart and keeps the file names
        # identical to the ones earlier runs produced.
        file_name = '{}{}.png'.format(today, address[30:35])
        # write_bytes opens, writes and closes the file; fetching first means
        # a failed download leaves no empty file behind.
        (target_dir / file_name).write_bytes(response.content)
# Script entry: run the download once. On failure ``False`` is still printed
# to stdout; the cause additionally goes to stderr so it is not lost.
try:
    get_picture()
except Exception as exc:
    # ``Exception`` rather than a bare ``except`` so that Ctrl-C
    # (KeyboardInterrupt) and SystemExit are not swallowed.
    print(False)
    print('get_picture failed: {!r}'.format(exc), file=sys.stderr)
标签:get,Python,class,抓取,网签,html,address,div,find 来源: https://blog.csdn.net/cwjcw81/article/details/113785039