Python爬虫:爬取贝壳网中所有行政区内的二手房,将获取的信息存于可用Excel打开的CSV文件中(含分析)
作者:互联网
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 29 19:13:37 2020
@author: acliu
"""
# 获取房屋的基本信息(若干页)
# 贝壳二手房北京房价
# 分行政区保存csv格式
# 网址:https://bj.ke.com/ershoufang/
import requests
import csv
import re
import xlwt
from bs4 import BeautifulSoup
#请求头,防止反爬。
#如果单用headers不够,可以加入host \ cookies
# HTTP headers sent with every page request. A browser-like User-Agent is
# supplied so the request is less likely to be rejected as a bot; Host and
# cookies can be added here if the User-Agent alone is not enough.
headers = dict(
    [
        (
            'User-Agent',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/80.0.3987.122 Safari/537.36',
        ),
        ('Host', 'cd.ke.com'),
    ]
)
# 将获取的信息保存到csv中,
def save_info(content, title):
    """Write listing rows to ``<title>.csv`` with a fixed header row.

    Args:
        content: Iterable of rows. Each row is a sequence of values written
            in the same order as the header: position, floor, builtYear,
            layout, size, orientation, totalPrice, perPrice.
        title: Output file name without the ``.csv`` extension. An existing
            file with the same name is overwritten.
    """
    head = ('position', 'floor', 'builtYear', 'layout', 'size', 'orientation', 'totalPrice', 'perPrice')
    # 'utf-8-sig' writes a BOM at the start of the file (presumably so that
    # spreadsheet applications detect the encoding of the Chinese text);
    # newline='' lets the csv module control line endings itself.
    with open('%s.csv' % title, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(head)
        writer.writerows(content)
#获取房屋相关信息
#主要包括'position','floor','builtYear','layout','size','orientation','totalPrice','unitPrice'
# Patterns applied to the text of each ``houseInfo`` div after all spaces and
# newlines have been removed. Raw strings avoid invalid-escape warnings.
_FLOOR_RE = re.compile(r'.楼层\(共[\d]+层\)')
_BUILT_YEAR_RE = re.compile(r'[\d]*年建')
_LAYOUT_RE = re.compile(r'.室.厅')
_SIZE_RE = re.compile(r'([\d]*\.[\d]*|[\d]*)平米')
_ORIENTATION_RE = re.compile(r'东南|东北|西南|西北|东|西|南|北')


def _first_match(pattern, text, group=0):
    """Return ``group`` of the first match of ``pattern`` in ``text``, or None."""
    found = pattern.search(text)
    return found.group(group) if found is not None else None


def _child_text(child):
    """Return the stripped text of a child tag, or None when the child is absent."""
    return child.text.strip() if child is not None else None


def get_info(place):
    """Scrape result pages 1-10 of the listing site for one district.

    For every page the ``positionInfo``, ``houseInfo``, ``totalPrice`` and
    ``unitPrice`` divs are collected; the house description is split into
    floor, build year, layout, size and orientation with regular expressions.
    A field that cannot be found is stored as ``None`` instead of aborting
    the whole run, so the per-field lists stay aligned with each other.

    Args:
        place: District slug inserted into the listing URL.

    Returns:
        A list of rows, each
        ``[position, floor, builtYear, layout, size, orientation,
        totalPrice, unitPrice]`` with string (or ``None``) values.

    Raises:
        requests.RequestException: If a page cannot be fetched.

    NOTE(review): the URL host is cd.ke.com while the file header mentions
    bj.ke.com -- confirm which city is intended. The trailing ``l2`` path
    segment is kept from the original; presumably a site-side filter.
    """
    position_list = []
    floor_list = []
    builtYear_list = []
    layout_list = []
    size_list = []
    orientation_list = []
    totalPrice_list = []
    unitPrice_list = []

    # Pages are numbered from 1; the previous range(10) requested pg0..pg9.
    for page in range(1, 11):
        link = 'https://cd.ke.com/ershoufang/%s/pg%dl2/' % (place, page)
        r = requests.get(link, headers=headers, timeout=10)
        print(str(page), 'status_code: ', r.status_code)
        soup = BeautifulSoup(r.text, 'lxml')

        for item in soup.find_all('div', {'class': 'positionInfo'}):
            position_list.append(_child_text(item.a))

        for item in soup.find_all('div', {'class': 'houseInfo'}):
            house_info = item.text.strip().replace('\n', ' ').replace(' ', '')
            built_year = _first_match(_BUILT_YEAR_RE, house_info)
            if built_year is not None:
                built_year = built_year.replace('年建', '')
            floor_list.append(_first_match(_FLOOR_RE, house_info))
            builtYear_list.append(built_year)
            layout_list.append(_first_match(_LAYOUT_RE, house_info))
            # Group 1 is the number without the trailing unit.
            size_list.append(_first_match(_SIZE_RE, house_info, 1))
            orientation_list.append(_first_match(_ORIENTATION_RE, house_info))

        for item in soup.find_all('div', {'class': 'totalPrice'}):
            totalPrice_list.append(_child_text(item.span))

        for item in soup.find_all('div', {'class': 'unitPrice'}):
            unit_price = _child_text(item.span)
            if unit_price is not None:
                unit_price = unit_price.replace('单价', '').replace('元/平米', '')
            unitPrice_list.append(unit_price)

    columns = (
        position_list,
        floor_list,
        builtYear_list,
        layout_list,
        size_list,
        orientation_list,
        totalPrice_list,
        unitPrice_list,
    )
    # Diagnostic output: one length per column, in header order.
    for column in columns:
        print(len(column))
    if len({len(column) for column in columns}) > 1:
        # zip() below stops at the shortest column instead of raising
        # IndexError, so say so explicitly.
        print('warning: column lengths differ for %s; rows truncated to the shortest column' % place)

    return [list(row) for row in zip(*columns)]
if __name__ == '__main__':
    # District slugs to scrape; each one produces its own CSV file.
    # NOTE(review): these look like Beijing district slugs, while get_info in
    # this file requests cd.ke.com -- confirm the intended city.
    area_list = [
        'dongcheng',
        'xicheng',
        'chaoyang',
        'haidian',
        'fengtai',
        'shijingshan',
        'tongzhou',
        'changping',
        'daxing',
        'yizhuangkaifaqu',
        'shunyi',
        'fangshan',
        'mentougou',
        'pinggu',
        'huairou',
        'miyun',
        'yanqing',
    ]
    for place in area_list:
        # Scrape the district, then write <place>.csv.
        save_info(get_info(place), place)
Acy.
发布了3 篇原创文章 · 获赞 0 · 访问量 59
私信
关注
标签:info,totalPrice,存于,python,list,len,爬取,item,builtYear 来源: https://blog.csdn.net/Babyacy/article/details/104590892