编程语言
首页 > 编程语言 > Python爬虫:爬取贝壳网所有行政区内的二手房,将获取的信息保存为可用 Excel 打开的 CSV 文件(含分析)

Python爬虫:爬取贝壳网所有行政区内的二手房,将获取的信息保存为可用 Excel 打开的 CSV 文件(含分析)

作者:互联网

# -*- coding: utf-8 -*-

"""
Created on Sat Feb 29 19:13:37 2020

@author: acliu
"""

# 获取房屋的基本信息(若干页)
# 贝壳二手房北京房价
# 分行政区保存为 csv 格式
# 网址:https://bj.ke.com/ershoufang/

import requests
import csv
import re
import xlwt
from bs4 import BeautifulSoup
# Request headers sent with every page fetch, to avoid being rejected by
# anti-scraping checks.
# If these headers alone are not enough, cookies can be added as well.
# NOTE(review): the Host is cd.ke.com while the district list in the
# __main__ section looks like Beijing districts -- confirm the intended city.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.122 Safari/537.36'
    ),
    'Host': 'cd.ke.com',
}

def save_info(content, title):
    """Write the scraped listing rows to ``<title>.csv``.

    The file starts with a fixed header row, followed by one line per entry
    of ``content``. It is written with the ``utf-8-sig`` encoding (UTF-8 with
    a BOM) and ``newline=''`` as the ``csv`` module requires.

    Args:
        content: Iterable of rows; each row is a sequence of cell values in
            the same order as the header columns. ``None`` cells are written
            as empty fields by ``csv.writer``.
        title: Base name of the output file; ``.csv`` is appended to it.
    """
    head = ('position', 'floor', 'builtYear', 'layout', 'size',
            'orientation', 'totalPrice', 'perPrice')
    # The one-element tuple keeps %-formatting correct even if ``title``
    # itself happens to be a tuple.
    with open('%s.csv' % (title,), 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(head)
        writer.writerows(content)
            
# Fetch the listing information for one district.
# Each row holds: 'position', 'floor', 'builtYear', 'layout', 'size',
# 'orientation', 'totalPrice', 'unitPrice'.

# Patterns for the fields packed into a listing's "houseInfo" text.
# Raw strings keep "\(" and "\d" from being treated as (invalid) string
# escapes; compiling once hoists the work out of the per-listing loop.
_FLOOR_RE = re.compile(r'.楼层\(共[\d]+层\)')
_BUILT_YEAR_RE = re.compile(r'[\d]*年建')
_LAYOUT_RE = re.compile(r'.室.厅')
_SIZE_RE = re.compile(r'([\d]*\.[\d]*|[\d]*)平米')
_ORIENTATION_RE = re.compile(r'东南|东北|西南|西北|东|西|南|北')


def _match_or_none(pattern, text):
    """Return the text matched by ``pattern`` in ``text``, or None if absent."""
    found = pattern.search(text)
    return found.group() if found is not None else None


def _parse_house_info(text):
    """Split one "houseInfo" text into its five fields.

    Returns ``[floor, builtYear, layout, size, orientation]``. A field that
    cannot be found in the text is returned as None instead of raising.
    """
    house_info = text.strip().replace('\n', ' ').replace(' ', '')
    floor = _match_or_none(_FLOOR_RE, house_info)
    built_year = _match_or_none(_BUILT_YEAR_RE, house_info)
    if built_year is not None:
        built_year = built_year.replace('年建', '')
    layout = _match_or_none(_LAYOUT_RE, house_info)
    size = _match_or_none(_SIZE_RE, house_info)
    if size is not None:
        size = size.replace('平米', '')
    orientation = _match_or_none(_ORIENTATION_RE, house_info)
    return [floor, built_year, layout, size, orientation]


def get_info(place, pages=10):
    """Download listing pages for one district and return the parsed rows.

    Args:
        place: District identifier inserted into the listing URL path.
        pages: Number of result pages to request, starting from page 1.

    Returns:
        A list of rows, each a list of eight values:
        ``[position, floor, builtYear, layout, size, orientation,
        totalPrice, unitPrice]``. Fields missing from a listing's houseInfo
        text are None.

    Raises:
        Whatever ``requests.get`` raises on network failure or timeout.
    """
    all_info = []

    # Page numbers in the URL are taken to be 1-based, matching the number
    # printed in the status log line.
    for page in range(1, pages + 1):
        # NOTE(review): the URL targets cd.ke.com while the header notes of
        # this file mention bj.ke.com -- confirm which city is intended.
        # The trailing "l2" is kept as-is; presumably a site-side filter --
        # TODO confirm.
        link = 'https://cd.ke.com/ershoufang/%s/pg%dl2/' % (place, page)
        r = requests.get(link, headers=headers, timeout=10)
        print(str(page), 'status_code: ', r.status_code)
        if r.status_code != 200:
            # An error page carries no listings; skip it instead of parsing.
            continue

        soup = BeautifulSoup(r.text, 'lxml')
        positions = soup.findAll('div', {'class': 'positionInfo'})
        houses = soup.findAll('div', {'class': 'houseInfo'})
        totals = soup.findAll('div', {'class': 'totalPrice'})
        units = soup.findAll('div', {'class': 'unitPrice'})

        counts = (len(positions), len(houses), len(totals), len(units))
        if len(set(counts)) > 1:
            # Rows are paired positionally, so unequal counts mean some
            # listings on this page cannot be matched up reliably.
            print('page %d: field counts differ %s; using the shortest'
                  % (page, counts))

        # Build rows page by page so a mismatch on one page cannot shift the
        # columns of listings from other pages.
        for position, house, total, unit in zip(positions, houses, totals, units):
            row = [position.a.text.strip()]
            row.extend(_parse_house_info(house.text))
            row.append(total.span.text.strip())
            row.append(
                unit.span.text.strip().replace('单价', '').replace('元/平米', '')
            )
            all_info.append(row)

    print(len(all_info))
    return all_info

if __name__ == '__main__':
    # District identifiers to crawl; each one is fetched with get_info and
    # written to its own CSV file by save_info.
    area_list = [
        'dongcheng',
        'xicheng',
        'chaoyang',
        'haidian',
        'fengtai',
        'shijingshan',
        'tongzhou',
        'changping',
        'daxing',
        'yizhuangkaifaqu',
        'shunyi',
        'fangshan',
        'mentougou',
        'pinggu',
        'huairou',
        'miyun',
        'yanqing',
    ]

    for place in area_list:
        save_info(get_info(place), place)
Acy. 发布了3 篇原创文章 · 获赞 0 · 访问量 59 私信 关注

标签:info,totalPrice,存于,python,list,len,爬取,item,builtYear
来源: https://blog.csdn.net/Babyacy/article/details/104590892