其他分享
首页 > 其他分享 > 机器学习-分析北京二手房价格

机器学习-分析北京二手房价格

作者:互联网

#导入库
import numpy as np
import pandas as pd
import random
from datetime import datetime
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline
import warnings
# Suppress every warning for the rest of the session. Note that this also
# hides deprecation notices emitted by pandas and seaborn.
warnings.filterwarnings('ignore')
# Plot text settings: SimHei as the sans-serif font (the chart titles below
# are Chinese) and an ASCII hyphen instead of the Unicode minus sign on axes.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

 

# Load the listing data from the CSV file into df.
df = pd.read_csv('北京链家.csv')
# Preview of the first rows (only rendered when this is the last expression
# of a notebook cell).
df.head()
# df1 is a reduced frame: rows containing any missing value are removed and
# the columns listed below are dropped. df itself is left untouched.
columns_to_discard = ['District', 'Garden', 'Id', 'Year', 'Direction', 'Layout']
df1 = df.dropna()
df1.drop(columns=columns_to_discard, inplace=True)

 




# Print column dtypes and non-null counts before cleaning.
df.info()
# Handle missing values: a missing Elevator entry is treated as "no elevator".
# The filled Series is assigned back to the column on purpose. Calling
# fillna(inplace=True) on df['Elevator'] works on an intermediate Series; it
# is deprecated chained assignment and leaves df unchanged once pandas
# Copy-on-Write is active.
df['Elevator'] = df['Elevator'].fillna('无电梯')
# Remove duplicated rows directly on df (inplace=True), then rebuild a
# contiguous 0..n-1 index so the removed rows leave no gaps.
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
# Box plots of Price and Size, side by side, to inspect how the values are
# distributed. Outlier markers are drawn small with a red edge.
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
outlier_marker = {'markeredgecolor': 'red', 'markersize': 4}
for panel, plotted_column in enumerate(['Price', 'Size']):
    df.boxplot(column=[plotted_column], flierprops=outlier_marker, ax=ax[panel])
# Drop listings whose Price is above 1200; the author considers them of
# little reference value. The remaining index labels are not renumbered.
overpriced_rows = df.index[df['Price'] > 1200]
df.drop(index=overpriced_rows, inplace=True)
df.info()
# Number of listings and mean price per Region, each sorted in descending
# order and drawn as its own bar chart (two stacked panels).
fig, ax = plt.subplots(2, 1, figsize=(30, 18))

# Bar labels are taken from each aggregated Series' own index. Pairing the
# sorted values with df['Region'].unique() (appearance order) placed the
# wrong region name under the bars. Arguments are passed by keyword because
# seaborn >= 0.12 no longer accepts positional x/y.
y0 = df.groupby(by=['Region']).size().sort_values(ascending=False)
regions_by_count = list(y0.index)
sns.barplot(x=regions_by_count, y=y0, order=regions_by_count,
            ax=ax[0], palette='BuPu_r')
ax[0].set_title('北京各区二手房在售数量')
ax[0].title.set_size(35)

# Mean Price per Region; this ranking differs from the count ranking above,
# so it needs its own label list.
y1 = df.groupby(by=['Region'])['Price'].mean().sort_values(ascending=False)
regions_by_price = list(y1.index)
sns.barplot(x=regions_by_price, y=y1, order=regions_by_price,
            ax=ax[1], palette='Blues_r')
ax[1].set_title('北京各区二手房在售均价')
ax[1].title.set_size(35)
# Mean Price per Direction value, highest first.
plt.figure(figsize=(30, 8))
y_d = df.groupby(by=['Direction'])['Price'].mean().sort_values(ascending=False)
# Labels come from the sorted result itself so every bar carries its own
# Direction; df['Direction'].unique() is in appearance order and does not
# line up with the sorted means. Keyword arguments are required by
# seaborn >= 0.12.
x_d = list(y_d.index)
sns.barplot(x=x_d, y=y_d, order=x_d, palette='Greens_r')
# Effect of the floor on price. Floor is bucketed into five half-open bins
# [1, 9), [9, 17), [17, 25), [25, 33), [33, 41) labelled 0..4 (right=False
# excludes the upper edge). Values outside these bins become NaN.
floor = df['Floor']
floor_binary = pd.cut(floor, [1, 9, 17, 25, 33, 41], labels=[0, 1, 2, 3, 4], right=False)
# Store the bucket on df as float rather than categorical.
df['floor_binary'] = floor_binary.astype('float')

plt.figure(figsize=(16, 8))
# Mean Price per bucket, highest first. groupby leaves NaN buckets out, so
# labels are read from the result's index. df['floor_binary'].unique() may
# contain NaN and is in appearance order, which does not match these values.
y_f = df.groupby(by=['floor_binary'])['Price'].mean().sort_values(ascending=False)
x_f = list(y_f.index)
# order= keeps the descending-price layout; without it seaborn arranges
# numeric categories in ascending numeric order.
sns.barplot(x=x_f, y=y_f, order=x_f, palette='Reds_r')
# Relationship between Renovation and price: mean Price per Renovation
# value, highest first, showing the top four.
plt.figure(figsize=(16, 8))

y = df.groupby(by=['Renovation'])['Price'].mean().sort_values(ascending=False)
# Take labels and values from the same sorted Series. Slicing
# df['Renovation'].unique()[:4] and y[:4] separately paired each mean with
# an unrelated label.
top_renovations = y.head(4)
x = list(top_renovations.index)
sns.barplot(x=x, y=top_renovations, order=x, palette='Oranges_r')
# Author's original observation: well-renovated homes are priced quite high.
# It was made from the mislabelled chart, so re-check it against this one.
# Relationship between the Year column and price: number of listings and
# mean Price per Year value, each sorted in descending order.
fig, ax = plt.subplots(2, 1, figsize=(30, 18))

# Labels are read from each sorted Series' index; df['Year'].unique() is in
# appearance order and mislabels the sorted bars. order= preserves the
# descending layout, because seaborn otherwise sorts numeric categories
# ascending. Keyword arguments are required by seaborn >= 0.12.
y0 = df.groupby(by=['Year']).size().sort_values(ascending=False)
years_by_count = list(y0.index)
sns.barplot(x=years_by_count, y=y0, order=years_by_count,
            ax=ax[0], palette='Reds_d')
ax[0].set_title('北京各房龄二手房在售数量')

y1 = df.groupby(by=['Year'])['Price'].mean().sort_values(ascending=False)
years_by_price = list(y1.index)
sns.barplot(x=years_by_price, y=y1, order=years_by_price,
            ax=ax[1], palette='Blues_r')
ax[1].set_title('北京各房龄二手房在售均价')
# Encode the text categories as numbers so the columns can take part in the
# correlation analysis.
elevator_codes = {'有电梯': 1, '无电梯': 0}
renovation_codes = {'精装': 1, '简装': 2, '毛坯': 3, '其他': 0}
map1 = {'东城':13, '西城':12, '朝阳':11, '海淀':10, '丰台':9, '昌平':8, '大兴':7, '房山':6, '门头沟':5, '顺义':4,
       '亦庄开发区':3, '通州':2, '石景山':1}

# The column is assigned as a whole. The previous `df.Elevator[mask] = 1`
# form is chained assignment: it writes through an intermediate Series, is
# silently ineffective under pandas Copy-on-Write, and leaves an object
# column.
# codes.get(value, value) passes values that are already numeric straight
# through, so re-running this in a notebook does not wipe the column.
# to_numeric(errors='coerce') then yields a numeric dtype; any label missing
# from the mapping becomes NaN.
for column, codes in (('Elevator', elevator_codes), ('Renovation', renovation_codes)):
    recoded = df[column].map(lambda value, codes=codes: codes.get(value, value))
    df[column] = pd.to_numeric(recoded, errors='coerce')

# The original author noted that an earlier attempt turned every value into
# NaN and re-bound df1 here. df1 is another name for df (not a copy), so the
# Region encoding below is visible through df as well.
df1 = df
# Region -> numeric code, with the same re-run-safe scheme as above; regions
# absent from map1 become NaN.
region_recoded = df1['Region'].map(lambda value: map1.get(value, value))
df1['Region'] = pd.to_numeric(region_recoded, errors='coerce')
df1
# Correlation of each numeric column with Price, as a one-column frame.
# Numeric (and boolean) columns are selected first: DataFrame.corr() raises
# on text columns in pandas >= 2.0, and select_dtypes works on older
# versions too. The result is computed once and reused for the heatmap.
price_corr = df.select_dtypes(include=['number', 'bool']).corr()[['Price']]
price_corr
# Set the plot style, then draw the annotated heatmap.
sns.set_style('whitegrid')
sns.heatmap(price_corr, annot=True, vmax=1, square=True, cmap='bwr_r')

 

 

标签:False,机器,df,学习,二手房,sns,ax,import,Renovation
来源: https://www.cnblogs.com/158-174/p/16478943.html