机器学习-分析北京二手房价格
作者:互联网
# Import libraries and configure plotting for the analysis.
import random
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor

# `%matplotlib inline` is IPython-only syntax and is a SyntaxError in a plain
# Python file. Request the inline backend through the IPython API when it is
# available, and do nothing when running as an ordinary script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# NOTE(review): this silences every warning, including pandas
# chained-assignment and deprecation warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Use the SimHei font for sans-serif text (plot titles in this file are
# Chinese) and stop matplotlib from using the Unicode minus glyph.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Load the listings CSV into the working frame and preview the first rows.
df = pd.read_csv('北京链家.csv')
df.head()

# df1: rows without any missing value, minus the columns listed below.
# NOTE(review): df1 is rebound to `df` further down before it is read, so
# this cleaned copy appears to be unused - confirm.
unused_columns = ['District', 'Garden', 'Id', 'Year', 'Direction', 'Layout']
df1 = df.dropna().drop(columns=unused_columns)

# Print column dtypes and non-null counts of the full frame.
df.info()
# Handle NaN values: a missing 'Elevator' entry is filled with '无电梯'.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object and
# does not update `df` when pandas Copy-on-Write is active.
df['Elevator'] = df['Elevator'].fillna('无电梯')
# De-duplicate rows directly on `df` (inplace=True modifies the original
# frame) and renumber the index from 0 in the same call.
df.drop_duplicates(inplace=True, ignore_index=True)
# Draw side-by-side box plots of Price and Size to inspect their
# distributions; outlier markers are small with a red edge.
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
outlier_style = {'markeredgecolor': 'red', 'markersize': 4}
for axis, column_name in zip(ax, ['Price', 'Size']):
    df.boxplot(column=[column_name], flierprops=dict(outlier_style), ax=axis)
# Remove listings whose Price is above 1200 (the author considers them of
# little reference value), then print the frame summary again.
is_overpriced = df['Price'] > 1200
df.drop(index=df.loc[is_overpriced].index, inplace=True)
df.info()
# Explore listing count and mean price per Region.
# Two stacked bar charts: number of listings (top) and mean Price (bottom).
fig, ax = plt.subplots(2, 1, figsize=(30, 18))

# Listings per Region, largest first. The x labels are taken from the same
# Series as the bar heights: pairing df['Region'].unique() (appearance
# order) with value-sorted heights would attach the wrong Region to each bar.
y0 = df.groupby(by=['Region']).size().sort_values(ascending=False)
# x/y are passed by keyword; recent seaborn releases reject positional data.
sns.barplot(x=y0.index, y=y0.values, ax=ax[0], palette='BuPu_r')
ax[0].set_title('北京各区二手房在售数量')
ax[0].title.set_size(35)

# Mean Price per Region, highest first, labelled from its own index.
y1 = df.groupby(by=['Region'])['Price'].mean().sort_values(ascending=False)
sns.barplot(x=y1.index, y=y1.values, ax=ax[1], palette='Blues_r')
ax[1].set_title('北京各区二手房在售均价')
ax[1].title.set_size(35)
# Explore the relationship between Direction and Price.
plt.figure(figsize=(30, 8))

# Mean Price per Direction, highest first.
y_d = df.groupby(by=['Direction'])['Price'].mean().sort_values(ascending=False)
# Labels come from the grouped Series itself so each bar carries its own
# Direction; df['Direction'].unique() is in appearance order and would not
# line up with the sorted means.
x_d = y_d.index
sns.barplot(x=x_d, y=y_d.values, palette='Greens_r')
# Explore the effect of floor on price by bucketing Floor into five groups
# labelled 0-4 and adding the bucket to the frame.
# right=False makes the intervals left-closed: [1, 9), [9, 17), [17, 25),
# [25, 33), [33, 41). Values outside [1, 41) become NaN.
floor = df['Floor']
floor_edges = list(range(1, 42, 8))   # [1, 9, 17, 25, 33, 41]
bucket_labels = list(range(5))        # [0, 1, 2, 3, 4]
floor_binary = pd.cut(floor, floor_edges, labels=bucket_labels, right=False)

# Store the bucket as float (.astype converts the categorical labels).
df['floor_binary'] = floor_binary.astype('float')
# Bar chart of mean Price per floor bucket.
plt.figure(figsize=(16, 8))

# Mean Price per floor_binary value, highest first.
y_f = df.groupby(by=['floor_binary'])['Price'].mean().sort_values(ascending=False)
# Take the labels from the grouped Series. df['floor_binary'].unique() is in
# appearance order and may contain NaN (which groupby drops), so it can
# mismatch the means in both order and length.
x_f = y_f.index
sns.barplot(x=x_f, y=y_f.values, palette='Reds_r')
# Relationship between Renovation and Price.
plt.figure(figsize=(16, 8))

# Mean Price per Renovation value, highest first.
y = df.groupby(by=['Renovation'])['Price'].mean().sort_values(ascending=False)
# Labels are read from the same Series as the heights; using
# df['Renovation'].unique() would pair appearance-ordered labels with
# value-sorted means.
x = y.index
# Plot only the four highest-priced categories.
sns.barplot(x=x[:4], y=y.values[:4], palette='Oranges_r')
# Author's observation: '精装' listings are priced rather high.
# Relationship between Year and Price.
# Two stacked bar charts: listing count (top) and mean Price (bottom).
fig, ax = plt.subplots(2, 1, figsize=(30, 18))

# Listings per Year, largest first. Labels come from the grouped Series so
# each bar is tagged with its own Year; df['Year'].unique() is in appearance
# order and would not line up with the sorted counts.
y0 = df.groupby(by=['Year']).size().sort_values(ascending=False)
sns.barplot(x=y0.index, y=y0.values, ax=ax[0], palette='Reds_d')
ax[0].set_title('北京各房龄二手房在售数量')

# Mean Price per Year, highest first, labelled from its own index.
y1 = df.groupby(by=['Year'])['Price'].mean().sort_values(ascending=False)
sns.barplot(x=y1.index, y=y1.values, ax=ax[1], palette='Blues_r')
ax[1].set_title('北京各房龄二手房在售均价')
# Encode the Elevator and Renovation labels as integers.
# The whole column is rebuilt and assigned back rather than written through
# chained indexing (df.Elevator[mask] = 1): chained assignment may write to a
# temporary copy and is a no-op under pandas Copy-on-Write.
# Labels missing from a mapping are passed through unchanged.
label_codes = {
    'Elevator': {'有电梯': 1, '无电梯': 0},
    'Renovation': {'精装': 1, '简装': 2, '毛坯': 3, '其他': 0},
}
for column, codes in label_codes.items():
    # codes=codes binds this column's mapping at definition time.
    df[column] = df[column].map(lambda value, codes=codes: codes.get(value, value))

# Original author's note: doing this previously turned everything into NaN,
# so df1 is "restarted" here to be safe.
# NOTE(review): `df1 = df` is an alias, not a copy, so the Region mapping
# below also changes `df`, which the later df.corr() call reads. Running the
# mapping a second time will turn Region into NaN, because Series.map yields
# NaN for values that are not keys of map1.
df1 = df
map1 = {
    '东城': 13,
    '西城': 12,
    '朝阳': 11,
    '海淀': 10,
    '丰台': 9,
    '昌平': 8,
    '大兴': 7,
    '房山': 6,
    '门头沟': 5,
    '顺义': 4,
    '亦庄开发区': 3,
    '通州': 2,
    '石景山': 1,
}
df1['Region'] = df1['Region'].map(map1)
df1
# Correlation of every numeric column with Price.
# Keep only numeric/bool columns first: `df` still holds text columns, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0 instead of
# silently skipping it as older releases did.
df.select_dtypes(include=['number', 'bool']).corr()[['Price']]
# Set the plotting style.
sns.set_style('whitegrid')

# Heatmap of each numeric column's correlation with Price.
# Text columns are dropped before corr(): pandas >= 2.0 raises on
# non-numeric data instead of silently skipping it.
price_corr = df.select_dtypes(include=['number', 'bool']).corr()[['Price']]
sns.heatmap(price_corr, annot=True, vmax=1, square=True, cmap='bwr_r')
标签:False,机器,df,学习,二手房,sns,ax,import,Renovation 来源: https://www.cnblogs.com/158-174/p/16478943.html