其他分享
首页 > 其他分享> > 1001系列之案例0002如何从斯德哥尔摩气温数据集中可视化挖掘

1001系列之案例0002如何从斯德哥尔摩气温数据集中可视化挖掘

作者:互联网

本案例的重点在于Matplotlib可视化的基础操作实战练习。

import os                   #导入必要的库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
os.chdir("D:\Datalu\File")  #指定工作目录
%matplotlib inline          #必要可视化设置
plt.rcParams["font.sans-serif"] = ["KAITI"]
plt.rcParams["axes.unicode_minus"] = False

一、问题的提出

1.查看一年的平均气温
2.查看一月份的气温
3.每个月的平均气温(柱状图和箱线图)

1.1 导入两份气温数据

# Load the outdoor temperature readings (tab-separated; column names supplied here).
df1 = pd.read_csv(
    "temperature_outdoor_2014.tsv",
    sep="\t",
    names=["time", "outdoor"],
)
df1.head(2)
timeoutdoor
013885309864.38
113885315864.25
# Load the indoor temperature readings (tab-separated; column names supplied here).
df2 = pd.read_csv(
    "temperature_indoor_2014.tsv",
    sep="\t",
    names=["time2", "indoor"],
)
df2.head(2)
time2indoor
0138853098621.94
1138853158622.00
# Combine the two datasets by matching timestamps.
# A positional concat (axis=1) pairs row i of one file with row i of the other, but the
# two files do not stay in step: in the positional result the last rows pair
# time=1419975991 with time2=1419977793, so indoor readings land on the wrong timestamp.
# An inner merge on the timestamp keeps only readings taken at the same instant.
# Both key columns ("time" and "time2") are retained, so a later drop of "time2" still works.
# NOTE(review): assumes both loggers record identical epoch seconds; if they can differ
# by a second or two, use pd.merge_asof with a tolerance instead.
df = pd.merge(df1, df2, how="inner", left_on="time", right_on="time2")
df
timeoutdoortime2indoor
013885309864.38138853098621.94
113885315864.25138853158622.00
213885321874.19138853218722.00
313885327874.06138853278722.00
413885333884.06138853338822.00
...............
4954014199759911.44141997779311.75
4954114199765921.50141997839311.75
4954214199771921.50141997899411.75
4954314199777931.56141997959511.75
4954414199783931.62141998019511.81

49545 rows × 4 columns

df.columns
Index(['time', 'outdoor', 'time2', 'indoor'], dtype='object')
# The second timestamp column is redundant after combining; remove it in place.
df.drop(columns="time2", inplace=True)
df.head(2)
timeoutdoorindoor
013885309864.3821.94
113885315864.2522.00
dt1 = df.copy()
数据集一共有三列数据,其中一列是时间戳,两列是气温数据
这里有两种方法可以将时间戳转化为日期时间格式:第一种是在导入文件时直接解析,第二种是导入后用to_datetime方法转换

二、查看数据基本信息

dt1.info(memory_usage="deep")    # 没有自动辨认成时间
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49545 entries, 0 to 49544
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   time     49545 non-null  int64  
 1   outdoor  49545 non-null  float64
 2   indoor   49545 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 1.1 MB
dt1.values   #查看数据集的值
array([[1.38853099e+09, 4.38000000e+00, 2.19400000e+01],
       [1.38853159e+09, 4.25000000e+00, 2.20000000e+01],
       [1.38853219e+09, 4.19000000e+00, 2.20000000e+01],
       ...,
       [1.41997719e+09, 1.50000000e+00, 1.17500000e+01],
       [1.41997779e+09, 1.56000000e+00, 1.17500000e+01],
       [1.41997839e+09, 1.62000000e+00, 1.18100000e+01]])
dt1.values[:,0]   #查看数据集某一列的值
array([1.38853099e+09, 1.38853159e+09, 1.38853219e+09, ...,
       1.41997719e+09, 1.41997779e+09, 1.41997839e+09])
dt1.time.values  #也可以通过列名来查看值
array([1388530986, 1388531586, 1388532187, ..., 1419977192, 1419977793,
       1419978393], dtype=int64)

2.1 将时间戳转换为日期时间格式

dt1["time"] = pd.Timestamp(dt1["time"],unit="s")  # Wrong on purpose: pd.Timestamp builds a single timestamp and cannot convert a whole Series (raises TypeError)
dt1
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-140-255e05936dae> in <module>
----> 1 dt1["time"] = pd.Timestamp(dt1["time"],unit="s") #这样转换时间戳是错误的
      2 dt1


pandas\_libs\tslibs\timestamps.pyx in pandas._libs.tslibs.timestamps.Timestamp.__new__()


pandas\_libs\tslibs\conversion.pyx in pandas._libs.tslibs.conversion.convert_to_tsobject()


TypeError: Cannot convert input [0        1388530986
1        1388531586
2        1388532187
3        1388532787
4        1388533388
            ...    
49540    1419975991
49541    1419976592
49542    1419977192
49543    1419977793
49544    1419978393
Name: time, Length: 49545, dtype: int64] of type <class 'pandas.core.series.Series'> to Timestamp
df["time"] = df["time"].apply(lambda x:pd.TimeStamp(x))  # Wrong on purpose: pandas has no attribute "TimeStamp" (raises AttributeError)
df
---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

<ipython-input-11-ea11b62a5933> in <module>
----> 1 df["time"] = df["time"].apply(lambda x:pd.TimeStamp(x))
      2 df


D:\Anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
   4136             else:
   4137                 values = self.astype(object)._values
-> 4138                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   4139 
   4140         if len(mapped) and isinstance(mapped[0], Series):


pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()


<ipython-input-11-ea11b62a5933> in <lambda>(x)
----> 1 df["time"] = df["time"].apply(lambda x:pd.TimeStamp(x))
      2 df


D:\Anaconda3\lib\site-packages\pandas\__init__.py in __getattr__(name)
    242         return _SparseArray
    243 
--> 244     raise AttributeError(f"module 'pandas' has no attribute '{name}'")
    245 
    246 


AttributeError: module 'pandas' has no attribute 'TimeStamp'
dt1["time"] =  pd.to_datetime(dt1["time"],unit="s")  # Works: to_datetime converts the whole column, reading the integers as seconds
dt1.head(2)
outdoorindoormonthtemperature_diff
time
2014-01-01 00:03:06+01:004.3821.94117.56
2014-01-01 00:13:06+01:004.2522.00117.75
dt1["time"] =  pd.to_datetime(dt1["time"].values,unit="s").tz_localize('UTC')  # Mark the timestamps as UTC (Coordinated Universal Time); the clock values themselves appear unchanged from the previous step
dt1.head(2)
outdoorindoormonthtemperature_diff
time
2014-01-01 00:03:06+01:004.3821.94117.56
2014-01-01 00:13:06+01:004.2522.00117.75
# The readings were taken in Stockholm, so express the UTC timestamps in the
# Europe/Stockholm time zone (local time there is ahead of UTC).
utc_times = pd.to_datetime(dt1["time"].values).tz_localize('UTC')
dt1["time"] = utc_times.tz_convert("Europe/Stockholm")
dt1
# Use the timestamp as the index so that time-based selection becomes convenient.
dt1.set_index("time", inplace=True)

2.2 查看整理好的数据最新信息

dt1.tail(2)
outdoorindoormonthtemperature_diff
time
2014-12-30 23:16:33+01:001.5611.751210.19
2014-12-30 23:26:33+01:001.6211.811210.19
dt1.describe()
outdoorindoor
count49545.00000049545.000000
mean8.46168523.396307
std7.8660084.684381
min-15.50000010.310000
25%2.62000019.810000
50%7.75000022.940000
75%13.88000027.620000
max34.38000033.120000

三、气温随着时间变化可视化

# Draw both temperature series on one figure.
# Each column gets its own colour and label; passing the two-column frame with a single
# colour made the lines indistinguishable and left the legend with nothing to show.
plt.figure(figsize=(12,6),dpi = 100)
for column, colour in (("outdoor", "r"), ("indoor", "b")):
    plt.plot(dt1[column], color=colour, label=column)
plt.title("斯德哥尔摩气温图")
plt.xlabel("时间")
plt.ylabel("气温",rotation =0)
plt.legend()
plt.tight_layout()
plt.show()

在这里插入图片描述

# An anomaly is visible between August and September 2014; zoom in on that row range.
# Only the two temperature columns are selected by name, so derived columns added to dt1
# later are not drawn by accident, and each series is coloured and labelled separately.
zoomed = dt1.iloc[25000:35000]
plt.figure(figsize=(16,6),dpi = 100)
for column, colour in (("outdoor", "r"), ("indoor", "b")):
    plt.plot(zoomed[column], color=colour, label=column)
plt.title("斯德哥尔摩气温图")
plt.xlabel("时间")
plt.ylabel("气温",rotation =0)
plt.legend()
plt.tight_layout()
plt.show()

在这里插入图片描述

# Plot the two columns as separate, individually labelled lines.
plt.figure(figsize=(12, 6), dpi=100)
for column, colour in (("outdoor", "r"), ("indoor", "b")):
    plt.plot(dt1[column], color=colour, label=column)
plt.title("斯德哥尔摩气温状况")
plt.xlabel("时间")
plt.ylabel("气温")
plt.legend()
plt.tight_layout()
plt.show()

在这里插入图片描述

从上图可以看到,曲线中存在间断,数据可能有缺失

查看一年的平均气温

dt1.mean()
outdoor     8.461685
indoor     23.396307
dtype: float64

查看一月份的气温

# Sub-problem: how to select only the January readings.
df1.filter(lambda x:x.index.month == 1)  # Wrong on purpose: DataFrame.filter selects labels via items/like/regex, not a row predicate, so the lambda raises TypeError
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-59-1c58f5c242af> in <module>
      1 #2.查看一月份的气温
      2 #怎么索引出一月的气温
----> 3 df1.filter(lambda x:x.index.month == 1)


D:\Anaconda3\lib\site-packages\pandas\core\generic.py in filter(self, items, like, regex, axis)
   4974         if items is not None:
   4975             name = self._get_axis_name(axis)
-> 4976             return self.reindex(**{name: [r for r in items if r in labels]})
   4977         elif like:
   4978 


TypeError: 'function' object is not iterable
dt1.loc[dt1.index.month==1]  # Method 1: boolean mask built from the month of the datetime index
outdoorindoormonthtemperature_diff
time
2014-01-01 00:03:06+01:004.3821.94117.56
2014-01-01 00:13:06+01:004.2522.00117.75
2014-01-01 00:23:07+01:004.1922.00117.81
2014-01-01 00:33:07+01:004.0622.00117.94
2014-01-01 00:43:08+01:004.0622.00117.94
...............
2014-01-31 23:16:56+01:00-3.8816.31120.19
2014-01-31 23:26:57+01:00-3.8116.31120.12
2014-01-31 23:36:57+01:00-3.8116.31120.12
2014-01-31 23:46:58+01:00-3.7516.31120.06
2014-01-31 23:56:58+01:00-3.6916.38120.07

4452 rows × 4 columns

dt1.loc[(dt1.index>"2014-01-01") & (dt1.index<"2014-02-01")]  # Method 2: boolean mask from comparing the index with date strings (strict bounds on both sides)
outdoorindoor
time
2014-01-01 00:03:06+01:004.3821.94
2014-01-01 00:13:06+01:004.2522.00
2014-01-01 00:23:07+01:004.1922.00
2014-01-01 00:33:07+01:004.0622.00
2014-01-01 00:43:08+01:004.0622.00
.........
2014-01-31 23:16:56+01:00-3.8816.31
2014-01-31 23:26:57+01:00-3.8116.31
2014-01-31 23:36:57+01:00-3.8116.31
2014-01-31 23:46:58+01:00-3.7516.31
2014-01-31 23:56:58+01:00-3.6916.38

4452 rows × 2 columns

dt1["2014-1-1":"2014-1-31"]    # Method 3: slice the rows with date strings
outdoorindoor
time
2014-01-01 00:03:06+01:004.3821.94
2014-01-01 00:13:06+01:004.2522.00
2014-01-01 00:23:07+01:004.1922.00
2014-01-01 00:33:07+01:004.0622.00
2014-01-01 00:43:08+01:004.0622.00
.........
2014-01-31 23:16:56+01:00-3.8816.31
2014-01-31 23:26:57+01:00-3.8116.31
2014-01-31 23:36:57+01:00-3.8116.31
2014-01-31 23:46:58+01:00-3.7516.31
2014-01-31 23:56:58+01:00-3.6916.38

4452 rows × 2 columns

dt1.loc["2014-01"]  # Method 4: partial-string indexing on the datetime index (not a regular expression); .loc is required because dt1["2014-01"] is treated as a column lookup in current pandas
outdoorindoor
time
2014-01-01 00:03:06+01:004.3821.94
2014-01-01 00:13:06+01:004.2522.00
2014-01-01 00:23:07+01:004.1922.00
2014-01-01 00:33:07+01:004.0622.00
2014-01-01 00:43:08+01:004.0622.00
.........
2014-01-31 23:16:56+01:00-3.8816.31
2014-01-31 23:26:57+01:00-3.8116.31
2014-01-31 23:36:57+01:00-3.8116.31
2014-01-31 23:46:58+01:00-3.7516.31
2014-01-31 23:56:58+01:00-3.6916.38

4452 rows × 2 columns

# Derive two extra columns: the calendar month of each reading and the
# indoor-minus-outdoor temperature difference.
month_of_reading = dt1.index.month
dt1["month"] = month_of_reading
dt1["temperature_diff"] = dt1["indoor"].sub(dt1["outdoor"])
dt1.head()
outdoorindoormonthtemperature_diff
time
2014-01-01 00:03:06+01:004.3821.94117.56
2014-01-01 00:13:06+01:004.2522.00117.75
2014-01-01 00:23:07+01:004.1922.00117.81
2014-01-01 00:33:07+01:004.0622.00117.94
2014-01-01 00:43:08+01:004.0622.00117.94
dt1.loc[dt1["month"].eq(1)].iloc[:, :2]  # Method 5: filter on the derived month column, then keep the first two columns
outdoorindoor
time
2014-01-01 00:03:06+01:004.3821.94
2014-01-01 00:13:06+01:004.2522.00
2014-01-01 00:23:07+01:004.1922.00
2014-01-01 00:33:07+01:004.0622.00
2014-01-01 00:43:08+01:004.0622.00
.........
2014-01-31 23:16:56+01:00-3.8816.31
2014-01-31 23:26:57+01:00-3.8116.31
2014-01-31 23:36:57+01:00-3.8116.31
2014-01-31 23:46:58+01:00-3.7516.31
2014-01-31 23:56:58+01:00-3.6916.38

4452 rows × 2 columns

# Plot the January outdoor and indoor temperatures.
january = dt1.loc[dt1.index.month == 1]  # filter once, reuse for both lines
plt.figure(figsize=(16, 6), dpi=100)
for position, (colour, caption) in enumerate((("r", "一月室外气温"), ("b", "一月室内气温"))):
    # Columns are taken by position: 0 is the first column, 1 the second.
    plt.plot(january.iloc[:, position:position + 1], color=colour, label=caption)
plt.title("斯德哥尔摩气温图")
plt.xlabel("时间")
plt.ylabel("气温", rotation=0)
plt.legend()
plt.tight_layout()
plt.show()

在这里插入图片描述

# Inspect how the indoor-outdoor temperature difference varies during January.
# The column is selected by name: an open-ended positional slice (iloc[:, 3:]) would also
# draw any column that is later appended after it, all under the same label.
january_diff = dt1.loc[dt1.index.month == 1, "temperature_diff"]
plt.figure(figsize=(16,6),dpi = 100)
plt.plot(january_diff,color="r",label="一月室内外温差")
plt.title("斯德哥尔摩气温图")
plt.xlabel("时间")
# The y axis shows a temperature difference, so label it as such.
plt.ylabel("温差",rotation =0)
plt.legend()
plt.tight_layout()
plt.show()

在这里插入图片描述

从图中可以看出,1月13日这一天斯德哥尔摩的室内外温差达到一月份的最大值,约为35℃。

每个月的平均气温(柱状图和箱线图)

# Compute the mean of every column per month first; the charts are drawn afterwards.
# Method 1: group the rows by the derived "month" column.
rows_by_month = dt1.groupby("month")
dt2 = rows_by_month.mean()
dt2
outdoorindoortemperature_diff
month
1-1.77664619.86259021.639236
22.23161320.23167518.000063
34.61543719.59729814.981861
48.10519322.15111914.045926
512.26139626.33405314.072656
615.58695528.68702513.100070
720.78031430.6073799.827065
816.49482328.09469811.599875
912.82390526.94929014.125385
109.35200023.37831414.026315
114.99214220.60823915.616097
12-0.06013916.46441816.524557
# Method 2: convert the datetime index to monthly periods, then group by that index
# level and average (a period-based group-by rather than a call to resample).
dt3 = dt1.to_period(freq="M").groupby(level=0).mean()
dt3
outdoorindoormonthtemperature_diff
time
2014-01-1.77664619.862590121.639236
2014-022.23161320.231675218.000063
2014-034.61543719.597298314.981861
2014-048.10519322.151119414.045926
2014-0512.26139626.334053514.072656
2014-0615.58695528.687025613.100070
2014-0720.78031430.60737979.827065
2014-0816.49482328.094698811.599875
2014-0912.82390526.949290914.125385
2014-109.35200023.3783141014.026315
2014-114.99214220.6082391115.616097
2014-12-0.06013916.4644181216.524557
#pandas自带画图
dt2[["outdoor","indoor"]].plot(kind="bar",color=["r","b"],figsize=(12,6))
<matplotlib.axes._subplots.AxesSubplot at 0x19a33701588>

在这里插入图片描述

# Grouped bar chart of the monthly mean temperatures, drawn with matplotlib directly.
plt.figure(figsize=(16,6),dpi = 100)
bar_width = 0.35
months = dt2.index.values
# Offset each bar by half a bar width on either side of the month tick and use the same
# width for drawing, so the paired bars sit side by side instead of overlapping.
plt.bar(months - bar_width / 2,dt2["outdoor"].values,color="r",width=bar_width,label="月平均室外气温")
plt.bar(months + bar_width / 2,dt2["indoor"].values,color="b",width=bar_width,label="月平均室内气温")
plt.xticks(months)  # one tick per month, centred under each pair of bars
plt.title("斯德哥尔摩气温图")
plt.xlabel("月份")
plt.ylabel("气温",rotation =0)
plt.legend()  # without this call the bar labels are never displayed
plt.tight_layout()
plt.show()

在这里插入图片描述

# Draw the two monthly-mean bar charts side by side, one per subplot.
fig, ax = plt.subplots(1, 2, figsize=(8, 4))
fig.subplots_adjust(wspace=0.5)

# (column in dt2, bar colour, legend label) for each panel, left to right.
panels = (
    ("outdoor", "r", "室外气温"),
    ("indoor", "b", "室内气温"),
)
for panel, (column, colour, caption) in zip(ax, panels):
    panel.bar(dt2.index.values, dt2[column].values, color=colour, label=caption)
    panel.set_title("斯德哥尔摩气温图")
    panel.set_xlabel("时间")
    panel.set_ylabel("气温", rotation=0)
    panel.legend()

plt.tight_layout()
plt.show()

在这里插入图片描述

# Box plots of the monthly mean outdoor and indoor temperatures.
plt.figure(figsize=(16, 6), dpi=100)
monthly_means = [dt2[column].values for column in ("outdoor", "indoor")]
plt.boxplot(monthly_means, labels=["室外气温", "室内气温"], whis=1.63)
# Light dotted horizontal grid lines to make the values easier to read off.
plt.grid(axis="y", ls=":", lw=1, color="gray", alpha=0.4)
plt.show()

在这里插入图片描述

标签:00,plt,0002,23,31,01,可视化,dt1,1001
来源: https://blog.csdn.net/lqw844597536/article/details/117262110