本案例的重点在于Matplotlib可视化的基础操作实战练习。
import os #导入必要的库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
os.chdir("D:\Datalu\File") #指定工作目录
%matplotlib inline #必要可视化设置
plt.rcParams["font.sans-serif"] = ["KAITI"]
plt.rcParams["axes.unicode_minus"] = False
一、问题的提出
1.查看一年的平均气温
2.查看一月份的气温
3.每个月的平均气温(柱状图和箱线图)
1.1 导入两份气温数据
# Load the outdoor temperature readings (tab-separated, column names supplied here)
df1 = pd.read_csv(
    "temperature_outdoor_2014.tsv",
    sep="\t",
    names=["time", "outdoor"],
)
df1.head(2)
time outdoor 0 1388530986 4.38 1 1388531586 4.25
# Load the indoor temperature readings (tab-separated, column names supplied here)
df2 = pd.read_csv(
    "temperature_indoor_2014.tsv",
    sep="\t",
    names=["time2", "indoor"],
)
df2.head(2)
time2 indoor 0 1388530986 21.94 1 1388531586 22.00
# Merge the two datasets on their timestamps.
# A positional concat (axis=1) pairs rows by row number, which misaligns the
# readings as soon as one file has gaps the other lacks. Matching every
# outdoor reading with the indoor reading closest in time avoids that.
# merge_asof needs both key columns sorted; "time2" is kept in the result so
# the later cells that inspect and drop it keep working.
df = pd.merge_asof(
    df1.sort_values("time"),
    df2.sort_values("time2"),
    left_on="time",
    right_on="time2",
    direction="nearest",
)
df
time outdoor time2 indoor 0 1388530986 4.38 1388530986 21.94 1 1388531586 4.25 1388531586 22.00 2 1388532187 4.19 1388532187 22.00 3 1388532787 4.06 1388532787 22.00 4 1388533388 4.06 1388533388 22.00 ... ... ... ... ... 49540 1419975991 1.44 1419977793 11.75 49541 1419976592 1.50 1419978393 11.75 49542 1419977192 1.50 1419978994 11.75 49543 1419977793 1.56 1419979595 11.75 49544 1419978393 1.62 1419980195 11.81
49545 rows × 4 columns
# List the column labels of the merged frame
df.columns
Index(['time', 'outdoor', 'time2', 'indoor'], dtype='object')
# Remove the second timestamp column in place, then preview the result
df.drop(columns="time2", inplace=True)
df.head(2)
time outdoor indoor 0 1388530986 4.38 21.94 1 1388531586 4.25 22.00
# Work on a copy named dt1 from here on
dt1 = df.copy()
数据集一共有三列数据,其中一列是时间戳,两列是气温数据
这里有两种方法可以将时间戳转换为日期时间格式:第一种是在导入文件时直接解析,第二种是使用 to_datetime 方法。
二、查看数据基本信息
dt1.info(memory_usage="deep")  # the time column was not recognised as a datetime automatically
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49545 entries, 0 to 49544
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 time 49545 non-null int64
1 outdoor 49545 non-null float64
2 indoor 49545 non-null float64
dtypes: float64(2), int64(1)
memory usage: 1.1 MB
dt1.values  # inspect the underlying values of the dataset
array([[1.38853099e+09, 4.38000000e+00, 2.19400000e+01],
[1.38853159e+09, 4.25000000e+00, 2.20000000e+01],
[1.38853219e+09, 4.19000000e+00, 2.20000000e+01],
...,
[1.41997719e+09, 1.50000000e+00, 1.17500000e+01],
[1.41997779e+09, 1.56000000e+00, 1.17500000e+01],
[1.41997839e+09, 1.62000000e+00, 1.18100000e+01]])
dt1.values[:,0]  # inspect the values of one column (the first one)
array([1.38853099e+09, 1.38853159e+09, 1.38853219e+09, ...,
1.41997719e+09, 1.41997779e+09, 1.41997839e+09])
dt1.time.values  # the values can also be reached through the column name
array([1388530986, 1388531586, 1388532187, ..., 1419977192, 1419977793,
1419978393], dtype=int64)
2.1 将时间戳转换为日期时间格式
dt1["time"] = pd.Timestamp(dt1["time"],unit="s")  # deliberately wrong: pd.Timestamp cannot convert a whole Series (TypeError)
dt1
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-140-255e05936dae> in <module>
----> 1 dt1["time"] = pd.Timestamp(dt1["time"],unit="s") #这样转换时间戳是错误的
2 dt1
pandas\_libs\tslibs\timestamps.pyx in pandas._libs.tslibs.timestamps.Timestamp.__new__()
pandas\_libs\tslibs\conversion.pyx in pandas._libs.tslibs.conversion.convert_to_tsobject()
TypeError: Cannot convert input [0 1388530986
1 1388531586
2 1388532187
3 1388532787
4 1388533388
...
49540 1419975991
49541 1419976592
49542 1419977192
49543 1419977793
49544 1419978393
Name: time, Length: 49545, dtype: int64] of type <class 'pandas.core.series.Series'> to Timestamp
df["time"] = df["time"].apply(lambda x:pd.TimeStamp(x))  # also wrong: pandas has no attribute `TimeStamp` (AttributeError)
df
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-11-ea11b62a5933> in <module>
----> 1 df["time"] = df["time"].apply(lambda x:pd.TimeStamp(x))
2 df
D:\Anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
4136 else:
4137 values = self.astype(object)._values
-> 4138 mapped = lib.map_infer(values, f, convert=convert_dtype)
4139
4140 if len(mapped) and isinstance(mapped[0], Series):
pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()
<ipython-input-11-ea11b62a5933> in <lambda>(x)
----> 1 df["time"] = df["time"].apply(lambda x:pd.TimeStamp(x))
2 df
D:\Anaconda3\lib\site-packages\pandas\__init__.py in __getattr__(name)
242 return _SparseArray
243
--> 244 raise AttributeError(f"module 'pandas' has no attribute '{name}'")
245
246
AttributeError: module 'pandas' has no attribute 'TimeStamp'
dt1["time"] = pd.to_datetime(dt1["time"],unit="s")  # pd.to_datetime does the conversion (values interpreted as seconds)
dt1.head(2)
outdoor indoor month temperature_diff time 2014-01-01 00:03:06+01:00 4.38 21.94 1 17.56 2014-01-01 00:13:06+01:00 4.25 22.00 1 17.75
# Mark the timestamps as UTC (Coordinated Universal Time); the displayed times seem to match the previous step
# NOTE(review): the column was already converted to datetime by an earlier cell, so unit="s" is presumably redundant here — confirm
dt1["time"] = pd.to_datetime(dt1["time"].values,unit="s").tz_localize('UTC')
dt1.head(2)
outdoor indoor month temperature_diff time 2014-01-01 00:03:06+01:00 4.38 21.94 1 17.56 2014-01-01 00:13:06+01:00 4.25 22.00 1 17.75
# The readings are in European local time, so the UTC timestamps still have to be
# converted to Europe/Stockholm (one hour apart from the times shown above)
dt1["time"] = pd.to_datetime(dt1["time"].values).tz_localize('UTC').tz_convert("Europe/Stockholm")
dt1
# Then use the timestamp as the index, which makes time-based operations easier
dt1.set_index("time",inplace =True)
2.2 查看整理好的数据最新信息
# Show the last two rows of the prepared data
dt1.tail(2)
outdoor indoor month temperature_diff time 2014-12-30 23:16:33+01:00 1.56 11.75 12 10.19 2014-12-30 23:26:33+01:00 1.62 11.81 12 10.19
# Summary statistics of the columns
dt1.describe()
outdoor indoor count 49545.000000 49545.000000 mean 8.461685 23.396307 std 7.866008 4.684381 min -15.500000 10.310000 25% 2.620000 19.810000 50% 7.750000 22.940000 75% 13.880000 27.620000 max 34.380000 33.120000
三、气温随着时间变化可视化
# Plot both temperature series with a single plot call
plt.figure(figsize=(12, 6), dpi=100)
# No fixed color: Matplotlib then gives each column its own color, so the two
# lines can be told apart (forcing color="r" drew both of them in red).
plt.plot(dt1[["outdoor", "indoor"]])
plt.title("斯德哥尔摩气温图")
plt.xlabel("时间")
plt.ylabel("气温", rotation=0)
# The lines carry no labels, so a bare plt.legend() would be empty; name them
# explicitly, in column order.
plt.legend(["outdoor", "indoor"])
plt.tight_layout()
plt.show()
# An anomaly shows up between August and September 2014; zoom in on that
# stretch of rows to look at the detail
zoom_fig, zoom_ax = plt.subplots(figsize=(16, 6), dpi=100)
zoom_ax.plot(dt1.iloc[25000:35000, :], color="r")
zoom_ax.set_title("斯德哥尔摩气温图")
zoom_ax.set_xlabel("时间")
zoom_ax.set_ylabel("气温", rotation=0)
zoom_fig.tight_layout()
plt.show()
# Draw the two columns as separate, individually labelled lines
split_fig, split_ax = plt.subplots(figsize=(12, 6), dpi=100)
for column, line_color in (("outdoor", "r"), ("indoor", "b")):
    split_ax.plot(dt1[column], color=line_color, label=column)
split_ax.set_title("斯德哥尔摩气温状况")
split_ax.set_xlabel("时间")
split_ax.set_ylabel("气温")
split_ax.legend()
split_fig.tight_layout()
plt.show()
从上图可以看到,数据中可能存在缺失。
查看一年的平均气温
# Mean of every column over all rows
dt1.mean()
outdoor 8.461685
indoor 23.396307
dtype: float64
查看一月份的气温
# Sub-problem: how to select the January temperatures
df1.filter(lambda x:x.index.month == 1)  # this does not work: passing a function to filter fails with a TypeError
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-59-1c58f5c242af> in <module>
1 #2.查看一月份的气温
2 #怎么索引出一月的气温
----> 3 df1.filter(lambda x:x.index.month == 1)
D:\Anaconda3\lib\site-packages\pandas\core\generic.py in filter(self, items, like, regex, axis)
4974 if items is not None:
4975 name = self._get_axis_name(axis)
-> 4976 return self.reindex(**{name: [r for r in items if r in labels]})
4977 elif like:
4978
TypeError: 'function' object is not iterable
dt1.loc[dt1.index.month==1]  # Method 1: boolean mask built from the index
outdoor indoor month temperature_diff time 2014-01-01 00:03:06+01:00 4.38 21.94 1 17.56 2014-01-01 00:13:06+01:00 4.25 22.00 1 17.75 2014-01-01 00:23:07+01:00 4.19 22.00 1 17.81 2014-01-01 00:33:07+01:00 4.06 22.00 1 17.94 2014-01-01 00:43:08+01:00 4.06 22.00 1 17.94 ... ... ... ... ... 2014-01-31 23:16:56+01:00 -3.88 16.31 1 20.19 2014-01-31 23:26:57+01:00 -3.81 16.31 1 20.12 2014-01-31 23:36:57+01:00 -3.81 16.31 1 20.12 2014-01-31 23:46:58+01:00 -3.75 16.31 1 20.06 2014-01-31 23:56:58+01:00 -3.69 16.38 1 20.07
4452 rows × 4 columns
dt1.loc[(dt1.index>"2014-01-01") & (dt1.index<"2014-02-01")]  # Method 2: filter with comparison conditions on the index
outdoor indoor time 2014-01-01 00:03:06+01:00 4.38 21.94 2014-01-01 00:13:06+01:00 4.25 22.00 2014-01-01 00:23:07+01:00 4.19 22.00 2014-01-01 00:33:07+01:00 4.06 22.00 2014-01-01 00:43:08+01:00 4.06 22.00 ... ... ... 2014-01-31 23:16:56+01:00 -3.88 16.31 2014-01-31 23:26:57+01:00 -3.81 16.31 2014-01-31 23:36:57+01:00 -3.81 16.31 2014-01-31 23:46:58+01:00 -3.75 16.31 2014-01-31 23:56:58+01:00 -3.69 16.38
4452 rows × 2 columns
dt1["2014-1-1":"2014-1-31"]  # Method 3: slice on the index
outdoor indoor time 2014-01-01 00:03:06+01:00 4.38 21.94 2014-01-01 00:13:06+01:00 4.25 22.00 2014-01-01 00:23:07+01:00 4.19 22.00 2014-01-01 00:33:07+01:00 4.06 22.00 2014-01-01 00:43:08+01:00 4.06 22.00 ... ... ... 2014-01-31 23:16:56+01:00 -3.88 16.31 2014-01-31 23:26:57+01:00 -3.81 16.31 2014-01-31 23:36:57+01:00 -3.81 16.31 2014-01-31 23:46:58+01:00 -3.75 16.31 2014-01-31 23:56:58+01:00 -3.69 16.38
4452 rows × 2 columns
# Method 4: partial string indexing on the time index (this is not a regular expression).
# Go through .loc: selecting rows with frame["2014-01"] was deprecated and is
# no longer supported in pandas 2.x.
dt1.loc["2014-01"]
outdoor indoor time 2014-01-01 00:03:06+01:00 4.38 21.94 2014-01-01 00:13:06+01:00 4.25 22.00 2014-01-01 00:23:07+01:00 4.19 22.00 2014-01-01 00:33:07+01:00 4.06 22.00 2014-01-01 00:43:08+01:00 4.06 22.00 ... ... ... 2014-01-31 23:16:56+01:00 -3.88 16.31 2014-01-31 23:26:57+01:00 -3.81 16.31 2014-01-31 23:36:57+01:00 -3.81 16.31 2014-01-31 23:46:58+01:00 -3.75 16.31 2014-01-31 23:56:58+01:00 -3.69 16.38
4452 rows × 2 columns
# Derive two extra features: the calendar month of each reading and the
# indoor-minus-outdoor temperature difference
dt1["month"] = dt1.index.month
dt1["temperature_diff"] = dt1["indoor"].sub(dt1["outdoor"])
dt1.head()
outdoor indoor month temperature_diff time 2014-01-01 00:03:06+01:00 4.38 21.94 1 17.56 2014-01-01 00:13:06+01:00 4.25 22.00 1 17.75 2014-01-01 00:23:07+01:00 4.19 22.00 1 17.81 2014-01-01 00:33:07+01:00 4.06 22.00 1 17.94 2014-01-01 00:43:08+01:00 4.06 22.00 1 17.94
# Method 5: filter through the newly added month feature, keeping the first two columns
is_january = dt1["month"].eq(1)
dt1.loc[is_january].iloc[:, 0:2]
outdoor indoor time 2014-01-01 00:03:06+01:00 4.38 21.94 2014-01-01 00:13:06+01:00 4.25 22.00 2014-01-01 00:23:07+01:00 4.19 22.00 2014-01-01 00:33:07+01:00 4.06 22.00 2014-01-01 00:43:08+01:00 4.06 22.00 ... ... ... 2014-01-31 23:16:56+01:00 -3.88 16.31 2014-01-31 23:26:57+01:00 -3.81 16.31 2014-01-31 23:36:57+01:00 -3.81 16.31 2014-01-31 23:46:58+01:00 -3.75 16.31 2014-01-31 23:56:58+01:00 -3.69 16.38
4452 rows × 2 columns
# Plot the January outdoor and indoor temperatures
january = dt1.loc[dt1.index.month == 1]
jan_fig, jan_ax = plt.subplots(figsize=(16, 6), dpi=100)
jan_ax.plot(january.iloc[:, 0:1], color="r", label="一月室外气温")
jan_ax.plot(january.iloc[:, 1:2], color="b", label="一月室内气温")
jan_ax.set_title("斯德哥尔摩气温图")
jan_ax.set_xlabel("时间")
jan_ax.set_ylabel("气温", rotation=0)
jan_ax.legend()
jan_fig.tight_layout()
plt.show()
# Look at how much the indoor/outdoor temperature difference varies in January
# (columns from position 3 onwards are selected)
january_diff = dt1.loc[dt1.index.month == 1].iloc[:, 3:]
diff_fig, diff_ax = plt.subplots(figsize=(16, 6), dpi=100)
diff_ax.plot(january_diff, color="r", label="一月室内外温差")
diff_ax.set_title("斯德哥尔摩气温图")
diff_ax.set_xlabel("时间")
diff_ax.set_ylabel("气温", rotation=0)
diff_ax.legend()
diff_fig.tight_layout()
plt.show()
1月13日这一天,斯德哥尔摩的室内外温差达到一月份的最大值,为35度。
每个月的平均气温(柱状图和箱线图)
# Compute the mean temperature of every month first, then plot it
# Method 1: group by the month feature derived from the timestamps
dt2 = dt1.groupby("month").mean()
dt2
outdoor indoor temperature_diff month 1 -1.776646 19.862590 21.639236 2 2.231613 20.231675 18.000063 3 4.615437 19.597298 14.981861 4 8.105193 22.151119 14.045926 5 12.261396 26.334053 14.072656 6 15.586955 28.687025 13.100070 7 20.780314 30.607379 9.827065 8 16.494823 28.094698 11.599875 9 12.823905 26.949290 14.125385 10 9.352000 23.378314 14.026315 11 4.992142 20.608239 15.616097 12 -0.060139 16.464418 16.524557
# Method 2: resampling, done here by converting the index to monthly periods
# and averaging over the index level
dt3 = dt1.to_period(freq="M").groupby(level=0).mean()
dt3
outdoor indoor month temperature_diff time 2014-01 -1.776646 19.862590 1 21.639236 2014-02 2.231613 20.231675 2 18.000063 2014-03 4.615437 19.597298 3 14.981861 2014-04 8.105193 22.151119 4 14.045926 2014-05 12.261396 26.334053 5 14.072656 2014-06 15.586955 28.687025 6 13.100070 2014-07 20.780314 30.607379 7 9.827065 2014-08 16.494823 28.094698 8 11.599875 2014-09 12.823905 26.949290 9 14.125385 2014-10 9.352000 23.378314 10 14.026315 2014-11 4.992142 20.608239 11 15.616097 2014-12 -0.060139 16.464418 12 16.524557
# Bar chart drawn with the plotting interface built into pandas
dt2.loc[:, ["outdoor", "indoor"]].plot.bar(color=["r", "b"], figsize=(12, 6))
<matplotlib.axes._subplots.AxesSubplot at 0x19a33701588>
# Grouped bar chart of the monthly mean temperatures, drawn with Matplotlib
plt.figure(figsize=(16, 6), dpi=100)
bar_width = 0.35
months = dt2.index.values
# Use bar_width for both the bar width and the offset so the two bars of a
# month sit side by side, centred on the month tick, without overlapping.
plt.bar(months - bar_width / 2, dt2["outdoor"].values, color="r", width=bar_width, label="室外气温")
plt.bar(months + bar_width / 2, dt2["indoor"].values, color="b", width=bar_width, label="室内气温")
# One tick per month
plt.xticks(months)
plt.title("斯德哥尔摩气温图")
plt.xlabel("时间")
plt.ylabel("气温", rotation=0)
# The labels above are only shown once the legend is drawn
plt.legend()
plt.tight_layout()
plt.show()
# Draw the two bar charts next to each other, one panel per column of dt2
fig, ax = plt.subplots(1, 2, figsize=(8, 4))
fig.subplots_adjust(wspace=0.5)
panels = (
    ("outdoor", "r", "室外气温"),
    ("indoor", "b", "室内气温"),
)
for axis, (column, bar_color, bar_label) in zip(ax, panels):
    axis.bar(dt2.index.values, dt2[column].values, color=bar_color, label=bar_label)
    axis.set_title("斯德哥尔摩气温图")
    axis.set_xlabel("时间")
    axis.set_ylabel("气温", rotation=0)
    axis.legend()
plt.tight_layout()
plt.show()
# Box plots of the monthly mean outdoor and indoor temperatures held in dt2
plt.figure(figsize=(16, 6), dpi=100)
box_data = [dt2[column].values for column in ("outdoor", "indoor")]
plt.boxplot(box_data, labels=["室外气温", "室内气温"], whis=1.63)
# Light dotted horizontal grid lines to make the values easier to read
plt.grid(axis="y", ls=":", lw=1, color="gray", alpha=0.4)
plt.show()
标签: 00 ,plt ,0002 ,23 ,31 ,01 ,可视化 ,dt1 ,1001
来源: https://blog.csdn.net/lqw844597536/article/details/117262110