首页 > 编程语言 > python-探索性数据分析-粮农组织分析
python-探索性数据分析-粮农组织分析

2021-11-30 19:03:21 作者：互联网
python-探索性数据分析-粮农组织分析

消除饥饿，消除贫困，自然资源循环利用探索性分析案例
代码

#导入需要的常用库
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import os,sys
import warnings
import folium
import missingno as msno
warnings.filterwarnings('ignore')
sns.set_context("poster",font_scale=1.3)
import gzip
import scipy

# Load the data file (gzip-compressed CSV)
data = pd.read_csv('aquastat.csv.gzip', compression='gzip')
print(data.head())
print(data.shape)
# DataFrame.info is a method that prints its own summary; printing the bound
# method without calling it only shows the method's repr.
data.info()
# List the distinct indicators (short name, full description)
print(data[['variable', 'variable_full']].drop_duplicates())
# Number of countries covered
print(data.country.nunique())
countries = data.country.unique()
# Number of time periods covered
print(data.time_period.nunique())
time_periods = data.time_period.unique()
print(time_periods)
# NOTE(review): presumably period reference years; not used in the visible code
mid_periods = range(1960, 2017, 5)
# Check whether the total-area indicator is complete (count of missing values)
print(data[data.variable == 'total_area'].value.isnull().sum())
# Slice
# Cross-section: every country's indicators within a single time period
def time_slice(df, time_period):
    """Return a country-by-variable table of values for one time period.

    Rows of ``df`` whose ``time_period`` equals the argument are pivoted so
    that ``country`` becomes the index, ``variable`` the columns and
    ``value`` the cell contents. The columns axis is named after the period.
    """
    in_period = df.time_period == time_period
    wide = df[in_period].pivot(index='country', columns='variable', values='value')
    wide.columns.name = time_period
    return wide
# Preview the cross-section for the first time period in the data
print(
    time_slice(data, time_periods[0]).head()
)
# Slice
# Time series: one country's indicators across all time periods
def country_slice(df, country):
    """Return a variable-by-time-period table of values for one country.

    Rows of ``df`` whose ``country`` equals the argument are pivoted so that
    ``variable`` becomes the index, ``time_period`` the columns and ``value``
    the cell contents. The index is named after the country.
    """
    rows = df.loc[df.country == country]
    table = rows.pivot(index='variable', columns='time_period', values='value')
    table.index.name = country
    return table
# Preview the per-country table for the country at position 40
print(
    country_slice(data, countries[40]).head()
)
# Slice
# Panel data: one indicator for all countries over time
def variable_slice(df, variable):
    """Return a country-by-time-period table of values for one variable.

    Rows of ``df`` whose ``variable`` equals the argument are pivoted so that
    ``country`` becomes the index, ``time_period`` the columns and ``value``
    the cell contents.
    """
    selected = df[df.variable == variable]
    return selected.pivot(index='country', columns='time_period', values='value')
# Preview the panel table for total population
print(
    variable_slice(data, 'total_pop').head()
)
# Slice
# Time series of a single variable for a single country, indexed by the
# year in which each value was measured
def time_series(df, country, variable):
    """Return the measured values of one variable for one country by year.

    Rows matching both ``country`` and ``variable`` are kept. Only the
    ``year_measured`` and ``value`` columns are considered when dropping
    missing data, so a NaN in an unrelated column no longer discards an
    otherwise valid observation.

    Returns a single-column DataFrame named after ``variable`` and indexed
    by integer ``year_measured``.
    """
    mask = (df.country == country) & (df.variable == variable)
    # Select the two relevant columns first, then drop incomplete rows.
    series = df.loc[mask, ['year_measured', 'value']].dropna()
    # Convert on a fresh frame instead of assigning into a derived slice.
    series = series.astype({'year_measured': int})
    series = series.set_index('year_measured')
    series.columns = [variable]
    return series
# Total population of Belarus by measurement year
print(
    time_series(data, 'Belarus', 'total_pop')
)

# Distinct region labels present in the data
print(
    data.region.unique()
)
#粒度太小，进行合并成较大粒度
# simple_regions = {
#     'World | Asia':'Asia',
#     'Americas | Central America and Caribbean | Central America': 'North America',
#     'Americas | Central America and Caribbean | Greater Antilles': 'North America',
#     'Americas | Central America and Caribbean | Lesser Antilles and Bahamas': 'North America',
#     'Americas | Northern America | Northern America': 'North America',
#     'Americas | Northern America | Mexico': 'North America',
#     'Americas | Southern America | Guyana': 'South America',
#     'Americas | Southern America | Andean': 'South America',
#     'Americas | Southern America | Brazil': 'South America',
#     'Americas | Southern America | Southern America': 'South America',
#     'World | Africa':'Africa',
#     'World | Europe':'Europe',
#     'World | Oceania':'Oceania'
#     }
#
# data.region = data.region.apply(lambda x: simple_regions[x])
# print(data.region.unique())

def subregion(data, region):
    """Return the rows of ``data`` whose ``region`` column equals ``region``."""
    region_mask = data.region == region
    return data.loc[region_mask]

# Data quality assessment
# Cross-section for the 2013-2017 period; also used by the later statistics
recent = time_slice(data, '2013-2017')
msno.matrix(recent, labels=True)
# Total exploitable water resources: missingness by country and time period
msno.matrix(variable_slice(data, 'exploitable_total'), inline=False, sort='descending')
plt.xlabel('Time period')
plt.ylabel('Country')
# The trailing newlines in the title are kept from the original; presumably
# they push the title clear of the column labels.
plt.title('Missing total exploitable water resources data across countries and time periods \n \n \n \n')
plt.show()
# Drop every variable whose name contains 'exploitab' (severely incomplete)
data = data.loc[~data.variable.str.contains('exploitab'), :]
# National rainfall index: missingness by country and time period
msno.matrix(variable_slice(data, 'national_rainfall_index'),
            inline=False, sort='descending')
plt.xlabel('Time period')
plt.ylabel('Country')
plt.title('Missing national rainfall index data across countries and time periods \n \n \n')
plt.show()
print('************************************')

# null_data=recent['agg_to_gdp'].notnull()*1
# map=folium.Map(location=[48,-102],zoom_start=2)
# map.choropleth(geo_data=r'world.json',
#                data=null_data,
#                columns=['country','agg_to_gdp'],
#                key_no='feature.properties.name',reset=True,
#                fill_color='GnBu',fill_opacity=1,line_opacity=0.2,
#                legend_name='Missing agricultural contribution to GDP data 2013-2017')
# print(map)
# plt.show()
# Data availability shown on a world map
def plot_null_map(df, time_period, variable, legend_name=None):
    """Build a folium choropleth of where ``variable`` is reported.

    The cross-section for ``time_period`` is taken with ``time_slice`` and
    ``variable`` is recoded to 1 where a value is present and 0 where it is
    missing. Countries are matched to the shapes in ``world.json`` through
    each feature's ``properties.name``.

    Parameters:
        df: long-format data accepted by ``time_slice``.
        time_period: the period label to slice on.
        variable: the indicator column to check for missing values.
        legend_name: legend text; falls back to ``variable`` when falsy.

    Returns the folium map with the choropleth layer added.
    """
    geo = r'world.json'
    ts = time_slice(df, time_period).reset_index().copy()
    # 1 = value reported, 0 = value missing
    ts[variable] = ts[variable].notnull() * 1
    # Named world_map so the builtin ``map`` is not shadowed.
    world_map = folium.Map(location=[48, -102], zoom_start=2)
    # ``key_on`` (previously misspelled ``key_no``) tells folium which GeoJSON
    # property to join the data on.
    # NOTE(review): Map.choropleth with ``reset`` is the older folium API;
    # newer releases use folium.Choropleth(...).add_to(map) -- confirm the
    # installed version.
    world_map.choropleth(geo_data=geo,
                         data=ts,
                         columns=['country', variable],
                         key_on='feature.properties.name', reset=True,
                         fill_color='GnBu', fill_opacity=1, line_opacity=0.2,
                         legend_name=legend_name if legend_name else variable)
    return world_map
# Build the availability map for the undernourishment indicator and write it
# out as a standalone HTML page.
save_map = plot_null_map(
    data,
    '2013-2017',
    'number_undernourished',
    'Number undernourished is missing',
)
save_map.save('save_map.html')

# Coverage over time: for each variable and time period, count how many
# non-missing values were reported
fig, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(data.groupby(['time_period', 'variable']).value.count().unstack().T, ax=ax)
plt.xticks(rotation=45)
plt.xlabel('Time period')
plt.ylabel('Variable')
plt.title('Number of countries with data reported')
plt.show()

# Population statistics for the 2013-2017 cross-section
# `import scipy` alone does not guarantee that the stats submodule is loaded,
# so import it explicitly before using scipy.stats below.
import scipy.stats

population_columns = ['total_pop', 'urban_pop', 'rural_pop']
# Summary statistics (printed; a bare expression is discarded in a script)
print(recent[population_columns].describe().astype(int))
# Sort by rural population and show the smallest entries
recent_sort = recent.sort_values('rural_pop')[population_columns].head()
print(recent_sort)
# Skewness and kurtosis of each population column
print(recent[population_columns].apply(scipy.stats.skew))
print(recent[population_columns].apply(scipy.stats.kurtosis))
# Look at the current distribution of the data

fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(recent.total_pop.values, bins=50)
ax.set_xlabel('Total population')
ax.set_ylabel('Number of countries')
ax.set_title('Distribution of population of countries 2013-2017')
plt.show()
# Per the original note: in theory the standard deviation of the distribution
# is linearly related to its mean, so log-transform the data and check the
# skewness again.
recent_log = recent[['total_pop']].apply(np.log).apply(scipy.stats.skew)
print(recent_log)
# 总结
学习案例
标签：plt,粮农组织,python,探索性,variable,time,df,America,data
来源： https://blog.csdn.net/s_unbo/article/details/121637065