
Python for Data Analysis (2)


Chapter 7: Data Cleaning and Preparation

7.1 Handling Missing Data

string_data = pd.Series(['aardvark','artichoke',np.nan,'avocado'])
string_data
0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
string_data.isnull()
0    False
1    False
2     True
3    False
dtype: bool
#When cleaning up data for analysis, it is often important to analyze the missing data itself to identify data-collection problems or potential biases caused by missing data.
#The built-in Python None value is also treated as NA in object arrays
string_data[0] = None
string_data.isnull()
0     True
1    False
2     True
3    False
dtype: bool
Method     Description
dropna     Filter axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate
fillna     Fill in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'
isnull     Return boolean values indicating which values are missing
notnull    Negation of isnull

7.1.1 Filtering Out Missing Data

#Using dropna on a Series returns the Series with only the non-null data and index values
from numpy import nan as NA
data = pd.Series([1,NA,3.5,NA,7])
data
0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64
#By default, dropna drops any row containing a missing value
data.dropna()
0    1.0
2    3.5
4    7.0
dtype: float64
data[data.notnull()]
0    1.0
2    3.5
4    7.0
dtype: float64
data = pd.DataFrame([[1,6.5,3],[1,NA,NA],[NA,NA,NA],[NA,6.5,3]])
data
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
#Passing how='all' drops only the rows that are all NA
data.dropna(how='all')
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0
#To drop columns in the same way, pass axis=1
data[4] = NA
data
     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN
data.dropna(axis=1,how = 'all')
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
df = pd.DataFrame(np.random.randn(7,3))
df
          0         1         2
0 -0.100288  0.117081  0.629897
1  0.145224  0.827820 -0.197561
2 -1.372610 -0.521075  0.783224
3 -0.679339  0.355698 -1.283404
4 -1.587708  0.254616  0.149215
5 -0.323276 -0.393636 -1.828212
6 -0.639610 -1.677821  1.618943
df.iloc[:4,1] = NA
df.iloc[:2,2] = NA
df
          0         1         2
0 -0.100288       NaN       NaN
1  0.145224       NaN       NaN
2 -1.372610       NaN  0.783224
3 -0.679339       NaN -1.283404
4 -1.587708  0.254616  0.149215
5 -0.323276 -0.393636 -1.828212
6 -0.639610 -1.677821  1.618943
df.dropna()
          0         1         2
4 -1.587708  0.254616  0.149215
5 -0.323276 -0.393636 -1.828212
6 -0.639610 -1.677821  1.618943
#Suppose you want to keep only rows containing at least a certain number of observations; you can indicate this with the thresh argument
#thresh=n keeps every row that has at least n non-NA values; thresh=2 keeps the rows with at least 2 non-NaN entries
df.dropna(thresh = 2)
          0         1         2
2 -1.372610       NaN  0.783224
3 -0.679339       NaN -1.283404
4 -1.587708  0.254616  0.149215
5 -0.323276 -0.393636 -1.828212
6 -0.639610 -1.677821  1.618943

7.1.2 Filling In Missing Data

Argument   Description
value      Scalar value or dict-like object to use to fill missing values
method     Interpolation method such as 'ffill' or 'bfill'; 'ffill' by default if no other arguments are given
axis       Axis to fill on; default axis=0
inplace    Modify the calling object without producing a copy
limit      For forward and backward filling, maximum number of consecutive periods to fill
df.fillna(0)
          0         1         2
0 -0.100288  0.000000  0.000000
1  0.145224  0.000000  0.000000
2 -1.372610  0.000000  0.783224
3 -0.679339  0.000000 -1.283404
4 -1.587708  0.254616  0.149215
5 -0.323276 -0.393636 -1.828212
6 -0.639610 -1.677821  1.618943
#Calling fillna with a dict lets you use a different fill value for each column
df.fillna({1:0.5,2:0})
          0         1         2
0 -0.100288  0.500000  0.000000
1  0.145224  0.500000  0.000000
2 -1.372610  0.500000  0.783224
3 -0.679339  0.500000 -1.283404
4 -1.587708  0.254616  0.149215
5 -0.323276 -0.393636 -1.828212
6 -0.639610 -1.677821  1.618943
#fillna returns a new object, but you can also modify the existing object in place
_ = df.fillna(0,inplace = True)
df
          0         1         2
0 -0.100288  0.000000  0.000000
1  0.145224  0.000000  0.000000
2 -1.372610  0.000000  0.783224
3 -0.679339  0.000000 -1.283404
4 -1.587708  0.254616  0.149215
5 -0.323276 -0.393636 -1.828212
6 -0.639610 -1.677821  1.618943
#The same interpolation methods available for reindexing can also be used with fillna
df = pd.DataFrame(np.random.randn(6,3))
df
          0         1         2
0 -0.428405  0.199383  0.354342
1  0.019782  0.921389  0.534736
2 -0.583158  0.390681 -2.386976
3 -0.076475 -0.034995  1.635065
4  0.528814  0.711717  0.696243
5 -0.193577  0.162206 -0.520191
df.iloc[2:,1] = NA
df
          0         1         2
0 -0.428405  0.199383  0.354342
1  0.019782  0.921389  0.534736
2 -0.583158       NaN -2.386976
3 -0.076475       NaN  1.635065
4  0.528814       NaN  0.696243
5 -0.193577       NaN -0.520191
df.iloc[4:,2] = NA
df
          0         1         2
0 -0.428405  0.199383  0.354342
1  0.019782  0.921389  0.534736
2 -0.583158       NaN -2.386976
3 -0.076475       NaN  1.635065
4  0.528814       NaN       NaN
5 -0.193577       NaN       NaN
df.fillna(method='ffill')
          0         1         2
0 -0.428405  0.199383  0.354342
1  0.019782  0.921389  0.534736
2 -0.583158  0.921389 -2.386976
3 -0.076475  0.921389  1.635065
4  0.528814  0.921389  1.635065
5 -0.193577  0.921389  1.635065
df.fillna(method='backfill')
          0         1         2
0 -0.428405  0.199383  0.354342
1  0.019782  0.921389  0.534736
2 -0.583158       NaN -2.386976
3 -0.076475       NaN  1.635065
4  0.528814       NaN       NaN
5 -0.193577       NaN       NaN
df.fillna(method='ffill',limit=2)
          0         1         2
0 -0.428405  0.199383  0.354342
1  0.019782  0.921389  0.534736
2 -0.583158  0.921389 -2.386976
3 -0.076475  0.921389  1.635065
4  0.528814       NaN  1.635065
5 -0.193577       NaN  1.635065
data = pd.Series([5,NA,3,NA,7])
data.fillna(data.mean())
0    5.0
1    5.0
2    3.0
3    5.0
4    7.0
dtype: float64
data.mean()
5.0

7.2 Data Transformation

7.2.1 Removing Duplicates

data = pd.DataFrame({'k1':['one','two']*3+['two'],
                    'k2':[1,1,2,3,3,4,4,]})
data
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4
#The DataFrame method duplicated returns a boolean Series
#indicating whether each row is a duplicate (its values match those of a row observed earlier)
data.duplicated()
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
#drop_duplicates returns a DataFrame containing the rows where the array returned by duplicated is False,
#i.e., duplicate rows are dropped and only the unique ones are kept
#By default these methods consider all of the columns
data.drop_duplicates()
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
data['v1'] = range(7)
data
    k1  k2  v1
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
5  two   4   5
6  two   4   6
data.drop_duplicates(['k1'])
    k1  k2  v1
0  one   1   0
1  two   1   1
#duplicated and drop_duplicates keep the first observed value by default; passing keep='last' returns the last one
data.drop_duplicates(['k1','k2'],keep = 'last')
    k1  k2  v1
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
6  two   4   6

7.2.2 Transforming Data Using a Function or Mapping

data = pd.DataFrame({'food':['bacon','pulled pork','bacon','pastrami','corned beef',
                             'bacon','pastrami','honey ham','nova lox'],
                     'ounces':[4.0,3.0,12.0,6.0,7.5,8.0,3.0,5.0,6.0]})

data
          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     pastrami     6.0
4  corned beef     7.5
5        bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0
meat_to_animal = {
'bacon':'pig',
'pulled pork':'pig',  
'pastrami':'cow',
'corned beef':'cow',  
'honey ham':'pig', 
'nova lox':'salmon',
}
#The map method on a Series accepts a function or a dict-like object containing a mapping
lowercased = data['food'].str.lower()
lowercased
0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object
data['animal'] = lowercased.map(meat_to_animal)
data
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     pastrami     6.0     cow
4  corned beef     7.5     cow
5        bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon
#Using map is a convenient way to perform element-wise transformations and other data cleaning-related operations
data['food'].map(lambda x :meat_to_animal[x.lower()])
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

7.2.3 Replacing Values

data = pd.Series([1,-999,2,-999,-1000,3])
data
0       1
1    -999
2       2
3    -999
4   -1000
5       3
dtype: int64
data.replace(-999,np.nan)
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
#If you want to replace multiple values at once, pass a list of values and a single substitute value
data.replace([-999,-1000],np.nan)
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
#To use a different replacement for each value, pass a list of substitutes
data.replace([-999,-1000],[np.nan,0])
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
#The argument can also be passed as a dict
data.replace({-999:np.nan,-1000:0})
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

7.2.4 Renaming Axis Indexes

data = pd.DataFrame(np.arange(12).reshape(3,4),
                   index = ['Ohio','Colorado','New York'],
                   columns = ['one','two','three','four'])
data
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11
transform = lambda x :x[:4].upper()
data.index.map(transform)
Index(['OHIO', 'COLO', 'NEW '], dtype='object')
data.index = data.index.map(transform)
data
      one  two  three  four
OHIO    0    1      2     3
COLO    4    5      6     7
NEW     8    9     10    11
data.rename(index = str.title,columns = str.upper)
      ONE  TWO  THREE  FOUR
Ohio    0    1      2     3
Colo    4    5      6     7
New     8    9     10    11
#rename can be used with a dict-like object, providing new values for a subset of the axis labels
data.rename(index = {'OHIO':'INDIANA'},
           columns = {'three':'peekaboo'})
         one  two  peekaboo  four
INDIANA    0    1         2     3
COLO       4    5         6     7
NEW        8    9        10    11

7.2.5 Discretization and Binning

ages = [20,22,24,27,21,23,37,31,61,45,41,32]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
cats.categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')
#pd.value_counts(cats) gives the bin counts for the result of pandas.cut
pd.value_counts(cats)
(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64
#Consistent with mathematical interval notation, a parenthesis means the side is open (exclusive), while a square bracket means it is closed (inclusive).
#You can change which side is closed by passing right=False
pd.cut(ages,[18,26,36,61,100],right = False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
#You can also pass your own bin names by passing a list or array to the labels option
group_names = ['youth','youngadult','middleaged','senior']
a = pd.cut(ages,bins,labels = group_names)
pd.value_counts(a)
youth         5
youngadult    3
middleaged    3
senior        1
dtype: int64
#If you pass cut an integer number of bins instead of explicit bin edges, pandas computes equal-length bins based on the minimum and maximum values in the data
#The precision=2 option limits the decimal precision to two digits
data = np.random.rand(20)
pd.cut(data,4,precision=2)
[(0.51, 0.74], (0.29, 0.51], (0.74, 0.97], (0.29, 0.51], (0.06, 0.29], ..., (0.06, 0.29], (0.29, 0.51], (0.74, 0.97], (0.51, 0.74], (0.74, 0.97]]
Length: 20
Categories (4, interval[float64]): [(0.06, 0.29] < (0.29, 0.51] < (0.51, 0.74] < (0.74, 0.97]]
#qcut is a closely related function that bins the data based on sample quantiles. Depending on the distribution of the data, using cut will not usually result in each bin having the same number of data points.
#Since qcut uses the sample quantiles, you get roughly equal-size bins
data = np.random.randn(1000)
cats = pd.qcut(data,4)#cut into quartiles
cats
[(-0.00707, 0.65], (-0.00707, 0.65], (-2.936, -0.626], (-0.626, -0.00707], (-2.936, -0.626], ..., (-0.626, -0.00707], (-0.626, -0.00707], (-0.626, -0.00707], (-0.626, -0.00707], (-0.00707, 0.65]]
Length: 1000
Categories (4, interval[float64]): [(-2.936, -0.626] < (-0.626, -0.00707] < (-0.00707, 0.65] < (0.65, 3.139]]
pd.value_counts(cats)
(-2.936, -0.626]      250
(-0.626, -0.00707]    250
(-0.00707, 0.65]      250
(0.65, 3.139]         250
dtype: int64
#Similar to cut, you can pass your own quantiles (numbers between 0 and 1, inclusive); that requires qcut.
#Note: the call below actually uses pd.cut, which treats these numbers as absolute bin edges, so values outside [0,1] become NaN (visible in the output)
pd.cut(data,[0,0.1,0.5,0.9,1])
[(0.5, 0.9], (0.1, 0.5], NaN, NaN, NaN, ..., NaN, NaN, NaN, NaN, (0.1, 0.5]]
Length: 1000
Categories (4, interval[float64]): [(0.0, 0.1] < (0.1, 0.5] < (0.5, 0.9] < (0.9, 1.0]]
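The quantile-based call the comment describes would use qcut (my correction sketch, not in the original notes); with quantile boundaries the four bins cover the whole data range, so no NaN appear:

pd.qcut(data,[0,0.1,0.5,0.9,1])
#value counts would be roughly 100/400/400/100 for the 1000 samples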

7.2.6 Detecting and Filtering Outliers

data = pd.DataFrame(np.random.randn(1000,4))
data.describe()
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean      0.013343     0.030142     0.020312     0.042330
std       1.012528     0.984443     0.999869     0.982124
min      -2.942920    -3.799121    -3.412855    -2.632107
25%      -0.668303    -0.629645    -0.654843    -0.643005
50%       0.010349     0.040064     0.026197     0.028003
75%       0.701525     0.679371     0.706170     0.714993
max       3.274496     3.998493     3.264216     2.907744
#Suppose you wanted to find values in one of the columns exceeding 3 in absolute value
col = data[2]
col[np.abs(col) > 3]
91    -3.044972
711    3.264216
858   -3.412855
Name: 2, dtype: float64
data[(np.abs(data)>3).any(1)]
            0         1         2         3
91  -0.341046 -0.555910 -3.044972  0.474512
325  2.233400 -3.027404  0.845704  1.441757
332 -0.460361 -3.799121 -0.312931  0.478548
457  0.011004  3.998493  0.977419  0.577620
711 -0.603762 -1.650901  3.264216 -0.803395
746  1.455624 -3.178085 -0.387140  0.859193
858 -2.127923  0.163924 -3.412855 -0.073186
946  3.274496 -0.699596 -1.016879  0.358252
data[np.abs(data)>3] = np.sign(data)*3
data.describe()
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean      0.013069     0.030148     0.020506     0.042330
std       1.011680     0.977459     0.997573     0.982124
min      -2.942920    -3.000000    -3.000000    -2.632107
25%      -0.668303    -0.629645    -0.654843    -0.643005
50%       0.010349     0.040064     0.026197     0.028003
75%       0.701525     0.679371     0.706170     0.714993
max       3.000000     3.000000     3.000000     2.907744
data
            0         1         2         3
0    0.997285  0.352539 -0.158277 -0.069519
1   -1.144523 -0.173312 -0.651227  0.686972
2    0.650131  0.271325 -0.304344 -0.281217
3    0.527442 -2.023765  0.827982 -1.855424
4   -0.578451 -0.949705 -0.582701 -1.725697
..        ...       ...       ...       ...
995  0.494311  0.528862 -0.191097  0.118121
996 -0.582154  1.251247 -1.622055 -0.436563
997  0.687732 -1.670059 -0.272708 -0.369290
998 -0.443230  0.984728 -0.283506 -1.473420
999 -0.276277 -0.597256  1.269391 -0.704337

1000 rows × 4 columns

#The expression np.sign(data) produces 1 and -1 values depending on whether the values in data are positive or negative
np.sign(data).head()
     0    1    2    3
0  1.0  1.0 -1.0 -1.0
1 -1.0 -1.0 -1.0  1.0
2  1.0  1.0 -1.0 -1.0
3  1.0 -1.0  1.0 -1.0
4 -1.0 -1.0 -1.0 -1.0

7.2.7 Permutation and Random Sampling

sampler = np.random.permutation(5)
sampler
array([3, 2, 0, 4, 1])
df = pd.DataFrame(np.arange(5*4).reshape(5,4))
df
    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
#The integer array can be used in iloc-based indexing or the equivalent take function (an equivalence check follows the take example below)
df.take(sampler)
    0   1   2   3
3  12  13  14  15
2   8   9  10  11
0   0   1   2   3
4  16  17  18  19
1   4   5   6   7
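As a quick cross-check (my addition, not in the original notes), iloc indexing with the same permutation gives an identical result:

df.iloc[sampler]                            #same rows in the same order as df.take(sampler)
df.take(sampler).equals(df.iloc[sampler])   #True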
#To select a random subset without replacement, you can use the sample method on Series and DataFrame
df.sample(n=3)
    0   1   2   3
0   0   1   2   3
4  16  17  18  19
3  12  13  14  15
#To generate a sample with replacement (allowing repeated choices), pass replace=True to sample
choices = pd.Series([5,6,-1,6,4])
draws = choices.sample(n=10,replace = True)
draws
2   -1
0    5
2   -1
3    6
0    5
1    6
1    6
4    4
3    6
1    6
dtype: int64

7.2.8 Computing Indicator/Dummy Variables

df = pd.DataFrame({'key':['b','b','a','c','a','b'],
                  'data1':range(6)})
df
  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   b      5
pd.get_dummies(df['key'])
   a  b  c
0  0  1  0
1  0  1  0
2  1  0  0
3  0  0  1
4  1  0  0
5  0  1  0
#In some cases, you may want to add a prefix to the columns in the indicator DataFrame, which can then be merged with the other data.
#get_dummies has a prefix argument for doing this
dummies = pd.get_dummies(df['key'],prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
   data1  key_a  key_b  key_c
0      0      0      1      0
1      1      0      1      0
2      2      1      0      0
3      3      0      0      1
4      4      1      0      0
5      5      0      1      0
mnames = ['movie_id','title','genres']
movies = pd.read_table(r'D:\PythonFlie\python\利用python进行数据分析(书籍笔记)\pydata-book-2nd-edition\datasets\movielens\movies.dat'
                       ,sep='::',header=None,names = mnames)
<ipython-input-188-960ac40c2eea>:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  movies = pd.read_table(r'D:\PythonFlie\python\利用python进行数据分析(书籍笔记)\pydata-book-2nd-edition\datasets\movielens\movies.dat'
movies[::10]
      movie_id                                              title                       genres
0            1                                   Toy Story (1995)  Animation|Children's|Comedy
10          11                     American President, The (1995)         Comedy|Drama|Romance
20          21                                  Get Shorty (1995)          Action|Comedy|Drama
30          31                             Dangerous Minds (1995)                        Drama
40          41                                 Richard III (1995)                    Drama|War
...        ...                                                ...                          ...
3840      3910                          Dancer in the Dark (2000)                Drama|Musical
3850      3920  Faraway, So Close (In Weiter Ferne, So Nah!) (...                Drama|Fantasy
3860      3930         Creature From the Black Lagoon, The (1954)                       Horror
3870      3940             Slumber Party Massacre III, The (1990)                       Horror
3880      3950                                   Tigerland (2000)                        Drama

389 rows × 3 columns

#Adding indicator variables for each genre requires a little bit of wrangling. First, we extract the list of unique genres in the dataset
all_genres = []
for x in movies.genres:
    all_genres.extend(x.split('|'))
genres = pd.unique(all_genres)
genres
array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)
zero_matrix = np.zeros((len(movies),len(genres)))
dummies = pd.DataFrame(zero_matrix,columns=genres)
zero_matrix
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
dummies
      Animation  Children's  Comedy  Adventure  Fantasy  Romance  Drama  Action  Crime  Thriller  Horror  Sci-Fi  Documentary  War  Musical  Mystery  Film-Noir  Western
0           0.0         0.0     0.0        0.0      0.0      0.0    0.0     0.0    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0        0.0      0.0
1           0.0         0.0     0.0        0.0      0.0      0.0    0.0     0.0    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0        0.0      0.0
2           0.0         0.0     0.0        0.0      0.0      0.0    0.0     0.0    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0        0.0      0.0
3           0.0         0.0     0.0        0.0      0.0      0.0    0.0     0.0    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0        0.0      0.0
4           0.0         0.0     0.0        0.0      0.0      0.0    0.0     0.0    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0        0.0      0.0
...         ...         ...     ...        ...      ...      ...    ...     ...    ...       ...     ...     ...          ...  ...      ...      ...        ...      ...
3878        0.0         0.0     0.0        0.0      0.0      0.0    0.0     0.0    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0        0.0      0.0
3879        0.0         0.0     0.0        0.0      0.0      0.0    0.0     0.0    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0        0.0      0.0
3880        0.0         0.0     0.0        0.0      0.0      0.0    0.0     0.0    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0        0.0      0.0
3881        0.0         0.0     0.0        0.0      0.0      0.0    0.0     0.0    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0        0.0      0.0
3882        0.0         0.0     0.0        0.0      0.0      0.0    0.0     0.0    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0        0.0      0.0

3883 rows × 18 columns

gen = movies.genres[0]
gen.split('|')
['Animation', "Children's", 'Comedy']
dummies.columns.get_indexer(gen.split("|"))
array([0, 1, 2], dtype=int64)
for i,gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split("|"))
    dummies.iloc[i,indices] = 1
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]
movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                              1.0
Genre_Children's                             1.0
Genre_Comedy                                 1.0
Genre_Adventure                              0.0
Genre_Fantasy                                0.0
Genre_Romance                                0.0
Genre_Drama                                  0.0
Genre_Action                                 0.0
Genre_Crime                                  0.0
Genre_Thriller                               0.0
Genre_Horror                                 0.0
Genre_Sci-Fi                                 0.0
Genre_Documentary                            0.0
Genre_War                                    0.0
Genre_Musical                                0.0
Genre_Mystery                                0.0
Genre_Film-Noir                              0.0
Genre_Western                                0.0
Name: 0, dtype: object
#A useful recipe for statistical applications is to combine get_dummies with a discretization function like cut
np.random.seed(12345)
values = np.random.rand(10)
values
array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])
bins = [0,0.2,0.4,0.6,0.8,1]
pd.get_dummies(pd.cut(values,bins))
   (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
0           0           0           0           0           1
1           0           1           0           0           0
2           1           0           0           0           0
3           0           1           0           0           0
4           0           0           1           0           0
5           0           0           1           0           0
6           0           0           0           0           1
7           0           0           0           1           0
8           0           0           0           1           0
9           0           0           0           1           0

7.3 String Manipulation

7.3.1 String Object Methods

#A comma-separated string can be broken into pieces with split:
val = 'a,b, guido'
val.split(',')
['a', 'b', ' guido']
#split is often combined with strip to trim whitespace (including line breaks)
pieces = [x.strip() for x in val.split(',')]
pieces
['a', 'b', 'guido']
#These substrings could be concatenated together with a two-colon delimiter using addition
first,second,third = pieces
first+"::"+second+"::"+third
'a::b::guido'
#A faster and more Pythonic way is to pass a list or tuple to the join method of the string '::'
"::".join(pieces)
'a::b::guido'
#Using Python's in keyword is the best way to detect a substring, though index and find can also be used
'guido' in val
True
#Note that the difference between find and index is that index raises an exception if the string isn't found, whereas find returns -1 (see the example after the two calls below)
val.index('guido')
5
val.find('guido')
5
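For illustration (my addition, not in the original notes), searching for a substring that is absent shows the difference:

val.find('zzz')    #returns -1
val.index('zzz')   #raises ValueError: substring not found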
#count returns the number of non-overlapping occurrences of a particular substring in the string

val.count('guido')
1
#replace substitutes occurrences of one pattern for another. It is also commonly used to delete a pattern by passing an empty string
val.replace(',','::')
'a::b:: guido'
val.replace(',','')
'ab guido'
Method                 Description
count                  Return the number of non-overlapping occurrences of the substring in the string
endswith               Return True if the string ends with the suffix
startswith             Return True if the string starts with the prefix
join                   Use the string as a delimiter for concatenating a sequence of other strings
index                  Return the position of the first character of the first occurrence of the substring; raises ValueError if not found
find                   Return the position of the first character of the first occurrence of the substring; like index, but returns -1 if not found
rfind                  Return the position of the first character of the last occurrence of the substring; returns -1 if not found
replace                Replace occurrences of one string with another
strip, rstrip, lstrip  Trim whitespace, including newlines; equivalent to x.strip() (and rstrip, lstrip) for each element
split                  Break the string into a list of substrings using the passed delimiter
lower                  Convert alphabetic characters to lowercase
upper                  Convert alphabetic characters to uppercase
casefold               Convert characters to lowercase, converting any region-specific variable character combinations to a common comparable form
ljust, rjust           Left-justify or right-justify; pad the opposite side of the string with spaces (or some other fill character) to return a string with a minimum width
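casefold and ljust/rjust from the table above are not demonstrated later; a brief sketch (my addition):

'Straße'.casefold()   #'strasse' (the German sharp s folds to 'ss')
'abc'.ljust(8,'*')    #'abc*****'
'abc'.rjust(8)        #'     abc'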

7.3.2 Regular Expressions

Method     Description
findall    Return all non-overlapping matching patterns in a string as a list
finditer   Like findall, but returns an iterator
match      Match the pattern at the start of the string, optionally segmenting the pattern components into groups; returns a match object if the pattern matches, otherwise None
search     Scan the string for a match to the pattern, returning a match object if so; unlike match, the match can be anywhere in the string rather than only at the beginning
split      Break the string into pieces at each occurrence of the pattern
sub, subn  Replace all (sub) or the first n (subn) occurrences of the pattern in the string with a replacement expression; use the symbols \1, \2, ... to refer to match group elements in the replacement string
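finditer and subn from the table above likewise get no example later; a brief sketch (my addition):

import re
for m in re.finditer(r'\d+','a1 b22 c333'):
    print(m.group(),m.span())        #1 (1, 2) / 22 (4, 6) / 333 (8, 11)
re.subn(r'\d+','#','a1 b22 c333')    #('a# b# c#', 3)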
import re
text = 'foo bar\t baz \tqux'
re.split(r'\s+',text)
['foo', 'bar', 'baz', 'qux']
#You can compile the regex yourself with re.compile, forming a reusable regex object
regex = re.compile(r'\s+')
regex.split(text)
['foo', 'bar', 'baz', 'qux']
#If you want to get a list of all patterns matching the regex, you can use the findall method
regex.findall(text)
[' ', '\t ', ' \t']
text = """
Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9.%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern,flags = re.IGNORECASE)
regex.findall(text)
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
#search returns only the first email address matched in the text
m = regex.search(text)
m
<re.Match object; span=(6, 21), match='dave@google.com'>
text[m.start():m.end()]
'dave@google.com'
#regex.match matches only if the pattern occurs at the start of the string; it returns None otherwise
print(regex.match(text))
None
#sub returns a new string with occurrences of the pattern replaced by a new string
print(regex.sub('ABC',text))
Dave ABC
Steve ABC
Rob ABC
Ryan ABC

#Suppose you wanted to find email addresses and simultaneously segment each address into its three components: username, domain name, and domain suffix.
#To do this, put parentheses around the parts of the pattern to segment
pattern = r'([A-Z0-9.%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern,flags = re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()
('wesm', 'bright', 'net')
#findall returns a list of tuples when the pattern has groups
regex.findall(text)
[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]
print(regex.sub(r'Username:\1,Domain:\2,Suffix:\3',text))
Dave Username:dave,Domain:google,Suffix:com
Steve Username:steve,Domain:gmail,Suffix:com
Rob Username:rob,Domain:gmail,Suffix:com
Ryan Username:ryan,Domain:yahoo,Suffix:com

7.3.3 Vectorized String Functions in pandas

Method     Description
cat        Concatenate strings element-wise with an optional delimiter
contains   Return a boolean array indicating whether each string contains a pattern/regex
count      Count occurrences of the pattern
extract    Use a regular expression with groups to extract one or more strings from a Series of strings; the result is a DataFrame with one column per group
endswith   Equivalent to x.endswith(pattern) for each element
data = {'Dave':'dave@google.com','Steve':'steve@gmail.com','Rob':'rob@gmail.com','Ryan':np.nan}
data = pd.Series(data)
data
Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Ryan                 NaN
dtype: object
data.isnull()
Dave     False
Steve    False
Rob      False
Ryan      True
dtype: bool
#You can apply string and regular expression methods to each value (passing a lambda or other function) using data.map,
#but it will fail on the NA (null) values. To cope with this, Series has array-oriented methods for string operations that skip NA values; these are accessed through Series's str attribute
data.str.contains('gmail')
Dave     False
Steve     True
Rob       True
Ryan       NaN
dtype: object
#Regular expressions can be used, too, along with any re module options such as IGNORECASE
data.str.findall(pattern,flags=re.IGNORECASE)
Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Ryan                       NaN
dtype: object
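extract, listed in the table above, returns the match groups as DataFrame columns; a sketch with the same data and pattern (my addition, not in the original notes):

data.str.extract(pattern,flags=re.IGNORECASE)
#one column per group (username, domain, suffix); the Ryan row is all NaN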
#There are a couple of ways to do vectorized element retrieval: use str.get or index into the str attribute
matches = data.str.match(pattern,flags=re.IGNORECASE)
matches
Dave     True
Steve    True
Rob      True
Ryan      NaN
dtype: object
#To access elements in the embedded lists, we can pass an index to either of these functions
matches.str.get(1)
---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

<ipython-input-245-8d76f9329d2a> in <module>
      1 #To access elements in the embedded lists, we can pass an index to either of these functions
----> 2 matches.str.get(1)


D:\Anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
   5459             or name in self._accessors
   5460         ):
-> 5461             return object.__getattribute__(self, name)
   5462         else:
   5463             if self._info_axis._can_hold_identifiers_and_holds_name(name):


D:\Anaconda3\lib\site-packages\pandas\core\accessor.py in __get__(self, obj, cls)
    178             # we're accessing the attribute of the class, i.e., Dataset.geo
    179             return self._accessor
--> 180         accessor_obj = self._accessor(obj)
    181         # Replace the property with the accessor object. Inspired by:
    182         # https://www.pydanny.com/cached-property.html


D:\Anaconda3\lib\site-packages\pandas\core\strings\accessor.py in __init__(self, data)
    152         from pandas.core.arrays.string_ import StringDtype
    153 
--> 154         self._inferred_dtype = self._validate(data)
    155         self._is_categorical = is_categorical_dtype(data.dtype)
    156         self._is_string = isinstance(data.dtype, StringDtype)


D:\Anaconda3\lib\site-packages\pandas\core\strings\accessor.py in _validate(data)
    215 
    216         if inferred_dtype not in allowed_types:
--> 217             raise AttributeError("Can only use .str accessor with string values!")
    218         return inferred_dtype
    219 


AttributeError: Can only use .str accessor with string values!
matches.str[0]
(This raises the same AttributeError as above: Can only use .str accessor with string values!)
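Both calls fail because str.match in modern pandas returns booleans rather than match groups, so matches is not string-valued. A workaround sketch (my addition, not from the book): retrieve the groups with findall, then index into the embedded tuples:

tuples = data.str.findall(pattern,flags=re.IGNORECASE).str.get(0)   #first match tuple per row; NaN for Ryan
tuples.str.get(1)   #second group (the domain): google, gmail, gmail, NaN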
#You can similarly slice strings in a vectorized way using syntax analogous to string slicing
data.str[:5]
Dave     dave@
Steve    steve
Rob      rob@g
Ryan       NaN
dtype: object

Chapter 8: Data Wrangling: Join, Combine, and Reshape

8.1 Hierarchical Indexing

#What you see below is a prettified view of a Series with a MultiIndex as its index. The "gaps" in the index display mean "use the label directly above"
data = pd.Series(np.random.randn(9),
                index = [['a','a','a','b','b','c','c','d','d'],
                        [1,2,3,1,3,1,2,2,3]])
data
a  1    1.007189
   2   -1.296221
   3    0.274992
b  1    0.228913
   3    1.352917
c  1    0.886429
   2   -2.001637
d  2   -0.371843
   3    1.669025
dtype: float64
data.index
MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )
data['b']
1    0.228913
3    1.352917
dtype: float64
data['b':'c']
b  1    0.228913
   3    1.352917
c  1    0.886429
   2   -2.001637
dtype: float64
data.loc[['b','c']]
b  1    0.228913
   3    1.352917
c  1    0.886429
   2   -2.001637
dtype: float64
#Selection is even possible from an "inner" level
data.loc[:,3]
a    0.274992
b    1.352917
d    1.669025
dtype: float64
#Hierarchical indexing plays an important role in reshaping data and in group-based operations like forming a pivot table.
#You can rearrange the data into a DataFrame using its unstack method
data.unstack()
          1         2         3
a  1.007189 -1.296221  0.274992
b  0.228913       NaN  1.352917
c  0.886429 -2.001637       NaN
d       NaN -0.371843  1.669025
#The inverse operation of unstack is stack
data.unstack().stack()
a  1    1.007189
   2   -1.296221
   3    0.274992
b  1    0.228913
   3    1.352917
c  1    0.886429
   2   -2.001637
d  2   -0.371843
   3    1.669025
dtype: float64
#With a DataFrame, either axis can have a hierarchical index
frame = pd.DataFrame(np.arange(12).reshape(4,3),
                    index = [['a','a','b','b'],[1,2,1,2]],
                    columns = [['ohio','ohio','colorado'],['green','red','green']])
frame
      ohio     colorado
     green red    green
a 1      0   1        2
  2      3   4        5
b 1      6   7        8
  2      9  10       11
frame.index.names = ['key1','key2']
frame
           ohio     colorado
          green red    green
key1 key2
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11
frame.columns.names = ['state','color']
frame
state      ohio     colorado
color     green red    green
key1 key2
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11
frame['ohio']
color     green  red
key1 key2
a    1        0    1
     2        3    4
b    1        6    7
     2        9   10
#A MultiIndex can be created by itself with its own constructor and then reused
pd.MultiIndex.from_arrays([['ohio','ohio','colorado'],['green','red','green']],
                      names = ['state','color'])
MultiIndex([(    'ohio', 'green'),
            (    'ohio',   'red'),
            ('colorado', 'green')],
           names=['state', 'color'])

8.1.1 Reordering and Sorting Levels

#swaplevel takes two level numbers or names and returns a new object with the levels interchanged (but the data is otherwise unaltered)
frame.swaplevel('key1','key2')
state      ohio     colorado
color     green red    green
key2 key1
1    a        0   1        2
2    a        3   4        5
1    b        6   7        8
2    b        9  10       11
#sort_index sorts the data using only the values in a single level.
#When swapping levels, it's common to also use sort_index so that the result is lexicographically sorted by the indicated level
frame.sort_index(level=1)
state      ohio     colorado
color     green red    green
key1 key2
a    1        0   1        2
b    1        6   7        8
a    2        3   4        5
b    2        9  10       11
frame.sort_index(level=0)
state      ohio     colorado
color     green red    green
key1 key2
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11
frame.swaplevel(0,1).sort_index(level=0)
state      ohio     colorado
color     green red    green
key2 key1
1    a        0   1        2
     b        6   7        8
2    a        3   4        5
     b        9  10       11

8.1.2 Summary Statistics by Level

frame.sum(level='key2')
state  ohio     colorado
color green red    green
key2
1         6   8       10
2        12  14       16
frame.sum(level='color',axis=1)
color      green  red
key1 key2
a    1         2    1
     2         8    4
b    1        14    7
     2        20   10

8.1.3 Indexing with a DataFrame's Columns

Option    Behavior
'inner'   Use only the key combinations observed in both tables
'left'    Use all key combinations found in the left table
'right'   Use all key combinations found in the right table
'outer'   Use all key combinations observed in both tables together
(These are the how options for merge, covered in 8.2.1.)
frame = pd.DataFrame({'a':range(7),'b':range(7,0,-1),
                     'c':['one','one','one','two','two','two','two'],
                     'd':[0,1,2,0,1,2,3]})
frame
   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3
#DataFrame's set_index function will create a new DataFrame using one or more of its columns as the index
frame2 = frame.set_index(['c','d'])
frame2
       a  b
c   d
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1
#By default those columns are removed from the DataFrame, though you can leave them in
frame.set_index(['c','d'],drop = False)
       a  b    c  d
c   d
one 0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
two 0  3  4  two  0
    1  4  3  two  1
    2  5  2  two  2
    3  6  1  two  3
#reset_index does the opposite of set_index; the hierarchical index levels are moved into the columns
frame2.reset_index()
     c  d  a  b
0  one  0  0  7
1  one  1  1  6
2  one  2  2  5
3  two  0  3  4
4  two  1  4  3
5  two  2  5  2
6  two  3  6  1

8.2 Combining and Merging Datasets

8.2.1 Database-Style DataFrame Joins

Argument     Description
left         DataFrame to be merged on the left side
right        DataFrame to be merged on the right side
how          One of 'inner', 'outer', 'left', or 'right'; defaults to 'inner'
on           Column names to join on; must be found in both DataFrame objects. If not specified and no other join keys are given, the intersection of the column names in left and right will be used as the join keys
left_on      Columns in the left DataFrame to use as join keys
right_on     Columns in the right DataFrame to use as join keys
left_index   Use the row index in left as its join key (or keys, if a MultiIndex)
right_index  Use the row index in right as its join key (or keys, if a MultiIndex)
sort         Sort merged data lexicographically by join keys; True by default (disabling this can give better performance in some cases on large datasets)
suffixes     Tuple of string values to append to column names in case of overlap; defaults to ('_x', '_y') (e.g., if 'data' is in both DataFrame objects, it would appear as 'data_x' and 'data_y' in the result)
copy         If False, avoid copying data into the resulting data structure in some exceptional cases; by default always copies
indicator    Adds a special column _merge that indicates the source of each row; values will be 'left_only', 'right_only', or 'both' based on the origin of the joined data in each row (see the sketch following the first merge examples below)
df1 = pd.DataFrame({'key':['b','b','a','c','a','a','b'],
                   'data1':range(7)})
df1
  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   a      5
6   b      6
df2 = pd.DataFrame({'key':['a','b','d'],
                   'data2':range(3)})
df2
  key  data2
0   a      0
1   b      1
2   d      2
pd.merge(df1,df2)
  key  data1  data2
0   b      0      1
1   b      1      1
2   b      6      1
3   a      2      0
4   a      4      0
5   a      5      0
#Note that I didn't specify which column to join on. If that information is not specified, merge uses the overlapping column names as the keys.
#It's a good practice to specify the join key explicitly, though
pd.merge(df1,df2,on='key')
  key  data1  data2
0   b      0      1
1   b      1      1
2   b      6      1
3   a      2      0
4   a      4      0
5   a      5      0
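The indicator option from the table above is never demonstrated in these notes; a quick sketch using the same df1 and df2 (my addition):

pd.merge(df1,df2,on='key',how='outer',indicator=True)
#adds a _merge column whose value is 'both' for the a/b rows,
#'left_only' for key 'c', and 'right_only' for key 'd'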
#If the column names are different in each object, you can specify them separately
df3 = pd.DataFrame({'Lkey':['b','b','a','c','a','a','b'],
                   'data1':range(7)})
df3
  Lkey  data1
0    b      0
1    b      1
2    a      2
3    c      3
4    a      4
5    a      5
6    b      6
df4 = pd.DataFrame({'Rkey':['a','b','d'],
                   'data2':range(3)})
df4
  Rkey  data2
0    a      0
1    b      1
2    d      2
pd.merge(df3,df4,left_on='Lkey',right_on='Rkey')
  Lkey  data1 Rkey  data2
0    b      0    b      1
1    b      1    b      1
2    b      6    b      1
3    a      2    a      0
4    a      4    a      0
5    a      5    a      0
#Other possible how options are 'left', 'right', and 'outer'.
#The outer join takes the union of the keys, combining the effect of applying both left and right joins
pd.merge(df1,df2,how='outer')
  key  data1  data2
0   b    0.0    1.0
1   b    1.0    1.0
2   b    6.0    1.0
3   a    2.0    0.0
4   a    4.0    0.0
5   a    5.0    0.0
6   c    3.0    NaN
7   d    NaN    2.0
df1 = pd.DataFrame({'key':['b','b','a','c','a','b'],
                   'data1':range(6)})
df1
  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   b      5
df2 = pd.DataFrame({'key':['a','b','a','b','d'],
                   'data2':range(5)})
df2
  key  data2
0   a      0
1   b      1
2   a      2
3   b      3
4   d      4
pd.merge(df1,df2,on='key',how='left')
   key  data1  data2
0    b      0    1.0
1    b      0    3.0
2    b      1    1.0
3    b      1    3.0
4    a      2    0.0
5    a      2    2.0
6    c      3    NaN
7    a      4    0.0
8    a      4    2.0
9    b      5    1.0
10   b      5    3.0
pd.merge(df1,df2,on='key',how='inner')
  key  data1  data2
0   b      0      1
1   b      0      3
2   b      1      1
3   b      1      3
4   b      5      1
5   b      5      3
6   a      2      0
7   a      2      2
8   a      4      0
9   a      4      2
#To merge with multiple keys, pass a list of column names
left = pd.DataFrame({'key1':['foo','foo','bar'],
                    'key2':['one','two','one'],
                    'lval':[1,2,3]})
left
  key1 key2  lval
0  foo  one     1
1  foo  two     2
2  bar  one     3
right = pd.DataFrame({'key1':['foo','foo','bar','bar'],
                    'key2':['one','one','one','two'],
                    'rval':[4,5,6,7]})
right
  key1 key2  rval
0  foo  one     4
1  foo  one     5
2  bar  one     6
3  bar  two     7
pd.merge(left,right,on=['key1','key2'],how='outer')
  key1 key2  lval  rval
0  foo  one   1.0   4.0
1  foo  one   1.0   5.0
2  foo  two   2.0   NaN
3  bar  one   3.0   6.0
4  bar  two   NaN   7.0
#merge has a suffixes option for specifying strings to append
#to overlapping column names in the left and right DataFrame objects
pd.merge(left,right,on=['key1'])
  key1 key2_x  lval key2_y  rval
0  foo    one     1    one     4
1  foo    one     1    one     5
2  foo    two     2    one     4
3  foo    two     2    one     5
4  bar    one     3    one     6
5  bar    one     3    two     7
pd.merge(left,right,on=['key1'],suffixes=('_left','_right'))
  key1 key2_left  lval key2_right  rval
0  foo       one     1        one     4
1  foo       one     1        one     5
2  foo       two     2        one     4
3  foo       two     2        one     5
4  bar       one     3        one     6
5  bar       one     3        two     7

8.2.2 Merging on Index

left1 = pd.DataFrame({'key':['a','b','a','a','b','c'],
                     'value':range(6)})
left1
  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
right1 = pd.DataFrame({'group_val':[3.5,7]},index = ['a','b'])
right1
   group_val
a        3.5
b        7.0
pd.merge(left1,right1,left_on = 'key',right_index=True)
  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
pd.merge(left1,right1,left_on = 'key',right_index=True,how='outer')
  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
5   c      5        NaN
#With hierarchically indexed data, things are more complicated, as joining on index is implicitly a multiple-key merge
lefth = pd.DataFrame({'key1':['ohio','ohio','ohio','Nevada','Nevada'],
                     'key2':[2000,2001,2002,2001,2002],
                     'data':np.arange(5.)})
lefth
     key1  key2  data
0    ohio  2000   0.0
1    ohio  2001   1.0
2    ohio  2002   2.0
3  Nevada  2001   3.0
4  Nevada  2002   4.0
righth = pd.DataFrame(np.arange(12).reshape(6,2),
                     index=[['nevada','nevada','ohio','ohio','ohio','ohio'],[2001,2000,2000,2000,2001,2002]],
                     columns = ['event1','event2'])
righth
             event1  event2
nevada 2001       0       1
       2000       2       3
ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True,how='outer')
     key1  key2  data  event1  event2
0    ohio  2000   0.0     4.0     5.0
0    ohio  2000   0.0     6.0     7.0
1    ohio  2001   1.0     8.0     9.0
2    ohio  2002   2.0    10.0    11.0
3  Nevada  2001   3.0     NaN     NaN
4  Nevada  2002   4.0     NaN     NaN
4  nevada  2001   NaN     0.0     1.0
4  nevada  2000   NaN     2.0     3.0
left2 = pd.DataFrame([[1,2],[3,4],[5,6]],
                    index = ['a','c','e'],
                    columns = ['ohio','nevada'])
left2
   ohio  nevada
a     1       2
c     3       4
e     5       6
right2 = pd.DataFrame([[7,8],[9,10],[11,12],[13,14]],
                    index = ['b','c','d','e'],
                    columns = ['missouri','alabama'])
right2
   missouri  alabama
b         7        8
c         9       10
d        11       12
e        13       14
pd.merge(left2,right2,how='outer',left_index=True,right_index=True)
   ohio  nevada  missouri  alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0
#DataFrame has a join instance method for merging by index
left2.join(right2,how='outer')
   ohio  nevada  missouri  alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0
left1.join(right1,on='key')
  key  value  group_val
0   a      0        3.5
1   b      1        7.0
2   a      2        3.5
3   a      3        3.5
4   b      4        7.0
5   c      5        NaN
another = pd.DataFrame([[7,8],[9,10],[11,12],[16,17]],
                      index = ['a','c','e','f'],
                      columns = ['new york','oregon'])
another
   new york  oregon
a         7       8
c         9      10
e        11      12
f        16      17
left2.join([right2,another])
   ohio  nevada  missouri  alabama  new york  oregon
a   1.0     2.0       NaN      NaN       7.0     8.0
c   3.0     4.0       9.0     10.0       9.0    10.0
e   5.0     6.0      13.0     14.0      11.0    12.0
left2.join([right2,another],how='outer')
   ohio  nevada  missouri  alabama  new york  oregon
a   1.0     2.0       NaN      NaN       7.0     8.0
c   3.0     4.0       9.0     10.0       9.0    10.0
e   5.0     6.0      13.0     14.0      11.0    12.0
b   NaN     NaN       7.0      8.0       NaN     NaN
d   NaN     NaN      11.0     12.0       NaN     NaN
f   NaN     NaN       NaN      NaN      16.0    17.0

8.2.3 Concatenating Along an Axis

Argument   Description
objs       List or dict of pandas objects to be concatenated; this is the only required argument
axis       Axis to concatenate along; defaults to 0 (along rows)
join       Either 'inner' or 'outer' ('outer' by default); whether to intersect (inner) or union (outer) the indexes along the other axes
join_axes  Specific indexes to use for the other n-1 axes instead of performing inner/outer set logic (removed in pandas 1.0; use reindex instead)
keys       Values to associate with the objects being concatenated, forming a hierarchical index along the concatenation axis; can be a list or array of arbitrary values, an array of tuples, or a list of arrays (if multiple-level arrays are passed in levels)
levels     Specific indexes to use as the hierarchical index level or levels if keys are passed
arr = np.arange(12).reshape(3,4)
arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
np.concatenate([arr,arr],axis=1)
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
s1 = pd.Series([0,1],index=['a','b'])
s1
a    0
b    1
dtype: int64
s2 = pd.Series([2,3,4],index=['c','d','e'])
s2
c    2
d    3
e    4
dtype: int64
s3 = pd.Series([5,6],index=['f','g'])
s3
f    5
g    6
dtype: int64
pd.concat([s1,s2,s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
#By default concat works along axis=0, producing another Series.
#If you pass axis=1, the result will instead be a DataFrame (axis=1 is the columns)
pd.concat([s1,s2,s3],axis=1)
     0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
f  NaN  NaN  5.0
g  NaN  NaN  6.0
s4 = pd.concat([s1,s3])
s4
a    0
b    1
f    5
g    6
dtype: int64
pd.concat([s1,s4],axis=1)
     0  1
a  0.0  0
b  1.0  1
f  NaN  5
g  NaN  6
pd.concat([s1,s4],axis=1,join='inner')
   0  1
a  0  0
b  1  1
#You can specify the indexes to be used on the other axes with join_axes (removed in pandas 1.0; a modern equivalent is sketched after this example)
pd.concat([s1,s4],axis=1,join_axes=[['a','c','b','e']])
     0    1
a  0.0  0.0
c  NaN  NaN
b  1.0  1.0
e  NaN  NaN
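join_axes was removed in pandas 1.0; the modern equivalent (my addition, not in the original notes) reindexes after concatenation:

pd.concat([s1,s4],axis=1).reindex(['a','c','b','e'])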
result = pd.concat([s1,s2,s3],keys=['one','two','three'])
result
one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64
result.unstack()
         a    b    c    d    e    f    g
one    0.0  1.0  NaN  NaN  NaN  NaN  NaN
two    NaN  NaN  2.0  3.0  4.0  NaN  NaN
three  NaN  NaN  NaN  NaN  NaN  5.0  6.0
pd.concat([s1,s2,s3],axis = 1,keys=['one','two','three'])
   one  two  three
a  0.0  NaN    NaN
b  1.0  NaN    NaN
c  NaN  2.0    NaN
d  NaN  3.0    NaN
e  NaN  4.0    NaN
f  NaN  NaN    5.0
g  NaN  NaN    6.0
df1 = pd.DataFrame(np.arange(6).reshape(3,2),
                  index = ['a','b','c'],
                  columns = ['one','two'])
df1
   one  two
a    0    1
b    2    3
c    4    5
df2 = pd.DataFrame(np.arange(4).reshape(2,2)+5,
                  index = ['a','c'],
                  columns = ['three','four'])
df2
   three  four
a      5     6
c      7     8
pd.concat([df1,df2],axis=1,keys=['lever1','level2'])
  lever1     level2
     one two  three four
a      0   1    5.0   6.0
b      2   3    NaN   NaN
c      4   5    7.0   8.0
#If you pass a dict of objects instead of a list, the dict's keys will be used for the keys option
pd.concat({'level1':df1,'level2':df2},axis=1)
  level1     level2
     one two  three four
a      0   1    5.0   6.0
b      2   3    NaN   NaN
c      4   5    7.0   8.0
pd.concat([df1,df2],axis=1,keys=['lever1','level2'],names=['upper','lower'])
upper lever1     level2
lower    one two  three four
a          0   1    5.0   6.0
b          2   3    NaN   NaN
c          4   5    7.0   8.0
df1 = pd.DataFrame(np.random.randn(3,4),columns = ['a','b','c','d'])
df1
          a         b         c         d
0 -1.119593  1.953114 -1.514807 -1.054782
1  0.543393  1.172903  0.945829  0.656643
2  1.012695  1.481920 -0.413033 -1.280521
df2 = pd.DataFrame(np.random.randn(2,3),columns = ['b','d','a'])
df2
          b         d         a
0  1.638046 -0.850112  1.895532
1 -1.175952  1.370474 -0.992356
pd.concat([df1,df2],ignore_index=True)
          a         b         c         d
0 -1.119593  1.953114 -1.514807 -1.054782
1  0.543393  1.172903  0.945829  0.656643
2  1.012695  1.481920 -0.413033 -1.280521
3  1.895532  1.638046       NaN -0.850112
4 -0.992356 -1.175952       NaN  1.370474

8.2.4 Combining Data with Overlap

a = pd.Series([np.nan,2.5,0,3.5,4.5,np.nan],
             index=['f','e','d','c','b','a'])
a
f    NaN
e    2.5
d    0.0
c    3.5
b    4.5
a    NaN
dtype: float64
b = pd.Series([0,np.nan,2,np.nan,np.nan,5],
             index=['a','b','c','d','e','f'])
b
a    0.0
b    NaN
c    2.0
d    NaN
e    NaN
f    5.0
dtype: float64
#Consider NumPy's where function, which performs the array-oriented equivalent of an if-else expression
np.where(pd.isnull(a),b,a)
array([0. , 2.5, 0. , 3.5, 4.5, 5. ])
#Series has a combine_first method, which performs the equivalent of this operation along with pandas's usual data alignment logic
b.combine_first(a)
a    0.0
b    4.5
c    2.0
d    0.0
e    2.5
f    5.0
dtype: float64
df1 = pd.DataFrame({'a':[1,np.nan,5,np.nan],
                   'b':[np.nan,2,np.nan,6],
                   'c':range(2,18,4)})
df1
     a    b   c
0  1.0  NaN   2
1  NaN  2.0   6
2  5.0  NaN  10
3  NaN  6.0  14
df2 = pd.DataFrame({'a':[5,4,np.nan,3,7],
                   'b':[np.nan,3,4,6,8]})
df2
     a    b
0  5.0  NaN
1  4.0  3.0
2  NaN  4.0
3  3.0  6.0
4  7.0  8.0
df1.combine_first(df2)
     a    b     c
0  1.0  NaN   2.0
1  4.0  2.0   6.0
2  5.0  4.0  10.0
3  3.0  6.0  14.0
4  7.0  8.0   NaN

8.3 Reshaping and Pivoting

8.3.1 Reshaping with Hierarchical Indexing

data = pd.DataFrame(np.arange(6).reshape(2,3),
                    index = pd.Index(['ohio','colorado'],name='state'),
                    columns = pd.Index(['one','two','three'],name='number'))
data
number    one  two  three
state
ohio        0    1      2
colorado    3    4      5
result = data.stack()
result
state     number
ohio      one       0
          two       1
          three     2
colorado  one       3
          two       4
          three     5
dtype: int32
result.unstack()
number    one  two  three
state
ohio        0    1      2
colorado    3    4      5
result.unstack(0)
state   ohio  colorado
number
one        0         3
two        1         4
three      2         5
result.unstack('state')
state   ohio  colorado
number
one        0         3
two        1         4
three      2         5
#Unstacking might introduce missing data if all of the values in the level aren't found in each subgroup
s1 = pd.Series([0,1,2,3],index=['a','b','c','d'])
s1
a    0
b    1
c    2
d    3
dtype: int64
s2 = pd.Series([4,5,6],index=['c','d','e'])
s2
c    4
d    5
e    6
dtype: int64
data = pd.concat([s1,s2],keys=['one','two'])
data
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64
data.unstack()
       a    b    c    d    e
one  0.0  1.0  2.0  3.0  NaN
two  NaN  NaN  4.0  5.0  6.0
#Stacking filters out missing data by default, so the stack/unstack operations are invertible
data.unstack().stack()
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
data.unstack().stack(dropna = False)
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64
#When you unstack in a DataFrame, the level unstacked becomes the lowest level in the result
df = pd.DataFrame({'left':result,'right':result+5},
                 columns=pd.Index(['left','right'],name='side'))
df
side             left  right
state    number
ohio     one        0      5
         two        1      6
         three      2      7
colorado one        3      8
         two        4      9
         three      5     10
df.unstack('state')
side    left          right
state   ohio colorado  ohio colorado
number
one        0        3     5        8
two        1        4     6        9
three      2        5     7       10
#When calling stack, we can indicate the name of the axis to stack
df.unstack('state').stack('side')
state         colorado  ohio
number side
one    left          3     0
       right         8     5
two    left          4     1
       right         9     6
three  left          5     2
       right        10     7
df.unstack('state').stack()
side             left  right
number state
one    ohio         0      5
       colorado     3      8
two    ohio         1      6
       colorado     4      9
three  ohio         2      7
       colorado     5     10

8.3.2 Pivoting "Long" to "Wide" Format

data = pd.read_csv('examples/macrodata.csv')
data.head()
     year  quarter   realgdp  realcons  realinv  realgovt  realdpi    cpi     m1  tbilrate  unemp      pop  infl  realint
0  1959.0      1.0  2710.349    1707.4  286.898   470.045   1886.9  28.98  139.7      2.82    5.8  177.146  0.00     0.00
1  1959.0      2.0  2778.801    1733.7  310.859   481.301   1919.7  29.15  141.7      3.08    5.1  177.830  2.34     0.74
2  1959.0      3.0  2775.488    1751.8  289.226   491.260   1916.4  29.35  140.5      3.82    5.3  178.657  2.74     1.09
3  1959.0      4.0  2785.204    1753.7  299.356   484.052   1931.3  29.37  140.0      4.33    5.6  179.386  0.27     4.06
4  1960.0      1.0  2847.699    1770.5  331.722   462.199   1955.5  29.54  139.6      3.50    5.2  180.007  2.31     1.19
periods = pd.PeriodIndex(year=data.year,quarter=data.quarter,name='date')
periods
PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', name='date', length=203, freq='Q-DEC')
columns = pd.Index(['realgdp','infl','unemp'],name='item')
columns
Index(['realgdp', 'infl', 'unemp'], dtype='object', name='item')
data = data.reindex(columns=columns)
data.head()
item   realgdp  infl  unemp
0     2710.349   NaN    5.8
1     2778.801   NaN    5.1
2     2775.488   NaN    5.3
3     2785.204   NaN    5.6
4     2847.699   NaN    5.2
data.index = periods.to_timestamp('D','end')
data.head()
item                            realgdp  infl  unemp
date
1959-03-31 23:59:59.999999999  2710.349   NaN    5.8
1959-06-30 23:59:59.999999999  2778.801   NaN    5.1
1959-09-30 23:59:59.999999999  2775.488   NaN    5.3
1959-12-31 23:59:59.999999999  2785.204   NaN    5.6
1960-03-31 23:59:59.999999999  2847.699   NaN    5.2
ldata = data.stack().reset_index().rename(columns={0:'value'})
ldata[:10]
                           date     item     value
0 1959-03-31 23:59:59.999999999  realgdp  2710.349
1 1959-03-31 23:59:59.999999999    unemp     5.800
2 1959-06-30 23:59:59.999999999  realgdp  2778.801
3 1959-06-30 23:59:59.999999999    unemp     5.100
4 1959-09-30 23:59:59.999999999  realgdp  2775.488
5 1959-09-30 23:59:59.999999999    unemp     5.300
6 1959-12-31 23:59:59.999999999  realgdp  2785.204
7 1959-12-31 23:59:59.999999999    unemp     5.600
8 1960-03-31 23:59:59.999999999  realgdp  2847.699
9 1960-03-31 23:59:59.999999999    unemp     5.200
pivoted = ldata.pivot('date','item','value')
pivoted
item                             realgdp  unemp
date
1959-03-31 23:59:59.999999999   2710.349    5.8
1959-06-30 23:59:59.999999999   2778.801    5.1
1959-09-30 23:59:59.999999999   2775.488    5.3
1959-12-31 23:59:59.999999999   2785.204    5.6
1960-03-31 23:59:59.999999999   2847.699    5.2
...                                  ...    ...
2008-09-30 23:59:59.999999999  13324.600    6.0
2008-12-31 23:59:59.999999999  13141.920    6.9
2009-03-31 23:59:59.999999999  12925.410    8.1
2009-06-30 23:59:59.999999999  12901.504    9.2
2009-09-30 23:59:59.999999999  12990.341    9.6

203 rows × 2 columns

8.3.3 Pivoting "Wide" to "Long" Format

df = pd.DataFrame({'key':['foo','bar','baz'],
                  'A':[1,2,3],
                  'B':[4,5,6],
                  'C':[7,8,9]})
df
   key  A  B  C
0  foo  1  4  7
1  bar  2  5  8
2  baz  3  6  9
#When using pandas.melt, we must indicate which columns (if any) are group indicators
melted = pd.melt(df)
melted
   variable value
0       key   foo
1       key   bar
2       key   baz
3         A     1
4         A     2
5         A     3
6         B     4
7         B     5
8         B     6
9         C     7
10        C     8
11        C     9
melted = pd.melt(df,['key'])
melted
   key variable  value
0  foo        A      1
1  bar        A      2
2  baz        A      3
3  foo        B      4
4  bar        B      5
5  baz        B      6
6  foo        C      7
7  bar        C      8
8  baz        C      9
reshaped = melted.pivot('key','variable','value')
reshaped
variable  A  B  C
key
bar       2  5  8
baz       3  6  9
foo       1  4  7
#Use reset_index to move the data back into a column
reshaped.reset_index()
variable  key  A  B  C
0         bar  2  5  8
1         baz  3  6  9
2         foo  1  4  7
pd.melt(df,id_vars=['key'],value_vars=['A','B'])
   key variable  value
0  foo        A      1
1  bar        A      2
2  baz        A      3
3  foo        B      4
4  bar        B      5
5  baz        B      6
pd.melt(df,value_vars=['A','B','C'])
  variable value
0        A     1
1        A     2
2        A     3
3        B     4
4        B     5
5        B     6
6        C     7
7        C     8
8        C     9
pd.melt(df,value_vars=['A','B','key'])
  variable value
0        A     1
1        A     2
2        A     3
3        B     4
4        B     5
5        B     6
6      key   foo
7      key   bar
8      key   baz

Chapter 9: Plotting and Visualization

%matplotlib notebook

9.1 A Brief matplotlib API Primer

import matplotlib.pyplot as plt
import numpy as np
data = np.arange(10)
data
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
plt.plot(data)

9.1.1 Figures and Subplots

Argument    Description
nrows       Number of rows of subplots
ncols       Number of columns of subplots
sharex      All subplots should use the same x-axis ticks (adjusting xlim will affect all subplots)
sharey      All subplots should use the same y-axis ticks (adjusting ylim will affect all subplots)
subplot_kw  Dict of keyword arguments passed to add_subplot used to create each subplot
**fig_kw    Additional keyword arguments used when creating the figure, such as plt.subplots(2,2,figsize=(8,6))
#You can create a new figure with plt.figure
fig = plt.figure()
ax1 = fig.add_subplot(2,2,1)
ax2 = fig.add_subplot(2,2,2)
ax3 = fig.add_subplot(2,2,3)
<IPython.core.display.Javascript object>
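plt.subplots from the table above is the usual shortcut that creates a figure and a grid of subplots in one call; a minimal sketch (my addition, not in the original notes):

fig,axes = plt.subplots(2,2,sharex=True,sharey=True)
axes[0,1].hist(np.random.randn(100),bins=20)   #draw into the top-right subplot
plt.subplots_adjust(wspace=0,hspace=0)         #remove the gaps between subplots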

Source: https://blog.csdn.net/weixin_43816759/article/details/122214004