Pandas
丹丹

基本设置

1
pd.set_option("max_columns", 50)

数据缺失检查

1
2
3
apts=pd.Series(["Hangzhou", "Beijing", "Shenzhen"])
print(apts.notnull())
print(apts.isnull())
0    True
1    True
2    True
dtype: bool
0    False
1    False
2    False
dtype: bool

Series

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# -*- coding: utf-8 -*- 
from pandas import Series

print("===用数组生成Series===")
obj = Series([4, 7, -5, 3])
print(obj)
print(obj.values)
print(obj.index)
print("")

print("===指定Series的index===")
obj2 = Series([4, 7, -5, 3], index = ['d', 'b', 'a', 'c'])
print(obj2)
print(obj2.index)
print(obj2['a'])
obj2['d'] = 6
print(obj2[['c', 'a', 'd']])
print(obj2[obj2 > 0]) # 找出大于0的元素
print('b' in obj2) # 判断索引是否存在
print('e' in obj2)
print("")

print("===使用字典生成Series===")
sdata = {'Ohio':45000, 'Texas':71000, 'Oregon':16000, 'Utah':5000}
obj3 = Series(sdata)
print(obj3)
print("")

print("===使用字典生成Series,并额外指定index,不匹配部分为NaN===")
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index = states)
print(obj4)
print("")

print("===Series相加,相同索引部分相加===")
print(obj3 + obj4)
print("")

print("===指定Series及其索引的名字===")
obj4.name = 'population'
obj4.index.name = 'state'
print( obj4)
print("")

print("===替换index===")
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
print(obj)
===用数组生成Series===
0    4
1    7
2   -5
3    3
dtype: int64
[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)

===指定Series的index===
d    4
b    7
a   -5
c    3
dtype: int64
Index(['d', 'b', 'a', 'c'], dtype='object')
-5
c    3
a   -5
d    6
dtype: int64
d    6
b    7
c    3
dtype: int64
True
False

===使用字典生成Series===
Ohio      45000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

===使用字典生成Series,并额外指定index,不匹配部分为NaN===
California        NaN
Ohio          45000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

===Series相加,相同索引部分相加===
California         NaN
Ohio           90000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

===指定Series及其索引的名字===
state
California        NaN
Ohio          45000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

===替换index===
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

填充

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

# -*- coding: utf-8 -*-
import numpy as np
from numpy import nan as NA
import pandas as pd
from pandas import Series, DataFrame, Index

print("===填充0===")
df = DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
print(df.fillna(0))
df.fillna(0, inplace = True)
print(df)
print('')

print("===不同行列填充不同的值===")
print(df.fillna({1:0.5, 3:-1})) # 第3列不存在
print('')

print("===不同的填充方式===")
df = DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
print(df)
print(df.fillna(method = 'ffill'))
print(df.fillna(method = 'ffill', limit = 2))
print('')

print("===用统计数据填充===")
data = Series([1., NA, 3.5, NA, 7])
print(data.fillna(data.mean()))

===填充0===
          0         1         2
0  0.619043  0.000000  0.000000
1  1.262121  0.000000  0.000000
2 -0.536828  0.000000  1.354608
3  0.273867  0.000000  1.046143
4  0.648504  0.366829 -0.122517
5  0.773085 -0.547392 -0.289962
6  0.070365 -1.258029  0.144977
          0         1         2
0  0.619043  0.000000  0.000000
1  1.262121  0.000000  0.000000
2 -0.536828  0.000000  1.354608
3  0.273867  0.000000  1.046143
4  0.648504  0.366829 -0.122517
5  0.773085 -0.547392 -0.289962
6  0.070365 -1.258029  0.144977

===不同行列填充不同的值===
          0         1         2
0  0.619043  0.000000  0.000000
1  1.262121  0.000000  0.000000
2 -0.536828  0.000000  1.354608
3  0.273867  0.000000  1.046143
4  0.648504  0.366829 -0.122517
5  0.773085 -0.547392 -0.289962
6  0.070365 -1.258029  0.144977

===不同的填充方式===
          0         1         2
0  0.755876  1.438577  0.986349
1  0.457114 -0.435046 -0.365518
2  0.631733       NaN  1.537871
3  1.217393       NaN  0.636592
4 -1.095912       NaN       NaN
5 -1.314859       NaN       NaN
          0         1         2
0  0.755876  1.438577  0.986349
1  0.457114 -0.435046 -0.365518
2  0.631733 -0.435046  1.537871
3  1.217393 -0.435046  0.636592
4 -1.095912 -0.435046  0.636592
5 -1.314859 -0.435046  0.636592
          0         1         2
0  0.755876  1.438577  0.986349
1  0.457114 -0.435046 -0.365518
2  0.631733 -0.435046  1.537871
3  1.217393 -0.435046  0.636592
4 -1.095912       NaN  0.636592
5 -1.314859       NaN  0.636592

===用统计数据填充===
0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# -*- coding: utf-8 -*- 
import numpy as np
from pandas import Series, DataFrame

print("===指定索引,在列中指定不存在的列,默认数据用NaN===")
frame2 = DataFrame(data,
columns = ['year', 'state', 'pop', 'debt'],
index = ['one', 'two', 'three', 'four', 'five'])
print(frame2)
print(frame2['state'])
print(frame2.year)
print(frame2.loc['three'])
frame2['debt'] = 16.5 # 修改一整列
print(frame2)
frame2.debt = np.arange(5) # 用numpy数组修改元素
print(frame2)
print('')

print("===用Series指定要修改的索引及其对应的值,没有指定的默认数据用NaN===")
val = Series([-1.2, -1.5, -1.7], index = ['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)
print('')

print("===赋值给新列===")
frame2['eastern'] = (frame2.state == 'Ohio') # 如果state等于Ohio为True
print(frame2)
print(frame2.columns)
print('')

print("===DataFrame转置===")
pop = {'Nevada':{2001:2.4, 2002:2.9},
'Ohio':{2000:1.5, 2001:1.7, 2002:3.6}}
frame3 = DataFrame(pop)
print(frame3)
print(frame3.T)
print('')

print("===指定索引顺序,以及使用切片初始化数据===")
print(DataFrame(pop))
pdata = {'Ohio':frame3['Ohio'][:-1], 'Nevada':frame3['Nevada'][:2]}
print(DataFrame(pdata))
print('')

print("===指定索引和列的名称===")
frame3.index.name = 'year'
frame3.columns.name = 'state'
print(frame3)
print(frame3.values)
print(frame2.values)
print('')

print("===stack===")
print(frame3.stack())
===指定索引,在列中指定不存在的列,默认数据用NaN===
       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object
one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object
       year   state  pop  debt
one    2000    Ohio  1.5  16.5
two    2001    Ohio  1.7  16.5
three  2002    Ohio  3.6  16.5
four   2001  Nevada  2.4  16.5
five   2002  Nevada  2.9  16.5
       year   state  pop  debt
one    2000    Ohio  1.5     0
two    2001    Ohio  1.7     1
three  2002    Ohio  3.6     2
four   2001  Nevada  2.4     3
five   2002  Nevada  2.9     4

===用Series指定要修改的索引及其对应的值,没有指定的默认数据用NaN===
       year   state  pop  debt
one    2000    Ohio  1.5   NaN
two    2001    Ohio  1.7  -1.2
three  2002    Ohio  3.6   NaN
four   2001  Nevada  2.4  -1.5
five   2002  Nevada  2.9  -1.7

===赋值给新列===
       year   state  pop  debt  eastern
one    2000    Ohio  1.5   NaN     True
two    2001    Ohio  1.7  -1.2     True
three  2002    Ohio  3.6   NaN     True
four   2001  Nevada  2.4  -1.5    False
five   2002  Nevada  2.9  -1.7    False
Index(['year', 'state', 'pop', 'debt', 'eastern'], dtype='object')

===DataFrame转置===
      Nevada  Ohio
2000     NaN   1.5
2001     2.4   1.7
2002     2.9   3.6
        2000  2001  2002
Nevada   NaN   2.4   2.9
Ohio     1.5   1.7   3.6

===指定索引顺序,以及使用切片初始化数据===
      Nevada  Ohio
2000     NaN   1.5
2001     2.4   1.7
2002     2.9   3.6
      Ohio  Nevada
2000   1.5     NaN
2001   1.7     2.4

===指定索引和列的名称===
state  Nevada  Ohio
year               
2000      NaN   1.5
2001      2.4   1.7
2002      2.9   3.6
[[nan 1.5]
 [2.4 1.7]
 [2.9 3.6]]
[[2000 'Ohio' 1.5 nan True]
 [2001 'Ohio' 1.7 -1.2 True]
 [2002 'Ohio' 3.6 nan True]
 [2001 'Nevada' 2.4 -1.5 False]
 [2002 'Nevada' 2.9 -1.7 False]]

===stack===
year  state 
2000  Ohio      1.5
2001  Nevada    2.4
      Ohio      1.7
2002  Nevada    2.9
      Ohio      3.6
dtype: float64

Append, Join和Concatenate

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
df1 = pd.DataFrame({'apts': [55000, 60000],
'cars': [200000, 300000],},
index = ['Shanghai', 'Beijing'])
df2 = pd.DataFrame({'apts': [25000, 20000],
'cars': [150000, 120000],},
index = ['Hangzhou', 'Najing'])
df3 = pd.DataFrame({'apts': [30000, 10000],
'cars': [180000, 100000],},
index = ['Guangzhou', 'Chongqing'])
df4 = pd.DataFrame({'salaries': [10000, 30000, 30000, 20000, 15000],
'cities': ['Suzhou', 'Beijing', 'Shanghai', 'Guangzhou', 'Tianjin']})
print(df1)
print(df2)
print(df3)
print(df4)
           apts    cars
Shanghai  55000  200000
Beijing   60000  300000
           apts    cars
Hangzhou  25000  150000
Najing    20000  120000
            apts    cars
Guangzhou  30000  180000
Chongqing  10000  100000
   salaries     cities
0     10000     Suzhou
1     30000    Beijing
2     30000   Shanghai
3     20000  Guangzhou
4     15000    Tianjin
1
2
3
4
5
6
7
8
9
print("===concat===")
frames = [df1, df2, df3]
result = pd.concat(frames)
print(result)
print("")
result2 = pd.concat(frames, keys=['x', 'y', 'z'])
print(result2)
print("")
print(pd.concat([df1, df2], axis=1, sort=False))
===concat===
            apts    cars
Shanghai   55000  200000
Beijing    60000  300000
Hangzhou   25000  150000
Najing     20000  120000
Guangzhou  30000  180000
Chongqing  10000  100000

              apts    cars
x Shanghai   55000  200000
  Beijing    60000  300000
y Hangzhou   25000  150000
  Najing     20000  120000
z Guangzhou  30000  180000
  Chongqing  10000  100000

             apts      cars     apts      cars
Shanghai  55000.0  200000.0      NaN       NaN
Beijing   60000.0  300000.0      NaN       NaN
Hangzhou      NaN       NaN  25000.0  150000.0
Najing        NaN       NaN  20000.0  120000.0
1
2
3
4
print("===append===")
print(df1.append(df2))
print(df1.append(df3))

===append===
           apts    cars
Shanghai  55000  200000
Beijing   60000  300000
Hangzhou  25000  150000
Najing    20000  120000
            apts    cars
Shanghai   55000  200000
Beijing    60000  300000
Guangzhou  30000  180000
Chongqing  10000  100000
1
2
3
4
5
6
7
8
9
10
print("===join===")
print(df1.join(df4))
print("")
print(df1.join(df4, how='right'))
print("")
print(df1.join(df4, how='left'))
print("")
print(df1.join(df4, how='inner'))
print("")
print(df1.join(df4, how='outer'))
===join===
           apts    cars  salaries cities
Shanghai  55000  200000       NaN    NaN
Beijing   60000  300000       NaN    NaN

   apts  cars  salaries     cities
0   NaN   NaN     10000     Suzhou
1   NaN   NaN     30000    Beijing
2   NaN   NaN     30000   Shanghai
3   NaN   NaN     20000  Guangzhou
4   NaN   NaN     15000    Tianjin

           apts    cars  salaries cities
Shanghai  55000  200000       NaN    NaN
Beijing   60000  300000       NaN    NaN

Empty DataFrame
Columns: [apts, cars, salaries, cities]
Index: []

             apts      cars  salaries     cities
Shanghai  55000.0  200000.0       NaN        NaN
Beijing   60000.0  300000.0       NaN        NaN
0             NaN       NaN   10000.0     Suzhou
1             NaN       NaN   30000.0    Beijing
2             NaN       NaN   30000.0   Shanghai
3             NaN       NaN   20000.0  Guangzhou
4             NaN       NaN   15000.0    Tianjin

groupby

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import pandas as pd
salaries = pd.DataFrame({
'Name': ['July', 'Chu', 'Chu', 'Lin', 'July', 'July', 'Chu', 'July'],
'Year': [2016,2016,2016,2016,2017,2017,2017,2017],
'Salary': [10000,2000,4000,5000,18000,25000,3000,4000],
'Bonus': [3000,1000,1000,1200,4000,2300,500,1000]
})

group_by_name = salaries.groupby('Name')
display(group_by_name.sum())

group_by_name_year = salaries.groupby(['Name', 'Year'])
display(group_by_name_year.sum())
display(group_by_name_year.size())
display(group_by_name_year.describe())
Year Salary Bonus
Name
Chu 6049 9000 2500
July 8067 57000 10300
Lin 2016 5000 1200
Salary Bonus
Name Year
Chu 2016 6000 2000
2017 3000 500
July 2016 10000 3000
2017 47000 7300
Lin 2016 5000 1200
Name  Year
Chu   2016    2
      2017    1
July  2016    1
      2017    3
Lin   2016    1
dtype: int64
Bonus Salary
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
Name Year
Chu 2016 2.0 1000.000000 0.00000 1000.0 1000.0 1000.0 1000.0 1000.0 2.0 3000.000000 1414.213562 2000.0 2500.0 3000.0 3500.0 4000.0
2017 1.0 500.000000 NaN 500.0 500.0 500.0 500.0 500.0 1.0 3000.000000 NaN 3000.0 3000.0 3000.0 3000.0 3000.0
July 2016 1.0 3000.000000 NaN 3000.0 3000.0 3000.0 3000.0 3000.0 1.0 10000.000000 NaN 10000.0 10000.0 10000.0 10000.0 10000.0
2017 3.0 2433.333333 1504.43788 1000.0 1650.0 2300.0 3150.0 4000.0 3.0 15666.666667 10692.676622 4000.0 11000.0 18000.0 21500.0 25000.0
Lin 2016 1.0 1200.000000 NaN 1200.0 1200.0 1200.0 1200.0 1200.0 1.0 5000.000000 NaN 5000.0 5000.0 5000.0 5000.0 5000.0

参考:

  1. 廖雪峰Python数据分析