基本设置 1 pd.set_option("max_columns" , 50 )
数据缺失检查 1 2 3 apts=pd.Series(["Hangzhou" , "Beijing" , "Shenzhen" ]) print (apts.notnull())print (apts.isnull())
0 True
1 True
2 True
dtype: bool
0 False
1 False
2 False
dtype: bool
Series 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 from pandas import Seriesprint ("===用数组生成Series===" )obj = Series([4 , 7 , -5 , 3 ]) print (obj)print (obj.values)print (obj.index)print ("" )print ("===指定Series的index===" )obj2 = Series([4 , 7 , -5 , 3 ], index = ['d' , 'b' , 'a' , 'c' ]) print (obj2)print (obj2.index)print (obj2['a' ])obj2['d' ] = 6 print (obj2[['c' , 'a' , 'd' ]])print (obj2[obj2 > 0 ]) print ('b' in obj2) print ('e' in obj2)print ("" )print ("===使用字典生成Series===" )sdata = {'Ohio' :45000 , 'Texas' :71000 , 'Oregon' :16000 , 'Utah' :5000 } obj3 = Series(sdata) print (obj3)print ("" )print ("===使用字典生成Series,并额外指定index,不匹配部分为NaN===" )states = ['California' , 'Ohio' , 'Oregon' , 'Texas' ] obj4 = Series(sdata, index = states) print (obj4)print ("" )print ("===Series相加,相同索引部分相加===" )print (obj3 + obj4)print ("" )print ("===指定Series及其索引的名字===" )obj4.name = 'population' obj4.index.name = 'state' print ( obj4)print ("" )print ("===替换index===" )obj.index = ['Bob' , 'Steve' , 'Jeff' , 'Ryan' ] print (obj)
===用数组生成Series===
0 4
1 7
2 -5
3 3
dtype: int64
[ 4 7 -5 3]
RangeIndex(start=0, stop=4, step=1)
===指定Series的index===
d 4
b 7
a -5
c 3
dtype: int64
Index(['d', 'b', 'a', 'c'], dtype='object')
-5
c 3
a -5
d 6
dtype: int64
d 6
b 7
c 3
dtype: int64
True
False
===使用字典生成Series===
Ohio 45000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
===使用字典生成Series,并额外指定index,不匹配部分为NaN===
California NaN
Ohio 45000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
===Series相加,相同索引部分相加===
California NaN
Ohio 90000.0
Oregon 32000.0
Texas 142000.0
Utah NaN
dtype: float64
===指定Series及其索引的名字===
state
California NaN
Ohio 45000.0
Oregon 16000.0
Texas 71000.0
Name: population, dtype: float64
===替换index===
Bob 4
Steve 7
Jeff -5
Ryan 3
dtype: int64
填充 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 import numpy as npfrom numpy import nan as NAimport pandas as pdfrom pandas import Series, DataFrame, Indexprint ("===填充0===" )df = DataFrame(np.random.randn(7 , 3 )) df.iloc[:4 , 1 ] = NA df.iloc[:2 , 2 ] = NA print (df.fillna(0 ))df.fillna(0 , inplace = True ) print (df)print ('' )print ("===不同行列填充不同的值===" )print (df.fillna({1 :0.5 , 3 :-1 })) print ('' )print ("===不同的填充方式===" )df = DataFrame(np.random.randn(6 , 3 )) df.iloc[2 :, 1 ] = NA df.iloc[4 :, 2 ] = NA print (df)print (df.fillna(method = 'ffill' ))print (df.fillna(method = 'ffill' , limit = 2 ))print ('' )print ("===用统计数据填充===" )data = Series([1. , NA, 3.5 , NA, 7 ]) print (data.fillna(data.mean()))
===填充0===
0 1 2
0 0.619043 0.000000 0.000000
1 1.262121 0.000000 0.000000
2 -0.536828 0.000000 1.354608
3 0.273867 0.000000 1.046143
4 0.648504 0.366829 -0.122517
5 0.773085 -0.547392 -0.289962
6 0.070365 -1.258029 0.144977
0 1 2
0 0.619043 0.000000 0.000000
1 1.262121 0.000000 0.000000
2 -0.536828 0.000000 1.354608
3 0.273867 0.000000 1.046143
4 0.648504 0.366829 -0.122517
5 0.773085 -0.547392 -0.289962
6 0.070365 -1.258029 0.144977
===不同行列填充不同的值===
0 1 2
0 0.619043 0.000000 0.000000
1 1.262121 0.000000 0.000000
2 -0.536828 0.000000 1.354608
3 0.273867 0.000000 1.046143
4 0.648504 0.366829 -0.122517
5 0.773085 -0.547392 -0.289962
6 0.070365 -1.258029 0.144977
===不同的填充方式===
0 1 2
0 0.755876 1.438577 0.986349
1 0.457114 -0.435046 -0.365518
2 0.631733 NaN 1.537871
3 1.217393 NaN 0.636592
4 -1.095912 NaN NaN
5 -1.314859 NaN NaN
0 1 2
0 0.755876 1.438577 0.986349
1 0.457114 -0.435046 -0.365518
2 0.631733 -0.435046 1.537871
3 1.217393 -0.435046 0.636592
4 -1.095912 -0.435046 0.636592
5 -1.314859 -0.435046 0.636592
0 1 2
0 0.755876 1.438577 0.986349
1 0.457114 -0.435046 -0.365518
2 0.631733 -0.435046 1.537871
3 1.217393 -0.435046 0.636592
4 -1.095912 NaN 0.636592
5 -1.314859 NaN 0.636592
===用统计数据填充===
0 1.000000
1 3.833333
2 3.500000
3 3.833333
4 7.000000
dtype: float64
DataFrame 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 import numpy as npfrom pandas import Series, DataFrameprint ("===指定索引,在列中指定不存在的列,默认数据用NaN===" )frame2 = DataFrame(data, columns = ['year' , 'state' , 'pop' , 'debt' ], index = ['one' , 'two' , 'three' , 'four' , 'five' ]) print (frame2)print (frame2['state' ])print (frame2.year)print (frame2.loc['three' ])frame2['debt' ] = 16.5 print (frame2)frame2.debt = np.arange(5 ) print (frame2)print ('' )print ("===用Series指定要修改的索引及其对应的值,没有指定的默认数据用NaN===" )val = Series([-1.2 , -1.5 , -1.7 ], index = ['two' , 'four' , 'five' ]) frame2['debt' ] = val print (frame2)print ('' )print ("===赋值给新列===" )frame2['eastern' ] = (frame2.state == 'Ohio' ) print (frame2)print (frame2.columns)print ('' )print ("===DataFrame转置===" )pop = {'Nevada' :{2001 :2.4 , 2002 :2.9 }, 'Ohio' :{2000 :1.5 , 2001 :1.7 , 2002 :3.6 }} frame3 = DataFrame(pop) print (frame3)print (frame3.T)print ('' )print ("===指定索引顺序,以及使用切片初始化数据===" )print (DataFrame(pop))pdata = {'Ohio' :frame3['Ohio' ][:-1 ], 'Nevada' :frame3['Nevada' ][:2 ]} print (DataFrame(pdata))print ('' )print ("===指定索引和列的名称===" )frame3.index.name = 'year' frame3.columns.name = 'state' print (frame3)print (frame3.values)print (frame2.values)print ('' )print ("===stack===" )print (frame3.stack())
===指定索引,在列中指定不存在的列,默认数据用NaN===
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN
one Ohio
two Ohio
three Ohio
four Nevada
five Nevada
Name: state, dtype: object
one 2000
two 2001
three 2002
four 2001
five 2002
Name: year, dtype: int64
year 2002
state Ohio
pop 3.6
debt NaN
Name: three, dtype: object
year state pop debt
one 2000 Ohio 1.5 16.5
two 2001 Ohio 1.7 16.5
three 2002 Ohio 3.6 16.5
four 2001 Nevada 2.4 16.5
five 2002 Nevada 2.9 16.5
year state pop debt
one 2000 Ohio 1.5 0
two 2001 Ohio 1.7 1
three 2002 Ohio 3.6 2
four 2001 Nevada 2.4 3
five 2002 Nevada 2.9 4
===用Series指定要修改的索引及其对应的值,没有指定的默认数据用NaN===
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7
===赋值给新列===
year state pop debt eastern
one 2000 Ohio 1.5 NaN True
two 2001 Ohio 1.7 -1.2 True
three 2002 Ohio 3.6 NaN True
four 2001 Nevada 2.4 -1.5 False
five 2002 Nevada 2.9 -1.7 False
Index(['year', 'state', 'pop', 'debt', 'eastern'], dtype='object')
===DataFrame转置===
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
2000 2001 2002
Nevada NaN 2.4 2.9
Ohio 1.5 1.7 3.6
===指定索引顺序,以及使用切片初始化数据===
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
Ohio Nevada
2000 1.5 NaN
2001 1.7 2.4
===指定索引和列的名称===
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
[[nan 1.5]
[2.4 1.7]
[2.9 3.6]]
[[2000 'Ohio' 1.5 nan True]
[2001 'Ohio' 1.7 -1.2 True]
[2002 'Ohio' 3.6 nan True]
[2001 'Nevada' 2.4 -1.5 False]
[2002 'Nevada' 2.9 -1.7 False]]
===stack===
year state
2000 Ohio 1.5
2001 Nevada 2.4
Ohio 1.7
2002 Nevada 2.9
Ohio 3.6
dtype: float64
Append, Join和Concatenate 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 df1 = pd.DataFrame({'apts' : [55000 , 60000 ], 'cars' : [200000 , 300000 ],}, index = ['Shanghai' , 'Beijing' ]) df2 = pd.DataFrame({'apts' : [25000 , 20000 ], 'cars' : [150000 , 120000 ],}, index = ['Hangzhou' , 'Najing' ]) df3 = pd.DataFrame({'apts' : [30000 , 10000 ], 'cars' : [180000 , 100000 ],}, index = ['Guangzhou' , 'Chongqing' ]) df4 = pd.DataFrame({'salaries' : [10000 , 30000 , 30000 , 20000 , 15000 ], 'cities' : ['Suzhou' , 'Beijing' , 'Shanghai' , 'Guangzhou' , 'Tianjin' ]}) print (df1)print (df2)print (df3)print (df4)
apts cars
Shanghai 55000 200000
Beijing 60000 300000
apts cars
Hangzhou 25000 150000
Najing 20000 120000
apts cars
Guangzhou 30000 180000
Chongqing 10000 100000
salaries cities
0 10000 Suzhou
1 30000 Beijing
2 30000 Shanghai
3 20000 Guangzhou
4 15000 Tianjin
1 2 3 4 5 6 7 8 9 print ("===concat===" )frames = [df1, df2, df3] result = pd.concat(frames) print (result)print ("" )result2 = pd.concat(frames, keys=['x' , 'y' , 'z' ]) print (result2)print ("" )print (pd.concat([df1, df2], axis=1 , sort=False ))
===concat===
apts cars
Shanghai 55000 200000
Beijing 60000 300000
Hangzhou 25000 150000
Najing 20000 120000
Guangzhou 30000 180000
Chongqing 10000 100000
apts cars
x Shanghai 55000 200000
Beijing 60000 300000
y Hangzhou 25000 150000
Najing 20000 120000
z Guangzhou 30000 180000
Chongqing 10000 100000
apts cars apts cars
Shanghai 55000.0 200000.0 NaN NaN
Beijing 60000.0 300000.0 NaN NaN
Hangzhou NaN NaN 25000.0 150000.0
Najing NaN NaN 20000.0 120000.0
1 2 3 4 print ("===append===" )print (df1.append(df2))print (df1.append(df3))
===append===
apts cars
Shanghai 55000 200000
Beijing 60000 300000
Hangzhou 25000 150000
Najing 20000 120000
apts cars
Shanghai 55000 200000
Beijing 60000 300000
Guangzhou 30000 180000
Chongqing 10000 100000
1 2 3 4 5 6 7 8 9 10 print ("===join===" )print (df1.join(df4))print ("" )print (df1.join(df4, how='right' ))print ("" )print (df1.join(df4, how='left' ))print ("" )print (df1.join(df4, how='inner' ))print ("" )print (df1.join(df4, how='outer' ))
===join===
apts cars salaries cities
Shanghai 55000 200000 NaN NaN
Beijing 60000 300000 NaN NaN
apts cars salaries cities
0 NaN NaN 10000 Suzhou
1 NaN NaN 30000 Beijing
2 NaN NaN 30000 Shanghai
3 NaN NaN 20000 Guangzhou
4 NaN NaN 15000 Tianjin
apts cars salaries cities
Shanghai 55000 200000 NaN NaN
Beijing 60000 300000 NaN NaN
Empty DataFrame
Columns: [apts, cars, salaries, cities]
Index: []
apts cars salaries cities
Shanghai 55000.0 200000.0 NaN NaN
Beijing 60000.0 300000.0 NaN NaN
0 NaN NaN 10000.0 Suzhou
1 NaN NaN 30000.0 Beijing
2 NaN NaN 30000.0 Shanghai
3 NaN NaN 20000.0 Guangzhou
4 NaN NaN 15000.0 Tianjin
groupby 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 import pandas as pdsalaries = pd.DataFrame({ 'Name' : ['July' , 'Chu' , 'Chu' , 'Lin' , 'July' , 'July' , 'Chu' , 'July' ], 'Year' : [2016 ,2016 ,2016 ,2016 ,2017 ,2017 ,2017 ,2017 ], 'Salary' : [10000 ,2000 ,4000 ,5000 ,18000 ,25000 ,3000 ,4000 ], 'Bonus' : [3000 ,1000 ,1000 ,1200 ,4000 ,2300 ,500 ,1000 ] }) group_by_name = salaries.groupby('Name' ) display(group_by_name.sum ()) group_by_name_year = salaries.groupby(['Name' , 'Year' ]) display(group_by_name_year.sum ()) display(group_by_name_year.size()) display(group_by_name_year.describe())
Year
Salary
Bonus
Name
Chu
6049
9000
2500
July
8067
57000
10300
Lin
2016
5000
1200
Salary
Bonus
Name
Year
Chu
2016
6000
2000
2017
3000
500
July
2016
10000
3000
2017
47000
7300
Lin
2016
5000
1200
Name Year
Chu 2016 2
2017 1
July 2016 1
2017 3
Lin 2016 1
dtype: int64
Bonus
Salary
count
mean
std
min
25%
50%
75%
max
count
mean
std
min
25%
50%
75%
max
Name
Year
Chu
2016
2.0
1000.000000
0.00000
1000.0
1000.0
1000.0
1000.0
1000.0
2.0
3000.000000
1414.213562
2000.0
2500.0
3000.0
3500.0
4000.0
2017
1.0
500.000000
NaN
500.0
500.0
500.0
500.0
500.0
1.0
3000.000000
NaN
3000.0
3000.0
3000.0
3000.0
3000.0
July
2016
1.0
3000.000000
NaN
3000.0
3000.0
3000.0
3000.0
3000.0
1.0
10000.000000
NaN
10000.0
10000.0
10000.0
10000.0
10000.0
2017
3.0
2433.333333
1504.43788
1000.0
1650.0
2300.0
3150.0
4000.0
3.0
15666.666667
10692.676622
4000.0
11000.0
18000.0
21500.0
25000.0
Lin
2016
1.0
1200.000000
NaN
1200.0
1200.0
1200.0
1200.0
1200.0
1.0
5000.000000
NaN
5000.0
5000.0
5000.0
5000.0
5000.0
参考:
廖雪峰Python数据分析