데이터를 그룹화하여 통계적 특징을 요약하는 GroupBy 원리를 마스터합니다.
import warnings
warnings.filterwarnings('ignore' )
import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
np.random.seed(12345 )
import matplotlib.pyplot as plt
plt.rc("figure" , figsize= (10 , 6 ))
np.set_printoptions(precision= 4 , suppress= True )
import matplotlib.pyplot as plt
# Matplotlib 한글 폰트 설정 (macOS용)
plt.rc('font' , family= 'AppleGothic' )
plt.rc('axes' , unicode_minus= False )
import numpy as np
import pandas as pd
GroupBy 메커니즘
분할-적용-결합 과정을 통해 그룹 통계량을 산출하는 기초를 학습합니다.
df = pd.DataFrame({"key1" : ["a" , "a" , None , "b" , "b" , "a" , None ],
"key2" : pd.Series([1 , 2 , 1 , 2 , 1 , None , 1 ],
dtype= "Int64" ),
"data1" : np.random.standard_normal(7 ),
"data2" : np.random.standard_normal(7 )})
df
0
a
1
-0.204708
0.281746
1
a
2
0.478943
0.769023
2
None
1
-0.519439
1.246435
3
b
2
-0.555730
1.007189
4
b
1
1.965781
-1.296221
5
a
<NA>
1.393406
0.274992
6
None
1
0.092908
0.228913
GroupBy 메커니즘
분할-적용-결합(Split-Apply-Combine) 과정을 단계별로 이해합니다.
grouped = df["data1" ].groupby(df["key1" ])
grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x17a0b44d0>
key1
a 0.555881
b 0.705025
Name: data1, dtype: float64
means = df["data1" ].groupby([df["key1" ], df["key2" ]]).mean()
means
key1 key2
a 1 -0.204708
2 0.478943
b 1 1.965781
2 -0.555730
Name: data1, dtype: float64
key1
a
-0.204708
0.478943
b
1.965781
-0.555730
states = np.array(["OH" , "CA" , "CA" , "OH" , "OH" , "CA" , "OH" ])
years = [2005 , 2005 , 2006 , 2005 , 2006 , 2005 , 2006 ]
df["data1" ].groupby([states, years]).mean()
CA 2005 0.936175
2006 -0.519439
OH 2005 -0.380219
2006 1.029344
Name: data1, dtype: float64
df.groupby("key1" ).mean()
df.groupby("key2" ).mean(numeric_only= True )
df.groupby(["key1" , "key2" ]).mean()
key1
key2
a
1
-0.204708
0.281746
2
0.478943
0.769023
b
1
1.965781
-1.296221
2
-0.555730
1.007189
df.groupby(["key1" , "key2" ]).size()
key1 key2
a 1 1
2 1
b 1 1
2 1
dtype: int64
df.groupby("key1" , dropna= False ).size()
df.groupby(["key1" , "key2" ], dropna= False ).size()
key1 key2
a 1 1
2 1
<NA> 1
b 1 1
2 1
NaN 1 2
dtype: int64
df.groupby("key1" ).count()
for name, group in df.groupby("key1" ):
print (name)
print (group)
a
key1 key2 data1 data2
0 a 1 -0.204708 0.281746
1 a 2 0.478943 0.769023
5 a <NA> 1.393406 0.274992
b
key1 key2 data1 data2
3 b 2 -0.555730 1.007189
4 b 1 1.965781 -1.296221
for (k1, k2), group in df.groupby(["key1" , "key2" ]):
print ((k1, k2))
print (group)
('a', 1)
key1 key2 data1 data2
0 a 1 -0.204708 0.281746
('a', 2)
key1 key2 data1 data2
1 a 2 0.478943 0.769023
('b', 1)
key1 key2 data1 data2
4 b 1 1.965781 -1.296221
('b', 2)
key1 key2 data1 data2
3 b 2 -0.55573 1.007189
pieces = {name: group for name, group in df.groupby("key1" )}
pieces["b" ]
3
b
2
-0.555730
1.007189
4
b
1
1.965781
-1.296221
grouped = df.groupby({"key1" : "key" , "key2" : "key" ,
"data1" : "data" , "data2" : "data" }, axis= "columns" )
for group_key, group_values in grouped:
print (group_key)
print (group_values)
data
data1 data2
0 -0.204708 0.281746
1 0.478943 0.769023
2 -0.519439 1.246435
3 -0.555730 1.007189
4 1.965781 -1.296221
5 1.393406 0.274992
6 0.092908 0.228913
key
key1 key2
0 a 1
1 a 2
2 None 1
3 b 2
4 b 1
5 a <NA>
6 None 1
df.groupby(["key1" , "key2" ])[["data2" ]].mean()
key1
key2
a
1
0.281746
2
0.769023
b
1
-1.296221
2
1.007189
s_grouped = df.groupby(["key1" , "key2" ])["data2" ]
s_grouped
s_grouped.mean()
key1 key2
a 1 0.281746
2 0.769023
b 1 -1.296221
2 1.007189
Name: data2, dtype: float64
people = pd.DataFrame(np.random.standard_normal((5 , 5 )),
columns= ["a" , "b" , "c" , "d" , "e" ],
index= ["Joe" , "Steve" , "Wanda" , "Jill" , "Trey" ])
people.iloc[2 :3 , [1 , 2 ]] = np.nan # Add a few NA values
people
Joe
1.352917
0.886429
-2.001637
-0.371843
1.669025
Steve
-0.438570
-0.539741
0.476985
3.248944
-1.021228
Wanda
-0.577087
NaN
NaN
0.523772
0.000940
Jill
1.343810
-0.713544
-0.831154
-2.370232
-1.860761
Trey
-0.860757
0.560145
-1.265934
0.119827
-1.063512
mapping = {"a" : "red" , "b" : "red" , "c" : "blue" ,
"d" : "blue" , "e" : "red" , "f" : "orange" }
by_column = people.groupby(mapping, axis= "columns" )
by_column.sum ()
Joe
-2.373480
3.908371
Steve
3.725929
-1.999539
Wanda
0.523772
-0.576147
Jill
-3.201385
-1.230495
Trey
-1.146107
-1.364125
map_series = pd.Series(mapping)
map_series
people.groupby(map_series, axis= "columns" ).count()
Joe
2
3
Steve
2
3
Wanda
1
2
Jill
2
3
Trey
2
3
people.groupby(len ).sum ()
3
1.352917
0.886429
-2.001637
-0.371843
1.669025
4
0.483052
-0.153399
-2.097088
-2.250405
-2.924273
5
-1.015657
-0.539741
0.476985
3.772716
-1.020287
key_list = ["one" , "one" , "one" , "two" , "two" ]
people.groupby([len , key_list]).min ()
3
one
1.352917
0.886429
-2.001637
-0.371843
1.669025
4
two
-0.860757
-0.713544
-1.265934
-2.370232
-1.860761
5
one
-0.577087
-0.539741
0.476985
0.523772
-1.021228
columns = pd.MultiIndex.from_arrays([["US" , "US" , "US" , "JP" , "JP" ],
[1 , 3 , 5 , 1 , 3 ]],
names= ["cty" , "tenor" ])
hier_df = pd.DataFrame(np.random.standard_normal((4 , 5 )), columns= columns)
hier_df
tenor
1
3
5
1
3
0
0.332883
-2.359419
-0.199543
-1.541996
-0.970736
1
-1.307030
0.286350
0.377984
-0.753887
0.331286
2
1.349742
0.069877
0.246674
-0.011862
1.004812
3
1.327195
-0.919262
-1.549106
0.022185
0.758363
hier_df.groupby(level= "cty" , axis= "columns" ).count()
df
grouped = df.groupby("key1" )
grouped["data1" ].nsmallest(2 )
key1
a 0 -0.204708
1 0.478943
b 3 -0.555730
4 1.965781
Name: data1, dtype: float64
데이터 집계
기본 내장 함수 또는 사용자 정의 함수를 사용하여 그룹화된 데이터를 요약합니다.
데이터 집계
그룹별로 요약 통계량을 산출하는 다양한 방식을 익힙니다.
def peak_to_peak(arr):
    """Return the spread (maximum minus minimum) of the values in *arr*."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
grouped.agg(peak_to_peak)
key1
a
1
1.598113
0.494031
b
1
2.521511
2.303410
count
mean
std
min
25%
50%
75%
max
count
mean
...
75%
max
count
mean
std
min
25%
50%
75%
max
a
2.0
1.5
0.707107
1.0
1.25
1.5
1.75
2.0
3.0
0.555881
...
0.936175
1.393406
3.0
0.441920
0.283299
0.274992
0.278369
0.281746
0.525384
0.769023
b
2.0
1.5
0.707107
1.0
1.25
1.5
1.75
2.0
2.0
0.705025
...
1.335403
1.965781
2.0
-0.144516
1.628757
-1.296221
-0.720368
-0.144516
0.431337
1.007189
2 rows × 24 columns
tips = pd.read_csv("examples/tips.csv" )
tips.head()
0
16.99
1.01
No
Sun
Dinner
2
1
10.34
1.66
No
Sun
Dinner
3
2
21.01
3.50
No
Sun
Dinner
3
3
23.68
3.31
No
Sun
Dinner
2
4
24.59
3.61
No
Sun
Dinner
4
tips["tip_pct" ] = tips["tip" ] / tips["total_bill" ]
tips.head()
0
16.99
1.01
No
Sun
Dinner
2
0.059447
1
10.34
1.66
No
Sun
Dinner
3
0.160542
2
21.01
3.50
No
Sun
Dinner
3
0.166587
3
23.68
3.31
No
Sun
Dinner
2
0.139780
4
24.59
3.61
No
Sun
Dinner
4
0.146808
grouped = tips.groupby(["day" , "smoker" ])
grouped_pct = grouped["tip_pct" ]
grouped_pct.agg("mean" )
day smoker
Fri No 0.151650
Yes 0.174783
Sat No 0.158048
Yes 0.147906
Sun No 0.160113
Yes 0.187250
Thur No 0.160298
Yes 0.163863
Name: tip_pct, dtype: float64
grouped_pct.agg(["mean" , "std" , peak_to_peak])
day
smoker
Fri
No
0.151650
0.028123
0.067349
Yes
0.174783
0.051293
0.159925
Sat
No
0.158048
0.039767
0.235193
Yes
0.147906
0.061375
0.290095
Sun
No
0.160113
0.042347
0.193226
Yes
0.187250
0.154134
0.644685
Thur
No
0.160298
0.038774
0.193350
Yes
0.163863
0.039389
0.151240
grouped_pct.agg([("average" , "mean" ), ("stdev" , np.std)])
day
smoker
Fri
No
0.151650
0.028123
Yes
0.174783
0.051293
Sat
No
0.158048
0.039767
Yes
0.147906
0.061375
Sun
No
0.160113
0.042347
Yes
0.187250
0.154134
Thur
No
0.160298
0.038774
Yes
0.163863
0.039389
functions = ["count" , "mean" , "max" ]
result = grouped[["tip_pct" , "total_bill" ]].agg(functions)
result
count
mean
max
count
mean
max
Fri
No
4
0.151650
0.187735
4
18.420000
22.75
Yes
15
0.174783
0.263480
15
16.813333
40.17
Sat
No
45
0.158048
0.291990
45
19.661778
48.33
Yes
42
0.147906
0.325733
42
21.276667
50.81
Sun
No
57
0.160113
0.252672
57
20.506667
48.17
Yes
19
0.187250
0.710345
19
24.120000
45.35
Thur
No
45
0.160298
0.266312
45
17.113111
41.19
Yes
17
0.163863
0.241255
17
19.190588
43.11
day
smoker
Fri
No
4
0.151650
0.187735
Yes
15
0.174783
0.263480
Sat
No
45
0.158048
0.291990
Yes
42
0.147906
0.325733
Sun
No
57
0.160113
0.252672
Yes
19
0.187250
0.710345
Thur
No
45
0.160298
0.266312
Yes
17
0.163863
0.241255
ftuples = [("Average" , "mean" ), ("Variance" , np.var)]
grouped[["tip_pct" , "total_bill" ]].agg(ftuples)
Average
Variance
Average
Variance
Fri
No
0.151650
0.000791
18.420000
25.596333
Yes
0.174783
0.002631
16.813333
82.562438
Sat
No
0.158048
0.001581
19.661778
79.908965
Yes
0.147906
0.003767
21.276667
101.387535
Sun
No
0.160113
0.001793
20.506667
66.099980
Yes
0.187250
0.023757
24.120000
109.046044
Thur
No
0.160298
0.001503
17.113111
59.625081
Yes
0.163863
0.001551
19.190588
69.808518
grouped.agg({"tip" : np.max , "size" : "sum" })
grouped.agg({"tip_pct" : ["min" , "max" , "mean" , "std" ],
"size" : "sum" })
min
max
mean
std
sum
Fri
No
0.120385
0.187735
0.151650
0.028123
9
Yes
0.103555
0.263480
0.174783
0.051293
31
Sat
No
0.056797
0.291990
0.158048
0.039767
115
Yes
0.035638
0.325733
0.147906
0.061375
104
Sun
No
0.059447
0.252672
0.160113
0.042347
167
Yes
0.065660
0.710345
0.187250
0.154134
49
Thur
No
0.072961
0.266312
0.160298
0.038774
112
Yes
0.090014
0.241255
0.163863
0.039389
40
grouped = tips.groupby(["day" , "smoker" ], as_index= False )
grouped.mean(numeric_only= True )
0
Fri
No
18.420000
2.812500
2.250000
0.151650
1
Fri
Yes
16.813333
2.714000
2.066667
0.174783
2
Sat
No
19.661778
3.102889
2.555556
0.158048
3
Sat
Yes
21.276667
2.875476
2.476190
0.147906
4
Sun
No
20.506667
3.167895
2.929825
0.160113
5
Sun
Yes
24.120000
3.516842
2.578947
0.187250
6
Thur
No
17.113111
2.673778
2.488889
0.160298
7
Thur
Yes
19.190588
3.030000
2.352941
0.163863
def top(df, n=5, column="tip_pct"):
    """Return the *n* rows of *df* with the largest values in *column*."""
    ordered = df.sort_values(column, ascending=False)
    return ordered.head(n)
top(tips, n= 6 )
172
7.25
5.15
Yes
Sun
Dinner
2
0.710345
178
9.60
4.00
Yes
Sun
Dinner
2
0.416667
67
3.07
1.00
Yes
Sat
Dinner
1
0.325733
232
11.61
3.39
No
Sat
Dinner
2
0.291990
183
23.17
6.50
Yes
Sun
Dinner
4
0.280535
109
14.31
4.00
Yes
Sat
Dinner
2
0.279525
Apply: 일반적인 분할-적용-결합
apply 메서드를 사용하여 그룹별로 더 복잡한 로직을 적용하는 방법을 다룹니다.
일반적인 Apply 연산
가장 유연하게 그룹별 연산을 적용할 수 있는 apply 메서드를 활용합니다.
tips.groupby("smoker" ).apply (top)
smoker
No
232
11.61
3.39
No
Sat
Dinner
2
0.291990
149
7.51
2.00
No
Thur
Lunch
2
0.266312
51
10.29
2.60
No
Sun
Dinner
2
0.252672
185
20.69
5.00
No
Sun
Dinner
5
0.241663
88
24.71
5.85
No
Thur
Lunch
2
0.236746
Yes
172
7.25
5.15
Yes
Sun
Dinner
2
0.710345
178
9.60
4.00
Yes
Sun
Dinner
2
0.416667
67
3.07
1.00
Yes
Sat
Dinner
1
0.325733
183
23.17
6.50
Yes
Sun
Dinner
4
0.280535
109
14.31
4.00
Yes
Sat
Dinner
2
0.279525
tips.groupby(["smoker" , "day" ]).apply (top, n= 1 , column= "total_bill" )
smoker
day
No
Fri
94
22.75
3.25
No
Fri
Dinner
2
0.142857
Sat
212
48.33
9.00
No
Sat
Dinner
4
0.186220
Sun
156
48.17
5.00
No
Sun
Dinner
6
0.103799
Thur
142
41.19
5.00
No
Thur
Lunch
5
0.121389
Yes
Fri
95
40.17
4.73
Yes
Fri
Dinner
4
0.117750
Sat
170
50.81
10.00
Yes
Sat
Dinner
3
0.196812
Sun
182
45.35
3.50
Yes
Sun
Dinner
3
0.077178
Thur
197
43.11
5.00
Yes
Thur
Lunch
4
0.115982
result = tips.groupby("smoker" )["tip_pct" ].describe()
result
result.unstack("smoker" )
smoker
count No 151.000000
Yes 93.000000
mean No 0.159328
Yes 0.163196
std No 0.039910
Yes 0.085119
min No 0.056797
Yes 0.035638
25% No 0.136906
Yes 0.106771
50% No 0.155625
Yes 0.153846
75% No 0.185014
Yes 0.195059
max No 0.291990
Yes 0.710345
dtype: float64
tips.groupby("smoker" , group_keys= False ).apply (top)
232
11.61
3.39
No
Sat
Dinner
2
0.291990
149
7.51
2.00
No
Thur
Lunch
2
0.266312
51
10.29
2.60
No
Sun
Dinner
2
0.252672
185
20.69
5.00
No
Sun
Dinner
5
0.241663
88
24.71
5.85
No
Thur
Lunch
2
0.236746
172
7.25
5.15
Yes
Sun
Dinner
2
0.710345
178
9.60
4.00
Yes
Sun
Dinner
2
0.416667
67
3.07
1.00
Yes
Sat
Dinner
1
0.325733
183
23.17
6.50
Yes
Sun
Dinner
4
0.280535
109
14.31
4.00
Yes
Sat
Dinner
2
0.279525
frame = pd.DataFrame({"data1" : np.random.standard_normal(1000 ),
"data2" : np.random.standard_normal(1000 )})
frame.head()
quartiles = pd.cut(frame["data1" ], 4 )
quartiles.head(10 )
0 (-1.23, 0.489]
1 (0.489, 2.208]
2 (-1.23, 0.489]
3 (-1.23, 0.489]
4 (0.489, 2.208]
5 (0.489, 2.208]
6 (-1.23, 0.489]
7 (-1.23, 0.489]
8 (-2.956, -1.23]
9 (-1.23, 0.489]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-2.956, -1.23] < (-1.23, 0.489] < (0.489, 2.208] < (2.208, 3.928]]
def get_stats(group):
    """Summarize *group* column-wise with min, max, count and mean."""
    summary = {
        "min": group.min(),
        "max": group.max(),
        "count": group.count(),
        "mean": group.mean(),
    }
    return pd.DataFrame(summary)
grouped = frame.groupby(quartiles)
grouped.apply (get_stats)
data1
(-2.956, -1.23]
data1
-2.949343
-1.230179
94
-1.658818
data2
-3.399312
1.670835
94
-0.033333
(-1.23, 0.489]
data1
-1.228918
0.488675
598
-0.329524
data2
-2.989741
3.260383
598
-0.002622
(0.489, 2.208]
data1
0.489965
2.200997
298
1.065727
data2
-3.745356
2.954439
298
0.078249
(2.208, 3.928]
data1
2.212303
3.927528
10
2.644253
data2
-1.929776
1.765640
10
0.024750
grouped.agg(["min" , "max" , "count" , "mean" ])
min
max
count
mean
min
max
count
mean
(-2.956, -1.23]
-2.949343
-1.230179
94
-1.658818
-3.399312
1.670835
94
-0.033333
(-1.23, 0.489]
-1.228918
0.488675
598
-0.329524
-2.989741
3.260383
598
-0.002622
(0.489, 2.208]
0.489965
2.200997
298
1.065727
-3.745356
2.954439
298
0.078249
(2.208, 3.928]
2.212303
3.927528
10
2.644253
-1.929776
1.765640
10
0.024750
quartiles_samp = pd.qcut(frame["data1" ], 4 , labels= False )
quartiles_samp.head()
grouped = frame.groupby(quartiles_samp)
grouped.apply (get_stats)
data1
0
data1
-2.949343
-0.685484
250
-1.212173
data2
-3.399312
2.628441
250
-0.027045
1
data1
-0.683066
-0.030280
250
-0.368334
data2
-2.630247
3.260383
250
-0.027845
2
data1
-0.027734
0.618965
250
0.295812
data2
-3.056990
2.458842
250
0.014450
3
data1
0.623587
3.927528
250
1.248875
data2
-3.745356
2.954439
250
0.115899
s = pd.Series(np.random.standard_normal(6 ))
s[::2 ] = np.nan
s
s.fillna(s.mean())
0 -0.767366
1 0.227290
2 -0.767366
3 -2.153545
4 -0.767366
5 -0.375842
dtype: float64
states = ["Ohio" , "New York" , "Vermont" , "Florida" ,
"Oregon" , "Nevada" , "California" , "Idaho" ]
group_key = ["East" , "East" , "East" , "East" ,
"West" , "West" , "West" , "West" ]
data = pd.Series(np.random.standard_normal(8 ), index= states)
data
Ohio 0.329939
New York 0.981994
Vermont 1.105913
Florida -1.613716
Oregon 1.561587
Nevada 0.406510
California 0.359244
Idaho -0.614436
dtype: float64
data[["Vermont" , "Nevada" , "Idaho" ]] = np.nan
data
data.groupby(group_key).size()
data.groupby(group_key).count()
data.groupby(group_key).mean()
East -0.100594
West 0.960416
dtype: float64
def fill_mean(group):
    """Replace missing values in *group* with the group's own mean."""
    group_average = group.mean()
    return group.fillna(group_average)
data.groupby(group_key).apply (fill_mean)
East Ohio 0.329939
New York 0.981994
Vermont -0.100594
Florida -1.613716
West Oregon 1.561587
Nevada 0.960416
California 0.359244
Idaho 0.960416
dtype: float64
fill_values = {"East" : 0.5 , "West" : - 1 }
def fill_func(group):
    """Fill NAs in *group* with the preset value for its group name."""
    preset = fill_values[group.name]
    return group.fillna(preset)
data.groupby(group_key).apply (fill_func)
East Ohio 0.329939
New York 0.981994
Vermont 0.500000
Florida -1.613716
West Oregon 1.561587
Nevada -1.000000
California 0.359244
Idaho -1.000000
dtype: float64
suits = ["H", "S", "C", "D"]  # Hearts, Spades, Clubs, Diamonds
# Card values per suit: A=1, 2-10 face value, J/K/Q each worth 10.
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ["A"] + list(range(2, 11)) + ["J", "K", "Q"]
# Card labels are rank + suit letter, suit-major order to match card_val.
cards = [str(num) + suit for suit in suits for num in base_names]
deck = pd.Series(card_val, index=cards)
AH 1
2H 2
3H 3
4H 4
5H 5
6H 6
7H 7
8H 8
9H 9
10H 10
JH 10
KH 10
QH 10
dtype: int64
def draw(deck, n=5):
    """Randomly draw *n* cards (rows) from *deck* without replacement."""
    sampled = deck.sample(n)
    return sampled
draw(deck)
4D 4
QH 10
8S 8
7D 7
9C 9
dtype: int64
def get_suit(card):
    """Return the suit of *card* (its final character, e.g. 'H' in '10H')."""
    # last letter is suit
    *_, suit = card
    return suit
deck.groupby(get_suit).apply (draw, n= 2 )
C 6C 6
KC 10
D 7D 7
3D 3
H 7H 7
9H 9
S 2S 2
QS 10
dtype: int64
deck.groupby(get_suit, group_keys= False ).apply (draw, n= 2 )
AC 1
3C 3
5D 5
4D 4
10H 10
7H 7
QS 10
7S 7
dtype: int64
df = pd.DataFrame({"category" : ["a" , "a" , "a" , "a" ,
"b" , "b" , "b" , "b" ],
"data" : np.random.standard_normal(8 ),
"weights" : np.random.uniform(size= 8 )})
df
0
a
-1.691656
0.955905
1
a
0.511622
0.012745
2
a
-0.401675
0.137009
3
a
0.968578
0.763037
4
b
-1.818215
0.492472
5
b
0.279963
0.832908
6
b
-0.200819
0.658331
7
b
-0.217221
0.612009
grouped = df.groupby("category" )
def get_wavg(group):
    """Weighted average of the "data" column, weighted by "weights"."""
    values = group["data"]
    wts = group["weights"]
    return np.average(values, weights=wts)
grouped.apply (get_wavg)
category
a -0.495807
b -0.357273
dtype: float64
close_px = pd.read_csv("examples/stock_px.csv" , parse_dates= True ,
index_col= 0 )
close_px.info()
close_px.tail(4 )
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 AAPL 2214 non-null float64
1 MSFT 2214 non-null float64
2 XOM 2214 non-null float64
3 SPX 2214 non-null float64
dtypes: float64(4)
memory usage: 86.5 KB
2011-10-11
400.29
27.00
76.27
1195.54
2011-10-12
402.19
26.96
77.16
1207.25
2011-10-13
408.43
27.18
76.37
1203.66
2011-10-14
422.00
27.27
78.11
1224.58
def spx_corr(group):
    """Correlate every column of *group* with its "SPX" column."""
    benchmark = group["SPX"]
    return group.corrwith(benchmark)
rets = close_px.pct_change().dropna()
def get_year(x):
    """Extract the calendar year from a date-like object *x*."""
    year = x.year
    return year
by_year = rets.groupby(get_year)
by_year.apply (spx_corr)
2003
0.541124
0.745174
0.661265
1.0
2004
0.374283
0.588531
0.557742
1.0
2005
0.467540
0.562374
0.631010
1.0
2006
0.428267
0.406126
0.518514
1.0
2007
0.508118
0.658770
0.786264
1.0
2008
0.681434
0.804626
0.828303
1.0
2009
0.707103
0.654902
0.797921
1.0
2010
0.710105
0.730118
0.839057
1.0
2011
0.691931
0.800996
0.859975
1.0
def corr_aapl_msft(group):
    """Pearson correlation between the "AAPL" and "MSFT" columns of *group*."""
    aapl = group["AAPL"]
    msft = group["MSFT"]
    return aapl.corr(msft)
by_year.apply (corr_aapl_msft)
2003 0.480868
2004 0.259024
2005 0.300093
2006 0.161735
2007 0.417738
2008 0.611901
2009 0.432738
2010 0.571946
2011 0.581987
dtype: float64
import statsmodels.api as sm
def regress(data, yvar=None, xvars=None):
    """OLS-regress data[yvar] on data[xvars] plus an intercept.

    Returns the fitted parameter Series from statsmodels.
    """
    Y = data[yvar]
    # Copy the predictor frame: assigning the intercept column onto a slice of
    # *data* is chained assignment (SettingWithCopyWarning; a no-op under
    # pandas copy-on-write) and could otherwise mutate the caller's DataFrame.
    X = data[xvars].copy()
    X["intercept"] = 1.
    result = sm.OLS(Y, X).fit()
    return result.params
by_year.apply (regress, yvar= "AAPL" , xvars= ["SPX" ])
2003
1.195406
0.000710
2004
1.363463
0.004201
2005
1.766415
0.003246
2006
1.645496
0.000080
2007
1.198761
0.003438
2008
0.968016
-0.001110
2009
0.879103
0.002954
2010
1.052608
0.001261
2011
0.806605
0.001514
df = pd.DataFrame({'key' : ['a' , 'b' , 'c' ] * 4 ,
'value' : np.arange(12. )})
df
0
a
0.0
1
b
1.0
2
c
2.0
3
a
3.0
4
b
4.0
5
c
5.0
6
a
6.0
7
b
7.0
8
c
8.0
9
a
9.0
10
b
10.0
11
c
11.0
g = df.groupby('key' )['value' ]
g.mean()
key
a 4.5
b 5.5
c 6.5
Name: value, dtype: float64
def get_mean(group):
    """Return the mean of *group*'s values."""
    average = group.mean()
    return average
g.transform(get_mean)
0 4.5
1 5.5
2 6.5
3 4.5
4 5.5
5 6.5
6 4.5
7 5.5
8 6.5
9 4.5
10 5.5
11 6.5
Name: value, dtype: float64
0 4.5
1 5.5
2 6.5
3 4.5
4 5.5
5 6.5
6 4.5
7 5.5
8 6.5
9 4.5
10 5.5
11 6.5
Name: value, dtype: float64
def times_two(group):
    """Return *group* with every value doubled."""
    doubled = group * 2
    return doubled
g.transform(times_two)
0 0.0
1 2.0
2 4.0
3 6.0
4 8.0
5 10.0
6 12.0
7 14.0
8 16.0
9 18.0
10 20.0
11 22.0
Name: value, dtype: float64
def get_ranks(group):
    """Rank *group*'s values in descending order (largest value gets rank 1)."""
    ranks = group.rank(ascending=False)
    return ranks
g.transform(get_ranks)
0 4.0
1 4.0
2 4.0
3 3.0
4 3.0
5 3.0
6 2.0
7 2.0
8 2.0
9 1.0
10 1.0
11 1.0
Name: value, dtype: float64
def normalize(x):
    """Standardize *x*: subtract the mean and divide by the (sample) std."""
    centered = x - x.mean()
    return centered / x.std()
g.transform(normalize)
g.apply (normalize)
key
a 0 -1.161895
3 -0.387298
6 0.387298
9 1.161895
b 1 -1.161895
4 -0.387298
7 0.387298
10 1.161895
c 2 -1.161895
5 -0.387298
8 0.387298
11 1.161895
Name: value, dtype: float64
g.transform('mean' )
normalized = (df['value' ] - g.transform('mean' )) / g.transform('std' )
normalized
0 -1.161895
1 -1.161895
2 -1.161895
3 -0.387298
4 -0.387298
5 -0.387298
6 0.387298
7 0.387298
8 0.387298
9 1.161895
10 1.161895
11 1.161895
Name: value, dtype: float64
tips.head()
tips.pivot_table(index= ["day" , "smoker" ],
values= ["size" , "tip" , "tip_pct" , "total_bill" ])
day
smoker
Fri
No
2.250000
2.812500
0.151650
18.420000
Yes
2.066667
2.714000
0.174783
16.813333
Sat
No
2.555556
3.102889
0.158048
19.661778
Yes
2.476190
2.875476
0.147906
21.276667
Sun
No
2.929825
3.167895
0.160113
20.506667
Yes
2.578947
3.516842
0.187250
24.120000
Thur
No
2.488889
2.673778
0.160298
17.113111
Yes
2.352941
3.030000
0.163863
19.190588
tips.pivot_table(index= ["time" , "day" ], columns= "smoker" ,
values= ["tip_pct" , "size" ])
smoker
No
Yes
No
Yes
Dinner
Fri
2.000000
2.222222
0.139622
0.165347
Sat
2.555556
2.476190
0.158048
0.147906
Sun
2.929825
2.578947
0.160113
0.187250
Thur
2.000000
NaN
0.159744
NaN
Lunch
Fri
3.000000
1.833333
0.187735
0.188937
Thur
2.500000
2.352941
0.160311
0.163863
tips.pivot_table(index= ["time" , "day" ], columns= "smoker" ,
values= ["tip_pct" , "size" ], margins= True )
smoker
No
Yes
All
No
Yes
All
Dinner
Fri
2.000000
2.222222
2.166667
0.139622
0.165347
0.158916
Sat
2.555556
2.476190
2.517241
0.158048
0.147906
0.153152
Sun
2.929825
2.578947
2.842105
0.160113
0.187250
0.166897
Thur
2.000000
NaN
2.000000
0.159744
NaN
0.159744
Lunch
Fri
3.000000
1.833333
2.000000
0.187735
0.188937
0.188765
Thur
2.500000
2.352941
2.459016
0.160311
0.163863
0.161301
All
2.668874
2.408602
2.569672
0.159328
0.163196
0.160803
tips.pivot_table(index= ["time" , "smoker" ], columns= "day" ,
values= "tip_pct" , aggfunc= len , margins= True )
time
smoker
Dinner
No
3.0
45.0
57.0
1.0
106
Yes
9.0
42.0
19.0
NaN
70
Lunch
No
1.0
NaN
NaN
44.0
45
Yes
6.0
NaN
NaN
17.0
23
All
19.0
87.0
76.0
62.0
244
tips.pivot_table(index= ["time" , "size" , "smoker" ], columns= "day" ,
values= "tip_pct" , fill_value= 0 )
time
size
smoker
Dinner
1
No
0.000000
0.137931
0.000000
0.000000
Yes
0.000000
0.325733
0.000000
0.000000
2
No
0.139622
0.162705
0.168859
0.159744
Yes
0.171297
0.148668
0.207893
0.000000
3
No
0.000000
0.154661
0.152663
0.000000
...
...
...
...
...
...
...
Lunch
3
Yes
0.000000
0.000000
0.000000
0.204952
4
No
0.000000
0.000000
0.000000
0.138919
Yes
0.000000
0.000000
0.000000
0.155410
5
No
0.000000
0.000000
0.000000
0.121389
6
No
0.000000
0.000000
0.000000
0.173706
21 rows × 4 columns
from io import StringIO
data = """Sample Nationality Handedness
1 USA Right-handed
2 Japan Left-handed
3 USA Right-handed
4 Japan Right-handed
5 Japan Left-handed
6 Japan Right-handed
7 USA Right-handed
8 USA Left-handed
9 Japan Right-handed
10 USA Right-handed"""
# Use the raw-string regex r"\s+" as the separator; the previous " \ s+"
# (garbled literal with embedded spaces) would not match the
# whitespace-delimited columns and parsing would fail.
data = pd.read_table(StringIO(data), sep=r"\s+")
0
1
USA
Right-handed
1
2
Japan
Left-handed
2
3
USA
Right-handed
3
4
Japan
Right-handed
4
5
Japan
Left-handed
5
6
Japan
Right-handed
6
7
USA
Right-handed
7
8
USA
Left-handed
8
9
Japan
Right-handed
9
10
USA
Right-handed
pd.crosstab(data["Nationality" ], data["Handedness" ], margins= True )
Nationality
Japan
2
3
5
USA
1
4
5
All
3
7
10
pd.crosstab([tips["time" ], tips["day" ]], tips["smoker" ], margins= True )
time
day
Dinner
Fri
3
9
12
Sat
45
42
87
Sun
57
19
76
Thur
1
0
1
Lunch
Fri
1
6
7
Thur
44
17
61
All
151
93
244
pd.options.display.max_rows = PREVIOUS_MAX_ROWS