9 10장: 데이터 집계와 그룹 연산

데이터를 그룹화하여 통계적 특징을 요약하는 GroupBy 원리를 마스터합니다.

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc("figure", figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)
import matplotlib.pyplot as plt
# Matplotlib 한글 폰트 설정 (macOS용)
plt.rc('font', family='AppleGothic')
plt.rc('axes', unicode_minus=False)

import numpy as np
import pandas as pd

9.1 GroupBy 메커니즘

분할-적용-결합 과정을 통해 그룹 통계량을 산출하는 기초를 학습합니다.

df = pd.DataFrame({"key1" : ["a", "a", None, "b", "b", "a", None],
                   "key2" : pd.Series([1, 2, 1, 2, 1, None, 1],
                                      dtype="Int64"),
                   "data1" : np.random.standard_normal(7),
                   "data2" : np.random.standard_normal(7)})
df

	key1	key2	data1	data2
0	a	1	-0.204708	0.281746
1	a	2	0.478943	0.769023
2	None	1	-0.519439	1.246435
3	b	2	-0.555730	1.007189
4	b	1	1.965781	-1.296221
5	a	<NA>	1.393406	0.274992
6	None	1	0.092908	0.228913

9.2 GroupBy 메커니즘

분할-적용-결합(Split-Apply-Combine) 과정을 단계별로 이해합니다.

grouped = df["data1"].groupby(df["key1"])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x17a0b44d0>

grouped.mean()

key1
a    0.555881
b    0.705025
Name: data1, dtype: float64

means = df["data1"].groupby([df["key1"], df["key2"]]).mean()
means

key1  key2
a     1      -0.204708
      2       0.478943
b     1       1.965781
      2      -0.555730
Name: data1, dtype: float64

means.unstack()

key2	1	2
key1
a	-0.204708	0.478943
b	1.965781	-0.555730

states = np.array(["OH", "CA", "CA", "OH", "OH", "CA", "OH"])
years = [2005, 2005, 2006, 2005, 2006, 2005, 2006]
df["data1"].groupby([states, years]).mean()

CA  2005    0.936175
    2006   -0.519439
OH  2005   -0.380219
    2006    1.029344
Name: data1, dtype: float64

df.groupby("key1").mean()
df.groupby("key2").mean(numeric_only=True)
df.groupby(["key1", "key2"]).mean()

		data1	data2
key1	key2
a	1	-0.204708	0.281746
a	2	0.478943	0.769023
b	1	1.965781	-1.296221
b	2	-0.555730	1.007189

df.groupby(["key1", "key2"]).size()

key1  key2
a     1       1
      2       1
b     1       1
      2       1
dtype: int64

df.groupby("key1", dropna=False).size()
df.groupby(["key1", "key2"], dropna=False).size()

key1  key2
a     1       1
      2       1
      <NA>    1
b     1       1
      2       1
NaN   1       2
dtype: int64

df.groupby("key1").count()

	key2	data1	data2
key1
a	2	3	3
b	2	2	2

for name, group in df.groupby("key1"):
    print(name)
    print(group)

a
  key1  key2     data1     data2
0    a     1 -0.204708  0.281746
1    a     2  0.478943  0.769023
5    a  <NA>  1.393406  0.274992
b
  key1  key2     data1     data2
3    b     2 -0.555730  1.007189
4    b     1  1.965781 -1.296221

for (k1, k2), group in df.groupby(["key1", "key2"]):
    print((k1, k2))
    print(group)

('a', 1)
  key1  key2     data1     data2
0    a     1 -0.204708  0.281746
('a', 2)
  key1  key2     data1     data2
1    a     2  0.478943  0.769023
('b', 1)
  key1  key2     data1     data2
4    b     1  1.965781 -1.296221
('b', 2)
  key1  key2    data1     data2
3    b     2 -0.55573  1.007189

pieces = {name: group for name, group in df.groupby("key1")}
pieces["b"]

	key1	key2	data1	data2
3	b	2	-0.555730	1.007189
4	b	1	1.965781	-1.296221

grouped = df.groupby({"key1": "key", "key2": "key",
                      "data1": "data", "data2": "data"}, axis="columns")

for group_key, group_values in grouped:
    print(group_key)
    print(group_values)

data
      data1     data2
0 -0.204708  0.281746
1  0.478943  0.769023
2 -0.519439  1.246435
3 -0.555730  1.007189
4  1.965781 -1.296221
5  1.393406  0.274992
6  0.092908  0.228913
key
   key1  key2
0     a     1
1     a     2
2  None     1
3     b     2
4     b     1
5     a  <NA>
6  None     1

df.groupby(["key1", "key2"])[["data2"]].mean()

		data2
key1	key2
a	1	0.281746
a	2	0.769023
b	1	-1.296221
b	2	1.007189

s_grouped = df.groupby(["key1", "key2"])["data2"]
s_grouped
s_grouped.mean()

key1  key2
a     1       0.281746
      2       0.769023
b     1      -1.296221
      2       1.007189
Name: data2, dtype: float64

people = pd.DataFrame(np.random.standard_normal((5, 5)),
                      columns=["a", "b", "c", "d", "e"],
                      index=["Joe", "Steve", "Wanda", "Jill", "Trey"])
people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values
people

	a	b	c	d	e
Joe	1.352917	0.886429	-2.001637	-0.371843	1.669025
Steve	-0.438570	-0.539741	0.476985	3.248944	-1.021228
Wanda	-0.577087	NaN	NaN	0.523772	0.000940
Jill	1.343810	-0.713544	-0.831154	-2.370232	-1.860761
Trey	-0.860757	0.560145	-1.265934	0.119827	-1.063512

mapping = {"a": "red", "b": "red", "c": "blue",
           "d": "blue", "e": "red", "f" : "orange"}

by_column = people.groupby(mapping, axis="columns")
by_column.sum()

	blue	red
Joe	-2.373480	3.908371
Steve	3.725929	-1.999539
Wanda	0.523772	-0.576147
Jill	-3.201385	-1.230495
Trey	-1.146107	-1.364125

map_series = pd.Series(mapping)
map_series
people.groupby(map_series, axis="columns").count()

	blue	red
Joe	2	3
Steve	2	3
Wanda	1	2
Jill	2	3
Trey	2	3

people.groupby(len).sum()

	a	b	c	d	e
3	1.352917	0.886429	-2.001637	-0.371843	1.669025
4	0.483052	-0.153399	-2.097088	-2.250405	-2.924273
5	-1.015657	-0.539741	0.476985	3.772716	-1.020287

key_list = ["one", "one", "one", "two", "two"]
people.groupby([len, key_list]).min()

		a	b	c	d	e
3	one	1.352917	0.886429	-2.001637	-0.371843	1.669025
4	two	-0.860757	-0.713544	-1.265934	-2.370232	-1.860761
5	one	-0.577087	-0.539741	0.476985	0.523772	-1.021228

columns = pd.MultiIndex.from_arrays([["US", "US", "US", "JP", "JP"],
                                    [1, 3, 5, 1, 3]],
                                    names=["cty", "tenor"])
hier_df = pd.DataFrame(np.random.standard_normal((4, 5)), columns=columns)
hier_df

cty	US			JP
tenor	1	3	5	1	3
0	0.332883	-2.359419	-0.199543	-1.541996	-0.970736
1	-1.307030	0.286350	0.377984	-0.753887	0.331286
2	1.349742	0.069877	0.246674	-0.011862	1.004812
3	1.327195	-0.919262	-1.549106	0.022185	0.758363

hier_df.groupby(level="cty", axis="columns").count()

cty	JP	US
0	2	3
1	2	3
2	2	3
3	2	3

df
grouped = df.groupby("key1")
grouped["data1"].nsmallest(2)

key1   
a     0   -0.204708
      1    0.478943
b     3   -0.555730
      4    1.965781
Name: data1, dtype: float64

9.3 데이터 집계

기본 내장 함수 또는 사용자 정의 함수를 사용하여 그룹화된 데이터를 요약합니다.

9.4 데이터 집계

그룹별로 요약 통계량을 산출하는 다양한 방식을 익힙니다.

def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped.agg(peak_to_peak)

	key2	data1	data2
key1
a	1	1.598113	0.494031
b	1	2.521511	2.303410

grouped.describe()

	key2								data1					data2
	count	mean	std	min	25%	50%	75%	max	count	mean	...	75%	max	count	mean	std	min	25%	50%	75%	max
key1
a	2.0	1.5	0.707107	1.0	1.25	1.5	1.75	2.0	3.0	0.555881	...	0.936175	1.393406	3.0	0.441920	0.283299	0.274992	0.278369	0.281746	0.525384	0.769023
b	2.0	1.5	0.707107	1.0	1.25	1.5	1.75	2.0	2.0	0.705025	...	1.335403	1.965781	2.0	-0.144516	1.628757	-1.296221	-0.720368	-0.144516	0.431337	1.007189

2 rows × 24 columns

tips = pd.read_csv("examples/tips.csv")
tips.head()

	total_bill	tip	smoker	day	time	size
0	16.99	1.01	No	Sun	Dinner	2
1	10.34	1.66	No	Sun	Dinner	3
2	21.01	3.50	No	Sun	Dinner	3
3	23.68	3.31	No	Sun	Dinner	2
4	24.59	3.61	No	Sun	Dinner	4

tips["tip_pct"] = tips["tip"] / tips["total_bill"]
tips.head()

	total_bill	tip	smoker	day	time	size	tip_pct
0	16.99	1.01	No	Sun	Dinner	2	0.059447
1	10.34	1.66	No	Sun	Dinner	3	0.160542
2	21.01	3.50	No	Sun	Dinner	3	0.166587
3	23.68	3.31	No	Sun	Dinner	2	0.139780
4	24.59	3.61	No	Sun	Dinner	4	0.146808

grouped = tips.groupby(["day", "smoker"])

grouped_pct = grouped["tip_pct"]
grouped_pct.agg("mean")

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

grouped_pct.agg(["mean", "std", peak_to_peak])

		mean	std	peak_to_peak
day	smoker
Fri	No	0.151650	0.028123	0.067349
Fri	Yes	0.174783	0.051293	0.159925
Sat	No	0.158048	0.039767	0.235193
Sat	Yes	0.147906	0.061375	0.290095
Sun	No	0.160113	0.042347	0.193226
Sun	Yes	0.187250	0.154134	0.644685
Thur	No	0.160298	0.038774	0.193350
Thur	Yes	0.163863	0.039389	0.151240

grouped_pct.agg([("average", "mean"), ("stdev", np.std)])

		average	stdev
day	smoker
Fri	No	0.151650	0.028123
Fri	Yes	0.174783	0.051293
Sat	No	0.158048	0.039767
Sat	Yes	0.147906	0.061375
Sun	No	0.160113	0.042347
Sun	Yes	0.187250	0.154134
Thur	No	0.160298	0.038774
Thur	Yes	0.163863	0.039389

functions = ["count", "mean", "max"]
result = grouped[["tip_pct", "total_bill"]].agg(functions)
result

		tip_pct			total_bill
		count	mean	max	count	mean	max
day	smoker
Fri	No	4	0.151650	0.187735	4	18.420000	22.75
Fri	Yes	15	0.174783	0.263480	15	16.813333	40.17
Sat	No	45	0.158048	0.291990	45	19.661778	48.33
Sat	Yes	42	0.147906	0.325733	42	21.276667	50.81
Sun	No	57	0.160113	0.252672	57	20.506667	48.17
Sun	Yes	19	0.187250	0.710345	19	24.120000	45.35
Thur	No	45	0.160298	0.266312	45	17.113111	41.19
Thur	Yes	17	0.163863	0.241255	17	19.190588	43.11

result["tip_pct"]

		count	mean	max
day	smoker
Fri	No	4	0.151650	0.187735
Fri	Yes	15	0.174783	0.263480
Sat	No	45	0.158048	0.291990
Sat	Yes	42	0.147906	0.325733
Sun	No	57	0.160113	0.252672
Sun	Yes	19	0.187250	0.710345
Thur	No	45	0.160298	0.266312
Thur	Yes	17	0.163863	0.241255

ftuples = [("Average", "mean"), ("Variance", np.var)]
grouped[["tip_pct", "total_bill"]].agg(ftuples)

		tip_pct		total_bill
		Average	Variance	Average	Variance
day	smoker
Fri	No	0.151650	0.000791	18.420000	25.596333
Fri	Yes	0.174783	0.002631	16.813333	82.562438
Sat	No	0.158048	0.001581	19.661778	79.908965
Sat	Yes	0.147906	0.003767	21.276667	101.387535
Sun	No	0.160113	0.001793	20.506667	66.099980
Sun	Yes	0.187250	0.023757	24.120000	109.046044
Thur	No	0.160298	0.001503	17.113111	59.625081
Thur	Yes	0.163863	0.001551	19.190588	69.808518

grouped.agg({"tip" : np.max, "size" : "sum"})
grouped.agg({"tip_pct" : ["min", "max", "mean", "std"],
             "size" : "sum"})

		tip_pct				size
		min	max	mean	std	sum
day	smoker
Fri	No	0.120385	0.187735	0.151650	0.028123	9
Fri	Yes	0.103555	0.263480	0.174783	0.051293	31
Sat	No	0.056797	0.291990	0.158048	0.039767	115
Sat	Yes	0.035638	0.325733	0.147906	0.061375	104
Sun	No	0.059447	0.252672	0.160113	0.042347	167
Sun	Yes	0.065660	0.710345	0.187250	0.154134	49
Thur	No	0.072961	0.266312	0.160298	0.038774	112
Thur	Yes	0.090014	0.241255	0.163863	0.039389	40

grouped = tips.groupby(["day", "smoker"], as_index=False)
grouped.mean(numeric_only=True)

	day	smoker	total_bill	tip	size	tip_pct
0	Fri	No	18.420000	2.812500	2.250000	0.151650
1	Fri	Yes	16.813333	2.714000	2.066667	0.174783
2	Sat	No	19.661778	3.102889	2.555556	0.158048
3	Sat	Yes	21.276667	2.875476	2.476190	0.147906
4	Sun	No	20.506667	3.167895	2.929825	0.160113
5	Sun	Yes	24.120000	3.516842	2.578947	0.187250
6	Thur	No	17.113111	2.673778	2.488889	0.160298
7	Thur	Yes	19.190588	3.030000	2.352941	0.163863

def top(df, n=5, column="tip_pct"):
    return df.sort_values(column, ascending=False)[:n]
top(tips, n=6)

	total_bill	tip	smoker	day	time	size	tip_pct
172	7.25	5.15	Yes	Sun	Dinner	2	0.710345
178	9.60	4.00	Yes	Sun	Dinner	2	0.416667
67	3.07	1.00	Yes	Sat	Dinner	1	0.325733
232	11.61	3.39	No	Sat	Dinner	2	0.291990
183	23.17	6.50	Yes	Sun	Dinner	4	0.280535
109	14.31	4.00	Yes	Sat	Dinner	2	0.279525

9.5 Apply: 일반적인 분할-적용-결합

apply 메서드를 사용하여 그룹별로 더 복잡한 로직을 적용하는 방법을 다룹니다.

9.6 일반적인 Apply 연산

가장 유연하게 그룹별 연산을 적용할 수 있는 apply 메서드를 활용합니다.

tips.groupby("smoker").apply(top)

		total_bill	tip	smoker	day	time	size	tip_pct
smoker
No	232	11.61	3.39	No	Sat	Dinner	2	0.291990
	149	7.51	2.00	No	Thur	Lunch	2	0.266312
	51	10.29	2.60	No	Sun	Dinner	2	0.252672
	185	20.69	5.00	No	Sun	Dinner	5	0.241663
	88	24.71	5.85	No	Thur	Lunch	2	0.236746
Yes	172	7.25	5.15	Yes	Sun	Dinner	2	0.710345
	178	9.60	4.00	Yes	Sun	Dinner	2	0.416667
	67	3.07	1.00	Yes	Sat	Dinner	1	0.325733
	183	23.17	6.50	Yes	Sun	Dinner	4	0.280535
	109	14.31	4.00	Yes	Sat	Dinner	2	0.279525

tips.groupby(["smoker", "day"]).apply(top, n=1, column="total_bill")

			total_bill	tip	smoker	day	time	size	tip_pct
smoker	day
No	Fri	94	22.75	3.25	No	Fri	Dinner	2	0.142857
	Sat	212	48.33	9.00	No	Sat	Dinner	4	0.186220
	Sun	156	48.17	5.00	No	Sun	Dinner	6	0.103799
	Thur	142	41.19	5.00	No	Thur	Lunch	5	0.121389
Yes	Fri	95	40.17	4.73	Yes	Fri	Dinner	4	0.117750
	Sat	170	50.81	10.00	Yes	Sat	Dinner	3	0.196812
	Sun	182	45.35	3.50	Yes	Sun	Dinner	3	0.077178
	Thur	197	43.11	5.00	Yes	Thur	Lunch	4	0.115982

result = tips.groupby("smoker")["tip_pct"].describe()
result
result.unstack("smoker")

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

tips.groupby("smoker", group_keys=False).apply(top)

	total_bill	tip	smoker	day	time	size	tip_pct
232	11.61	3.39	No	Sat	Dinner	2	0.291990
149	7.51	2.00	No	Thur	Lunch	2	0.266312
51	10.29	2.60	No	Sun	Dinner	2	0.252672
185	20.69	5.00	No	Sun	Dinner	5	0.241663
88	24.71	5.85	No	Thur	Lunch	2	0.236746
172	7.25	5.15	Yes	Sun	Dinner	2	0.710345
178	9.60	4.00	Yes	Sun	Dinner	2	0.416667
67	3.07	1.00	Yes	Sat	Dinner	1	0.325733
183	23.17	6.50	Yes	Sun	Dinner	4	0.280535
109	14.31	4.00	Yes	Sat	Dinner	2	0.279525

frame = pd.DataFrame({"data1": np.random.standard_normal(1000),
                      "data2": np.random.standard_normal(1000)})
frame.head()
quartiles = pd.cut(frame["data1"], 4)
quartiles.head(10)

0     (-1.23, 0.489]
1     (0.489, 2.208]
2     (-1.23, 0.489]
3     (-1.23, 0.489]
4     (0.489, 2.208]
5     (0.489, 2.208]
6     (-1.23, 0.489]
7     (-1.23, 0.489]
8    (-2.956, -1.23]
9     (-1.23, 0.489]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-2.956, -1.23] < (-1.23, 0.489] < (0.489, 2.208] < (2.208, 3.928]]

def get_stats(group):
    return pd.DataFrame(
        {"min": group.min(), "max": group.max(),
        "count": group.count(), "mean": group.mean()}
    )

grouped = frame.groupby(quartiles)
grouped.apply(get_stats)

		min	max	count	mean
data1
(-2.956, -1.23]	data1	-2.949343	-1.230179	94	-1.658818
(-2.956, -1.23]	data2	-3.399312	1.670835	94	-0.033333
(-1.23, 0.489]	data1	-1.228918	0.488675	598	-0.329524
(-1.23, 0.489]	data2	-2.989741	3.260383	598	-0.002622
(0.489, 2.208]	data1	0.489965	2.200997	298	1.065727
(0.489, 2.208]	data2	-3.745356	2.954439	298	0.078249
(2.208, 3.928]	data1	2.212303	3.927528	10	2.644253
(2.208, 3.928]	data2	-1.929776	1.765640	10	0.024750

grouped.agg(["min", "max", "count", "mean"])

	data1				data2
	min	max	count	mean	min	max	count	mean
data1
(-2.956, -1.23]	-2.949343	-1.230179	94	-1.658818	-3.399312	1.670835	94	-0.033333
(-1.23, 0.489]	-1.228918	0.488675	598	-0.329524	-2.989741	3.260383	598	-0.002622
(0.489, 2.208]	0.489965	2.200997	298	1.065727	-3.745356	2.954439	298	0.078249
(2.208, 3.928]	2.212303	3.927528	10	2.644253	-1.929776	1.765640	10	0.024750

quartiles_samp = pd.qcut(frame["data1"], 4, labels=False)
quartiles_samp.head()
grouped = frame.groupby(quartiles_samp)
grouped.apply(get_stats)

		min	max	count	mean
data1
0	data1	-2.949343	-0.685484	250	-1.212173
0	data2	-3.399312	2.628441	250	-0.027045
1	data1	-0.683066	-0.030280	250	-0.368334
1	data2	-2.630247	3.260383	250	-0.027845
2	data1	-0.027734	0.618965	250	0.295812
2	data2	-3.056990	2.458842	250	0.014450
3	data1	0.623587	3.927528	250	1.248875
3	data2	-3.745356	2.954439	250	0.115899

s = pd.Series(np.random.standard_normal(6))
s[::2] = np.nan
s
s.fillna(s.mean())

0   -0.767366
1    0.227290
2   -0.767366
3   -2.153545
4   -0.767366
5   -0.375842
dtype: float64

states = ["Ohio", "New York", "Vermont", "Florida",
          "Oregon", "Nevada", "California", "Idaho"]
group_key = ["East", "East", "East", "East",
             "West", "West", "West", "West"]
data = pd.Series(np.random.standard_normal(8), index=states)
data

Ohio          0.329939
New York      0.981994
Vermont       1.105913
Florida      -1.613716
Oregon        1.561587
Nevada        0.406510
California    0.359244
Idaho        -0.614436
dtype: float64

data[["Vermont", "Nevada", "Idaho"]] = np.nan
data
data.groupby(group_key).size()
data.groupby(group_key).count()
data.groupby(group_key).mean()

East   -0.100594
West    0.960416
dtype: float64

def fill_mean(group):
    return group.fillna(group.mean())

data.groupby(group_key).apply(fill_mean)

East  Ohio          0.329939
      New York      0.981994
      Vermont      -0.100594
      Florida      -1.613716
West  Oregon        1.561587
      Nevada        0.960416
      California    0.359244
      Idaho         0.960416
dtype: float64

fill_values = {"East": 0.5, "West": -1}
def fill_func(group):
    return group.fillna(fill_values[group.name])

data.groupby(group_key).apply(fill_func)

East  Ohio          0.329939
      New York      0.981994
      Vermont       0.500000
      Florida      -1.613716
West  Oregon        1.561587
      Nevada       -1.000000
      California    0.359244
      Idaho        -1.000000
dtype: float64

suits = ["H", "S", "C", "D"]  # Hearts, Spades, Clubs, Diamonds
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ["A"] + list(range(2, 11)) + ["J", "K", "Q"]
cards = []
for suit in suits:
    cards.extend(str(num) + suit for num in base_names)

deck = pd.Series(card_val, index=cards)

deck.head(13)

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64

def draw(deck, n=5):
    return deck.sample(n)
draw(deck)

4D     4
QH    10
8S     8
7D     7
9C     9
dtype: int64

def get_suit(card):
    # last letter is suit
    return card[-1]

deck.groupby(get_suit).apply(draw, n=2)

C  6C     6
   KC    10
D  7D     7
   3D     3
H  7H     7
   9H     9
S  2S     2
   QS    10
dtype: int64

deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

AC      1
3C      3
5D      5
4D      4
10H    10
7H      7
QS     10
7S      7
dtype: int64

df = pd.DataFrame({"category": ["a", "a", "a", "a",
                                "b", "b", "b", "b"],
                   "data": np.random.standard_normal(8),
                   "weights": np.random.uniform(size=8)})
df

	category	data	weights
0	a	-1.691656	0.955905
1	a	0.511622	0.012745
2	a	-0.401675	0.137009
3	a	0.968578	0.763037
4	b	-1.818215	0.492472
5	b	0.279963	0.832908
6	b	-0.200819	0.658331
7	b	-0.217221	0.612009

grouped = df.groupby("category")
def get_wavg(group):
    return np.average(group["data"], weights=group["weights"])

grouped.apply(get_wavg)

category
a   -0.495807
b   -0.357273
dtype: float64

close_px = pd.read_csv("examples/stock_px.csv", parse_dates=True,
                       index_col=0)
close_px.info()
close_px.tail(4)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    2214 non-null   float64
 1   MSFT    2214 non-null   float64
 2   XOM     2214 non-null   float64
 3   SPX     2214 non-null   float64
dtypes: float64(4)
memory usage: 86.5 KB

	AAPL	MSFT	XOM	SPX
2011-10-11	400.29	27.00	76.27	1195.54
2011-10-12	402.19	26.96	77.16	1207.25
2011-10-13	408.43	27.18	76.37	1203.66
2011-10-14	422.00	27.27	78.11	1224.58

def spx_corr(group):
    return group.corrwith(group["SPX"])

rets = close_px.pct_change().dropna()

def get_year(x):
    return x.year

by_year = rets.groupby(get_year)
by_year.apply(spx_corr)

	AAPL	MSFT	XOM	SPX
2003	0.541124	0.745174	0.661265	1.0
2004	0.374283	0.588531	0.557742	1.0
2005	0.467540	0.562374	0.631010	1.0
2006	0.428267	0.406126	0.518514	1.0
2007	0.508118	0.658770	0.786264	1.0
2008	0.681434	0.804626	0.828303	1.0
2009	0.707103	0.654902	0.797921	1.0
2010	0.710105	0.730118	0.839057	1.0
2011	0.691931	0.800996	0.859975	1.0

def corr_aapl_msft(group):
    return group["AAPL"].corr(group["MSFT"])
by_year.apply(corr_aapl_msft)

2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

import statsmodels.api as sm
def regress(data, yvar=None, xvars=None):
    Y = data[yvar]
    X = data[xvars]
    X["intercept"] = 1.
    result = sm.OLS(Y, X).fit()
    return result.params

by_year.apply(regress, yvar="AAPL", xvars=["SPX"])

	SPX	intercept
2003	1.195406	0.000710
2004	1.363463	0.004201
2005	1.766415	0.003246
2006	1.645496	0.000080
2007	1.198761	0.003438
2008	0.968016	-0.001110
2009	0.879103	0.002954
2010	1.052608	0.001261
2011	0.806605	0.001514

df = pd.DataFrame({'key': ['a', 'b', 'c'] * 4,
                   'value': np.arange(12.)})
df

	key	value
0	a	0.0
1	b	1.0
2	c	2.0
3	a	3.0
4	b	4.0
5	c	5.0
6	a	6.0
7	b	7.0
8	c	8.0
9	a	9.0
10	b	10.0
11	c	11.0

g = df.groupby('key')['value']
g.mean()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

def get_mean(group):
    return group.mean()
g.transform(get_mean)

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

g.transform('mean')

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

def times_two(group):
    return group * 2
g.transform(times_two)

0      0.0
1      2.0
2      4.0
3      6.0
4      8.0
5     10.0
6     12.0
7     14.0
8     16.0
9     18.0
10    20.0
11    22.0
Name: value, dtype: float64

def get_ranks(group):
    return group.rank(ascending=False)
g.transform(get_ranks)

0     4.0
1     4.0
2     4.0
3     3.0
4     3.0
5     3.0
6     2.0
7     2.0
8     2.0
9     1.0
10    1.0
11    1.0
Name: value, dtype: float64

def normalize(x):
    return (x - x.mean()) / x.std()

g.transform(normalize)
g.apply(normalize)

key    
a    0    -1.161895
     3    -0.387298
     6     0.387298
     9     1.161895
b    1    -1.161895
     4    -0.387298
     7     0.387298
     10    1.161895
c    2    -1.161895
     5    -0.387298
     8     0.387298
     11    1.161895
Name: value, dtype: float64

g.transform('mean')
normalized = (df['value'] - g.transform('mean')) / g.transform('std')
normalized

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

tips.head()
tips.pivot_table(index=["day", "smoker"],
                 values=["size", "tip", "tip_pct", "total_bill"])

		size	tip	tip_pct	total_bill
day	smoker
Fri	No	2.250000	2.812500	0.151650	18.420000
Fri	Yes	2.066667	2.714000	0.174783	16.813333
Sat	No	2.555556	3.102889	0.158048	19.661778
Sat	Yes	2.476190	2.875476	0.147906	21.276667
Sun	No	2.929825	3.167895	0.160113	20.506667
Sun	Yes	2.578947	3.516842	0.187250	24.120000
Thur	No	2.488889	2.673778	0.160298	17.113111
Thur	Yes	2.352941	3.030000	0.163863	19.190588

tips.pivot_table(index=["time", "day"], columns="smoker",
                 values=["tip_pct", "size"])

		size		tip_pct
	smoker	No	Yes	No	Yes
time	day
Dinner	Fri	2.000000	2.222222	0.139622	0.165347
	Sat	2.555556	2.476190	0.158048	0.147906
	Sun	2.929825	2.578947	0.160113	0.187250
	Thur	2.000000	NaN	0.159744	NaN
Lunch	Fri	3.000000	1.833333	0.187735	0.188937
Lunch	Thur	2.500000	2.352941	0.160311	0.163863

tips.pivot_table(index=["time", "day"], columns="smoker",
                 values=["tip_pct", "size"], margins=True)

		size			tip_pct
	smoker	No	Yes	All	No	Yes	All
time	day
Dinner	Fri	2.000000	2.222222	2.166667	0.139622	0.165347	0.158916
	Sat	2.555556	2.476190	2.517241	0.158048	0.147906	0.153152
	Sun	2.929825	2.578947	2.842105	0.160113	0.187250	0.166897
	Thur	2.000000	NaN	2.000000	0.159744	NaN	0.159744
Lunch	Fri	3.000000	1.833333	2.000000	0.187735	0.188937	0.188765
Lunch	Thur	2.500000	2.352941	2.459016	0.160311	0.163863	0.161301
All		2.668874	2.408602	2.569672	0.159328	0.163196	0.160803

tips.pivot_table(index=["time", "smoker"], columns="day",
                 values="tip_pct", aggfunc=len, margins=True)

	day	Fri	Sat	Sun	Thur	All
time	smoker
Dinner	No	3.0	45.0	57.0	1.0	106
Dinner	Yes	9.0	42.0	19.0	NaN	70
Lunch	No	1.0	NaN	NaN	44.0	45
Lunch	Yes	6.0	NaN	NaN	17.0	23
All		19.0	87.0	76.0	62.0	244

tips.pivot_table(index=["time", "size", "smoker"], columns="day",
                 values="tip_pct", fill_value=0)

		day	Fri	Sat	Sun	Thur
time	size	smoker
Dinner	1	No	0.000000	0.137931	0.000000	0.000000
	1	Yes	0.000000	0.325733	0.000000	0.000000
	2	No	0.139622	0.162705	0.168859	0.159744
	2	Yes	0.171297	0.148668	0.207893	0.000000
	3	No	0.000000	0.154661	0.152663	0.000000
...	...	...	...	...	...	...
Lunch	3	Yes	0.000000	0.000000	0.000000	0.204952
	4	No	0.000000	0.000000	0.000000	0.138919
	4	Yes	0.000000	0.000000	0.000000	0.155410
	5	No	0.000000	0.000000	0.000000	0.121389
	6	No	0.000000	0.000000	0.000000	0.173706

21 rows × 4 columns

from io import StringIO
data = """Sample  Nationality  Handedness
1   USA  Right-handed
2   Japan    Left-handed
3   USA  Right-handed
4   Japan    Right-handed
5   Japan    Left-handed
6   Japan    Right-handed
7   USA  Right-handed
8   USA  Left-handed
9   Japan    Right-handed
10  USA  Right-handed"""
data = pd.read_table(StringIO(data), sep="\s+")

data

	Sample	Nationality	Handedness
0	1	USA	Right-handed
1	2	Japan	Left-handed
2	3	USA	Right-handed
3	4	Japan	Right-handed
4	5	Japan	Left-handed
5	6	Japan	Right-handed
6	7	USA	Right-handed
7	8	USA	Left-handed
8	9	Japan	Right-handed
9	10	USA	Right-handed

pd.crosstab(data["Nationality"], data["Handedness"], margins=True)

Handedness	Left-handed	Right-handed	All
Nationality
Japan	2	3	5
USA	1	4	5
All	3	7	10

pd.crosstab([tips["time"], tips["day"]], tips["smoker"], margins=True)

	smoker	No	Yes	All
time	day
Dinner	Fri	3	9	12
	Sat	45	42	87
	Sun	57	19	76
	Thur	1	0	1
Lunch	Fri	1	6	7
Lunch	Thur	44	17	61
All		151	93	244

pd.options.display.max_rows = PREVIOUS_MAX_ROWS