7장 - 메타러너

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib

from fklearn.causal.validation.curves import relative_cumulative_gain_curve
from fklearn.causal.validation.auc import area_under_the_relative_cumulative_gain_curve


from cycler import cycler

# Grayscale plotting defaults shared by every figure in this chapter.
color = ["0.0", "0.4", "0.8"]  # dark-to-light gray levels
linestyle = ["-", "--", ":", "-."]
marker = ["o", "v", "d", "p"]

default_cycler = cycler(color=color)
plt.rc("axes", prop_cycle=default_cycler)

7.1 이산형 처치 메타러너

pd.set_option("display.max_columns", 8)

# pandas is already imported at the top of the file; the duplicate
# `import pandas as pd` that used to sit here was removed.
# Observational (biased) data for training, randomized data for validation.
data_biased = pd.read_csv("../data/email_obs_data.csv")
data_rnd = pd.read_csv("../data/email_rnd_data.csv")

print(len(data_biased), len(data_rnd))
data_rnd.head()
300000 10000
mkt_email next_mnth_pv age tenure ... jewel books music_books_movies health
0 0 244.26 61.0 1.0 ... 1 0 0 2
1 0 29.67 36.0 1.0 ... 1 0 2 2
2 0 11.73 64.0 0.0 ... 0 1 0 1
3 0 41.41 74.0 0.0 ... 0 4 1 0
4 0 447.89 59.0 0.0 ... 1 1 2 1

5 rows × 27 columns

# Outcome, treatment, and covariate column names.
y = "next_mnth_pv"
T = "mkt_email"
X = [col for col in data_rnd.columns if col not in (y, T)]

# Train on the biased observational data, validate on the randomized sample.
train, test = data_biased, data_rnd

7.1.1 T 러너

from lightgbm import LGBMRegressor

np.random.seed(123)

# T-learner: fit one outcome model per treatment arm, then take the
# difference of the two predictions as the CATE estimate.
m0 = LGBMRegressor()
m1 = LGBMRegressor()

train_ctrl = train.query(f"{T}==0")
train_trt = train.query(f"{T}==1")

m0.fit(train_ctrl[X], train_ctrl[y])
m1.fit(train_trt[X], train_trt[y])

t_learner_cate_test = test.assign(cate=m1.predict(test[X]) - m0.predict(test[X]))

# Evaluate the CATE ordering on the randomized test set.
gain_curve_test = relative_cumulative_gain_curve(
    t_learner_cate_test, T, y, prediction="cate"
)
auc = area_under_the_relative_cumulative_gain_curve(
    t_learner_cate_test, T, y, prediction="cate"
)

plt.figure(figsize=(10, 4))
plt.plot(gain_curve_test, label=f"AUC: {auc:.2f}", color="C0")
plt.hlines(0, 0, 100, linestyle="--", color="black", label="Baseline")
plt.legend()
plt.title("T-Learner")
Text(0.5, 1.0, 'T-Learner')

np.random.seed(123)


def g_kernel(x, c=0, s=0.05):
    """Gaussian (RBF) bump: exp(-(x - c)^2 / s), equal to 1 when x == c."""
    return np.exp(-np.square(x - c) / s)


# Toy data: a large untreated sample and a small treated sample, both
# following a Gaussian-bump response in x; the treated outcomes are
# shifted up by 1.
n0 = 500
x0 = np.random.uniform(-1, 1, n0)
y0 = np.random.normal(0.3 * g_kernel(x0), 0.1, n0)

n1 = 50
x1 = np.random.uniform(-1, 1, n1)
y1 = np.random.normal(0.3 * g_kernel(x1), 0.1, n1) + 1

df = pd.concat(
    [
        pd.DataFrame({"x": x0, "y": y0, "t": 0}),
        pd.DataFrame({"x": x1, "y": y1, "t": 1}),
    ]
).sort_values(by="x")

# One outcome model per arm; min_child_samples=25 limits leaf size.
m0 = LGBMRegressor(min_child_samples=25)
m1 = LGBMRegressor(min_child_samples=25)

m0.fit(x0.reshape(-1, 1), y0)
m1.fit(x1.reshape(-1, 1), y1)

m0_hat = m0.predict(df.query("t==0")[["x"]])
m1_hat = m1.predict(df.query("t==1")[["x"]])

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 10))

# Top panel: raw data and the two arm-specific outcome models.
ax1.scatter(x0, y0, alpha=0.5, label="T=0", marker=marker[0], color=color[1])
ax1.scatter(x1, y1, alpha=0.7, label="T=1", marker=marker[1], color=color[0])

# Raw strings so the LaTeX backslashes (\h, \m) are not treated as Python
# escape sequences (invalid escapes warn since Python 3.12).
ax1.plot(
    df.query("t==0")[["x"]],
    m0_hat,
    color="black",
    linestyle="solid",
    label=r"$\hat{\mu}_0$",
)
ax1.plot(
    df.query("t==1")[["x"]],
    m1_hat,
    color="black",
    linestyle="--",
    label=r"$\hat{\mu}_1$",
)

ax1.set_ylabel("Y")
ax1.set_xlabel("X")
ax1.legend(fontsize=14)

# Imputed individual effects: counterfactual prediction from the opposite
# arm minus (or subtracted from) the observed outcome. Reused further down.
tau_0 = m1.predict(df.query("t==0")[["x"]]) - df.query("t==0")["y"]
tau_1 = df.query("t==1")["y"] - m0.predict(df.query("t==1")[["x"]])

# Bottom panel: imputed effects and the T-learner CATE curve.
ax2.scatter(
    df.query("t==0")[["x"]],
    tau_0,
    label=r"$\hat{\tau_0}$",
    alpha=0.5,
    marker=marker[0],
    color=color[1],
)
ax2.scatter(
    df.query("t==1")[["x"]],
    tau_1,
    label=r"$\hat{\tau_1}$",
    alpha=0.7,
    marker=marker[1],
    color=color[0],
)

ax2.plot(
    df[["x"]],
    m1.predict(df[["x"]]) - m0.predict(df[["x"]]),
    label=r"$\hat{CATE}$",
    color="black",
)

ax2.set_ylabel("Estimated Effect")
ax2.set_xlabel("X")
ax2.legend(fontsize=14)

7.1.2 X 러너

from sklearn.linear_model import LogisticRegression

np.random.seed(42)

# X-learner second stage: regress the imputed effects on x within each arm.
mu_tau0 = LGBMRegressor(min_child_samples=25)
mu_tau1 = LGBMRegressor(min_child_samples=25)

mu_tau0.fit(df.query("t==0")[["x"]], tau_0)
mu_tau1.fit(df.query("t==1")[["x"]], tau_1)

mu_tau0_hat = mu_tau0.predict(df.query("t==0")[["x"]])
mu_tau1_hat = mu_tau1.predict(df.query("t==1")[["x"]])

plt.figure(figsize=(10, 4))
# Raw strings keep the LaTeX backslashes out of Python's escape handling.
plt.scatter(
    df.query("t==0")[["x"]],
    tau_0,
    label=r"$\hat{\tau_0}$",
    alpha=0.5,
    marker=marker[0],
    color=color[1],
)
plt.scatter(
    df.query("t==1")[["x"]],
    tau_1,
    label=r"$\hat{\tau_1}$",
    alpha=0.8,
    marker=marker[1],
    color=color[0],
)
plt.plot(
    df.query("t==0")[["x"]],
    mu_tau0_hat,
    color="black",
    linestyle="solid",
    label=r"$\hat{\mu}\tau_0$",  # fixed: was "\tau 0", inconsistent with \tau_1
)
plt.plot(
    df.query("t==1")[["x"]],
    mu_tau1_hat,
    color="black",
    linestyle="dashed",
    label=r"$\hat{\mu}\tau_1$",
)
plt.ylabel("Estimated Effect")
plt.xlabel("X")
plt.legend(fontsize=14)

plt.figure(figsize=(10, 4))

# Propensity-score model. penalty=None replaces the "none" string alias,
# which newer scikit-learn releases no longer accept; both mean
# unregularized logistic regression.
ps_model = LogisticRegression(penalty=None)
ps_model.fit(df[["x"]], df["t"])

ps = ps_model.predict_proba(df[["x"]])[:, 1]

# X-learner CATE: propensity-weighted blend of the two second-stage models.
cate = (1 - ps) * mu_tau1.predict(df[["x"]]) + ps * mu_tau0.predict(df[["x"]])

# Marker size ~ weight each point gets in the blend (raw strings for LaTeX).
plt.scatter(
    df.query("t==0")[["x"]],
    tau_0,
    label=r"$\hat{\tau_0}$",
    alpha=0.5,
    s=100 * (ps[df["t"] == 0]),
    marker=marker[0],
    color=color[1],
)
plt.scatter(
    df.query("t==1")[["x"]],
    tau_1,
    label=r"$\hat{\tau_1}$",
    alpha=0.5,
    s=100 * (1 - ps[df["t"] == 1]),
    marker=marker[1],
    color=color[0],
)

plt.plot(df[["x"]], cate, label="x-learner", color="black")
plt.ylabel("Estimated Effect")
plt.xlabel("X")
plt.legend(fontsize=14)

from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMRegressor

# Propensity-score model. penalty=None replaces the "none" string alias,
# which newer scikit-learn releases no longer accept.
ps_model = LogisticRegression(penalty=None)
ps_model.fit(train[X], train[T])

# First stage: one outcome model per arm, weighted by the inverse
# probability of the arm actually received, to correct for the
# confounded assignment in the observational data.
train_t0 = train.query(f"{T}==0")
train_t1 = train.query(f"{T}==1")

m0 = LGBMRegressor()
m1 = LGBMRegressor()

np.random.seed(123)

m0.fit(
    train_t0[X],
    train_t0[y],
    sample_weight=1 / ps_model.predict_proba(train_t0[X])[:, 0],
)
m1.fit(
    train_t1[X],
    train_t1[y],
    sample_weight=1 / ps_model.predict_proba(train_t1[X])[:, 1],
)

# Second stage: model the imputed treatment effects within each arm.
tau_hat_0 = m1.predict(train_t0[X]) - train_t0[y]
tau_hat_1 = train_t1[y] - m0.predict(train_t1[X])

m_tau_0 = LGBMRegressor()
m_tau_1 = LGBMRegressor()

np.random.seed(123)

m_tau_0.fit(train_t0[X], tau_hat_0)
m_tau_1.fit(train_t1[X], tau_hat_1)

# CATE: propensity-weighted combination of the second-stage predictions.
ps_test = ps_model.predict_proba(test[X])[:, 1]

x_cate_test = test.assign(
    cate=(ps_test * m_tau_0.predict(test[X]) + (1 - ps_test) * m_tau_1.predict(test[X]))
)

gain_curve_test = relative_cumulative_gain_curve(x_cate_test, T, y, prediction="cate")
auc = area_under_the_relative_cumulative_gain_curve(
    x_cate_test, T, y, prediction="cate"
)

plt.figure(figsize=(10, 4))
plt.plot(gain_curve_test, color="C0", label=f"AUC: {auc:.2f}")
plt.hlines(0, 0, 100, linestyle="--", color="black", label="Baseline")
plt.legend()
plt.title("X-Learner")
Text(0.5, 1.0, 'X-Learner')

7.2 연속형 처치 메타러너

# Continuous-treatment dataset: restaurant-day rows with a discount level.
data_cont = pd.read_csv("../data/discount_data.csv")
data_cont.head()
rest_id day month weekday ... is_nov competitors_price discounts sales
0 0 2016-01-01 1 4 ... False 2.88 0 79.0
1 0 2016-01-02 1 5 ... False 2.64 0 57.0
2 0 2016-01-03 1 6 ... False 2.08 5 294.0
3 0 2016-01-04 1 0 ... False 3.37 15 676.5
4 0 2016-01-05 1 1 ... False 3.79 0 66.0

5 rows × 11 columns

# Temporal split: fit on pre-2018 data, evaluate on 2018 onward.
train = data_cont.loc[data_cont["day"] < "2018-01-01"]
test = data_cont.loc[data_cont["day"] >= "2018-01-01"]

7.2.1 S 러너

# Features, treatment (discount level), and outcome for the continuous case.
X = ["month", "weekday", "is_holiday", "competitors_price"]
T = "discounts"
y = "sales"

np.random.seed(123)

# S-learner: a single model that receives the treatment as one more feature.
s_learner = LGBMRegressor()
s_learner.fit(train[X + [T]], train[y])
# Cross-join every test row with a grid of candidate discount levels and
# predict the counterfactual sales at each grid point.
t_grid = pd.DataFrame({"key": 1, "discounts": np.array([0, 10, 20, 30, 40])})

test_cf = (
    test.drop(columns=["discounts"])
    .assign(key=1)
    .merge(t_grid)
    # predictions are made after the expansion
    .assign(sales_hat=lambda d: s_learner.predict(d[X + [T]]))
)

test_cf.head(8)
rest_id day month weekday ... sales key discounts sales_hat
0 0 2018-01-01 1 0 ... 251.5 1 0 67.957972
1 0 2018-01-01 1 0 ... 251.5 1 10 444.245941
2 0 2018-01-01 1 0 ... 251.5 1 20 793.045769
3 0 2018-01-01 1 0 ... 251.5 1 30 1279.640793
4 0 2018-01-01 1 0 ... 251.5 1 40 1512.630767
5 0 2018-01-02 1 1 ... 541.0 1 0 65.672080
6 0 2018-01-02 1 1 ... 541.0 1 10 495.669220
7 0 2018-01-02 1 1 ... 541.0 1 20 1015.401471

8 rows × 13 columns

# Predicted discount-response curves for one restaurant on four sample days.
days = ["2018-12-25", "2018-01-01", "2018-06-01", "2018-06-18"]

plt.figure(figsize=(10, 4))
sns.lineplot(
    data=test_cf.query("rest_id==2").query("day.isin(@days)"),
    palette="gray",
    y="sales_hat",
    x="discounts",
    style="day",
)

from toolz import curry


@curry
def linear_effect(df, y, t):
    """OLS slope of y on t within df: cov(y, t) / var(t)."""
    cov_yt = np.cov(df[y], df[t])[0, 1]
    return cov_yt / df[t].var()


# Per restaurant-day effect: slope of predicted sales w.r.t. the discount.
cate = (
    test_cf.groupby(["rest_id", "day"])
    .apply(linear_effect(t="discounts", y="sales_hat"))
    .rename("cate")
)

test_s_learner_pred = test.set_index(["rest_id", "day"]).join(cate)

test_s_learner_pred.head()
month weekday weekend is_holiday ... competitors_price discounts sales cate
rest_id day
0 2018-01-01 1 0 False True ... 4.92 5 251.5 37.247404
2018-01-02 1 1 False False ... 3.06 10 541.0 40.269854
2018-01-03 1 2 False False ... 4.61 10 431.0 37.412988
2018-01-04 1 3 False False ... 4.84 20 760.0 38.436815
2018-01-05 1 4 False False ... 6.29 0 78.0 31.428603

5 rows × 10 columns

from fklearn.causal.validation.auc import area_under_the_relative_cumulative_gain_curve
from fklearn.causal.validation.curves import relative_cumulative_gain_curve

# Validate the S-learner CATE ordering with cumulative-gain curves.
gain_curve_test = relative_cumulative_gain_curve(
    test_s_learner_pred, T, y, prediction="cate"
)
auc = area_under_the_relative_cumulative_gain_curve(
    test_s_learner_pred, T, y, prediction="cate"
)

plt.figure(figsize=(10, 4))
plt.plot(gain_curve_test, label=f"AUC: {auc:.2f}", color="C0")
plt.hlines(0, 0, 100, linestyle="--", color="black", label="Baseline")
plt.legend()
plt.title("S-Learner")
Text(0.5, 1.0, 'S-Learner')

np.random.seed(123)


def sim_s_learner_ate():
    """Simulate confounded data (true ATE = 1) and return the S-learner's
    ATE estimate: mean difference of predictions at t=1 versus t=0."""
    n, nx = 10000, 20

    X = np.random.normal(0, 1, (n, nx))
    coefs_y = np.random.uniform(-1, 1, nx)
    coefs_t = np.random.uniform(-0.5, 0.5, nx)

    # Treatment probability depends on X, so assignment is confounded.
    ps = 1 / (1 + np.exp(-(X.dot(coefs_t))))
    t = np.random.binomial(1, ps)
    te = 1  # true treatment effect

    y = np.random.normal(t * te + X.dot(coefs_y), 1)

    s_learner = LGBMRegressor(max_depth=5, n_jobs=4)
    s_learner.fit(np.concatenate([X, t.reshape(-1, 1)], axis=1), y)

    # ATE: everyone-treated prediction minus no-one-treated prediction.
    pred_treated = s_learner.predict(np.concatenate([X, np.ones((n, 1))], axis=1))
    pred_control = s_learner.predict(np.concatenate([X, np.zeros((n, 1))], axis=1))
    return (pred_treated - pred_control).mean()


# Repeat the simulation to trace the sampling distribution of the estimate.
ates = [sim_s_learner_ate() for _ in range(500)]

plt.figure(figsize=(10, 4))
plt.hist(ates, bins=20, alpha=0.5, label="Simulations")
plt.vlines(1, 0, 70, label="True ATE")
plt.legend()
plt.xlabel("ATE")
plt.title("500 Simulations")
Text(0.5, 1.0, '500 Simulations')

7.2.2 이중/편향 제거 머신러닝

from sklearn.model_selection import cross_val_predict

X = ["month", "weekday", "is_holiday", "competitors_price"]
T = "discounts"
y = "sales"

# Double/debiased ML: residualize both treatment and outcome on X with
# out-of-fold predictions, then regress residual on residual.
debias_m = LGBMRegressor()
denoise_m = LGBMRegressor()

t_hat = cross_val_predict(debias_m, train[X], train[T], cv=5)
y_hat = cross_val_predict(denoise_m, train[X], train[y], cv=5)

t_res = train[T] - t_hat
y_res = train[y] - y_hat

import statsmodels.api as sm

sm.OLS(y_res, t_res).fit().summary().tables[1]
coef std err t P>|t| [0.025 0.975]
discounts 31.4615 0.151 208.990 0.000 31.166 31.757

이중 머신러닝으로 CATE 추정

# R-learner target transform: regress y*/t* on X with weights t*^2.
y_star = y_res / t_res
w = t_res**2

cate_model = LGBMRegressor().fit(train[X], y_star, sample_weight=w)

test_r_learner_pred = test.assign(cate=cate_model.predict(test[X]))

gain_curve_test = relative_cumulative_gain_curve(
    test_r_learner_pred, T, y, prediction="cate"
)
auc = area_under_the_relative_cumulative_gain_curve(
    test_r_learner_pred, T, y, prediction="cate"
)

plt.figure(figsize=(10, 4))
plt.plot(gain_curve_test, label=f"AUC: {auc:.2f}", color="C0")
plt.hlines(0, 0, 100, color="black", linestyle="--", label="Baseline")
plt.legend()
plt.title("R-Learner")
Text(0.5, 1.0, 'R-Learner')

이중 머신러닝에 대한 시각적인 직관

np.random.seed(123)
n = 5000

# Discrete effect modifier x_h and a continuous confounder x_c.
x_h = np.random.randint(1, 4, n)
x_c = np.random.uniform(-1, 1, n)

# Treatment depends on x_c (confounding); the effect of t scales with x_h.
t = np.random.normal(10 + 1 * x_c + 3 * x_c**2 + x_c**3, 0.3)
y = np.random.normal(t + x_h * t - 5 * x_c - x_c**2 - x_c**3, 0.3)

df_sim = pd.DataFrame({"x_h": x_h, "x_c": x_c, "t": t, "y": y})

plt.figure(figsize=(10, 5))
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ["0.1", "0.5", "0.9"])
sns.scatterplot(data=df_sim, y="y", x="t", hue="x_c", style="x_h", palette=cmap)
plt.legend(fontsize=14)

debias_m = LGBMRegressor(max_depth=3)
denoise_m = LGBMRegressor(max_depth=3)

# Out-of-fold residualization: the treatment model sees only x_c, while the
# outcome model also gets x_h.
t_hat = cross_val_predict(debias_m, df_sim[["x_c"]], df_sim["t"], cv=10)
y_hat = cross_val_predict(denoise_m, df_sim[["x_c", "x_h"]], df_sim["y"], cv=10)

df_res = df_sim.assign(t_res=df_sim["t"] - t_hat, y_res=df_sim["y"] - y_hat)

import statsmodels.formula.api as smf

smf.ols("y_res~t_res", data=df_res).fit().params["t_res"]
3.045230146006292
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left: residual-on-residual scatter, colored by the confounder x_c.
sns.scatterplot(
    data=df_res,
    y="y_res",
    x="t_res",
    hue="x_c",
    style="x_h",
    alpha=0.5,
    ax=ax1,
    palette=cmap,
)
# Removed a dead `h, l = ax1.get_legend_handles_labels()` whose results
# were never used (and `l` shadowed a common name).
ax1.legend(fontsize=14)

# Right: same residuals, colored by the effect modifier x_h.
sns.scatterplot(
    data=df_res, y="y_res", x="t_res", hue="x_h", ax=ax2, alpha=0.5, palette=cmap
)
ax2.legend(fontsize=14)

# Weighted target transform: the weighted average of y* = y_res/t_res with
# weights t_res^2 inside each x_h group gives that group's CATE estimate.
df_star = df_res.assign(
    y_star=df_res["y_res"] / df_res["t_res"],
    weight=df_res["t_res"] ** 2,
)

for x in range(1, 4):
    group = df_star.query(f"x_h=={x}")
    cate = np.average(group["y_star"], weights=group["weight"])
    print(f"CATE x_h={x}", cate)
CATE x_h=1 2.019759619990067
CATE x_h=2 2.974967932350952
CATE x_h=3 3.9962382855476957
plt.figure(figsize=(10, 6))

# The scatterplot call was previously wrapped in a discarded one-element
# tuple `( ..., )`; the wrapper did nothing and has been removed.
sns.scatterplot(
    data=df_star,
    palette=cmap,
    y="y_star",
    x="t_res",
    hue="x_h",
    size="weight",
    sizes=(1, 100),
)

# One horizontal line per x_h level at the group's weighted-average y_star,
# i.e. the group CATE estimate (replaces three copy-pasted hlines blocks).
for level, shade in zip(range(1, 4), ["0.1", "0.5", "0.9"]):
    group = df_star.query(f"x_h=={level}")
    plt.hlines(
        np.average(group["y_star"], weights=group["weight"]),
        -1,
        1,
        label=f"x_h={level}",
        color=shade,
    )

plt.ylim(-1, 8)

plt.legend(fontsize=12)

7.3 요약