10장 - 지역 실험과 스위치백 실험

from toolz.curried import *

import pandas as pd
import numpy as np

import statsmodels.formula.api as smf

from matplotlib import pyplot as plt
from cycler import cycler

# Grayscale palette so figures render well in black-and-white print
color = ["0.0", "0.4", "0.8"]
default_cycler = cycler(color=color)
# Line styles and markers to tell series apart without relying on color
linestyle = ["-", "--", ":", "-."]
marker = ["o", "v", "d", "p"]

plt.rc("axes", prop_cycle=default_cycler)
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
import cvxpy as cp


class SyntheticControl(BaseEstimator, RegressorMixin):
    """Synthetic-control estimator: finds convex weights over control units
    whose weighted combination tracks the treated series in the pre-period.

    Parameters
    ----------
    fit_intercept : bool, default=False
        If True, also fit a free intercept (level-shift) term.
    """

    def __init__(self, fit_intercept=False):
        self.fit_intercept = fit_intercept

    def fit(self, y_pre_co, y_pre_tr):
        """Solve for weights so that y_pre_co @ w_ + intercept_ ≈ y_pre_tr.

        Parameters
        ----------
        y_pre_co : array-like, shape (T_pre, n_controls)
            Pre-period outcomes of the control units, one column per unit.
        y_pre_tr : array-like, shape (T_pre,)
            Pre-period outcome of the treated unit (or treated average).
        """
        y_pre_co, y_pre_tr = check_X_y(y_pre_co, y_pre_tr)

        # Prepend an intercept column. When fit_intercept=False this column
        # is all zeros, which leaves w[0] unidentified (any value yields the
        # same loss) — we pin it to 0 below so intercept_ is well defined.
        intercept = np.ones((y_pre_co.shape[0], 1)) * self.fit_intercept
        X = np.concatenate([intercept, y_pre_co], axis=1)
        w = cp.Variable(X.shape[1])

        objective = cp.Minimize(cp.sum_squares(X @ w - y_pre_tr))
        # Unit weights form a convex combination: non-negative, sum to 1.
        constraints = [cp.sum(w[1:]) == 1, w[1:] >= 0]
        if not self.fit_intercept:
            # BUGFIX: without this constraint the solver may return an
            # arbitrary value for the unused intercept coefficient, which
            # predict() would then add to every prediction.
            constraints.append(w[0] == 0)

        problem = cp.Problem(objective, constraints)

        # Loose absolute tolerance is sufficient here and speeds up the solve.
        self.loss_ = problem.solve(eps_abs=1)
        self.w_ = w.value[1:]
        self.intercept_ = w.value[0]

        self.is_fitted_ = True
        return self

    def predict(self, y_co):
        """Return the synthetic series: y_co @ w_ + intercept_."""
        check_is_fitted(self)
        y_co = check_array(y_co)

        return y_co @ self.w_ + self.intercept_

10.1 지역 실험

지역 기반의 실험 설계를 통해 간섭 효과(Interference) 문제를 해결할 수 있습니다.

# Load the online-marketing data, parse the date column, and keep only the
# pre-treatment period (post == 0).
df = pd.read_csv("../data/online_mkt.csv")
df = df.astype({"date": "datetime64[ns]"}).query("post==0")

df.head()
app_download population city state date post treated
0 3066.0 12396372 sao_paulo sao_paulo 2022-03-01 0 1
1 2701.0 12396372 sao_paulo sao_paulo 2022-03-02 0 1
2 1927.0 12396372 sao_paulo sao_paulo 2022-03-03 0 1
3 1451.0 12396372 sao_paulo sao_paulo 2022-03-04 0 1
4 1248.0 12396372 sao_paulo sao_paulo 2022-03-05 0 1
# Sample-size rule of thumb: n ≈ 16 * sigma^2 / delta^2, with delta set to
# 5% of the average downloads and sigma^2 the variance of city-level means.
detectable_diff = 0.05 * df["app_download"].mean()
city_means = df.groupby("city")["app_download"].mean()
sigma_2 = city_means.var()

np.ceil(sigma_2 * 16 / detectable_diff**2)
36663.0

10.2 통제집단합성법 설계

pd.set_option("display.max_columns", 6)
# Keyword arguments are required: positional index/columns/values for
# DataFrame.pivot were deprecated in pandas 1.1 and removed in pandas 2.0.
df_piv = df.pivot(index="date", columns="city", values="app_download")

df_piv.head()
city ananindeua aparecida_de_goiania aracaju ... teresina uberlandia vila_velha
date
2022-03-01 11.0 54.0 65.0 ... 68.0 29.0 63.0
2022-03-02 5.0 20.0 42.0 ... 17.0 29.0 11.0
2022-03-03 2.0 0.0 0.0 ... 55.0 30.0 14.0
2022-03-04 0.0 0.0 11.0 ... 49.0 35.0 0.0
2022-03-05 5.0 5.0 0.0 ... 31.0 6.0 1.0

5 rows × 50 columns

# Population share of each city (the f vector): city population / total.
city_pop = df.groupby("city")["population"].first()
f = city_pop / city_pop.sum()

f_sorted = f.sort_values()
plt.figure(figsize=(12, 3))
plt.bar(f_sorted.index, f_sorted.values)
plt.title("Size of each city as a proportional of the market (f vector)")
plt.xticks(rotation=90, fontsize=14);

10.2.1 무작위로 실험군 선택하기

# Market-level average outcome: population-weighted (f) mean across cities.
y_avg = df_piv.dot(f)
geos = list(df_piv.columns)
n_tr = 5
# Draw 5 candidate treatment cities at random, without replacement.
np.random.seed(42)
rand_geos = np.random.choice(geos, n_tr, replace=False)
rand_geos
array(['manaus', 'recife', 'sao_bernardo_do_campo', 'salvador', 'aracaju'],
      dtype='<U23')
def get_sc(geos, df_sc, y_mean_pre):
    """Fit a synthetic control on the given cities and return only those
    that received non-negligible weight, together with the fit's loss."""
    sc_model = SyntheticControl(fit_intercept=True)
    sc_model.fit(df_sc[geos], y_mean_pre)

    # keep cities whose weight is effectively non-zero
    keep = np.abs(sc_model.w_) > 1e-5
    return {"geos": geos[keep], "loss": sc_model.loss_}


get_sc(rand_geos, df_piv, y_avg)
{'geos': array(['salvador', 'aracaju'], dtype='<U23'),
 'loss': 1598616.80875266}
def get_sc_st_combination(treatment_geos, df_sc, y_mean_pre):
    """Build a synthetic-treatment / synthetic-control pair of city pools.

    First fits a synthetic unit on the candidate treatment cities, then
    fits a second one on all remaining cities. Returns both selected pools
    and the sum of the two fit losses.
    """
    st = get_sc(treatment_geos, df_sc, y_mean_pre)

    # control pool: every city not selected by the synthetic treatment
    leftover = df_sc.drop(columns=st["geos"]).columns
    sc = get_sc(leftover, df_sc, y_mean_pre)

    return {
        "st_geos": st["geos"],
        "sc_geos": sc["geos"],
        "loss": st["loss"] + sc["loss"],
    }


resulting_geos = get_sc_st_combination(rand_geos, df_piv, y_avg)
resulting_geos.get("st_geos")
array(['salvador', 'aracaju'], dtype='<U23')
# Sanity check: treatment and control pools together cover all 50 cities.
len(resulting_geos.get("st_geos")) + len(resulting_geos.get("sc_geos"))
50
# Fit one synthetic unit on each pool and compare both against the
# market average over the pre-period.
synthetic_tr = SyntheticControl(fit_intercept=True)
synthetic_co = SyntheticControl(fit_intercept=True)

synthetic_tr.fit(df_piv[resulting_geos.get("st_geos")], y_avg)
synthetic_co.fit(df_piv[resulting_geos.get("sc_geos")], y_avg)

plt.figure(figsize=(10, 4))
plt.plot(y_avg, label="Average")
plt.plot(
    y_avg.index,
    synthetic_co.predict(df_piv[resulting_geos.get("sc_geos")]),
    ls=":",
    label="sc",
)
plt.plot(
    y_avg.index,
    synthetic_tr.predict(df_piv[resulting_geos.get("st_geos")]),
    label="st",
)
plt.xticks(rotation=45)
plt.legend()

10.2.2 무작위 탐색

from joblib import Parallel, delayed
from toolz import partial

# Random search: evaluate 1000 random treatment pools in parallel and keep
# the one with the smallest combined (treatment + control) loss.
np.random.seed(42)
geo_samples = [np.random.choice(geos, n_tr, replace=False) for _ in range(1000)]

est_combination = partial(get_sc_st_combination, df_sc=df_piv, y_mean_pre=y_avg)

results = Parallel(n_jobs=4)(delayed(est_combination)(g) for g in geo_samples)
resulting_geos = min(results, key=lambda r: r.get("loss"))
resulting_geos.get("st_geos")
array(['nova_iguacu', 'belem', 'joinville', 'sao_paulo'], dtype='<U23')
# Refit and re-plot using the best pools found by the random search.
synthetic_tr = SyntheticControl(fit_intercept=True)
synthetic_co = SyntheticControl(fit_intercept=True)

synthetic_tr.fit(df_piv[resulting_geos.get("st_geos")], y_avg)
synthetic_co.fit(df_piv[resulting_geos.get("sc_geos")], y_avg)

plt.figure(figsize=(10, 4))
plt.plot(y_avg, label="Average")
plt.plot(
    y_avg.index,
    synthetic_co.predict(df_piv[resulting_geos.get("sc_geos")]),
    ls=":",
    label="sc",
)
plt.plot(
    y_avg.index,
    synthetic_tr.predict(df_piv[resulting_geos.get("st_geos")]),
    label="st",
)
plt.xticks(rotation=45)
plt.legend()

10.3 스위치백 실험

# Switchback experiment data: treatment flips over time within a single unit;
# both potential outcomes and the true effect are included for illustration.
df = pd.read_csv("../data/sb_exp_every.csv")
df.head()
d delivery_time delivery_time_1 delivery_time_0 tau
0 1 2.84 2.84 5.84 -3.0
1 0 4.49 1.49 6.49 -5.0
2 0 7.27 2.27 8.27 -6.0
3 1 5.27 2.27 8.27 -6.0
4 1 5.59 4.59 10.59 -6.0
# Plot both potential outcomes, the observed series, and mark treated (v)
# versus control (o) hours on the observed outcome.
plt.figure(figsize=(12, 5))
x = df.index
plt.plot(x, df["delivery_time_0"], ls=":", color="0.0", label="Y0")
plt.plot(x, df["delivery_time_1"], ls=":", color="0.8", label="Y1")
plt.plot(x, df["delivery_time"], color="0.4", label="Y")
for d_val, mk, lab in [(1, "v", "Treated"), (0, "o", "Control")]:
    sel = df["d"] == d_val
    plt.scatter(x[sel], df["delivery_time"][sel], label=lab, marker=mk, color="0.4")
plt.ylabel("delivery_time")
plt.xlabel("Hours")
plt.legend(fontsize=14)

10.3.1 시퀀스의 잠재적 결과

10.3.2 이월 효과의 차수 추정

# Add lagged treatment columns d_l0 ... d_l6 (d_l0 is the current period;
# early rows get NaN where the lag reaches before the sample start).
df_lags = df.assign(**{f"d_l{lag}": df["d"].shift(lag) for lag in range(7)})

df_lags[[f"d_l{lag}" for lag in range(7)]].head()
d_l0 d_l1 d_l2 ... d_l4 d_l5 d_l6
0 1 NaN NaN ... NaN NaN NaN
1 0 1.0 NaN ... NaN NaN NaN
2 0 0.0 1.0 ... NaN NaN NaN
3 1 0.0 0.0 ... NaN NaN NaN
4 1 1.0 0.0 ... 1.0 NaN NaN

5 rows × 7 columns

# Regress delivery time on the current and six lagged treatment indicators.
model = smf.ols(
    "delivery_time ~" + "+".join(f"d_l{lag}" for lag in range(7)), data=df_lags
).fit()

model.summary().tables[1]
coef std err t P>|t| [0.025 0.975]
Intercept 9.3270 0.461 20.246 0.000 8.414 10.240
d_l0 -2.9645 0.335 -8.843 0.000 -3.629 -2.300
d_l1 -1.8861 0.339 -5.560 0.000 -2.559 -1.213
d_l2 -1.0013 0.340 -2.943 0.004 -1.676 -0.327
d_l3 0.2594 0.341 0.762 0.448 -0.416 0.935
d_l4 0.1431 0.340 0.421 0.675 -0.531 0.817
d_l5 0.1388 0.340 0.408 0.684 -0.536 0.813
d_l6 0.5588 0.336 1.662 0.099 -0.108 1.225
## remember to remove the intercept
# Total effect including carryover: sum of all lag coefficients. The SE sums
# squared standard errors, i.e. it ignores covariances between coefficients.
tau_m_hat = model.params[1:].sum()
se_tau_m_hat = np.sqrt((model.bse[1:] ** 2).sum())
print("tau_m:", tau_m_hat)
print("95% CI:", [tau_m_hat - 1.96 * se_tau_m_hat, tau_m_hat + 1.96 * se_tau_m_hat])
tau_m: -4.751686115272022
95% CI: [-6.5087183781545574, -2.9946538523894857]
## selecting lags 0, 1 and 2
# Restrict the sum to the first three lags (carryover order m = 2), the ones
# with clearly non-zero coefficients in the regression table above.
tau_m_hat = model.params[1:4].sum()
se_tau_m_hat = np.sqrt((model.bse[1:4] ** 2).sum())
print("tau_m:", tau_m_hat)
print("95% CI:", [tau_m_hat - 1.96 * se_tau_m_hat, tau_m_hat + 1.96 * se_tau_m_hat])
tau_m: -5.8518568954422925
95% CI: [-7.000105171362163, -4.703608619522422]

10.3.3 디자인 기반의 추정

# Randomization points every 3 periods: True marks the start of a block.
rad_points_3 = np.array([True, False, False] * (2))
rad_points_3
array([ True, False, False,  True, False, False])
# Cumulative sum of the flags gives each period's block id.
rad_points_3.cumsum()
array([1, 1, 1, 2, 2, 2])
from numpy.lib.stride_tricks import sliding_window_view

m = 2
# Each row holds the block ids of m+1 consecutive periods (t-m .. t).
sliding_window_view(rad_points_3.cumsum(), window_shape=m + 1)
array([[1, 1, 1],
       [1, 1, 2],
       [1, 2, 2],
       [2, 2, 2]])
# Within-window diffs of block ids: each 1 marks a randomization point
# falling inside the window.
np.diff(sliding_window_view(rad_points_3.cumsum(), 3), axis=1)
array([[0, 0],
       [0, 1],
       [1, 0],
       [0, 0]])
# Number of randomizations affecting the window [t-m, t]: the id changes
# inside the window plus the draw that set the window's first period. The
# first m periods have incomplete windows, so they are padded with NaN.
inner_counts = (
    np.diff(sliding_window_view(rad_points_3.cumsum(), 3), axis=1).sum(axis=1) + 1
)
n_rand_windows = np.concatenate([[np.nan] * m, inner_counts])

n_rand_windows
array([nan, nan,  1.,  2.,  2.,  1.])
p = 0.5
# Probability that all assignments in the window came out the same arm.
p**n_rand_windows
array([ nan,  nan, 0.5 , 0.25, 0.25, 0.5 ])
def compute_p(rand_points, m, p=0.5):
    """Probability that treatment is constant at one arm over [t - m, t].

    rand_points : bool array flagging periods where treatment is re-drawn.
    m : carryover order (window length is m + 1).
    p : per-randomization probability of the arm in question.

    Returns an array aligned with rand_points; the first m entries are NaN
    since their windows extend before the start of the sample.
    """
    block_ids = np.cumsum(rand_points)
    windows = sliding_window_view(block_ids, m + 1)
    # randomizations touching each window: id changes within the window,
    # plus the draw that determined the window's first period
    n_rand = np.diff(windows, axis=1).sum(axis=1) + 1
    return np.concatenate([[np.nan] * m, p**n_rand])


# Randomizing every period: each window sees 3 draws, so P = 0.5**3 = 0.125.
compute_p(np.ones(6) == 1, 2, 0.5)
array([  nan,   nan, 0.125, 0.125, 0.125, 0.125])
rand_points = np.array([True, False, False, True, False, True, False])
# Fewer randomization points per window -> higher probability that the
# window stayed on a single arm.
compute_p(rand_points, 2, 0.5)
array([ nan,  nan, 0.5 , 0.25, 0.25, 0.25, 0.25])
def last_m_d_equal(d_vec, d, m):
    """Indicator (as floats) that periods t-m .. t all have assignment d.

    The first m periods have incomplete windows and come back as NaN.
    """
    hits = (sliding_window_view(d_vec, m + 1) == d).all(axis=1)
    return np.concatenate([[np.nan] * m, hits])


print(last_m_d_equal([1, 1, 1, 0, 0, 0], 1, m=2))
print(last_m_d_equal([1, 1, 1, 0, 0, 0], 0, m=2))
[nan nan  1.  0.  0.  0.]
[nan nan  0.  0.  0.  1.]
def ipw_switchback(d, y, rand_points, m, p=0.5):
    """Design-based (Horvitz-Thompson / IPW) switchback estimator.

    Periods whose last m+1 assignments were all-treated or all-control are
    reweighted by the inverse probability of that event; the difference of
    the reweighted outcomes is averaged over periods t >= m.
    """
    # probability that the last m+1 assignments were all 1 (resp. all 0)
    prob_all_1 = compute_p(rand_points, m=m, p=p)
    prob_all_0 = compute_p(rand_points, m=m, p=1 - p)

    # indicators of those two events
    ind_all_1 = last_m_d_equal(d, 1, m)
    ind_all_0 = last_m_d_equal(d, 0, m)

    y1_rec = y * ind_all_1 / prob_all_1
    y0_rec = y * ind_all_0 / prob_all_0

    # drop the first m periods (incomplete windows -> NaN)
    return np.mean((y1_rec - y0_rec)[m:])


ipw_switchback(df["d"], df["delivery_time"], np.ones(len(df)) == 1, m=2, p=0.5)
-7.426440677966101
from matplotlib import pyplot as plt
from statsmodels.tsa.arima_process import ArmaProcess


def gen_d(rand_points, p):
    """Simulate a treatment path: re-draw Bernoulli(p) at each randomization
    point, otherwise carry the previous assignment forward."""
    path = [np.random.binomial(1, p)]

    for is_rand in rand_points[1:]:
        # always draw so the RNG stream matches regardless of the schedule
        fresh = np.random.binomial(1, p)
        path.append(fresh * is_rand + (1 - is_rand) * path[-1])

    return np.array(path)


def y_given_d(d, effect_params, T, seed=None):
    """Simulate outcomes given an assignment path: log trend + 24-period
    sinusoidal seasonality + lagged treatment effect + ARMA noise.

    The effect enters through the control indicator (~d) convolved with
    effect_params, so being treated lowers the outcome.
    """
    if seed is not None:
        np.random.seed(seed)
    t_grid = np.arange(1, T + 1)
    trend = np.log(t_grid + 1)
    seasonal = 2 * np.sin(t_grid * 2 * np.pi / 24)
    # truncate the convolution tail so the series keeps length T
    carryover = np.convolve(~d.astype(bool), effect_params)[: -(len(effect_params) - 1)]
    noise = ArmaProcess([3, 2, 1], 12).generate_sample(T)
    return (trend + seasonal + carryover + noise).round(2)


def gen_data_rand_every():
    """Simulate one switchback experiment randomized at every period.

    Returns a DataFrame with assignment d, outcome y, and rand_points
    (all True, since every period is a randomization point).
    """
    effect_params = [3, 2, 1]  # carryover effects for lags 0, 1, 2
    T = 120
    p = 0.5

    # use the locals instead of repeating the literals (effect_params and p
    # were previously defined but unused); the unused `m` was dropped
    d = np.random.binomial(1, p, T)
    y = y_given_d(d, effect_params, T)
    rand_points = np.ones(T) == 1

    return pd.DataFrame(dict(d=d, y=y, rand_points=rand_points))


def tau_ols(df):
    """OLS estimate of the total effect: sum of the lag-0..2 coefficients."""
    lagged = df.assign(**{f"d_l{lag}": df["d"].shift(lag) for lag in range(3)})
    fitted = smf.ols(
        "y ~" + "+".join(f"d_l{lag}" for lag in range(3)), data=lagged
    ).fit()
    # position 0 is the intercept — exclude it from the sum
    return fitted.params[1:].sum()


def tau_ipw(df):
    # Design-based estimate with carryover order m=2 and assignment prob 0.5.
    return ipw_switchback(df["d"], df["y"], df["rand_points"], m=2, p=0.5)


# Monte Carlo comparison of the OLS and IPW estimators over 500 simulated
# every-period switchback experiments; the true total effect is -6.
np.random.seed(123)
exps_dfs = [gen_data_rand_every() for _ in range(500)]

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(8, 4))

ols_taus = [tau_ols(exp_df) for exp_df in exps_dfs]
ax1.hist(ols_taus, label="OLS tau", color="0.5")
ax1.vlines(-6, 0, 120, color="0", label="True tau")
ax1.legend()

ipw_taus = [tau_ipw(exp_df) for exp_df in exps_dfs]
ax2.hist(ipw_taus, label="IPW tau", color="0.5")
ax2.vlines(-6, 0, 120, color="0")
ax2.legend()

10.3.4 최적의 스위치백 설계

m = 2
T = 12
n = T / m
# Optimal design: randomize at t=1 and at the start of the interior
# m-blocks (skipping the first and last ones).
np.isin(np.arange(1, T + 1), [1] + [i * m + 1 for i in range(2, int(n) - 1)]) * 1
array([1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0])
m = 3
T = 15
n = T / m
# Same optimal-design pattern with block length m = 3.
np.isin(np.arange(1, T + 1), [1] + [i * m + 1 for i in range(2, int(n) - 1)]) * 1
array([1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0])
def gen_d(rand_points, p):
    """Simulate a treatment path (re-stated from the previous section):
    re-draw Bernoulli(p) at randomization points, otherwise carry forward."""
    draws = [np.random.binomial(1, p)]

    for rnd in rand_points[1:]:
        # draw unconditionally so the RNG stream is schedule-independent
        candidate = np.random.binomial(1, p)
        draws.append(candidate * rnd + (1 - rnd) * draws[-1])

    return np.array(draws)


T = 120
m = 2


def gen_exp(rand_points, T):
    """Simulate one switchback experiment under the given randomization
    schedule (treatment re-drawn only where rand_points is True)."""
    effect_params = [3, 2, 1]  # carryover effects for lags 0, 1, 2
    p = 0.5

    d = gen_d(rand_points, p=p)
    # pass the local instead of repeating the [3, 2, 1] literal
    y = y_given_d(d, effect_params, T)

    return pd.DataFrame(dict(d=d, y=y, rand_points=rand_points))


# Three randomization schedules over T periods: every period, every third
# period, and the variance-optimal design for carryover order m.
every_1 = np.ones(T, dtype=bool)
every_3 = np.tile([True, False, False], T // 3)
n = T // m
opt = np.isin(np.arange(1, T + 1), [1] + [i * m + 1 for i in range(2, int(n) - 1)])

# simulate 1000 experiments per schedule (same order to keep RNG comparable)
np.random.seed(123)
exps_every_1 = [gen_exp(every_1, T) for _ in range(1000)]
exps_every_3 = [gen_exp(every_3, T) for _ in range(1000)]
exps_opt = [gen_exp(opt, T) for _ in range(1000)]
fig, axs = plt.subplots(3, 1, sharex=True, figsize=(12, 6))
ax1, ax2, ax3 = axs.ravel()

# IPW estimates under each schedule; a lower variance means a better design.
taus_every_1_ipw = list(map(tau_ipw, exps_every_1))
taus_every_3_ipw = list(map(tau_ipw, exps_every_3))
taus_opt_ipw = list(map(tau_ipw, exps_opt))

for ax, taus, lab in [
    (ax1, taus_every_1_ipw, "Every Period"),
    (ax2, taus_every_3_ipw, "Every 3 Periods"),
    (ax3, taus_opt_ipw, "Optimal Design"),
]:
    ax.hist(taus, label=lab, color="0.5")
    ax.set_title(f"Var={np.round(np.var(taus), 2)}")
    ax.legend()

plt.tight_layout()

10.3.5 강건한 분산

# Experiment run with the optimal switchback design (rand_points marks
# where the treatment was re-randomized).
df_opt = pd.read_csv("../data/sb_exp_opt.csv")
df_opt.head(6)
rand_points d delivery_time
0 True 0 5.84
1 False 0 5.40
2 False 0 8.86
3 False 0 8.79
4 True 0 10.93
5 False 0 7.02
# IPW point estimate on the optimal-design experiment (m=2, p=0.5).
tau_hat = ipw_switchback(
    df_opt["d"], df_opt["delivery_time"], df_opt["rand_points"], m=2, p=0.5
)

tau_hat
-9.921016949152545
# Split a series into 3 equal blocks and stack them as rows.
np.vstack(np.hsplit(np.array([1, 1, 1, 2, 2, 3]), 3))
array([[1, 1],
       [1, 2],
       [2, 3]])
# True where consecutive blocks start with the same assignment.
np.diff(np.vstack(np.hsplit(np.array([1, 1, 0, 0, 0, 0]), 3))[:, 0]) == 0
array([False,  True])
def var_opt_design(d_opt, y_opt, T, m):
    """Variance estimator for the IPW estimator under the optimal
    switchback design: T periods split into blocks of length m.

    Raises AssertionError unless T is a multiple of m with >= 4 blocks.
    """
    n_blocks = T // m
    assert (n_blocks == T / m) & (n_blocks >= 4), "T must be divisible by m and T/m >= 4"

    # per-block outcome totals, dropping the first (burn-in) block
    block_sums = np.vstack(np.hsplit(y_opt, n_blocks)).sum(axis=1)[1:]

    # first assignment of each remaining block represents that block
    block_d = np.vstack(np.split(d_opt, n_blocks)[1:])[:, 0]

    # interior blocks contribute only where the assignment did not flip
    # into the following block
    same_as_next = (np.diff(block_d) == 0)[:-1]
    middle = (32 * block_sums[1:-1] ** 2 * same_as_next).sum()

    edges = 8 * block_sums[0] ** 2 + 8 * block_sums[-1] ** 2
    return (edges + middle) / (T - m) ** 2
# 95% confidence interval using the design-based robust variance.
se_hat = np.sqrt(var_opt_design(df_opt["d"], df_opt["delivery_time"], T=120, m=2))

[tau_hat - 1.96 * se_hat, tau_hat + 1.96 * se_hat]
[-18.490627362048095, -1.351406536256997]

10.4 요약