Chapter 5 - Propensity Score

import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import graphviz as gr
from matplotlib import style
import seaborn as sns
from matplotlib import pyplot as plt

style.use("ggplot")
import statsmodels.formula.api as smf
from cycler import cycler

color = ["0.2", "0.6", "1.0"]
default_cycler = cycler(color=color)
linestyle = ["-", "--", ":", "-."]
marker = ["o", "v", "d", "p"]


plt.rc("axes", prop_cycle=default_cycler)

5.1 The Impact of Management Training

df = pd.read_csv("data/management_training.csv")
df.head()
departament_id intervention engagement_score tenure n_of_reports gender role last_engagement_score department_score department_size
0 76 1 0.277359 6 4 2 4 0.614261 0.224077 843
1 76 1 -0.449646 4 8 2 4 0.069636 0.224077 843
2 76 1 0.769703 6 4 2 4 0.866918 0.224077 843
3 76 1 -0.121763 6 4 2 4 0.029071 0.224077 843
4 76 1 1.526147 6 4 1 4 0.589857 0.224077 843

5.2 Adjusting with Regression

smf.ols("engagement_score ~ intervention", data=df).fit().summary().tables[1]
coef std err t P>|t| [0.025 0.975]
Intercept -0.2347 0.014 -16.619 0.000 -0.262 -0.207
intervention 0.4346 0.019 22.616 0.000 0.397 0.472
model = smf.ols(
    """engagement_score ~ intervention
+ tenure + last_engagement_score + department_score
+ n_of_reports + C(gender) + C(role)""",
    data=df,
).fit()

print("ATE:", model.params["intervention"])
print("95% CI:", model.conf_int().loc["intervention", :].values.T)
ATE: 0.2677908576676856
95% CI: [0.23357751 0.30200421]

5.3 Propensity Score

g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edge("T", "Y")
g.edge("X", "Y")
g.edge("X", "e(x)")
g.edge("e(x)", "T")

g
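This graph encodes the key property of the propensity score (Rosenbaum and Rubin, 1983): \(e(x)\) fully mediates the effect of the covariates on the treatment, so if adjusting for \(X\) removes confounding, adjusting for the scalar \(e(X)\) alone does too:

\[
(Y_0, Y_1) \perp T \mid X \;\Rightarrow\; (Y_0, Y_1) \perp T \mid e(X),
\qquad e(x) \equiv P(T = 1 \mid X = x)
\]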

5.3.1 Propensity Score Estimation

ps_model = smf.logit(
    """intervention ~
tenure + last_engagement_score + department_score
+ C(n_of_reports) + C(gender) + C(role)""",
    data=df,
).fit(disp=0)
data_ps = df.assign(
    propensity_score=ps_model.predict(df),
)

data_ps[["intervention", "engagement_score", "propensity_score"]].head()
intervention engagement_score propensity_score
0 1 0.277359 0.596106
1 1 -0.449646 0.391138
2 1 0.769703 0.602578
3 1 -0.121763 0.580990
4 1 1.526147 0.619976
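Since the propensity model is a logistic regression, each value in the new propensity_score column is simply the fitted probability of treatment:

\[
\hat{e}(x) = \hat{P}(T = 1 \mid X = x) = \frac{1}{1 + e^{-x'\hat{\beta}}}
\]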

5.3.2 Propensity Score and Orthogonalization

model = smf.ols(
    "engagement_score ~ intervention + propensity_score", data=data_ps
).fit()
model.params["intervention"]
0.26331267490277066
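To see why a single regression on the propensity score debiases the estimate, recall the Frisch-Waugh-Lovell theorem: regressing the outcome on the part of the treatment that is orthogonal to \(\hat{e}(x)\) recovers exactly the same coefficient. A minimal sketch, reusing data_ps from above (the t_res name is just for illustration):

# residualize the treatment on the propensity score (FWL step)
t_res = smf.ols("intervention ~ propensity_score", data=data_ps).fit().resid

# regressing the outcome on the residual gives the same coefficient
print(
    smf.ols("engagement_score ~ t_res", data=data_ps.assign(t_res=t_res))
    .fit()
    .params["t_res"]
)  # matches model.params["intervention"] above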

5.3.3 Propensity Score Matching

from sklearn.neighbors import KNeighborsRegressor

T = "intervention"
X = "propensity_score"
Y = "engagement_score"

treated = data_ps.query(f"{T}==1")
untreated = data_ps.query(f"{T}==0")

mt0 = KNeighborsRegressor(n_neighbors=1).fit(untreated[[X]], untreated[Y])

mt1 = KNeighborsRegressor(n_neighbors=1).fit(treated[[X]], treated[Y])

predicted = pd.concat(
    [
        # find matches for the treated looking at the untreated knn model
        treated.assign(match=mt0.predict(treated[[X]])),
        # find matches for the untreated looking at the treated knn model
        untreated.assign(match=mt1.predict(untreated[[X]])),
    ]
)

predicted.head()
departament_id intervention engagement_score tenure n_of_reports gender role last_engagement_score department_score department_size propensity_score match
0 76 1 0.277359 6 4 2 4 0.614261 0.224077 843 0.596106 0.557680
1 76 1 -0.449646 4 8 2 4 0.069636 0.224077 843 0.391138 -0.952622
2 76 1 0.769703 6 4 2 4 0.866918 0.224077 843 0.602578 -0.618381
3 76 1 -0.121763 6 4 2 4 0.029071 0.224077 843 0.580990 -1.404962
4 76 1 1.526147 6 4 1 4 0.589857 0.224077 843 0.619976 0.000354
np.mean(
    (predicted[Y] - predicted["match"]) * predicted[T]
    + (predicted["match"] - predicted[Y]) * (1 - predicted[T])
)
0.28777443474045966
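This np.mean is the matching estimator written out: for each unit, the outcome of its nearest propensity-score neighbor in the other group stands in for the missing potential outcome,

\[
\widehat{ATE} = \frac{1}{N}\sum_{i=1}^{N}\Big[T_i\big(Y_i - Y_{m(i)}\big) + (1 - T_i)\big(Y_{m(i)} - Y_i\big)\Big]
\]

where \(m(i)\) is the index of unit \(i\)'s match.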

5.3.4 Inverse Probability Weighting

g_data = (
    data_ps.assign(
        weight=data_ps["intervention"] / data_ps["propensity_score"]
        + (1 - data_ps["intervention"]) / (1 - data_ps["propensity_score"]),
        propensity_score=data_ps["propensity_score"].round(2),
    )
    .groupby(["propensity_score", "intervention"])[["weight", "engagement_score"]]
    .mean()
    .reset_index()
)

plt.figure(figsize=(10, 4))
for t in [0, 1]:
    sns.scatterplot(
        data=g_data.query(f"intervention=={t}"),
        y="engagement_score",
        x="propensity_score",
        size="weight",
        sizes=(1, 100),
        color=color[t],
        legend=None,
        label=f"T={t}",
        marker=marker[t],
    )

plt.title("Inverse Probability of Treatment Weighting")
plt.legend()

weight_t = 1 / data_ps.query("intervention==1")["propensity_score"]
weight_nt = 1 / (1 - data_ps.query("intervention==0")["propensity_score"])
t1 = data_ps.query("intervention==1")["engagement_score"]
t0 = data_ps.query("intervention==0")["engagement_score"]

y1 = sum(t1 * weight_t) / len(data_ps)
y0 = sum(t0 * weight_nt) / len(data_ps)

print("E[Y1]:", y1)
print("E[Y0]:", y0)
print("ATE", y1 - y0)
E[Y1]: 0.11656317232946772
E[Y0]: -0.1494155364781444
ATE 0.2659787088076121
np.mean(
    data_ps["engagement_score"]
    * (data_ps["intervention"] - data_ps["propensity_score"])
    / (data_ps["propensity_score"] * (1 - data_ps["propensity_score"]))
)
0.26597870880761226
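Both computations are sample versions of the same IPW identity: reweighting each unit by the inverse probability of the treatment it actually received recovers the potential-outcome means,

\[
E[Y_t] = E\!\left[\frac{\mathbb{1}\{T = t\}\,Y}{P(T = t \mid X)}\right],
\qquad
\widehat{ATE} = \frac{1}{N}\sum_{i=1}^{N}\frac{Y_i\big(T_i - \hat{e}(X_i)\big)}{\hat{e}(X_i)\big(1 - \hat{e}(X_i)\big)}
\]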

5.3.5 The Variance of IPW

from sklearn.linear_model import LogisticRegression
from patsy import dmatrix


# define function that computes the IPW estimator
def est_ate_with_ps(df, ps_formula, T, Y):
    X = dmatrix(ps_formula, df)
    ps_model = LogisticRegression(penalty="none", max_iter=1000).fit(X, df[T])
    ps = ps_model.predict_proba(X)[:, 1]

    # compute the ATE
    return np.mean((df[T] - ps) / (ps * (1 - ps)) * df[Y])
formula = """tenure + last_engagement_score + department_score
+ C(n_of_reports) + C(gender) + C(role)"""
T = "intervention"
Y = "engagement_score"

est_ate_with_ps(df, formula, T, Y)
0.2659755621752663
from joblib import Parallel, delayed  # for parallel processing


def bootstrap(data, est_fn, rounds=200, seed=123, pcts=[2.5, 97.5]):
    np.random.seed(seed)

    stats = Parallel(n_jobs=4)(
        delayed(est_fn)(data.sample(frac=1, replace=True)) for _ in range(rounds)
    )

    return np.percentile(stats, pcts)
from toolz import partial

print(f"ATE: {est_ate_with_ps(df, formula, T, Y)}")

est_fn = partial(est_ate_with_ps, ps_formula=formula, T=T, Y=Y)
print("95% C.I.: ", bootstrap(df, est_fn))
ATE: 0.2659755621752663
95% C.I.:  [0.22654315 0.30072595]

5.3.6 Stabilized Propensity Weights

print("Original Sample Size", data_ps.shape[0])
print("Treated Pseudo-Population Sample Size", sum(weight_t))
print("Untreated Pseudo-Population Sample Size", sum(weight_nt))
Original Sample Size 10391
Treated Pseudo-Population Sample Size 10435.089079197916
Untreated Pseudo-Population Sample Size 10354.298899788304
p_of_t = data_ps["intervention"].mean()

t1 = data_ps.query("intervention==1")
t0 = data_ps.query("intervention==0")

weight_t_stable = p_of_t / t1["propensity_score"]
weight_nt_stable = (1 - p_of_t) / (1 - t0["propensity_score"])

print("Treat size:", len(t1))
print("W treat", sum(weight_t_stable))

print("Control size:", len(t0))
print("W treat", sum(weight_nt_stable))
Treat size: 5611
W treat 5634.807508745978
Control size: 4780
W control 4763.116999421415
nt = len(t1)
nc = len(t0)

y1 = sum(t1["engagement_score"] * weight_t_stable) / nt
y0 = sum(t0["engagement_score"] * weight_nt_stable) / nc

print("ATE: ", y1 - y0)
ATE:  0.26597870880761176
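Stabilization simply rescales each weight by the marginal probability of the treatment received, which keeps every pseudo-population close to its original size without changing the ATE estimate:

\[
w_i =
\begin{cases}
P(T = 1)\,/\,\hat{e}(X_i), & T_i = 1 \\
\big(1 - P(T = 1)\big)\,/\,\big(1 - \hat{e}(X_i)\big), & T_i = 0
\end{cases}
\]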

5.3.7 Pseudo-Populations

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)

sns.histplot(
    data_ps.query("intervention==0")["propensity_score"],
    stat="probability",
    label="Not Treated",
    color="C0",
    bins=30,
    ax=ax1,
    alpha=0.5,
)
sns.histplot(
    data_ps.query("intervention==1")["propensity_score"],
    stat="probability",
    label="Treated",
    color="C2",
    alpha=0.5,
    bins=30,
    ax=ax1,
)
ax1.set_title("Propensity Distribution")

sns.histplot(
    data_ps.query("intervention==0").assign(w=weight_nt_stable),
    x="propensity_score",
    stat="probability",
    color="C0",
    weights="w",
    label="Non Treated",
    bins=30,
    ax=ax2,
    alpha=0.5,
)

sns.histplot(
    data_ps.query("intervention==1").assign(w=weight_t_stable),
    x="propensity_score",
    stat="probability",
    color="C2",
    weights="w",
    label="Treated",
    bins=30,
    alpha=0.5,
    ax=ax2,
)
ax2.set_title("Weighted Propensity Distribution")
plt.legend()

plt.tight_layout()

5.3.8 Selection Bias

5.3.9 Bias-Variance Trade-Off

5.3.10 The Positivity Assumption

np.random.seed(42)

n = 1000
x = np.random.normal(0, 1, n)
t = np.random.normal(x, 0.5, n) > 0

y0 = -x
y1 = y0 + t  # ate of 1

y = np.random.normal((1 - t) * y0 + t * y1, 0.2)

df_no_pos = pd.DataFrame(dict(x=x, t=t.astype(int), y=y))

df_no_pos.head()
x t y
0 1.624345 1 -0.526442
1 -0.611756 0 0.659516
2 -0.528172 0 0.438549
3 -1.072969 0 0.950810
4 0.865408 1 -0.271397
ps_model_sim = smf.logit("""t ~ x""", data=df_no_pos).fit(disp=0)
df_no_pos_ps = df_no_pos.assign(ps=ps_model_sim.predict(df_no_pos))

ps = ps_model_sim.predict(df_no_pos)
w = df_no_pos["t"] * df_no_pos["t"].mean() / ps + (1 - df_no_pos["t"]) * (
    (1 - df_no_pos["t"].mean()) / (1 - ps)
)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 7))

sns.scatterplot(data=df_no_pos_ps.assign(w=w), x="x", y="y", hue="t", ax=ax1)
ax1.set_title("Original Data")

sns.histplot(
    df_no_pos_ps.query("t==0")["ps"],
    stat="probability",
    label="Non Treated",
    color="C0",
    bins=30,
    ax=ax2,
)
sns.histplot(
    df_no_pos_ps.query("t==1")["ps"],
    stat="probability",
    label="Treated",
    color="C1",
    alpha=0.5,
    bins=30,
    ax=ax2,
)
ax2.set_xlabel("e(x)")
ax2.legend()
ax2.set_title("Positivity Check")

sns.scatterplot(
    data=df_no_pos_ps.assign(**{"1/P(T=t)": w}),
    x="x",
    y="y",
    hue="t",
    ax=ax3,
    size="1/P(T=t)",
    sizes=(1, 100),
)
ax3.set_title("IPW Data")

plt.tight_layout()

est_fn = partial(est_ate_with_ps, ps_formula="x", T="t", Y="y")
print("ATE:", est_fn(df_no_pos))
print("95% C.I.: ", bootstrap(df_no_pos, est_fn))
ATE: 0.6478011810615735
95% C.I.:  [0.41710504 0.88840195]
smf.ols("t ~ x + t", data=df_no_pos).fit().params["t"]
1.0

5.4 Design- vs. Model-Based Identification

5.5 Doubly Robust Estimation

from sklearn.linear_model import LinearRegression


def doubly_robust(df, formula, T, Y):
    X = dmatrix(formula, df)

    ps_model = LogisticRegression(penalty="none", max_iter=1000).fit(X, df[T])
    ps = ps_model.predict_proba(X)[:, 1]

    m0 = LinearRegression().fit(X[df[T] == 0, :], df.query(f"{T}==0")[Y])
    m1 = LinearRegression().fit(X[df[T] == 1, :], df.query(f"{T}==1")[Y])

    m0_hat = m0.predict(X)
    m1_hat = m1.predict(X)

    return np.mean(df[T] * (df[Y] - m1_hat) / ps + m1_hat) - np.mean(
        (1 - df[T]) * (df[Y] - m0_hat) / (1 - ps) + m0_hat
    )
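The return line is the augmented IPW (doubly robust) estimator: start from the outcome-model prediction and correct it with the inverse-probability-weighted residual. The result is consistent if either the propensity model \(\hat{e}\) or the outcome models \(\hat{m}_0, \hat{m}_1\) are correctly specified:

\[
\widehat{ATE}_{DR} = \frac{1}{N}\sum_{i=1}^{N}\left[\frac{T_i\big(Y_i - \hat{m}_1(X_i)\big)}{\hat{e}(X_i)} + \hat{m}_1(X_i)\right]
- \frac{1}{N}\sum_{i=1}^{N}\left[\frac{(1 - T_i)\big(Y_i - \hat{m}_0(X_i)\big)}{1 - \hat{e}(X_i)} + \hat{m}_0(X_i)\right]
\]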
formula = """tenure + last_engagement_score + department_score
+ C(n_of_reports) + C(gender) + C(role)"""
T = "intervention"
Y = "engagement_score"

print("DR ATE:", doubly_robust(df, formula, T, Y))

est_fn = partial(doubly_robust, formula=formula, T=T, Y=Y)
print("95% CI", bootstrap(df, est_fn))
DR ATE: 0.27115831057931455
95% CI [0.23012681 0.30524944]

5.5.1 When Treatment Is Easy to Model

np.random.seed(123)

n = 10000
x = np.random.beta(1, 1, n).round(2) * 2
e = 1 / (1 + np.exp(-(1 + 1.5 * x)))
t = np.random.binomial(1, e)

y1 = 1
y0 = 1 - 1 * x**3
y = t * (y1) + (1 - t) * y0 + np.random.normal(0, 1, n)

df_easy_t = pd.DataFrame(dict(y=y, x=x, t=t))

print("True ATE:", np.mean(y1 - y0))
True ATE: 2.0056243152
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

sns.scatterplot(
    data=(
        df_easy_t.assign(size=1)
        .groupby(["x"])
        .agg({"size": "sum", "t": "mean"})
        .reset_index()
    ),
    x="x",
    y="t",
    sizes=(1, 100),
    size="size",
    ax=ax1,
)
ax1.set_title("P(T|X)")

sns.scatterplot(
    data=(
        df_easy_t.assign(size=1)
        .groupby(["x", "t"])
        .agg({"size": "sum", "y": "mean"})
        .reset_index()
    ),
    x="x",
    y="y",
    hue="t",
    sizes=(1, 100),
    size="size",
    ax=ax2,
)

ax2.set_title("E[Y|X, T]")

h, l = ax2.get_legend_handles_labels()
plt.legend(h[0:3], l[0:3])

m0 = smf.ols("y~x", data=df_easy_t.query("t==0")).fit()
m1 = smf.ols("y~x", data=df_easy_t.query("t==1")).fit()
regr_ate = (m1.predict(df_easy_t) - m0.predict(df_easy_t)).mean()

print("Regression ATE:", regr_ate)
Regression ATE: 1.786678396833022
regr = smf.ols("y~x*t", data=df_easy_t).fit()

plt.figure(figsize=(10, 4))

sns.scatterplot(
    data=(
        df_easy_t.assign(count=1)
        .groupby(["x", "t"])
        .agg({"count": "sum", "y": "mean"})
        .reset_index()
    ),
    x="x",
    y="y",
    hue="t",
    sizes=(1, 100),
    size="count",
)
g = sns.lineplot(
    data=(
        df_easy_t.assign(pred=regr.fittedvalues)
        .groupby(["x", "t"])
        .mean()
        .reset_index()
    ),
    x="x",
    y="pred",
    hue="t",
    sizes=(1, 100),
)

h, l = g.get_legend_handles_labels()
plt.legend(h[0:3], l[0:3])

m = smf.ols("y~t*(x + np.power(x, 3))", data=df_easy_t).fit()
regr_ate = (m.predict(df_easy_t.assign(t=1)) - m.predict(df_easy_t.assign(t=0))).mean()

print("Regression ATE:", regr_ate)
Regression ATE: 1.9970999747190072
est_fn = partial(est_ate_with_ps, ps_formula="x", T="t", Y="y")
print("Propensity Score ATE:", est_fn(df_easy_t))
print("95% CI", bootstrap(df_easy_t, est_fn))
Propensity Score ATE: 2.002350388474011
95% CI [1.80802227 2.22565667]
est_fn = partial(doubly_robust, formula="x", T="t", Y="y")
print("DR ATE:", est_fn(df_easy_t))
print("95% CI", bootstrap(df_easy_t, est_fn))
DR ATE: 2.001617934263116
95% CI [1.87088771 2.145382  ]

5.5.2 When the Outcome Is Easy to Model

np.random.seed(123)

n = 10000
x = np.random.beta(1, 1, n).round(2) * 2
e = 1 / (1 + np.exp(-(2 * x - x**3)))
t = np.random.binomial(1, e)

y1 = x
y0 = y1 + 1  # ate of -1
y = t * (y1) + (1 - t) * y0 + np.random.normal(0, 1, n)

df_easy_y = pd.DataFrame(dict(y=y, x=x, t=t))

print("True ATE:", np.mean(y1 - y0))
True ATE: -1.0

The same kind of plot from before can be used to show the complex functional form for \(P(T|X)\) and the simplicity of \(E[Y_t|X]\).

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

sns.scatterplot(
    data=(
        df_easy_y.assign(size=1)
        .groupby(["x"])
        .agg({"size": "sum", "t": "mean"})
        .reset_index()
    ),
    x="x",
    y="t",
    sizes=(1, 100),
    size="size",
    ax=ax1,
    legend=None,
)
sns.scatterplot(
    data=(
        df_easy_y.assign(size=1)
        .groupby(["x", "t"])
        .agg({"size": "sum", "y": "mean"})
        .reset_index()
    ),
    x="x",
    y="y",
    hue="t",
    sizes=(1, 100),
    size="size",
    ax=ax2,
)
ax1.set_title("P(T|X)")
ax2.set_title("E[Y|X, T]")

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)

est_fn = partial(est_ate_with_ps, ps_formula="x", T="t", Y="y")
print("Propensity Score ATE:", est_fn(df_easy_y))
print("95% CI", bootstrap(df_easy_y, est_fn))
Propensity Score ATE: -1.1042900278680896
95% CI [-1.14326893 -1.06576358]
m0 = smf.ols("y~x", data=df_easy_y.query("t==0")).fit()
m1 = smf.ols("y~x", data=df_easy_y.query("t==1")).fit()
regr_ate = (m1.predict(df_easy_y) - m0.predict(df_easy_y)).mean()

print("Regression ATE:", regr_ate)
Regression ATE: -1.0008783612504342
est_fn = partial(doubly_robust, formula="x", T="t", Y="y")
print("DR ATE:", est_fn(df_easy_y))
print("95% CI", bootstrap(df_easy_y, est_fn))
DR ATE: -1.0028459347805823
95% CI [-1.04156952 -0.96353366]

5.6 Generalized Propensity Score for Continuous Treatments

df_cont_t = pd.read_csv("../data/interest_rate.csv")

df_cont_t.head()
ml_1 ml_2 interest duration
0 0.392938 0.326285 7.1 12.0
1 -0.427721 0.679573 5.6 17.0
2 -0.546297 0.647309 11.1 12.0
3 0.102630 -0.264776 7.2 18.0
4 0.438938 -0.648818 9.5 19.0
m_naive = smf.ols("duration ~ interest", data=df_cont_t).fit()
m_naive.summary().tables[1]
coef std err t P>|t| [0.025 0.975]
Intercept 14.5033 0.226 64.283 0.000 14.061 14.946
interest 0.3393 0.029 11.697 0.000 0.282 0.396
model_t = smf.ols("interest~ml_1+ml_2", data=df_cont_t).fit()
def conditional_density(x, mean, std):
    denom = std * np.sqrt(2 * np.pi)
    num = np.exp(-((1 / 2) * ((x - mean) / std) ** 2))
    return (num / denom).ravel()
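conditional_density is just the normal density evaluated at the observed treatment, with the fitted value of the treatment regression as the mean and the residual standard deviation as the scale:

\[
\hat{f}(t \mid x) = \frac{1}{\hat{\sigma}\sqrt{2\pi}}
\exp\!\left(-\frac{1}{2}\left(\frac{t - \hat{\mu}(x)}{\hat{\sigma}}\right)^{2}\right)
\]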


gps = conditional_density(
    df_cont_t["interest"], model_t.fittedvalues, np.std(model_t.resid)
)
gps
array([0.1989118 , 0.14524168, 0.03338421, ..., 0.07339096, 0.19365006,
       0.15732008])
from scipy.stats import norm

gps = norm(loc=model_t.fittedvalues, scale=np.std(model_t.resid)).pdf(
    df_cont_t["interest"]
)
gps
array([0.1989118 , 0.14524168, 0.03338421, ..., 0.07339096, 0.19365006,
       0.15732008])
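Both snippets compute the generalized propensity score: with a continuous treatment there is no probability of treatment to invert, so the conditional density of the treatment actually received takes its place, and its inverse serves as the weight:

\[
e(t, x) \equiv f_{T \mid X}(t \mid x), \qquad w_i = \frac{1}{e(t_i, x_i)}
\]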
Note: Beyond the Normal

If the treatment follows a distribution other than the normal, you can use a generalized linear model (GLM) to fit it. For example, if \(T\) were assigned according to a Poisson distribution, you could build the GPS weights with something like the following code:

import statsmodels.api as sm
from scipy.stats import poisson

# fit a Poisson model for the treatment
mt = smf.glm("t~x1+x2",
             data=df, family=sm.families.Poisson()).fit()

# GPS: probability mass of the observed treatment under the fitted model
gps = poisson(mu=mt.fittedvalues).pmf(df["t"])

w = 1/gps

Using the inverse of the GPS as weights in a regression model can adjust for the bias. You can see that now you’ll find a negative effect of interest on duration, which makes more business sense.

final_model = smf.wls("duration~interest", data=df_cont_t, weights=1 / gps).fit()

final_model.params["interest"]
-0.6673977919925854
np.random.seed(42)
sample = df_cont_t.sample(2000)


model_ex = smf.ols("interest~ml_1", data=sample).fit()
gps_ex = norm(loc=model_ex.fittedvalues, scale=np.std(model_ex.resid)).pdf(
    sample["interest"]
)

w_ex = 1 / gps_ex

m = smf.ols("duration~interest", data=sample).fit()

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 6))

sns.scatterplot(
    data=sample, x="interest", y="duration", alpha=1, ax=ax1, hue="ml_1", palette="gray"
)
ax1.plot(sample["interest"], m.fittedvalues)

ax1.set_ylabel("Duration")
ax1.set_xlabel("Interest")
ax1.set_title("Original Data")
sns.scatterplot(
    data=sample.assign(w=w_ex),
    x="ml_1",
    y="interest",
    size="w",
    sizes=(1, 200),
    alpha=0.5,
    ax=ax2,
)
ax2.plot(sample["ml_1"], model_ex.fittedvalues)
ax2.set_ylabel("Interest")
ax2.set_xlabel("ML-1")
ax2.set_title("Weights")


m = smf.wls("duration~interest", data=sample, weights=w_ex).fit()

sns.scatterplot(
    data=sample.assign(w=w_ex),
    x="interest",
    y="duration",
    size="w",
    sizes=(1, 200),
    alpha=1,
    ax=ax3,
    hue="ml_1",
    legend=None,
    palette="gray",
)
ax3.plot(sample["interest"], m.fittedvalues)
ax3.set_ylabel("Duration")
ax3.set_xlabel("Interest")
ax3.set_title("Weighted Data")

plt.tight_layout()

stabilizer = norm(
    loc=df_cont_t["interest"].mean(),
    scale=np.std(df_cont_t["interest"] - df_cont_t["interest"].mean()),
).pdf(df_cont_t["interest"])

gipw = stabilizer / gps

print("Original Sample Size:", len(df_cont_t))
print("Effective Stable Weights Sample Size:", sum(gipw))
Original Sample Size: 10000
Effective Stable Weights Sample Size: 9988.19595174861
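The stabilized weight divides the marginal density of the treatment by its conditional density, which keeps the effective sample size close to the original. Here both densities are approximated as normal, the marginal one using the mean and standard deviation of interest:

\[
w_i = \frac{\hat{f}_T(t_i)}{\hat{f}_{T \mid X}(t_i \mid x_i)}
\]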
np.random.seed(42)
sample = df_cont_t.sample(2000)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 6))

model_ex = smf.ols("interest~ml_1", data=sample).fit()
gps_ex = norm(loc=model_ex.fittedvalues, scale=np.std(model_ex.resid)).pdf(
    sample["interest"]
)
stabilizer_ex = norm(
    loc=sample["interest"].mean(),
    scale=np.std(sample["interest"] - sample["interest"].mean()),
).pdf(sample["interest"])


w_ex = stabilizer_ex / gps_ex

m = smf.ols("duration~interest", data=sample).fit()

sns.scatterplot(
    data=sample, x="interest", y="duration", alpha=1, ax=ax1, hue="ml_1", palette="gray"
)
ax1.plot(sample["interest"], m.fittedvalues, lw=3)
ax1.set_ylabel("Duration")
ax1.set_xlabel("Interest")
ax1.set_title("Original Data")
sns.scatterplot(
    data=sample.assign(w=w_ex),
    x="ml_1",
    y="interest",
    size="w",
    sizes=(1, 100),
    alpha=0.5,
    ax=ax2,
)

ax2.plot(sample["ml_1"], model_ex.fittedvalues, label="E[t|x]", lw=3)

ax2.hlines(sample["interest"].mean(), -1, 1, label="E[t]", color="C2", lw=3)
ax2.set_ylabel("Interest")
ax2.set_xlabel("ML-1")
ax2.set_title("Weights")

h, l = ax2.get_legend_handles_labels()
ax2.legend(
    h[5:],
    l[5:],
)

m = smf.wls("duration~interest", data=sample, weights=w_ex).fit()

sns.scatterplot(
    data=sample.assign(w=w_ex),
    x="interest",
    y="duration",
    size="w",
    sizes=(1, 100),
    alpha=1,
    ax=ax3,
    hue="ml_1",
    legend=None,
    palette="gray",
)
ax3.plot(sample["interest"], m.fittedvalues, lw=3)
ax3.set_ylabel("Duration")
ax3.set_xlabel("Interest")
ax3.set_title("Weighted Data")

final_model = smf.wls("duration ~ interest", data=df_cont_t, weights=gipw).fit()

final_model.params["interest"]
-0.7787046278134069
def gps_normal_ate(df, formula, T, Y, stable=True):
    mt = smf.ols(f"{T}~" + formula, data=df).fit()
    gps = norm(loc=mt.fittedvalues, scale=np.std(mt.resid)).pdf(df[T])
    stabilizer = norm(loc=df[T].mean(), scale=np.std(df[T] - df[T].mean())).pdf(df[T])

    if stable:
        return smf.wls(f"{Y}~{T}", data=df, weights=stabilizer / gps).fit().params[T]
    else:
        return smf.wls(f"{Y}~{T}", data=df, weights=1 / gps).fit().params[T]


print(
    "95% CI, non-stable: ",
    bootstrap(
        df_cont_t,
        partial(
            gps_normal_ate,
            formula="ml_1 + ml_2",
            T="interest",
            Y="duration",
            stable=False,
        ),
    ),
)
print(
    "95% CI, stable: ",
    bootstrap(
        df_cont_t,
        partial(gps_normal_ate, formula="ml_1 + ml_2", T="interest", Y="duration"),
    ),
)
95% CI, non-stable:  [-0.81074164 -0.52605933]
95% CI, stable:  [-0.85834311 -0.71001914]

5.7 Summary

np.random.seed(42)

n = 100

df1 = pd.DataFrame(
    dict(
        x=1,
        t=np.random.beta(2.5, 1, n),
    )
).assign(y=lambda d: np.random.normal(d["x"] + d["t"], 0.1))

df2 = pd.DataFrame(
    dict(
        x=2,
        t=np.random.beta(1, 1, n),
    )
).assign(y=lambda d: np.random.normal(d["x"] + d["t"], 0.1))

df3 = pd.DataFrame(
    dict(
        x=3,
        t=np.random.beta(1, 2.5, n),
    )
).assign(y=lambda d: np.random.normal(d["x"] + d["t"], 0.1))


df_example = pd.concat([df1, df2, df3])

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 6))

m = smf.ols("y~t", data=df_example).fit()
sns.scatterplot(data=df_example, x="t", y="y", hue="x", s=50, ax=ax1, palette="gray")
ax1.plot(df_example["t"], m.fittedvalues)
ax1.set_title("Biased Data")

## Orthogonalization

mt = smf.ols("t~x", data=df_example).fit()
my = smf.ols("y~x", data=df_example).fit()

df_ex_res = df_example.assign(
    t_res=mt.resid + df_example["t"].mean(), y_res=my.resid + df_example["y"].mean()
)

m_final = smf.ols("y_res~t_res", data=df_ex_res).fit()
sns.scatterplot(
    data=df_ex_res,
    x="t_res",
    y="y_res",
    hue="x",
    s=50,
    ax=ax2,
    legend=None,
    palette="gray",
)
ax2.plot(df_ex_res["t_res"], m_final.fittedvalues)

ax2.set_ylim(df_example["y"].min(), df_example["y"].max())
ax2.set_xlim(df_example["t"].min(), df_example["t"].max())

ax2.set_ylabel("y - E[y|x] + E[y]")
ax2.set_xlabel("t - E[t|x] + E[t]")
ax2.set_title("Orthogonalization")


## IPW
gps_example = norm(loc=mt.fittedvalues, scale=np.std(mt.resid)).pdf(df_example["t"])
stabilizer_example = norm(
    loc=df_example["t"].mean(), scale=np.std(df_example["t"] - df_example["t"].mean())
).pdf(df_example["t"])


w_example = stabilizer_example / gps_example
mw = smf.wls("y~t", data=df_example, weights=w_example).fit()


sns.scatterplot(
    data=df_example.assign(w=w_example),
    x="t",
    y="y",
    hue="x",
    ax=ax3,
    size="w",
    legend=None,
    sizes=(11, 100),
    palette="gray",
)
ax3.plot(df_example["t"], mw.fittedvalues)
ax3.set_title("IPW")

plt.tight_layout()