Simulation Data

Chapter 1

import pandas as pd
import numpy as np

units = range(1, 500 + 1)

np.random.seed(42)

data = (
    pd.DataFrame(
        dict(
            store=np.repeat(units, 4),
            unit_fe=np.repeat(np.random.normal(0, 5, size=len(units)), 4),
            weeks_to_xmas=np.tile([3, 2, 1, 0], len(units)),
            avg_week_sales=np.repeat(np.random.gamma(20, 1, size=len(units)), 4).round(
                2
            ),
        )
    )
    .assign(
        is_on_sale=lambda d: (
            (
                (d["unit_fe"] > 0) * np.random.binomial(1, 0.2, d.shape[0])
                | (d["avg_week_sales"] > np.random.gamma(25, 1, size=len(units) * 4))
                | (np.random.normal(d["weeks_to_xmas"], 2) > 2)
                * np.random.binomial(1, 0.7, size=d.shape[0])  # xmas
            )
            * 1
        ),
    )
    .assign(
        y0=lambda d: (
            -10
            + 10 * d["unit_fe"]
            + d["avg_week_sales"]
            + 2 * d["avg_week_sales"] * d["weeks_to_xmas"]
        )
    )
    .assign(y1=lambda d: d["y0"] + 50)
    .assign(
        tau=lambda d: d["y1"] - d["y0"],
        weekly_amount_sold=lambda d: (
            np.random.normal(np.where(d["is_on_sale"] == 1, d["y1"], d["y0"]), 10)
            .clip(0, np.inf)
            .round(2)
        ),
    )
    .drop(columns=["unit_fe", "y0", "y1", "tau"])
)

data.to_csv("../data/xmas_sales.csv", index=False)
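
A quick sanity check, as a minimal sketch assuming the file above was just written: the true effect of is_on_sale is 50 by construction (y1 = y0 + 50), but is_on_sale depends on unit_fe, avg_week_sales, and weeks_to_xmas, so the naive comparison below should come out biased away from 50.

import pandas as pd

check = pd.read_csv("../data/xmas_sales.csv")
# difference in means between on-sale and not-on-sale weeks
naive_diff = check.groupby("is_on_sale")["weekly_amount_sold"].mean().diff().iloc[-1]
print(f"naive difference in means: {naive_diff:.2f} (true effect is 50)")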
import pandas as pd
import numpy as np

pd.set_option("display.max_rows", 5)

np.random.seed(123)
data = (
    pd.read_csv("../data/online_classroom.csv")
    .assign(
        cross_sell_email=lambda d: np.select(
            [d["format_ol"].astype(bool), d["format_blended"].astype(bool)],
            ["no_email", "long"],
            default="short",
        )
    )
    .assign(age=lambda d: np.random.gamma(1.5, 5, d.shape[0]).astype(int) + 14)
    .assign(conversion=lambda d: ((d["falsexam"] - d["age"]) > 70).astype(int))
    .drop(
        columns=[
            "asian",
            "black",
            "hawaiian",
            "hispanic",
            "unknown",
            "white",
            "format_ol",
            "format_blended",
            "falsexam",
        ]
    )
)

data.to_csv("../data/cross_sell_email.csv", index=False)
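
A hypothetical usage snippet, assuming the CSV above exists: since cross_sell_email derives from the original class-format randomization, the per-arm conversion rate is the natural first look at this data.

import pandas as pd

emails = pd.read_csv("../data/cross_sell_email.csv")
print(emails.groupby("cross_sell_email")["conversion"].mean())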

Chapter 4

np.random.seed(123)
data = (
    pd.read_csv("../data/online_classroom.csv")
    .assign(
        recommender=lambda d: np.select(
            [d["format_ol"].astype(bool), d["format_blended"].astype(bool)],
            ["benchmark", "benchmark"],
            default="challenger",
        )
    )
    .assign(age=lambda d: np.random.gamma(1.5, 5, d.shape[0]).astype(int) + 14)
    .assign(tenure=lambda d: np.random.gamma(1, 1.1, d.shape[0]).astype(int).clip(0, 4))
    .assign(
        watch_time=lambda d: np.random.normal(
            3 * (d["recommender"] == "challenger")
            + 10 * d["tenure"]
            - 0.8 * d["age"]
            + 0.001 * d["age"] ** 2,
            11,
        )
    )
    .assign(
        watch_time=lambda d: (
            (d["watch_time"] - d["watch_time"].min())
            / (d["watch_time"].max() + 20 - d["watch_time"].min())
            * 6
        ).round(2)
    )
    .drop(
        columns=[
            "asian",
            "gender",
            "black",
            "hawaiian",
            "hispanic",
            "unknown",
            "white",
            "format_ol",
            "format_blended",
            "falsexam",
        ]
    )
)

data.to_csv("../data/rec_ab_test.csv", index=False)
np.random.seed(123)
risk_data = (
    pd.read_csv("../data/wage.csv")
    .sample(50000, replace=True)
    .assign(wage=lambda d: np.random.normal(100 + d["wage"], 50).round(-1))
    .assign(
        credit_score1=lambda d: np.random.normal(
            100 + 0.1 * d["wage"] + d["urban"] + d["educ"] + 2 * d["exper"], 50
        ).round(-1)
    )
    .assign(
        credit_score2=lambda d: np.random.normal(
            100 + 0.05 * d["wage"] + d["hours"] + d["age"] + 2 * d["IQ"], 50
        ).round(-1)
    )
    .assign(
        credit_score1=lambda d: np.round(
            1000
            * (d["credit_score1"] - d["credit_score1"].min() + 20)
            / (d["credit_score1"].max() - d["credit_score1"].min()),
            0,
        )
    )
    .assign(
        credit_score2=lambda d: np.round(
            1000
            * (d["credit_score2"] - d["credit_score2"].min() + 20)
            / (d["credit_score2"].max() - d["credit_score2"].min()),
            0,
        )
    )
    .assign(
        credit_limit=lambda d: (
            np.random.normal(
                500
                + 1.5 * pd.IntervalIndex(pd.qcut(d["credit_score1"], 10)).mid
                + 1.5 * pd.IntervalIndex(pd.qcut(d["wage"], 10)).mid,
                1000,
            )
            .round(-2)
            .clip(200, np.inf)
        )
    )
    .assign(
        default=lambda d: (
            np.random.normal(
                +0
                - 0.1 * d["credit_score1"]
                - 0.5 * d["credit_score2"]
                + 0.05 * d["IQ"]
                + 0.005 * d["credit_limit"]
                - 0.17 * d["wage"],
                375,
            )
            > -50
        ).astype(int)
    )
    .drop(
        columns=[
            "IQ",
            "lhwage",
            "tenure",
            "black",
            "hours",
            "sibs",
            "south",
            "urban",
            "brthord",
            "meduc",
            "feduc",
            "age",
        ]
    )
    .reset_index(drop=True)
)

spend_data = risk_data.assign(
    spend=lambda d: np.random.normal(
        50
        + 120 * np.power(d["credit_limit"], 1 / 2.5)
        - d["credit_score1"]
        + 1.2 * d["wage"]
        - 10 * d["exper"]
        - 10 * d["educ"]
        + 400 * d["married"]
    ).astype(int)
).drop(columns=["default"])

risk_data.to_csv("../data/risk_data.csv", index=False)
spend_data.to_csv("../data/spend_data.csv", index=False)
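
By construction, credit_limit is assigned from credit_score1 and wage, and default also depends on the scores and wage, so the raw credit_limit vs. default relationship is confounded. A minimal sketch of that raw association, assuming the files above exist:

import pandas as pd

risk = pd.read_csv("../data/risk_data.csv")
# default rate by credit-limit decile; this gradient mixes the causal effect
# of credit_limit with confounding from wage and the credit scores
limit_decile = pd.qcut(risk["credit_limit"], 10, duplicates="drop")
print(risk.groupby(limit_decile, observed=True)["default"].mean())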
np.random.seed(123)
risk_data_rnd = (
    pd.read_csv("../data/wage.csv")
    .sample(10000, replace=True)
    .assign(wage=lambda d: np.random.normal(100 + d["wage"], 50).round(-1))
    .assign(
        credit_score1=lambda d: np.random.normal(
            100 + 0.1 * d["wage"] + d["urban"] + d["educ"] + 2 * d["exper"], 50
        ).round(-1)
    )
    .assign(
        credit_score2=lambda d: np.random.normal(
            100 + 0.05 * d["wage"] + d["hours"] + d["age"] + 2 * d["IQ"], 50
        ).round(-1)
    )
    .assign(
        credit_score1=lambda d: np.round(
            1000
            * (d["credit_score1"] - d["credit_score1"].min() + 20)
            / (d["credit_score1"].max() - d["credit_score1"].min()),
            0,
        )
    )
    .assign(
        credit_score2=lambda d: np.round(
            1000
            * (d["credit_score2"] - d["credit_score2"].min() + 20)
            / (d["credit_score2"].max() - d["credit_score2"].min()),
            0,
        )
    )
    .assign(
        credit_score1_buckets=lambda d: (
            (d["credit_score1"] / 200).round(0) * 200
        ).astype(int)
    )
    .assign(
        credit_limit=lambda d: (
            np.random.beta(1 + d["credit_score1_buckets"] / 100, 6) * 10000
        ).round(-2)
    )
    .assign(
        default=lambda d: (
            np.random.normal(
                +0
                - 0.1 * d["credit_score1"]
                - 0.5 * d["credit_score2"]
                + 0.05 * d["IQ"]
                + 0.5 * d["sibs"]
                - 0.5 * d["feduc"]
                + 0.005 * d["credit_limit"]
                - 0.17 * d["wage"],
                100,
            )
            > -300
        ).astype(int)
    )
    .drop(
        columns=[
            "IQ",
            "lhwage",
            "tenure",
            "black",
            "hours",
            "sibs",
            "south",
            "urban",
            "brthord",
            "meduc",
            "feduc",
            "age",
        ]
    )
    .reset_index(drop=True)
)

risk_data_rnd.to_csv("../data/risk_data_rnd.csv", index=False)
np.random.seed(123)
spend_data_rnd = (
    pd.read_csv("../data/wage.csv")
    .sample(500, replace=True)
    .assign(wage=lambda d: np.random.normal(100 + d["wage"], 50).round(-1))
    .assign(credit_score1=lambda d: np.random.normal(100, 50, len(d)).round(-1))
    .assign(
        credit_score1=lambda d: np.round(
            1000
            * (d["credit_score1"] - d["credit_score1"].min() + 20)
            / (d["credit_score1"].max() - d["credit_score1"].min()),
            0,
        )
    )
    .assign(
        credit_score1_buckets=lambda d: (
            (d["credit_score1"] / 200).round(0) * 200
        ).astype(int)
    )
    .assign(
        credit_limit=lambda d: (
            np.random.beta(1 + d["credit_score1_buckets"] / 100, 6) * 10000
        ).round(-2)
    )
    .assign(
        spend=lambda d: np.random.normal(
            500
            + 20 * np.power(d["credit_limit"], 1 / 2)
            + 1.2 * d["wage"]
            - 10 * d["exper"]
            - 10 * d["educ"]
            + 400 * d["married"],
            600,
        ).astype(int)
    )
    .drop(
        columns=[
            "IQ",
            "lhwage",
            "tenure",
            "black",
            "hours",
            "sibs",
            "south",
            "urban",
            "brthord",
            "meduc",
            "feduc",
            "age",
        ]
    )
    .reset_index(drop=True)
)

spend_data_rnd.to_csv("../data/spend_data_rnd.csv", index=False)

Chapter 5

import numpy as np
import pandas as pd

np.random.seed(123)

df = (
    pd.read_csv(
        "https://raw.githubusercontent.com/matheusfacure/python-causality-handbook/master/causal-inference-for-the-brave-and-true/data/learning_mindset.csv"
    )
    .rename(
        columns={
            "schoolid": "departament_id",
            "achievement_score": "engagement_score",
            "success_expect": "tenure",
            "school_achievement": "last_engagement_score",
            "school_urbanicity": "role",
            "school_poverty": "department_score",
            "school_size": "department_size",
            "ethnicity": "n_of_reports",
        }
    )
    # reduce overlap for better examples
    .assign(
        intervention=lambda d: (
            d["intervention"].astype(bool)
            | (np.random.normal(d["last_engagement_score"]) > 1.65)
        ).astype(int)
    )
    .assign(
        intervention=lambda d: (
            d["intervention"].astype(bool) | (np.random.normal(d["tenure"], 2) > 7)
        ).astype(int)
    )
    .assign(n_of_reports=lambda d: d["n_of_reports"].clip(0, 8))
    .assign(
        department_size=lambda d: d.groupby("departament_id")["n_of_reports"].transform(
            "sum"  # pass the string to avoid the pandas FutureWarning on built-in sum
        )
    )
    .assign(
        last_engagement_score=lambda d: np.random.normal(d["last_engagement_score"])
    )
    .drop(columns=["frst_in_family", "school_mindset", "school_ethnic_minority"])
)


df.to_csv("../data/management_training.csv", index=False)
np.random.seed(123)

n = 10000

x1 = np.random.uniform(-1, 1, n)
x2 = np.random.uniform(-1, 1, n)
t = np.random.normal(7.5 - 1 * (x1 + x2), 2).clip(1, 14).round(1)
y = np.random.normal((23 - 0.8 * t - 8 * (x1 + x2))).clip(1, np.inf).round(0)

df_cont_t = pd.DataFrame(dict(ml_1=x1, ml_2=x2, interest=t, duration=y))

df_cont_t.to_csv("../data/interest_rate.csv", index=False)
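
Since duration is generated as 23 - 0.8 * interest - 8 * (ml_1 + ml_2) plus noise (then clipped and rounded), a regression that adjusts for both covariates should recover a coefficient near -0.8 on interest. A minimal check, assuming statsmodels is installed:

import pandas as pd
import statsmodels.formula.api as smf

rates = pd.read_csv("../data/interest_rate.csv")
model = smf.ols("duration ~ interest + ml_1 + ml_2", data=rates).fit()
print(model.params["interest"])  # should be close to -0.8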

Chapter 6

import pandas as pd
from pandas.tseries import holiday

np.random.seed(123)


month_fe = {
    1: 23,
    2: 11,
    3: 10,
    4: 9,
    5: 4,
    6: 2,
    7: 10,
    8: 5,
    9: 10,
    10: 20,
    11: 25,
    12: 30,
}

week_fe = {0: 2, 1: 2, 2: 3, 3: 7, 4: 15, 5: 12, 6: 5}

store_fe = {0: 20, 1: 14, 2: 3, 3: 10, 4: 9, 5: 12, 6: 5}

day = pd.Series(pd.date_range("2016-01-01", "2019-01-01"))
weekend = day.dt.weekday >= 5
is_holiday = day.isin(
    holiday.USFederalHolidayCalendar().holidays("2016-01-01", "2019-01-01")
)
is_dec = day.dt.month == 12
is_nov = day.dt.month == 11

time_data = pd.DataFrame(
    dict(
        day=day,
        month=day.dt.month,
        weekday=day.dt.weekday,
        weekend=weekend,
        is_holiday=is_holiday,
        is_dec=day.dt.month == 12,
        is_nov=day.dt.month == 11,
    )
).assign(
    month_fe=lambda d: d["month"].map(month_fe),
    week_fe=lambda d: d["weekday"].map(week_fe),
)

data = (
    pd.concat([time_data.assign(rest_id=i) for i in range(len(store_fe))])
    .assign(store_fe=lambda d: d["rest_id"].map(store_fe))
    .assign(
        competitors_price=lambda d: (
            np.random.beta(
                5 + d["is_holiday"] + d["is_dec"] * 2 + d["is_nov"],
                0.5 * d["store_fe"],
                len(d),
            )
            * 9
            + 1
        ).round(2)
    )
    .assign(
        sales0=lambda d: np.random.poisson(
            d["month_fe"]
            + d["store_fe"]
            + d["week_fe"]
            + 3 * d["competitors_price"]
            + 10 * d["is_holiday"]
            + 5 * d["weekend"]
        )
    )
    .assign(
        effect=lambda d: np.random.poisson(
            150
            + d["month_fe"]
            - d["store_fe"]
            + d["week_fe"]
            - 40 * np.sqrt(d["competitors_price"])
            + 5 * d["is_holiday"]
            - 3 * d["weekend"]
        )
    )
    .assign(
        discounts=lambda d: (
            ((np.random.beta(1, 3, size=len(d)) * 50) / 5).astype(int) * 5
        )
    )
    .assign(sales=lambda d: d["sales0"] + 0.5 * d["effect"] * d["discounts"])[
        [
            "rest_id",
            "day",
            "month",
            "weekday",
            "weekend",
            "is_holiday",
            "is_dec",
            "is_nov",
            "competitors_price",
            "discounts",
            "sales",
        ]
    ]
)


data.to_csv("../data/daily_restaurant_sales.csv", index=False)

Chapter 7

np.random.seed(42)

categs = {
    "vehicle": 0.1,
    "food": 1,
    "beverage": 1,
    "art": 1,
    "baby": 1,
    "personal_care": 1,
    "toys": 1,
    "clothing": 2,
    "decor": 1,
    "cell_phones": 3,
    "construction": 1,
    "home_appliances": 1,
    "electronics": 2,
    "sports": 1,
    "tools": 1,
    "games": 2,
    "industry": 1,
    "pc": 2,
    "jewel": 1,
    "books": 1,
    "music_books_movies": 1,
    "health": 2,
}


n = 10000

age = (18 + np.random.beta(2, 7, n) * 90).round(0)
tenure = np.random.exponential(0.5, n).round(0)
ammount_spent = (
    np.random.exponential(1, n) * 100
    + np.random.binomial(1, 0.01, n) * np.random.uniform(500, 50000, n)
).round(2)

categ_purchase = {cat: np.random.poisson(lam, n) for cat, lam in categs.items()}

X = pd.DataFrame(categ_purchase).assign(
    age=age,
    tenure=tenure,
    ammount_spent=ammount_spent,
)

mkt_email_rnd = np.random.binomial(1, 0.5, n)

coefs = np.concatenate([np.random.uniform(-1, 1, len(categs)), np.array([1, 20, 0.4])])
y0 = np.random.exponential(X.values.dot(coefs))

cate_coef = np.concatenate(
    [np.random.uniform(0, 100, len(categs)), np.array([-1, -20, 0])]
)

tau = np.random.normal(X.values.dot(cate_coef))

data_rnd = X.assign(mkt_email=mkt_email_rnd, next_mnth_pv=y0 + tau * mkt_email_rnd)[
    ["mkt_email", "next_mnth_pv", "age", "tenure", "ammount_spent"]
    + list(categs.keys())
].round(2)


data_rnd.to_csv("../data/email_rnd_data.csv", index=False)
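
Because mkt_email is randomized here, independently of X, a simple difference in means is an unbiased estimate of the average effect in this sample. A minimal sketch, assuming the file above was written:

import pandas as pd

rnd = pd.read_csv("../data/email_rnd_data.csv")
print(rnd.groupby("mkt_email")["next_mnth_pv"].mean().diff().iloc[-1])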
np.random.seed(2)

n = 300000

age = (18 + np.random.beta(2, 7, n) * 90).round(0)
tenure = np.random.exponential(0.5, n).round(0)
ammount_spent = (
    np.random.exponential(1, n) * 100
    + np.random.binomial(1, 0.01, n) * np.random.uniform(500, 50000, n)
).round(2)

categ_purchase = {cat: np.random.poisson(lam, n) for cat, lam in categs.items()}

X = pd.DataFrame(categ_purchase).assign(
    age=age,
    tenure=tenure,
    ammount_spent=ammount_spent,
)

coefs_ps = np.concatenate(
    [np.random.uniform(-1, 1, len(categs)), np.array([-1, 40, 0.4])]
)


mkt_email_biased = (np.random.normal(X.values.dot(coefs_ps), 2000) > 0).astype(int)

y0 = np.random.exponential(X.values.dot(coefs))
tau = np.random.normal(X.values.dot(cate_coef))

data_biased = X.assign(
    mkt_email=mkt_email_biased, next_mnth_pv=y0 + tau * mkt_email_biased
)[
    ["mkt_email", "next_mnth_pv", "age", "tenure", "ammount_spent"]
    + list(categs.keys())
].round(2)

data_biased.to_csv("../data/email_obs_data.csv", index=False)
import pandas as pd
import numpy as np

from pandas.tseries import holiday

np.random.seed(123)


month_fe = {
    1: 23,
    2: 11,
    3: 10,
    4: 9,
    5: 4,
    6: 2,
    7: 10,
    8: 5,
    9: 10,
    10: 20,
    11: 25,
    12: 30,
}

week_fe = {0: 2, 1: 2, 2: 3, 3: 7, 4: 15, 5: 12, 6: 5}

store_fe = {0: 20, 1: 14, 2: 3, 3: 10, 4: 9, 5: 12, 6: 5}

day = pd.Series(pd.date_range("2016-01-01", "2019-01-01"))
weekend = day.dt.weekday >= 5
is_holiday = day.isin(
    holiday.USFederalHolidayCalendar().holidays("2016-01-01", "2019-01-01")
)
is_dec = day.dt.month == 12
is_nov = day.dt.month == 11

time_data = pd.DataFrame(
    dict(
        day=day,
        month=day.dt.month,
        weekday=day.dt.weekday,
        weekend=weekend,
        is_holiday=is_holiday,
        is_dec=day.dt.month == 12,
        is_nov=day.dt.month == 11,
    )
).assign(
    month_fe=lambda d: d["month"].map(month_fe),
    week_fe=lambda d: d["weekday"].map(week_fe),
)

data_cont = (
    pd.concat([time_data.assign(rest_id=i) for i in range(len(store_fe))])
    .assign(store_fe=lambda d: d["rest_id"].map(store_fe))
    .assign(
        competitors_price=lambda d: (
            np.random.beta(
                5 + d["is_holiday"] + d["is_dec"] * 2 + d["is_nov"],
                0.5 * d["store_fe"],
                len(d),
            )
            * 9
            + 1
        ).round(2)
    )
    .assign(
        sales0=lambda d: np.random.poisson(
            d["month_fe"]
            + d["store_fe"]
            + d["week_fe"]
            + 3 * d["competitors_price"]
            + 10 * d["is_holiday"]
            + 5 * d["weekend"]
        )
    )
    .assign(
        effect=lambda d: np.random.poisson(
            150
            + d["month_fe"]
            - d["store_fe"]
            + d["week_fe"]
            - 40 * np.sqrt(d["competitors_price"])
            + 5 * d["is_holiday"]
            - 3 * d["weekend"]
        )
    )
    .assign(
        discounts=lambda d: (
            ((np.random.beta(1, 3, size=len(d)) * 50) / 5).astype(int) * 5
        )
    )
    .assign(sales=lambda d: d["sales0"] + 0.5 * d["effect"] * d["discounts"])[
        [
            "rest_id",
            "day",
            "month",
            "weekday",
            "weekend",
            "is_holiday",
            "is_dec",
            "is_nov",
            "competitors_price",
            "discounts",
            "sales",
        ]
    ]
)


data_cont.to_csv("../data/discount_data.csv", index=False)

Chapter 8

import numpy as np
import pandas as pd

# generate the data
date = pd.date_range("2021-05-01", "2021-07-31", freq="D")
cohorts = pd.to_datetime(["2021-05-15", "2021-06-04", "2021-06-20"])
poss_regions = ["S", "N", "W", "E"]

reg_ps = dict(zip(poss_regions, [0.3, 0.6, 0.7, 0.8]))
reg_fe = dict(zip(poss_regions, [20, 16, 8, 2]))
reg_trend = dict(zip(poss_regions, [0, 0.2, 0.4, 0.6]))

units = np.array(range(1, 200 + 1))

np.random.seed(123)

unit_reg = np.random.choice(poss_regions, len(units))
exp_trend = np.random.exponential(0.01, len(units))
treated_unit = np.random.binomial(1, np.vectorize(reg_ps.__getitem__)(unit_reg))

# staggered adoption dgp
df = (
    pd.DataFrame(
        dict(
            date=np.tile(date, len(units)),  # tile the full date range across units
            city=np.repeat(units, len(date)),
            region=np.repeat(unit_reg, len(date)),
            treated_unit=np.repeat(treated_unit, len(date)),
            cohort=np.repeat(np.random.choice(cohorts.date, len(units)), len(date)),
            eff_heter=np.repeat(np.random.exponential(1, size=len(units)), len(date)),
            unit_fe=np.repeat(np.random.normal(0, 2, size=len(units)), len(date)),
            time_fe=np.tile(np.random.normal(size=len(date)), len(units)),
            week_day=np.tile(date.weekday, len(units)),
            w_seas=np.tile(abs(5 - date.weekday) % 7, len(units)),
        )
    )
    .assign(
        reg_fe=lambda d: d["region"].map(reg_fe),
        reg_trend=lambda d: d["region"].map(reg_trend),
        reg_ps=lambda d: d["region"].map(reg_ps),
        trend=lambda d: (
            (pd.to_datetime(d["date"]) - pd.to_datetime(d["date"]).min()).dt.days
        ),
        day=lambda d: (
            (pd.to_datetime(d["date"]) - pd.to_datetime(d["date"]).min()).dt.days
        ),
        cohort=lambda d: np.where(
            d["treated_unit"] == 1, d["cohort"], pd.to_datetime("2100-01-01")
        ),
    )
    .assign(
        treated=lambda d: (
            (d["date"] >= d["cohort"]) & (d["treated_unit"] == 1)
        ).astype(int),
    )
    .assign(
        y0=lambda d: np.round(
            10
            + d["treated_unit"]
            + d["reg_trend"] * d["trend"] / 2
            + d["unit_fe"]
            + 0.4 * d["time_fe"]
            + 2 * d["reg_fe"]
            + d["w_seas"] / 5,
            0,
        ),
    )
    .assign(
        #     y0 = lambda d: np.round(d["y0"] + d.groupby("city")["y0"].shift(1).fillna(0)*0.2, 0)
    )
    .assign(
        y1=lambda d: (
            d["y0"]
            + np.minimum(
                0.2
                * (
                    np.maximum(
                        0,
                        (
                            pd.to_datetime(d["date"]) - pd.to_datetime(d["cohort"])
                        ).dt.days,
                    )
                ),
                1,
            )
            * d["eff_heter"]
            * 2
        )
    )
    .assign(
        tau=lambda d: d["y1"] - d["y0"],
        downloads=lambda d: (
            np.where(d["treated"] == 1, d["y1"], d["y0"])
            + np.random.normal(0, 0.7, len(d))
        ),
        #     date = lambda d: pd.to_datetime(d["date"]),
    )
    .round({"downloads": 0})
)

# filter and save the data
reg_filter = ["N", "S", "E", "W"]

mkt_data_all = (
    df.query("region.isin(@reg_filter)")
    .query("date.astype('string') <= '2021-06-01'")
    #                 .assign(cohort = lambda d: np.where(d["cohort"].astype(str) <= "2021-06-01", d["cohort"], "2050-01-01"))
    .drop(
        columns=[
            "reg_fe",
            "time_fe",
            "cohort",
            "w_seas",
            "week_day",
            "reg_trend",
            "trend",
            "day",
            "unit_fe",
            "y0",
            "y1",
            "eff_heter",
            "reg_ps",
            "treated_unit",
        ]
    )
    .assign(post=lambda d: (d["date"].astype(str) >= "2021-05-15").astype(int))
    .assign(treated=lambda d: d.groupby("city")["treated"].transform("max"))
)

mkt_data = mkt_data_all.query("region=='S'")

mkt_data_cohorts = (
    df.assign(post=lambda d: (d["date"] >= d["cohort"]).astype(int))
    .assign(treated=lambda d: d.groupby("city")["treated"].transform("max"))
    .drop(
        columns=[
            "reg_fe",
            "time_fe",
            "w_seas",
            "week_day",
            "reg_trend",
            "trend",
            "day",
            "unit_fe",
            "y0",
            "y1",
            "eff_heter",
            "reg_ps",
            "treated_unit",
        ]
    )
)

mkt_data_cohorts.to_csv("../data/offline_mkt_staggered.csv", index=False)
mkt_data_all.to_csv("../data/short_offline_mkt_all_regions.csv", index=False)
mkt_data.to_csv("../data/short_offline_mkt_south.csv", index=False)
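
The short panel saved above has a 2x2 treated-by-post structure, so a difference-in-differences estimate is the natural summary. A minimal sketch, assuming short_offline_mkt_south.csv was written as above:

import pandas as pd

south = pd.read_csv("../data/short_offline_mkt_south.csv")
means = south.groupby(["treated", "post"])["downloads"].mean()
did = (means.loc[(1, 1)] - means.loc[(1, 0)]) - (means.loc[(0, 1)] - means.loc[(0, 0)])
print(f"diff-in-diff estimate: {did:.2f}")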

Chapter 9

from statsmodels.tsa.arima_process import ArmaProcess


def simulate_series(
    dates,
    name,
    pop,
    pop_pct,
    ws_w,
    ms_w,
    ys_w,
    ar_w,
    trend_w,
    noise_w,
    ar=1,
    ma=7,
):
    ws = abs(5 - dates.weekday) % 7
    ms = abs(dates.day - 10)
    ys = abs(6 - dates.month) % 12

    arma = ArmaProcess(ar, ma).generate_sample(len(dates))
    trend = np.arange(0, len(dates))
    noise = np.random.normal(0, 1, len(dates))

    comps = (ws, ms, ys, arma, trend, noise)
    comp_noise = np.random.uniform(0.95, 1.05, len(comps))

    coefs = (ws_w, ms_w, ys_w, ar_w, trend_w, noise_w)

    result = (
        sum(c * w * noise for c, w, noise in zip(comps, coefs, comp_noise))
        * pop
        * pop_pct
        * 0.005
    )

    return pd.Series(result, name=name).clip(0, np.inf).round(0)
br_cities = pd.read_csv("../data/br_cities.csv")

np.random.seed(3)

states = br_cities["state"].unique()
pop_pct = np.random.beta(5, 10, len(states)) * 0.1
trends = np.random.beta(9, 5, len(states)) * 0.5 - 0.25
week_sas = np.random.uniform(2, 4, len(states))
m_sas = np.random.uniform(0, 1, len(states))
y_sas = np.random.uniform(0, 2, len(states))

states_params = pd.DataFrame(
    dict(
        pop_pct=pop_pct,
        state=states,
        trends=trends,
        week_sas=week_sas,
        m_sas=m_sas,
        y_sas=y_sas,
    )
)

# states_params.head()
br_cities = pd.read_csv("../data/br_cities.csv").merge(states_params, on="state")
np.random.seed(12345)

date = pd.date_range("2022-03-01", "2022-06-30", freq="D")

series = [
    simulate_series(
        date,
        name=city["city"],
        pop=city["pop"] * city["pop_pct"],
        pop_pct=city["pop_pct"],
        ws_w=city["week_sas"],
        ms_w=city["m_sas"],
        ys_w=city["y_sas"],
        ar_w=0.1,
        trend_w=city["trends"],
        noise_w=1000 / np.sqrt(city["pop"] * city["pop_pct"]),
    )
    .round(0)
    .to_frame()
    .assign(
        population=city["pop"],
        city=city["city"],
        state=city["state"],
        date=date,
    )
    .rename(columns={city["city"]: "app_download"})
    for _, city in br_cities.sort_values("pop", ascending=False).head(50).iterrows()
]
features = pd.concat(
    [
        pd.DataFrame(
            dict(
                activity=(
                    ArmaProcess([1, 0.9, 0.8, 0.7], 7).generate_sample(len(date))
                    + row["hdi"]
                ).round(0)
                * 3,
                date=date,
                state=row["state"],
                city=row["city"],
            )
        )
        for _, row in br_cities.sort_values("pop", ascending=False).head(50).iterrows()
    ]
)
def simulate_effect(df, city_col, date_col, y_col, city, start_at, window, effect_pct):
    window_mask = (df[date_col] >= start_at) & (df[city_col] == city)

    def rbf(x, center, sigma):
        return np.exp(-((x - center) ** 2) / sigma**2)

    # center the bell-shaped kernel at the midpoint of the effect window
    kernel = rbf(
        (df[date_col] - pd.to_datetime(start_at)).dt.days - window / 2 - 1,
        0,
        window / 2,
    )

    y = np.where(window_mask, df[y_col] + df[y_col] * effect_pct * kernel, df[y_col])

    return df.assign(**{y_col: y})
df = pd.concat(series)

df = simulate_effect(
    df, "city", "date", "app_download", "sao_paulo", "2022-05-01", 40, 0.4
)
df = simulate_effect(
    df, "city", "date", "app_download", "joao_pessoa", "2022-05-01", 40, 0.5
)
df = simulate_effect(
    df, "city", "date", "app_download", "porto_alegre", "2022-05-01", 40, 0.6
)

df = df.assign(
    post=(df["date"] >= "2022-05-01") * 1,
    treated=(df["city"].isin(["sao_paulo", "joao_pessoa", "porto_alegre"])) * 1,
)

df.to_csv("../data/online_mkt.csv", index=False)
df_norm = df.assign(app_download_pct=100 * df["app_download"] / df["population"])


df_norm_cov = (
    df_norm.merge(br_cities[["city", "state", "hdi"]])
    .assign(
        comp_download_pct=lambda d: (
            100 * (0.8 * d["app_download"] + d["hdi"]) / d["population"]
        )
    )
    .drop(columns=["hdi"])
)

df_norm_cov.to_csv("../data/online_mkt_cov.csv", index=False)

Chapter 10

import numpy as np
import pandas as pd
from statsmodels.tsa.arima_process import ArmaProcess

T = 120
m = 2
p = 0.5


def y_given_d(d, effect_params=[3, 2, 1], T=T, seed=None):
    np.random.seed(seed)
    x = np.arange(1, T + 1)
    return (
        np.log(x + 1)
        + 2 * np.sin(x * 2 * np.pi / 24)
        + np.convolve(~d.astype(bool), effect_params)[: -(len(effect_params) - 1)]
        + np.random.normal(0, 1, T)
        #             + ArmaProcess([3,2,], 3).generate_sample(T)
    ).round(2)


# true cumulative effect: tau = 3 + 2 + 1 = 6
effect_params = [3, 2, 1]


np.random.seed(123)

df_sb_every = (
    pd.DataFrame(
        dict(
            d=np.random.binomial(1, 0.5, T),
        )
    )
    .assign(
        delivery_time=lambda d: y_given_d(d["d"], seed=1),
        delivery_time_1=lambda d: y_given_d(np.ones(T), seed=1),
        delivery_time_0=lambda d: y_given_d(np.zeros(T), seed=1),
    )
    .assign(tau=lambda d: d["delivery_time_1"] - d["delivery_time_0"])
)


df_sb_every.to_csv("../data/sb_exp_every.csv", index=False)
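
A quick check on this switchback DGP, assuming the file above was written: tau is stored as delivery_time_1 - delivery_time_0, and after the 2-period ramp-up of the convolution its magnitude should sit at exactly 3 + 2 + 1 = 6.

import pandas as pd

sb = pd.read_csv("../data/sb_exp_every.csv")
print(sb["tau"].tail(-2).abs().mean())  # ~6 after the ramp-up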
def gen_d(rand_points, p):
    result = [np.random.binomial(1, p)]

    for t in rand_points[1:]:
        result.append(np.random.binomial(1, p) * t + (1 - t) * result[-1])

    return np.array(result)


# the y_given_d defined above is reused for this experiment


m = 2
T = 120
p = 0.5

n = T * m

rand_points_opt = np.isin(
    np.arange(1, T + 1), [1] + [i * m + 1 for i in range(2, int(n) - 1)]
)

d_opt = gen_d(rand_points_opt, p)
y_opt = y_given_d(d_opt, T=T, seed=1)

pd.DataFrame(dict(rand_points=rand_points_opt, d=d_opt, delivery_time=y_opt)).to_csv(
    "../data/sb_exp_opt.csv", index=False
)

Chapter 11

n = 10000

# confounders
age = 18 + np.random.normal(24, 4, n).round(1)
income = 500 + np.random.gamma(1, age * 100, n).round(0)
credit_score = (np.random.beta(1, 3, n) * 1000).round(0)


u = np.random.uniform(-1, 1, n)

categs = ["never-taker", "complier"]

cutoff = 0.6
e = 1 / (1 + np.exp(-(u - 0.05 * age + 0.01 * credit_score)))
cust_categ = np.select([e <= cutoff, e > cutoff], categs)

# plt.hist(e)

# Instrument
prime_elegible = np.random.binomial(1, 0.5, n)
choose_prime = ((cust_categ == "complier") & (prime_elegible == 1)).astype(int)

# outcome
group_effect = np.select(
    [cust_categ == "complier", cust_categ == "never-taker"], [700, 200]
)

pv_0 = np.random.normal(200 + 0.4 * income + 10 * age - 500 * u, 200).round(2)
pv_1 = pv_0 + group_effect

tau = pv_1 - pv_0
pv = pv_0 + group_effect * choose_prime


df = pd.DataFrame(
    dict(
        categ=cust_categ,
        age=age,
        income=income,
        credit_score=credit_score,
        prime_elegible=prime_elegible,
        prime_card=choose_prime,
        pv=pv,
        tau=tau,
    )
)[
    [
        "age",
        "income",
        "credit_score",
        "prime_elegible",
        "prime_card",
        "pv",
        "tau",
        "categ",
    ]
]

df.to_csv("../data/prime_card.csv", index=False)