import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # duplicate second import removed

# Reproducibility for the examples below.
np.random.seed(12345)

plt.rc('figure', figsize=(10, 6))

# Remember the current setting so it can be restored at the end of the script.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
np.set_printoptions(precision=4, suppress=True)

# Korean font configuration for matplotlib (macOS).
plt.rc('font', family='AppleGothic')
plt.rc('axes', unicode_minus=False)

12장: 파이썬 모델링 라이브러리 소개
데이터 분석을 넘어 기계 학습 모델링으로 나아가기 위한 기초 인터페이스를 다룹니다.
11.1 pandas와 모델 코드의 인터페이스
데이터프레임을 NumPy 배열로 변환하여 모델 라이브러리에 전달하는 과정을 이해합니다.
# Example frame used throughout the pandas/model-interface section.
data = pd.DataFrame(
    {
        'x0': [1, 2, 3, 4, 5],
        'x1': [0.01, -0.01, 0.25, -4.1, 0.],
        'y': [-1.5, 0., 3.6, 1.3, -2.],
    }
)
data
data.columns
data.to_numpy()
array([[ 1. , 0.01, -1.5 ],
[ 2. , -0.01, 0. ],
[ 3. , 0.25, 3.6 ],
[ 4. , -4.1 , 1.3 ],
[ 5. , 0. , -2. ]])
# Rebuild a DataFrame from the raw ndarray, supplying new column labels.
arr = data.to_numpy()
df2 = pd.DataFrame(arr, columns=['one', 'two', 'three'])
df2
| one | two | three | |
|---|---|---|---|
| 0 | 1.0 | 0.01 | -1.5 |
| 1 | 2.0 | -0.01 | 0.0 |
| 2 | 3.0 | 0.25 | 3.6 |
| 3 | 4.0 | -4.10 | 1.3 |
| 4 | 5.0 | 0.00 | -2.0 |
# Mixed dtypes: adding a string column makes to_numpy() return an object array.
df3 = data.copy()
df3['strings'] = list('abcde')
df3
df3.to_numpy()
array([[1, 0.01, -1.5, 'a'],
[2, -0.01, 0.0, 'b'],
[3, 0.25, 3.6, 'c'],
[4, -4.1, 1.3, 'd'],
[5, 0.0, -2.0, 'e']], dtype=object)
# Subset of columns to be used as model features.
model_cols = ['x0', 'x1']
data.loc[:, model_cols].to_numpy()
array([[ 1. , 0.01],
[ 2. , -0.01],
[ 3. , 0.25],
[ 4. , -4.1 ],
[ 5. , 0. ]])
# Attach a categorical column with an explicit, fixed category set.
cat_values = ['a', 'b', 'a', 'a', 'b']
data['category'] = pd.Categorical(cat_values, categories=['a', 'b'])
data
| x0 | x1 | y | category | |
|---|---|---|---|---|
| 0 | 1 | 0.01 | -1.5 | a |
| 1 | 2 | -0.01 | 0.0 | b |
| 2 | 3 | 0.25 | 3.6 | a |
| 3 | 4 | -4.10 | 1.3 | a |
| 4 | 5 | 0.00 | -2.0 | b |
# One-hot encode the categorical column, then splice the indicators back in
# in place of the original 'category' column.
dummies = pd.get_dummies(data.category, prefix='category', dtype=float)
without_cat = data.drop('category', axis=1)
data_with_dummies = without_cat.join(dummies)
data_with_dummies
| x0 | x1 | y | category_a | category_b | |
|---|---|---|---|---|---|
| 0 | 1 | 0.01 | -1.5 | 1.0 | 0.0 |
| 1 | 2 | -0.01 | 0.0 | 0.0 | 1.0 |
| 2 | 3 | 0.25 | 3.6 | 1.0 | 0.0 |
| 3 | 4 | -4.10 | 1.3 | 1.0 | 0.0 |
| 4 | 5 | 0.00 | -2.0 | 0.0 | 1.0 |
11.2 Patsy를 이용한 모델 설명
수식 형태의 문자열을 사용하여 통계 모델의 디자인 행렬을 생성합니다.
설명 변수와 반응 변수를 수식 형태로 기술하는 방법을 배웁니다.
# Fresh copy of the example frame for the Patsy section.
data = pd.DataFrame(
    {
        'x0': [1, 2, 3, 4, 5],
        'x1': [0.01, -0.01, 0.25, -4.1, 0.],
        'y': [-1.5, 0., 3.6, 1.3, -2.],
    }
)
data

import patsy
y, X = patsy.dmatrices('y ~ x0 + x1', data)
y
X
DesignMatrix with shape (5, 3)
Intercept x0 x1
1 1 0.01
1 2 -0.01
1 3 0.25
1 4 -4.10
1 5 0.00
Terms:
'Intercept' (column 0)
'x0' (column 1)
'x1' (column 2)
# Patsy DesignMatrix objects convert directly to NumPy arrays.
np.asarray(y)
np.asarray(X)
array([[ 1. , 1. , 0.01],
[ 1. , 2. , -0.01],
[ 1. , 3. , 0.25],
[ 1. , 4. , -4.1 ],
[ 1. , 5. , 0. ]])
patsy.dmatrices('y ~ x0 + x1 + 0', data)[1]
DesignMatrix with shape (5, 2)
x0 x1
1 0.01
2 -0.01
3 0.25
4 -4.10
5 0.00
Terms:
'x0' (column 0)
'x1' (column 1)
coef, resid, _, _ = np.linalg.lstsq(X, y, rcond=None)
coef
# Attach Patsy's column names to the fitted least-squares coefficients.
coef = pd.Series(coef.squeeze(), index=X.design_info.column_names)
coef
Intercept 0.312910
x0 -0.079106
x1 -0.265464
dtype: float64
# Arbitrary Python/NumPy expressions are allowed inside Patsy formulas.
y, X = patsy.dmatrices('y ~ x0 + np.log(np.abs(x1) + 1)', data)
X
DesignMatrix with shape (5, 3)
Intercept x0 np.log(np.abs(x1) + 1)
1 1 0.00995
1 2 0.00995
1 3 0.22314
1 4 1.62924
1 5 0.00000
Terms:
'Intercept' (column 0)
'x0' (column 1)
'np.log(np.abs(x1) + 1)' (column 2)
# Built-in stateful transforms: standardize (z-score) and center (subtract mean).
y, X = patsy.dmatrices('y ~ standardize(x0) + center(x1)', data)
X
DesignMatrix with shape (5, 3)
Intercept standardize(x0) center(x1)
1 -1.41421 0.78
1 -0.70711 0.76
1 0.00000 1.02
1 0.70711 -3.33
1 1.41421 0.77
Terms:
'Intercept' (column 0)
'standardize(x0)' (column 1)
'center(x1)' (column 2)
# Apply the saved design_info to new observations; stateful transforms reuse
# the statistics (mean, std) computed from the original data.
new_data = pd.DataFrame(
    {
        'x0': [6, 7, 8, 9],
        'x1': [3.1, -0.5, 0, 2.3],
        'y': [1, 2, 3, 4],
    }
)
new_X = patsy.build_design_matrices([X.design_info], new_data)
new_X
[DesignMatrix with shape (4, 3)
Intercept standardize(x0) center(x1)
1 2.12132 3.87
1 2.82843 0.27
1 3.53553 0.77
1 4.24264 3.07
Terms:
'Intercept' (column 0)
'standardize(x0)' (column 1)
'center(x1)' (column 2)]
# I(...) protects arithmetic so '+' means addition, not term union.
y, X = patsy.dmatrices('y ~ I(x0 + x1)', data)
X
DesignMatrix with shape (5, 2)
Intercept I(x0 + x1)
1 1.01
1 1.99
1 3.25
1 -0.10
1 5.00
Terms:
'Intercept' (column 0)
'I(x0 + x1)' (column 1)
# Frame with categorical-style keys for the categorical-data examples.
data = pd.DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'b'],
        'key2': [0, 1, 0, 1, 0, 1, 0, 0],
        'v1': [1, 2, 3, 4, 5, 6, 7, 8],
        'v2': [-1, 0, 2.5, -0.5, 4.0, -1.2, 0.2, -1.7],
    }
)
# Non-numeric columns are treated as categorical by default.
y, X = patsy.dmatrices('v2 ~ key1', data)
X
DesignMatrix with shape (8, 2)
Intercept key1[T.b]
1 0
1 0
1 1
1 1
1 0
1 1
1 0
1 1
Terms:
'Intercept' (column 0)
'key1' (column 1)
# '+ 0' drops the intercept; both categories then get indicator columns.
y, X = patsy.dmatrices('v2 ~ key1 + 0', data)
X
DesignMatrix with shape (8, 2)
key1[a] key1[b]
1 0
1 0
0 1
0 1
1 0
0 1
1 0
0 1
Terms:
'key1' (columns 0:2)
# C(...) forces a numeric column to be interpreted as categorical.
y, X = patsy.dmatrices('v2 ~ C(key2)', data)
X
DesignMatrix with shape (8, 2)
Intercept C(key2)[T.1]
1 0
1 1
1 0
1 1
1 0
1 1
1 0
1 0
Terms:
'Intercept' (column 0)
'C(key2)' (column 1)
# Recode the integer key to strings so Patsy treats it as categorical.
data['key2'] = data['key2'].map({0: 'zero', 1: 'one'})
data
y, X = patsy.dmatrices('v2 ~ key1 + key2', data)
X
# key1:key2 adds an interaction term between the two factors.
y, X = patsy.dmatrices('v2 ~ key1 + key2 + key1:key2', data)
X
DesignMatrix with shape (8, 4)
Intercept key1[T.b] key2[T.zero] key1[T.b]:key2[T.zero]
1 0 1 0
1 0 0 0
1 1 1 1
1 1 0 0
1 0 1 0
1 1 0 0
1 0 1 0
1 1 1 1
Terms:
'Intercept' (column 0)
'key1' (column 1)
'key2' (column 2)
'key1:key2' (column 3)
# statsmodels: array interface (sm) and formula interface (smf, imported below).
import statsmodels.api as sm
import statsmodels.formula.api as smf

11.4 statsmodels 소개
파이썬에서 통계 모델을 추정하고 분석하는 방법을 알아봅니다.
# To make the example reproducible
rng = np.random.default_rng(seed=12345)


def dnorm(mean, variance, size=1):
    """Draw samples from N(mean, variance) using the module-level `rng`.

    Parameters
    ----------
    mean : float
        Distribution mean.
    variance : float
        Distribution variance (the standard deviation is sqrt(variance)).
    size : int or tuple of int, default 1
        Output shape; a bare int is treated as a 1-d length.
    """
    if isinstance(size, int):
        size = (size,)
    # Pass the shape tuple directly: the original `*size` unpacking broke
    # for multi-dimensional sizes such as (2, 3).
    return mean + np.sqrt(variance) * rng.standard_normal(size)


# Simulated regression design: three noisy features with known coefficients.
N = 100
X = np.c_[dnorm(0, 0.4, size=N),
          dnorm(0, 0.6, size=N),
          dnorm(0, 0.2, size=N)]
eps = dnorm(0, 0.1, size=N)
beta = [0.1, 0.3, 0.5]
# The original line fused two statements: the assignment and the X[:5] echo.
y = np.dot(X, beta) + eps
X[:5]
y[:5]
array([-0.5995, -0.5885, 0.1856, -0.0075, -0.0154])
# Prepend an intercept column of ones (statsmodels does not add one itself).
X_model = sm.add_constant(X)
X_model[:5]
array([[ 1. , -0.9005, -0.1894, -1.0279],
[ 1. , 0.7993, -1.546 , -0.3274],
[ 1. , -0.5507, -0.1203, 0.3294],
[ 1. , -0.1639, 0.824 , 0.2083],
[ 1. , -0.0477, -0.2131, -0.0482]])
# The original line fused two statements; split them apart.
# Fit ordinary least squares on the raw design matrix (no intercept column).
model = sm.OLS(y, X)
results = model.fit()
results.params
array([0.0668, 0.268 , 0.4505])
print(results.summary())
OLS Regression Results
=======================================================================================
Dep. Variable: y R-squared (uncentered): 0.469
Model: OLS Adj. R-squared (uncentered): 0.452
Method: Least Squares F-statistic: 28.51
Date: Thu, 26 Feb 2026 Prob (F-statistic): 2.66e-13
Time: 20:53:04 Log-Likelihood: -25.611
No. Observations: 100 AIC: 57.22
Df Residuals: 97 BIC: 65.04
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
x1 0.0668 0.054 1.243 0.217 -0.040 0.174
x2 0.2680 0.042 6.313 0.000 0.184 0.352
x3 0.4505 0.068 6.605 0.000 0.315 0.586
==============================================================================
Omnibus: 0.435 Durbin-Watson: 1.869
Prob(Omnibus): 0.805 Jarque-Bera (JB): 0.301
Skew: 0.134 Prob(JB): 0.860
Kurtosis: 2.995 Cond. No. 1.64
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Put the simulated arrays into a DataFrame for the formula interface.
data = pd.DataFrame(X, columns=['col0', 'col1', 'col2'])
data['y'] = y
data[:5]
| col0 | col1 | col2 | y | |
|---|---|---|---|---|
| 0 | -0.900506 | -0.189430 | -1.027870 | -0.599527 |
| 1 | 0.799252 | -1.545984 | -0.327397 | -0.588454 |
| 2 | -0.550655 | -0.120254 | 0.329359 | 0.185634 |
| 3 | -0.163916 | 0.824040 | 0.208275 | -0.007477 |
| 4 | -0.047651 | -0.213147 | -0.048244 | -0.015374 |
# Fit by formula; the formula interface adds the intercept automatically.
results = smf.ols('y ~ col0 + col1 + col2', data=data).fit()
results.params
results.tvalues
Intercept -0.652501
col0 1.219768
col1 6.312369
col2 6.567428
dtype: float64
results.predict(data[:5])
0 -0.592959
1 -0.531160
2 0.058636
3 0.283658
4 -0.102947
dtype: float64
# Simulate an AR(2) process: x_t = 0.8*x_{t-1} - 0.4*x_{t-2} + noise.
init_x = 4
values = [init_x, init_x]
N = 1000
b0 = 0.8
b1 = -0.4
noise = dnorm(0, 0.1, N)
for i in range(N):
    new_x = values[-1] * b0 + values[-2] * b1 + noise[i]
    values.append(new_x)
    # (The original dump fused the append above with the import below.)

from statsmodels.tsa.ar_model import AutoReg

# Fit an autoregressive model with more lags than the true order (2).
MAXLAGS = 5
model = AutoReg(values, MAXLAGS)
results = model.fit()
results.params
array([ 0.0235, 0.8097, -0.4287, -0.0334, 0.0427, -0.0567])
# Titanic dataset; test.csv lacks the 'Survived' label column.
train = pd.read_csv('datasets/titanic/train.csv')
test = pd.read_csv('datasets/titanic/test.csv')
train.head(4)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Thayer) | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
11.5 scikit-learn 소개
가장 인기 있는 머신러닝 라이브러리를 사용하여 예측 모델을 구축하는 기초를 학습합니다.
# Count missing values per column in each split.
train.isna().sum()
test.isna().sum()
PassengerId 0
Pclass 0
Name 0
Sex 0
Age 86
SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 327
Embarked 0
dtype: int64
# Impute missing ages with the *training* median — the same value is used for
# the test split to avoid leaking test-set information.
impute_value = train['Age'].median()
train['Age'] = train['Age'].fillna(impute_value)
test['Age'] = test['Age'].fillna(impute_value)

# Encode sex as a 0/1 feature.
# (The original dump fused these statements with the fillna lines above.)
train['IsFemale'] = (train['Sex'] == 'female').astype(int)
test['IsFemale'] = (test['Sex'] == 'female').astype(int)

# Feature matrices and target vector as NumPy arrays.
predictors = ['Pclass', 'IsFemale', 'Age']
X_train = train[predictors].to_numpy()
X_test = test[predictors].to_numpy()
y_train = train['Survived'].to_numpy()
X_train[:5]
y_train[:5]
array([0, 1, 1, 1, 0])
11.6 scikit-learn
가장 대중적인 머신러닝 라이브러리의 표준 워크플로우를 경험합니다.
# scikit-learn estimator workflow: instantiate, fit, predict.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
# Predict survival for the test split (true labels are not available here).
y_predict = model.predict(X_test)
y_predict[:10]
array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0])
from sklearn.linear_model import LogisticRegressionCV
# Cross-validated selection over 10 values of the regularization strength C.
model_cv = LogisticRegressionCV(Cs=10)
model_cv.fit(X_train, y_train)
LogisticRegressionCV()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
from sklearn.model_selection import cross_val_score
# 4-fold cross-validation accuracy for a fixed regularization strength.
model = LogisticRegression(C=10)
scores = cross_val_score(model, X_train, y_train, cv=4)
scores
array([0.7758, 0.7982, 0.7758, 0.7883])
# Restore the pandas display option changed at the top of the script.
pd.options.display.max_rows = PREVIOUS_MAX_ROWS