import warnings

import pandas as pd
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Read the three input tables; the paths are listed in the same order as the
# names they are unpacked into.
train, test, gender_submission = map(
    pd.read_csv,
    ('data/train.csv', 'data/test.csv', 'data/gender_submission.csv'),
)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
# Encode Sex as integer category codes. Each frame is encoded on its own, so
# the codes only line up if both frames contain the same set of values.
for _frame in (train, test):
    _frame['Sex_clean'] = _frame['Sex'].astype('category').cat.codes
Embarked
# Count missing embarkation ports in each frame, then tally the observed
# values in train.
train['Embarked'].isna().sum()
test['Embarked'].isna().sum()
train['Embarked'].value_counts(dropna=True)
S 644
C 168
Q 77
Name: Embarked, dtype: int64
# Fill the missing embarkation ports in train with 'S', the most frequent
# value in the counts shown for this column.
# The result is assigned back to the column: a chained
# `train['Embarked'].fillna(..., inplace=True)` operates on a temporary object
# under pandas Copy-on-Write and would leave `train` unchanged.
train['Embarked'] = train['Embarked'].fillna('S')
train['Embarked'].isnull().sum()

# Encode the port as integer category codes (each frame encoded separately).
train['Embarked_clean'] = train['Embarked'].astype('category').cat.codes
test['Embarked_clean'] = test['Embarked'].astype('category').cat.codes
Family
# Family = the passenger plus siblings/spouses plus parents/children;
# Solo flags rows whose Family size is exactly 1.
for _frame in (train, test):
    _frame['Family'] = _frame['SibSp'] + _frame['Parch'] + 1
    _frame['Solo'] = _frame['Family'].eq(1)
## Fare
# Split Fare into four quantile bins. The bin edges are computed per frame,
# so the train and test intervals are not guaranteed to coincide.
for _frame in (train, test):
    _frame['FareBin_4'] = pd.qcut(_frame['Fare'], q=4)
| 0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
1 |
2 |
2 |
False |
(-0.001, 7.91] |
| 1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
0 |
0 |
2 |
False |
(31.0, 512.329] |
| 2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
0 |
2 |
1 |
True |
(7.91, 14.454] |
| 3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
0 |
2 |
2 |
False |
(31.0, 512.329] |
| 4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
1 |
2 |
1 |
True |
(7.91, 14.454] |
# Cross-tabulate the fare quartile (rows) against passenger class (columns).
pd.crosstab(index=train['FareBin_4'], columns=train['Pclass'])
| FareBin_4 |
|
|
|
| (-0.001, 7.91] |
6 |
6 |
211 |
| (7.91, 14.454] |
0 |
86 |
138 |
| (14.454, 31.0] |
51 |
70 |
101 |
| (31.0, 512.329] |
159 |
22 |
41 |
Title
# Pull the honorific out of Name: the run of letters that follows a space and
# ends with a period (e.g. "Mr" in "Braund, Mr. Owen Harris").
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being an invalid string-literal escape sequence.
train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test['Title'] = test['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse the listed honorifics into a single 'Other' bucket.
train['Title'] = train['Title'].replace(
    ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
    'Other',
)
train['Title'].value_counts()
Mr 517
Miss 182
Mrs 125
Master 40
Other 23
Mlle 2
Mme 1
Ms 1
Name: Title, dtype: int64
# Fold the alternative spellings into their main title in a single pass.
train['Title'] = train['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
train['Title'].value_counts()
Mr 517
Miss 185
Mrs 126
Master 40
Other 23
Name: Title, dtype: int64
# Apply the same title normalisation to the test frame: the listed honorifics
# become 'Other', then the alternative spellings map to 'Miss' / 'Mrs'.
test['Title'] = (
    test['Title']
    .replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
        'Other',
    )
    .replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
)
test['Title'].value_counts()
Mr 240
Miss 79
Mrs 72
Master 21
Other 6
Name: Title, dtype: int64
# Encode Title as integer category codes. Each frame is encoded on its own, so
# the codes only agree when both frames hold the same set of titles.
for _frame in (train, test):
    _frame['Title_clean'] = _frame['Title'].astype('category').cat.codes
Age
# Impute missing ages with the median age of the rows sharing the same Title
# (medians are computed within each frame).
# The filled Series is assigned back to the column: a chained
# `frame["Age"].fillna(..., inplace=True)` operates on a temporary object under
# pandas Copy-on-Write and would leave the frame unchanged.
train["Age"] = train["Age"].fillna(train.groupby("Title")["Age"].transform("median"))
test["Age"] = test["Age"].fillna(test.groupby("Title")["Age"].transform("median"))

# Bucket Age into five ordinal bands with right-closed intervals:
#   0: <=16, 1: (16, 26], 2: (26, 36], 3: (36, 62], 4: >62
# One shared edge list keeps train and test on identical band boundaries.
_AGE_EDGES = [float('-inf'), 16, 26, 36, 62, float('inf')]
train['Age_clean'] = pd.cut(train['Age'], bins=_AGE_EDGES, labels=False)
test['Age_clean'] = pd.cut(test['Age'], bins=_AGE_EDGES, labels=False)
| 0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
1 |
2 |
2 |
False |
(-0.001, 7.91] |
Mr |
2 |
1.0 |
| 1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
0 |
0 |
2 |
False |
(31.0, 512.329] |
Mrs |
3 |
3.0 |
| 2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
0 |
2 |
1 |
True |
(7.91, 14.454] |
Miss |
1 |
1.0 |
| 3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
0 |
2 |
2 |
False |
(31.0, 512.329] |
Mrs |
3 |
2.0 |
| 4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
1 |
2 |
1 |
True |
(7.91, 14.454] |
Mr |
2 |
2.0 |
Fare
# Impute missing fares with the median fare of the same Pclass (medians are
# computed within each frame).
# The filled Series is assigned back to the column: a chained
# `frame["Fare"].fillna(..., inplace=True)` operates on a temporary object
# under pandas Copy-on-Write and would leave the frame unchanged.
train["Fare"] = train["Fare"].fillna(train.groupby("Pclass")["Fare"].transform("median"))
test["Fare"] = test["Fare"].fillna(test.groupby("Pclass")["Fare"].transform("median"))

# Inspect the quartile intervals of the train fares.
pd.qcut(train['Fare'], 4)
0 (-0.001, 7.91]
1 (31.0, 512.329]
2 (7.91, 14.454]
3 (31.0, 512.329]
4 (7.91, 14.454]
...
886 (7.91, 14.454]
887 (14.454, 31.0]
888 (14.454, 31.0]
889 (14.454, 31.0]
890 (-0.001, 7.91]
Name: Fare, Length: 891, dtype: category
Categories (4, interval[float64]): [(-0.001, 7.91] < (7.91, 14.454] < (14.454, 31.0] < (31.0, 512.329]]
# Bucket Fare into four ordinal bands with right-closed intervals:
#   0: <=17, 1: (17, 30], 2: (30, 100], 3: >100
# and store the band index as an integer column on both frames.
_FARE_EDGES = [float('-inf'), 17, 30, 100, float('inf')]
for _frame in (train, test):
    _frame['Fare_clean'] = pd.cut(_frame['Fare'], bins=_FARE_EDGES, labels=False).astype(int)
| 0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
... |
S |
1 |
2 |
2 |
False |
(-0.001, 7.91] |
Mr |
2 |
1.0 |
0 |
| 1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
... |
C |
0 |
0 |
2 |
False |
(31.0, 512.329] |
Mrs |
3 |
3.0 |
2 |
| 2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
... |
S |
0 |
2 |
1 |
True |
(7.91, 14.454] |
Miss |
1 |
1.0 |
0 |
| 3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
... |
S |
0 |
2 |
2 |
False |
(31.0, 512.329] |
Mrs |
3 |
2.0 |
2 |
| 4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
... |
S |
1 |
2 |
1 |
True |
(7.91, 14.454] |
Mr |
2 |
2.0 |
0 |
5 rows × 21 columns
Cabin
# Tally the first character of each Cabin value.
train['Cabin'].str.slice(stop=1).value_counts()
C 59
B 47
D 33
E 32
A 15
F 13
G 4
T 1
Name: Cabin, dtype: int64
# Numeric value assigned to each cabin letter, in steps of 0.4 from 'A' to 'T'.
mapping = dict(zip(
    ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'),
    (0, 0.4, 0.8, 1.2, 1.6, 2.0, 2.4, 2.8),
))
# Take the first character of Cabin and translate it through `mapping`;
# rows without a cabin (or with an unmapped letter) stay NaN.
train['Cabin_clean'] = train['Cabin'].str.slice(stop=1).map(mapping)
train.loc[:, ['Pclass', 'Cabin_clean']].head(10)
| 0 |
3 |
NaN |
| 1 |
1 |
0.8 |
| 2 |
3 |
NaN |
| 3 |
1 |
0.8 |
| 4 |
3 |
NaN |
| 5 |
3 |
NaN |
| 6 |
1 |
1.6 |
| 7 |
3 |
NaN |
| 8 |
3 |
NaN |
| 9 |
2 |
NaN |
# Median Cabin_clean value within each passenger class.
train['Cabin_clean'].groupby(train['Pclass']).median()
Pclass
1 0.8
2 1.8
3 2.0
Name: Cabin_clean, dtype: float64
# First ten Cabin_clean values.
train['Cabin_clean'].iloc[:10]
0 NaN
1 0.8
2 NaN
3 0.8
4 NaN
5 NaN
6 1.6
7 NaN
8 NaN
9 NaN
Name: Cabin_clean, dtype: float64
# Preview the per-Pclass median broadcast to every row WITHOUT assigning it
# back: writing this result into train['Cabin_clean'] would replace the values
# already derived from real cabin letters, not just the missing ones.
train.groupby('Pclass')['Cabin_clean'].transform('median').head(10)
train['Cabin_clean'].head(10)
0 2.0
1 0.8
2 2.0
3 0.8
4 2.0
5 2.0
6 0.8
7 2.0
8 2.0
9 1.8
Name: Cabin_clean, dtype: float64
transform('median') 결과를 컬럼에 그대로 대입하면 이미 채워져 있던 값들까지 덮어씌워짐.
따라서 아래와 같이 fillna를 사용해 비어 있는 값만 채워 줘야 함.
# Fill only the missing Cabin_clean values with the median of the same Pclass.
# The filled Series is assigned back to the column: a chained
# `train['Cabin_clean'].fillna(..., inplace=True)` operates on a temporary
# object under pandas Copy-on-Write and would leave `train` unchanged.
train['Cabin_clean'] = train['Cabin_clean'].fillna(
    train.groupby('Pclass')['Cabin_clean'].transform('median')
)
train['Cabin_clean'].head(10)
0 2.0
1 0.8
2 2.0
3 0.8
4 2.0
5 2.0
6 0.8
7 2.0
8 2.0
9 1.8
Name: Cabin_clean, dtype: float64
# Derive Cabin_clean for the test frame: first character of Cabin translated
# through `mapping`, then missing values filled with the median of the same
# Pclass (computed on the test frame).
# The filled Series is assigned back to the column: a chained
# `test['Cabin_clean'].fillna(..., inplace=True)` operates on a temporary
# object under pandas Copy-on-Write and would leave `test` unchanged.
test['Cabin_clean'] = test['Cabin'].str[:1].map(mapping)
test['Cabin_clean'] = test['Cabin_clean'].fillna(
    test.groupby('Pclass')['Cabin_clean'].transform('median')
)
Feature & label
# Columns fed to the models as input features.
feature = [
    'Pclass',
    'SibSp',
    'Parch',
    'Sex_clean',
    'Embarked_clean',
    'Family',
    'Solo',
    'Title_clean',
    'Age_clean',
    'Fare_clean',
    'Cabin_clean',
]

# Target column. `label` is read when building `target` / `y_train`, so it has
# to be defined together with the feature list.
label = 'Survived'
Model Selection
# Model inputs (feature columns) and the prediction target, both from train.
data = train.loc[:, feature]
target = train.loc[:, label]
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Shuffled 10-fold splitter with a fixed seed, shared by all model comparisons.
k_fold = KFold(random_state=0, shuffle=True, n_splits=10)

# Seeded hold-out split of the training data (default split proportions).
_holdout_split = train_test_split(data, target, random_state=0)
x_train, x_test, y_train, y_test = _holdout_split
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
def _mean_cv_accuracy(model):
    """Return the mean accuracy of `model` over the shared `k_fold` splits."""
    fold_scores = cross_val_score(model, data, target, cv=k_fold, scoring='accuracy')
    return fold_scores.mean()


# Random forest baseline.
clf = RandomForestClassifier(random_state=0, max_depth=5, n_estimators=170)
_mean_cv_accuracy(clf)

# LightGBM with 150 trees (no max_depth or random_state given).
clf = LGBMClassifier(colsample_bytree=0.8, subsample=0.8, n_estimators=150)
_mean_cv_accuracy(clf)

# LightGBM with 120 trees, depth capped at 6, fixed seed.
clf = LGBMClassifier(
    random_state=0, colsample_bytree=0.8, subsample=0.8, max_depth=6, n_estimators=120,
)
_mean_cv_accuracy(clf)

# XGBoost with 90 trees, depth capped at 5, fixed seed.
# `clf` stays bound to this estimator after the block.
clf = XGBClassifier(
    random_state=0, colsample_bytree=0.8, subsample=0.8, max_depth=5, n_estimators=90,
)
_mean_cv_accuracy(clf)
Make Prediction
# Train on the full training frame and prepare the test features.
x_train = train[feature]
x_test = test[feature]
y_train = train[label]

# Build the submission model explicitly. When the file runs top to bottom,
# `clf` is still the last estimator created during model comparison (the
# XGBClassifier), while the recorded fit output and the submission file name
# are for the random forest.
clf = RandomForestClassifier(n_estimators=170, max_depth=5, random_state=0)
clf.fit(x_train, y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=5, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=170,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)
# Predict for the test rows and store the result in the submission template.
# NOTE(review): the assignment is positional, so it presumes gender_submission
# lists passengers in the same order as `test` — confirm.
pred = clf.predict(x_test)
gender_submission['Survived'] = pred

# Write the submission without the index column.
gender_submission.to_csv(path_or_buf='191111_random_forest.csv', index=False)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=5, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=170,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)