PyCaret 설치

!pip install pycaret

Google Colab 사용자의 경우 다음의 코드를 실행합니다.

# Only needed when running this notebook on Google Colab.
# NOTE(review): enable_colab appears to be deprecated in PyCaret 3.x, and the
# setup summary printed in this notebook looks like 3.x output — confirm
# whether this call is still required for the installed version.
from pycaret.utils import enable_colab

enable_colab()

필요한 모듈 import

import pandas as pd
import numpy as np
import seaborn as sns

# Remove the column limit so wide DataFrames are displayed without truncation.
pd.options.display.max_columns = None

이상치 탐지 알고리즘

이상치 탐지 알고리즘의 활용 분야

  • 제조
    • 공정 불량 원인을 감지
    • 설비의 이상 동작을 미리 감지/탐지
  • 금융
    • 신용카드 탈취, 악용 사례 탐지
    • 금융 분야의 비정상 거래 탐지 (피싱)
  • 고객 관리
    • 제품/서비스 고객의 이탈을 사전에 감지
  • 보안
    • 네트워크 침해사고 예방/탐지
  • 헬스케어
    • 건강의 이상신호 감지 및 사전 알림
  • 자연어
    • 소비자 의견 수렴 (독특하고 창의적인 의견 수렴)

관련 캐글 대회/데이터셋

실습을 위한 데이터셋 로드

# get_data fetches one of PyCaret's bundled example datasets by name.
from pycaret.datasets import get_data

# Load the 'mice' dataset as a DataFrame for the hands-on exercise.
dataset = get_data('mice')
MouseID DYRK1A_N ITSN1_N BDNF_N NR1_N NR2A_N pAKT_N pBRAF_N pCAMKII_N pCREB_N pELK_N pERK_N pJNK_N PKCA_N pMEK_N pNR1_N pNR2A_N pNR2B_N pPKCAB_N pRSK_N AKT_N BRAF_N CAMKII_N CREB_N ELK_N ERK_N GSK3B_N JNK_N MEK_N TRKA_N RSK_N APP_N Bcatenin_N SOD1_N MTOR_N P38_N pMTOR_N DSCR1_N AMPKA_N NR2B_N pNUMB_N RAPTOR_N TIAM1_N pP70S6_N NUMB_N P70S6_N pGSK3B_N pPKCG_N CDK5_N S6_N ADARB1_N AcetylH3K9_N RRP1_N BAX_N ARC_N ERBB4_N nNOS_N Tau_N GFAP_N GluR3_N GluR4_N IL1B_N P3525_N pCASP9_N PSD95_N SNCA_N Ubiquitin_N pGSK3B_Tyr216_N SHH_N BAD_N BCL2_N pS6_N pCFOS_N SYP_N H3AcK18_N EGR1_N H3MeK4_N CaNA_N Genotype Treatment Behavior class
0 309_1 0.503644 0.747193 0.430175 2.816329 5.990152 0.218830 0.177565 2.373744 0.232224 1.750936 0.687906 0.306382 0.402698 0.296927 1.022060 0.605673 1.877684 2.308745 0.441599 0.859366 0.416289 0.369608 0.178944 1.866358 3.685247 1.537227 0.264526 0.319677 0.813866 0.165846 0.453910 3.037621 0.369510 0.458539 0.335336 0.825192 0.576916 0.448099 0.586271 0.394721 0.339571 0.482864 0.294170 0.182150 0.842725 0.192608 1.443091 0.294700 0.354605 1.339070 0.170119 0.159102 0.188852 0.106305 0.144989 0.176668 0.125190 0.115291 0.228043 0.142756 0.430957 0.247538 1.603310 2.014875 0.108234 1.044979 0.831557 0.188852 0.122652 NaN 0.106305 0.108336 0.427099 0.114783 0.131790 0.128186 1.675652 Control Memantine C/S c-CS-m
1 309_2 0.514617 0.689064 0.411770 2.789514 5.685038 0.211636 0.172817 2.292150 0.226972 1.596377 0.695006 0.299051 0.385987 0.281319 0.956676 0.587559 1.725774 2.043037 0.445222 0.834659 0.400364 0.356178 0.173680 1.761047 3.485287 1.509249 0.255727 0.304419 0.780504 0.157194 0.430940 2.921882 0.342279 0.423560 0.324835 0.761718 0.545097 0.420876 0.545097 0.368255 0.321959 0.454519 0.276431 0.182086 0.847615 0.194815 1.439460 0.294060 0.354548 1.306323 0.171427 0.158129 0.184570 0.106592 0.150471 0.178309 0.134275 0.118235 0.238073 0.142037 0.457156 0.257632 1.671738 2.004605 0.109749 1.009883 0.849270 0.200404 0.116682 NaN 0.106592 0.104315 0.441581 0.111974 0.135103 0.131119 1.743610 Control Memantine C/S c-CS-m
2 309_3 0.509183 0.730247 0.418309 2.687201 5.622059 0.209011 0.175722 2.283337 0.230247 1.561316 0.677348 0.291276 0.381002 0.281710 1.003635 0.602449 1.731873 2.017984 0.467668 0.814329 0.399847 0.368089 0.173905 1.765544 3.571456 1.501244 0.259614 0.311747 0.785154 0.160895 0.423187 2.944136 0.343696 0.425005 0.324852 0.757031 0.543620 0.404630 0.552994 0.363880 0.313086 0.447197 0.256648 0.184388 0.856166 0.200737 1.524364 0.301881 0.386087 1.279600 0.185456 0.148696 0.190532 0.108303 0.145330 0.176213 0.132560 0.117760 0.244817 0.142445 0.510472 0.255343 1.663550 2.016831 0.108196 0.996848 0.846709 0.193685 0.118508 NaN 0.108303 0.106219 0.435777 0.111883 0.133362 0.127431 1.926427 Control Memantine C/S c-CS-m
3 309_4 0.442107 0.617076 0.358626 2.466947 4.979503 0.222886 0.176463 2.152301 0.207004 1.595086 0.583277 0.296729 0.377087 0.313832 0.875390 0.520293 1.566852 2.132754 0.477671 0.727705 0.385639 0.362970 0.179449 1.286277 2.970137 1.419710 0.259536 0.279218 0.734492 0.162210 0.410615 2.500204 0.344509 0.429211 0.330121 0.746980 0.546763 0.386860 0.547849 0.366771 0.328492 0.442650 0.398534 0.161768 0.760234 0.184169 1.612382 0.296382 0.290680 1.198765 0.159799 0.166112 0.185323 0.103184 0.140656 0.163804 0.123210 0.117439 0.234947 0.145068 0.430996 0.251103 1.484624 1.957233 0.119883 0.990225 0.833277 0.192112 0.132781 NaN 0.103184 0.111262 0.391691 0.130405 0.147444 0.146901 1.700563 Control Memantine C/S c-CS-m
4 309_5 0.434940 0.617430 0.358802 2.365785 4.718679 0.213106 0.173627 2.134014 0.192158 1.504230 0.550960 0.286961 0.363502 0.277964 0.864912 0.507990 1.480059 2.013697 0.483416 0.687794 0.367531 0.355311 0.174836 1.324695 2.896334 1.359876 0.250705 0.273667 0.702699 0.154827 0.398550 2.456560 0.329126 0.408755 0.313415 0.691956 0.536860 0.360816 0.512824 0.351551 0.312206 0.419095 0.393447 0.160200 0.768113 0.185718 1.645807 0.296829 0.309345 1.206995 0.164650 0.160687 0.188221 0.104784 0.141983 0.167710 0.136838 0.116048 0.255528 0.140871 0.481227 0.251773 1.534835 2.009109 0.119524 0.997775 0.878668 0.205604 0.129954 NaN 0.104784 0.110694 0.434154 0.118481 0.140314 0.148380 1.839730 Control Memantine C/S c-CS-m

이 튜토리얼에서는 UCI에서 제공하는 Mice Protein Expression 데이터셋을 사용합니다. 이 데이터셋은 대뇌 피질의 핵 분획에서 검출 가능한 신호를 보인 77개 단백질/단백질 변형의 발현 수준으로 구성되어 있습니다. 단백질당 총 1,080개의 측정값이 포함되어 있으며, 각 측정값은 독립적인 샘플(마우스)로 간주할 수 있습니다.

# Sample 80% of the rows for training (fixed seed for reproducibility);
# the rows that were not sampled become the held-out prediction set.
train = dataset.sample(frac=0.8, random_state=123)
test = dataset.drop(train.index)

# Renumber both splits from 0; the original index is only needed for the
# drop() above, so it is discarded afterwards.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

print(f'학습용 데이터셋: {train.shape}')
print(f'예측용 데이터셋: {test.shape}')
학습용 데이터셋: (864, 82)
예측용 데이터셋: (216, 82)

셋업 setup

from pycaret.anomaly import *           # 이상 탐지

MouseID는 각 샘플을 구분하기 위한 단순 식별자(ID) 값이므로 데이터 분석 및 모델 학습에서 제외합니다.

# Initialize the PyCaret anomaly-detection experiment on the training split.
s = setup(train,
          normalize=True,                # normalize the data
          ignore_features=['MouseID'],   # columns to exclude from training
          session_id=123)                # fix the random seed
  Description Value
0 Session id 123
1 Original data shape (864, 81)
2 Transformed data shape (864, 88)
3 Ignore features 1
4 Ordinal features 3
5 Numeric features 77
6 Categorical features 4
7 Rows with missing values 49.3%
8 Preprocess True
9 Imputation type simple
10 Numeric imputation mean
11 Categorical imputation constant
12 Maximum one-hot encoding -1
13 Encoding method None
14 Low variance threshold 0
15 Normalize True
16 Normalize method zscore
17 CPU Jobs -1
18 Use GPU False
19 Log Experiment False
20 Experiment Name anomaly-default-name
21 USI 365d

모델 생성

iforest 모델

from IPython.display import Image

Image(url='https://miro.medium.com/max/1400/1*4P2vi2YVj4nHbU5SZ9i7Ig.png', width=750)
# Create an Isolation Forest model
iforest = create_model('iforest')

이상치 탐지: assign_model()

# Label the data with the model's results; the returned frame carries the
# extra Anomaly and Anomaly_Score columns shown in the output below.
iforest_results = assign_model(iforest)
# Preview the first five labeled rows.
iforest_results.head()
DYRK1A_N ITSN1_N BDNF_N NR1_N NR2A_N pAKT_N pBRAF_N pCAMKII_N pCREB_N pELK_N pERK_N pJNK_N PKCA_N pMEK_N pNR1_N pNR2A_N pNR2B_N pPKCAB_N pRSK_N AKT_N BRAF_N CAMKII_N CREB_N ELK_N ERK_N GSK3B_N JNK_N MEK_N TRKA_N RSK_N APP_N Bcatenin_N SOD1_N MTOR_N P38_N pMTOR_N DSCR1_N AMPKA_N NR2B_N pNUMB_N RAPTOR_N TIAM1_N pP70S6_N NUMB_N P70S6_N pGSK3B_N pPKCG_N CDK5_N S6_N ADARB1_N AcetylH3K9_N RRP1_N BAX_N ARC_N ERBB4_N nNOS_N Tau_N GFAP_N GluR3_N GluR4_N IL1B_N P3525_N pCASP9_N PSD95_N SNCA_N Ubiquitin_N pGSK3B_Tyr216_N SHH_N BAD_N BCL2_N pS6_N pCFOS_N SYP_N H3AcK18_N EGR1_N H3MeK4_N CaNA_N Genotype Treatment Behavior class Anomaly Anomaly_Score
0 0.274204 0.491040 0.235014 1.768983 2.596107 0.183060 0.132654 2.442697 0.169009 0.858450 0.297796 0.251773 0.232048 0.204976 0.646513 0.519531 1.089210 1.070259 0.399639 0.550342 0.219801 0.260410 0.151992 0.854067 1.756607 0.887456 0.178419 0.210777 0.499291 0.120794 0.337759 1.629238 0.467449 0.391002 0.363156 0.557948 0.447596 0.291092 0.440634 0.295088 0.248163 0.355550 0.473250 0.199117 0.910373 0.164188 2.375492 0.299683 0.605220 0.822474 0.286729 0.173208 0.199981 0.117935 0.151617 0.211784 0.306113 0.123021 0.236925 0.140582 0.553978 0.327416 1.630266 2.187122 0.162844 1.117071 0.938106 0.234622 0.187890 0.129738 0.117935 0.143652 0.367911 0.177526 0.177718 0.220804 1.202860 Ts65Dn Saline S/C t-SC-s 0 -0.050177
1 0.196671 0.364057 0.272195 1.720407 2.205302 0.295314 0.252774 3.852035 0.232737 1.316584 0.313810 0.349877 0.300863 0.366215 0.670160 0.811344 1.202836 0.961467 0.462392 0.585080 0.386868 0.401048 0.226264 0.855425 1.537916 0.822441 0.244143 0.266338 0.545623 0.229963 0.299630 1.452528 0.738286 0.424476 0.425709 0.772195 0.598335 0.304871 0.566276 0.301171 0.326449 0.361591 0.432491 0.135920 0.826708 0.154487 1.871221 0.214949 0.297310 0.922637 0.288979 0.179005 0.160676 0.122828 0.161152 0.174958 0.263033 0.119495 0.175672 0.088312 0.648417 0.309212 1.547251 2.185670 0.197572 1.488217 0.832897 0.198762 NaN NaN 0.122828 0.129017 0.432992 0.238991 0.239705 NaN 1.126875 Control Memantine S/C c-SC-m 0 -0.066044
2 0.296479 0.541739 0.300911 2.507018 3.555036 0.276779 0.216203 5.381680 0.236395 1.282935 0.329968 0.362472 0.312977 0.316917 0.860379 0.820241 1.717557 1.177542 0.425511 0.670771 0.315932 0.449150 0.208323 0.989411 1.982024 1.127062 0.267668 0.293031 0.727653 0.194287 0.442502 2.104654 0.546910 0.475991 0.465403 0.857917 0.658705 0.333908 0.595420 0.325536 0.344989 0.408520 0.381187 0.152949 0.869359 0.148893 1.565996 0.274294 0.296096 0.975156 0.061011 0.169174 0.173061 0.131993 0.161399 0.184722 0.159033 0.100220 0.207707 0.099375 0.584080 0.292885 1.417272 2.658949 0.181849 1.407301 0.892344 0.352206 0.148048 0.151766 0.131993 0.166638 0.376711 0.144499 0.189116 0.238465 1.195538 Ts65Dn Memantine S/C t-SC-m 0 -0.088480
3 0.228700 0.395179 0.234118 1.733184 2.220852 0.220665 0.161435 1.989723 0.185164 0.884342 0.255045 0.245703 0.221413 0.255792 0.636024 0.442638 1.015695 1.065022 0.408259 0.540172 0.238042 0.326981 0.212631 0.762892 1.425262 0.818199 0.216741 0.235426 0.559043 0.168161 0.309978 1.494208 0.661809 0.337444 0.309978 0.510650 0.463378 0.234679 0.400037 0.235239 0.231876 0.292788 0.460202 0.196736 0.804896 0.170807 2.628287 0.313327 0.669810 0.764098 0.536899 0.201269 0.169175 0.118948 0.174252 0.185131 0.395648 0.137081 0.201088 0.126927 0.631188 0.358114 1.437534 2.544515 0.179692 1.242248 0.976609 0.290843 0.216682 NaN 0.118948 0.158296 0.422121 0.321306 0.229193 0.355213 1.430825 Ts65Dn Saline S/C t-SC-s 0 -0.000076
4 0.814391 1.124141 0.388574 2.632156 4.972038 0.233940 0.133904 2.537785 0.218874 2.053513 1.086778 0.305773 0.368929 0.293721 0.857177 0.557189 1.760395 2.154634 0.509341 0.738580 0.671568 0.369531 0.197903 1.379173 3.171387 1.617814 0.279619 0.323973 0.821020 0.171026 0.482222 2.548994 0.346872 0.426058 0.296252 0.752441 0.613957 0.398698 0.553694 0.383512 0.322767 0.458720 0.269857 0.196742 0.980468 0.193295 1.580968 0.328737 0.442890 1.159503 0.096986 0.153893 0.192620 0.110232 0.152812 0.201744 0.181062 0.107326 0.212625 0.113341 0.497499 0.260814 1.701811 2.127061 0.114490 1.156934 0.892471 0.178021 0.124966 0.103136 0.110232 0.101446 0.474790 0.093877 0.120911 0.132265 1.985739 Ts65Dn Memantine C/S t-CS-m 0 -0.059018

결과 시각화

# Plot the Isolation Forest model created above.
plot_model(iforest)

예측 predict_model

# Run the Isolation Forest model on the held-out split.
# (Variable name corrected from the misspelled `predicitons`.)
predictions = predict_model(iforest, data=test)
# Show only the anomaly label and anomaly score for each held-out row.
predictions[['Anomaly', 'Anomaly_Score']]
Anomaly Anomaly_Score
0 0 -0.042698
1 0 -0.083798
2 0 -0.073029
3 0 -0.079531
4 0 -0.057472
... ... ...
211 0 -0.058016
212 0 -0.063982
213 0 -0.050281
214 0 -0.083473
215 1 0.007951

216 rows × 2 columns