!pip install pycaret
PyCaret 설치
Google Colab 사용자의 경우 다음의 코드를 실행합니다.
from pycaret.utils import enable_colab
enable_colab()
필요한 모듈 import
import pandas as pd
import numpy as np
import seaborn as sns
pd.options.display.max_columns = None
이상치 탐지 알고리즘
이상치 탐지 알고리즘의 활용 분야
- 제조
  - 공정 불량 원인을 감지
  - 설비의 이상 동작을 미리 감지/탐지
- 금융
  - 신용카드 탈취, 악용 사례 탐지
  - 금융 분야의 비정상 거래 탐지 (피싱)
- 고객 관리
  - 제품/서비스 고객의 이탈을 사전에 감지
- 보안
  - 네트워크 침해사고 예방/탐지
- 헬스케어
  - 건강의 이상신호 감지 및 사전 알림
- 자연어
  - 소비자 의견 수렴 (독특하고 창의적인 의견 수렴)
관련 캐글 대회/데이터셋
실습을 위한 데이터셋 로드
from pycaret.datasets import get_data
dataset = get_data('mice')
| | MouseID | DYRK1A_N | ITSN1_N | BDNF_N | NR1_N | NR2A_N | pAKT_N | pBRAF_N | pCAMKII_N | pCREB_N | pELK_N | pERK_N | pJNK_N | PKCA_N | pMEK_N | pNR1_N | pNR2A_N | pNR2B_N | pPKCAB_N | pRSK_N | AKT_N | BRAF_N | CAMKII_N | CREB_N | ELK_N | ERK_N | GSK3B_N | JNK_N | MEK_N | TRKA_N | RSK_N | APP_N | Bcatenin_N | SOD1_N | MTOR_N | P38_N | pMTOR_N | DSCR1_N | AMPKA_N | NR2B_N | pNUMB_N | RAPTOR_N | TIAM1_N | pP70S6_N | NUMB_N | P70S6_N | pGSK3B_N | pPKCG_N | CDK5_N | S6_N | ADARB1_N | AcetylH3K9_N | RRP1_N | BAX_N | ARC_N | ERBB4_N | nNOS_N | Tau_N | GFAP_N | GluR3_N | GluR4_N | IL1B_N | P3525_N | pCASP9_N | PSD95_N | SNCA_N | Ubiquitin_N | pGSK3B_Tyr216_N | SHH_N | BAD_N | BCL2_N | pS6_N | pCFOS_N | SYP_N | H3AcK18_N | EGR1_N | H3MeK4_N | CaNA_N | Genotype | Treatment | Behavior | class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 309_1 | 0.503644 | 0.747193 | 0.430175 | 2.816329 | 5.990152 | 0.218830 | 0.177565 | 2.373744 | 0.232224 | 1.750936 | 0.687906 | 0.306382 | 0.402698 | 0.296927 | 1.022060 | 0.605673 | 1.877684 | 2.308745 | 0.441599 | 0.859366 | 0.416289 | 0.369608 | 0.178944 | 1.866358 | 3.685247 | 1.537227 | 0.264526 | 0.319677 | 0.813866 | 0.165846 | 0.453910 | 3.037621 | 0.369510 | 0.458539 | 0.335336 | 0.825192 | 0.576916 | 0.448099 | 0.586271 | 0.394721 | 0.339571 | 0.482864 | 0.294170 | 0.182150 | 0.842725 | 0.192608 | 1.443091 | 0.294700 | 0.354605 | 1.339070 | 0.170119 | 0.159102 | 0.188852 | 0.106305 | 0.144989 | 0.176668 | 0.125190 | 0.115291 | 0.228043 | 0.142756 | 0.430957 | 0.247538 | 1.603310 | 2.014875 | 0.108234 | 1.044979 | 0.831557 | 0.188852 | 0.122652 | NaN | 0.106305 | 0.108336 | 0.427099 | 0.114783 | 0.131790 | 0.128186 | 1.675652 | Control | Memantine | C/S | c-CS-m |
| 1 | 309_2 | 0.514617 | 0.689064 | 0.411770 | 2.789514 | 5.685038 | 0.211636 | 0.172817 | 2.292150 | 0.226972 | 1.596377 | 0.695006 | 0.299051 | 0.385987 | 0.281319 | 0.956676 | 0.587559 | 1.725774 | 2.043037 | 0.445222 | 0.834659 | 0.400364 | 0.356178 | 0.173680 | 1.761047 | 3.485287 | 1.509249 | 0.255727 | 0.304419 | 0.780504 | 0.157194 | 0.430940 | 2.921882 | 0.342279 | 0.423560 | 0.324835 | 0.761718 | 0.545097 | 0.420876 | 0.545097 | 0.368255 | 0.321959 | 0.454519 | 0.276431 | 0.182086 | 0.847615 | 0.194815 | 1.439460 | 0.294060 | 0.354548 | 1.306323 | 0.171427 | 0.158129 | 0.184570 | 0.106592 | 0.150471 | 0.178309 | 0.134275 | 0.118235 | 0.238073 | 0.142037 | 0.457156 | 0.257632 | 1.671738 | 2.004605 | 0.109749 | 1.009883 | 0.849270 | 0.200404 | 0.116682 | NaN | 0.106592 | 0.104315 | 0.441581 | 0.111974 | 0.135103 | 0.131119 | 1.743610 | Control | Memantine | C/S | c-CS-m |
| 2 | 309_3 | 0.509183 | 0.730247 | 0.418309 | 2.687201 | 5.622059 | 0.209011 | 0.175722 | 2.283337 | 0.230247 | 1.561316 | 0.677348 | 0.291276 | 0.381002 | 0.281710 | 1.003635 | 0.602449 | 1.731873 | 2.017984 | 0.467668 | 0.814329 | 0.399847 | 0.368089 | 0.173905 | 1.765544 | 3.571456 | 1.501244 | 0.259614 | 0.311747 | 0.785154 | 0.160895 | 0.423187 | 2.944136 | 0.343696 | 0.425005 | 0.324852 | 0.757031 | 0.543620 | 0.404630 | 0.552994 | 0.363880 | 0.313086 | 0.447197 | 0.256648 | 0.184388 | 0.856166 | 0.200737 | 1.524364 | 0.301881 | 0.386087 | 1.279600 | 0.185456 | 0.148696 | 0.190532 | 0.108303 | 0.145330 | 0.176213 | 0.132560 | 0.117760 | 0.244817 | 0.142445 | 0.510472 | 0.255343 | 1.663550 | 2.016831 | 0.108196 | 0.996848 | 0.846709 | 0.193685 | 0.118508 | NaN | 0.108303 | 0.106219 | 0.435777 | 0.111883 | 0.133362 | 0.127431 | 1.926427 | Control | Memantine | C/S | c-CS-m |
| 3 | 309_4 | 0.442107 | 0.617076 | 0.358626 | 2.466947 | 4.979503 | 0.222886 | 0.176463 | 2.152301 | 0.207004 | 1.595086 | 0.583277 | 0.296729 | 0.377087 | 0.313832 | 0.875390 | 0.520293 | 1.566852 | 2.132754 | 0.477671 | 0.727705 | 0.385639 | 0.362970 | 0.179449 | 1.286277 | 2.970137 | 1.419710 | 0.259536 | 0.279218 | 0.734492 | 0.162210 | 0.410615 | 2.500204 | 0.344509 | 0.429211 | 0.330121 | 0.746980 | 0.546763 | 0.386860 | 0.547849 | 0.366771 | 0.328492 | 0.442650 | 0.398534 | 0.161768 | 0.760234 | 0.184169 | 1.612382 | 0.296382 | 0.290680 | 1.198765 | 0.159799 | 0.166112 | 0.185323 | 0.103184 | 0.140656 | 0.163804 | 0.123210 | 0.117439 | 0.234947 | 0.145068 | 0.430996 | 0.251103 | 1.484624 | 1.957233 | 0.119883 | 0.990225 | 0.833277 | 0.192112 | 0.132781 | NaN | 0.103184 | 0.111262 | 0.391691 | 0.130405 | 0.147444 | 0.146901 | 1.700563 | Control | Memantine | C/S | c-CS-m |
| 4 | 309_5 | 0.434940 | 0.617430 | 0.358802 | 2.365785 | 4.718679 | 0.213106 | 0.173627 | 2.134014 | 0.192158 | 1.504230 | 0.550960 | 0.286961 | 0.363502 | 0.277964 | 0.864912 | 0.507990 | 1.480059 | 2.013697 | 0.483416 | 0.687794 | 0.367531 | 0.355311 | 0.174836 | 1.324695 | 2.896334 | 1.359876 | 0.250705 | 0.273667 | 0.702699 | 0.154827 | 0.398550 | 2.456560 | 0.329126 | 0.408755 | 0.313415 | 0.691956 | 0.536860 | 0.360816 | 0.512824 | 0.351551 | 0.312206 | 0.419095 | 0.393447 | 0.160200 | 0.768113 | 0.185718 | 1.645807 | 0.296829 | 0.309345 | 1.206995 | 0.164650 | 0.160687 | 0.188221 | 0.104784 | 0.141983 | 0.167710 | 0.136838 | 0.116048 | 0.255528 | 0.140871 | 0.481227 | 0.251773 | 1.534835 | 2.009109 | 0.119524 | 0.997775 | 0.878668 | 0.205604 | 0.129954 | NaN | 0.104784 | 0.110694 | 0.434154 | 0.118481 | 0.140314 | 0.148380 | 1.839730 | Control | Memantine | C/S | c-CS-m |
이 튜토리얼에서는 Mice Protein Expression이라는 UCI의 데이터 세트를 사용합니다. 데이터세트는 피질의 핵 분획에서 감지 가능한 신호를 생성한 77개의 단백질/단백질 변형의 발현 수준으로 구성됩니다. 데이터 세트에는 단백질당 총 1080개의 측정값이 포함되어 있습니다. 각 측정은 독립적인 샘플/마우스로 간주될 수 있습니다.
# Hold-out split: a reproducible random 80% of the rows becomes the training
# set and the remaining rows become the prediction (test) set.
train = dataset.sample(frac=0.8, random_state=123)
# The complement is selected through train's ORIGINAL index labels, so it
# must be computed before train is re-indexed below.
test = dataset.drop(train.index).reset_index(drop=True)
train = train.reset_index(drop=True)
print('학습용 데이터셋: ' + str(train.shape))
print('예측용 데이터셋: ' + str(test.shape))
print('예측용 데이터셋: ' + str(test.shape))
학습용 데이터셋: (864, 82)
예측용 데이터셋: (216, 82)
셋업 setup
from pycaret.anomaly import * # 이상 탐지
MouseID는 순차적인 단순 ID 값이기 때문에 데이터 분석 및 모델의 학습에서 제외합니다.
s = setup(train,
normalize=True, # 데이터 정규화
ignore_features=['MouseID'], # 학습에 무시할 컬럼 지정
session_id=123) # 시드(SEED) 지정
| | Description | Value |
|---|---|---|
| 0 | Session id | 123 |
| 1 | Original data shape | (864, 81) |
| 2 | Transformed data shape | (864, 88) |
| 3 | Ignore features | 1 |
| 4 | Ordinal features | 3 |
| 5 | Numeric features | 77 |
| 6 | Categorical features | 4 |
| 7 | Rows with missing values | 49.3% |
| 8 | Preprocess | True |
| 9 | Imputation type | simple |
| 10 | Numeric imputation | mean |
| 11 | Categorical imputation | constant |
| 12 | Maximum one-hot encoding | -1 |
| 13 | Encoding method | None |
| 14 | Low variance threshold | 0 |
| 15 | Normalize | True |
| 16 | Normalize method | zscore |
| 17 | CPU Jobs | -1 |
| 18 | Use GPU | False |
| 19 | Log Experiment | False |
| 20 | Experiment Name | anomaly-default-name |
| 21 | USI | 365d |
모델 생성
iforest 모델
# Show the remote PNG inline in the notebook with width=750. As the last
# expression of its cell, the Image object is what gets rendered.
# NOTE(review): presumably an illustration of the iforest model this section
# introduces — confirm against the URL.
from IPython.display import Image
Image(url='https://miro.medium.com/max/1400/1*4P2vi2YVj4nHbU5SZ9i7Ig.png', width=750)
# isolation forest 모델 생성
iforest = create_model('iforest')
이상치 탐지: assign_model()
# Label the data with the fitted iforest model. The preview that follows
# shows the result carrying two extra columns, `Anomaly` and `Anomaly_Score`.
iforest_results = assign_model(iforest)
iforest_results.head()
| | DYRK1A_N | ITSN1_N | BDNF_N | NR1_N | NR2A_N | pAKT_N | pBRAF_N | pCAMKII_N | pCREB_N | pELK_N | pERK_N | pJNK_N | PKCA_N | pMEK_N | pNR1_N | pNR2A_N | pNR2B_N | pPKCAB_N | pRSK_N | AKT_N | BRAF_N | CAMKII_N | CREB_N | ELK_N | ERK_N | GSK3B_N | JNK_N | MEK_N | TRKA_N | RSK_N | APP_N | Bcatenin_N | SOD1_N | MTOR_N | P38_N | pMTOR_N | DSCR1_N | AMPKA_N | NR2B_N | pNUMB_N | RAPTOR_N | TIAM1_N | pP70S6_N | NUMB_N | P70S6_N | pGSK3B_N | pPKCG_N | CDK5_N | S6_N | ADARB1_N | AcetylH3K9_N | RRP1_N | BAX_N | ARC_N | ERBB4_N | nNOS_N | Tau_N | GFAP_N | GluR3_N | GluR4_N | IL1B_N | P3525_N | pCASP9_N | PSD95_N | SNCA_N | Ubiquitin_N | pGSK3B_Tyr216_N | SHH_N | BAD_N | BCL2_N | pS6_N | pCFOS_N | SYP_N | H3AcK18_N | EGR1_N | H3MeK4_N | CaNA_N | Genotype | Treatment | Behavior | class | Anomaly | Anomaly_Score |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.274204 | 0.491040 | 0.235014 | 1.768983 | 2.596107 | 0.183060 | 0.132654 | 2.442697 | 0.169009 | 0.858450 | 0.297796 | 0.251773 | 0.232048 | 0.204976 | 0.646513 | 0.519531 | 1.089210 | 1.070259 | 0.399639 | 0.550342 | 0.219801 | 0.260410 | 0.151992 | 0.854067 | 1.756607 | 0.887456 | 0.178419 | 0.210777 | 0.499291 | 0.120794 | 0.337759 | 1.629238 | 0.467449 | 0.391002 | 0.363156 | 0.557948 | 0.447596 | 0.291092 | 0.440634 | 0.295088 | 0.248163 | 0.355550 | 0.473250 | 0.199117 | 0.910373 | 0.164188 | 2.375492 | 0.299683 | 0.605220 | 0.822474 | 0.286729 | 0.173208 | 0.199981 | 0.117935 | 0.151617 | 0.211784 | 0.306113 | 0.123021 | 0.236925 | 0.140582 | 0.553978 | 0.327416 | 1.630266 | 2.187122 | 0.162844 | 1.117071 | 0.938106 | 0.234622 | 0.187890 | 0.129738 | 0.117935 | 0.143652 | 0.367911 | 0.177526 | 0.177718 | 0.220804 | 1.202860 | Ts65Dn | Saline | S/C | t-SC-s | 0 | -0.050177 |
| 1 | 0.196671 | 0.364057 | 0.272195 | 1.720407 | 2.205302 | 0.295314 | 0.252774 | 3.852035 | 0.232737 | 1.316584 | 0.313810 | 0.349877 | 0.300863 | 0.366215 | 0.670160 | 0.811344 | 1.202836 | 0.961467 | 0.462392 | 0.585080 | 0.386868 | 0.401048 | 0.226264 | 0.855425 | 1.537916 | 0.822441 | 0.244143 | 0.266338 | 0.545623 | 0.229963 | 0.299630 | 1.452528 | 0.738286 | 0.424476 | 0.425709 | 0.772195 | 0.598335 | 0.304871 | 0.566276 | 0.301171 | 0.326449 | 0.361591 | 0.432491 | 0.135920 | 0.826708 | 0.154487 | 1.871221 | 0.214949 | 0.297310 | 0.922637 | 0.288979 | 0.179005 | 0.160676 | 0.122828 | 0.161152 | 0.174958 | 0.263033 | 0.119495 | 0.175672 | 0.088312 | 0.648417 | 0.309212 | 1.547251 | 2.185670 | 0.197572 | 1.488217 | 0.832897 | 0.198762 | NaN | NaN | 0.122828 | 0.129017 | 0.432992 | 0.238991 | 0.239705 | NaN | 1.126875 | Control | Memantine | S/C | c-SC-m | 0 | -0.066044 |
| 2 | 0.296479 | 0.541739 | 0.300911 | 2.507018 | 3.555036 | 0.276779 | 0.216203 | 5.381680 | 0.236395 | 1.282935 | 0.329968 | 0.362472 | 0.312977 | 0.316917 | 0.860379 | 0.820241 | 1.717557 | 1.177542 | 0.425511 | 0.670771 | 0.315932 | 0.449150 | 0.208323 | 0.989411 | 1.982024 | 1.127062 | 0.267668 | 0.293031 | 0.727653 | 0.194287 | 0.442502 | 2.104654 | 0.546910 | 0.475991 | 0.465403 | 0.857917 | 0.658705 | 0.333908 | 0.595420 | 0.325536 | 0.344989 | 0.408520 | 0.381187 | 0.152949 | 0.869359 | 0.148893 | 1.565996 | 0.274294 | 0.296096 | 0.975156 | 0.061011 | 0.169174 | 0.173061 | 0.131993 | 0.161399 | 0.184722 | 0.159033 | 0.100220 | 0.207707 | 0.099375 | 0.584080 | 0.292885 | 1.417272 | 2.658949 | 0.181849 | 1.407301 | 0.892344 | 0.352206 | 0.148048 | 0.151766 | 0.131993 | 0.166638 | 0.376711 | 0.144499 | 0.189116 | 0.238465 | 1.195538 | Ts65Dn | Memantine | S/C | t-SC-m | 0 | -0.088480 |
| 3 | 0.228700 | 0.395179 | 0.234118 | 1.733184 | 2.220852 | 0.220665 | 0.161435 | 1.989723 | 0.185164 | 0.884342 | 0.255045 | 0.245703 | 0.221413 | 0.255792 | 0.636024 | 0.442638 | 1.015695 | 1.065022 | 0.408259 | 0.540172 | 0.238042 | 0.326981 | 0.212631 | 0.762892 | 1.425262 | 0.818199 | 0.216741 | 0.235426 | 0.559043 | 0.168161 | 0.309978 | 1.494208 | 0.661809 | 0.337444 | 0.309978 | 0.510650 | 0.463378 | 0.234679 | 0.400037 | 0.235239 | 0.231876 | 0.292788 | 0.460202 | 0.196736 | 0.804896 | 0.170807 | 2.628287 | 0.313327 | 0.669810 | 0.764098 | 0.536899 | 0.201269 | 0.169175 | 0.118948 | 0.174252 | 0.185131 | 0.395648 | 0.137081 | 0.201088 | 0.126927 | 0.631188 | 0.358114 | 1.437534 | 2.544515 | 0.179692 | 1.242248 | 0.976609 | 0.290843 | 0.216682 | NaN | 0.118948 | 0.158296 | 0.422121 | 0.321306 | 0.229193 | 0.355213 | 1.430825 | Ts65Dn | Saline | S/C | t-SC-s | 0 | -0.000076 |
| 4 | 0.814391 | 1.124141 | 0.388574 | 2.632156 | 4.972038 | 0.233940 | 0.133904 | 2.537785 | 0.218874 | 2.053513 | 1.086778 | 0.305773 | 0.368929 | 0.293721 | 0.857177 | 0.557189 | 1.760395 | 2.154634 | 0.509341 | 0.738580 | 0.671568 | 0.369531 | 0.197903 | 1.379173 | 3.171387 | 1.617814 | 0.279619 | 0.323973 | 0.821020 | 0.171026 | 0.482222 | 2.548994 | 0.346872 | 0.426058 | 0.296252 | 0.752441 | 0.613957 | 0.398698 | 0.553694 | 0.383512 | 0.322767 | 0.458720 | 0.269857 | 0.196742 | 0.980468 | 0.193295 | 1.580968 | 0.328737 | 0.442890 | 1.159503 | 0.096986 | 0.153893 | 0.192620 | 0.110232 | 0.152812 | 0.201744 | 0.181062 | 0.107326 | 0.212625 | 0.113341 | 0.497499 | 0.260814 | 1.701811 | 2.127061 | 0.114490 | 1.156934 | 0.892471 | 0.178021 | 0.124966 | 0.103136 | 0.110232 | 0.101446 | 0.474790 | 0.093877 | 0.120911 | 0.132265 | 1.985739 | Ts65Dn | Memantine | C/S | t-CS-m | 0 | -0.059018 |
결과 시각화
plot_model(iforest)
예측 predict_model
# Score the held-out `test` rows with the fitted iforest model.
# NOTE(review): `predicitons` is a misspelling of `predictions`; the name is
# kept because the following line reads it under this spelling.
predicitons = predict_model(iforest, data=test)
predicitons[['Anomaly', 'Anomaly_Score']]
| | Anomaly | Anomaly_Score |
|---|---|---|
| 0 | 0 | -0.042698 |
| 1 | 0 | -0.083798 |
| 2 | 0 | -0.073029 |
| 3 | 0 | -0.079531 |
| 4 | 0 | -0.057472 |
| ... | ... | ... |
| 211 | 0 | -0.058016 |
| 212 | 0 | -0.063982 |
| 213 | 0 | -0.050281 |
| 214 | 0 | -0.083473 |
| 215 | 1 | 0.007951 |
216 rows × 2 columns