경고 메시지 출력 표기 생략

from IPython.display import Image 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris 

import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
import pandas as pd
import numpy as np

import warnings

warnings.filterwarnings('ignore')

비지도 학습 (Unsupervised Learning)

비지도 학습(Unsupervised Learning)은 기계 학습의 일종으로, 데이터가 어떻게 구성되었는지를 알아내는 문제의 범주에 속합니다. 이 방법은 지도 학습(Supervised Learning) 혹은 강화 학습(Reinforcement Learning)과는 달리 입력값에 대한 목표치가 주어지지 않습니다.

차원 축소: PCA
군집화: KMeans Clustering, DBSCAN
군집화 평가 (실루엣 스코어)

차원 축소

feature의 개수를 줄이는 것을 넘어, 특징을 추출하는 역할을 하기도 함.
계산 비용을 감소하는 효과
전반적인 데이터에 대한 이해도를 높이는 효과

데이터 로드 (iris 데이터)

iris변수에 load_iris() 데이터를 로드하여 대입합니다.

# Load the iris dataset with load_iris() and bind the result to `iris`.
iris = load_iris()

df 변수에 로드한 데이터를 DataFrame으로 변환합니다.

# Convert the loaded feature matrix into a DataFrame, one column per feature name.
df = pd.DataFrame(iris['data'], columns=iris['feature_names'])

df.head()

# Attach the class labels: later cells colour and count by df['target'].
# Added after head() so the preview still shows only the four feature columns.
df['target'] = iris['target']

[출력 결과]

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

PCA 차원축소

주성분 분석(PCA)은 선형 차원 축소 기법으로, 가장 널리 사용되는 차원 축소 기법 중 하나입니다.

주요 특징중의 하나는 분산(variance)을 최대한 보존한다는 점입니다.

PCA의 원리에 관련된 블로그글

n_components에 0과 1 사이의 실수를 넣으면, 해당 비율만큼의 분산이 보존되도록 주성분 개수를 자동으로 선택
n_components에 1 이상의 정수를 넣으면, 해당 개수만큼의 주성분으로 feature를 축소

from sklearn.decomposition import PCA

# Declare PCA with default arguments (n_components is not given)
pca = PCA()

# Scale the feature columns (label slice from 'sepal length (cm)' through 'petal width (cm)')
data_scaled = StandardScaler().fit_transform(df.loc[:, 'sepal length (cm)': 'petal width (cm)'])

# Fit PCA on the scaled data and show the first 5 transformed rows
pca.fit_transform(data_scaled)[:5]

# Explained-variance ratio of each fitted component
pca.explained_variance_ratio_

# Refit with n_components=2
pca = PCA(n_components=2)

pca_data = pca.fit_transform(data_scaled)

# First 5 rows before (data_scaled) and after (pca_data) the projection
data_scaled[:5]

pca_data[:5]

# Scatter the two components, coloured by the df['target'] column
plt.scatter(pca_data[:, 0], pca_data[:, 1], c=df['target'])

# Refit with n_components=3; pca and pca_data are overwritten
pca = PCA(n_components=3)

pca_data = pca.fit_transform(data_scaled)

pca_data[:5]

# NOTE(review): Axes3D is not referenced by name below; older matplotlib
# releases needed this import to register the '3d' projection.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(111, projection='3d')  # Axes3D object

sample_size = 50  # NOTE(review): not read by any visible code; kept in case another cell uses it
# Plot the three PCA components, coloured by the df['target'] column.
ax.scatter(pca_data[:, 0], pca_data[:, 1], pca_data[:, 2], alpha=0.6, c=df['target'])
# Set the title BEFORE saving so the exported SVG includes it.
plt.title("ax.plot")
plt.savefig('./tmp.svg')
plt.show()

군집화

# Display the K-Means illustration image referenced by URL
Image('https://image.slidesharecdn.com/patternrecognitionbinoy-06-kmeansclustering-160317135729/95/pattern-recognition-binoy-k-means-clustering-13-638.jpg')

K-Means Clustering

K-Means Clustering 데모

군집화에서 가장 대중적으로 사용되는 알고리즘입니다.
centroid라는 중점을 기준으로 가장 가까운 포인트들을 선택하는 군집화 기법입니다.
뒤섞여 있는 데이터를 원하는 개수로 군집화 합니다.

사용되는 예제

스팸 문자 분류
뉴스 기사 분류

from sklearn.cluster import KMeans

# K-Means with 3 clusters; no random_state is passed here
kmeans = KMeans(n_clusters=3)

# Fit on the unscaled feature columns; KMeans.fit_transform returns the
# data transformed to cluster-distance space (one column per cluster centre)
cluster_data = kmeans.fit_transform(df.loc[:, 'sepal length (cm)': 'petal width (cm)'])

cluster_data[:5]

# Cluster index assigned to each sample
kmeans.labels_

시각화

# Side-by-side count plots: K-Means cluster sizes (left) vs. df['target'] counts (right).
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
# Pass the values as x=: seaborn 0.12+ treats the first positional argument
# as `data`, so a bare positional vector is no longer read as x.
sns.countplot(x=kmeans.labels_)
plt.title('K-Means Clustering', fontsize=15)
plt.subplot(1, 2, 2)
sns.countplot(x=df['target'])
plt.title('Iris Dataset', fontsize=15)
plt.show()

# Change a hyperparameter: refit K-Means with max_iter=500
kmeans = KMeans(n_clusters=3, max_iter=500)
cluster_data = kmeans.fit_transform(df.loc[:, 'sepal length (cm)': 'petal width (cm)'])

# Visualize: K-Means cluster sizes (left) vs. df['target'] counts (right)
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
# Pass the values as x=: seaborn 0.12+ treats the first positional argument
# as `data`, so a bare positional vector is no longer read as x.
sns.countplot(x=kmeans.labels_)
plt.title('K-Means Clustering', fontsize=15)
plt.subplot(1, 2, 2)
sns.countplot(x=df['target'])
plt.title('Iris Dataset', fontsize=15)
plt.show()

DBSCAN (Density-based spatial clustering of applications with noise)

밀도 기반 클러스터링

밀도가 높은 부분을 클러스터링 하는 방식
어느 점을 기준으로 반경 x(eps) 내에 점이 n개(min_samples) 이상 있으면 하나의 군집으로 인식하는 방식
KMeans 에서는 n_clusters(군집의 개수)를 반드시 지정해 주어야 하나, DBSCAN 에서는 필요없음
기하학적인 clustering도 잘 찾아냄

# Display the DBSCAN illustration image referenced by URL
Image('https://image.slidesharecdn.com/pydatanyc2015-151119175854-lva1-app6891/95/pydata-nyc-2015-automatically-detecting-outliers-with-datadog-26-638.jpg')

from sklearn.cluster import DBSCAN

# eps is the neighbourhood radius; min_samples is the number of points
# required within that radius (the "x" and "n" described in the text above)
dbscan = DBSCAN(eps=0.3, min_samples=2)

# Fit on the unscaled feature columns and get one cluster label per row
# (per the scikit-learn docs, noisy samples are labelled -1)
dbscan_data = dbscan.fit_predict(df.loc[:, 'sepal length (cm)': 'petal width (cm)'])

dbscan_data

실루엣 스코어 (군집화 평가)

실루엣 기법은 클러스터링의 품질을 정량적으로 계산해주는 방법입니다.

i번째 데이터 x(i)에 대한 실루엣 계수 s(i) 값은 아래의 식으로 정의됩니다: $s(i) = \dfrac{b(i) - a(i)}{\max\{a(i),\, b(i)\}}$

# Display the silhouette-coefficient formula image referenced by URL
Image(url='https://mblogthumb-phinf.pstatic.net/MjAxNzA1MzBfMjA0/MDAxNDk2MTI0NDAzMTk0.dPX6Uh3O4h7MMc0BdXOYyJ5C4W9EN6OimkGsUaisXVIg.E-PsOP3a_SBm_HyfupxheTav3ZkXAjg-iDcynQKHAmAg.PNG.samsjang/%25EC%25BA%25A1%25EC%25B2%25981.PNG?type=w800')

a(i)는 클러스터 내 데이터 응집도(cohesion)를 나타내는 값으로, x(i)와 같은 클러스터에 속한 다른 데이터들 사이의 평균 거리입니다. 이 거리가 작으면 응집도가 높습니다.
b(i)는 클러스터 간 분리도(separation)를 나타내는 값으로, x(i)와 가장 가까운 다른 클러스터에 속한 데이터들 사이의 평균 거리입니다. 이 값이 크다면, 클러스터 간극이 크다는 의미입니다.

만약 클러스터 개수가 최적화 되어 있다면 b(i)의 값은 크고, a(i)의 값은 작아집니다. 따라서 s(i)의 값은 1에 가까운 숫자가 됩니다.

반대로 클러스터내 데이터 응집도와 클러스터간 분리도의 값이 같으면 실루엣 계수 s(i)는 0이 됩니다.

클러스터링의 품질을 정량적으로 평가해 주는 지표

1: 클러스터링의 품질이 좋다
0: 클러스터링의 품질이 안좋다 (클러스터링의 의미 없음)
음수: 잘못 분류됨

from sklearn.metrics import silhouette_samples, silhouette_score

# Mean silhouette coefficient over all samples.
# NOTE(review): kmeans was fitted on the unscaled df feature columns, but the
# score here is computed on data_scaled — confirm this mix is intended.
score = silhouette_score(data_scaled, kmeans.labels_)
score

# Per-sample silhouette coefficients; show the first 5
samples = silhouette_samples(data_scaled, kmeans.labels_)
samples[:5]

api 참고(scikit-learn 공식 도큐먼트)

def plot_silhouette(X, num_cluesters):
    """Show a KMeans silhouette analysis for each requested cluster count.

    For every value in ``num_cluesters`` a new two-panel figure is drawn and
    shown. The left panel holds the sorted per-sample silhouette values,
    grouped by cluster, with the mean marked by a dashed red line. The right
    panel scatters the first two columns of ``X`` coloured by cluster, with
    the cluster centres numbered.

    Args:
        X: 2-D array-like of samples (rows) by features (columns) with at
            least two columns. It is converted with ``np.asarray`` so that a
            DataFrame works as well as an ndarray.
        num_cluesters: Iterable of cluster counts to evaluate. The
            (misspelled) name is kept for caller compatibility.

    Returns:
        None. The mean silhouette score is printed for each cluster count
        and each figure is displayed with ``plt.show()``.
    """
    # Positional slicing below (X[:, 0]) needs an ndarray; a DataFrame would
    # raise on it even though KMeans itself accepts one.
    X = np.asarray(X)

    for n_clusters in num_cluesters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot.
        # The silhouette coefficient can range from -1 to 1, but the x-axis is
        # fixed to [-0.1, 1] here.
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the
        # formed clusters.
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them (boolean indexing yields a copy, so the
            # in-place sort does not disturb sample_silhouette_values)
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot: leave a gap of 10 rows
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd plot showing the actual clusters formed (first two features only)
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors, edgecolor='k')

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                    c="white", alpha=1, s=200, edgecolor='k')

        # Overlay each centre's cluster index as a text marker
        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                        s=50, edgecolor='k')

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')

        plt.show()

빨간 점선은 평균 실루엣 스코어를 의미합니다.

# Run the silhouette analysis on data_scaled for 2, 3, 4 and 5 clusters
plot_silhouette(data_scaled, [2, 3, 4, 5])