샘플 데이터와 Out-of-Distribution 모델¶
In [1]:
!pip install -U imbalanced-learn
Requirement already satisfied: imbalanced-learn in /usr/local/lib/python3.7/dist-packages (0.8.0) Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.7/dist-packages (from imbalanced-learn) (1.19.5) Requirement already satisfied: scikit-learn>=0.24 in /usr/local/lib/python3.7/dist-packages (from imbalanced-learn) (0.24.2) Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from imbalanced-learn) (1.0.1) Requirement already satisfied: scipy>=0.19.1 in /usr/local/lib/python3.7/dist-packages (from imbalanced-learn) (1.4.1) Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.24->imbalanced-learn) (2.2.0)
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(2021)
1. Data¶
1.1 Sample Data¶
In [3]:
from sklearn.datasets import make_moons
# Generate 300 shuffled two-class "moons" samples with noise level 0.5,
# using a fixed random_state so the same points are produced on every run.
data, label = make_moons(n_samples=300, shuffle=True, noise=0.5, random_state=2021)
In [4]:
plt.scatter(data[:, 0], data[:, 1], c=label)
Out[4]:
<matplotlib.collections.PathCollection at 0x7f16f8c49bd0>
1.2 Resample Data¶
In [5]:
from imblearn.datasets import make_imbalance
from collections import Counter
def ratio_func(y, multiplier, minority_class):
    """Build a sampling strategy that shrinks one class by a fixed factor.

    Args:
        y: Iterable of class labels.
        multiplier: Factor applied to the current count of ``minority_class``.
        minority_class: Label of the class to down-sample.

    Returns:
        A one-entry dict ``{minority_class: target_count}``, where
        ``target_count`` is ``multiplier * count`` truncated toward zero by
        ``int()``. A label absent from ``y`` has count 0.
    """
    target_stats = Counter(y)
    return {minority_class: int(multiplier * target_stats[minority_class])}
# Down-sample class 1 to 10% of its current size. The two extra keyword
# arguments are the parameters ratio_func expects besides the labels.
# NOTE: this overwrites data/label, so re-running this cell on its own
# shrinks class 1 again.
data, label = make_imbalance(
    data, label, sampling_strategy=ratio_func, multiplier=0.1, minority_class=1
)
In [6]:
plt.scatter(data[:, 0], data[:, 1], c=label)
Out[6]:
<matplotlib.collections.PathCollection at 0x7f16f3196810>
1.3 Split Data¶
In [7]:
# Boolean masks for the two classes: label 0 is treated as "normal",
# label 1 as "abnormal".
is_normal = label == 0
is_abnormal = label == 1
normal_data, normal_label = data[is_normal], label[is_normal]
abnormal_data, abnormal_label = data[is_abnormal], label[is_abnormal]
In [8]:
normal_label
Out[8]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
In [9]:
abnormal_label
Out[9]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
In [10]:
from sklearn.model_selection import train_test_split
# Only normal samples are split: 70% become the training set and the
# remaining 30% are held out as the normal part of the test set.
train_data, test_normal_data, train_label, test_normal_label = train_test_split(
    normal_data,
    normal_label,
    random_state=2021,
    train_size=0.7,
)
In [11]:
# Test set = held-out normal samples followed by every abnormal sample.
test_data = np.concatenate((test_normal_data, abnormal_data), axis=0)
test_label = np.concatenate((test_normal_label, abnormal_label), axis=0)
In [12]:
plt.scatter(train_data[:, 0], train_data[:, 1], c=train_label)
Out[12]:
<matplotlib.collections.PathCollection at 0x7f16f3147c90>
In [13]:
test_label.mean()
Out[13]:
0.25
In [14]:
plt.scatter(test_data[:, 0], test_data[:, 1], c=test_label)
Out[14]:
<matplotlib.collections.PathCollection at 0x7f16f30bb290>
1.4 시각화 데이터¶
In [15]:
# Dense evaluation grid covering the data range padded by 1 on every side;
# the models are later evaluated on it to draw their decision regions.
grid_step = 0.02
x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, grid_step),
    np.arange(y_min, y_max, grid_step),
)
2. Isolation Forest¶
In [16]:
from sklearn.ensemble import IsolationForest
isol_forest = IsolationForest()
2.1 학습 & 예측¶
In [17]:
# Fit on the training split, which contains normal (label 0) samples only.
# NOTE(review): IsolationForest is unsupervised, so train_label is presumably
# ignored by fit — confirm against the scikit-learn docs.
isol_forest.fit(train_data, train_label)
Out[17]:
IsolationForest()
In [18]:
isol_test_pred = isol_forest.predict(test_data)
In [19]:
isol_test_pred
Out[19]:
array([ 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1,
-1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1,
-1, 1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, -1,
-1, -1, 1, -1, -1, 1, -1, -1, -1])
In [21]:
isol_forest.decision_function(test_data)
Out[21]:
array([ 0.07920637, 0.01568211, -0.0007057 , 0.02886496, 0.02354266,
0.0608095 , 0.07836153, 0.07959195, 0.0777155 , 0.08368566,
0.08096668, 0.01768742, 0.08804025, -0.00679535, 0.09107424,
-0.07279372, 0.09572535, -0.0502307 , 0.03196264, 0.0340732 ,
0.03109257, 0.05498848, 0.01211572, -0.18090731, 0.01593877,
0.00788293, -0.06141088, 0.05915229, 0.04951808, 0.0617401 ,
0.08176808, 0.00573826, 0.05432684, 0.07215502, -0.00099358,
0.06042053, -0.00692154, -0.03889864, 0.04679394, 0.05090291,
0.07737391, 0.0739545 , 0.05009019, 0.07951504, 0.00591456,
-0.11496344, 0.06394809, -0.09365931, -0.06585884, 0.02217839,
-0.11233681, -0.06277224, -0.17346579, 0.0744311 , -0.14945106,
-0.13751562, 0.08314092, -0.11029918, -0.09093424, -0.11188274])
2.2 평가¶
정확도
In [22]:
from sklearn.metrics import accuracy_score
# The model predicts -1 / 1, while test_label uses 1 for abnormal samples;
# `== -1` turns the predictions into booleans where True means "abnormal".
isol_test_acc = accuracy_score(test_label, isol_test_pred == -1)
In [23]:
print(f"Isolation Forest Test Accuracy is {isol_test_acc:.4f}")
Isolation Forest Test Accuracy is 0.7833
F1 Score
In [24]:
from sklearn.metrics import f1_score
isol_test_f1 = f1_score(test_label, isol_test_pred == -1)
In [25]:
print(f"Isolation Forest Test F1-Score is {isol_test_f1:.4f}")
Isolation Forest Test F1-Score is 0.6286
2.3 시각화¶
In [26]:
# Classify every grid point, then fold the flat predictions back into the
# 2-D shape of the mesh so they can be drawn with contourf.
isol_grid_points = np.c_[xx.ravel(), yy.ravel()]
isol_Z = isol_forest.predict(isol_grid_points).reshape(xx.shape)
In [27]:
cs = plt.contourf(xx, yy, isol_Z, cmap=plt.cm.Paired)
plt.scatter(train_data[:,0], train_data[:,1], c=train_label)
Out[27]:
<matplotlib.collections.PathCollection at 0x7f16f2f81550>
In [28]:
cs = plt.contourf(xx, yy, isol_Z, cmap=plt.cm.Paired)
plt.scatter(test_data[:,0], test_data[:,1], c=test_label)
Out[28]:
<matplotlib.collections.PathCollection at 0x7f16f2f81690>
3. OCSVM¶
3.1 학습 & 예측¶
In [29]:
from sklearn.svm import OneClassSVM
ocsvm = OneClassSVM()
In [30]:
# Fit on the training split, which contains normal (label 0) samples only.
# NOTE(review): OneClassSVM is unsupervised, so train_label is presumably
# ignored by fit — confirm against the scikit-learn docs.
ocsvm.fit(train_data, train_label)
Out[30]:
OneClassSVM()
In [31]:
ocsvm_test_pred = ocsvm.predict(test_data)
In [32]:
ocsvm_test_pred
Out[32]:
array([ 1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1, 1,
-1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, 1,
-1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, -1, -1, -1,
-1, -1, 1, -1, -1, 1, -1, -1, -1])
3.2 평가¶
정확도
In [35]:
# The model predicts -1 / 1, while test_label uses 1 for abnormal samples;
# `== -1` turns the predictions into booleans where True means "abnormal".
ocsvm_test_acc = accuracy_score(test_label, ocsvm_test_pred == -1)
In [36]:
print(f"OCSVM Test Accuracy is {ocsvm_test_acc:.4f}")
OCSVM Test Accuracy is 0.5667
F1 Score
In [37]:
ocsvm_test_f1 = f1_score(test_label, ocsvm_test_pred == -1)
In [38]:
print(f"OCSVM Test F1-Score is {ocsvm_test_f1:.4f}")
OCSVM Test F1-Score is 0.4800
3.3 시각화¶
In [39]:
# Classify every grid point, then fold the flat predictions back into the
# 2-D shape of the mesh so they can be drawn with contourf.
ocsvm_grid_points = np.c_[xx.ravel(), yy.ravel()]
ocsvm_Z = ocsvm.predict(ocsvm_grid_points).reshape(xx.shape)
In [40]:
cs = plt.contourf(xx, yy, ocsvm_Z, cmap=plt.cm.Paired)
plt.scatter(train_data[:,0], train_data[:,1], c=train_label)
Out[40]:
<matplotlib.collections.PathCollection at 0x7f16f2cec190>
In [41]:
cs = plt.contourf(xx, yy, ocsvm_Z, cmap=plt.cm.Paired)
plt.scatter(test_data[:,0], test_data[:,1], c=test_label)
Out[41]:
<matplotlib.collections.PathCollection at 0x7f16f2c0ce90>
4. PCA¶
4.1 학습 & 예측¶
In [42]:
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
In [43]:
pca.fit(train_data)
Out[43]:
PCA(n_components=1)
In [44]:
test_latent = pca.transform(test_data)
In [46]:
test_latent[:10]
Out[46]:
array([[ 2.96603448e-01],
[ 1.24231267e+00],
[ 1.06281107e+00],
[-7.40403889e-01],
[-1.04768362e+00],
[ 1.06747491e-01],
[ 2.14169771e-01],
[-2.90818242e-01],
[-6.48459113e-01],
[-2.02332106e-04]])
In [47]:
test_recon = pca.inverse_transform(test_latent)
In [49]:
# Element-wise squared difference between each test sample and its
# reconstruction from the 1-component PCA projection.
recon_diff = (test_data - test_recon) ** 2
In [51]:
test_data[0]
Out[51]:
array([0.37198025, 0.49392302])
In [52]:
test_recon[0]
Out[52]:
array([0.34662995, 0.7059456 ])
In [50]:
recon_diff[0]
Out[50]:
array([0.00064264, 0.04495357])
In [53]:
# Anomaly score per sample: mean of the squared reconstruction errors over
# the features (axis 1).
pca_pred = recon_diff.mean(1)
In [54]:
pca_pred[:10]
Out[54]:
array([2.27981061e-02, 1.27982521e-02, 2.23592395e-01, 2.53514995e-01,
2.49587527e-01, 1.10212458e-01, 4.89886192e-02, 1.10896685e-02,
5.78102995e-05, 7.73374114e-03])
4.2 평가¶
In [55]:
from sklearn.metrics import roc_curve, auc
# ROC curve of the reconstruction-error scores against the test labels
# (label 1 = abnormal), followed by the area under that curve.
fpr, tpr, threshold = roc_curve(test_label, pca_pred)
pca_auroc = auc(fpr, tpr)
In [56]:
plt.plot(fpr, tpr)
Out[56]:
[<matplotlib.lines.Line2D at 0x7f16ec916b10>]
In [57]:
print(f"PCA test AUROC is {pca_auroc:.4f}")
PCA test AUROC is 0.7052
Best Threshold
In [58]:
# Try every candidate threshold from the ROC computation and record the
# F1-score of the resulting binary prediction.
# NOTE: thresholds are scored on the test labels themselves, so the best
# F1 found here is an optimistic estimate.
f1_scores = []
for cutoff in threshold:
    pca_test_pred = pca_pred > cutoff
    pca_test_f1 = f1_score(test_label, pca_test_pred)
    f1_scores.append(pca_test_f1)
    print(f"threshold: {cutoff:.4f}, f1-score: {pca_test_f1:.4f}")
threshold: 2.7621, f1-score: 0.0000 threshold: 1.7621, f1-score: 0.0000 threshold: 1.2702, f1-score: 0.2353 threshold: 0.9820, f1-score: 0.3333 threshold: 0.7935, f1-score: 0.4000 threshold: 0.7125, f1-score: 0.4762 threshold: 0.5471, f1-score: 0.5833 threshold: 0.2236, f1-score: 0.4444 threshold: 0.1373, f1-score: 0.4324 threshold: 0.1082, f1-score: 0.4390 threshold: 0.1044, f1-score: 0.4286 threshold: 0.0748, f1-score: 0.4348 threshold: 0.0567, f1-score: 0.4255 threshold: 0.0267, f1-score: 0.3929 threshold: 0.0229, f1-score: 0.3860 threshold: 0.0187, f1-score: 0.4068 threshold: 0.0163, f1-score: 0.4000 threshold: 0.0145, f1-score: 0.4262 threshold: 0.0140, f1-score: 0.4194 threshold: 0.0077, f1-score: 0.4179 threshold: 0.0076, f1-score: 0.4118 threshold: 0.0001, f1-score: 0.4054
In [60]:
# Pick the threshold with the highest F1-score; np.argmax returns the first
# index when several thresholds tie.
best_thresh = threshold[np.argmax(f1_scores)]
best_thresh
Out[60]:
0.5471488977187242
In [61]:
pca_test_pred = pca_pred > best_thresh
In [62]:
pca_test_pred
Out[62]:
array([False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, True, False, False,
False, False, False, False, False, True, False, False, False,
False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False,
False, False, True, False, False, True, False, True, False,
True, True, False, False, True, True])
정확도
In [63]:
pca_test_acc = accuracy_score(test_label, pca_test_pred)
In [64]:
print(f"PCA Test Accuracy is {pca_test_acc:.4f}")
PCA Test Accuracy is 0.8333
F1 Score
In [65]:
pca_test_f1 = f1_score(test_label, pca_test_pred)
In [66]:
print(f"PCA Test F1-Score is {pca_test_f1:.4f}")
PCA Test F1-Score is 0.5833
4.3 시각화¶
In [67]:
# Score every grid point the same way the test samples were scored: project
# onto the principal component, reconstruct, and take the mean squared error.
Z = np.c_[xx.ravel(), yy.ravel()]
Z_latent = pca.transform(Z)
Z_recon = pca.inverse_transform(Z_latent)
# The square is required: best_thresh was chosen on squared errors, and
# without it positive and negative residuals cancel each other in the mean.
pca_Z = ((Z - Z_recon) ** 2).mean(1)
In [68]:
# Turn the grid scores into 0 (normal) / 1 (abnormal) with one vectorised
# comparison instead of calling int() on every grid point in Python.
pca_Z = (pca_Z > best_thresh).astype(int)
In [69]:
pca_Z = np.array(pca_Z).reshape(xx.shape)
In [70]:
cs = plt.contourf(xx, yy, pca_Z, cmap=plt.cm.Paired)
plt.scatter(train_data[:,0], train_data[:,1], c=train_label)
Out[70]:
<matplotlib.collections.PathCollection at 0x7f16f2b95910>
In [71]:
cs = plt.contourf(xx, yy, pca_Z, cmap=plt.cm.Paired)
plt.scatter(test_data[:,0], test_data[:,1], c=test_label)
Out[71]:
<matplotlib.collections.PathCollection at 0x7f16e4078f90>
5. 마무리¶
5.1 정확도¶
In [72]:
print(f"Isolation Forest Test Accuracy is {isol_test_acc:.4f}")
print(f"OCSVM Test Accuracy is {ocsvm_test_acc:.4f}")
print(f"PCA Test Accuracy is {pca_test_acc:.4f}")
Isolation Forest Test Accuracy is 0.7833 OCSVM Test Accuracy is 0.5667 PCA Test Accuracy is 0.8333
5.2 F1-Score¶
In [73]:
print(f"Isolation Forest Test F1-Score is {isol_test_f1:.4f}")
print(f"OCSVM Test F1-Score is {ocsvm_test_f1:.4f}")
print(f"PCA Test F1-Score is {pca_test_f1:.4f}")
Isolation Forest Test F1-Score is 0.6286 OCSVM Test F1-Score is 0.4800 PCA Test F1-Score is 0.5833
In [ ]:
'Machine Learning > Anomaly Detection' 카테고리의 다른 글
| Sampling (0) | 2024.03.27 |
|---|
