Iris 데이터와 KNN¶
In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(2021)
1. Data¶
1.1 Data Load¶
In [49]:
from sklearn.datasets import load_iris
# Load the Iris bunch once, then unpack its feature matrix and label vector.
iris = load_iris()
data, target = iris.data, iris.target
In [50]:
target
Out[50]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
In [51]:
target != 0
Out[51]:
array([False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False,
False, False, False, False, False, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True])
In [52]:
data.shape
Out[52]:
(150, 4)
In [53]:
# Keep only the rows whose label is not 0 and only the feature columns from
# index 2 onward. The mask is built before either array is rebound so both
# are filtered with the same rows.
not_class_zero = target != 0
data = data[not_class_zero, 2:]
target = target[not_class_zero]
In [54]:
data.shape
Out[54]:
(100, 2)
In [55]:
# Wrap both arrays in DataFrames; the cells that follow call
# DataFrame.duplicated() on `data` to locate repeated feature rows.
data = pd.DataFrame(data)
target = pd.DataFrame(target)
In [56]:
~data.duplicated()
Out[56]:
0 True
1 True
2 True
3 True
4 True
...
95 True
96 True
97 True
98 True
99 True
Length: 100, dtype: bool
In [57]:
target[~data.duplicated()]
Out[57]:
| 0 | |
|---|---|
| 0 | 1 |
| 1 | 1 |
| 2 | 1 |
| 3 | 1 |
| 4 | 1 |
| ... | ... |
| 95 | 2 |
| 96 | 2 |
| 97 | 2 |
| 98 | 2 |
| 99 | 2 |
80 rows × 1 columns
In [58]:
# Drop repeated feature rows (keeping the first occurrence) from both frames
# and convert back to NumPy arrays. The mask is computed once, before `data`
# is rebound, so labels and features stay aligned.
first_occurrence = ~data.duplicated()
target = target.loc[first_occurrence].values.flatten()
data = data.loc[first_occurrence].values
In [59]:
data.shape
Out[59]:
(80, 2)
In [60]:
target.shape
Out[60]:
(80,)
In [61]:
data
Out[61]:
array([[4.7, 1.4],
[4.5, 1.5],
[4.9, 1.5],
[4. , 1.3],
[4.6, 1.5],
[4.5, 1.3],
[4.7, 1.6],
[3.3, 1. ],
[4.6, 1.3],
[3.9, 1.4],
[3.5, 1. ],
[4.2, 1.5],
[4. , 1. ],
[3.6, 1.3],
[4.4, 1.4],
[4.1, 1. ],
[3.9, 1.1],
[4.8, 1.8],
[4.7, 1.2],
[4.3, 1.3],
[4.8, 1.4],
[5. , 1.7],
[3.8, 1.1],
[3.7, 1. ],
[3.9, 1.2],
[5.1, 1.6],
[4.5, 1.6],
[4.7, 1.5],
[4.4, 1.3],
[4.1, 1.3],
[4.4, 1.2],
[4.6, 1.4],
[4. , 1.2],
[4.2, 1.3],
[4.2, 1.2],
[3. , 1.1],
[6. , 2.5],
[5.1, 1.9],
[5.9, 2.1],
[5.6, 1.8],
[5.8, 2.2],
[6.6, 2.1],
[4.5, 1.7],
[6.3, 1.8],
[5.8, 1.8],
[6.1, 2.5],
[5.1, 2. ],
[5.3, 1.9],
[5.5, 2.1],
[5. , 2. ],
[5.1, 2.4],
[5.3, 2.3],
[5.5, 1.8],
[6.7, 2.2],
[6.9, 2.3],
[5. , 1.5],
[5.7, 2.3],
[4.9, 2. ],
[6.7, 2. ],
[4.9, 1.8],
[5.7, 2.1],
[6. , 1.8],
[5.6, 2.1],
[5.8, 1.6],
[6.1, 1.9],
[6.4, 2. ],
[5.6, 2.2],
[5.1, 1.5],
[5.6, 1.4],
[6.1, 2.3],
[5.6, 2.4],
[5.4, 2.1],
[5.1, 2.3],
[5.9, 2.3],
[5.7, 2.5],
[5.2, 2.3],
[5. , 1.9],
[5.2, 2. ],
[5.4, 2.3],
[5.1, 1.8]])
In [62]:
# Scatter the two remaining feature columns against each other, colored by label.
plt.scatter(data[:, 0], data[:, 1], c=target)
Out[62]:
<matplotlib.collections.PathCollection at 0x21a0b105210>
1.2 시각화 데이터¶
In [63]:
# Build a dense grid covering the observed feature ranges, padded by 1 on
# every side and sampled every 0.02; the grid is later fed to knn.predict to
# paint the decision regions.
pad, step = 1, 0.02
x_min, x_max = data[:, 0].min() - pad, data[:, 0].max() + pad
y_min, y_max = data[:, 1].min() - pad, data[:, 1].max() + pad
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, step),
    np.arange(y_min, y_max, step),
)
2. k 값에 따른 결정 경계¶
In [64]:
from sklearn.neighbors import KNeighborsClassifier
k 값에 따른 knn의 결정경계를 그려봅니다.
k가 작을수록 overfitting이, k가 클수록 underfitting이 되기 쉽습니다.
In [65]:
# Draw the KNN decision regions for k = 1, 3, 5, 7, 9, 11 on a 2x3 grid of axes.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
for ax, n in zip(axes.flat, range(1, 12, 2)):
    # Create and fit a KNN classifier with n neighbors.
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(data, target)

    # Predict a label for every point of the visualization grid, then restore
    # the grid shape so it can be drawn as filled contours.
    Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Decision regions.
    ax.contourf(xx, yy, Z)
    # Training points on top of the regions.
    ax.scatter(
        data[:, 0], data[:, 1], c=target, alpha=1.0, edgecolor="black"
    )
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    # `data` holds feature columns 2 and 3 of the original matrix (it was
    # sliced with `2:`), so the axis labels must use the matching names
    # rather than feature_names[0] / feature_names[1].
    ax.set_xlabel(iris.feature_names[2])
    ax.set_ylabel(iris.feature_names[3])
    ax.set_title(f"{n} Nearest Neighbors")
3. 나의 가장 가까운 이웃은?¶
KNN에서 사용하는 거리 척도는 `KNeighborsClassifier`의 `p` 파라미터(Minkowski 거리의 차수, 기본값 2)로 바꿀 수 있습니다.
- p=1
- 맨해튼 거리
- p=2
- 유클리드 거리
3.1 Euclidean Distance¶
In [66]:
# Hold out the final sample as the single query point; everything before it
# is used for training. The slice `[-1:]` keeps test_data two-dimensional.
train_data, test_data = data[:-1], data[-1:]
train_target = target[:-1]
In [67]:
test_data
Out[67]:
array([[5.1, 1.8]])
In [68]:
len(train_data), len(test_data)
Out[68]:
(79, 1)
In [69]:
# Fit a 10-neighbor classifier; `p` is left at its default, which this
# notebook treats as the Euclidean (p=2) case.
euclid_knn = KNeighborsClassifier(n_neighbors=10)
euclid_knn.fit(train_data, train_target)
Out[69]:
KNeighborsClassifier(n_neighbors=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier(n_neighbors=10)
In [96]:
euclid_knn.kneighbors(
test_data, n_neighbors=1, return_distance=False
).ravel()
Out[96]:
array([37], dtype=int64)
In [71]:
# Ask the fitted model for the indices (into train_data) of the 10 nearest
# neighbors of the query point; ravel() flattens the (1, 10) result to 1-D.
euclid_neighbors_idx = euclid_knn.kneighbors(
test_data, n_neighbors=10, return_distance=False
).ravel()
# Look up the neighbors' coordinates and labels via those indices.
euclid_neighbors = train_data[euclid_neighbors_idx]
euclid_neighbors_label = train_target[euclid_neighbors_idx]
In [72]:
euclid_knn.kneighbors(
test_data, n_neighbors=10, return_distance=False
)
Out[72]:
array([[37, 76, 21, 59, 25, 46, 49, 47, 77, 57]], dtype=int64)
In [73]:
euclid_knn.kneighbors(
test_data, n_neighbors=10, return_distance=False
).ravel()
Out[73]:
array([37, 76, 21, 59, 25, 46, 49, 47, 77, 57], dtype=int64)
In [74]:
test_data
Out[74]:
array([[5.1, 1.8]])
In [75]:
euclid_neighbors
Out[75]:
array([[5.1, 1.9],
[5. , 1.9],
[5. , 1.7],
[4.9, 1.8],
[5.1, 1.6],
[5.1, 2. ],
[5. , 2. ],
[5.3, 1.9],
[5.2, 2. ],
[4.9, 2. ]])
In [76]:
euclid_neighbors_label
Out[76]:
array([2, 2, 1, 2, 1, 2, 2, 2, 2, 2])
In [77]:
euclid_knn.predict(test_data)
Out[77]:
array([2])
In [78]:
euclid_knn.predict_proba(test_data)
Out[78]:
array([[0.2, 0.8]])
In [79]:
# Plot every training point, the query point as a star, and then redraw the
# 10 Euclidean neighbors with a red outline so they stand out.
plt.figure(figsize=(15, 8))
plt.scatter(train_data[:, 0], train_data[:, 1], c=train_target, s=500)
plt.scatter(test_data[0, 0], test_data[0, 1], marker="*", s=1000)
plt.scatter(euclid_neighbors[:, 0], euclid_neighbors[:, 1], c=euclid_neighbors_label, edgecolors="red", s=500)
Out[79]:
<matplotlib.collections.PathCollection at 0x21a0b50ddd0>
3.2 Manhattan Distance¶
In [80]:
# Same 10-neighbor classifier, but with p=1 so distances are Manhattan.
manhattan_knn = KNeighborsClassifier(n_neighbors=10, p=1)
manhattan_knn.fit(train_data, train_target)
Out[80]:
KNeighborsClassifier(n_neighbors=10, p=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier(n_neighbors=10, p=1)
In [81]:
# Indices (into train_data) of the 10 nearest neighbors of the query point
# under the p=1 model; ravel() flattens the (1, 10) result to 1-D.
manhattan_neighbors_idx = manhattan_knn.kneighbors(
test_data, n_neighbors=10, return_distance=False
).ravel()
# Look up the neighbors' coordinates and labels via those indices.
manhattan_neighbors = train_data[manhattan_neighbors_idx]
manhattan_neighbors_label = train_target[manhattan_neighbors_idx]
In [82]:
manhattan_neighbors
Out[82]:
array([[5.1, 1.9],
[4.9, 1.8],
[5. , 1.9],
[5. , 1.7],
[5.1, 1.6],
[5.1, 2. ],
[5. , 2. ],
[4.8, 1.8],
[5.3, 1.9],
[5.1, 1.5]])
In [83]:
manhattan_neighbors_label
Out[83]:
array([2, 2, 2, 1, 1, 2, 2, 1, 2, 2])
In [84]:
manhattan_knn.predict_proba(test_data)
Out[84]:
array([[0.3, 0.7]])
In [85]:
# Plot every training point, the query point as a star, and then redraw the
# 10 Manhattan neighbors with a red outline so they stand out.
plt.figure(figsize=(15, 8))
plt.scatter(train_data[:, 0], train_data[:, 1], c=train_target, s=500)
plt.scatter(test_data[0, 0], test_data[0, 1], marker="*", s=1000)
plt.scatter(manhattan_neighbors[:, 0], manhattan_neighbors[:, 1], c=manhattan_neighbors_label, edgecolors="red", s=500)
Out[85]:
<matplotlib.collections.PathCollection at 0x21a7fc4a410>
3.3 비교¶
In [86]:
euclid_neighbors_idx
Out[86]:
array([37, 76, 21, 59, 25, 46, 49, 47, 77, 57], dtype=int64)
In [87]:
manhattan_neighbors_idx
Out[87]:
array([37, 59, 76, 21, 25, 46, 49, 17, 47, 67], dtype=int64)
In [88]:
set(euclid_neighbors_idx) - set(manhattan_neighbors_idx)
Out[88]:
{57, 77}
In [89]:
set(manhattan_neighbors_idx) - set(euclid_neighbors_idx)
Out[89]:
{17, 67}
In [90]:
# Collect the neighbors chosen by only one of the two metrics: first those
# unique to the Euclidean model, then those unique to the Manhattan model.
euclid_only = set(euclid_neighbors_idx) - set(manhattan_neighbors_idx)
manhattan_only = set(manhattan_neighbors_idx) - set(euclid_neighbors_idx)
diff_neighbors_idx = [*euclid_only, *manhattan_only]
diff_neighbors_idx
Out[90]:
[57, 77, 17, 67]
In [91]:
# Coordinates and labels of the neighbors on which the two metrics disagree.
diff_neighbors = train_data[diff_neighbors_idx]
diff_neighbors_label = train_target[diff_neighbors_idx]
In [92]:
# Neighbors selected by both metrics (set intersection; order is whatever
# the resulting set yields, not distance order).
same_neighbors_idx = list(set(euclid_neighbors_idx) & set(manhattan_neighbors_idx))
same_neighbors_idx
Out[92]:
[37, 76, 46, 47, 49, 21, 25, 59]
In [93]:
# Coordinates and labels of the neighbors shared by both metrics.
same_neighbors = train_data[same_neighbors_idx]
same_neighbors_label = train_target[same_neighbors_idx]
In [94]:
# Overlay on the training scatter: the query point (star), neighbors picked
# by only one metric (red outline), and neighbors picked by both (blue outline).
plt.figure(figsize=(15, 8))
plt.scatter(train_data[:, 0], train_data[:, 1], c=train_target, s=500)
plt.scatter(test_data[0, 0], test_data[0, 1], marker="*", s=1000)
plt.scatter(diff_neighbors[:, 0], diff_neighbors[:, 1], c=diff_neighbors_label, edgecolors="red", s=500)
plt.scatter(same_neighbors[:, 0], same_neighbors[:, 1], c=same_neighbors_label, edgecolors="blue", s=500)
Out[94]:
<matplotlib.collections.PathCollection at 0x21a7fc8e4d0>
In [ ]:
'Machine Learning > KNN' 카테고리의 다른 글
| KNN으로 음수 가능 여부를 판단하기 (0) | 2024.03.12 |
|---|
