KNN 기초 실습

2024. 3. 12. 12:10·Machine Learning/KNN

 

 

 
 

Iris 데이터와 KNN¶

 
In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


np.random.seed(2021)
 
 

1. Data¶

 
 

1.1 Data Load¶

 
In [49]:
from sklearn.datasets import load_iris

# Load the iris dataset object and pull out its feature matrix and labels.
iris = load_iris()
data, target = iris.data, iris.target
 
In [50]:
target
 
Out[50]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
 
In [51]:
target != 0
 
Out[51]:
array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])
 
In [52]:
data.shape
 
Out[52]:
(150, 4)
 
In [53]:
# Drop every sample labelled 0 and keep only the feature columns from
# index 2 onward (the shape goes from (150, 4) to (100, 2)).
non_zero_mask = target != 0
data = data[non_zero_mask, 2:]
target = target[non_zero_mask]
 
In [54]:
data.shape
 
Out[54]:
(100, 2)
 
In [55]:
# Wrap both arrays in DataFrames so that `duplicated()` can be used on the
# feature rows in the following cells.
data, target = pd.DataFrame(data), pd.DataFrame(target)
 
In [56]:
~data.duplicated()
 
Out[56]:
0     True
1     True
2     True
3     True
4     True
      ... 
95    True
96    True
97    True
98    True
99    True
Length: 100, dtype: bool
 
In [57]:
target[~data.duplicated()]
 
Out[57]:
  0
0 1
1 1
2 1
3 1
4 1
... ...
95 2
96 2
97 2
98 2
99 2

80 rows × 1 columns

 
In [58]:
# Boolean mask that is True for the first occurrence of each feature row.
first_occurrence = ~data.duplicated()

# Apply the same mask to labels and features, then go back to plain arrays
# (labels flattened to 1-D).
target = target.loc[first_occurrence].values.flatten()
data = data.loc[first_occurrence].values
 
In [59]:
data.shape
 
Out[59]:
(80, 2)
 
In [60]:
target.shape
 
Out[60]:
(80,)
 
In [61]:
data
 
Out[61]:
array([[4.7, 1.4],
       [4.5, 1.5],
       [4.9, 1.5],
       [4. , 1.3],
       [4.6, 1.5],
       [4.5, 1.3],
       [4.7, 1.6],
       [3.3, 1. ],
       [4.6, 1.3],
       [3.9, 1.4],
       [3.5, 1. ],
       [4.2, 1.5],
       [4. , 1. ],
       [3.6, 1.3],
       [4.4, 1.4],
       [4.1, 1. ],
       [3.9, 1.1],
       [4.8, 1.8],
       [4.7, 1.2],
       [4.3, 1.3],
       [4.8, 1.4],
       [5. , 1.7],
       [3.8, 1.1],
       [3.7, 1. ],
       [3.9, 1.2],
       [5.1, 1.6],
       [4.5, 1.6],
       [4.7, 1.5],
       [4.4, 1.3],
       [4.1, 1.3],
       [4.4, 1.2],
       [4.6, 1.4],
       [4. , 1.2],
       [4.2, 1.3],
       [4.2, 1.2],
       [3. , 1.1],
       [6. , 2.5],
       [5.1, 1.9],
       [5.9, 2.1],
       [5.6, 1.8],
       [5.8, 2.2],
       [6.6, 2.1],
       [4.5, 1.7],
       [6.3, 1.8],
       [5.8, 1.8],
       [6.1, 2.5],
       [5.1, 2. ],
       [5.3, 1.9],
       [5.5, 2.1],
       [5. , 2. ],
       [5.1, 2.4],
       [5.3, 2.3],
       [5.5, 1.8],
       [6.7, 2.2],
       [6.9, 2.3],
       [5. , 1.5],
       [5.7, 2.3],
       [4.9, 2. ],
       [6.7, 2. ],
       [4.9, 1.8],
       [5.7, 2.1],
       [6. , 1.8],
       [5.6, 2.1],
       [5.8, 1.6],
       [6.1, 1.9],
       [6.4, 2. ],
       [5.6, 2.2],
       [5.1, 1.5],
       [5.6, 1.4],
       [6.1, 2.3],
       [5.6, 2.4],
       [5.4, 2.1],
       [5.1, 2.3],
       [5.9, 2.3],
       [5.7, 2.5],
       [5.2, 2.3],
       [5. , 1.9],
       [5.2, 2. ],
       [5.4, 2.3],
       [5.1, 1.8]])
 
In [62]:
# Scatter the two remaining feature columns against each other, colored by
# class label.
plt.scatter(data[:, 0], data[:, 1], c=target)
 
Out[62]:
<matplotlib.collections.PathCollection at 0x21a0b105210>
 
 
 

1.2 시각화 데이터¶

 
In [63]:
# Bounding box of the data, padded by 1 on every side.
x_min = data[:, 0].min() - 1
x_max = data[:, 0].max() + 1
y_min = data[:, 1].min() - 1
y_max = data[:, 1].max() + 1

# Dense grid (step 0.02) covering the box; the decision-boundary cell
# predicts a class for every grid point.
grid_x = np.arange(x_min, x_max, 0.02)
grid_y = np.arange(y_min, y_max, 0.02)
xx, yy = np.meshgrid(grid_x, grid_y)
 
 

2. k 값에 따른 결정 경계¶

 
In [64]:
from sklearn.neighbors import KNeighborsClassifier
 
 

k 값에 따른 KNN의 결정 경계를 그려봅니다.
k가 작을수록 overfitting이, k가 클수록 underfitting이 발생하기 쉽습니다.

 
In [65]:
# Draw the KNN decision regions for k = 1, 3, 5, 7, 9, 11 on a 2x3 grid of
# subplots, one subplot per k.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
for idx, n in enumerate(range(1, 12, 2)):
    # Create and fit a KNN classifier with n neighbors.
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(data, target)

    # Predict a class for every point of the visualization grid.
    Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Row-major placement: idx 0-2 on the first row, 3-5 on the second.
    ax = axes[idx // 3, idx % 3]

    # Shade the predicted regions.
    ax.contourf(xx, yy, Z)

    # Overlay the data points.
    ax.scatter(
        data[:, 0], data[:, 1], c=target, alpha=1.0, edgecolor="black"
    )
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    # `data` was sliced with `2:` from the four iris feature columns, so its
    # two columns correspond to feature_names[2] and feature_names[3], not
    # [0] and [1].
    ax.set_xlabel(iris.feature_names[2])
    ax.set_ylabel(iris.feature_names[3])
    ax.set_title(f"{n} Nearest Neighbors")
 
 
 
 

3. 나의 가장 가까운 이웃은?¶

 
 

KNN에서 사용하는 거리의 종류는 파라미터 p(Minkowski 거리의 차수)를 통해서 바꿀 수 있습니다.

  • p=1
    • 맨해튼 거리
  • p=2
    • 유클리드 거리
 
 

3.1 Euclidean Distance¶

 
In [66]:
# Hold out the last sample as the single query point; everything before it
# is used for training.
train_data = data[:-1]
train_target = target[:-1]
# Slicing (rather than indexing) keeps the query as a one-row 2-D array.
test_data = data[-1:]
 
In [67]:
test_data
 
Out[67]:
array([[5.1, 1.8]])
 
In [68]:
len(train_data), len(test_data)
 
Out[68]:
(79, 1)
 
In [69]:
# 10-neighbor KNN with the classifier's default distance settings (this
# section uses it as the Euclidean case), fitted on the training split.
euclid_knn = KNeighborsClassifier(n_neighbors=10)
euclid_knn.fit(train_data, train_target)
 
Out[69]:
KNeighborsClassifier(n_neighbors=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier(n_neighbors=10)
 
In [96]:
euclid_knn.kneighbors(
    test_data, n_neighbors=1, return_distance=False
).ravel()
 
Out[96]:
array([37], dtype=int64)
 
In [71]:
# Indices (into train_data) of the 10 training samples closest to the query.
# kneighbors returns one row per query, so flatten the single row to 1-D.
euclid_neighbors_idx = euclid_knn.kneighbors(
    test_data, n_neighbors=10, return_distance=False
)
euclid_neighbors_idx = euclid_neighbors_idx.ravel()

# Coordinates and labels of those neighbors.
euclid_neighbors, euclid_neighbors_label = (
    train_data[euclid_neighbors_idx],
    train_target[euclid_neighbors_idx],
)
 
In [72]:
euclid_knn.kneighbors(
    test_data, n_neighbors=10, return_distance=False
)
 
Out[72]:
array([[37, 76, 21, 59, 25, 46, 49, 47, 77, 57]], dtype=int64)
 
In [73]:
euclid_knn.kneighbors(
    test_data, n_neighbors=10, return_distance=False
).ravel()
 
Out[73]:
array([37, 76, 21, 59, 25, 46, 49, 47, 77, 57], dtype=int64)
 
In [74]:
test_data
 
Out[74]:
array([[5.1, 1.8]])
 
In [75]:
euclid_neighbors
 
Out[75]:
array([[5.1, 1.9],
       [5. , 1.9],
       [5. , 1.7],
       [4.9, 1.8],
       [5.1, 1.6],
       [5.1, 2. ],
       [5. , 2. ],
       [5.3, 1.9],
       [5.2, 2. ],
       [4.9, 2. ]])
 
In [76]:
euclid_neighbors_label
 
Out[76]:
array([2, 2, 1, 2, 1, 2, 2, 2, 2, 2])
 
In [77]:
euclid_knn.predict(test_data)
 
Out[77]:
array([2])
 
In [78]:
euclid_knn.predict_proba(test_data)
 
Out[78]:
array([[0.2, 0.8]])
 
In [79]:
plt.figure(figsize=(15, 8))
# All training points, colored by class.
plt.scatter(train_data[:, 0], train_data[:, 1], c=train_target, s=500)
# The query point, drawn as a star.
plt.scatter(test_data[0, 0], test_data[0, 1], marker="*", s=1000)
# The 10 nearest neighbors, re-drawn on top with a red outline.
plt.scatter(
    euclid_neighbors[:, 0],
    euclid_neighbors[:, 1],
    c=euclid_neighbors_label,
    edgecolors="red",
    s=500,
)
 
Out[79]:
<matplotlib.collections.PathCollection at 0x21a0b50ddd0>
 
 
 

3.2 Manhattan Distance¶

 
In [80]:
# Same 10-neighbor KNN as before, but with p=1 (the Manhattan distance
# according to the section's description), fitted on the training split.
manhattan_knn = KNeighborsClassifier(n_neighbors=10, p=1)
manhattan_knn.fit(train_data, train_target)
 
Out[80]:
KNeighborsClassifier(n_neighbors=10, p=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier(n_neighbors=10, p=1)
 
In [81]:
# Indices (into train_data) of the 10 training samples closest to the query
# under the p=1 classifier; flatten the single result row to 1-D.
manhattan_neighbors_idx = manhattan_knn.kneighbors(
    test_data, n_neighbors=10, return_distance=False
)
manhattan_neighbors_idx = manhattan_neighbors_idx.ravel()

# Coordinates and labels of those neighbors.
manhattan_neighbors, manhattan_neighbors_label = (
    train_data[manhattan_neighbors_idx],
    train_target[manhattan_neighbors_idx],
)
 
In [82]:
manhattan_neighbors
 
Out[82]:
array([[5.1, 1.9],
       [4.9, 1.8],
       [5. , 1.9],
       [5. , 1.7],
       [5.1, 1.6],
       [5.1, 2. ],
       [5. , 2. ],
       [4.8, 1.8],
       [5.3, 1.9],
       [5.1, 1.5]])
 
In [83]:
manhattan_neighbors_label
 
Out[83]:
array([2, 2, 2, 1, 1, 2, 2, 1, 2, 2])
 
In [84]:
manhattan_knn.predict_proba(test_data)
 
Out[84]:
array([[0.3, 0.7]])
 
In [85]:
plt.figure(figsize=(15, 8))
# All training points, colored by class.
plt.scatter(train_data[:, 0], train_data[:, 1], c=train_target, s=500)
# The query point, drawn as a star.
plt.scatter(test_data[0, 0], test_data[0, 1], marker="*", s=1000)
# The 10 nearest neighbors under p=1, re-drawn on top with a red outline.
plt.scatter(
    manhattan_neighbors[:, 0],
    manhattan_neighbors[:, 1],
    c=manhattan_neighbors_label,
    edgecolors="red",
    s=500,
)
 
Out[85]:
<matplotlib.collections.PathCollection at 0x21a7fc4a410>
 
 
 

3.3 비교¶

 
In [86]:
euclid_neighbors_idx
 
Out[86]:
array([37, 76, 21, 59, 25, 46, 49, 47, 77, 57], dtype=int64)
 
In [87]:
manhattan_neighbors_idx
 
Out[87]:
array([37, 59, 76, 21, 25, 46, 49, 17, 47, 67], dtype=int64)
 
In [88]:
set(euclid_neighbors_idx) - set(manhattan_neighbors_idx)
 
Out[88]:
{57, 77}
 
In [89]:
set(manhattan_neighbors_idx) - set(euclid_neighbors_idx)
 
Out[89]:
{17, 67}
 
In [90]:
# Neighbors picked by exactly one of the two classifiers: first those only
# in the Euclidean result, then those only in the Manhattan result.
only_euclid = set(euclid_neighbors_idx) - set(manhattan_neighbors_idx)
only_manhattan = set(manhattan_neighbors_idx) - set(euclid_neighbors_idx)
diff_neighbors_idx = [*only_euclid, *only_manhattan]
diff_neighbors_idx
 
Out[90]:
[57, 77, 17, 67]
 
In [91]:
# Coordinates and labels of the neighbors the two classifiers disagree on.
diff_neighbors, diff_neighbors_label = (
    train_data[diff_neighbors_idx],
    train_target[diff_neighbors_idx],
)
 
In [92]:
# Neighbors picked by both classifiers.
euclid_idx_set = set(euclid_neighbors_idx)
manhattan_idx_set = set(manhattan_neighbors_idx)
same_neighbors_idx = list(euclid_idx_set & manhattan_idx_set)
same_neighbors_idx
 
Out[92]:
[37, 76, 46, 47, 49, 21, 25, 59]
 
In [93]:
# Coordinates and labels of the neighbors both classifiers agree on.
same_neighbors, same_neighbors_label = (
    train_data[same_neighbors_idx],
    train_target[same_neighbors_idx],
)
 
In [94]:
plt.figure(figsize=(15, 8))
# All training points, colored by class.
plt.scatter(train_data[:, 0], train_data[:, 1], c=train_target, s=500)
# The query point, drawn as a star.
plt.scatter(test_data[0, 0], test_data[0, 1], marker="*", s=1000)
# Neighbors chosen by only one of the two classifiers: red outline.
plt.scatter(
    diff_neighbors[:, 0],
    diff_neighbors[:, 1],
    c=diff_neighbors_label,
    edgecolors="red",
    s=500,
)
# Neighbors chosen by both classifiers: blue outline.
plt.scatter(
    same_neighbors[:, 0],
    same_neighbors[:, 1],
    c=same_neighbors_label,
    edgecolors="blue",
    s=500,
)
 
Out[94]:
<matplotlib.collections.PathCollection at 0x21a7fc8e4d0>
 
 
In [ ]:
 

'Machine Learning > KNN' 카테고리의 다른 글

KNN으로 음수 가능 여부를 판단하기  (0) 2024.03.12
'Machine Learning/KNN' 카테고리의 다른 글
  • KNN으로 음수 가능 여부를 판단하기
Juson
Juson
  • Juson
    Juson의 데이터 공부
    Juson
  • 전체
    오늘
    어제
    • 분류 전체보기 (95)
      • RAG (2)
      • AI (2)
        • NLP (0)
        • Generative Model (0)
        • Deep Reinforcement Learning (2)
        • LLM (0)
      • Logistic Optimization (0)
      • Machine Learning (37)
        • Linear Regression (2)
        • Logistic Regression (2)
        • Decision Tree (5)
        • Naive Bayes (1)
        • KNN (2)
        • SVM (2)
        • Clustering (4)
        • Dimension Reduction (3)
        • Boosting (6)
        • Anomaly Detection (2)
        • Recommendation (4)
        • Embedding & NLP (4)
      • Reinforcement Learning (5)
      • Deep Learning (10)
        • Deep learning Basic Mathema.. (10)
      • Optimization (2)
        • OR Optimization (0)
        • Convex Optimization (0)
        • Integer Optimization (0)
      • SNA 분석 (0)
      • 포트폴리오 최적화 공부 (0)
        • 최적화 기법 (0)
        • 금융 베이스 (0)
      • Financial engineering (0)
      • 프로그래머스 데브코스(Boot camp) (15)
        • SQL (9)
        • Python (5)
        • Machine Learning (1)
      • Python (22)
      • Project (0)
  • 블로그 메뉴

    • 홈
    • 태그
    • 방명록
  • 링크

  • 공지사항

  • 인기 글

  • 태그

  • 최근 댓글

  • 최근 글

  • hELLO· Designed By정상우.v4.10.4
Juson
KNN 기초 실습
상단으로

티스토리툴바