본문 바로가기
IT&게임/빅데이터분석기사(빅분기)

빅데이터분석기사 제2유형 : ⑥ 평가지표

by 푸루루 2024. 6. 6.
728x90
반응형

이진분류 평가지표

# Machine-learning example: fit a classifier on every labeled row and
# predict labels for an unlabeled test set.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Training data: two numeric feature columns plus the class label ('A' / 'B').
train = pd.DataFrame({
    'f1': [2, 3, 5, 7, 11, 13, 17, 19, 23, 29],
    'f2': [30, 28, 26, 24, 22, 20, 18, 16, 14, 12],
    'target': ['A', 'A', 'A', 'B', 'B', 'A', 'A', 'A', 'A', 'B']
})

# Test data: the same feature columns, with no label column.
test = pd.DataFrame({
    'f1': [7, 9, 15],
    'f2': [23, 18, 26]
})

# Separate the label: pop() removes 'target' from train and returns it,
# so train holds only the feature columns from here on.
target = train.pop('target')

# Train the model and predict.
# >> Every labeled row is used for fitting and test has no labels, so the
#    performance of clf cannot be evaluated in this cell.

clf = RandomForestClassifier(random_state=0)
clf.fit(train, target)
pred = clf.predict(test)
print(pred)
# Machine-learning example (predict): hold out part of the labeled data so
# the model can be scored before predicting on the unlabeled test set.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

train = pd.DataFrame({
    'f1': [2, 3, 5, 7, 11, 13, 17, 19, 23, 29],
    'f2': [30, 28, 26, 24, 22, 20, 18, 16, 14, 12],
    'target': ['A', 'A', 'A', 'B', 'B', 'A', 'A', 'A', 'A', 'B']
})

# Split off validation data (30% of the rows are held out).
target = train.pop('target')
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.3, random_state=0)

# Train on the training split only, then predict labels for the validation split.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)

print("val:", y_pred) # predicted labels for the validation rows
print("정확도:", accuracy_score(y_val, y_pred)) # accuracy; argument order is (actual, predicted)

# Unlabeled test data: predict with the model fitted above.
test = pd.DataFrame({
    'f1': [7, 9, 15],
    'f2': [23, 18, 26]
})
pred = clf.predict(test)
print("test:", pred)
# Binary-classification data.
# Labels come either as numbers or as strings; both variants are shown below.
import pandas as pd
y_true = pd.DataFrame([0, 1, 1, 0, 0, 1, 1, 1, 1, 0]) # actual values
y_pred = pd.DataFrame([0, 0, 1, 1, 0, 0, 0, 1, 1, 0]) # predicted values

y_true_str = pd.DataFrame(['B', 'A', 'A', 'B', 'B', 'A', 'A', 'A', 'A', 'B']) # actual values
y_pred_str = pd.DataFrame(['B', 'B', 'A', 'A', 'B', 'B', 'B', 'A', 'A', 'B']) # predicted values

# Accuracy: works the same way for numeric and string labels.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_true, y_pred)
print("정확도:", accuracy)

accuracy = accuracy_score(y_true_str, y_pred_str)
print("정확도:", accuracy)

# F1 score *** memorize this one ★★★★★★★★★★★★★★★ -- the author's go-to metric
# when the model output comes from predict() (hard labels).
from sklearn.metrics import f1_score
# No pos_label is passed here, so f1_score's default positive label applies.
f1 = f1_score(y_true, y_pred)
print("F1 스코어:", f1)

# With string labels the positive class is named explicitly: 'A'.
f1 = f1_score(y_true_str, y_pred_str, pos_label='A')
print("F1 스코어:", f1)

# Machine-learning example (predict_proba): ROC-AUC is computed from
# predicted probabilities, not from hard labels.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

df = pd.DataFrame({
    'f1': [2, 3, 5, 7, 11, 13, 17, 19, 23, 29],
    'f2': [30, 28, 26, 24, 22, 20, 18, 16, 14, 12],
    'target': ['A', 'A', 'A', 'B', 'B', 'A', 'A', 'A', 'A', 'B']
})

# Separate the label, then hold out half of the rows for validation.
target = df.pop('target')
X_train, X_val, y_train, y_val = train_test_split(df, target, test_size=0.5, random_state=0)

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
# One row per validation sample, one column per class in clf.classes_ order:
# column 0 is P('A') and column 1 is P('B') (not "0" / "1"), assuming both
# classes are present in y_train.
y_pred = clf.predict_proba(X_val)

print(y_pred)  # predicted class probabilities

# ROC-AUC (this is not accuracy). For binary targets roc_auc_score expects the
# probability of the class with the greater label, i.e. column 1 = P('B').
# The score is stored and printed; as a bare expression it was silently
# discarded whenever this ran as a script instead of a notebook cell.
roc_auc = roc_auc_score(y_val, y_pred[:,1])
print("ROC-AUC:", roc_auc)

# ROC-AUC *** (numeric labels)
from sklearn.metrics import roc_auc_score
import numpy as np

# Actual values (0: negative, 1: positive)
y_true = pd.DataFrame([0, 1, 0, 1, 1, 0, 0, 0, 1, 1])
# Predicted probabilities, one row per sample; column 1 is used below as the
# probability of the positive class (1).
y_pred_proba = np.array([
    [0.8, 0.2],
    [0.1, 0.9],
    [0.77, 0.23],
    [0.6, 0.4],
    [0.2, 0.8],
    [0.4, 0.6],
    [0.6, 0.4],
    [0.8, 0.2],
    [0.3, 0.7],
    [0.4, 0.6]
])

# This is the only part to focus on: pass the actual labels and the
# positive-class probability column.
roc_auc = roc_auc_score(y_true, y_pred_proba[:,1])
print("ROC-AUC:", roc_auc)

# ROC-AUC with string labels. For binary targets roc_auc_score treats the
# class with the greater label ('B' here) as positive, so column 1 of the
# probability array is read as P('B').
y_true_str = pd.DataFrame(['B', 'A', 'A', 'B', 'B', 'A', 'A', 'A', 'A', 'B'])  # actual values
y_pred_proba_str = np.array([
    [0.8, 0.2],
    [0.1, 0.9],
    [0.77, 0.23],
    [0.6, 0.4],
    [0.2, 0.8],
    [0.4, 0.6],
    [0.6, 0.4],
    [0.8, 0.2],
    [0.3, 0.7],
    [0.4, 0.6]
])  # predicted probabilities
# Score against the string labels defined above. The original passed the
# numeric y_true from the previous example, so the string-label case was
# never evaluated and this line just repeated the earlier score.
roc_auc = roc_auc_score(y_true_str, y_pred_proba_str[:,1])
print("ROC-AUC:", roc_auc)

 

다중분류 평가지표

# Multiclass-classification data (three classes), in numeric and string form.
y_true = pd.DataFrame([2, 2, 3, 3, 2, 1, 3, 3, 2, 1]) # actual values
y_pred = pd.DataFrame([2, 2, 1, 3, 2, 1, 1, 2, 2, 1]) # predicted values

y_true_str = pd.DataFrame(['B', 'B', 'C', 'C', 'B', 'A', 'C', 'C', 'B', 'A']) # actual values
y_pred_str = pd.DataFrame(['B', 'B', 'A', 'C', 'B', 'A', 'A', 'B', 'B', 'A']) # predicted values

# Accuracy: same call as in the binary case.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_true, y_pred)
print("정확도:", accuracy)

accuracy = accuracy_score(y_true_str, y_pred_str)
print("정확도:", accuracy)

# F1 score ★★★★★★★★★★★★★ -- with more than two classes an averaging mode is passed.
from sklearn.metrics import f1_score
f1 = f1_score(y_true, y_pred, average='macro')  # average can be micro, macro, or weighted (use whichever the problem asks for)
print("F1 스코어:", f1)

f1 = f1_score(y_true_str, y_pred_str, average='macro')
print("F1 스코어:", f1)

# Multiclass-classification data (probabilities).
# NOTE(review): these two frames are defined but no metric is computed from
# them in the visible code -- the example appears to be unfinished.
y_true = pd.DataFrame([0, 1, 2, 0, 1]) # actual values
y_pred_proba = pd.DataFrame([[0.2, 0.5, 0.3], [0.7, 0.2, 0.1], [0.4, 0.3, 0.3], [0.4, 0.1, 0.5], [0.1, 0.8, 0.1]], columns=[0, 1, 2]) # predicted values: one probability per class (columns 0, 1, 2) for each row

 

 

회귀 평가지표

# Regression data
import pandas as pd
y_true = pd.DataFrame([0, 2, 5, 2, 4, 4, 7, 10]) # actual values
y_pred = pd.DataFrame([1.14, 2.53, 4.87, 3.08, 4.21, 5.53, 7.51, 10.32]) # predicted values

# MSE (Mean Squared Error)
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_true, y_pred)
print("MSE:", mse)

# MAE (Mean Absolute Error)
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_true, y_pred)
print("MAE:", mae)

# Coefficient of determination (R-squared) ★★★★★★★★
from sklearn.metrics import r2_score
r2 = r2_score(y_true, y_pred)
print("결정 계수:", r2)

# RMSE (Root Mean Squared Error) ★★★★★★★★★★★★
# Computed here as the square root of the MSE.
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_true, y_pred)
rmse = mse ** 0.5
print("RMSE:", rmse)



728x90

댓글