1. 사고등급 기준을 어떤 논리로 정해야 할지 모르겠다
2. 컴퓨터에 맡기면 어떤 기준을 제시해줄까?
3. 의미를 찾을 수 있으려나..
4. 실루엣 점수도 꽝이고 각 군집별 EDA가 필요할 것 같다
5. 그래야 실루엣 점수를 올리지
6. 각 군집별 특성이 정리되면 그게 분류 기준이 될 수도 있을 것 같다.
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 21 08:59:56 2023
@author: Administrator
"""
import os
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
#%% read files
# Load every Excel workbook under ./data and stack them into df_raw_temp.
# NOTE(review): the raw string literally contains doubled backslashes;
# presumably Windows tolerates repeated separators - confirm or simplify.
os.chdir(r"C:\\python\\Accident_Rank\\")
# Sort so the concatenation (and therefore row) order does not depend on the
# order in which the file system happens to list the files.
file_list = sorted(glob.glob('./data/*.xlsx'))
if not file_list:
    # Fail early with an explicit message instead of an opaque error from
    # pd.concat() on an empty list.
    raise FileNotFoundError("no .xlsx files found under ./data")
# '-' and ' ' cells are read as NaN.
df_list = [pd.read_excel(file, na_values=['-', ' ']) for file in tqdm(file_list)]
df_raw_temp = pd.concat(df_list, axis=0)
del file_list, df_list
#%% rename cols
# Work on a copy so the freshly loaded frame (df_raw_temp) stays untouched.
df_raw = df_raw_temp.copy()
pd.set_option('display.max_rows', 500)
# Replace newline characters inside column names with a single space.
df_raw.columns = [name.replace('\n', ' ') for name in df_raw.columns]
#%% check nan value
# Bar plot of per-column NaN counts, limited to columns with more than 100 NaNs.
sns.set(rc={'figure.figsize':(20,10)})
# Select the font before plt.xticks() implicitly creates the axes, so that
# text objects created from here on (tick labels, title) use it.
# NOTE(review): assumes the NanumGothic font is installed - confirm.
plt.rc('font',family='NanumGothic')
col_nan = df_raw.isna().sum().sort_values(ascending=False)
col_nan = col_nan[col_nan>100]
plt.xticks(rotation=90)
plt.title('NaN Counts')
# savefig() does not create missing directories.
os.makedirs('./plots', exist_ok=True)
sns.barplot(x=col_nan.index,y=col_nan).get_figure().savefig('./plots/NaN counts')
#%% del cols and dropna
# Drop the high-NaN columns found above, except the ones listed in
# col_to_live, then drop every row that still contains a NaN.
col_to_live = ['구분','원인차차종','요일','발생지점']
# Filter instead of list.remove(): remove() raises ValueError when a column to
# keep is not among the high-NaN columns in the first place.
col_nan = [name for name in col_nan.index if name not in col_to_live]
df_raw = df_raw.drop(columns=col_nan)
df_raw = df_raw.dropna(axis=0)
# Bare expression kept from the original; its value is discarded when the
# file is run as a script.
df_raw.isna().sum()
del col_nan, col_to_live
#%% select cols
# DataFrame.info() writes the summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df_raw.info(verbose=True, show_counts=True)
# Columns used as clustering features.
col_select = ['월별','주야','사망','부상','발생지점','요일','사고차량수','작업장구분','원인차차종']
df_ml = df_raw[col_select]
del col_select
#%% preprocessing
from sklearn.preprocessing import StandardScaler

# Convert df_ml to indicator (dummy) columns with pd.get_dummies, then
# standardize the result; df_ml_gd_sc is the array the clustering cell uses.
df_ml_gd_sc = StandardScaler().fit_transform(pd.get_dummies(df_ml))
#%% Clustering
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.metrics.cluster import silhouette_score

# Two-component PCA projection, used only as plot coordinates; the models
# below are fitted on the full feature matrix (df_ml_gd_sc).
pca = PCA(n_components=2)
pca.fit(df_ml_gd_sc)
x_pca = pca.transform(df_ml_gd_sc)
pca_df = pd.DataFrame(x_pca)

kmeans = KMeans(n_clusters=4, random_state=0)
gmm = GaussianMixture(n_components=4, random_state=0)

# One panel per algorithm, without axis ticks.
fig, axes = plt.subplots(1, 2, figsize=(10, 4), subplot_kw={'xticks':(), 'yticks':()})
algos = [kmeans, gmm]
for ax, algo in zip(axes, algos):
    clusters = algo.fit_predict(df_ml_gd_sc)
    ax.scatter(pca_df.iloc[:, 0], pca_df.iloc[:, 1], c=clusters, marker='+', alpha=0.2, s=20)
    # Panel title: algorithm class name and its silhouette score.
    score = silhouette_score(df_ml_gd_sc, clusters)
    ax.set_title(f"{algo.__class__.__name__} : {score:.2f}")

# savefig() does not create missing directories; make sure the target exists
# so the run does not fail after the clustering work is already done.
os.makedirs('./plots', exist_ok=True)
fig.savefig('./plots/Cluster.png')
plt.show()
'발전의 의지 > 파이썬' 카테고리의 다른 글
stackplot (0) | 2024.06.19 |
---|---|
청렴소통포탈 워드클라우드 (0) | 2024.01.08 |
빅데이터분석기사 취득 (0) | 2023.12.18 |
TCS 활용 UAM 노선 살펴보기 (0) | 2023.12.14 |
그냥 저냥 하는 것 (0) | 2023.07.14 |