그럴 수도 있지

발전의 의지/파이썬

청렴소통포탈 워드클라우드

OnlyMyStuff 2024. 1. 8. 17:35

 

요즘 회사가 시끄러워서

 

직원들이 뭐에 불만이 있는지 궁금했다

 

그래서 한번 만들어봤다

 

별건 없다. 글이 너무 많아서 워드클라우드로 한눈에 보려고 했다.

 

# -*- coding: utf-8 -*-
"""
Created on Tue May 23 11:36:34 2023

@author: Administrator
"""

#%%
import pandas as pd
import os
import re
from selenium import webdriver 
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.support.select import Select
from time import sleep
os.chdir(r"C:\python\crawling_something")

from selenium.webdriver.common.by import By

    
#%% 홈페이지 켜기

# The target address is blank in the published script; fill it in before running.
url = ''
driver = webdriver.Chrome('chromedriver')
driver.get(url)

#%% Login button

# The login button is looked up after switching into the first iframe of the page.
iframe_list = driver.find_elements(By.TAG_NAME, 'iframe')
login_frame = iframe_list[0]
driver.switch_to.frame(login_frame)

login_button = driver.find_element(By.XPATH, '//*[@id="loginButton"]')
login_button.click()

#%% Click the CEO portal

ceo_portal_icon = driver.find_element(By.XPATH, '//*[@id="gnb_menu"]/ul/li[1]/a/img')
ceo_portal_icon.click()

#%% Switch window handle

# Continue in the second window handle.
# NOTE(review): presumably the portal opens in a new window - confirm.
portal_window = driver.window_handles[1]
driver.switch_to.window(portal_window)


#%% Click the communication board

board_menu = driver.find_element(By.XPATH, '//*[@id="gnb"]/li[4]/a')
board_menu.click()

#%% 추출 함수 생성

def get_items(driver, xpath):
    """Open one post from the list page, read its fields, and go back.

    Args:
        driver: Selenium WebDriver (or compatible object) showing the post list.
        xpath: XPath of the list-page element that opens the post when clicked.

    Returns:
        A 7-tuple ``(title, date, author, content, delete, up, down)`` holding
        the ``.text`` of the corresponding page elements. ``delete``, ``up``
        and ``down`` come from the ``report_cnt_val``, ``like_cnt_val`` and
        ``un_like_cnt_val`` elements respectively.

    Raises:
        Any exception raised by ``driver.find_element`` when the link or one
        of the fields cannot be located. If the failure happens after the
        post was opened, the browser is still navigated back first.
    """
    field_xpaths = (
        '//*[@id="cmsContent"]/div[1]/table/thead/tr[1]/td',     # title
        '//*[@id="cmsContent"]/div[1]/table/thead/tr[2]/td[1]',  # date
        '//*[@id="cmsContent"]/div[1]/table/thead/tr[2]/td[2]',  # author
        '//*[@id="cmsContent"]/div[2]',                          # content
        '//*[@id="report_cnt_val"]',                             # delete
        '//*[@id="like_cnt_val"]',                               # up
        '//*[@id="un_like_cnt_val"]',                            # down
    )

    # Kept outside the try: if the link itself is missing we never left the
    # list page, so navigating back would move away from it.
    driver.find_element('xpath', xpath).click()

    try:
        title, date, author, content, delete, up, down = (
            driver.find_element('xpath', field_xpath).text
            for field_xpath in field_xpaths
        )
    finally:
        # Always return to the list page, even when a field is missing;
        # otherwise every later list-page lookup would fail as well.
        driver.back()

    return title, date, author, content, delete, up, down

#%% 리스트 생성

# One independent accumulator per scraped field; the crawl loop appends to
# each of them once per successfully read post.
(title_list, date_list, author_list, content_list,
 delete_list, up_list, down_list) = ([] for _ in range(7))

#%% 페이지 수 계산

# The first cell of table row tr[3] is parsed as an integer and used as the
# number of posts to crawl.
# NOTE(review): presumably this is the newest post's sequence number - confirm.
first_row_number = driver.find_element(
    'xpath', '//*[@id="cmsContent"]/div[2]/table/tbody/tr[3]/td[1]'
).text
post_max = int(first_row_number)

# Ceiling division by 10: a partially filled last page still counts as a page.
full_pages, leftover_posts = divmod(post_max, 10)
page_max = full_pages + 1 if leftover_posts else full_pages

#%% 페이지별 크롤링

# Visit every list page in order and collect each post on it into the
# module-level *_list accumulators.
for page_num in range(1, page_max + 1):
    # Open the page by clicking the pager entry whose text is the page number.
    driver.find_element(By.XPATH, f"//*[@id='pagingNav']//*[text()='{page_num}']").click()
    print('page_num :', page_num)
    for post_num in range(1, 11):
        print('post_num :', post_num)
        # Post rows start at the third table row, hence the +2 offset.
        xpath = f'//*[@id="cmsContent"]/div[2]/table/tbody/tr[{post_num + 2}]/td[5]/a'
        try:
            title, date, author, content, delete, up, down = get_items(driver, xpath)
        except Exception as err:
            # Best effort: a row that cannot be read (e.g. a short last page)
            # is skipped, but reported instead of silently swallowed.
            # KeyboardInterrupt is no longer caught, so the crawl can be stopped.
            print('skipped post', post_num, 'on page', page_num, ':', repr(err))
        else:
            collected = (
                (title_list, title),
                (date_list, date),
                (author_list, author),
                (content_list, content),
                (delete_list, delete),
                (up_list, up),
                (down_list, down),
            )
            for bucket, value in collected:
                bucket.append(value)
    # After every 10th page, advance the pager to the next group of pages.
    # Skipped after the final page: there is nothing left to advance to, and a
    # missing '>' entry would raise once the crawl is already complete.
    if page_num % 10 == 0 and page_num < page_max:
        driver.find_element(By.XPATH, "//*[@id='pagingNav']//*[text()='>']").click()
        
#%%
import pandas as pd

# Pair each output column name with the list that was filled during the crawl.
field_names = ('title', 'date', 'author', 'content', 'delete', 'up', 'down')
field_values = (title_list, date_list, author_list, content_list,
                delete_list, up_list, down_list)
df = pd.DataFrame(dict(zip(field_names, field_values)))

# Written relative to the current working directory.
df.to_csv('result.csv')

# -*- coding: utf-8 -*-
"""
Created on Fri Jan  5 10:57:28 2024

@author: Administrator
"""


# Load the crawl output written by the scraping step.
df = pd.read_csv(r'C:/python/crawling_something/result.csv')

#%%
from konlpy.tag import Okt

from collections import Counter
from wordcloud import WordCloud

import konlpy

# read_csv parses empty fields as NaN, so posts without text arrive as floats.
# Normalise them to '' before the string operations, then replace every
# character that is not a Hangul syllable with a space.
df['content'] = (
    df['content']
    .fillna('')
    .astype(str)
    .str.replace('[^가-힣]', ' ', regex=True)
)

kkma = konlpy.tag.Kkma()  # Kkma morphological analyzer

# Blank texts yield no nouns, so they are mapped to an empty list directly
# instead of being sent to the analyzer.
nouns = df['content'].apply(
    lambda text: kkma.nouns(text) if text.strip() else []
)

#%%

# Flatten the per-post noun lists into one row per noun.
nouns = nouns.explode()
df_word = pd.DataFrame({'word': nouns})

# 'count' first holds each word's length; only words of two or more
# characters are kept.
df_word['count'] = df_word['word'].str.len()
long_enough = df_word['count'] >= 2
df_word = df_word[long_enough]

#%%
# Counting rows per word turns 'count' into the word's frequency; the most
# frequent words come first.
per_word = df_word.groupby('word', as_index = False).count()
df_word = per_word.sort_values('count', ascending = False)

#%%
# Mapping word -> frequency, the input format for the word cloud.
indexed_by_word = df_word.set_index('word')
dic_word = indexed_by_word.to_dict()['count']

#%%

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Rendering options; font_path is resolved relative to the working directory.
wc_options = {
    'font_path': 'NG.ttf',
    'width': 400,
    'height': 400,
    'background_color': 'white',
    'random_state': 123,
}
wc = WordCloud(**wc_options)

img_wordcloud = wc.generate_from_frequencies(dic_word)

plt.figure(figsize=(10, 10))  # set the figure size
plt.axis('off')  # hide the axes
plt.imshow(img_wordcloud)  # draw the word cloud
plt.savefig('동감_워드클라우드')  # save the figure to a file

#%%

'발전의 의지 > 파이썬' 카테고리의 다른 글

해볼만한 것  (0) 2024.08.13
stackplot  (0) 2024.06.19
교통사고 등급 클러스터링  (1) 2023.12.21
빅데이터분석기사 취득  (0) 2023.12.18
TCS 활용 UAM 노선 살펴보기  (0) 2023.12.14