The company has been noisy lately,
and I got curious about what the employees were unhappy about.
So I put this together.
Nothing fancy; there were just too many posts to read by hand.
# -*- coding: utf-8 -*-
"""
Created on Tue May 23 11:36:34 2023
@author: Administrator
"""
#%%
import pandas as pd
import os
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.support.select import Select
from time import sleep
os.chdir(r"C:\python\crawling_something")
#%% Open the homepage
driver = webdriver.Chrome('chromedriver')
url = ''
driver.get(url)
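#%% Setup note: passing the 'chromedriver' path positionally assumes Selenium 3.x.
# On Selenium 4 the equivalent is roughly the following (the path is only an example):
#     from selenium.webdriver.chrome.service import Service
#     driver = webdriver.Chrome(service=Service(r"C:\python\chromedriver.exe"))
# and from 4.6 onward a bare webdriver.Chrome() resolves the driver automatically.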
#%% Login button
iframe_list = driver.find_elements(By.TAG_NAME,'iframe' )
driver.switch_to.frame(iframe_list[0])
driver.find_element('xpath',r'//*[@id="loginButton"]').click()
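# If the iframe or the login button renders late, explicit waits are sturdier than
# indexing iframe_list[0] directly. A sketch using the same locators (not from the original post):
#     from selenium.webdriver.support.ui import WebDriverWait
#     from selenium.webdriver.support import expected_conditions as EC
#     WebDriverWait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.TAG_NAME, 'iframe')))
#     WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'loginButton'))).click()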
#%% Click the CEO portal
driver.find_element('xpath',r'//*[@id="gnb_menu"]/ul/li[1]/a/img').click()
#%% Switch to the new window handle
driver.switch_to.window(driver.window_handles[1])
#%% Click the 소통광장 (communication board) menu
driver.find_element('xpath','//*[@id="gnb"]/li[4]/a').click()
#%% Extraction function
def get_items(driver, xpath):
    """Open one post, read its fields from the detail view, then go back to the list."""
    driver.find_element('xpath', xpath).click()
    title = driver.find_element('xpath', '//*[@id="cmsContent"]/div[1]/table/thead/tr[1]/td').text
    date = driver.find_element('xpath', '//*[@id="cmsContent"]/div[1]/table/thead/tr[2]/td[1]').text
    author = driver.find_element('xpath', '//*[@id="cmsContent"]/div[1]/table/thead/tr[2]/td[2]').text
    content = driver.find_element('xpath', '//*[@id="cmsContent"]/div[2]').text
    delete = driver.find_element('xpath', '//*[@id="report_cnt_val"]').text   # report count
    up = driver.find_element('xpath', '//*[@id="like_cnt_val"]').text         # likes
    down = driver.find_element('xpath', '//*[@id="un_like_cnt_val"]').text    # dislikes
    driver.back()
    return title, date, author, content, delete, up, down
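#%% Defensive variant of the lookups above: a post missing one field (e.g. no votes yet)
# would otherwise abort the whole post. This helper is only a sketch, not part of the
# original script; swap it into get_items if that actually happens on your board.
from selenium.common.exceptions import NoSuchElementException

def safe_text(driver, xpath, default=''):
    """Return the element's text, or a default when the element is not on the page."""
    try:
        return driver.find_element('xpath', xpath).text
    except NoSuchElementException:
        return default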
#%% Create result lists
title_list = []
date_list = []
author_list = []
content_list = []
delete_list = []
up_list = []
down_list = []
#%% Count the pages (10 posts per page)
post_max = int(driver.find_element('xpath','//*[@id="cmsContent"]/div[2]/table/tbody/tr[3]/td[1]').text)
if post_max % 10 == 0:
    page_max = post_max // 10
else:
    page_max = post_max // 10 + 1
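#%% The same page count as a one-line ceiling division, kept here only as a cross-check
import math
assert page_max == math.ceil(post_max / 10)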
#%% Crawl page by page
for page_num in range(1, page_max + 1):
    # Click the page number in the pagination bar
    driver.find_element(By.XPATH, f"//*[@id='pagingNav']//*[text()='{page_num}']").click()
    print('page_num :', page_num)
    for post_num in range(1, 11):
        print('post_num :', post_num)
        try:
            xpath = f'//*[@id="cmsContent"]/div[2]/table/tbody/tr[{post_num + 2}]/td[5]/a'
            title, date, author, content, delete, up, down = get_items(driver, xpath)
            title_list.append(title)
            date_list.append(date)
            author_list.append(author)
            content_list.append(content)
            delete_list.append(delete)
            up_list.append(up)
            down_list.append(down)
        except Exception:
            # the last page may have fewer than 10 posts; skip missing rows
            pass
    if page_num % 10 == 0:
        # the pagination bar shows 10 numbers at a time; click '>' to load the next block
        driver.find_element(By.XPATH, "//*[@id='pagingNav']//*[text()='>']").click()
#%%
df = pd.DataFrame({'title': title_list, 'date': date_list, 'author': author_list, 'content': content_list,
                   'delete': delete_list, 'up': up_list, 'down': down_list})
df.to_csv('result.csv')
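#%% Optional: an Excel-friendly copy of the same table. 'utf-8-sig' writes a BOM so Excel
# displays the Hangul correctly, and index=False drops the unnamed index column.
# The filename below is just an example.
df.to_csv('result_excel.csv', index=False, encoding='utf-8-sig')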
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 5 10:57:28 2024
@author: Administrator
"""
df = pd.read_csv(r'C:/python/crawling_something/result.csv')
#%%
from konlpy.tag import Kkma
from collections import Counter
from wordcloud import WordCloud
df['content'] = df['content'].str.replace('[^가-힣]', ' ', regex=True)  # keep Hangul syllables only
kkma = Kkma()  # Kkma morphological analyzer
nouns = df['content'].apply(kkma.nouns)  # extract the nouns of each post
#%%
nouns = nouns.explode()                        # one noun per row
df_word = pd.DataFrame({'word': nouns})
df_word['count'] = df_word['word'].str.len()   # character length, used only for filtering
df_word = df_word.query('count >= 2')          # keep nouns of two or more characters
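#%% Optional: drop overly generic nouns before counting. The stopword list below is a
# made-up example, not from the original post; fill it with whatever dominates your own board.
stopwords = ['회사', '직원', '부분']  # hypothetical examples
df_word = df_word[~df_word['word'].isin(stopwords)]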
#%%
df_word = df_word.groupby('word', as_index=False).count().sort_values('count', ascending=False)  # 'count' now holds the word frequency
#%%
dic_word = df_word.set_index('word').to_dict()['count']
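#%% The Counter imported earlier could also build frequencies straight from the exploded Series.
# Shown only as an alternative sketch; the groupby result above is what feeds the word cloud.
dic_word_alt = dict(Counter(w for w in nouns.dropna() if len(w) >= 2))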
#%%
import matplotlib.pyplot as plt
wc = WordCloud(random_state=123, width=400, height=400,
               font_path='NG.ttf', background_color='white')  # NG.ttf: a Korean-capable font
img_wordcloud = wc.generate_from_frequencies(dic_word)
plt.figure(figsize=(10, 10))       # set the figure size
plt.axis('off')                    # hide the axes
plt.imshow(img_wordcloud)          # display the word cloud
plt.savefig('동감_워드클라우드')      # save the figure
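#%% WordCloud can also write the image directly, without the matplotlib figure margins;
# the filename here is just an example.
wc.to_file('동감_워드클라우드.png')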
#%%