# comment_filter — app.py (Hugging Face Space, revision 58f2b8f)
# Streamlit app: headless-Chrome news crawler + BERT-based comment sentiment filter.
import streamlit as st
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import *
from tqdm import tqdm
from tensorflow.python.client import device_lib
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
# Directory holding the fine-tuned BERT checkpoint consumed by create_sentiment_bert().
PATH = './checkpoint-7500/'
# Fixed input sequence length; all model inputs are padded/truncated to this.
SEQ_LEN = 128
# Tokenizer matching the multilingual-cased BERT vocabulary the checkpoint was trained on.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
def create_sentiment_bert():
    """Build and compile a binary sentiment classifier on the fine-tuned BERT.

    Returns a Keras model that takes [token_ids, attention_mask, segment_ids]
    (each of shape (batch, SEQ_LEN), int32) and outputs a sigmoid probability.
    """
    # Load the locally fine-tuned BERT backbone from the checkpoint directory.
    backbone = TFAutoModel.from_pretrained(PATH, local_files_only=True)

    # Three integer inputs: word-piece ids, attention mask, segment ids.
    ids_in = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
    mask_in = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')
    seg_in = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segment')

    # Element 1 of the BERT output tuple — presumably the pooled [CLS]
    # representation (TODO confirm against the checkpoint's model class).
    pooled = backbone([ids_in, mask_in, seg_in])[1]

    head = tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
    )
    classifier = tf.keras.Model([ids_in, mask_in, seg_in], head(pooled))
    classifier.compile(loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
    return classifier
def sentence_convert_data(data):
    """Tokenize a single sentence into model-ready BERT inputs.

    Parameters
    ----------
    data : str
        Raw sentence to classify.

    Returns
    -------
    list[np.ndarray]
        [token_ids, attention_mask, segment_ids], each of shape (1, SEQ_LEN).
    """
    # The original declared ``global tokenizer`` although the name is only
    # read, never rebound — the statement was a no-op and is dropped.
    token = tokenizer.encode(data, max_length=SEQ_LEN, truncation=True,
                             padding='max_length')
    # NOTE(review): padding length is inferred by counting id 0 ([PAD] in this
    # vocab); tokenizer(...)['attention_mask'] would be the canonical source —
    # confirm id 0 cannot occur mid-sentence before changing this.
    num_zeros = token.count(0)
    mask = [1] * (SEQ_LEN - num_zeros) + [0] * num_zeros
    segment = [0] * SEQ_LEN
    # Exactly one sample: wrap each list in a batch dimension directly instead
    # of the append-then-convert dance.
    return [np.array([token]), np.array([mask]), np.array([segment])]
def movie_evaluation_predict(sentence):
    """Classify *sentence* with the sentiment model and render the verdict.

    Relies on a module-level ``sentiment_model`` existing before the call
    (see ``main``); raises NameError otherwise. Writes the result to the
    Streamlit page rather than returning it.
    """
    data_x = sentence_convert_data(sentence)
    predict = sentiment_model.predict(data_x)
    # Collapse the (1, 1) sigmoid output to a plain Python float up front:
    # "%.2f" % <size-1 ndarray> is deprecated and raises on NumPy >= 1.25.
    predict_value = float(np.ravel(predict)[0])
    predict_answer = round(predict_value)
    if predict_answer == 0:
        st.write("(๋ถ€์ • ํ™•๋ฅ  : %.2f) ๋ถ€์ •์ ์ธ ์˜ํ™” ํ‰๊ฐ€์ž…๋‹ˆ๋‹ค." % (1.0 - predict_value))
    elif predict_answer == 1:
        st.write("(๊ธ์ • ํ™•๋ฅ  : %.2f) ๊ธ์ •์ ์ธ ์˜ํ™” ํ‰๊ฐ€์ž…๋‹ˆ๋‹ค." % predict_value)
def setup_driver():
    """Create a headless Chrome WebDriver configured for container use."""
    opts = Options()
    # Headless plus sandbox/shared-memory flags so Chrome starts inside the
    # restricted app container.
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)
def scrape_content(url):
    """Fetch *url* with headless Chrome and extract the article body and comments.

    Parameters
    ----------
    url : str
        Page to crawl.

    Returns
    -------
    dict
        'content': text of the first <article> tag, or a fallback message when
        absent; 'comments': list of texts from <span class="u_cbox_contents">
        nodes (presumably a Naver comment widget — verify against target site).
    """
    driver = setup_driver()
    try:
        driver.get(url)
        # Use the explicit wait the file already imports instead of only a
        # fixed sleep: comments are injected by JS, so wait for them directly.
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, "u_cbox_contents"))
            )
        except Exception:
            # Page has no comments (or is slow); fall back to the original
            # fixed pause so the rest of the DOM can settle.
            time.sleep(3)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Adjust the selectors below to the target site's markup if needed.
        content = soup.find('article')
        comments = soup.find_all('span', class_='u_cbox_contents')
        return {
            'content': content.text if content else "๋ณธ๋ฌธ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.",
            'comments': [comment.text for comment in comments],
        }
    finally:
        # Always release the browser, even when extraction fails.
        driver.quit()
def main():
    """Streamlit entry point: crawl a user-supplied URL and show body + comments.

    Returns 0 (kept for compatibility with the original signature).
    """
    # Bind the model at module scope: movie_evaluation_predict() reads the
    # *global* ``sentiment_model``, but the original only created a local,
    # which left that lookup a latent NameError.
    global sentiment_model
    sentiment_model = create_sentiment_bert()

    url = st.text_input("URL์„ ์ž…๋ ฅํ•˜์„ธ์š”")
    if st.button("ํฌ๋กค๋ง ์‹œ์ž‘"):
        if url:
            with st.spinner("ํฌ๋กค๋ง ์ค‘..."):
                result = scrape_content(url)
                st.subheader("๋ณธ๋ฌธ")
                st.write(result['content'])
                st.subheader("๋Œ“๊ธ€")
                for idx, comment in enumerate(result['comments'], 1):
                    st.write(f"{idx}. {comment}")
        else:
            st.error("URL์„ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”")

    # Disabled sentiment-demo form, kept as comments (the original held it in a
    # dead triple-quoted string expression):
    # test = st.form('test')
    # sentence = test.text_input("Your sentence")
    # submit = test.form_submit_button("Submit")
    # if submit:
    #     movie_evaluation_predict(sentence)
    return 0
# Script entry point: launch the Streamlit UI when run directly.
if __name__ == "__main__":
    main()