# comment_filter — app.py (Hugging Face Space, revision 58f2b8f)
# Streamlit app: headless-Chrome news crawler + BERT-based comment sentiment filter.
import streamlit as st
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import *
from tqdm import tqdm
from tensorflow.python.client import device_lib
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
# Directory holding the fine-tuned BERT checkpoint consumed by create_sentiment_bert().
PATH = './checkpoint-7500/'
# Fixed input sequence length; all model inputs are padded/truncated to this.
SEQ_LEN = 128
# Tokenizer matching the multilingual-cased BERT vocabulary the checkpoint was trained on.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
def create_sentiment_bert():
    """Build and compile a binary sentiment classifier on the fine-tuned BERT.

    Returns a Keras model that takes [token_ids, attention_mask, segment_ids]
    (each of shape (batch, SEQ_LEN), int32) and outputs a sigmoid probability.
    """
    # Load the locally fine-tuned BERT backbone from the checkpoint directory.
    backbone = TFAutoModel.from_pretrained(PATH, local_files_only=True)

    # Three integer inputs: word-piece ids, attention mask, segment ids.
    ids_in = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
    mask_in = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')
    seg_in = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segment')

    # Element 1 of the BERT output tuple — presumably the pooled [CLS]
    # representation (TODO confirm against the checkpoint's model class).
    pooled = backbone([ids_in, mask_in, seg_in])[1]

    head = tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
    )
    classifier = tf.keras.Model([ids_in, mask_in, seg_in], head(pooled))
    classifier.compile(loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
    return classifier
def sentence_convert_data(data):
    """Tokenize a single sentence into model-ready BERT inputs.

    Parameters
    ----------
    data : str
        Raw sentence to classify.

    Returns
    -------
    list[np.ndarray]
        [token_ids, attention_mask, segment_ids], each of shape (1, SEQ_LEN).
    """
    # The original declared ``global tokenizer`` although the name is only
    # read, never rebound — the statement was a no-op and is dropped.
    token = tokenizer.encode(data, max_length=SEQ_LEN, truncation=True,
                             padding='max_length')
    # NOTE(review): padding length is inferred by counting id 0 ([PAD] in this
    # vocab); tokenizer(...)['attention_mask'] would be the canonical source —
    # confirm id 0 cannot occur mid-sentence before changing this.
    num_zeros = token.count(0)
    mask = [1] * (SEQ_LEN - num_zeros) + [0] * num_zeros
    segment = [0] * SEQ_LEN
    # Exactly one sample: wrap each list in a batch dimension directly instead
    # of the append-then-convert dance.
    return [np.array([token]), np.array([mask]), np.array([segment])]
def movie_evaluation_predict(sentence):
    """Classify *sentence* with the sentiment model and render the verdict.

    Relies on a module-level ``sentiment_model`` existing before the call
    (see ``main``); raises NameError otherwise. Writes the result to the
    Streamlit page rather than returning it.
    """
    data_x = sentence_convert_data(sentence)
    predict = sentiment_model.predict(data_x)
    # Collapse the (1, 1) sigmoid output to a plain Python float up front:
    # "%.2f" % <size-1 ndarray> is deprecated and raises on NumPy >= 1.25.
    predict_value = float(np.ravel(predict)[0])
    predict_answer = round(predict_value)
    if predict_answer == 0:
        st.write("(๋ถ€์ • ํ™•๋ฅ  : %.2f) ๋ถ€์ •์ ์ธ ์˜ํ™” ํ‰๊ฐ€์ž…๋‹ˆ๋‹ค." % (1.0 - predict_value))
    elif predict_answer == 1:
        st.write("(๊ธ์ • ํ™•๋ฅ  : %.2f) ๊ธ์ •์ ์ธ ์˜ํ™” ํ‰๊ฐ€์ž…๋‹ˆ๋‹ค." % predict_value)
def setup_driver():
    """Create a headless Chrome WebDriver configured for container use."""
    opts = Options()
    # Headless plus sandbox/shared-memory flags so Chrome starts inside the
    # restricted app container.
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)
def scrape_content(url):
    """Fetch *url* with headless Chrome and extract the article body and comments.

    Parameters
    ----------
    url : str
        Page to crawl.

    Returns
    -------
    dict
        'content': text of the first <article> tag, or a fallback message when
        absent; 'comments': list of texts from <span class="u_cbox_contents">
        nodes (presumably a Naver comment widget — verify against target site).
    """
    driver = setup_driver()
    try:
        driver.get(url)
        # Use the explicit wait the file already imports instead of only a
        # fixed sleep: comments are injected by JS, so wait for them directly.
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, "u_cbox_contents"))
            )
        except Exception:
            # Page has no comments (or is slow); fall back to the original
            # fixed pause so the rest of the DOM can settle.
            time.sleep(3)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Adjust the selectors below to the target site's markup if needed.
        content = soup.find('article')
        comments = soup.find_all('span', class_='u_cbox_contents')
        return {
            'content': content.text if content else "๋ณธ๋ฌธ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.",
            'comments': [comment.text for comment in comments],
        }
    finally:
        # Always release the browser, even when extraction fails.
        driver.quit()
def main():
    """Streamlit entry point: crawl a user-supplied URL and show body + comments.

    Returns 0 (kept for compatibility with the original signature).
    """
    # Bind the model at module scope: movie_evaluation_predict() reads the
    # *global* ``sentiment_model``, but the original only created a local,
    # which left that lookup a latent NameError.
    global sentiment_model
    sentiment_model = create_sentiment_bert()

    url = st.text_input("URL์„ ์ž…๋ ฅํ•˜์„ธ์š”")
    if st.button("ํฌ๋กค๋ง ์‹œ์ž‘"):
        if url:
            with st.spinner("ํฌ๋กค๋ง ์ค‘..."):
                result = scrape_content(url)
                st.subheader("๋ณธ๋ฌธ")
                st.write(result['content'])
                st.subheader("๋Œ“๊ธ€")
                for idx, comment in enumerate(result['comments'], 1):
                    st.write(f"{idx}. {comment}")
        else:
            st.error("URL์„ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”")

    # Disabled sentiment-demo form, kept as comments (the original held it in a
    # dead triple-quoted string expression):
    # test = st.form('test')
    # sentence = test.text_input("Your sentence")
    # submit = test.form_submit_button("Submit")
    # if submit:
    #     movie_evaluation_predict(sentence)
    return 0
# Script entry point: launch the Streamlit UI when run directly.
if __name__ == "__main__":
    main()