File size: 4,513 Bytes
22acb53
2130106
 
 
 
 
 
f2496ac
 
 
 
 
 
 
 
 
 
22acb53
24d4881
aaef8fd
fdb1d41
aaef8fd
879bc79
 
404f618
879bc79
 
 
 
 
 
 
 
 
 
 
 
 
2130106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
879bc79
e94df39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9c6f4a1
 
e94df39
 
 
58f2b8f
9c6f4a1
e94df39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import time

import streamlit as st
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import *
from tqdm import tqdm
from tensorflow.python.client import device_lib

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# Directory holding the fine-tuned BERT checkpoint (Hugging Face format).
PATH = './checkpoint-7500/'
# Fixed input sequence length; all inputs are padded/truncated to this size.
SEQ_LEN = 128
# Shared module-level tokenizer. NOTE(review): assumes network access or a
# local Hugging Face cache for the pretrained vocab — confirm in deployment.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def create_sentiment_bert():
    """Build a binary sentiment classifier on top of the fine-tuned BERT weights.

    Returns:
        A compiled tf.keras.Model that takes [token_ids, attention_mask,
        segment_ids], each shaped (SEQ_LEN,), and outputs a single sigmoid
        probability (positive sentiment).
    """
    # Load the locally fine-tuned BERT checkpoint (no network download).
    bert = TFAutoModel.from_pretrained(PATH, local_files_only=True)

    # Three int32 inputs: token ids, attention mask, segment ids.
    inputs = [
        tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name=layer_name)
        for layer_name in ('input_word_ids', 'input_masks', 'input_segment')
    ]

    # Index 1 of the BERT outputs is the pooled representation; a single
    # sigmoid unit on top yields the positive-class probability.
    pooled = bert(inputs)[1]
    probability = tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
    )(pooled)

    classifier = tf.keras.Model(inputs, probability)
    classifier.compile(loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
    return classifier

def sentence_convert_data(data):
    """Tokenize one sentence into the three BERT input arrays.

    Args:
        data: raw sentence string.

    Returns:
        [tokens, masks, segments]: three numpy arrays, each of shape
        (1, SEQ_LEN) — token ids padded/truncated to SEQ_LEN, an attention
        mask (1 for real tokens, 0 for padding), and all-zero segment ids
        (single-sentence input).
    """
    # `tokenizer` is only read here, so the original `global tokenizer`
    # declaration was unnecessary and has been removed.
    token = tokenizer.encode(data, max_length=SEQ_LEN, truncation=True, padding='max_length')

    # Pad token id is 0, so counting zeros gives the padding length.
    # NOTE(review): this also counts any genuine 0 ids in the text — identical
    # to the original implementation's behavior.
    num_zeros = token.count(0)
    mask = [1] * (SEQ_LEN - num_zeros) + [0] * num_zeros
    segment = [0] * SEQ_LEN

    # Wrap each list in a length-1 batch dimension directly instead of
    # appending to intermediate lists.
    return [np.array([token]), np.array([mask]), np.array([segment])]

def movie_evaluation_predict(sentence, model=None):
    """Classify a movie-review sentence and render the verdict via Streamlit.

    Args:
        sentence: review text to classify.
        model: optional compiled sentiment model. When None, a model is built
            with create_sentiment_bert(). The original body referenced a
            module-level `sentiment_model` that was never defined (it existed
            only as a local inside main()), so any call raised NameError —
            this parameter fixes that while staying backward-compatible.
    """
    if model is None:
        model = create_sentiment_bert()

    data_x = sentence_convert_data(sentence)
    predict = model.predict(data_x)
    predict_value = np.ravel(predict)
    # Round the sigmoid output to 0/1 for the class decision.
    predict_answer = np.round(predict_value, 0).item()

    print(predict_value)

    if predict_answer == 0:
        st.write("(부정 확률 : %.2f) 부정적인 영화 평가입니다." % (1.0-predict_value))
    elif predict_answer == 1:
        st.write("(긍정 확률 : %.2f) 긍정적인 영화 평가입니다." % predict_value)

def setup_driver():
    """Create and return a headless Chrome WebDriver for server-side scraping."""
    opts = Options()
    # Run without a visible browser window.
    opts.add_argument("--headless")
    # Flags commonly required when Chrome runs inside containers.
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    return webdriver.Chrome(options=opts)

def scrape_content(url):
    """Fetch `url` with headless Chrome and extract the article body and comments.

    Args:
        url: page URL to scrape.

    Returns:
        dict with 'content' (article text, or a Korean "not found" message
        when no <article> exists) and 'comments' (list of comment strings,
        possibly empty).
    """
    driver = setup_driver()
    try:
        driver.get(url)
        # Wait for the article body instead of a blind time.sleep(3); the
        # WebDriverWait/EC imports at the top of the file were previously
        # unused. On timeout fall through and parse whatever loaded, which
        # preserves the original "본문을 찾을 수 없습니다." behavior for
        # pages without an <article>.
        try:
            WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.TAG_NAME, 'article'))
            )
        except TimeoutException:
            pass

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        content = soup.find('article')  # adjust selector to the target site
        comments = soup.find_all('span', class_='u_cbox_contents')  # comment tag selector

        return {
            'content': content.text if content else "본문을 찾을 수 없습니다.",
            'comments': [comment.text for comment in comments]
        }
    finally:
        # Always release the browser process, even when scraping fails.
        driver.quit()


def main():
    """Streamlit entry point: crawl a user-supplied URL and show its contents.

    Returns:
        0 on completion (kept for backward compatibility with callers that
        inspect the return value).
    """
    # Built on every script run; currently only needed by the commented-out
    # sentiment demo below. Kept to preserve the original behavior —
    # NOTE(review): consider st.cache_resource to avoid reloading per rerun.
    sentiment_model = create_sentiment_bert()

    url = st.text_input("URL을 입력하세요")

    if st.button("크롤링 시작"):
        if url:
            with st.spinner("크롤링 중..."):
                result = scrape_content(url)

                st.subheader("본문")
                st.write(result['content'])

                st.subheader("댓글")
                for idx, comment in enumerate(result['comments'], 1):
                    st.write(f"{idx}. {comment}")
        else:
            st.error("URL을 입력해주세요")

    # The block below was a bare triple-quoted string expression (evaluated
    # every run, not a comment) — converted to real comments. The demo call is
    # also updated to pass the locally built model explicitly.
    # test = st.form('test')
    # sentence = test.text_input("Your sentence")
    # submit = test.form_submit_button("Submit")
    #
    # if submit:
    #     movie_evaluation_predict(sentence, model=sentiment_model)
    return 0

# Standard script entry guard: run the app only when executed directly.
if __name__ == "__main__":
    main()