File size: 4,513 Bytes
22acb53
2130106
 
 
 
 
 
f2496ac
 
 
 
 
 
 
 
 
 
22acb53
24d4881
aaef8fd
fdb1d41
aaef8fd
879bc79
 
404f618
879bc79
 
 
 
 
 
 
 
 
 
 
 
 
2130106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
879bc79
e94df39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9c6f4a1
 
e94df39
 
 
58f2b8f
9c6f4a1
e94df39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import time

import streamlit as st
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import *
from tqdm import tqdm
from tensorflow.python.client import device_lib

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# Directory holding the fine-tuned BERT checkpoint (Hugging Face format).
PATH = './checkpoint-7500/'
# Fixed input sequence length; all inputs are padded/truncated to this size.
SEQ_LEN = 128
# Shared module-level tokenizer. NOTE(review): assumes network access or a
# local Hugging Face cache for the pretrained vocab — confirm in deployment.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def create_sentiment_bert():
    """Build a binary sentiment classifier on top of the fine-tuned BERT weights.

    Returns:
        A compiled tf.keras.Model that takes [token_ids, attention_mask,
        segment_ids], each shaped (SEQ_LEN,), and outputs a single sigmoid
        probability (positive sentiment).
    """
    # Load the locally fine-tuned BERT checkpoint (no network download).
    bert = TFAutoModel.from_pretrained(PATH, local_files_only=True)

    # Three int32 inputs: token ids, attention mask, segment ids.
    inputs = [
        tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name=layer_name)
        for layer_name in ('input_word_ids', 'input_masks', 'input_segment')
    ]

    # Index 1 of the BERT outputs is the pooled representation; a single
    # sigmoid unit on top yields the positive-class probability.
    pooled = bert(inputs)[1]
    probability = tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
    )(pooled)

    classifier = tf.keras.Model(inputs, probability)
    classifier.compile(loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
    return classifier

def sentence_convert_data(data):
    """Tokenize one sentence into the three BERT input arrays.

    Args:
        data: raw sentence string.

    Returns:
        [tokens, masks, segments]: three numpy arrays, each of shape
        (1, SEQ_LEN) — token ids padded/truncated to SEQ_LEN, an attention
        mask (1 for real tokens, 0 for padding), and all-zero segment ids
        (single-sentence input).
    """
    # `tokenizer` is only read here, so the original `global tokenizer`
    # declaration was unnecessary and has been removed.
    token = tokenizer.encode(data, max_length=SEQ_LEN, truncation=True, padding='max_length')

    # Pad token id is 0, so counting zeros gives the padding length.
    # NOTE(review): this also counts any genuine 0 ids in the text — identical
    # to the original implementation's behavior.
    num_zeros = token.count(0)
    mask = [1] * (SEQ_LEN - num_zeros) + [0] * num_zeros
    segment = [0] * SEQ_LEN

    # Wrap each list in a length-1 batch dimension directly instead of
    # appending to intermediate lists.
    return [np.array([token]), np.array([mask]), np.array([segment])]

def movie_evaluation_predict(sentence, model=None):
    """Classify a movie-review sentence and render the verdict via Streamlit.

    Args:
        sentence: review text to classify.
        model: optional compiled sentiment model. When None, a model is built
            with create_sentiment_bert(). The original body referenced a
            module-level `sentiment_model` that was never defined (it existed
            only as a local inside main()), so any call raised NameError —
            this parameter fixes that while staying backward-compatible.
    """
    if model is None:
        model = create_sentiment_bert()

    data_x = sentence_convert_data(sentence)
    predict = model.predict(data_x)
    predict_value = np.ravel(predict)
    # Round the sigmoid output to 0/1 for the class decision.
    predict_answer = np.round(predict_value, 0).item()

    print(predict_value)

    if predict_answer == 0:
        st.write("(부정 확률 : %.2f) 부정적인 영화 평가입니다." % (1.0-predict_value))
    elif predict_answer == 1:
        st.write("(긍정 확률 : %.2f) 긍정적인 영화 평가입니다." % predict_value)

def setup_driver():
    """Create and return a headless Chrome WebDriver for server-side scraping."""
    opts = Options()
    # Run without a visible browser window.
    opts.add_argument("--headless")
    # Flags commonly required when Chrome runs inside containers.
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    return webdriver.Chrome(options=opts)

def scrape_content(url):
    """Fetch `url` with headless Chrome and extract the article body and comments.

    Args:
        url: page URL to scrape.

    Returns:
        dict with 'content' (article text, or a Korean "not found" message
        when no <article> exists) and 'comments' (list of comment strings,
        possibly empty).
    """
    driver = setup_driver()
    try:
        driver.get(url)
        # Wait for the article body instead of a blind time.sleep(3); the
        # WebDriverWait/EC imports at the top of the file were previously
        # unused. On timeout fall through and parse whatever loaded, which
        # preserves the original "본문을 찾을 수 없습니다." behavior for
        # pages without an <article>.
        try:
            WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.TAG_NAME, 'article'))
            )
        except TimeoutException:
            pass

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        content = soup.find('article')  # adjust selector to the target site
        comments = soup.find_all('span', class_='u_cbox_contents')  # comment tag selector

        return {
            'content': content.text if content else "본문을 찾을 수 없습니다.",
            'comments': [comment.text for comment in comments]
        }
    finally:
        # Always release the browser process, even when scraping fails.
        driver.quit()


def main():
    """Streamlit entry point: crawl a user-supplied URL and show its contents.

    Returns:
        0 on completion (kept for backward compatibility with callers that
        inspect the return value).
    """
    # Built on every script run; currently only needed by the commented-out
    # sentiment demo below. Kept to preserve the original behavior —
    # NOTE(review): consider st.cache_resource to avoid reloading per rerun.
    sentiment_model = create_sentiment_bert()

    url = st.text_input("URL을 입력하세요")

    if st.button("크롤링 시작"):
        if url:
            with st.spinner("크롤링 중..."):
                result = scrape_content(url)

                st.subheader("본문")
                st.write(result['content'])

                st.subheader("댓글")
                for idx, comment in enumerate(result['comments'], 1):
                    st.write(f"{idx}. {comment}")
        else:
            st.error("URL을 입력해주세요")

    # The block below was a bare triple-quoted string expression (evaluated
    # every run, not a comment) — converted to real comments. The demo call is
    # also updated to pass the locally built model explicitly.
    # test = st.form('test')
    # sentence = test.text_input("Your sentence")
    # submit = test.form_submit_button("Submit")
    #
    # if submit:
    #     movie_evaluation_predict(sentence, model=sentiment_model)
    return 0

# Standard script entry guard: run the app only when executed directly.
if __name__ == "__main__":
    main()