import streamlit as st
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Maps model output ids to BIO labels for the four condition slots.
id_to_label = {
    0: 'O',
    1: 'B-TOPIC',
    2: 'I-TOPIC',
    3: 'B-STYLE',
    4: 'I-STYLE',
    5: 'B-LENGTH',
    6: 'I-LENGTH',
    7: 'B-LANGUAGE',
    8: 'I-LANGUAGE'
}
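# In this BIO scheme, "B-" marks the first character of a span, "I-" its
# continuation, and "O" everything outside any span; a three-character
# TOPIC is therefore tagged B-TOPIC, I-TOPIC, I-TOPIC.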

@st.cache_resource
def load_model():
    # "." assumes the fine-tuned model and tokenizer files are saved
    # alongside this script (the usual layout for a Hugging Face Space).
    tokenizer = AutoTokenizer.from_pretrained(".")
    model = AutoModelForTokenClassification.from_pretrained(".")
    return tokenizer, model

tokenizer, model = load_model()

def predict(text, model, tokenizer, id_to_label):
    # The model works on character-level tokens, so split the input into
    # individual characters and let the tokenizer align subwords to them.
    tokens = list(text)
    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True, max_length=128)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

    # Map subword predictions back to the input characters. Special tokens
    # ([CLS]/[SEP]) have a word id of None; when one character is split into
    # several subwords, keep only the first subword's label so the character
    # is not emitted more than once.
    word_ids = inputs.word_ids(batch_index=0)
    pred_labels, tokens_out = [], []
    prev_word_idx = None

    for idx, word_idx in enumerate(word_ids):
        if word_idx is None or word_idx == prev_word_idx:
            continue
        tokens_out.append(tokens[word_idx])
        pred_labels.append(id_to_label[predictions[0][idx].item()])
        prev_word_idx = word_idx

    return tokens_out, pred_labels
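
# Rough sketch of the returned shape (the labels here are hypothetical,
# not real model output):
#   tokens_out  = ['딥', '러', '닝', '을', ...]
#   pred_labels = ['B-TOPIC', 'I-TOPIC', 'I-TOPIC', 'O', ...]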

def post_process(tokens, labels):
    # Merge wordpiece fragments ("##...") back into whole words, dropping
    # special tokens and keeping the label of each word's first piece. With
    # the character-level output of predict() the "##" branch never fires,
    # but it keeps the function safe for raw wordpiece input as well.
    words, word_labels = [], []
    current_word = ""
    current_label = None
    for token, label in zip(tokens, labels):
        if token in ["[CLS]", "[SEP]", "[PAD]"]:
            continue
        if token.startswith("##"):
            current_word += token[2:]
        else:
            if current_word:
                words.append(current_word)
                word_labels.append(current_label)
            current_word = token
            current_label = label
    if current_word:
        words.append(current_word)
        word_labels.append(current_label)
    return words, word_labels

def align_words_labels(words, labels):
    # Pair each word with its predicted label for entity extraction.
    return list(zip(words, labels))

def extract_entities(aligned_result):
    # Walk the BIO sequence and collect contiguous spans: "B-" opens a new
    # entity, "I-" of the matching type extends it, and anything else
    # (including an "I-" whose type does not match) closes the current span.
    entities, current_entity, current_text = [], None, ""
    for word, label in aligned_result:
        if label == "O":
            if current_entity:
                entities.append({"entity": current_entity, "text": current_text})
                current_entity, current_text = None, ""
            continue
        prefix, entity_type = label.split("-", 1)
        if prefix == "B":
            if current_entity:
                entities.append({"entity": current_entity, "text": current_text})
            current_entity, current_text = entity_type, word
        elif prefix == "I" and current_entity == entity_type:
            current_text += word
        else:
            if current_entity:
                entities.append({"entity": current_entity, "text": current_text})
            current_entity, current_text = entity_type, word
    if current_entity:
        entities.append({"entity": current_entity, "text": current_text})
    return entities
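
# For example (traceable by hand through the loop above):
#   extract_entities([("딥", "B-TOPIC"), ("러", "I-TOPIC"), ("닝", "I-TOPIC"),
#                     ("을", "O"), ("영", "B-LANGUAGE"), ("어", "I-LANGUAGE")])
#   -> [{"entity": "TOPIC", "text": "딥러닝"},
#       {"entity": "LANGUAGE", "text": "영어"}]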

# Streamlit UI
st.title("🎯 Learning Condition Extractor")
st.write("Extracts the conditions (TOPIC, STYLE, LENGTH, LANGUAGE) from a user's learning-goal sentence.")

# Default input (Korean): "I want to learn deep learning hands-on within 30 minutes."
user_input = st.text_input("Enter a learning goal:", value="딥러닝을 실습 위주로 30분 이내에 배우고 싶어요")

if st.button("Run inference"):
    tokens, pred_labels = predict(user_input, model, tokenizer, id_to_label)
    words, word_labels = post_process(tokens, pred_labels)
    aligned = align_words_labels(words, word_labels)
    entities = extract_entities(aligned)

    # One value per slot; if a slot is predicted more than once, the last
    # occurrence wins.
    result_dict = {'TOPIC': None, 'STYLE': None, 'LENGTH': None, 'LANGUAGE': None}
    for ent in entities:
        result_dict[ent['entity']] = ent['text']

    st.subheader("📌 Extracted Conditions")
    st.json(result_dict)
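    # Illustrative output only (actual predictions depend on the trained
    # model); for the default sentence it might look like:
    #   {"TOPIC": "딥러닝", "STYLE": "실습", "LENGTH": "30분", "LANGUAGE": null}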