Spaces:
Sleeping
Sleeping
File size: 3,180 Bytes
1c38e8c 1360051 4457e86 aab722c a9d8b80 1360051 f973f9e bcd23af b68ab79 4457e86 1360051 f973f9e 5dcfd82 1360051 f973f9e 1360051 a9d8b80 f89536d 1360051 f973f9e c0a3abb a9d8b80 1360051 c0a3abb 1360051 f973f9e 1360051 f973f9e 1360051 f89536d 5dcfd82 f973f9e 1c38e8c 5dcfd82 bcd23af f973f9e bcd23af 1360051 f89536d 1360051 f89536d f973f9e f89536d bcd23af f89536d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
import streamlit as st
import tensorflow as tf
import numpy as np
import nltk
import os
from nltk.tokenize import sent_tokenize
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
# Use a safe cache directory inside Hugging Face or Docker
# One cache location shared by the environment variable and the explicit
# `cache_dir=` arguments passed to `from_pretrained` below.
HF_CACHE_DIR = "/tmp/huggingface"

# NOTE(review): `transformers` is imported before this assignment runs, so the
# library may already have resolved its default cache location; the explicit
# `cache_dir=` arguments below do not depend on this variable.
os.environ["TRANSFORMERS_CACHE"] = HF_CACHE_DIR

# Directory that holds the NLTK sentence-tokenizer data.
nltk_data_path = "/tmp/nltk_data"

# Streamlit re-executes this script on every interaction while the process
# (and therefore `nltk.data.path`) stays alive, so only register the directory
# once instead of appending a duplicate entry per rerun.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Download the tokenizer data only when NLTK cannot already locate it, so a
# rerun does not trigger another download attempt.
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    nltk.download("punkt_tab", download_dir=nltk_data_path)


# show_spinner=False: loading should not render a spinner element ahead of the
# `st.set_page_config` call further down the script.
@st.cache_resource(show_spinner=False)
def _load_tokenizer_and_model():
    """Load the tokenizer and the classifier once and reuse them across reruns.

    Without caching, every widget interaction would re-run both
    `from_pretrained` calls.

    Returns:
        A `(tokenizer, model)` tuple.
    """
    loaded_tokenizer = DistilBertTokenizerFast.from_pretrained(
        "distilbert-base-uncased", cache_dir=HF_CACHE_DIR
    )
    loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
        "sundaram07/distilbert-sentence-classifier", cache_dir=HF_CACHE_DIR
    )
    return loaded_tokenizer, loaded_model


# Module-level names used by the prediction helpers below.
tokenizer, model = _load_tokenizer_and_model()
# Predict the AI probability for a single sentence
def predict_sentence_ai_probability(sentence):
    """Score one sentence with the module-level tokenizer and model.

    The sentence is tokenized into TensorFlow tensors (with truncation and
    padding enabled), passed through the model, and the sigmoid of the first
    logit of the first batch row is returned after conversion with `.numpy()`.

    NOTE(review): which class that first logit corresponds to is not visible
    here -- confirm against the model before interpreting the score.
    """
    encoded = tokenizer(
        sentence,
        return_tensors="tf",
        truncation=True,
        padding=True,
    )
    squashed = tf.sigmoid(model(encoded).logits)
    return squashed[0][0].numpy()
# Analyze all sentences
def predict_ai_generated_percentage(text, threshold=0.15):
    """Split `text` into sentences, score each one, and summarize the result.

    Args:
        text: Raw input text; surrounding whitespace is stripped before it is
            split with `sent_tokenize`.
        threshold: A sentence is flagged when its score is less than or equal
            to this value.

    Returns:
        A `(percentage, results)` tuple. `percentage` is the share of flagged
        sentences scaled to 0-100 (0.0 when there are no sentences), and
        `results` is a list of `(sentence, score, is_flagged)` tuples in
        sentence order.

    NOTE(review): low scores are the ones counted as AI here -- confirm this
    matches the orientation of the score returned by
    `predict_sentence_ai_probability`.
    """
    results = []
    for sentence in sent_tokenize(text.strip()):
        score = predict_sentence_ai_probability(sentence)
        results.append((sentence, score, score <= threshold))

    # No sentences at all: avoid dividing by zero.
    if not results:
        return 0.0, results

    flagged = sum(1 for _, _, is_ai in results if is_ai)
    return (flagged / len(results)) * 100, results
# Streamlit web app
# Page chrome: configuration, title and a short description.
st.set_page_config(page_title="AI Detector", layout="wide")
st.title("🧠 AI Content Detector")
st.markdown("This app detects the percentage of **AI-generated content** based on sentence-level analysis using DistilBERT.")

# Seed the session state on the first run so the display section below can
# rely on these keys existing.
if "last_input" not in st.session_state:
    st.session_state.last_input = ""
    st.session_state.results = None
    st.session_state.percentage = None

# User input area.
user_input = st.text_area("📝 Paste your text below to check for AI-generated sentences:", height=300)

# Analyze button.
if st.button("🔍 Analyze"):
    if not user_input.strip():
        st.warning("⚠️ Please enter some text to analyze.")
    elif (
        user_input != st.session_state.last_input
        or st.session_state.results is None
    ):
        # Only run the model when the text differs from the last analyzed
        # input (or nothing has been analyzed yet); pressing the button again
        # on unchanged text reuses the stored results.
        ai_percentage, analysis_results = predict_ai_generated_percentage(user_input)
        # Record the input only after the analysis succeeded, so a failed run
        # is not mistaken for a completed one on the next click.
        st.session_state.last_input = user_input
        st.session_state.results = analysis_results
        st.session_state.percentage = ai_percentage

# Display only if results are present (they persist across reruns through
# the session state).
if st.session_state.results is not None:
    st.subheader("🔎 Sentence-level Analysis")
    for i, (sentence, _prob, is_ai) in enumerate(st.session_state.results, start=1):
        label = "🔴 AI" if is_ai else "🟢 Human"
        st.markdown(f"**{i}.** _{sentence}_\n\n → {label}")
    st.subheader("📊 Final Result")
    st.success(f"Estimated **AI-generated content**: **{st.session_state.percentage:.2f}%**")
|