File size: 3,180 Bytes
1c38e8c
1360051
 
 
4457e86
aab722c
a9d8b80
1360051
f973f9e
 
 
 
bcd23af
b68ab79
4457e86
1360051
f973f9e
5dcfd82
 
 
 
 
 
1360051
f973f9e
1360051
a9d8b80
 
 
f89536d
1360051
 
f973f9e
c0a3abb
a9d8b80
1360051
 
 
 
 
 
c0a3abb
1360051
 
 
 
 
 
 
 
f973f9e
 
1360051
f973f9e
1360051
f89536d
 
 
 
 
 
5dcfd82
f973f9e
1c38e8c
5dcfd82
bcd23af
f973f9e
bcd23af
1360051
f89536d
 
1360051
f89536d
 
f973f9e
f89536d
 
 
 
 
 
bcd23af
f89536d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import streamlit as st
import tensorflow as tf
import numpy as np
import nltk
import os
from nltk.tokenize import sent_tokenize
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification

# 📁 Use a cache directory that is safe to write to inside Hugging Face or Docker.
# NOTE(review): `transformers` is imported before this assignment, so the
# library may already have read TRANSFORMERS_CACHE; the explicit `cache_dir`
# arguments below are what reliably direct the downloads to this location.
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"

# 📥 Directory that receives the NLTK sentence-tokenizer data.
nltk_data_path = "/tmp/nltk_data"


# Streamlit re-executes this script from top to bottom on every user
# interaction. Without caching, each rerun would reload the model, re-run the
# NLTK download and append another duplicate entry to `nltk.data.path`.
# `show_spinner=False` keeps this call from emitting any UI element before
# `st.set_page_config` runs further down the script.
@st.cache_resource(show_spinner=False)
def _load_nlp_resources():
    """Download the NLTK data and load the tokenizer and classifier once.

    Returns:
        A ``(tokenizer, model)`` tuple: the ``distilbert-base-uncased`` fast
        tokenizer and the ``sundaram07/distilbert-sentence-classifier``
        TensorFlow sequence-classification model.
    """
    nltk.download("punkt_tab", download_dir=nltk_data_path)
    if nltk_data_path not in nltk.data.path:
        nltk.data.path.append(nltk_data_path)

    # 🔄 Load tokenizer and model from Hugging Face
    loaded_tokenizer = DistilBertTokenizerFast.from_pretrained(
        "distilbert-base-uncased", cache_dir="/tmp/huggingface"
    )
    loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
        "sundaram07/distilbert-sentence-classifier", cache_dir="/tmp/huggingface"
    )
    return loaded_tokenizer, loaded_model


tokenizer, model = _load_nlp_resources()

# 🔮 Predict AI probability for a sentence
def predict_sentence_ai_probability(sentence):
    """Score one sentence with the module-level DistilBERT classifier.

    The sentence is tokenized (with truncation enabled), passed through
    ``model``, and the first logit of the first row is mapped through a
    sigmoid.

    Args:
        sentence: Text of a single sentence.

    Returns:
        The sigmoid of ``logits[0][0]``, converted to a NumPy value.
    """
    encoded = tokenizer(sentence, return_tensors="tf", truncation=True, padding=True)
    scores = tf.sigmoid(model(encoded).logits)
    return scores[0][0].numpy()

# 📊 Analyze all sentences
def predict_ai_generated_percentage(text, threshold=0.15):
    """Classify every sentence of ``text`` and report the share flagged as AI.

    Args:
        text: Raw input text; surrounding whitespace is stripped before it is
            split into sentences with ``sent_tokenize``.
        threshold: A sentence is flagged as AI when its score is less than or
            equal to this value.
            NOTE(review): flagging on a score at or BELOW the threshold looks
            inverted next to the scorer's name -- confirm it matches the
            model's label convention.

    Returns:
        A tuple ``(ai_percentage, results)``. ``ai_percentage`` is the
        percentage (0-100) of sentences flagged as AI, or ``0.0`` when no
        sentences are found. ``results`` holds one
        ``(sentence, score, is_ai)`` tuple per sentence, in input order.
    """
    results = []
    for sentence in sent_tokenize(text.strip()):
        score = predict_sentence_ai_probability(sentence)
        results.append((sentence, score, score <= threshold))

    # Guard clause: no sentences means there is nothing to divide by.
    if not results:
        return 0.0, results

    flagged = sum(1 for _, _, is_ai in results if is_ai)
    return (flagged / len(results)) * 100, results

# 🌐 Streamlit Web App
st.set_page_config(page_title="AI Detector", layout="wide")
st.title("🧠 AI Content Detector")
st.markdown("This app detects the percentage of **AI-generated content** based on sentence-level analysis using DistilBERT.")

# Session state survives Streamlit's script reruns; seed it on first load so
# the checks below can rely on the keys existing.
if "last_input" not in st.session_state:
    st.session_state.last_input = ""
    st.session_state.results = None
    st.session_state.percentage = None

# 📋 User Input Area
user_input = st.text_area("📋 Paste your text below to check for AI-generated sentences:", height=300)

# 🔘 Analyze Button
if st.button("🔍 Analyze"):
    if not user_input.strip():
        st.warning("⚠️ Please enter some text to analyze.")
    elif user_input != st.session_state.last_input or st.session_state.results is None:
        # Only run inference when the text differs from the last analysed
        # input; clicking Analyze again on identical text reuses the stored
        # results instead of scoring every sentence a second time.
        ai_percentage, analysis_results = predict_ai_generated_percentage(user_input)
        # Record the input only after the analysis succeeded, so a failed run
        # is not mistaken for an already-analysed input.
        st.session_state.last_input = user_input
        st.session_state.results = analysis_results
        st.session_state.percentage = ai_percentage

# Display only if results are present (rendered from session state so they
# are drawn exactly once per rerun).
if st.session_state.results is not None:
    st.subheader("🔍 Sentence-level Analysis")
    for i, (sentence, _prob, is_ai) in enumerate(st.session_state.results, start=1):
        label = "🔴 AI" if is_ai else "🟢 Human"
        st.markdown(f"**{i}.** _{sentence}_\n\n → {label}")

    st.subheader("📊 Final Result")
    st.success(f"Estimated **AI-generated content**: **{st.session_state.percentage:.2f}%**")