Spaces:

vatsaljain19
/

YoutubeRAG

Sleeping

App Files Files Community

vatsaljain19 committed on Oct 5

Commit

d722032

verified ·

1 Parent(s): 407d7a9

Upload 4 files

Browse files

Files changed (4) hide show

src/app.py +101 -0
src/main.py +3 -0
src/rag_youtube_bot.py +80 -0
src/requirements.txt +10 -0

src/app.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import streamlit as st
+from rag_youtube_bot import get_transcript, build_rag_chain, ask_question
+st.set_page_config(
+    page_title="🎥 YouTube RAG Chatbot",
+    page_icon="🤖",
+    layout="wide"
+)
+st.markdown("""
+    <style>
+        .main {
+            background-color: #0e1117;
+            color: white;
+        }
+        .stTextInput>div>div>input {
+            background-color: #1c1f26;
+            color: white;
+        }
+        .stButton>button {
+            background-color: #ff4b4b;
+            color: white;
+            border-radius: 10px;
+        }
+        .stTextArea textarea {
+            background-color: #1c1f26;
+            color: white;
+        }
+    </style>
+""", unsafe_allow_html=True)
+st.title("🎥 YouTube RAG Chatbot")
+st.markdown("##### 🤖 Ask anything about a YouTube video — powered by LangChain + DeepSeek")
+if "rag_chain" not in st.session_state:
+    st.session_state.rag_chain = None
+if "transcript" not in st.session_state:
+    st.session_state.transcript = None
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = []  # stores (question, answer) pairs
+st.sidebar.header("🧠 About This App")
+st.sidebar.write("""
+This app uses:
+- **LangChain + DeepSeek (HuggingFace)**
+- **RAG (Retrieval-Augmented Generation)**
+- **YouTube transcripts as knowledge base**
+""")
+if st.sidebar.button("🔁 Clear Session"):
+    st.session_state.clear()
+    st.rerun()
+yt_link = st.text_input("🎬 Enter a YouTube video link:")
+col1, col2 = st.columns([1, 3])
+with col1:
+    fetch_btn = st.button("📜 Fetch Transcript")
+with col2:
+    st.markdown("")
+if fetch_btn and yt_link:
+    with st.spinner("Fetching transcript..."):
+        transcript = get_transcript(yt_link)
+        if transcript:
+            st.session_state.transcript = transcript
+            st.success("✅ Transcript fetched successfully!")
+            with st.spinner("Building RAG model..."):
+                st.session_state.rag_chain = build_rag_chain(transcript)
+                st.success("RAG model ready! Ask your questions below 👇")
+        else:
+            st.error("❌ Transcript not available for this video.")
+if st.session_state.rag_chain:
+    st.subheader("💬 Chat with the Video")
+    for q, a in st.session_state.chat_history:
+        with st.chat_message("user"):
+            st.markdown(f"**You:** {q}")
+        with st.chat_message("assistant"):
+            st.markdown(f"**Bot:** {a}")
+    user_input = st.chat_input("Ask your question here...")
+    if user_input:
+        with st.chat_message("user"):
+            st.markdown(f"**You:** {user_input}")
+        with st.spinner("Thinking... 🤔"):
+            answer = ask_question(st.session_state.rag_chain, user_input)
+        with st.chat_message("assistant"):
+            st.markdown(f"**Bot:** {answer}")
+        st.session_state.chat_history.append((user_input, answer))

src/main.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ import langchain
2	+
3	+ print(langchain.__version__)  # show the installed langchain version

src/rag_youtube_bot.py ADDED Viewed

	@@ -0,0 +1,80 @@

+from operator import itemgetter
+from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace, HuggingFaceEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+from langchain.prompts import PromptTemplate
+from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
+from langchain.schema.runnable import RunnableParallel, RunnableLambda
+from langchain_core.output_parsers import StrOutputParser
+from dotenv import load_dotenv
+load_dotenv()
+def get_transcript(video_url: str):
+    """Fetch transcript text from a YouTube video."""
+    if "youtu.be" in video_url:
+        video_id = video_url.split("/")[-1].split("?")[0]
+    else:
+        video_id = video_url.split("v=")[-1].split("&")[0]
+    try:
+        ytt_api = YouTubeTranscriptApi()
+        transcript_list = ytt_api.fetch(video_id, languages=['en'])
+        transcript = " ".join(chunk.text for chunk in transcript_list)
+        return transcript
+    except TranscriptsDisabled:
+        return None
+def build_rag_chain(transcript: str):
+    """Create a RAG chain from transcript text."""
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    chunks = splitter.create_documents([transcript])
+    embedding = HuggingFaceEmbeddings(
+        model_name='sentence-transformers/all-MiniLM-L6-v2'
+)
+    vector_store = Chroma.from_documents(chunks, embedding)
+    retriever = vector_store.as_retriever(
+        search_type="similarity",
+        search_kwargs={"k": 3}
+)
+    llm = HuggingFaceEndpoint(
+        repo_id="deepseek-ai/DeepSeek-V3.2-Exp",
+        task="text-generation"
+)
+    model = ChatHuggingFace(llm=llm, temperature=0.2)
+    prompt = PromptTemplate(
+        template="""
+        You are a helpful assistant.
+        Answer ONLY from the provided transcript context.
+        If the context is insufficient, just say you don't know.
+        {context}
+        Question: {question}
+        """,
+        input_variables=['context', 'question']
+    )
+    def format_docs(retrieved_docs):
+        if not retrieved_docs:
+            return "No relevant transcript context found."
+        return " ".join(doc.page_content for doc in retrieved_docs)
+    parallel_chain = RunnableParallel({
+        'context': itemgetter("question") | retriever | RunnableLambda(format_docs),
+        'question': itemgetter("question")
+    })
+    parser = StrOutputParser()
+    chain = parallel_chain | prompt | model | parser
+    return chain
+def ask_question(chain, question: str):
+    """Ask a question using the provided chain."""
+    return chain.invoke({'question': question})

src/requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+streamlit
+langchain
+langchain-community
+langchain-core
+langchain-huggingface
+huggingface_hub
+youtube-transcript-api
+sentence-transformers
+chromadb
+python-dotenv