vatsaljain19 committed
Commit d722032 · verified · 1 Parent(s): 407d7a9

Upload 4 files

Files changed (4)
  1. src/app.py +101 -0
  2. src/main.py +3 -0
  3. src/rag_youtube_bot.py +80 -0
  4. src/requirements.txt +10 -0
src/app.py ADDED
@@ -0,0 +1,101 @@
+ import streamlit as st
+ from rag_youtube_bot import get_transcript, build_rag_chain, ask_question
+
+
+ st.set_page_config(
+     page_title="🎥 YouTube RAG Chatbot",
+     page_icon="🤖",
+     layout="wide"
+ )
+
+
+ st.markdown("""
+ <style>
+ .main {
+     background-color: #0e1117;
+     color: white;
+ }
+ .stTextInput>div>div>input {
+     background-color: #1c1f26;
+     color: white;
+ }
+ .stButton>button {
+     background-color: #ff4b4b;
+     color: white;
+     border-radius: 10px;
+ }
+ .stTextArea textarea {
+     background-color: #1c1f26;
+     color: white;
+ }
+ </style>
+ """, unsafe_allow_html=True)
+
+
+ st.title("🎥 YouTube RAG Chatbot")
+ st.markdown("##### 🤖 Ask anything about a YouTube video — powered by LangChain + DeepSeek")
+
+
+ if "rag_chain" not in st.session_state:
+     st.session_state.rag_chain = None
+ if "transcript" not in st.session_state:
+     st.session_state.transcript = None
+ if "chat_history" not in st.session_state:
+     st.session_state.chat_history = []  # stores (question, answer) pairs
+
+
+ st.sidebar.header("🧠 About This App")
+ st.sidebar.write("""
+ This app uses:
+ - **LangChain + DeepSeek (HuggingFace)**
+ - **RAG (Retrieval-Augmented Generation)**
+ - **YouTube transcripts as knowledge base**
+ """)
+
+ if st.sidebar.button("🔁 Clear Session"):
+     st.session_state.clear()
+     st.rerun()
+
+ yt_link = st.text_input("🎬 Enter a YouTube video link:")
+
+ col1, col2 = st.columns([1, 3])
+
+ with col1:
+     fetch_btn = st.button("📜 Fetch Transcript")
+
+ with col2:
+     st.markdown("")
+
+ if fetch_btn and yt_link:
+     with st.spinner("Fetching transcript..."):
+         transcript = get_transcript(yt_link)
+     if transcript:
+         st.session_state.transcript = transcript
+         st.success("✅ Transcript fetched successfully!")
+         with st.spinner("Building RAG model..."):
+             st.session_state.rag_chain = build_rag_chain(transcript)
+         st.success("RAG model ready! Ask your questions below 👇")
+     else:
+         st.error("❌ Transcript not available for this video.")
+
+ if st.session_state.rag_chain:
+     st.subheader("💬 Chat with the Video")
+
+     for q, a in st.session_state.chat_history:
+         with st.chat_message("user"):
+             st.markdown(f"**You:** {q}")
+         with st.chat_message("assistant"):
+             st.markdown(f"**Bot:** {a}")
+
+     user_input = st.chat_input("Ask your question here...")
+     if user_input:
+         with st.chat_message("user"):
+             st.markdown(f"**You:** {user_input}")
+
+         with st.spinner("Thinking... 🤔"):
+             answer = ask_question(st.session_state.rag_chain, user_input)
+
+         with st.chat_message("assistant"):
+             st.markdown(f"**Bot:** {answer}")
+
+         st.session_state.chat_history.append((user_input, answer))
src/main.py ADDED
@@ -0,0 +1,3 @@
+ import langchain
+
+ print(langchain.__version__)
src/rag_youtube_bot.py ADDED
@@ -0,0 +1,80 @@
+ from operator import itemgetter
+ from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace, HuggingFaceEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import Chroma
+ from langchain.prompts import PromptTemplate
+ from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
+ from langchain.schema.runnable import RunnableParallel, RunnableLambda
+ from langchain_core.output_parsers import StrOutputParser
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+
+ def get_transcript(video_url: str):
+     """Fetch transcript text from a YouTube video."""
+     if "youtu.be" in video_url:
+         video_id = video_url.split("/")[-1].split("?")[0]
+     else:
+         video_id = video_url.split("v=")[-1].split("&")[0]
+
+     try:
+         ytt_api = YouTubeTranscriptApi()
+         transcript_list = ytt_api.fetch(video_id, languages=['en'])
+         transcript = " ".join(chunk.text for chunk in transcript_list)
+         return transcript
+     except TranscriptsDisabled:
+         return None
+
+
+ def build_rag_chain(transcript: str):
+     """Create a RAG chain from transcript text."""
+     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+     chunks = splitter.create_documents([transcript])
+
+     embedding = HuggingFaceEmbeddings(
+         model_name='sentence-transformers/all-MiniLM-L6-v2'
+     )
+     vector_store = Chroma.from_documents(chunks, embedding)
+
+     retriever = vector_store.as_retriever(
+         search_type="similarity",
+         search_kwargs={"k": 3}
+     )
+
+     llm = HuggingFaceEndpoint(
+         repo_id="deepseek-ai/DeepSeek-V3.2-Exp",
+         task="text-generation"
+     )
+     model = ChatHuggingFace(llm=llm, temperature=0.2)
+
+     prompt = PromptTemplate(
+         template="""
+         You are a helpful assistant.
+         Answer ONLY from the provided transcript context.
+         If the context is insufficient, just say you don't know.
+
+         {context}
+         Question: {question}
+         """,
+         input_variables=['context', 'question']
+     )
+
+     def format_docs(retrieved_docs):
+         if not retrieved_docs:
+             return "No relevant transcript context found."
+         return " ".join(doc.page_content for doc in retrieved_docs)
+
+     parallel_chain = RunnableParallel({
+         'context': itemgetter("question") | retriever | RunnableLambda(format_docs),
+         'question': itemgetter("question")
+     })
+
+     parser = StrOutputParser()
+     chain = parallel_chain | prompt | model | parser
+     return chain
+
+
+ def ask_question(chain, question: str):
+     """Ask a question using the provided chain."""
+     return chain.invoke({'question': question})
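
For orientation (not part of the commit): a minimal sketch of how these three functions could be exercised from a plain script, assuming a `.env` file with a valid Hugging Face API token and a hypothetical video URL with English captions.

```python
# Minimal usage sketch (illustration only, not part of this commit).
# Assumes: a .env file with a valid Hugging Face API token, and a
# hypothetical YouTube URL whose video has English captions enabled.
from rag_youtube_bot import get_transcript, build_rag_chain, ask_question

video_url = "https://www.youtube.com/watch?v=EXAMPLE_ID"  # placeholder URL
transcript = get_transcript(video_url)

if transcript is None:
    print("No transcript available (captions disabled or missing).")
else:
    chain = build_rag_chain(transcript)  # chunks, embeds, and indexes the transcript
    print(ask_question(chain, "What is this video about?"))
```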
src/requirements.txt ADDED
@@ -0,0 +1,10 @@
+ streamlit
+ langchain
+ langchain-community
+ langchain-core
+ langchain-huggingface
+ huggingface_hub
+ youtube-transcript-api
+ sentence-transformers
+ chromadb
+ python-dotenv
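
Running locally (a usage note, not specified in the commit): the usual workflow for a layout like this would be `pip install -r src/requirements.txt` followed by `streamlit run src/app.py`, with a Hugging Face API token (typically `HUGGINGFACEHUB_API_TOKEN` or `HF_TOKEN`) placed in a `.env` file so that `load_dotenv()` in `rag_youtube_bot.py` can pick it up.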