Spaces:
Running
Running
Update .gitignore and enhance README with data management instructions
Browse files- Added log files to .gitignore to prevent unnecessary tracking.
- Expanded README to include new commands for clearing data and running the application with fresh data.
- Clarified what data gets cleared during testing, improving usability for developers.
- .gitignore +3 -0
- README.md +33 -3
- run_app.py +88 -10
- src/core/config.py +1 -1
- src/parsers/gemini_flash_parser.py +2 -2
- src/parsers/mistral_ocr_parser.py +4 -0
- src/rag/chat_service.py +11 -10
- src/ui/ui.py +31 -9
.gitignore
CHANGED
@@ -100,3 +100,6 @@ app_backup.py
|
|
100 |
|
101 |
# Ignore data folder
|
102 |
/data/
|
|
|
|
|
|
|
|
100 |
|
101 |
# Ignore data folder
|
102 |
/data/
|
103 |
+
|
104 |
+
# Ignore logs
|
105 |
+
*.log
|
README.md
CHANGED
@@ -159,8 +159,38 @@ The application uses centralized configuration management. You can enhance funct
|
|
159 |
|
160 |
# For local development (faster startup)
|
161 |
python run_app.py
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
```
|
163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
### 🧪 **Development Features:**
|
165 |
- **Automatic Environment Setup**: Dependencies are checked and installed automatically
|
166 |
- **Configuration Validation**: Startup validation reports missing API keys and configuration issues
|
@@ -175,13 +205,13 @@ The application uses centralized configuration management. You can enhance funct
|
|
175 |
|
176 |
# Markit: Document to Markdown Converter
|
177 |
|
178 |
-
[](https://huggingface.co/spaces/Ansemin101/
|
179 |
|
180 |
**Author: Anse Min** | [GitHub](https://github.com/ansemin) | [LinkedIn](https://www.linkedin.com/in/ansemin/)
|
181 |
|
182 |
## Project Links
|
183 |
-
- **GitHub Repository**: [github.com/ansemin/
|
184 |
-
- **Hugging Face Space**: [huggingface.co/spaces/Ansemin101/
|
185 |
|
186 |
## Overview
|
187 |
Markit is a powerful tool that converts various document formats (PDF, DOCX, images, etc.) to Markdown format. It uses different parsing engines and OCR methods to extract text from documents and convert them to clean, readable Markdown formats.
|
|
|
159 |
|
160 |
# For local development (faster startup)
|
161 |
python run_app.py
|
162 |
+
|
163 |
+
# For testing with clean data (clears chat history and vector store)
|
164 |
+
python run_app.py --clear-data-and-run
|
165 |
+
|
166 |
+
# To only clear data without running the app
|
167 |
+
python run_app.py --clear-data
|
168 |
```
|
169 |
|
170 |
+
### 🧹 **Data Management for Testing:**
|
171 |
+
For local development and testing, you can easily clear all stored data:
|
172 |
+
|
173 |
+
```bash
|
174 |
+
# Clear all data and exit (useful for quick cleanup)
|
175 |
+
python run_app.py --clear-data
|
176 |
+
|
177 |
+
# Clear all data then run the app (useful for fresh testing)
|
178 |
+
python run_app.py --clear-data-and-run
|
179 |
+
|
180 |
+
# Show all available options
|
181 |
+
python run_app.py --help
|
182 |
+
```
|
183 |
+
|
184 |
+
**What gets cleared:**
|
185 |
+
- `data/chat_history/*` - All saved chat sessions
|
186 |
+
- `data/vector_store/*` - All document embeddings and vector database
|
187 |
+
|
188 |
+
This is particularly useful when:
|
189 |
+
- Testing new RAG features with fresh data
|
190 |
+
- Clearing old chat sessions and documents
|
191 |
+
- Resetting the system to a clean state
|
192 |
+
- Debugging document ingestion issues
|
193 |
+
|
194 |
### 🧪 **Development Features:**
|
195 |
- **Automatic Environment Setup**: Dependencies are checked and installed automatically
|
196 |
- **Configuration Validation**: Startup validation reports missing API keys and configuration issues
|
|
|
205 |
|
206 |
# Markit: Document to Markdown Converter
|
207 |
|
208 |
+
[](https://huggingface.co/spaces/Ansemin101/Markit_v2)
|
209 |
|
210 |
**Author: Anse Min** | [GitHub](https://github.com/ansemin) | [LinkedIn](https://www.linkedin.com/in/ansemin/)
|
211 |
|
212 |
## Project Links
|
213 |
+
- **GitHub Repository**: [github.com/ansemin/Markit_v2](https://github.com/ansemin/Markit_v2)
|
214 |
+
- **Hugging Face Space**: [huggingface.co/spaces/Ansemin101/Markit_v2](https://huggingface.co/spaces/Ansemin101/Markit_v2)
|
215 |
|
216 |
## Overview
|
217 |
Markit is a powerful tool that converts various document formats (PDF, DOCX, images, etc.) to Markdown format. It uses different parsing engines and OCR methods to extract text from documents and convert them to clean, readable Markdown formats.
|
run_app.py
CHANGED
@@ -5,21 +5,99 @@ Use this for local development when dependencies are already installed.
|
|
5 |
"""
|
6 |
import sys
|
7 |
import os
|
|
|
|
|
|
|
8 |
|
9 |
# Get the current directory and setup Python path
|
10 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
11 |
sys.path.append(current_dir)
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
if __name__ == "__main__":
|
25 |
-
|
|
|
5 |
"""
|
6 |
import sys
|
7 |
import os
|
8 |
+
import argparse
|
9 |
+
import shutil
|
10 |
+
from pathlib import Path
|
11 |
|
12 |
# Get the current directory and setup Python path
|
13 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
14 |
sys.path.append(current_dir)
|
15 |
|
16 |
+
def clear_data_directories(base_dir=None):
    """Delete everything inside ``data/chat_history`` and ``data/vector_store``.

    Args:
        base_dir: Root directory that contains the ``data`` folder. Defaults
            to the directory this script lives in (module-level ``current_dir``).

    Prints each removed entry and a final summary. A failure while clearing one
    directory is reported but does not abort cleanup of the other.
    """
    root = Path(base_dir) if base_dir is not None else Path(current_dir)
    data_dir = root / "data"

    directories_to_clear = [
        data_dir / "chat_history",
        data_dir / "vector_store",
    ]

    cleared_count = 0
    for directory in directories_to_clear:
        if not directory.exists():
            print(f"ℹ️ Directory doesn't exist: {directory}")
            continue
        try:
            # Count each entry as it is removed. (Previously the count was
            # taken with directory.glob("*") AFTER the directory had been
            # emptied, so the total was always 0 and the summary wrongly
            # reported "No data found to clear.")
            for item in directory.iterdir():
                if item.is_file():
                    item.unlink()
                    print(f"🗑️ Removed file: {item}")
                elif item.is_dir():
                    shutil.rmtree(item)
                    print(f"🗑️ Removed directory: {item}")
                cleared_count += 1
            print(f"✅ Cleared directory: {directory}")
        except Exception as e:
            print(f"❌ Error clearing {directory}: {e}")

    if cleared_count == 0:
        print("ℹ️ No data found to clear.")
    else:
        print(f"🎉 Successfully cleared {cleared_count} items from data directories!")
48 |
|
49 |
+
def main_with_args():
    """Parse command-line options, optionally wipe stored data, then launch the app."""
    parser = argparse.ArgumentParser(
        description="Markit v2 - Document to Markdown Converter with RAG Chat",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python run_app.py                      # Run the app normally
  python run_app.py --clear-data         # Clear all data and exit
  python run_app.py --clear-data-and-run # Clear data then run the app
        """
    )
    parser.add_argument(
        "--clear-data",
        action="store_true",
        help="Clear all data directories (chat_history, vector_store) and exit"
    )
    parser.add_argument(
        "--clear-data-and-run",
        action="store_true",
        help="Clear all data directories then run the app"
    )
    opts = parser.parse_args()

    # Either flag triggers a wipe of chat_history / vector_store first.
    if opts.clear_data or opts.clear_data_and_run:
        divider = "=" * 50
        print("🧹 Clearing data directories...")
        print(divider)
        clear_data_directories()
        print(divider)

        if opts.clear_data:
            # --clear-data is cleanup-only: stop before starting the UI.
            print("✅ Data clearing completed. Exiting.")
            return
        # Only --clear-data-and-run can reach here: continue into app startup.
        print("✅ Data clearing completed. Starting app...")
        print()

    # Best-effort .env loading; a missing python-dotenv package is not fatal.
    try:
        from dotenv import load_dotenv
        load_dotenv()
        print("Loaded environment variables from .env file")
    except ImportError:
        print("python-dotenv not installed, skipping .env file loading")

    # Deferred import so argument parsing / data clearing stay fast even when
    # the app's heavy dependencies are slow to import.
    from src.main import main
    main()
101 |
|
102 |
# Entry point: route through main_with_args so the --clear-data /
# --clear-data-and-run CLI flags are honored before the app starts.
if __name__ == "__main__":
    main_with_args()
|
src/core/config.py
CHANGED
@@ -104,7 +104,7 @@ class RAGConfig:
|
|
104 |
# LLM settings for RAG
|
105 |
rag_model: str = "gemini-2.5-flash"
|
106 |
rag_temperature: float = 0.1
|
107 |
-
rag_max_tokens: int =
|
108 |
|
109 |
def __post_init__(self):
|
110 |
"""Load RAG configuration from environment variables."""
|
|
|
104 |
# LLM settings for RAG
|
105 |
rag_model: str = "gemini-2.5-flash"
|
106 |
rag_temperature: float = 0.1
|
107 |
+
rag_max_tokens: int = 32768
|
108 |
|
109 |
def __post_init__(self):
|
110 |
"""Load RAG configuration from environment variables."""
|
src/parsers/gemini_flash_parser.py
CHANGED
@@ -93,10 +93,10 @@ class GeminiFlashParser(DocumentParser):
|
|
93 |
)
|
94 |
],
|
95 |
config={
|
96 |
-
"temperature":
|
97 |
"top_p": 0.95,
|
98 |
"top_k": 40,
|
99 |
-
"max_output_tokens":
|
100 |
}
|
101 |
)
|
102 |
|
|
|
93 |
)
|
94 |
],
|
95 |
config={
|
96 |
+
"temperature": config.model.temperature,
|
97 |
"top_p": 0.95,
|
98 |
"top_k": 40,
|
99 |
+
"max_output_tokens": config.model.max_tokens,
|
100 |
}
|
101 |
)
|
102 |
|
src/parsers/mistral_ocr_parser.py
CHANGED
@@ -256,6 +256,8 @@ class MistralOcrParser(DocumentParser):
|
|
256 |
# Send to chat completion API with document understanding prompt
|
257 |
chat_response = client.chat.complete(
|
258 |
model="mistral-large-latest",
|
|
|
|
|
259 |
messages=[
|
260 |
{
|
261 |
"role": "user",
|
@@ -290,6 +292,8 @@ class MistralOcrParser(DocumentParser):
|
|
290 |
# Use the chat API with the image for document understanding
|
291 |
chat_response = client.chat.complete(
|
292 |
model="mistral-large-latest",
|
|
|
|
|
293 |
messages=[
|
294 |
{
|
295 |
"role": "user",
|
|
|
256 |
# Send to chat completion API with document understanding prompt
|
257 |
chat_response = client.chat.complete(
|
258 |
model="mistral-large-latest",
|
259 |
+
max_tokens=config.model.max_tokens,
|
260 |
+
temperature=config.model.temperature,
|
261 |
messages=[
|
262 |
{
|
263 |
"role": "user",
|
|
|
292 |
# Use the chat API with the image for document understanding
|
293 |
chat_response = client.chat.complete(
|
294 |
model="mistral-large-latest",
|
295 |
+
max_tokens=config.model.max_tokens,
|
296 |
+
temperature=config.model.temperature,
|
297 |
messages=[
|
298 |
{
|
299 |
"role": "user",
|
src/rag/chat_service.py
CHANGED
@@ -119,8 +119,8 @@ class RAGChatService:
|
|
119 |
self._llm = ChatGoogleGenerativeAI(
|
120 |
model="gemini-2.5-flash", # Latest Gemini model
|
121 |
google_api_key=google_api_key,
|
122 |
-
temperature=
|
123 |
-
max_tokens=
|
124 |
disable_streaming=False # Enable streaming (new parameter name)
|
125 |
)
|
126 |
|
@@ -144,15 +144,16 @@ class RAGChatService:
|
|
144 |
|
145 |
# Create a prompt template for RAG
|
146 |
prompt_template = ChatPromptTemplate.from_template("""
|
147 |
-
You are a helpful assistant that
|
148 |
|
149 |
Instructions:
|
150 |
-
1. Use the context
|
151 |
-
2.
|
152 |
-
3.
|
153 |
-
4.
|
154 |
-
5.
|
155 |
-
6.
|
|
|
156 |
|
157 |
Context from documents:
|
158 |
{context}
|
@@ -160,7 +161,7 @@ Context from documents:
|
|
160 |
Chat History:
|
161 |
{chat_history}
|
162 |
|
163 |
-
User
|
164 |
""")
|
165 |
|
166 |
def format_docs(docs: List[Document]) -> str:
|
|
|
119 |
self._llm = ChatGoogleGenerativeAI(
|
120 |
model="gemini-2.5-flash", # Latest Gemini model
|
121 |
google_api_key=google_api_key,
|
122 |
+
temperature=config.rag.rag_temperature,
|
123 |
+
max_tokens=config.rag.rag_max_tokens,
|
124 |
disable_streaming=False # Enable streaming (new parameter name)
|
125 |
)
|
126 |
|
|
|
144 |
|
145 |
# Create a prompt template for RAG
|
146 |
prompt_template = ChatPromptTemplate.from_template("""
|
147 |
+
You are a helpful assistant that can chat naturally while specializing in answering questions about uploaded documents.
|
148 |
|
149 |
Instructions:
|
150 |
+
1. For document-related questions: Use the provided context to give comprehensive answers and always cite your sources
|
151 |
+
2. For conversational interactions (greetings, introductions, clarifications, follow-ups): Respond naturally and helpfully
|
152 |
+
3. For questions about topics not covered in the documents: Politely explain that you specialize in the uploaded documents but can still have a conversation
|
153 |
+
4. When using document information, always cite which parts of the documents you referenced
|
154 |
+
5. Include relevant tables and code blocks when they help answer the question
|
155 |
+
6. Be conversational, friendly, and helpful
|
156 |
+
7. Remember information shared in our conversation (like names, preferences, etc.)
|
157 |
|
158 |
Context from documents:
|
159 |
{context}
|
|
|
161 |
Chat History:
|
162 |
{chat_history}
|
163 |
|
164 |
+
User Message: {question}
|
165 |
""")
|
166 |
|
167 |
def format_docs(docs: List[Document]) -> str:
|
src/ui/ui.py
CHANGED
@@ -191,7 +191,7 @@ def handle_convert(file_path, parser_name, ocr_method_name, output_format, is_ca
|
|
191 |
def handle_chat_message(message, history):
|
192 |
"""Handle a new chat message with streaming response."""
|
193 |
if not message or not message.strip():
|
194 |
-
return "", history
|
195 |
|
196 |
try:
|
197 |
# Add user message to history
|
@@ -207,10 +207,16 @@ def handle_chat_message(message, history):
|
|
207 |
response_text += chunk
|
208 |
# Update the last message in history with the current response
|
209 |
history[-1]["content"] = response_text
|
210 |
-
|
|
|
|
|
211 |
|
212 |
logger.info(f"Chat response completed for message: {message[:50]}...")
|
213 |
|
|
|
|
|
|
|
|
|
214 |
except Exception as e:
|
215 |
error_msg = f"Error generating response: {str(e)}"
|
216 |
logger.error(error_msg)
|
@@ -221,7 +227,9 @@ def handle_chat_message(message, history):
|
|
221 |
{"role": "user", "content": message},
|
222 |
{"role": "assistant", "content": f"❌ {error_msg}"}
|
223 |
]
|
224 |
-
|
|
|
|
|
225 |
|
226 |
def start_new_chat_session():
|
227 |
"""Start a new chat session."""
|
@@ -455,20 +463,33 @@ def create_ui():
|
|
455 |
font-weight: 500;
|
456 |
flex: 1;
|
457 |
min-width: 200px;
|
|
|
|
|
|
|
|
|
|
|
458 |
}
|
459 |
|
460 |
.service-ready {
|
461 |
background: #d4edda;
|
462 |
-
color: #
|
463 |
border: 1px solid #c3e6cb;
|
464 |
}
|
465 |
|
|
|
|
|
|
|
|
|
466 |
.service-error {
|
467 |
background: #f8d7da;
|
468 |
-
color: #
|
469 |
border: 1px solid #f5c6cb;
|
470 |
}
|
471 |
|
|
|
|
|
|
|
|
|
472 |
.service-icon {
|
473 |
font-size: 1.2em;
|
474 |
}
|
@@ -826,25 +847,26 @@ def create_ui():
|
|
826 |
msg_input.submit(
|
827 |
fn=handle_chat_message,
|
828 |
inputs=[msg_input, chatbot],
|
829 |
-
outputs=[msg_input, chatbot]
|
830 |
)
|
831 |
|
832 |
send_btn.click(
|
833 |
fn=handle_chat_message,
|
834 |
inputs=[msg_input, chatbot],
|
835 |
-
outputs=[msg_input, chatbot]
|
836 |
)
|
837 |
|
838 |
# New session handler with improved feedback
|
839 |
def enhanced_new_session():
|
840 |
history, info = start_new_chat_session()
|
841 |
session_html = f'<div class="session-info">{info}</div>'
|
842 |
-
|
|
|
843 |
|
844 |
new_session_btn.click(
|
845 |
fn=enhanced_new_session,
|
846 |
inputs=[],
|
847 |
-
outputs=[chatbot, session_info]
|
848 |
)
|
849 |
|
850 |
# Refresh status handler
|
|
|
191 |
def handle_chat_message(message, history):
|
192 |
"""Handle a new chat message with streaming response."""
|
193 |
if not message or not message.strip():
|
194 |
+
return "", history, gr.update()
|
195 |
|
196 |
try:
|
197 |
# Add user message to history
|
|
|
207 |
response_text += chunk
|
208 |
# Update the last message in history with the current response
|
209 |
history[-1]["content"] = response_text
|
210 |
+
# Update status in real-time during streaming
|
211 |
+
updated_status = get_chat_status()
|
212 |
+
yield "", history, updated_status
|
213 |
|
214 |
logger.info(f"Chat response completed for message: {message[:50]}...")
|
215 |
|
216 |
+
# Final status update after message completion
|
217 |
+
final_status = get_chat_status()
|
218 |
+
yield "", history, final_status
|
219 |
+
|
220 |
except Exception as e:
|
221 |
error_msg = f"Error generating response: {str(e)}"
|
222 |
logger.error(error_msg)
|
|
|
227 |
{"role": "user", "content": message},
|
228 |
{"role": "assistant", "content": f"❌ {error_msg}"}
|
229 |
]
|
230 |
+
# Update status even on error
|
231 |
+
error_status = get_chat_status()
|
232 |
+
yield "", history, error_status
|
233 |
|
234 |
def start_new_chat_session():
|
235 |
"""Start a new chat session."""
|
|
|
463 |
font-weight: 500;
|
464 |
flex: 1;
|
465 |
min-width: 200px;
|
466 |
+
color: #2c3e50 !important;
|
467 |
+
}
|
468 |
+
|
469 |
+
.service-status span {
|
470 |
+
color: #2c3e50 !important;
|
471 |
}
|
472 |
|
473 |
.service-ready {
|
474 |
background: #d4edda;
|
475 |
+
color: #2c3e50 !important;
|
476 |
border: 1px solid #c3e6cb;
|
477 |
}
|
478 |
|
479 |
+
.service-ready span {
|
480 |
+
color: #2c3e50 !important;
|
481 |
+
}
|
482 |
+
|
483 |
.service-error {
|
484 |
background: #f8d7da;
|
485 |
+
color: #2c3e50 !important;
|
486 |
border: 1px solid #f5c6cb;
|
487 |
}
|
488 |
|
489 |
+
.service-error span {
|
490 |
+
color: #2c3e50 !important;
|
491 |
+
}
|
492 |
+
|
493 |
.service-icon {
|
494 |
font-size: 1.2em;
|
495 |
}
|
|
|
847 |
msg_input.submit(
|
848 |
fn=handle_chat_message,
|
849 |
inputs=[msg_input, chatbot],
|
850 |
+
outputs=[msg_input, chatbot, status_display]
|
851 |
)
|
852 |
|
853 |
send_btn.click(
|
854 |
fn=handle_chat_message,
|
855 |
inputs=[msg_input, chatbot],
|
856 |
+
outputs=[msg_input, chatbot, status_display]
|
857 |
)
|
858 |
|
859 |
# New session handler with improved feedback
|
860 |
def enhanced_new_session():
    # Reset the chat session, then refresh both the session banner and the
    # service-status panel so the UI reflects the clean state immediately.
    fresh_history, session_note = start_new_chat_session()
    status_markup = get_chat_status()
    return (
        fresh_history,
        f'<div class="session-info">{session_note}</div>',
        status_markup,
    )
865 |
|
866 |
new_session_btn.click(
|
867 |
fn=enhanced_new_session,
|
868 |
inputs=[],
|
869 |
+
outputs=[chatbot, session_info, status_display]
|
870 |
)
|
871 |
|
872 |
# Refresh status handler
|