"""Centralized configuration management for the Markit application."""
import os
import shutil
from dataclasses import dataclass
from typing import Any, Dict, Optional
@dataclass
class APIConfig:
    """Configuration for external API services.

    Keys are loaded from environment variables in ``__post_init__``;
    any constructor-supplied values are overwritten by the environment.
    """

    # API keys; None means the corresponding service is unavailable.
    google_api_key: Optional[str] = None
    openai_api_key: Optional[str] = None
    mistral_api_key: Optional[str] = None

    def __post_init__(self):
        """Load API keys from environment variables.

        NOTE: without the ``@dataclass`` decorator this hook would never
        run, so the keys would silently stay ``None``.
        """
        self.google_api_key = os.getenv("GOOGLE_API_KEY")
        self.openai_api_key = os.getenv("OPENAI_API_KEY")
        self.mistral_api_key = os.getenv("MISTRAL_API_KEY")
@dataclass
class OCRConfig:
    """Configuration for OCR-related settings.

    Paths are loaded from environment variables in ``__post_init__``.
    """

    # Explicit tesseract binary location; None means "use system default".
    tesseract_path: Optional[str] = None
    # Directory holding tesseract language data files.
    tessdata_path: Optional[str] = None
    # Default OCR language (tesseract three-letter code).
    default_language: str = "eng"

    def __post_init__(self):
        """Load OCR configuration from environment variables.

        ``default_language`` has no environment override and keeps its
        default unless set explicitly by the caller.
        """
        self.tesseract_path = os.getenv("TESSERACT_PATH")
        self.tessdata_path = os.getenv("TESSDATA_PATH", "./tessdata")
@dataclass
class ModelConfig:
    """Configuration for AI model settings.

    Each field can be overridden by an environment variable in
    ``__post_init__``; the dataclass default acts as the fallback.
    """

    gemini_model: str = "gemini-2.5-flash"
    mistral_model: str = "mistral-ocr-latest"
    got_ocr_model: str = "stepfun-ai/GOT-OCR2_0"
    temperature: float = 0.1
    max_tokens: int = 32768

    def __post_init__(self):
        """Load model configuration from environment variables."""
        self.gemini_model = os.getenv("GEMINI_MODEL", self.gemini_model)
        self.mistral_model = os.getenv("MISTRAL_MODEL", self.mistral_model)
        self.got_ocr_model = os.getenv("GOT_OCR_MODEL", self.got_ocr_model)
        # float()/int() accept the numeric default unchanged when the
        # environment variable is unset.
        self.temperature = float(os.getenv("MODEL_TEMPERATURE", self.temperature))
        self.max_tokens = int(os.getenv("MODEL_MAX_TOKENS", self.max_tokens))
@dataclass
class DoclingConfig:
    """Configuration for the Docling parser.

    Boolean flags are read from environment variables where only the
    exact string ``"true"`` (case-insensitive) enables a feature.
    """

    # Path to locally-cached Docling model artifacts; None = auto-download.
    artifacts_path: Optional[str] = None
    enable_remote_services: bool = False
    enable_tables: bool = True
    enable_code_enrichment: bool = False
    enable_formula_enrichment: bool = False
    enable_picture_classification: bool = False
    generate_picture_images: bool = False
    # CPU threads used by OCR (maps to OMP_NUM_THREADS).
    ocr_cpu_threads: int = 4

    def __post_init__(self):
        """Load Docling configuration from environment variables."""

        def _env_flag(name: str, default: str) -> bool:
            # Centralized "true"-string parsing keeps the flags consistent.
            return os.getenv(name, default).lower() == "true"

        self.artifacts_path = os.getenv("DOCLING_ARTIFACTS_PATH")
        self.enable_remote_services = _env_flag("DOCLING_ENABLE_REMOTE_SERVICES", "false")
        self.enable_tables = _env_flag("DOCLING_ENABLE_TABLES", "true")
        self.enable_code_enrichment = _env_flag("DOCLING_ENABLE_CODE_ENRICHMENT", "false")
        self.enable_formula_enrichment = _env_flag("DOCLING_ENABLE_FORMULA_ENRICHMENT", "false")
        self.enable_picture_classification = _env_flag("DOCLING_ENABLE_PICTURE_CLASSIFICATION", "false")
        self.generate_picture_images = _env_flag("DOCLING_GENERATE_PICTURE_IMAGES", "false")
        self.ocr_cpu_threads = int(os.getenv("OMP_NUM_THREADS", self.ocr_cpu_threads))
@dataclass
class RAGConfig:
    """Configuration for RAG (Retrieval-Augmented Generation) functionality."""

    # Vector store settings
    vector_store_path: str = "./data/vector_store"
    collection_name: str = "markit_documents"
    # Chat history settings
    chat_history_path: str = "./data/chat_history"
    # Embedding settings
    embedding_model: str = "models/text-embedding-004"
    embedding_chunk_size: int = 1000
    # Chunking settings
    chunk_size: int = 1000
    chunk_overlap: int = 200
    # Chat limits
    max_messages_per_session: int = 50
    max_messages_per_hour: int = 100
    # Retrieval settings
    retrieval_k: int = 4
    retrieval_score_threshold: float = 0.5
    # LLM settings for RAG
    rag_model: str = "gemini-2.5-flash"
    rag_temperature: float = 0.1
    rag_max_tokens: int = 32768

    def __post_init__(self):
        """Load RAG configuration from environment variables."""
        # On HF Spaces (SPACE_ID set) ./data may be read-only; fall back
        # to /tmp/data so the vector store and chat history stay writable.
        if os.getenv("SPACE_ID"):  # HF Spaces environment
            base_data_path = "/tmp/data" if not os.access("./data", os.W_OK) else "./data"
            self.vector_store_path = os.getenv("VECTOR_STORE_PATH", f"{base_data_path}/vector_store")
            self.chat_history_path = os.getenv("CHAT_HISTORY_PATH", f"{base_data_path}/chat_history")
        else:
            self.vector_store_path = os.getenv("VECTOR_STORE_PATH", self.vector_store_path)
            self.chat_history_path = os.getenv("CHAT_HISTORY_PATH", self.chat_history_path)
        self.collection_name = os.getenv("VECTOR_STORE_COLLECTION", self.collection_name)
        self.embedding_model = os.getenv("EMBEDDING_MODEL", self.embedding_model)
        self.embedding_chunk_size = int(os.getenv("EMBEDDING_CHUNK_SIZE", self.embedding_chunk_size))
        self.chunk_size = int(os.getenv("CHUNK_SIZE", self.chunk_size))
        self.chunk_overlap = int(os.getenv("CHUNK_OVERLAP", self.chunk_overlap))
        self.max_messages_per_session = int(os.getenv("MAX_MESSAGES_PER_SESSION", self.max_messages_per_session))
        self.max_messages_per_hour = int(os.getenv("MAX_MESSAGES_PER_HOUR", self.max_messages_per_hour))
        self.retrieval_k = int(os.getenv("RETRIEVAL_K", self.retrieval_k))
        self.retrieval_score_threshold = float(os.getenv("RETRIEVAL_SCORE_THRESHOLD", self.retrieval_score_threshold))
        self.rag_model = os.getenv("RAG_MODEL", self.rag_model)
        self.rag_temperature = float(os.getenv("RAG_TEMPERATURE", self.rag_temperature))
        self.rag_max_tokens = int(os.getenv("RAG_MAX_TOKENS", self.rag_max_tokens))
@dataclass
class AppConfig:
    """Main application configuration."""

    debug: bool = False
    max_file_size: int = 10 * 1024 * 1024  # 10MB per file
    allowed_extensions: tuple = (".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp", ".tex", ".xlsx", ".docx", ".pptx", ".html", ".xhtml", ".md", ".csv")
    temp_dir: str = "./temp"
    # Multi-document batch processing settings
    max_batch_files: int = 5
    max_batch_size: int = 20 * 1024 * 1024  # 20MB combined

    batch_processing_types: tuple = ("combined", "individual", "summary", "comparison")

    def __post_init__(self):
        """Load application configuration from environment variables."""
        # Only the exact string "true" (case-insensitive) enables debug.
        self.debug = os.getenv("DEBUG", "false").lower() == "true"
        self.max_file_size = int(os.getenv("MAX_FILE_SIZE", self.max_file_size))
        self.temp_dir = os.getenv("TEMP_DIR", self.temp_dir)
        # Batch processing overrides.
        self.max_batch_files = int(os.getenv("MAX_BATCH_FILES", self.max_batch_files))
        self.max_batch_size = int(os.getenv("MAX_BATCH_SIZE", self.max_batch_size))
class Config:
    """Top-level container aggregating all configuration sections."""

    def __init__(self):
        self.api = APIConfig()
        self.ocr = OCRConfig()
        self.model = ModelConfig()
        self.docling = DoclingConfig()
        self.app = AppConfig()
        self.rag = RAGConfig()

    def validate(self) -> Dict[str, Any]:
        """Validate configuration and return validation results.

        Returns:
            Dict with keys ``"valid"`` (bool), ``"warnings"`` (list of str)
            and ``"errors"`` (list of str). Missing optional services only
            produce warnings; unusable directories produce errors and set
            ``"valid"`` to False.
        """
        validation_results = {
            "valid": True,
            "warnings": [],
            "errors": []
        }
        # Check API keys (absence disables features but is not fatal).
        if not self.api.google_api_key:
            validation_results["warnings"].append("Google API key not found - Gemini parser will be unavailable")
        if not self.api.mistral_api_key:
            validation_results["warnings"].append("Mistral API key not found - Mistral parser will be unavailable")
        # RAG embeddings and chat both depend on the same Google key.
        if not self.api.google_api_key:
            validation_results["warnings"].append("Google API key not found - RAG embeddings will be unavailable")
            validation_results["warnings"].append("Google API key not found - RAG chat will be unavailable")
        # Check tesseract setup: search the actual PATH instead of a
        # hard-coded /usr/bin location so the warning matches its message.
        if not self.ocr.tesseract_path and shutil.which("tesseract") is None:
            validation_results["warnings"].append("Tesseract not found in system PATH - OCR functionality may be limited")
        # Check temp directory; broad except is deliberate — validate()
        # reports failures instead of raising.
        try:
            os.makedirs(self.app.temp_dir, exist_ok=True)
        except Exception as e:
            validation_results["errors"].append(f"Cannot create temp directory {self.app.temp_dir}: {e}")
            validation_results["valid"] = False
        # Check RAG directories.
        try:
            os.makedirs(self.rag.vector_store_path, exist_ok=True)
            os.makedirs(self.rag.chat_history_path, exist_ok=True)
        except Exception as e:
            validation_results["errors"].append(f"Cannot create RAG directories: {e}")
            validation_results["valid"] = False
        return validation_results

    def get_available_parsers(self) -> list:
        """Get list of available parsers based on current configuration."""
        available = ["markitdown"]  # Always available (needs no API key).
        if self.api.google_api_key:
            available.append("gemini_flash")
        if self.api.mistral_api_key:
            available.append("mistral_ocr")
        # GOT-OCR runs locally; available if we have GPU or can use ZeroGPU.
        available.append("got_ocr")
        # Docling is available only if the optional package is installed.
        try:
            import docling  # noqa: F401
            available.append("docling")
        except ImportError:
            pass
        return available
# Global configuration instance, created eagerly at import time and
# shared across the application.
config = Config()