Spaces: Running on Zero

Upload 8 files

- clip_analyzer.py +24 -27
- clip_model_manager.py +15 -18
- clip_prompts.py +128 -5
- clip_zero_shot_classifier.py +4 -4
- llm_enhancer.py +2 -2
- llm_model_manager.py +358 -0
- requirements.txt +2 -2
- scene_scoring_engine.py +9 -8
clip_analyzer.py
CHANGED

@@ -1,5 +1,5 @@
 import torch
-import clip
+import open_clip
 import numpy as np
 from PIL import Image
 from typing import Dict, List, Tuple, Any, Optional, Union

@@ -20,13 +20,14 @@ class CLIPAnalyzer:
     Use CLIP to integrate scene-understanding functionality
     """

-    def __init__(self, model_name: str = "ViT-B/16", device: str = None):
+    def __init__(self, model_name: str = "ViT-B-16", device: str = None, pretrained: str = "laion2b_s34b_b88k"):
         """
-        Initialize the CLIP analyzer
+        Initialize the CLIP analyzer, using the OpenCLIP implementation

         Args:
-            model_name: CLIP model name
-            device: runtime device
+            model_name: OpenCLIP model name, defaults to "ViT-B-16"
+            device: runtime device
+            pretrained: pretrained weights, defaults to "laion2b_s34b_b88k"
         """
         # Automatically select the device
         if device is None:

@@ -34,12 +35,17 @@ class CLIPAnalyzer:
         else:
             self.device = device

-        print(f"Loading CLIP model {model_name} on {self.device}...")
+        print(f"Loading OpenCLIP model {model_name} with {pretrained} on {self.device}...")
         try:
-            self.model, self.preprocess = clip.load(
-                model_name, device=self.device)
+            self.model, _, self.preprocess = open_clip.create_model_and_transforms(
+                model_name,
+                pretrained=pretrained,
+                device=self.device
+            )
+            self.tokenizer = open_clip.get_tokenizer(model_name)
+            print(f"OpenCLIP model loaded successfully.")
         except Exception as e:
-            print(f"Error loading CLIP model: {e}")
+            print(f"Error loading OpenCLIP model: {e}")
             raise

         self.scene_type_prompts = SCENE_TYPE_PROMPTS

@@ -64,7 +70,7 @@ class CLIPAnalyzer:
         if scene_texts:
             self.text_features_cache["scene_type_keys"] = list(self.scene_type_prompts.keys())
             try:
-                self.text_features_cache["scene_type_tokens"] = clip.tokenize(scene_texts).to(self.device)
+                self.text_features_cache["scene_type_tokens"] = self.tokenizer(scene_texts).to(self.device)
             except Exception as e:
                 print(f"Warning: Error tokenizing scene_type_prompts: {e}")
                 self.text_features_cache["scene_type_tokens"] = None  # mark as error or empty

@@ -82,7 +88,7 @@ class CLIPAnalyzer:
         for scene_type, prompts in self.cultural_scene_prompts.items():
             if prompts and isinstance(prompts, list) and all(isinstance(p, str) for p in prompts):
                 try:
-                    cultural_tokens_dict_val[scene_type] = clip.tokenize(prompts).to(self.device)
+                    cultural_tokens_dict_val[scene_type] = self.tokenizer(prompts).to(self.device)
                 except Exception as e:
                     print(f"Warning: Error tokenizing cultural_scene_prompts for {scene_type}: {e}")
                     cultural_tokens_dict_val[scene_type] = None  # mark as error or empty

@@ -96,7 +102,7 @@ class CLIPAnalyzer:
         if lighting_texts:
             self.text_features_cache["lighting_condition_keys"] = list(self.lighting_condition_prompts.keys())
             try:
-                self.text_features_cache["lighting_tokens"] = clip.tokenize(lighting_texts).to(self.device)
+                self.text_features_cache["lighting_tokens"] = self.tokenizer(lighting_texts).to(self.device)
             except Exception as e:
                 print(f"Warning: Error tokenizing lighting_condition_prompts: {e}")
                 self.text_features_cache["lighting_tokens"] = None

@@ -113,7 +119,7 @@ class CLIPAnalyzer:
         for scene_type, prompts in self.specialized_scene_prompts.items():
             if prompts and isinstance(prompts, list) and all(isinstance(p, str) for p in prompts):
                 try:
-                    specialized_tokens_dict_val[scene_type] = clip.tokenize(prompts).to(self.device)
+                    specialized_tokens_dict_val[scene_type] = self.tokenizer(prompts).to(self.device)
                 except Exception as e:
                     print(f"Warning: Error tokenizing specialized_scene_prompts for {scene_type}: {e}")
                     specialized_tokens_dict_val[scene_type] = None

@@ -127,7 +133,7 @@ class CLIPAnalyzer:
         if viewpoint_texts:
             self.text_features_cache["viewpoint_keys"] = list(self.viewpoint_prompts.keys())
             try:
-                self.text_features_cache["viewpoint_tokens"] = clip.tokenize(viewpoint_texts).to(self.device)
+                self.text_features_cache["viewpoint_tokens"] = self.tokenizer(viewpoint_texts).to(self.device)
             except Exception as e:
                 print(f"Warning: Error tokenizing viewpoint_prompts: {e}")
                 self.text_features_cache["viewpoint_tokens"] = None

@@ -144,7 +150,7 @@ class CLIPAnalyzer:
         if object_combination_texts:
             self.text_features_cache["object_combination_keys"] = list(self.object_combination_prompts.keys())
             try:
-                self.text_features_cache["object_combination_tokens"] = clip.tokenize(object_combination_texts).to(self.device)
+                self.text_features_cache["object_combination_tokens"] = self.tokenizer(object_combination_texts).to(self.device)
             except Exception as e:
                 print(f"Warning: Error tokenizing object_combination_prompts: {e}")
                 self.text_features_cache["object_combination_tokens"] = None

@@ -161,7 +167,7 @@ class CLIPAnalyzer:
         if activity_texts:
             self.text_features_cache["activity_keys"] = list(self.activity_prompts.keys())
             try:
-                self.text_features_cache["activity_tokens"] = clip.tokenize(activity_texts).to(self.device)
+                self.text_features_cache["activity_tokens"] = self.tokenizer(activity_texts).to(self.device)
             except Exception as e:
                 print(f"Warning: Error tokenizing activity_prompts: {e}")
                 self.text_features_cache["activity_tokens"] = None

@@ -180,7 +186,7 @@ class CLIPAnalyzer:
         self.cultural_tokens_dict = self.text_features_cache["cultural_tokens_dict"]
         self.specialized_tokens_dict = self.text_features_cache["specialized_tokens_dict"]

-        print("CLIP text_features_cache prepared.")
+        print("OpenCLIP text_features_cache prepared.")

     def analyze_image(self, image, include_cultural_analysis=True, exclude_categories=None, enable_landmark=True, places365_guidance=None):
         """

@@ -581,16 +587,7 @@ class CLIPAnalyzer:
         return image_features.cpu().numpy()[0] if self.device == "cuda" else image_features.numpy()[0]

     def text_to_embedding(self, text: str) -> np.ndarray:
-        """
-        Convert text into a CLIP embedding representation
-
-        Args:
-            text: input text
-
-        Returns:
-            np.ndarray: the CLIP feature vector for the text
-        """
-        text_token = clip.tokenize([text]).to(self.device)
+        text_token = self.tokenizer([text]).to(self.device)

         with torch.no_grad():
             text_features = self.model.encode_text(text_token)
clip_model_manager.py
CHANGED

@@ -1,6 +1,6 @@

 import torch
-import clip
+import open_clip
 import numpy as np
 import logging
 import traceback

@@ -12,7 +12,7 @@ class CLIPModelManager:
     Manages CLIP-model operations: model loading, device management, and the core image/text feature encoding
     """

-    def __init__(self, model_name: str = "ViT-B/16", device: str = None):
+    def __init__(self, model_name: str = "ViT-B-16", device: str = None, pretrained: str = "laion2b_s34b_b88k"):
         """
         Initialize the CLIP model manager

@@ -22,6 +22,8 @@ class CLIPModelManager:
         """
         self.logger = logging.getLogger(__name__)
         self.model_name = model_name
+        self.pretrained = pretrained
+        self.tokenizer = None

         # Set the runtime device
         if device is None:

@@ -29,19 +31,23 @@ class CLIPModelManager:
         else:
             self.device = device

-        self.model = None
         self.preprocess = None

         self._initialize_model()

     def _initialize_model(self):
         """
-        Initialize the CLIP model
+        Initialize the OpenCLIP model
         """
         try:
-            self.logger.info(f"Initializing CLIP model ({self.model_name}) on {self.device}")
-            self.model, self.preprocess = clip.load(
-                self.model_name, device=self.device)
+            self.logger.info(f"Initializing OpenCLIP model ({self.model_name}) with {self.pretrained} on {self.device}")
+            self.model, _, self.preprocess = open_clip.create_model_and_transforms(
+                self.model_name,
+                pretrained=self.pretrained,
+                device=self.device
+            )
+            self.tokenizer = open_clip.get_tokenizer(self.model_name)
+            self.logger.info("Successfully loaded OpenCLIP model")
         except Exception as e:
             self.logger.error(f"Error loading CLIP model: {e}")
             self.logger.error(traceback.format_exc())

@@ -87,7 +93,7 @@ class CLIPModelManager:

             for i in range(0, len(text_prompts), batch_size):
                 batch_prompts = text_prompts[i:i+batch_size]
-                text_tokens = clip.tokenize(batch_prompts).to(self.device)
+                text_tokens = self.tokenizer(batch_prompts).to(self.device)
                 batch_features = self.model.encode_text(text_tokens)
                 batch_features = batch_features / batch_features.norm(dim=-1, keepdim=True)
                 features_list.append(batch_features)

@@ -106,18 +112,9 @@ class CLIPModelManager:
             raise

     def encode_single_text(self, text_prompts: List[str]) -> torch.Tensor:
-        """
-        Encode the features of a single text batch
-
-        Args:
-            text_prompts: list of text prompts
-
-        Returns:
-            torch.Tensor: normalized text features
-        """
         try:
             with torch.no_grad():
-                text_tokens = clip.tokenize(text_prompts).to(self.device)
+                text_tokens = self.tokenizer(text_prompts).to(self.device)
                 text_features = self.model.encode_text(text_tokens)
                 text_features = text_features / text_features.norm(dim=-1, keepdim=True)
                 return text_features
clip_prompts.py
CHANGED

@@ -69,7 +69,49 @@ SCENE_TYPE_PROMPTS = {
     "construction_site": "A photo of a construction site with building materials, equipment and workers.",
     "medical_facility": "A photo of a medical facility with healthcare equipment and professional staff.",
     "educational_setting": "A photo of an educational setting with learning spaces and academic resources.",
-    "professional_kitchen": "A photo of a professional commercial kitchen with industrial cooking equipment and food preparation stations."
+    "professional_kitchen": "A photo of a professional commercial kitchen with industrial cooking equipment and food preparation stations.",
+
+    # Workspace variations
+    "modern_open_office": "A photo of a modern open office with collaborative workspaces, standing desks and contemporary furniture design.",
+    "traditional_cubicle_office": "A photo of a traditional office with individual cubicles, separated workstations and formal business environment.",
+    "home_office_study": "A photo of a home office or study room with personal workspace setup and residential comfort elements.",
+    "creative_workspace": "A photo of a creative workspace with design materials, artistic tools and inspiring work environment.",
+    "shared_workspace_hub": "A photo of a shared coworking space with flexible seating, community areas and collaborative atmosphere.",
+
+    # Context-specific dining spaces
+    "casual_family_dining": "A photo of a casual family dining area with comfortable seating and everyday meal setup.",
+    "formal_dining_room": "A photo of a formal dining room with elegant table setting and sophisticated dining arrangement.",
+    "breakfast_nook_area": "A photo of a cozy breakfast nook with intimate seating and morning meal atmosphere.",
+    "outdoor_patio_dining": "A photo of an outdoor patio dining area with weather-resistant furniture and al fresco dining setup.",
+    "kitchen_island_dining": "A photo of a kitchen island used for casual dining with bar-style seating and integrated cooking space.",
+
+    # Living spaces by usage context
+    "family_entertainment_room": "A photo of a family room focused on entertainment with large TV, comfortable seating and recreational atmosphere.",
+    "reading_lounge_area": "A photo of a quiet reading area with comfortable chairs, good lighting and book storage.",
+    "social_gathering_space": "A photo of a living area arranged for social gatherings with multiple seating options and conversation-friendly layout.",
+    "relaxation_living_space": "A photo of a living room designed for relaxation with soft furnishings and calm atmosphere.",
+
+    # Service-oriented commercial spaces
+    "quick_service_restaurant": "A photo of a quick service restaurant with efficient ordering system and fast-casual dining setup.",
+    "coffee_shop_workspace": "A photo of a coffee shop that doubles as workspace with WiFi-friendly seating and laptop users.",
+    "boutique_retail_space": "A photo of a boutique retail store with curated merchandise display and personalized shopping experience.",
+    "convenience_store_market": "A photo of a convenience store with everyday items, quick shopping layout and accessible product arrangement.",
+
+    # Specialized learning environments
+    "collaborative_classroom": "A photo of a modern classroom designed for group work with flexible seating and interactive learning setup.",
+    "lecture_hall_setting": "A photo of a traditional lecture hall with tiered seating and formal educational presentation setup.",
+    "study_hall_library": "A photo of a quiet study area in a library with individual study spaces and academic atmosphere.",
+    "computer_lab_classroom": "A photo of a computer lab or digital classroom with technology workstations and learning equipment.",
+
+    # Time-of-day cues
+    "morning_routine_kitchen": "A photo of a kitchen during morning routine with breakfast preparation and daily startup activities.",
+    "evening_relaxation_living": "A photo of a living room in evening mode with dim lighting and relaxation activities.",
+    "weekend_leisure_space": "A photo of a living area during weekend with casual activities and relaxed atmosphere.",
+
+    # Activity-intensity descriptions
+    "busy_work_environment": "A photo of an active workplace with multiple people engaged in work tasks and productive atmosphere.",
+    "quiet_study_atmosphere": "A photo of a peaceful study or work environment with focused activity and minimal distractions.",
+    "social_interaction_space": "A photo of a space designed for social interaction with multiple people engaging in conversation."
 }

 # Culture-specific scene prompts

@@ -151,6 +193,30 @@ COMPARATIVE_PROMPTS = {
         "A street-level view showing pedestrian perspective and immediate surroundings.",
         "A bird's-eye view of city organization and movement patterns from high above.",
         "An eye-level perspective showing direct human interaction with urban elements."
+    ],
+    "modern_vs_traditional_kitchen": [
+        "A modern kitchen with sleek stainless steel appliances, minimalist design and contemporary fixtures.",
+        "A traditional kitchen with classic wooden cabinets, vintage appliances and conventional design elements."
+    ],
+
+    "business_vs_leisure_dining": [
+        "A business dining environment with professional atmosphere, formal table settings and corporate meeting setup.",
+        "A leisure dining space with relaxed atmosphere, casual seating and recreational meal environment."
+    ],
+
+    "dense_vs_spacious_retail": [
+        "A densely packed retail space with closely arranged merchandise and compact shopping aisles.",
+        "A spacious retail environment with wide aisles, generous display spacing and open shopping layout."
+    ],
+
+    "private_vs_shared_workspace": [
+        "A private office space with individual workstation, personal storage and isolated work environment.",
+        "A shared workspace with communal tables, collaborative areas and open interaction zones."
+    ],
+
+    "functional_vs_aesthetic_space": [
+        "A purely functional workspace focused on efficiency with practical furniture and utilitarian design.",
+        "An aesthetically designed space emphasizing visual appeal with decorative elements and stylistic choices."
     ]
 }

@@ -170,7 +236,16 @@ LIGHTING_CONDITION_PROMPTS = {
     "mixed_lighting": "A scene with combined natural and artificial light sources creating transition zones.",
     "beach_daylight": "A photo taken at a beach with bright natural sunlight and reflections from water.",
     "sports_arena_lighting": "A photo of a sports venue illuminated by powerful overhead lighting systems.",
-    "kitchen_task_lighting": "A photo of a kitchen with focused lighting concentrated on work surfaces."
+    "kitchen_task_lighting": "A photo of a kitchen with focused lighting concentrated on work surfaces.",
+    "photography_studio_lighting": "A photo taken in a photography studio with controlled professional lighting and even illumination.",
+    "retail_display_lighting": "A photo taken in retail environment with strategic product lighting and commercial illumination design.",
+    "conference_room_lighting": "A photo taken in a conference room with balanced meeting lighting and presentation-friendly illumination.",
+    "golden_hour_outdoor": "A photo taken during golden hour with warm, low-angle sunlight creating dramatic shadows and highlights.",
+    "overcast_diffused_light": "A photo taken under overcast sky with soft, even diffused lighting and minimal shadows.",
+    "harsh_midday_sun": "A photo taken under intense midday sunlight with strong contrasts and sharp shadows.",
+    "office_mixed_lighting": "A photo taken in office environment combining natural window light with artificial ceiling illumination.",
+    "restaurant_ambient_lighting": "A photo taken in restaurant with carefully designed ambient lighting combining multiple warm light sources.",
+    "retail_accent_lighting": "A photo taken in retail space with accent lighting highlighting products against general ambient illumination."
 }

 # Special prompts for the new scene types

@@ -228,6 +303,29 @@ SPECIALIZED_SCENE_PROMPTS = {
         "A high-angle view of an intersection showing traffic and pedestrian flow patterns.",
         "A drone perspective of urban crossing design viewed from directly above.",
         "A vertical view of a street intersection showing crossing infrastructure."
+    ],
+    "medical_waiting_room": [
+        "A medical facility waiting area with comfortable seating, health information displays and patient-focused design.",
+        "A healthcare waiting space with sanitized surfaces, medical equipment visibility and clinical atmosphere.",
+        "A medical office reception area with appointment scheduling setup and healthcare service information."
+    ],
+
+    "science_laboratory": [
+        "A science laboratory with experimental equipment, safety features and research workstations.",
+        "A chemistry lab with fume hoods, lab benches and scientific instrument arrangements.",
+        "A biology laboratory with microscopes, specimen storage and life science research setup."
+    ],
+
+    "design_studio_workspace": [
+        "A design studio with creative tools, inspiration boards and artistic project development areas.",
+        "An architecture office with drafting tables, model displays and design development workspaces.",
+        "A graphic design workspace with computer workstations, color calibration tools and creative project areas."
+    ],
+
+    "maintenance_workshop": [
+        "A maintenance workshop with repair tools, work benches and technical service equipment.",
+        "A mechanical service area with diagnostic equipment, repair stations and automotive maintenance setup.",
+        "A technical workshop with specialized tools, parts storage and equipment maintenance facilities."
     ]
 }

@@ -239,7 +337,15 @@ VIEWPOINT_PROMPTS = {
     "bird_eye": "A photo taken from very high above showing a complete overhead perspective.",
     "street_level": "A photo taken from the perspective of someone standing on the street.",
     "interior": "A photo taken from inside a building showing the internal environment.",
-    "vehicular": "A photo taken from inside or mounted on a moving vehicle."
+    "vehicular": "A photo taken from inside or mounted on a moving vehicle.",
+
+    # More detailed viewpoints
+    "security_camera_angle": "A photo taken from fixed security camera position showing surveillance perspective of the area.",
+    "drone_inspection_view": "A photo taken from drone perspective for inspection purposes showing detailed overhead examination angle.",
+    "architectural_documentation_view": "A photo taken specifically for architectural documentation showing building features and structural details.",
+    "customer_entering_view": "A photo taken from the perspective of a customer or visitor entering the space for the first time.",
+    "worker_daily_perspective": "A photo taken from the viewpoint of someone who works in this environment on a daily basis.",
+    "maintenance_access_view": "A photo taken from the perspective needed for maintenance or service access to equipment and facilities."
 }

 OBJECT_COMBINATION_PROMPTS = {

@@ -250,7 +356,15 @@ OBJECT_COMBINATION_PROMPTS = {
     "retail_environment": "A scene with merchandise displays, shoppers, and store fixtures.",
     "crosswalk_scene": "A scene with street markings, pedestrians crossing, and traffic signals.",
     "cooking_area": "A scene with stoves, prep surfaces, cooking utensils, and food items.",
-    "recreational_space": "A scene with sports equipment, play areas, and activity participants."
+    "recreational_space": "A scene with sports equipment, play areas, and activity participants.",
+    "medical_examination_setup": "A scene with medical examination table, diagnostic equipment, and healthcare monitoring devices.",
+    "laboratory_research_arrangement": "A scene with scientific instruments, sample containers, and research documentation materials.",
+    "technical_repair_station": "A scene with diagnostic tools, replacement parts, and mechanical repair equipment.",
+    "art_creation_workspace": "A scene with artistic supplies, canvases, and creative project materials arranged for art making.",
+    "music_practice_setup": "A scene with musical instruments, sheet music, and sound equipment for music practice.",
+    "craft_workshop_arrangement": "A scene with crafting tools, materials, and project supplies organized for handmade creation.",
+    "language_learning_environment": "A scene with language learning materials, reference books, and communication practice tools.",
+    "science_experiment_setup": "A scene with scientific apparatus, measurement tools, and experimental materials for hands-on learning."
 }

 ACTIVITY_PROMPTS = {

@@ -261,5 +375,14 @@ ACTIVITY_PROMPTS = {
     "exercising": "People engaged in physical activities, using sports equipment, and training.",
     "cooking": "People preparing food, using kitchen equipment, and creating meals.",
     "crossing_street": "People walking across designated crosswalks and navigating intersections.",
-    "recreational_activity": "People engaged in leisure activities, games, and social recreation."
+    "recreational_activity": "People engaged in leisure activities, games, and social recreation.",
+    "consulting": "People engaged in professional consultation with documents, presentations, and advisory discussions.",
+    "training": "People participating in skill development training with instructional materials and practice exercises.",
+    "maintenance": "People performing maintenance tasks with technical equipment and repair procedures.",
+    "brainstorming": "People engaged in creative brainstorming with idea development tools and collaborative thinking.",
+    "designing": "People working on design projects with creative tools, sketches, and visual development materials.",
+    "prototyping": "People building and testing prototypes with development materials and experimental approaches.",
+    "researching": "People conducting research with reference materials, databases, and investigative methods.",
+    "experimenting": "People performing experiments with scientific equipment and systematic testing procedures.",
+    "practicing": "People engaged in skill practice with repetitive exercises and performance improvement activities."
 }
clip_zero_shot_classifier.py
CHANGED

@@ -1,6 +1,6 @@

 import torch
-import clip
+import open_clip
 from PIL import Image
 import numpy as np
 import logging

@@ -21,18 +21,18 @@ class CLIPZeroShotClassifier:
     This is a facade class that coordinates the components to provide a unified external interface.
     """

-    def __init__(self, model_name: str = "ViT-B/16", device: str = None):
+    def __init__(self, model_name: str = "ViT-B-16", device: str = None, pretrained: str = "laion2b_s34b_b88k"):
         """
         Initialize the CLIP zero-shot classifier

         Args:
-            model_name: CLIP model name
+            model_name: OpenCLIP model name, defaults to "ViT-B-16"
             device: runtime device; None selects automatically
         """
         self.logger = logging.getLogger(__name__)

         # Initialize the component objects
-        self.clip_model_manager = CLIPModelManager(model_name, device)
+        self.clip_model_manager = CLIPModelManager(model_name, device, pretrained)
         self.landmark_data_manager = LandmarkDataManager()
         self.image_analyzer = ImageAnalyzer()
         self.confidence_manager = ConfidenceManager()
llm_enhancer.py
CHANGED

@@ -3,7 +3,7 @@
 import re
 from typing import Dict, List, Any, Optional

-from
+from llm_model_manager import LLMModelManager
 from prompt_template_manager import PromptTemplateManager
 from response_processor import ResponseProcessor
 from text_quality_validator import TextQualityValidator

@@ -44,7 +44,7 @@ class LLMEnhancer:

         try:
             # Initialize the four core components
-            self.model_manager =
+            self.model_manager = LLMModelManager(
                 model_path=model_path,
                 tokenizer_path=tokenizer_path,
                 device=device,
llm_model_manager.py
ADDED

@@ -0,0 +1,358 @@
+import os
+import torch
+import logging
+from typing import Dict, Optional, Any
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from huggingface_hub import login
+
+class ModelLoadingError(Exception):
+    """Custom exception for model loading failures"""
+    pass
+
+
+class ModelGenerationError(Exception):
+    """Custom exception for model generation failures"""
+    pass
+
+
+class LLMModelManager:
+    """
+    Handles LLM model loading, device management, and text generation.
+    Manages the model, memory optimization, and device configuration.
+    """
+
+    def __init__(self,
+                 model_path: Optional[str] = None,
+                 tokenizer_path: Optional[str] = None,
+                 device: Optional[str] = None,
+                 max_length: int = 2048,
+                 temperature: float = 0.3,
+                 top_p: float = 0.85):
+        """
+        Initialize the model manager
+
+        Args:
+            model_path: path or HuggingFace name of the LLM model; defaults to Llama 3.2
+            tokenizer_path: tokenizer path, usually the same as model_path
+            device: runtime device ('cpu' or 'cuda'); auto-detected when None
+            max_length: maximum length of the input text
+            temperature: temperature parameter for text generation
+            top_p: nucleus-sampling probability threshold for generation
+        """
+        # Set up a dedicated logger
+        self.logger = logging.getLogger(self.__class__.__name__)
+        if not self.logger.handlers:
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+            handler.setFormatter(formatter)
+            self.logger.addHandler(handler)
+            self.logger.setLevel(logging.INFO)
+
+        # Model configuration
+        self.model_path = model_path or "meta-llama/Llama-3.2-3B-Instruct"
+        self.tokenizer_path = tokenizer_path or self.model_path
+
+        # Device management
+        self.device = self._detect_device(device)
+        self.logger.info(f"Device selected: {self.device}")
+
+        # Generation parameters
+        self.max_length = max_length
+        self.temperature = temperature
+        self.top_p = top_p
+
+        # Model state
+        self.model = None
+        self.tokenizer = None
+        self._model_loaded = False
+        self.call_count = 0
+
+        # HuggingFace authentication
+        self.hf_token = self._setup_huggingface_auth()
+
+    def _detect_device(self, device: Optional[str]) -> str:
+        """
+        Detect and set the runtime device
+
+        Args:
+            device: user-specified device; auto-detected when None
+
+        Returns:
+            str: ('cuda' or 'cpu')
+        """
+        if device:
+            if device == 'cuda' and not torch.cuda.is_available():
+                self.logger.warning("CUDA requested but not available, falling back to CPU")
+                return 'cpu'
+            return device
+
+        detected_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+        if detected_device == 'cuda':
+            gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
+            self.logger.info(f"CUDA detected with {gpu_memory:.2f} GB GPU memory")
+
+        return detected_device
+
+    def _setup_huggingface_auth(self) -> Optional[str]:
+        """
+        Set up HuggingFace authentication
+
+        Returns:
+            Optional[str]: the HuggingFace token, if available
+        """
+        hf_token = os.environ.get("HF_TOKEN")
+
+        if hf_token:
+            try:
+                login(token=hf_token)
+                self.logger.info("Successfully authenticated with HuggingFace")
+                return hf_token
+            except Exception as e:
+                self.logger.error(f"HuggingFace authentication failed: {e}")
+                return None
+        else:
+            self.logger.warning("HF_TOKEN not found. Access to gated models may be limited")
+            return None
+
+    def _load_model(self):
+        """
+        Load the LLM model and tokenizer, using 8-bit quantization to save memory
+
+        Raises:
+            ModelLoadingError: when model loading fails
+        """
+        if self._model_loaded:
+            return
+
+        try:
+            self.logger.info(f"Loading model from {self.model_path} with 8-bit quantization")
+
+            # Clear GPU memory
+            self._clear_gpu_cache()
+
+            # Configure 8-bit quantization
+            quantization_config = BitsAndBytesConfig(
+                load_in_8bit=True,
+                llm_int8_enable_fp32_cpu_offload=True
+            )
+
+            # Load the tokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.tokenizer_path,
+                padding_side="left",
+                use_fast=False,
+                token=self.hf_token
+            )
+
+            # Set the special tokens
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
+            # Load the model
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_path,
+                quantization_config=quantization_config,
+                device_map="auto",
+                low_cpu_mem_usage=True,
+                token=self.hf_token
+            )
+
+            self._model_loaded = True
+            self.logger.info("Model loaded successfully")
+
+        except Exception as e:
+            error_msg = f"Failed to load model: {str(e)}"
+            self.logger.error(error_msg)
+            raise ModelLoadingError(error_msg) from e
+
+    def _clear_gpu_cache(self):
+        """Clear the GPU memory cache"""
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            self.logger.debug("GPU cache cleared")
+
+    def generate_response(self, prompt: str, **generation_kwargs) -> str:
+        """
+        Generate an LLM response
+
+        Args:
+            prompt: input prompt
+            **generation_kwargs: extra generation parameters that can override the defaults
+
+        Returns:
+            str: the generated response text
+
+        Raises:
+            ModelGenerationError: when generation fails
+        """
+        # Make sure the model is loaded
+        if not self._model_loaded:
+            self._load_model()
+
+        try:
+            self.call_count += 1
+            self.logger.info(f"Generating response (call #{self.call_count})")
+
+            # Clean up GPU memory
+            self._clear_gpu_cache()
+
+            # Fix the seed to improve consistency
+            torch.manual_seed(42)
+
+            # Prepare the input
+            inputs = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                truncation=True,
+                max_length=self.max_length
+            ).to(self.device)
+
+            # Prepare the generation parameters
+            generation_params = self._prepare_generation_params(**generation_kwargs)
+            generation_params.update({
+                "pad_token_id": self.tokenizer.eos_token_id,
+                "attention_mask": inputs.attention_mask,
+                "use_cache": True,
+            })
+
+            # Generate the response
+            with torch.no_grad():
+                outputs = self.model.generate(inputs.input_ids, **generation_params)
+
+            # Decode the response
+            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            response = self._extract_generated_response(full_response, prompt)
+
+            if not response or len(response.strip()) < 10:
+                raise ModelGenerationError("Generated response is too short or empty")
+
+            self.logger.info(f"Response generated successfully ({len(response)} characters)")
+            return response
+
+        except Exception as e:
+            error_msg = f"Text generation failed: {str(e)}"
+            self.logger.error(error_msg)
+            raise ModelGenerationError(error_msg) from e
+
+    def _prepare_generation_params(self, **kwargs) -> Dict[str, Any]:
+        """
+        Prepare generation parameters, with model-specific optimizations
+
+        Args:
+            **kwargs: user-supplied generation parameters
+
+        Returns:
+            Dict[str, Any]: the complete generation-parameter configuration
+        """
+        # Base parameters
+        params = {
+            "max_new_tokens": 120,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "do_sample": True,
+        }
+
+        # Special tuning for Llama models
+        if "llama" in self.model_path.lower():
+            params.update({
+                "max_new_tokens": 600,
+                "temperature": 0.35,  # keep the temperature low
+                "top_p": 0.75,
+                "repetition_penalty": 1.5,
+                "num_beams": 5,
+                "length_penalty": 1,
+                "no_repeat_ngram_size": 3
+            })
+        else:
+            params.update({
+                "max_new_tokens": 300,
+                "temperature": 0.6,
+                "top_p": 0.9,
+                "num_beams": 1,
+                "repetition_penalty": 1.05
+            })
+
+        # User parameters override the defaults
+        params.update(kwargs)
+
+        return params
+
+    def _extract_generated_response(self, full_response: str, prompt: str) -> str:
+        """
+        Extract the generated portion from the full response
+
+        Args:
+            full_response: the model's complete output
+            prompt: the original prompt
+
+        Returns:
+            str: the extracted generated response
+        """
+        # Look for the assistant tag
+        assistant_tag = "<|assistant|>"
+        if assistant_tag in full_response:
+            response = full_response.split(assistant_tag)[-1].strip()
+
+            # Check for an unclosed user tag
+            user_tag = "<|user|>"
+            if user_tag in response:
+                response = response.split(user_tag)[0].strip()
+
+            return response
+
+        # Strip the input prompt
+        if full_response.startswith(prompt):
+            return full_response[len(prompt):].strip()
+
+        return full_response.strip()
+
+    def reset_context(self):
+        """Reset the model context and clear the GPU cache"""
+        if self._model_loaded:
+            self._clear_gpu_cache()
+            self.logger.info("Model context reset")
+        else:
+            self.logger.info("Model not loaded, no context to reset")
+
+    def get_current_device(self) -> str:
+        """
+        Get the current runtime device
+
+        Returns:
+            str: current device name
+        """
+        return self.device
+
+    def is_model_loaded(self) -> bool:
+        """
+        Check whether the model is loaded
+
+        Returns:
+            bool: model loading state
+        """
+        return self._model_loaded
+
+    def get_call_count(self) -> int:
+        """
+        Get the number of model calls
+
+        Returns:
+            int: call count
+        """
+        return self.call_count
+
+    def get_model_info(self) -> Dict[str, Any]:
+        """
+        Get model information
+
+        Returns:
+            Dict[str, Any]: model path, device, loading state, and related fields
+        """
+        return {
+            "model_path": self.model_path,
+            "device": self.device,
+            "is_loaded": self._model_loaded,
+            "call_count": self.call_count,
+            "has_hf_token": self.hf_token is not None
+        }
requirements.txt
CHANGED

@@ -6,7 +6,7 @@ pillow>=9.4.0
 numpy>=1.23.5
 matplotlib>=3.7.0
 gradio>=3.32.0
-
+open-clip-torch>=2.20.0
 yt-dlp>=2023.3.4
 requests>=2.28.1
 transformers

@@ -14,4 +14,4 @@ accelerate
 bitsandbytes
 sentencepiece
 huggingface_hub>=0.19.0
-urllib3>=1.26.0
+urllib3>=1.26.0
scene_scoring_engine.py
CHANGED

@@ -249,13 +249,13 @@ class SceneScoringEngine:
         Returns:
             A (best scene type, confidence) tuple
         """
+        print(f"DEBUG: determine_scene_type input scores: {scene_scores}")
         if not scene_scores:
            return "unknown", 0.0

-        # Check whether the landmark-related scores reach the threshold; if so, return "tourist_landmark" directly.
+        # Check whether the landmark-related scores reach the threshold; if so, return "tourist_landmark" directly.
         # The scene-score dictionary is assumed to hold three keys, "tourist_landmark", "historical_monument",
         # and "natural_landmark", for the different landmark types. Sum them; if the total exceeds 0.3, treat the scene as a landmark.
-        # print(f"DEBUG: determine_scene_type input scores: {scene_scores}")
         landmark_score = (
             scene_scores.get("tourist_landmark", 0.0) +
             scene_scores.get("historical_monument", 0.0) +

@@ -268,7 +268,7 @@ class SceneScoringEngine:
         # Pick the scene with the highest score
         best_scene = max(scene_scores, key=scene_scores.get)
         best_score = scene_scores[best_scene]
-
+        print(f"DEBUG: determine_scene_type result: scene={best_scene}, score={best_score}")
         return best_scene, float(best_score)

     def fuse_scene_scores(self, yolo_scene_scores: Dict[str, float],

@@ -361,8 +361,9 @@ class SceneScoringEngine:
             current_yolo_weight = default_yolo_weight
             current_clip_weight = default_clip_weight
             current_places365_weight = default_places365_weight
-
-
+            print(f"DEBUG: Scene {scene_type} - yolo_score: {yolo_score}, clip_score: {clip_score}, places365_score: {places365_score}")
+            print(f"DEBUG: Scene {scene_type} - weights: yolo={current_yolo_weight:.3f}, clip={current_clip_weight:.3f}, places365={current_places365_weight:.3f}")
+

             scene_definition = self.scene_types.get(scene_type, {})

@@ -394,8 +395,8 @@ class SceneScoringEngine:
                                "professional_kitchen", "cafe", "library", "gym", "retail_store",
                                "supermarket", "classroom", "conference_room", "medical_facility",
                                "educational_setting", "dining_area"]):
-                current_yolo_weight = 0.
-                current_clip_weight = 0.
+                current_yolo_weight = 0.50
+                current_clip_weight = 0.25
                 current_places365_weight = 0.25

             # For certain common outdoor (non-landmark) scenes, objects still matter

@@ -491,7 +492,7 @@ class SceneScoringEngine:
             fused_scores[scene_type] = min(1.0, max(0.0, fused_score))

         return fused_scores
-
+        print(f"DEBUG: fuse_scene_scores final result: {fused_scores}")

     def update_enable_landmark_status(self, enable_landmark: bool):
         """
|