Spaces:

DawnC
/

VisionScout

Running on Zero

App Files Files Community

VisionScout / image_analyzer.py

DawnC

Upload 59 files

e6a18b7 verified 6 months ago

raw

history blame

14.9 kB


	import numpy as np
	import logging
	import traceback
	from typing import List, Dict, Tuple, Optional, Union, Any
	from PIL import Image

	class ImageAnalyzer:
	    """
	    Image analysis and preprocessing helpers for landmark detection.

	    Covers image hashing (for caching), contrast enhancement, CLIP-based
	    viewpoint and architectural-feature scoring, and a multi-scale "pyramid"
	    landmark search.

	    Every public method is best-effort: failures are logged and a neutral
	    fallback value is returned instead of an exception being raised.
	    """

	    # Fraction of the original size removed per pyramid level (level 0 = full size).
	    _PYRAMID_SCALE_STEP = 0.2
	    # Aspect ratios tried at every pyramid level when the caller supplies none.
	    _DEFAULT_ASPECT_RATIOS = (1.0, 0.75, 1.5)

	    def __init__(self):
	        """Create the analyzer and attach a logger named after this module."""
	        self.logger = logging.getLogger(__name__)

	    @staticmethod
	    def _ensure_pil(image):
	        """Return ``image`` as a PIL image; numpy arrays are converted, anything else raises ValueError."""
	        if isinstance(image, Image.Image):
	            return image
	        if isinstance(image, np.ndarray):
	            return Image.fromarray(image)
	        raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

	    @staticmethod
	    def _no_landmark_result() -> Dict[str, Any]:
	        """Return a fresh 'nothing detected' result for the pyramid analysis."""
	        return {
	            "is_landmark": False,
	            "results": [],
	            "best_result": None
	        }

	    def _pyramid_sizes(self, width, height, levels, aspect_ratios):
	        """Yield ``(scale_factor, aspect_ratio, (new_width, new_height))`` for each usable pyramid step."""
	        for level in range(levels):
	            scale_factor = 1.0 - (level * self._PYRAMID_SCALE_STEP)
	            if scale_factor <= 0:
	                # Deeper levels would shrink the image to nothing (or flip the sign).
	                break
	            for aspect_ratio in aspect_ratios:
	                if aspect_ratio <= 0:
	                    # A non-positive ratio has no real square root; skip it.
	                    continue
	                # Width shrinks and height grows by sqrt(ratio), so the pixel area
	                # stays roughly constant; for a ratio of 1.0 this is plain scaling.
	                new_width = int(width * scale_factor * (1 / aspect_ratio) ** 0.5)
	                new_height = int(height * scale_factor * aspect_ratio ** 0.5)
	                # Never ask for a zero-sized resize on very small inputs.
	                yield scale_factor, aspect_ratio, (max(1, new_width), max(1, new_height))

	    def get_image_hash(self, image: Union["Image.Image", np.ndarray]) -> int:
	        """
	        Compute a cheap hash of an image, intended as a cache key.

	        The value comes from Python's built-in ``hash`` and may differ between
	        interpreter runs, so it should not be persisted.

	        Args:
	            image: PIL image or numpy array.

	        Returns:
	            int: Hash of a down-sampled copy of the image combined with its
	            dimensions, or 0 when hashing fails.
	        """
	        try:
	            if isinstance(image, np.ndarray):
	                # Sample every 10th row/column; 0-d and 1-d arrays are hashed whole.
	                sample = image[::10, ::10] if image.ndim >= 2 else image
	                # Shape and dtype are part of the key so that differently sized
	                # images with identical sampled bytes do not collide.
	                return hash((image.shape, image.dtype.str, sample.tobytes()))

	            # Anything else is treated as a PIL image: hash a 32x32 thumbnail.
	            thumbnail = image.resize((32, 32))
	            return hash((image.size, image.mode, thumbnail.tobytes()))
	        except Exception as e:
	            self.logger.error(f"Error generating image hash: {e}")
	            self.logger.error(traceback.format_exc())
	            return 0

	    def enhance_features(self, image: Union["Image.Image", np.ndarray]) -> "Image.Image":
	        """
	        Stretch the lightness contrast of a color image to help landmark detection.

	        The image is converted to LAB, the L channel is rescaled so that its
	        2nd..98th percentile range spans the full 0..100 lightness range, and
	        the result is converted back to RGB. An alpha channel, if present, is
	        dropped from the enhanced output.

	        Args:
	            image: PIL image or numpy array.

	        Returns:
	            PIL.Image: The enhanced image. The input (converted to PIL when
	            that succeeded) is returned unchanged for grayscale images, flat
	            images, when scikit-image is not installed, or on any error.
	        """
	        try:
	            image = self._ensure_pil(image)
	            img_array = np.array(image)

	            # Grayscale images have no color channels to convert; leave them alone.
	            if img_array.ndim < 3:
	                return image

	            try:
	                from skimage import color, exposure
	            except ImportError:
	                self.logger.warning("skimage not available for feature enhancement")
	                return image

	            # Keep only the RGB planes (drops alpha for RGBA input).
	            rgb = img_array[:, :, :3]
	            lab = color.rgb2lab(rgb / 255.0)
	            l_channel = lab[:, :, 0]

	            p2, p98 = np.percentile(l_channel, (2, 98))
	            if p98 <= p2:
	                # Flat lightness: there is no contrast to stretch.
	                return image

	            # out_range must be given explicitly: the default ('dtype') maps a
	            # non-negative float image to 0..1, which would crush the 0..100
	            # L channel and produce an almost black result.
	            lab[:, :, 0] = exposure.rescale_intensity(
	                l_channel, in_range=(p2, p98), out_range=(0.0, 100.0)
	            )

	            enhanced_img = color.lab2rgb(lab) * 255.0
	            return Image.fromarray(enhanced_img.astype(np.uint8))

	        except Exception as e:
	            self.logger.error(f"Error in feature enhancement: {e}")
	            self.logger.error(traceback.format_exc())
	            return image

	    def analyze_viewpoint(self, image: Union["Image.Image", np.ndarray],
	                          clip_model_manager) -> Dict[str, Any]:
	        """
	        Score the image against a fixed set of viewpoint descriptions.

	        Args:
	            image: PIL image or numpy array.
	            clip_model_manager: Object passed through to
	                ``calculate_similarity_scores``.

	        Returns:
	            Dict: ``viewpoint_scores`` (name -> score), ``dominant_viewpoint``
	            (highest-scoring name) and ``confidence`` (its score). When no
	            scores are available the result is empty scores, ``"eye_level"``
	            and 0.0.
	        """
	        try:
	            viewpoint_prompts = {
	                "aerial_view": "an aerial view from above looking down",
	                "street_level": "a street level view looking up at a tall structure",
	                "eye_level": "an eye-level horizontal view of a landmark",
	                "distant": "a distant view of a landmark on the horizon",
	                "close_up": "a close-up detailed view of architectural features",
	                "interior": "an interior view inside a structure",
	                "angled_view": "an angled view of a structure",
	                "low_angle": "a low angle view looking up at a building"
	            }

	            viewpoint_scores = self.calculate_similarity_scores(image, viewpoint_prompts, clip_model_manager)

	            if viewpoint_scores:
	                viewpoint, confidence = max(viewpoint_scores.items(), key=lambda item: item[1])
	                return {
	                    "viewpoint_scores": viewpoint_scores,
	                    "dominant_viewpoint": viewpoint,
	                    "confidence": confidence
	                }
	            # Scoring failed (already logged there): fall through to the default.

	        except Exception as e:
	            self.logger.error(f"Error in viewpoint analysis: {e}")
	            self.logger.error(traceback.format_exc())

	        return {
	            "viewpoint_scores": {},
	            "dominant_viewpoint": "eye_level",
	            "confidence": 0.0
	        }

	    def calculate_similarity_scores(self, image: Union["Image.Image", np.ndarray],
	                                    prompts: Dict[str, str],
	                                    clip_model_manager) -> Dict[str, float]:
	        """
	        Compute the similarity between an image and each of a set of text prompts.

	        Args:
	            image: PIL image or numpy array.
	            prompts: Mapping of prompt name -> prompt text.
	            clip_model_manager: Object providing ``preprocess_image``,
	                ``encode_image``, ``encode_single_text`` and
	                ``calculate_similarity``.

	        Returns:
	            Dict[str, float]: Score per prompt name, in the iteration order of
	            ``prompts``. Empty when ``prompts`` is empty or on any error.
	        """
	        try:
	            if not prompts:
	                return {}

	            image = self._ensure_pil(image)

	            image_input = clip_model_manager.preprocess_image(image)
	            image_features = clip_model_manager.encode_image(image_input)

	            # NOTE(review): encode_single_text receives the whole list of prompt
	            # texts here, while the pyramid analysis uses encode_text_batch for
	            # a list - confirm this method accepts a list.
	            prompt_features = clip_model_manager.encode_single_text(list(prompts.values()))

	            similarity = clip_model_manager.calculate_similarity(image_features, prompt_features)

	            # Column i of the first row is read as the score of the i-th prompt.
	            return {name: float(similarity[0][i]) for i, name in enumerate(prompts)}

	        except Exception as e:
	            self.logger.error(f"Error calculating similarity scores: {e}")
	            self.logger.error(traceback.format_exc())
	            return {}

	    def analyze_architectural_features(self, image: Union["Image.Image", np.ndarray],
	                                       clip_model_manager) -> Dict[str, Any]:
	        """
	        Score the image against generic architectural descriptions.

	        The prompts are deliberately generic (no specific landmark is named).

	        Args:
	            image: PIL image or numpy array.
	            clip_model_manager: Object passed through to
	                ``calculate_similarity_scores``.

	        Returns:
	            Dict: ``architectural_features`` (up to three highest-scoring
	            ``(name, score)`` pairs), ``context_confidence`` (their mean
	            score), ``primary_category`` (category with the largest summed
	            score) and ``category_scores``. When no scores are available the
	            result is an empty list, 0.0, ``"building"`` and an empty dict.
	        """
	        try:
	            architecture_prompts = {
	                "tall_structure": "a tall vertical structure standing alone",
	                "tiered_building": "a building with multiple stacked tiers or segments",
	                "historical_structure": "a building with historical architectural elements",
	                "modern_design": "a modern structure with contemporary architectural design",
	                "segmented_exterior": "a structure with visible segmented or sectioned exterior",
	                "viewing_platform": "a tall structure with observation area at the top",
	                "time_display": "a structure with timepiece features",
	                "glass_facade": "a building with prominent glass exterior surfaces",
	                "memorial_structure": "a monument or memorial structure",
	                "ancient_construction": "ancient constructed elements or archaeological features",
	                "natural_landmark": "a natural geographic formation or landmark",
	                "slanted_design": "a structure with non-vertical or leaning profile"
	            }

	            context_scores = self.calculate_similarity_scores(image, architecture_prompts, clip_model_manager)

	            if context_scores:
	                # The three best-matching features and their mean score.
	                top_features = sorted(context_scores.items(), key=lambda item: item[1], reverse=True)[:3]
	                context_confidence = sum(score for _, score in top_features) / len(top_features)

	                # Each category is scored as the sum of its member features.
	                architectural_categories = {
	                    "tower": ["tall_structure", "viewing_platform", "time_display"],
	                    "skyscraper": ["tall_structure", "modern_design", "glass_facade"],
	                    "historical": ["historical_structure", "ancient_construction", "memorial_structure"],
	                    "natural": ["natural_landmark"],
	                    "distinctive": ["tiered_building", "segmented_exterior", "slanted_design"]
	                }
	                category_scores = {
	                    category: sum(score for feature, score in context_scores.items() if feature in members)
	                    for category, members in architectural_categories.items()
	                }
	                primary_category = max(category_scores.items(), key=lambda item: item[1])[0]

	                return {
	                    "architectural_features": top_features,
	                    "context_confidence": context_confidence,
	                    "primary_category": primary_category,
	                    "category_scores": category_scores
	                }
	            # Without scores every category would tie at zero and the first one
	            # would win by accident; report the neutral default instead.

	        except Exception as e:
	            self.logger.error(f"Error in architectural feature analysis: {e}")
	            self.logger.error(traceback.format_exc())

	        return {
	            "architectural_features": [],
	            "context_confidence": 0.0,
	            "primary_category": "building",
	            "category_scores": {}
	        }

	    def perform_pyramid_analysis(self, image: Union["Image.Image", np.ndarray],
	                                 clip_model_manager, landmark_data_manager,
	                                 levels: int = 4, base_threshold: float = 0.25,
	                                 aspect_ratios: Optional[List[float]] = None) -> Dict[str, Any]:
	        """
	        Search for a landmark at several scales and aspect ratios of the image.

	        For every (scale, aspect ratio) combination the image is resized,
	        encoded, and compared with the landmark text features; the best match
	        is kept when its score reaches ``base_threshold``.

	        Args:
	            image: PIL image or numpy array.
	            clip_model_manager: Object providing ``encode_text_batch``,
	                ``preprocess_image``, ``encode_image`` and
	                ``calculate_similarity``.
	            landmark_data_manager: Object providing ``get_landmark_prompts``
	                and ``get_landmark_by_index``.
	            levels: Number of pyramid levels; each level is 20% of the original
	                size smaller than the previous one, and levels whose scale
	                would be zero or negative are skipped.
	            base_threshold: Minimum score for a match to be recorded.
	            aspect_ratios: Aspect ratios to try at every level; defaults to
	                1.0, 0.75 and 1.5. Non-positive values are ignored.

	        Returns:
	            Dict: ``is_landmark`` (whether any match was recorded), ``results``
	            (matches sorted by descending confidence) and ``best_result``
	            (first entry of ``results`` or None). The empty result is returned
	            when there are no landmark prompts or on any error.
	        """
	        try:
	            image = self._ensure_pil(image)
	            width, height = image.size

	            landmark_prompts = landmark_data_manager.get_landmark_prompts()
	            if not landmark_prompts:
	                return self._no_landmark_result()

	            if aspect_ratios is None:
	                aspect_ratios = self._DEFAULT_ASPECT_RATIOS

	            # Text features are encoded once and reused for every pyramid step.
	            landmark_text_features = clip_model_manager.encode_text_batch(landmark_prompts)

	            pyramid_results = []
	            for scale_factor, aspect_ratio, new_size in self._pyramid_sizes(width, height, levels, aspect_ratios):
	                scaled_image = image.resize(new_size, Image.LANCZOS)

	                image_input = clip_model_manager.preprocess_image(scaled_image)
	                image_features = clip_model_manager.encode_image(image_input)
	                similarity = clip_model_manager.calculate_similarity(image_features, landmark_text_features)

	                # Best-matching landmark prompt for this step.
	                best_idx = similarity[0].argmax().item()
	                best_score = similarity[0][best_idx]
	                if not best_score >= base_threshold:
	                    continue

	                landmark_id, landmark_info = landmark_data_manager.get_landmark_by_index(best_idx)
	                if not landmark_id:
	                    continue

	                # Tolerate a landmark entry that carries no metadata.
	                landmark_info = landmark_info or {}
	                pyramid_results.append({
	                    "landmark_id": landmark_id,
	                    "landmark_name": landmark_info.get("name", "Unknown"),
	                    "confidence": float(best_score),
	                    "scale_factor": scale_factor,
	                    "aspect_ratio": aspect_ratio,
	                    "location": landmark_info.get("location", "Unknown Location")
	                })

	            pyramid_results.sort(key=lambda result: result["confidence"], reverse=True)

	            return {
	                "is_landmark": len(pyramid_results) > 0,
	                "results": pyramid_results,
	                "best_result": pyramid_results[0] if pyramid_results else None
	            }

	        except Exception as e:
	            self.logger.error(f"Error in pyramid analysis: {e}")
	            self.logger.error(traceback.format_exc())
	            return self._no_landmark_result()