# llama-models/app/vision.py
import torch
from PIL import Image
import pytesseract
from transformers import BlipProcessor, BlipForConditionalGeneration
from io import BytesIO
import base64
import os
import tempfile
from typing import Union
import whisper  # provided by the `openai-whisper` package

# === DEVICE SETUP ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === LOAD WHISPER MODEL FOR AUDIO TRANSCRIPTION ===
try:
    whisper_model = whisper.load_model("base")  # Options: "tiny", "base", "small", "medium", "large"
except Exception as e:
    raise RuntimeError(f"❌ Failed to load Whisper model: {str(e)}")

# === LOAD BLIP FOR IMAGE CAPTIONING ===
try:
    blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    blip_model = BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-base"
    ).to(device)
except Exception as e:
    raise RuntimeError(f"❌ Failed to load BLIP model: {str(e)}")

# === TEXT EXTRACTION (OCR) ===
def extract_text_from_image_base64(image_base64: str) -> str:
    """Extract text from a base64-encoded image."""
    try:
        image_data = base64.b64decode(image_base64)
        image = Image.open(BytesIO(image_data))
        return pytesseract.image_to_string(image).strip()
    except Exception as e:
        return f"❌ OCR Error (base64): {str(e)}"


def extract_text_from_image_path(image_path: str) -> str:
    """Extract text from an image file path."""
    try:
        image = Image.open(image_path)
        return pytesseract.image_to_string(image).strip()
    except Exception as e:
        return f"❌ OCR Error (path): {str(e)}"


def extract_text_from_image_bytes(image_bytes: bytes) -> str:
    """Extract text from raw image bytes (e.g., file uploads)."""
    try:
        image = Image.open(BytesIO(image_bytes))
        return pytesseract.image_to_string(image).strip()
    except Exception as e:
        return f"❌ OCR Error (bytes): {str(e)}"


def extract_text_from_image(image_base64: str) -> str:
    """API alias for default OCR from base64 input."""
    return extract_text_from_image_base64(image_base64)
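
# Example usage (illustrative sketch; "receipt.png" is a hypothetical file name):
#
#   with open("receipt.png", "rb") as f:
#       encoded = base64.b64encode(f.read()).decode("utf-8")
#   print(extract_text_from_image(encoded))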

# === IMAGE CAPTIONING ===
def caption_image(image: Image.Image) -> str:
    """Generate a caption from a PIL image object."""
    try:
        inputs = blip_processor(image.convert("RGB"), return_tensors="pt").to(device)
        outputs = blip_model.generate(**inputs)
        return blip_processor.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        return f"❌ Captioning Error: {str(e)}"


def caption_image_path(image_path: str) -> str:
    """Generate a caption from an image file path."""
    try:
        image = Image.open(image_path)
        return caption_image(image)
    except Exception as e:
        return f"❌ Captioning Error (path): {str(e)}"


def caption_image_bytes(image_bytes: bytes) -> str:
    """Generate a caption from image bytes."""
    try:
        image = Image.open(BytesIO(image_bytes))
        return caption_image(image)
    except Exception as e:
        return f"❌ Captioning Error (bytes): {str(e)}"


def describe_image(input_data: Union[str, bytes]) -> str:
    """Unified captioning API: accepts either a file path or raw image bytes."""
    try:
        if isinstance(input_data, bytes):
            return caption_image_bytes(input_data)
        elif isinstance(input_data, str):
            return caption_image_path(input_data)
        else:
            return "❌ Unsupported input type for describe_image"
    except Exception as e:
        return f"❌ Description Error: {str(e)}"
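
# Example usage (illustrative sketch; "photo.jpg" is a hypothetical file name):
#
#   print(describe_image("photo.jpg"))         # caption from a file path
#   with open("photo.jpg", "rb") as f:
#       print(describe_image(f.read()))        # caption from raw bytes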

# === AUDIO TRANSCRIPTION ===
def transcribe_audio_bytes(audio_bytes: bytes) -> str:
    """Transcribe raw audio bytes using Whisper."""
    try:
        # Write to a unique temporary file so concurrent calls do not overwrite each other
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp.write(audio_bytes)
            temp_path = tmp.name
        try:
            result = whisper_model.transcribe(temp_path)
            return result.get("text", "").strip()
        finally:
            os.remove(temp_path)
    except Exception as e:
        return f"❌ Transcription Error: {str(e)}"


def transcribe_audio_path(audio_path: str) -> str:
    """Transcribe audio file using Whisper."""
    try:
        result = whisper_model.transcribe(audio_path)
        return result.get("text", "").strip()
    except Exception as e:
        return f"❌ Transcription Error (path): {str(e)}"
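

# Minimal smoke test (a sketch, assuming the hypothetical files "sample.jpg" and
# "sample.wav" exist next to this module; adjust the paths before running).
if __name__ == "__main__":
    print("Caption:", describe_image("sample.jpg"))
    print("OCR text:", extract_text_from_image_path("sample.jpg"))
    print("Transcript:", transcribe_audio_path("sample.wav"))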