import gradio as gr
import torch
from PIL import Image
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    AutoTokenizer,
    AutoModelForSeq2SeqLM
)
from typing import Union
from gtts import gTTS
import uuid
import time
import gc

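# Cap CPU threads so inference stays responsive on shared or CPU-only hosts.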
torch.set_num_threads(2)
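
# Lazily-created singleton: the heavy models are loaded once, on first use.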
_pipeline = None

def init_pipeline():
    global _pipeline
    if _pipeline is None:
        _pipeline = ImageCaptionPipeline()
    return _pipeline

class ImageCaptionPipeline:
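    """Image captioning pipeline: BLIP-large (image -> English caption) plus
    Helsinki-NLP Opus-MT (English -> Russian), loaded once per process."""
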
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        start_time = time.time()
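        # BLIP-large: generates an English caption from the image.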
        self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large", use_fast=True)
        self.blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(self.device)
        print(f"Время загрузки BLIP: {time.time() - start_time:.2f} секунд")
        
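        # Opus-MT en-ru: translates the English caption into Russian.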
        start_time = time.time()
        self.translator_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
        self.translator_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru").to(self.device)
        print(f"Время загрузки переводчика: {time.time() - start_time:.2f} секунд")

    def generate_captions(self, image: Union[str, Image.Image]) -> tuple:
        start_time = time.time()
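        # Accept either a file path or an in-memory PIL image.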
        if isinstance(image, str):
            image = Image.open(image)
        image = image.convert("RGB")
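        # Fixed 512x512 resize bounds preprocessing cost (aspect ratio is not preserved).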
        image = image.resize((512, 512), Image.Resampling.LANCZOS)
        inputs = self.blip_processor(images=image, return_tensors="pt").to(self.device)
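        # A small beam width (2) trades a little caption quality for lower latency.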
        with torch.no_grad():
            output_ids = self.blip_model.generate(**inputs, max_length=50, num_beams=2, early_stopping=True)
            english_caption = self.blip_processor.decode(output_ids[0], skip_special_tokens=True)
        print(f"Время генерации английской подписи: {time.time() - start_time:.2f} секунд")
        
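        # Translate the English caption into Russian.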
        start_time = time.time()
        translated_inputs = self.translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(self.device)
        with torch.no_grad():
            translated_ids = self.translator_model.generate(
                **translated_inputs,
                max_length=50,
                num_beams=2,
                early_stopping=True
            )
            russian_caption = self.translator_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
        print(f"Время перевода на русский: {time.time() - start_time:.2f} секунд")
        
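        # Encourage prompt release of intermediate tensors between requests.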
        gc.collect()
        return english_caption, russian_caption

    def generate_audio(self, text: str, language: str) -> str:
        start_time = time.time()
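        # Map the UI language choice to gTTS's ISO language code.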
        lang_code = "ru" if language == "Русский" else "en"
        tts = gTTS(text=text, lang=lang_code)
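        # Unique filename per request avoids collisions between concurrent users;
        # note that the generated files are never cleaned up.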
        audio_path = f"caption_audio_{uuid.uuid4()}.mp3"
        tts.save(audio_path)
        print(f"Время генерации озвучки: {time.time() - start_time:.2f} секунд")
        return audio_path

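# Gradio callbacks: thin wrappers around the lazily-initialized pipeline singleton.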
def generate_captions(image: Image.Image) -> tuple:
    if image is None:
        return "Загрузите изображение.", "Загрузите изображение.", None
    pipeline = init_pipeline()
    english_caption, russian_caption = pipeline.generate_captions(image)
    # The trailing None clears any previously generated audio in the UI.
    return english_caption, russian_caption, None

def generate_audio(english_caption: str, russian_caption: str, audio_language: str) -> str:
    if not english_caption and not russian_caption:
        return None
    pipeline = init_pipeline()
    text = russian_caption if audio_language == "Русский" else english_caption
    return pipeline.generate_audio(text, audio_language)

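# UI layout: image and a generate button on the left; captions, voice-over
# language, and the audio player on the right.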
with gr.Blocks(css="""
    .btn {
        width: 200px; 
        background-color: #4B0082; 
        color: white;  
        font-size: 16px; 
    } 
    .equal-height {
        height: 100px !important;
    }
""") as iface:
    with gr.Row():
        with gr.Column(scale=1, min_width=400, variant="panel"):
            with gr.Row():
                image = gr.Image(type="pil", label="Изображение", height=400, width=400)

            with gr.Row():
                submit_button = gr.Button("Сгенерировать описание", elem_classes="btn")

            
        with gr.Column(scale=1, variant="panel"):
            with gr.Row():
                english_caption = gr.Textbox(label="Английский язык:", lines=1, interactive=False)
                russian_caption = gr.Textbox(label="Русский язык:", lines=1, interactive=False)
            with gr.Row():
                audio_language = gr.Dropdown(
                    choices=["Русский", "English"], 
                    label="Язык озвучки", 
                    value="Русский", 
                    elem_classes="equal-height"
                )
                audio_output = gr.Audio(
                    label="Озвучка", 
                    elem_classes="equal-height"
                )
            with gr.Row():
                audio_button = gr.Button("Сгенерировать озвучку", elem_classes="btn")
    
    # audio_output is listed as a third output so the stale audio clip is
    # cleared whenever new captions are generated (the wrapper returns None
    # for it); without it the callback returns more values than outputs.
    submit_button.click(
        fn=generate_captions,
        inputs=[image],
        outputs=[english_caption, russian_caption, audio_output]
    )
    
    audio_button.click(
        fn=generate_audio,
        inputs=[english_caption, russian_caption, audio_language],
        outputs=[audio_output]
    )

if __name__ == "__main__":
    iface.launch()

# Pum-pummm..