# audio2text / app.py
# Ander1's picture
# Upload 11 files
# 4903e5a verified
import gradio as gr
import os
from elevenlabs_stt import transcribe_audio_elevenlabs
from whisper_stt import transcribe_audio_whisper
from transcript_refiner import refine_transcript
from utils import calculate_tokens_and_cost, OPENAI_MODELS, MODEL_PRICES
def process_audio(
    audio_file,
    openai_api_key,
    elevenlabs_api_key,
    service_choice,
    openai_model,
    language,
    speaker_detection=False,
    creativity=0.5
):
    """Transcribe an audio file, refine the transcript, and report token usage.

    Args:
        audio_file: Audio input handed to the transcription helper (the UI
            passes a file path). A falsy value means nothing was provided.
        openai_api_key: OpenAI key forwarded to ``refine_transcript``.
        elevenlabs_api_key: ElevenLabs key; only validated and used when
            ``service_choice`` is ``"ElevenLabs"``.
        service_choice: ``"ElevenLabs"`` selects the ElevenLabs helper; any
            other value uses the Whisper helper.
        openai_model: Model name forwarded to ``refine_transcript`` and
            ``calculate_tokens_and_cost``.
        language: Language value forwarded unchanged to the transcription
            helper.
        speaker_detection: Forwarded to the ElevenLabs helper only.
        creativity: Forwarded to ``refine_transcript``.

    Returns:
        A 4-tuple ``(transcript, refined_text, tokens_info, cost_info)``.
        On a validation failure or any exception, the first element is a
        user-facing message and the remaining three are empty strings.

    Note:
        This function keeps the API keys only in local variables; it does
        not store them anywhere itself.
    """
    try:
        # Pasted keys often carry stray spaces or newlines; strip them so the
        # length check is meaningful and the helpers receive a clean key.
        openai_api_key = (openai_api_key or "").strip()
        elevenlabs_api_key = (elevenlabs_api_key or "").strip()
        use_elevenlabs = service_choice == "ElevenLabs"

        # Key validation comes first so its messages are unchanged for
        # callers that already rely on them.
        if len(openai_api_key) < 20:
            return "請輸入有效的 OpenAI API 金鑰", "", "", ""
        if use_elevenlabs and len(elevenlabs_api_key) < 20:
            return "請輸入有效的 ElevenLabs API 金鑰", "", "", ""

        # Without this guard a missing upload reaches the transcription
        # helper and surfaces as an opaque exception message.
        if not audio_file:
            return "請上傳音訊檔案", "", "", ""

        # Speech to text
        if use_elevenlabs:
            transcript = transcribe_audio_elevenlabs(
                audio_file,
                elevenlabs_api_key,
                language=language,
                speaker_detection=speaker_detection
            )
        else:  # Whisper
            transcript = transcribe_audio_whisper(
                audio_file,
                language=language
            )

        # Refine the raw transcript
        refined_text = refine_transcript(
            transcript,
            openai_api_key,
            openai_model,
            creativity
        )

        # Token count and cost
        tokens_info, cost_info = calculate_tokens_and_cost(
            transcript,
            refined_text,
            openai_model
        )
        return transcript, refined_text, tokens_info, cost_info
    except Exception as e:
        # UI boundary: report the failure in the first output box instead of
        # letting the exception propagate to the framework.
        return f"錯誤:{e}", "", "", ""
# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 音訊轉文字與優化系統")

    with gr.Row():
        # First column: audio upload, credentials and processing options
        with gr.Column():
            audio_input = gr.Audio(label="上傳音訊檔案", type="filepath")

            # The two API-key fields share one row; both are password-type
            with gr.Row():
                openai_key = gr.Textbox(
                    label="OpenAI API 金鑰",
                    placeholder="輸入您的 OpenAI API 金鑰",
                    type="password",
                    value="",
                    every=None,
                )
                elevenlabs_key = gr.Textbox(
                    label="ElevenLabs API 金鑰",
                    placeholder="輸入您的 ElevenLabs API 金鑰(如果使用 ElevenLabs)",
                    type="password",
                    value="",
                    every=None,
                )

            service = gr.Radio(
                choices=["Whisper", "ElevenLabs"],
                label="選擇轉錄服務",
                value="Whisper",
            )
            model = gr.Dropdown(
                choices=list(OPENAI_MODELS.keys()),
                label="選擇 OpenAI 模型",
                value="gpt-3.5-turbo",
            )
            language = gr.Textbox(
                label="語言(可選)",
                placeholder="輸入語言代碼,例如:zh-TW、en、ja",
                value="",
            )
            speaker = gr.Checkbox(label="啟用說話者辨識(僅限 ElevenLabs)", value=False)
            creativity = gr.Slider(minimum=0, maximum=1, value=0.5, label="創意程度")
            process_btn = gr.Button("處理音訊")

        # Second column: the four result boxes filled by process_audio
        with gr.Column():
            original_output = gr.Textbox(label="原始轉錄文字", lines=10)
            refined_output = gr.Textbox(label="優化後文字", lines=10)
            token_info = gr.Textbox(label="Token 使用資訊", lines=3)
            cost_info = gr.Textbox(label="費用資訊", lines=3)

    # Security notice shown to the user (string body kept flush-left so its
    # content matches the source text exactly)
    gr.Markdown("""
### 安全性說明
- API 金鑰僅在當前處理中使用
- 不會儲存任何敏感資訊
- 每次使用需重新輸入 API 金鑰
""")

    # Wire the button: input order must match process_audio's parameters,
    # output order must match its 4-tuple return value
    process_btn.click(
        fn=process_audio,
        inputs=[
            audio_input, openai_key, elevenlabs_key, service,
            model, language, speaker, creativity,
        ],
        outputs=[original_output, refined_output, token_info, cost_info],
    )

# Start the application
demo.launch()