# vocalprint-ai / accent_detection_cli.py
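"""Accent & fluency detection CLI.

Downloads a public video, extracts its audio, transcribes it with Whisper,
classifies the speaker's accent, scores fluency, and prints a JSON report.

Example invocation (the URL is illustrative):
    python accent_detection_cli.py --url https://example.com/clip.mp4 --output result.json
"""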

import argparse
import json
import os
import shutil
import tempfile

import torch
import whisper
from core.processor import (
    download_video,
    extract_audio,
    transcribe,
    classify_accent,
    compute_fluency,
)
from core.logger import logger


def main():
    parser = argparse.ArgumentParser(description="Accent & Fluency Detection CLI")
    parser.add_argument('--url', required=True, help='Public video URL (YouTube, Loom, or direct MP4)')
    parser.add_argument('--output', help='Output path for the JSON result')
    parser.add_argument('--device', default='auto', choices=['auto', 'cpu', 'cuda'], help='Device to run Whisper on')
    parser.add_argument('--keep', action='store_true', help='Keep temporary files for debugging')
    args = parser.parse_args()
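
    # Resolve the Whisper device: 'auto' prefers CUDA when available; an explicit
    # 'cuda' request still falls back to CPU if no GPU is present.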
    if args.device == 'cpu':
        whisper_device = 'cpu'
    else:
        whisper_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    whisper_model = whisper.load_model("small", device=whisper_device)
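
    # Work inside an isolated temp directory so intermediates are easy to clean up
    # (kept when --keep is passed; see the finally block).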
    temp_dir = tempfile.mkdtemp()
    audio_path = os.path.join(temp_dir, "audio.wav")
    try:
        download_video(args.url, temp_dir)
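        # download_video is expected to drop an .mp4 into temp_dir; locate it.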
        video_file = next((f for f in os.listdir(temp_dir) if f.endswith(".mp4")), None)
        if not video_file:
            raise FileNotFoundError("No .mp4 file found in temp dir")
        extract_audio(os.path.join(temp_dir, video_file), audio_path)
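
        # Run the analysis pipeline: Whisper transcription, accent classification,
        # and a fluency score computed from the segment timings.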
        transcript, segments, language = transcribe(audio_path, whisper_model)
        top_accent, confidence, top3 = classify_accent(audio_path)
        fluency = compute_fluency(segments)
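
        # Assemble the JSON report; only the first 300 characters of the
        # transcript are included as a sample.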
        result = {
            "accent": top_accent,
            "accent_confidence": confidence,
            "top_3_predictions": top3,
            "fluency_score": fluency,
            "language_detected_by_whisper": language,
            "transcript_sample": transcript[:300],
        }
        print(json.dumps(result, indent=2))

        if args.output:
            with open(args.output, 'w') as f:
                json.dump(result, f, indent=2)
            logger.info(f"Saved output to {args.output}")
    except Exception as e:
        logger.error(f"FAILED: {e}")
        raise SystemExit(1)  # exit non-zero so callers can detect failure
    finally:
        if args.keep:
            logger.info(f"Temporary files kept in: {temp_dir}")
        else:
            shutil.rmtree(temp_dir)


if __name__ == "__main__":
    main()