DanishICUP committed
Commit 17a5b4a (verified) · Parent: cd2c5e9

Upload 5 files
app.py ADDED
@@ -0,0 +1,86 @@
+ from dotenv import load_dotenv
+ import os
+ import gradio as gr
+ from brain_of_doctor import load_image, analyze_image_with_query
+ from voice_of_the_patient import record_audio, transcription_with_groq
+ from voice_of_the_doctor import (
+     text_to_speech_with_gtts_new_autoplay,
+     text_to_speech_with_elevenlabs_new_autoplay
+ )
+ from deep_translator import GoogleTranslator
+ from gtts import gTTS
+
+ load_dotenv()
+
+ system_prompt = """You have to act as a professional doctor. I know you are not one, but this is for learning purposes.
+ What is in this image? Do you find anything medically wrong with it?
+ If you make a differential, suggest some remedies for it. Do not add any numbers or special characters in
+ your response. Your response should be one long paragraph, and always answer as if you are speaking to a real person.
+ Do not say 'In the image I see' but say 'With what I see, I think you have ...'.
+ Do not respond as an AI model or in markdown; your answer should mimic an actual doctor, not an AI bot.
+ Keep your answer concise (max 2 sentences). No preamble, start your answer right away please"""
+
+ # ✅ Helper for Urdu TTS (using gTTS)
+ def tts_with_language_support(text, output_file="final.wav", lang="en"):
+     # Note: gTTS always emits MP3 data, regardless of the file extension
+     try:
+         tts = gTTS(text=text, lang=lang)
+         tts.save(output_file)
+         return output_file
+     except Exception as e:
+         print("TTS generation failed:", e)
+         return None
+
+
+ def process_inputs(audio_filePath, image_filePath, language):
+     # Speech → text
+     speech_to_text_output = transcription_with_groq(
+         GROQ_API_KEY=os.getenv("GROQ_API_KEY"),
+         audio_filePath=audio_filePath,
+         sst_model="whisper-large-v3"
+     )
+
+     # Image analysis
+     if image_filePath:
+         doctor_response_en = analyze_image_with_query(
+             query=system_prompt + speech_to_text_output,
+             encoded_image=load_image(image_filePath),
+             model="meta-llama/llama-4-scout-17b-16e-instruct"
+         )
+     else:
+         doctor_response_en = "No image provided for me to analyze"
+
+     # Translation + voice selection
+     if language == "Urdu":
+         # Translate to Urdu
+         doctor_response = GoogleTranslator(source='en', target='ur').translate(doctor_response_en)
+         # Generate Urdu voice with gTTS
+         output_audio = tts_with_language_support(text=doctor_response, output_file="final.wav", lang="ur")
+     else:
+         # English voice via the gTTS autoplay helper, which returns the wav path
+         doctor_response = doctor_response_en
+         output_audio = text_to_speech_with_gtts_new_autoplay(
+             input_text=doctor_response,
+             output_file="final.wav"
+         )
+
+     return speech_to_text_output, doctor_response, output_audio
+
+
+ # ✅ Gradio Interface
+ iface = gr.Interface(
+     fn=process_inputs,
+     inputs=[
+         gr.Audio(sources=["microphone"], type="filepath", label="Patient's Voice"),
+         gr.Image(type="filepath", label="Upload Medical Image"),
+         gr.Radio(choices=["English", "Urdu"], value="English", label="Select Language")
+     ],
+     outputs=[
+         gr.Textbox(label="Speech to Text"),
+         gr.Textbox(label="Doctor's Response"),
+         gr.Audio(label="Doctor's Voice (Auto Play)", autoplay=True, type="filepath")
+     ],
+     title="AI Doctor — Developed by Danish Khan"
+ )
+
+ iface.launch(debug=True)
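
Both the Groq and ElevenLabs clients above resolve their credentials through load_dotenv(), so running this app locally (or as a Space) needs a .env file alongside app.py. A minimal sketch with placeholder values; the key names are exactly the ones read via os.getenv / os.environ.get in these files:

GROQ_API_KEY=your_groq_api_key
ELEVENLABS_API_KEY=your_elevenlabs_api_key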
brain_of_doctor.py ADDED
@@ -0,0 +1,45 @@
+ from dotenv import load_dotenv
+ import base64
+ from groq import Groq
+
+
+ load_dotenv()
+
+
+ def load_image(image_path):
+     # Read the image from disk and return it as a base64-encoded string
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode("utf-8")
+
+ query = "Describe the condition of the face briefly"
+ model = "meta-llama/llama-4-scout-17b-16e-instruct"
+
+
+ def analyze_image_with_query(model, query, encoded_image):
+     client = Groq()
+     # One multimodal user message: the text query plus the image as a data URL
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {
+                     "type": "text",
+                     "text": query
+                 },
+                 {
+                     "type": "image_url",
+                     "image_url": {
+                         "url": f"data:image/jpeg;base64,{encoded_image}"
+                     },
+                 },
+             ],
+         }
+     ]
+     chat_completion = client.chat.completions.create(
+         model=model,
+         messages=messages,
+     )
+     return chat_completion.choices[0].message.content
+
+
+ # Guarded so the test call does not run when app.py imports this module
+ if __name__ == "__main__":
+     print(analyze_image_with_query(model, query, load_image("acne.jpg")))
requirements.txt ADDED
File without changes
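
Since requirements.txt was uploaded empty, here is a plausible starting point inferred from the imports across the other four files. The PyPI names are my assumption from the import names; note that pydub also needs ffmpeg installed on the system, and sr.Microphone needs PyAudio:

gradio
groq
gTTS
python-dotenv
deep-translator
elevenlabs
pydub
SpeechRecognition
PyAudio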
voice_of_the_doctor.py ADDED
@@ -0,0 +1,115 @@
+ import os
+ import subprocess
+ import platform
+ import elevenlabs
+ from elevenlabs.client import ElevenLabs
+ from gtts import gTTS
+ from pydub import AudioSegment
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
+
+
+ def text_to_speech_with_gtts_old_no_autoplay_functionality(input_text, output_file):
+     language = 'en'
+
+     gttsObj = gTTS(
+         text=input_text,
+         lang=language,
+         slow=False
+     )
+     gttsObj.save(output_file)
+
+ text = "hello this is me danish with ai voice"
+ # text_to_speech_with_gtts_old_no_autoplay_functionality(input_text=text, output_file="output_gtts.mp3")
+
+
+ def text_to_speech_with_elevenlabs_old_no_autoplay_functionality(input_text, output_file):
+     client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
+
+     audio = client.generate(
+         text=input_text,
+         voice="Aria",
+         model="eleven_turbo_v2",
+         output_format="mp3_22050_32"
+     )
+     elevenlabs.save(audio, output_file)
+
+ # text_to_speech_with_elevenlabs_old_no_autoplay_functionality(input_text=text, output_file="output_elevenlabs.mp3")
+
+
+ def text_to_speech_with_gtts_new_autoplay(input_text, output_file):
+     language = 'en'
+
+     # create the gTTS object
+     gttsObj = gTTS(
+         text=input_text,
+         lang=language,
+         slow=False
+     )
+     # save the audio file
+     gttsObj.save(output_file)
+
+     # conversion from mp3 to wav
+     wav_file = output_file.replace('.mp3', '.wav')
+     AudioSegment.from_mp3(output_file).export(wav_file, format="wav")
+     # Optionally remove the original mp3 file
+     # os.remove(output_file)
+     # Play the wav file based on the operating system
+     os_name = platform.system()
+
+     try:
+         if os_name == "Darwin":
+             subprocess.run(['afplay', wav_file])
+         elif os_name == "Windows":
+             subprocess.run(['powershell', '-c', f'(New-Object Media.SoundPlayer "{wav_file}").PlaySync();'])
+         elif os_name == "Linux":
+             subprocess.run(['aplay', wav_file])
+         else:
+             raise OSError("Unsupported operating system")
+     except Exception as e:
+         print(f"An error occurred while trying to play the audio: {e}")
+
+     # Return the wav path so callers such as app.py can hand it to Gradio
+     return wav_file
+
+ # text_to_speech_with_gtts_new_autoplay(input_text=text, output_file="gtts_testing_autoplay.mp3")
+
+
+ def text_to_speech_with_elevenlabs_new_autoplay(input_text, output_file):
+     client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
+
+     audio = client.generate(
+         text=input_text,
+         voice="Aria",
+         model="eleven_turbo_v2",
+         output_format="mp3_22050_32"
+     )
+     elevenlabs.save(audio, output_file)
+
+     wav_file = output_file.replace(".mp3", ".wav")
+     AudioSegment.from_mp3(output_file).export(wav_file, format="wav")
+
+     # os.remove(output_file)
+
+     os_name = platform.system()
+     try:
+         if os_name == "Darwin":
+             subprocess.run(['afplay', wav_file])
+         elif os_name == "Windows":
+             subprocess.run(['powershell', '-c', f'(New-Object Media.SoundPlayer "{wav_file}").PlaySync();'])
+         elif os_name == "Linux":
+             subprocess.run(['aplay', wav_file])
+         else:
+             raise OSError("Unsupported operating system")
+     except Exception as e:
+         print(f"An error occurred while trying to play the audio: {e}")
+
+     return wav_file
+
+
+ # Guarded so this test call does not fire when app.py imports the module
+ if __name__ == "__main__":
+     text_to_speech_with_elevenlabs_new_autoplay(input_text=text, output_file="output_elevenlabs_autoplay.mp3")
voice_of_the_patient.py ADDED
@@ -0,0 +1,59 @@
+ import os
+ import logging
+ import speech_recognition as sr
+ from pydub import AudioSegment
+ from io import BytesIO
+ from groq import Groq
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+
+ def record_audio(file_path, duration=20, phrase_time_limit=None):
+     recognizer = sr.Recognizer()
+
+     try:
+         with sr.Microphone() as source:
+             logging.info("Adjusting for ambient noise...")
+             recognizer.adjust_for_ambient_noise(source)
+             logging.info("Recording audio...")
+
+             # Record the audio (duration acts as the wait-for-speech timeout)
+             audio_data = recognizer.listen(source, timeout=duration, phrase_time_limit=phrase_time_limit)
+             logging.info("Recording complete.")
+
+             # Convert the recorded audio to an MP3 file
+             audio_waves = audio_data.get_wav_data()
+             audio_segments = AudioSegment.from_wav(BytesIO(audio_waves))
+             audio_segments.export(file_path, format="mp3", bitrate="128k")
+             logging.info(f"Audio saved to {file_path}.")
+
+     except Exception as e:
+         logging.error(f"An error occurred while recording audio: {e}")
+
+
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+ sst_model = "whisper-large-v3"
+
+
+ def transcription_with_groq(audio_filePath, sst_model, GROQ_API_KEY):
+     try:
+         groq_client = Groq(api_key=GROQ_API_KEY)
+         with open(audio_filePath, "rb") as audio_file:
+             transcription = groq_client.audio.transcriptions.create(
+                 model=sst_model,
+                 file=audio_file,
+                 language="en"
+             )
+         logging.info("Transcription complete.")
+         return transcription.text
+     except Exception as e:
+         logging.error(f"An error occurred during transcription: {e}")
+
+
+ # Record and transcribe only when run as a script; app.py imports these functions
+ if __name__ == "__main__":
+     audio_filePath = "patient_voice.mp3"
+     record_audio(file_path=audio_filePath)
+     print(transcription_with_groq(audio_filePath=audio_filePath, sst_model=sst_model, GROQ_API_KEY=GROQ_API_KEY))