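# --- Previous full pipeline (record audio -> AssemblyAI transcript -> Phi-3 summary -> JSON notes + search),
# --- kept commented out for reference while the summarization step is tested in isolation below ---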
# import gradio as gr
# import requests
# import time
# import json
# import os
# import datetime
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch

# ### SET YOUR ASSEMBLYAI API KEY
# ASSEMBLYAI_API_KEY = os.getenv("ASSEMBLYAI_API_KEY", "your_assemblyai_api_key")
# headers = {"authorization": ASSEMBLYAI_API_KEY}
# notes_file = "notes.json"

# ### LOAD LLM
# model_id = "IlmaJiyadh/phi3-4k-ft"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     device_map="auto",
#     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
#     trust_remote_code=True
# )

# ### TRANSCRIBE AUDIO WITH ASSEMBLYAI
# def transcribe(audio_path):
#     # AssemblyAI's upload endpoint expects the raw audio bytes in the request body
#     with open(audio_path, 'rb') as f:
#         upload_res = requests.post("https://api.assemblyai.com/v2/upload", headers=headers, data=f)
#     audio_url = upload_res.json()["upload_url"]

#     transcript_res = requests.post("https://api.assemblyai.com/v2/transcript", json={"audio_url": audio_url}, headers=headers)
#     transcript_id = transcript_res.json()["id"]

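#     # Poll the transcript endpoint every 2 s until processing completes or errors out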
#     while True:
#         poll = requests.get(f"https://api.assemblyai.com/v2/transcript/{transcript_id}", headers=headers).json()
#         if poll['status'] == 'completed':
#             return poll['text']
#         elif poll['status'] == 'error':
#             return f"Transcription failed: {poll['error']}"
#         time.sleep(2)

# ### SUMMARIZE USING LLM
# def summarize(text):
#     prompt = f"Below is a lecture transcript. Take lecture notes in bullet points.\n\nInput:\n{text}\n\nSummary:\n"
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     outputs = model.generate(**inputs, max_new_tokens=200, temperature=0.7, use_cache=False)
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)

# ### SAVE TO JSON
# def save_note(date, transcript, summary):
#     data = {"date": date, "transcript": transcript, "summary": summary}
#     if os.path.exists(notes_file):
#         with open(notes_file, "r") as f:
#             all_notes = json.load(f)
#     else:
#         all_notes = []
#     all_notes.append(data)
#     with open(notes_file, "w") as f:
#         json.dump(all_notes, f, indent=2)

# ### SEARCH NOTES
# def search_notes(query):
#     if not os.path.exists(notes_file):
#         return "No notes available yet."
#     with open(notes_file, "r") as f:
#         notes = json.load(f)
#     results = [n for n in notes if query.lower() in n['summary'].lower() or query.lower() in n['transcript'].lower()]
#     if not results:
#         return "No matching notes found."
#     return "\n\n".join([f"πŸ“… {n['date']}\n{n['summary']}" for n in results])

# ### FULL PIPELINE
# def full_pipeline(audio):
#     if audio is None:
#         return "No audio provided", "", ""
    
#     transcript = transcribe(audio)
#     summary = summarize(transcript)
#     date_str = str(datetime.date.today())
#     save_note(date_str, transcript, summary)
#     return transcript, summary, f"βœ… Lecture saved for {date_str}"

# ### BUILD GRADIO UI
# with gr.Blocks() as demo:
#     gr.Markdown("# πŸŽ“ Lecture Assistant (Audio β†’ Summary + Search)")
#     with gr.Row():
#         with gr.Column():
#             # gr.Audio(source="microphone", ...) broke on newer Gradio (the source arg became sources), so plain filepath input is used:
#             audio_input = gr.Audio(type="filepath", label="πŸŽ™οΈ Record Audio")
#             submit_btn = gr.Button("Transcribe & Summarize")
#             transcript_output = gr.Textbox(label="πŸ“„ Transcript")
#             summary_output = gr.Textbox(label="πŸ“ Summary")
#             save_status = gr.Textbox(label="πŸ’Ύ Save Status")

#         with gr.Column():
#             search_query = gr.Textbox(label="πŸ” Search Notes")
#             search_btn = gr.Button("Search")
#             search_output = gr.Textbox(label="Results")

#     submit_btn.click(fn=full_pipeline, inputs=audio_input, outputs=[transcript_output, summary_output, save_status])
#     search_btn.click(fn=search_notes, inputs=search_query, outputs=search_output)

# demo.launch()

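# --- Active app: summarization-only demo to verify the fine-tuned model before re-enabling the full pipeline ---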
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

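# Load the fine-tuned Phi-3 checkpoint once at startup; half precision on GPU, full precision on CPU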
model_id = "IlmaJiyadh/phi3-4k-ft"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True
)

def summarize(text):
    prompt = f"Below is a lecture transcript. Take lecture notes in bullet points.\n\nInput:\n{text}\n\nSummary:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # do_sample=True is required for temperature to take effect during generation
    outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7, use_cache=False)
    # Decode only the newly generated tokens so the prompt is not echoed back in the summary
    return tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

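# Minimal UI for testing: paste a transcript, get bullet-point notes back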
gr.Interface(
    fn=summarize,
    inputs=gr.Textbox(lines=10, label="πŸ“„ Paste Transcript"),
    outputs=gr.Textbox(label="πŸ“ Summary"),
    title="🧠 Transcript β†’ Summary (Phi-3 Fine-tuned)",
    description="Test only the summarization step."
).launch()