# import gradio as gr
# import requests
# import time
# import json
# import os
# import datetime
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch
# ### SET YOUR ASSEMBLYAI API KEY
# ASSEMBLYAI_API_KEY = os.getenv("ASSEMBLYAI_API_KEY", "your_assemblyai_api_key")
# headers = {"authorization": ASSEMBLYAI_API_KEY}
# notes_file = "notes.json"
# ### LOAD LLM
# model_id = "IlmaJiyadh/phi3-4k-ft"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     device_map="auto",
#     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
#     trust_remote_code=True
# )
# ### TRANSCRIBE AUDIO WITH ASSEMBLYAI
# def transcribe(audio_path):
#     # Upload the local audio file, request a transcript, then poll until it is ready.
#     with open(audio_path, 'rb') as f:
#         # AssemblyAI's upload endpoint expects the raw audio bytes as the request body
#         upload_res = requests.post("https://api.assemblyai.com/v2/upload", headers=headers, data=f)
#     audio_url = upload_res.json()["upload_url"]
#     transcript_res = requests.post("https://api.assemblyai.com/v2/transcript", json={"audio_url": audio_url}, headers=headers)
#     transcript_id = transcript_res.json()["id"]
#     while True:
#         poll = requests.get(f"https://api.assemblyai.com/v2/transcript/{transcript_id}", headers=headers).json()
#         if poll['status'] == 'completed':
#             return poll['text']
#         elif poll['status'] == 'error':
#             return f"Transcription failed: {poll['error']}"
#         time.sleep(2)
# ### SUMMARIZE USING LLM
# def summarize(text):
#     prompt = f"Below is a lecture transcript. Take lecture notes in bullet points.\n\nInput:\n{text}\n\nSummary:\n"
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     # do_sample=True is needed for temperature to have any effect
#     outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7, use_cache=False)
#     # decode only the newly generated tokens so the prompt is not echoed back
#     return tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
# ### SAVE TO JSON
# def save_note(date, transcript, summary):
#     data = {"date": date, "transcript": transcript, "summary": summary}
#     if os.path.exists(notes_file):
#         with open(notes_file, "r") as f:
#             all_notes = json.load(f)
#     else:
#         all_notes = []
#     all_notes.append(data)
#     with open(notes_file, "w") as f:
#         json.dump(all_notes, f, indent=2)
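# # NOTE: on Hugging Face Spaces, files written at runtime (like notes.json) sit on
# # ephemeral storage and are lost on restart unless persistent storage is enabled.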
# ### SEARCH NOTES
# def search_notes(query):
#     if not os.path.exists(notes_file):
#         return "No notes available yet."
#     with open(notes_file, "r") as f:
#         notes = json.load(f)
#     results = [n for n in notes if query.lower() in n['summary'].lower() or query.lower() in n['transcript'].lower()]
#     if not results:
#         return "No matching notes found."
#     return "\n\n".join([f"📅 {n['date']}\n{n['summary']}" for n in results])
# ### FULL PIPELINE
# def full_pipeline(audio):
#     if audio is None:
#         return "No audio provided", "", ""
#     transcript = transcribe(audio)
#     summary = summarize(transcript)
#     date_str = str(datetime.date.today())
#     save_note(date_str, transcript, summary)
#     return transcript, summary, f"✅ Lecture saved for {date_str}"
# ### BUILD GRADIO UI
# with gr.Blocks() as demo:
#     gr.Markdown("# 🎓 Lecture Assistant (Audio → Summary + Search)")
#     with gr.Row():
#         with gr.Column():
#             # the 'source' kwarg was removed in Gradio 4.x (use sources=["microphone"] instead):
#             # audio_input = gr.Audio(source="microphone", type="filepath", label="🎙️ Record Audio")
#             audio_input = gr.Audio(type="filepath", label="🎙️ Record Audio")
#             submit_btn = gr.Button("Transcribe & Summarize")
#             transcript_output = gr.Textbox(label="📄 Transcript")
#             summary_output = gr.Textbox(label="📝 Summary")
#             save_status = gr.Textbox(label="💾 Save Status")
#         with gr.Column():
#             search_query = gr.Textbox(label="🔍 Search Notes")
#             search_btn = gr.Button("Search")
#             search_output = gr.Textbox(label="Results")
#     submit_btn.click(fn=full_pipeline, inputs=audio_input, outputs=[transcript_output, summary_output, save_status])
#     search_btn.click(fn=search_notes, inputs=search_query, outputs=search_output)
# demo.launch()
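### MINIMAL TEST APP: SUMMARIZATION ONLY (full audio pipeline kept above for reference)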
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
model_id = "IlmaJiyadh/phi3-4k-ft"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True
)
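# Optional sanity check (not in the original script): log which device the weights landed on.
print(f"Model device: {next(model.parameters()).device}")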
def summarize(text):
    prompt = f"Below is a lecture transcript. Take lecture notes in bullet points.\n\nInput:\n{text}\n\nSummary:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # do_sample=True is needed for temperature to have any effect
    outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7, use_cache=False)
    # decode only the newly generated tokens so the prompt is not echoed back
    return tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
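# Quick smoke test for summarize() with a toy transcript (hypothetical sample text);
# uncomment once the model weights have downloaded:
# sample = ("Today we covered gradient descent: the learning rate controls the step size, "
#           "and too large a rate can make training diverge.")
# print(summarize(sample))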
gr.Interface(
    fn=summarize,
    inputs=gr.Textbox(lines=10, label="📄 Paste Transcript"),
    outputs=gr.Textbox(label="📝 Summary"),
    title="🧠 Transcript → Summary (Phi-3 Fine-tuned)",
    description="Test only the summarization step."
).launch()
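# On Spaces, launch() is served automatically; run locally, Gradio defaults to http://127.0.0.1:7860.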