Sambhavnoobcoder committed on
Commit
8cd0b84
·
1 Parent(s): bd6a104

adds file to handle summarisation via YouTube transcripts

Files changed (1)
  1. summarize.py +53 -0
summarize.py ADDED
@@ -0,0 +1,53 @@
+ import sys
+ import traceback
+
+ from youtube_transcript_api import YouTubeTranscriptApi
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+
+ def add_watch_word(link):
+     # If the link has no watch/embed/v= marker, rebuild it as a '/watch?v=<id>' URL.
+     if 'watch?' in link or 'embed/' in link or 'v=' in link:
+         return link
+     else:
+         split_link = link.split('/')
+         last_part = split_link[-1]
+         return '/'.join(split_link[:-1]) + '/watch?v=' + last_part
+
+
+ def Summarizer(link, model):
+     link = add_watch_word(link)
+     video_id = link.split("=")[1]
+
+     try:
+         # Fetch the transcript and join the caption snippets into one string.
+         transcript = YouTubeTranscriptApi.get_transcript(video_id)
+         FinalTranscript = ' '.join([i['text'] for i in transcript])
+
+         # Map the requested model name to a Hugging Face checkpoint.
+         if model == "Pegasus":
+             checkpoint = "google/pegasus-large"
+         elif model == "mT5":
+             checkpoint = "csebuetnlp/mT5_multilingual_XLSum"
+         elif model == "BART":
+             checkpoint = "sshleifer/distilbart-cnn-12-6"
+         else:
+             raise ValueError(f"Unsupported model: {model}")
+
+         tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+         summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+
+         inputs = tokenizer(FinalTranscript,
+                            max_length=1024,
+                            truncation=True,
+                            return_tensors="pt")
+
+         summary_ids = summarizer_model.generate(inputs["input_ids"])
+         summary = tokenizer.batch_decode(summary_ids,
+                                          skip_special_tokens=True,
+                                          clean_up_tokenization_spaces=False)
+
+         return summary[0]
+
+     except Exception:
+         print(traceback.format_exc())
+         print(sys.exc_info()[2])
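
A minimal usage sketch, not part of the commit: assuming the file above is saved as summarize.py and the youtube_transcript_api and transformers packages are installed, the function could be called as below. The URL is a placeholder, and the model argument must be one of "Pegasus", "mT5" or "BART".

# usage_example.py -- hypothetical driver script, not part of this commit
from summarize import Summarizer

# Placeholder URL: replace VIDEO_ID with a real YouTube video that has captions.
link = "https://www.youtube.com/watch?v=VIDEO_ID"

# "Pegasus" and "mT5" are the other supported model names.
summary = Summarizer(link, "BART")
print(summary)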