spagestic committed
Commit 0c66d86 · verified · 1 Parent(s): 3ea4b9d

Create app.py

Files changed (1):
  1. app.py +550 -0
app.py ADDED
#!/usr/bin/env python3
"""
Web Scraper MCP Server

A Model Context Protocol server that provides web scraping tools.
Exposes functions to scrape websites, convert content to markdown, and generate sitemaps.
"""

import gradio as gr
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from urllib.parse import urljoin, urlparse
from typing import Tuple, Optional
import re
import tempfile
import zipfile
import os


def scrape_website_content(url: str) -> Tuple[str, Optional[str]]:
    """
    Scrape a website and return its main content formatted as markdown and a downloadable file path.

    Args:
        url (str): The URL to scrape (can include or omit http/https protocol)

    Returns:
        Tuple[str, Optional[str]]: The scraped content formatted as markdown, and a file path for download
    """
    try:
        # Validate URL
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        # Create session with proper headers
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        # Make request
        response = session.get(url, timeout=10)
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            element.decompose()

        # Try to find main content area
        main_content = (
            soup.find('main') or
            soup.find('article') or
            soup.find('div', class_=re.compile(r'content|main|post|article')) or
            soup.find('body')
        )

        if main_content:
            # Convert to markdown
            markdown_text = md(str(main_content), heading_style="ATX")

            # Clean up the markdown
            # Remove excessive newlines
            markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text)
            # Remove empty links
            markdown_text = re.sub(r'\[\s*\]\([^)]*\)', '', markdown_text)
            # Clean up whitespace
            markdown_text = re.sub(r'[ \t]+', ' ', markdown_text)

            # Add title if available
            title = soup.find('title')
            if title:
                markdown_text = f"# {title.get_text().strip()}\n\n{markdown_text}"
            markdown_text = markdown_text.strip()

            # Write to temp file for download
            with tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode="w", encoding="utf-8") as f:
                f.write(markdown_text)
                temp_path = f.name
            return markdown_text, temp_path
        return "No main content found on the webpage.", None
    except requests.exceptions.RequestException as e:
        return f"Error fetching URL: {str(e)}", None
    except Exception as e:
        return f"Error processing content: {str(e)}", None


def generate_sitemap(url: str, max_links_per_domain: Optional[int] = None) -> Tuple[str, Optional[str]]:
    """
    Generate a sitemap from all links found on a webpage and provide a downloadable file path.

    Args:
        url (str): The URL to analyze for links (can include or omit http/https protocol)
        max_links_per_domain (int, optional): Maximum number of links to display per domain.
            If None, shows all links. Defaults to None.

    Returns:
        Tuple[str, Optional[str]]: A markdown-formatted sitemap of all links found on the page, and a file path for download
    """
    try:
        # Validate URL
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        # Create session with proper headers
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        # Make request
        response = session.get(url, timeout=10)
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all links
        links = soup.find_all('a', href=True)

        # Process links
        sitemap_data = []
        seen_urls = set()

        for link in links:
            href = link.get('href')
            text = link.get_text().strip()

            if not href:
                continue

            # Convert relative URLs to absolute
            full_url = urljoin(url, href)

            # Filter out unwanted links
            if (full_url in seen_urls or
                    href.startswith(('#', 'javascript:', 'mailto:', 'tel:')) or
                    full_url == url):
                continue

            seen_urls.add(full_url)

            # Create link entry
            if not text:
                text = href

            sitemap_data.append({
                'text': text[:100] + '...' if len(text) > 100 else text,
                'url': full_url
            })

        # Generate sitemap markdown
        if not sitemap_data:
            return "No links found on this page.", None

        sitemap_md = "# Sitemap\n\n"
        sitemap_md += f"Found {len(sitemap_data)} links:\n\n"

        # Group by domain for better organization
        domain_groups = {}
        parsed_base = urlparse(url)

        for item in sitemap_data:
            parsed_url = urlparse(item['url'])

            if parsed_url.netloc == parsed_base.netloc:
                domain_key = "Internal Links"
            else:
                domain_key = f"External Links ({parsed_url.netloc})"

            if domain_key not in domain_groups:
                domain_groups[domain_key] = []

            domain_groups[domain_key].append(item)

        # Format sitemap
        for domain, domain_links in domain_groups.items():
            sitemap_md += f"## {domain}\n\n"

            # Use the limit parameter or show all links if None
            if max_links_per_domain is None:
                links_to_show = domain_links
                remaining_links = 0
            else:
                links_to_show = domain_links[:max_links_per_domain]
                remaining_links = max(0, len(domain_links) - max_links_per_domain)

            for link in links_to_show:
                sitemap_md += f"- [{link['text']}]({link['url']})\n"

            if remaining_links > 0:
                sitemap_md += f"- ... and {remaining_links} more links\n"

            sitemap_md += "\n"

        # Write to temp file for download
        with tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode="w", encoding="utf-8") as f:
            f.write(sitemap_md)
            temp_path = f.name
        return sitemap_md, temp_path
    except requests.exceptions.RequestException as e:
        return f"Error fetching URL: {str(e)}", None
    except Exception as e:
        return f"Error processing content: {str(e)}", None


def extract_all_content_as_zip(url: str, max_links: Optional[int] = None) -> Tuple[str, Optional[str]]:
    """
    Extract text content from all links found on a webpage and create a downloadable zip file.

    Args:
        url (str): The URL to analyze for links (can include or omit http/https protocol)
        max_links (int, optional): Maximum number of links to process. If None, processes all links. Defaults to None.

    Returns:
        Tuple[str, Optional[str]]: Status message and zip file path for download
    """
    try:
        # Validate URL
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        # Create session with proper headers
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        # First get the sitemap to find all links
        response = session.get(url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a', href=True)

        # Process links to get unique URLs
        unique_urls = set()
        parsed_base = urlparse(url)

        for link in links:
            href = link.get('href')
            if not href:
                continue

            full_url = urljoin(url, href)

            # Filter out unwanted links and focus on same domain for safety
            if (href.startswith(('#', 'javascript:', 'mailto:', 'tel:')) or
                    full_url == url):
                continue

            # Only include internal links to avoid scraping too many external sites
            parsed_url = urlparse(full_url)
            if parsed_url.netloc == parsed_base.netloc:
                unique_urls.add(full_url)

        if not unique_urls:
            return "No internal links found to extract content from.", None

        # Use all URLs or limit if specified
        urls_to_process = list(unique_urls)
        total_links_found = len(urls_to_process)

        # Apply limit if specified
        if max_links is not None:
            urls_to_process = urls_to_process[:max_links]
            limited_message = f" (limited to {max_links} out of {total_links_found})"
        else:
            limited_message = ""

        # Create temporary zip file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as temp_zip:
            zip_path = temp_zip.name

        successful_extractions = 0
        failed_extractions = 0

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for i, link_url in enumerate(urls_to_process, 1):
                try:
                    # Get content from each link
                    link_response = session.get(link_url, timeout=10)
                    link_response.raise_for_status()

                    # Parse and extract content
                    link_soup = BeautifulSoup(link_response.content, 'html.parser')

                    # Remove unwanted elements
                    for element in link_soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                        element.decompose()

                    # Find main content
                    main_content = (
                        link_soup.find('main') or
                        link_soup.find('article') or
                        link_soup.find('div', class_=re.compile(r'content|main|post|article')) or
                        link_soup.find('body')
                    )

                    if main_content:
                        # Convert to markdown
                        markdown_text = md(str(main_content), heading_style="ATX")

                        # Clean up the markdown
                        markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text)
                        markdown_text = re.sub(r'\[\s*\]\([^)]*\)', '', markdown_text)
                        markdown_text = re.sub(r'[ \t]+', ' ', markdown_text)

                        # Add title if available
                        title = link_soup.find('title')
                        if title:
                            markdown_text = f"# {title.get_text().strip()}\n\n{markdown_text}"

                        markdown_text = markdown_text.strip()

                        # Create safe filename
                        parsed_link = urlparse(link_url)
                        safe_filename = re.sub(r'[^\w\-_.]', '_', parsed_link.path or 'index')
                        if not safe_filename.endswith('.md'):
                            safe_filename += '.md'

                        # Ensure unique filename
                        if safe_filename == '.md' or safe_filename == 'index.md':
                            safe_filename = f"page_{i}.md"

                        # Add source URL as header
                        final_content = f"<!-- Source: {link_url} -->\n\n{markdown_text}"

                        # Add to zip
                        zip_file.writestr(safe_filename, final_content)
                        successful_extractions += 1
                    else:
                        failed_extractions += 1

                except Exception:
                    failed_extractions += 1
                    continue

        status_message = f"Successfully extracted content from {successful_extractions} pages{limited_message}"
        if failed_extractions > 0:
            status_message += f", failed to extract from {failed_extractions} pages"
        status_message += f". Created zip file with {successful_extractions} markdown files."

        return status_message, zip_path

    except requests.exceptions.RequestException as e:
        return f"Error fetching URL: {str(e)}", None
    except Exception as e:
        return f"Error processing content: {str(e)}", None


def generate_sitemap_for_ui(url: str) -> Tuple[str, Optional[str]]:
    """
    Wrapper function for the Gradio UI that shows all links without limitation.

    Args:
        url (str): The URL to analyze for links

    Returns:
        Tuple[str, Optional[str]]: A markdown-formatted sitemap of all links found on the page, and a file path for download
    """
    return generate_sitemap(url, max_links_per_domain=None)


def generate_sitemap_with_limit(url: str, max_links_per_domain: int) -> Tuple[str, Optional[str]]:
    """
    Wrapper function for the Gradio UI that allows a configurable link limit per domain.

    Args:
        url (str): The URL to analyze for links
        max_links_per_domain (int): Maximum number of links to display per domain (0 = show all)

    Returns:
        Tuple[str, Optional[str]]: A markdown-formatted sitemap of all links found on the page, and a file path for download
    """
    limit = None if max_links_per_domain == 0 else max_links_per_domain
    return generate_sitemap(url, max_links_per_domain=limit)


def extract_all_content_for_ui(url: str) -> Tuple[str, Optional[str]]:
    """
    Wrapper function for the Gradio UI that extracts content from all internal links without limitation.

    Args:
        url (str): The URL to analyze for links

    Returns:
        Tuple[str, Optional[str]]: Status message and zip file path for download
    """
    return extract_all_content_as_zip(url, max_links=None)


def extract_limited_content_as_zip(url: str, max_links: int) -> Tuple[str, Optional[str]]:
    """
    Wrapper function for the Gradio UI that allows a configurable link limit for bulk extraction.

    Args:
        url (str): The URL to analyze for links
        max_links (int): Maximum number of links to process (0 = process all)

    Returns:
        Tuple[str, Optional[str]]: Status message and zip file path for download
    """
    limit = None if max_links == 0 else max_links
    return extract_all_content_as_zip(url, max_links=limit)


# Create Gradio interfaces for each function
def create_mcp_interface():
    """Create Gradio interface that exposes web scraping tools as MCP functions."""
    # Create individual interfaces for each tool
    scrape_interface = gr.Interface(
        fn=scrape_website_content,
        inputs=gr.Textbox(
            label="Website URL",
            placeholder="https://example.com or example.com"
        ),
        outputs=[
            gr.Textbox(
                label="Scraped Content",
                lines=20,
                max_lines=50,
                show_copy_button=True,
                container=True
            ),
            gr.File(label="Download Markdown")
        ],
        title="Website Content Scraper",
        description="Extract and format website content as markdown",
        api_name="scrape_content"
    )

    sitemap_interface = gr.Interface(
        fn=generate_sitemap_for_ui,
        inputs=gr.Textbox(
            label="Website URL",
            placeholder="https://example.com or example.com"
        ),
        outputs=[
            gr.Textbox(
                label="Sitemap",
                lines=20,
                max_lines=50,
                show_copy_button=True,
                container=True
            ),
            gr.File(label="Download Sitemap")
        ],
        title="Website Sitemap Generator",
        description="Generate a sitemap of all links found on a webpage",
        api_name="generate_sitemap"
    )

    bulk_extract_interface = gr.Interface(
        fn=extract_all_content_for_ui,
        inputs=gr.Textbox(
            label="Website URL",
            placeholder="https://example.com or example.com"
        ),
        outputs=[
            gr.Textbox(
                label="Extraction Status",
                lines=10,
                max_lines=20,
                show_copy_button=True,
                container=True
            ),
            gr.File(label="Download ZIP Archive")
        ],
        title="Bulk Content Extractor",
        description="Extract text content from all internal links and download as ZIP",
        api_name="extract_all_content"
    )

    # Enhanced sitemap interface with configurable limits
    sitemap_limited_interface = gr.Interface(
        fn=generate_sitemap_with_limit,
        inputs=[
            gr.Textbox(
                label="Website URL",
                placeholder="https://example.com or example.com"
            ),
            gr.Number(
                label="Max Links Per Domain",
                value=0,
                info="Enter 0 to show all links, or a positive number to limit display per domain",
                minimum=0,
                maximum=1000
            )
        ],
        outputs=[
            gr.Textbox(
                label="Sitemap",
                lines=20,
                max_lines=50,
                show_copy_button=True,
                container=True
            ),
            gr.File(label="Download Sitemap")
        ],
        title="Configurable Sitemap Generator",
        description="Generate a sitemap with optional display limits (0 = show all links)",
        api_name="generate_sitemap_limited"
    )

    # Enhanced bulk extract interface with configurable limits
    bulk_limited_interface = gr.Interface(
        fn=extract_limited_content_as_zip,
        inputs=[
            gr.Textbox(
                label="Website URL",
                placeholder="https://example.com or example.com"
            ),
            gr.Number(
                label="Max Pages to Extract",
                value=0,
                info="Enter 0 to process all pages, or a positive number to limit extraction",
                minimum=0,
                maximum=1000
            )
        ],
        outputs=[
            gr.Textbox(
                label="Extraction Status",
                lines=10,
                max_lines=20,
                show_copy_button=True,
                container=True
            ),
            gr.File(label="Download ZIP Archive")
        ],
        title="Limited Bulk Content Extractor",
        description="Extract text content from internal links with optional processing limits (0 = extract all)",
        api_name="extract_limited_content"
    )

    # Combine into tabbed interface
    demo = gr.TabbedInterface(
        [scrape_interface, sitemap_interface, sitemap_limited_interface, bulk_extract_interface, bulk_limited_interface],
        ["Content Scraper", "All Links Sitemap", "Limited Sitemap", "Bulk Extractor", "Limited Bulk Extractor"],
        title="🕷️ Web Scraper MCP Server"
    )

    return demo


if __name__ == "__main__":
    # Create and launch the MCP server
    app = create_mcp_interface()
    app.launch(
        # server_name="0.0.0.0",
        server_port=7862,
        share=False,
        show_error=True,
        mcp_server=True
    )
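
For reference, a minimal sketch of how a client could call one of these endpoints over Gradio's HTTP API once the app is running. It assumes the server is reachable on localhost at port 7862 (as configured in app.launch above) and that the gradio_client package is installed; the exact endpoint name is assumed to follow the api_name values set in create_mcp_interface and may differ by Gradio version, so check client.view_api() for the names actually exposed.

from gradio_client import Client

# Connect to the locally running app (port 7862 is set in app.launch above).
client = Client("http://127.0.0.1:7862/")

# List the exposed endpoints and their parameters to confirm the endpoint name.
client.view_api()

# Call the content scraper; "/scrape_content" is assumed from the api_name
# set on scrape_interface. The interface has two outputs, so predict returns
# the markdown text and the path of the downloadable .md file.
markdown, file_path = client.predict("https://example.com", api_name="/scrape_content")
print(markdown[:500])
print("Saved markdown file:", file_path)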