Spaces:
Running
Running
Added recursion
Browse files
app.py
CHANGED
@@ -23,7 +23,8 @@ def scrape_and_convert(url, depth):
|
|
23 |
return f"Error fetching {url}: {str(e)}\n"
|
24 |
|
25 |
# Convert to Markdown
|
26 |
-
markdown_content = Converter.html_to_markdown(
|
|
|
27 |
html=html_content,
|
28 |
base_url=url,
|
29 |
parser_features='html.parser',
|
@@ -32,10 +33,12 @@ def scrape_and_convert(url, depth):
|
|
32 |
|
33 |
# If depth > 0, extract links and process them
|
34 |
if current_depth > 0:
|
35 |
-
links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL
|
|
|
36 |
for link in links:
|
37 |
-
|
38 |
-
|
|
|
39 |
|
40 |
return markdown_content
|
41 |
|
|
|
23 |
return f"Error fetching {url}: {str(e)}\n"
|
24 |
|
25 |
# Convert to Markdown
|
26 |
+
markdown_content = f"## Extracted from: {url}\n\n"
|
27 |
+
markdown_content += Converter.html_to_markdown(
|
28 |
html=html_content,
|
29 |
base_url=url,
|
30 |
parser_features='html.parser',
|
|
|
33 |
|
34 |
# If depth > 0, extract links and process them
|
35 |
if current_depth > 0:
|
36 |
+
links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL)
|
37 |
+
|
38 |
for link in links:
|
39 |
+
if link not in visited_urls:
|
40 |
+
markdown_content += f"\n\n### Extracted from: {link}\n"
|
41 |
+
markdown_content += recursive_scrape(link, current_depth - 1)
|
42 |
|
43 |
return markdown_content
|
44 |
|