CultriX committed on
Commit
ad147d8
·
1 Parent(s): c071d8b

Added recursion

Browse files
Files changed (1) hide show
  1. app.py +7 -4
app.py CHANGED
@@ -23,7 +23,8 @@ def scrape_and_convert(url, depth):
23
  return f"Error fetching {url}: {str(e)}\n"
24
 
25
  # Convert to Markdown
26
- markdown_content = Converter.html_to_markdown(
 
27
  html=html_content,
28
  base_url=url,
29
  parser_features='html.parser',
@@ -32,10 +33,12 @@ def scrape_and_convert(url, depth):
32
 
33
  # If depth > 0, extract links and process them
34
  if current_depth > 0:
35
- links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL, depth=current_depth, visited_urls=visited_urls)
 
36
  for link in links:
37
- markdown_content += f"\n\n## Extracted from: {link}\n"
38
- markdown_content += recursive_scrape(link, current_depth - 1)
 
39
 
40
  return markdown_content
41
 
 
23
  return f"Error fetching {url}: {str(e)}\n"
24
 
25
  # Convert to Markdown
26
+ markdown_content = f"## Extracted from: {url}\n\n"
27
+ markdown_content += Converter.html_to_markdown(
28
  html=html_content,
29
  base_url=url,
30
  parser_features='html.parser',
 
33
 
34
  # If depth > 0, extract links and process them
35
  if current_depth > 0:
36
+ links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL)
37
+
38
  for link in links:
39
+ if link not in visited_urls:
40
+ markdown_content += f"\n\n### Extracted from: {link}\n"
41
+ markdown_content += recursive_scrape(link, current_depth - 1)
42
 
43
  return markdown_content
44