# File size: 1,808 Bytes
# Revision: 1151f26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import argparse

from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor
from rag_scraper.scraper import Scraper
from rag_scraper.utils import URLUtils


def main(argv=None):
    """Run the RAGScraper command-line interface.

    Parses the command line and performs one action on the given URL:

    * ``--extract`` prints every link returned by ``LinkExtractor.scrape_url``
      for the selected element.
    * ``--convert`` fetches the page with ``Scraper.fetch_html`` and prints
      the result of ``Converter.html_to_markdown``.

    When both flags are given, only the extraction runs.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, so the
            plain ``main()`` call keeps working unchanged.

    Raises:
        SystemExit: With status 2 when the arguments are invalid or when
            neither ``--convert`` nor ``--extract`` is supplied.
    """
    parser = argparse.ArgumentParser(
        description="RAGScraper: A tool to scrape, extract links, and convert webpages to markdown."
    )

    parser.add_argument("url", help="The URL of the webpage to scrape.")
    parser.add_argument(
        "--element_id",
        help="The ID of the element to search for links.",
        default=None,
    )
    parser.add_argument(
        "--element_type",
        help='The type of the element to search for links. Default is "nav".',
        default="nav",
    )
    parser.add_argument(
        "--convert",
        help="Convert the webpage to markdown.",
        action="store_true",
    )
    parser.add_argument(
        "--extract",
        help="Extract links from the specified element.",
        action="store_true",
    )

    args = parser.parse_args(argv)

    if not (args.extract or args.convert):
        # A missing action is a usage error: report it on stderr together
        # with the usage line and exit non-zero instead of silently
        # succeeding, so shell scripts can detect the mistake.
        parser.error(
            "Please specify an action: --convert for markdown conversion or --extract for link extraction."
        )

    if args.extract:
        # --extract takes precedence when both flags are set.
        links = LinkExtractor.scrape_url(
            args.url,
            element_id=args.element_id,
            element_type=args.element_type,
        )
        print(f"Unique links for {args.url}:")
        for link in links:
            print(link)
        return

    # Only the conversion path needs the base URL, so it is derived here
    # rather than up front for every invocation.
    base_url = URLUtils.get_base_url(args.url)
    html_content = Scraper.fetch_html(args.url)
    markdown_content = Converter.html_to_markdown(html_content, base_url)
    print(markdown_content)


# Run the CLI only when this file is executed as a script; importing the
# module must not trigger argument parsing or any scraping.
if __name__ == "__main__":
    main()