hadadrjt committed on
Commit bc90a07 · 1 Parent(s): 76f7f20

ai: Switch to Docker container.

* And use async for Deep Search.

Files changed (4)
  1. Dockerfile +27 -0
  2. README.md +6 -5
  3. requirements.txt +6 -0
  4. src/tools/deep_search.py +95 -58
Dockerfile ADDED
@@ -0,0 +1,27 @@
+ # Use the latest personal Ubuntu image as the starting point
+ FROM hadadrjt/ubuntu:latest
+
+ # Set the user to root to have full permissions during build and runtime
+ USER root
+
+ # Set the working directory inside the container to /usr/src/app
+ # All subsequent commands will be run in this directory
+ WORKDIR /usr/src/app
+
+ # Copy all files from the current directory on the host machine to the working directory in the container
+ COPY . .
+
+ # Install Python dependencies listed in requirements.txt without using cache to reduce image size
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Expose port 7860 so that it can be accessed from outside the container
+ EXPOSE 7860
+
+ # Set an environment variable to configure the Gradio server to listen on all network interfaces
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
+
+ # Clear any default entrypoint to allow CMD to run directly
+ ENTRYPOINT []
+
+ # Specify the default command to run the Python application when the container starts
+ CMD ["python", "app.py"]
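
The CMD above launches app.py, which is not part of this diff. Purely as a hedged illustration of how the EXPOSE 7860 and GRADIO_SERVER_NAME settings fit together, a minimal Gradio entry point might look like the sketch below; the echo handler and interface are placeholders, not the actual J.A.R.V.I.S. application.

# Illustrative placeholder for app.py (the real file is not shown in this commit).
# Gradio reads GRADIO_SERVER_NAME from the environment, so the container's
# ENV GRADIO_SERVER_NAME="0.0.0.0" makes the server reachable from outside the container.
import gradio as gr

def echo(message: str) -> str:
    # Placeholder handler; the real Space wires this to the model backend.
    return message

demo = gr.Interface(fn=echo, inputs="text", outputs="text")

if __name__ == "__main__":
    # Port 7860 matches EXPOSE 7860 in the Dockerfile and app_port in README.md.
    demo.launch(server_port=7860)
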
README.md CHANGED
@@ -2,12 +2,13 @@
  title: J.A.R.V.I.S.
  license: apache-2.0
  license_link: https://huggingface.co/hadadrjt/JARVIS/blob/main/LICENSE
- colorFrom: yellow
+ colorFrom: green
  colorTo: purple
- emoji: 🌍
- sdk: gradio
- sdk_version: 5.34.2
- app_file: app.py
+ emoji:
+ thumbnail: >-
+   https://cdn-uploads.huggingface.co/production/uploads/67b426629ec6943726101b92/ptiPI3_NVFdo2yaDtpvH3.jpeg
+ sdk: docker
+ app_port: 7860
  pinned: true
  short_description: Just a Rather Very Intelligent System
  models:
requirements.txt CHANGED
@@ -1,4 +1,10 @@
+ aiohttp[speedups]
  anyio
+ gradio
+ httpx
+ httpx[http2]
  pydantic
+ Pygments
  starlette
+ urllib3
  uvicorn
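
This diff does not show where the new httpx and httpx[http2] dependencies are used; the rewritten deep_search.py below relies on aiohttp. Purely as an illustration of what the http2 extra enables (it installs the h2 package), an HTTP/2-capable async client is typically created like this:

# Illustrative only; this snippet is not part of the commit.
# The httpx[http2] extra installs the h2 package, which http2=True requires.
import asyncio
import httpx

async def fetch(url: str) -> str:
    # Open an async client with HTTP/2 negotiation enabled and a 30-second timeout.
    async with httpx.AsyncClient(http2=True, timeout=30) as client:
        response = await client.get(url)
        response.raise_for_status()
        return response.text

if __name__ == "__main__":
    print(len(asyncio.run(fetch("https://example.com"))))
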
src/tools/deep_search.py CHANGED
@@ -3,22 +3,24 @@
  # SPDX-License-Identifier: Apache-2.0
  #

- import requests # Import the requests library to perform HTTP requests synchronously
+ import aiohttp # Import the aiohttp library to perform asynchronous HTTP requests
+ import asyncio # Import asyncio library to handle asynchronous operations and implement delay mechanisms
  from src.utils.ip_generator import generate_ip # Import function to generate random IP addresses for request headers

- # Define a class named SearchTools to encapsulate functionalities related to deep search
+ # Define the main SearchTools class that provides web searching and URL reading capabilities
  class SearchTools:
      # This class provides methods to connect to the web

      """
-     A class providing tools to perform web searches and read content from URLs using various search engines
-     and a reader API service.
+     A comprehensive class providing tools to perform web searches and read content from URLs using various search engines
+     and a reader API service. This class implements full asynchronous operations with robust retry mechanisms to ensure
+     connections remain active even when encountering errors.

      Attributes:
-         searxng_url (str): Base URL for the SearXNG search proxy service.
-         baidu_url (str): Base URL for Baidu search engine.
-         timeout (int): Timeout duration in seconds for HTTP requests.
-         reader_api (str): Base URL for the reader API service used to extract content from URLs.
+         searxng_url (str): Base URL for the SearXNG search proxy service that handles Google and other search engines.
+         baidu_url (str): Base URL for Baidu search engine for Chinese language searches.
+         timeout (int): Timeout duration in seconds for HTTP requests to prevent indefinite hanging.
+         reader_api (str): Base URL for the reader API service used to extract clean content from URLs.

      Methods:
          read_url(url): Asynchronously reads and returns the textual content of the specified URL using the reader API.
@@ -26,73 +28,108 @@ class SearchTools:
              returning the raw HTML response text.
      """

+     # Constructor method to initialize the SearchTools instance with all necessary configuration values
      def __init__(self):
          """
          Initialize the SearchTools instance with predefined URLs and timeout settings.
+         This method sets up all the base URLs and configuration parameters needed for web searching and content reading.
          """
-         self.searxng_url = "https://paulgo.io/search" # URL for the SearXNG search proxy service
-         self.baidu_url = "https://www.baidu.com/s" # URL for Baidu search engine
-         self.timeout = 30 # Timeout in seconds for HTTP requests to avoid long hanging connections
-         self.reader_api = "https://r.jina.ai/" # Reader API endpoint to extract readable content from URLs
+         # Set the base URL for SearXNG search proxy service which provides access to multiple search engines
+         self.searxng_url = "https://paulgo.io/search"
+         # Set the base URL for Baidu search engine for handling Chinese language queries
+         self.baidu_url = "https://www.baidu.com/s"
+         # Set timeout duration to 30 seconds to balance between allowing slow responses and preventing infinite waits
+         self.timeout = 30
+         # Set the reader API endpoint that converts web pages into clean, readable text format
+         self.reader_api = "https://r.jina.ai/"

+     # Private helper method that implements the core retry logic for all HTTP requests
+     async def _fetch_with_retry(self, session, method, url, **kwargs):
+         """
+         Helper method to perform HTTP requests with infinite retry until a valid response is obtained.
+         This method ensures that connections never fail permanently and will keep trying until success.
+
+         Args:
+             session (aiohttp.ClientSession): The aiohttp session object to use for making HTTP requests.
+             method (str): HTTP method to use for the request (e.g., 'get', 'post', 'put', 'delete').
+             url (str): The complete URL to send the request to.
+             **kwargs: Additional keyword arguments to pass to the aiohttp request method (headers, data, etc.).
+
+         Returns:
+             str: The response text content when a successful request is finally achieved.
+         """
+         # Create an infinite loop that will only break when a successful response is received
+         while True:
+             # Use a try-except block to catch any type of exception that might occur during the request
+             try:
+                 # Make the actual HTTP request using the provided session, method, URL and additional arguments
+                 async with session.request(method, url, **kwargs) as response:
+                     # Check if the response status indicates success, raise exception if it's an error status
+                     response.raise_for_status()
+                     # Return the text content of the successful response
+                     return await response.text()
+             # Catch any exception that occurs during the request process
+             except Exception:
+                 # Retry on any exception without stopping the loop or raising the error
+                 # Wait for 5 seconds before attempting the next retry to avoid overwhelming the server
+                 await asyncio.sleep(5)
+
+     # Public method to read and extract content from any given URL
      async def read_url(self, url: str) -> str:
          """
-         Asynchronously read and retrieve the textual content of a given URL using the reader API.
+         Asynchronously read and retrieve the textual content of a given URL using the reader API with infinite retry.
+         This method will keep trying until it successfully retrieves the content from the specified URL.

          Args:
-             url (str): The URL of the webpage to read content from.
+             url (str): The complete URL of the webpage to read content from.

          Returns:
-             str: The textual content extracted from the URL if successful.
-             None: If the request fails or an exception occurs.
+             str: The clean textual content extracted from the URL by the reader API service.
          """
-         try:
-             data = {"url": url} # Prepare POST data with the target URL
-             # Send a synchronous POST request to the reader API with the URL data and timeout
-             response = requests.post(self.reader_api, data=data, timeout=self.timeout)
-             response.raise_for_status() # Raise an exception if the response status is an HTTP error
-             return response.text # Return the textual content of the response
-         except Exception:
-             # Return None if any error occurs during the request or response processing
-             return None
+         # Prepare the POST data payload containing the target URL for the reader API
+         data = {"url": url}
+         # Create an aiohttp client session with the configured timeout settings
+         async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
+             # Use the retry helper method to POST the URL to the reader API and get the content
+             return await self._fetch_with_retry(session, 'post', self.reader_api, data=data)

+     # Public method to perform web searches using different search engines
      async def search(self, query: str, engine: str = "google") -> str:
          """
-         Asynchronously perform a web search for the given query using the specified search engine.
+         Asynchronously perform a web search for the given query using the specified search engine with infinite retry.
+         This method will keep trying until it successfully retrieves search results from the chosen search engine.

          Args:
-             query (str): The search query string.
-             engine (str, optional): The search engine to use. Supported values are "google" and "baidu".
-                 Defaults to "google".
+             query (str): The search query string containing the terms to search for.
+             engine (str, optional): The search engine to use for the search. Supported values are "google" and "baidu".
+                 Defaults to "google" if not specified.

          Returns:
-             str: The raw HTML content of the search results page if successful.
-             None: If the request fails or an exception occurs.
+             str: The raw HTML content of the search results page from the specified search engine.
          """
-         try:
-             if engine == "baidu":
-                 # Construct the URL for Baidu search by appending the query parameter 'wd' with the search term
-                 url = f"{self.reader_api}{self.baidu_url}?wd={query}"
-                 # Set the HTTP header to target the main content container of Baidu search results
-                 headers = {
-                     "X-Target-Selector": "#content_left",
-                     "X-Forwarded-For": generate_ip() # Random IP address to simulate different client origins
-                 }
-             else:
-                 # For Google or other engines, define a prefix for the search command (!go for Google, !bi for Bing)
-                 prefix = "!go" if engine == "google" else "!bi"
-                 # Construct the URL for SearXNG search proxy with the prefixed query
-                 url = f"{self.reader_api}{self.searxng_url}?q={prefix} {query}"
-                 # Set the HTTP header to target the URLs container in the search results
-                 headers = {
-                     "X-Target-Selector": "#urls",
-                     "X-Forwarded-For": generate_ip() # Random IP address to simulate different client origins
-                 }
-
-             # Send a synchronous GET request to the constructed URL with headers and timeout
-             response = requests.get(url, headers=headers, timeout=self.timeout)
-             response.raise_for_status() # Raise an exception if the response status is an HTTP error
-             return response.text # Return the raw HTML content of the search results
-         except Exception:
-             # Return None if any error occurs during the request or response processing
-             return None
+         # Check if the user wants to use Baidu search engine for the query
+         if engine == "baidu":
+             # Construct the full URL by combining reader API, Baidu URL and the search query parameter
+             url = f"{self.reader_api}{self.baidu_url}?wd={query}"
+             # Set HTTP headers specific to Baidu search results extraction
+             headers = {
+                 # Target the main content container where Baidu displays search results
+                 "X-Target-Selector": "#content_left",
+                 "X-Forwarded-For": generate_ip() # Random IP address to simulate different client origins
+             }
+         # Handle all other search engines (Google, Bing, etc.) through SearXNG proxy
+         else:
+             # Determine the search prefix based on the requested engine (Google or Bing)
+             prefix = "!go" if engine == "google" else "!bi"
+             # Construct the full URL by combining reader API, SearXNG URL, prefix and query
+             url = f"{self.reader_api}{self.searxng_url}?q={prefix} {query}"
+             # Set HTTP headers specific to SearXNG search results extraction
+             headers = {
+                 # Target the URLs container where SearXNG displays search result links
+                 "X-Target-Selector": "#urls",
+                 "X-Forwarded-For": generate_ip() # Random IP address to simulate different client origins
+             }
+         # Create an aiohttp client session with the configured timeout settings
+         async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
+             # Use the retry helper method to GET the search results and return the HTML content
+             return await self._fetch_with_retry(session, 'get', url, headers=headers)
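
For context, here is a minimal usage sketch of the new asynchronous API. It is not part of the commit; the import path assumes the repository root is on PYTHONPATH, and the query and URL are placeholders.

# Hypothetical usage sketch (not part of this commit).
import asyncio

from src.tools.deep_search import SearchTools

async def main():
    tools = SearchTools()

    # Run a SearXNG-backed Google search and a Baidu search concurrently.
    google_html, baidu_html = await asyncio.gather(
        tools.search("gradio docker deployment", engine="google"),
        tools.search("gradio docker", engine="baidu"),
    )

    # Read a page through the reader API; placeholder URL.
    page_text = await tools.read_url("https://example.com")

    print(len(google_html), len(baidu_html), len(page_text))

if __name__ == "__main__":
    asyncio.run(main())

Note the design choice in _fetch_with_retry: it retries forever on any exception, so a caller that needs an upper bound on waiting would have to impose one itself, for example by wrapping the call in asyncio.wait_for.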