timeki commited on
Commit
8f369fe
·
1 Parent(s): f79bf0a

add debuging for hf datasets queries

Browse files
climateqa/engine/talk_to_data/query.py CHANGED
@@ -3,6 +3,8 @@ from concurrent.futures import ThreadPoolExecutor
3
  import duckdb
4
  import pandas as pd
5
  import os
 
 
6
 
7
  def find_indicator_column(table: str, indicator_columns_per_table: dict[str,str]) -> str:
8
  """Retrieves the name of the indicator column within a table.
@@ -41,14 +43,88 @@ async def execute_sql_query(sql_query: str) -> pd.DataFrame:
41
  def _execute_query():
42
  # Execute the query
43
  con = duckdb.connect()
 
 
44
  HF_TTD_TOKEN = os.getenv("HF_TTD_TOKEN")
45
- con.execute(f"""CREATE SECRET hf_token (
46
- TYPE huggingface,
47
- TOKEN '{HF_TTD_TOKEN}'
48
- );""")
49
- results = con.execute(sql_query).fetchdf()
50
- # return fetched data
51
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  # Run the query in a thread pool to avoid blocking
54
  loop = asyncio.get_event_loop()
 
3
  import duckdb
4
  import pandas as pd
5
  import os
6
+ import requests
7
+ import tempfile
8
 
9
  def find_indicator_column(table: str, indicator_columns_per_table: dict[str,str]) -> str:
10
  """Retrieves the name of the indicator column within a table.
 
43
  def _execute_query():
44
  # Execute the query
45
  con = duckdb.connect()
46
+
47
+ # Try to use Hugging Face authentication if token is available
48
  HF_TTD_TOKEN = os.getenv("HF_TTD_TOKEN")
49
+
50
+ try:
51
+ if HF_TTD_TOKEN:
52
+ # Set up Hugging Face authentication - updated syntax
53
+ con.execute(f"""
54
+ CREATE SECRET IF NOT EXISTS hf_token (
55
+ TYPE HUGGINGFACE,
56
+ TOKEN '{HF_TTD_TOKEN}'
57
+ );
58
+ """)
59
+ print("Hugging Face authentication configured")
60
+
61
+ # Execute the query
62
+ results = con.execute(sql_query).fetchdf()
63
+ return results
64
+
65
+ except duckdb.HTTPException as e:
66
+ print(f"HTTP error accessing Hugging Face dataset: {e}")
67
+
68
+ # If we have a token but still get HTTP error, try without authentication
69
+ if HF_TTD_TOKEN:
70
+ print("Retrying without authentication...")
71
+ try:
72
+ # Create a new connection without the secret
73
+ con_no_auth = duckdb.connect()
74
+ results = con_no_auth.execute(sql_query).fetchdf()
75
+ return results
76
+ except Exception as e2:
77
+ print(f"Also failed without authentication: {e2}")
78
+
79
+ # Try to download the file locally and retry
80
+ print("Trying to download file locally and retry...")
81
+
82
+ # Extract the URL from the error message or construct it from the query
83
+ error_str = str(e)
84
+ url = None
85
+
86
+ if "HTTP GET error on '" in error_str:
87
+ url = error_str.split("HTTP GET error on '")[1].split("'")[0]
88
+ else:
89
+ # Try to extract URL from the SQL query
90
+ import re
91
+ url_match = re.search(r"'(https://huggingface\.co/[^']+)'", sql_query)
92
+ if url_match:
93
+ url = url_match.group(1)
94
+
95
+ if url:
96
+ table_name = url.split('/')[-1]
97
+ local_path = os.path.join(tempfile.gettempdir(), table_name)
98
+ print(f"Downloading {url} to {local_path}")
99
+
100
+ # Add authentication headers if token is available
101
+ headers = {}
102
+ if HF_TTD_TOKEN:
103
+ headers['Authorization'] = f'Bearer {HF_TTD_TOKEN}'
104
+
105
+ response = requests.get(url, headers=headers, stream=True)
106
+ if response.status_code == 200:
107
+ with open(local_path, 'wb') as f:
108
+ for chunk in response.iter_content(chunk_size=8192):
109
+ f.write(chunk)
110
+
111
+ # Modify the SQL query to use the local file
112
+ modified_sql = sql_query.replace(f"'{url}'", f"'{local_path}'")
113
+ results = con.execute(modified_sql).fetchdf()
114
+ return results
115
+ elif response.status_code == 401:
116
+ print("Authentication failed - check your HF_TTD_TOKEN")
117
+ raise Exception("Authentication failed. Please check your HF_TTD_TOKEN environment variable.")
118
+ else:
119
+ print(f"Failed to download file: {response.status_code}")
120
+ raise e
121
+ else:
122
+ print("Could not extract URL from error message")
123
+ raise e
124
+
125
+ except Exception as e:
126
+ print(f"Unexpected error: {e}")
127
+ raise e
128
 
129
  # Run the query in a thread pool to avoid blocking
130
  loop = asyncio.get_event_loop()
requirements.txt CHANGED
@@ -27,3 +27,4 @@ openai==1.61.1
27
  pydantic==2.9.2
28
  pydantic-settings==2.2.1
29
  geojson==3.2.0
 
 
27
  pydantic==2.9.2
28
  pydantic-settings==2.2.1
29
  geojson==3.2.0
30
+ requests==2.32.3