oberbics committed on
Commit
44e10c6
·
verified ·
1 Parent(s): fd03c15

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +227 -433
app.py CHANGED
@@ -4,7 +4,7 @@ import requests
4
  import os
5
  import pandas as pd
6
  import folium
7
- from folium.plugins import MeasureControl, Fullscreen, MarkerCluster, Search
8
  from geopy.geocoders import Nominatim
9
  from geopy.exc import GeocoderTimedOut, GeocoderServiceError
10
  import time
@@ -18,30 +18,9 @@ warnings.filterwarnings("ignore")
18
 
19
  # Map Tile Providers with reliable sources
20
  MAP_TILES = {
21
- "Satellite": {
22
- "url": "https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}",
23
- "attr": "Esri",
24
- "fallback": "https://server.arcgisonline.com/ArcGIS/rest/services/World_Topo_Map/MapServer/tile/{z}/{y}/{x}"
25
- },
26
- "Topographic": {
27
- "url": "https://server.arcgisonline.com/ArcGIS/rest/services/World_Topo_Map/MapServer/tile/{z}/{y}/{x}",
28
- "attr": "Esri",
29
- "fallback": "https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png"
30
- },
31
- "OpenStreetMap": {
32
- "url": "https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png",
33
- "attr": "OpenStreetMap",
34
- "fallback": None
35
- },
36
- "Terrain": {
37
- "url": "https://server.arcgisonline.com/ArcGIS/rest/services/World_Terrain_Base/MapServer/tile/{z}/{y}/{x}",
38
- "attr": "Esri",
39
- "fallback": None
40
- },
41
  "Toner": {
42
  "url": "https://tiles.stadiamaps.com/tiles/stamen_toner/{z}/{x}/{y}.png",
43
- "attr": "Stadia Maps",
44
- "fallback": None
45
  }
46
  }
47
 
@@ -49,126 +28,132 @@ MAP_TILES = {
49
  API_URL = "https://api-inference.huggingface.co/models/numind/NuExtract-1.5"
50
  headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}
51
 
52
- # Geocoding Service
53
- class GeocodingService:
54
- def __init__(self, user_agent: str = None, timeout: int = 10, rate_limit: float = 1.1):
55
- if user_agent is None:
56
- user_agent = f"python_geocoding_script_{random.randint(1000, 9999)}"
57
-
58
- self.geolocator = Nominatim(
59
- user_agent=user_agent,
60
- timeout=timeout
61
- )
62
- self.rate_limit = rate_limit
63
  self.last_request = 0
64
- self.cache = {} # Simple in-memory cache
65
-
66
- def _rate_limit_wait(self):
67
  current_time = time.time()
68
- time_since_last = current_time - self.last_request
69
- if time_since_last < self.rate_limit:
70
- time.sleep(self.rate_limit - time_since_last)
71
  self.last_request = time.time()
72
-
73
- def geocode_location(self, location: str, max_retries: int = 3) -> Optional[Tuple[float, float]]:
74
- # Check cache first
 
 
 
 
75
  if location in self.cache:
76
  return self.cache[location]
77
-
78
- for attempt in range(max_retries):
79
- try:
80
- self._rate_limit_wait()
81
- location_data = self.geolocator.geocode(location)
82
- if location_data:
83
- # Store in cache and return
84
- self.cache[location] = (location_data.latitude, location_data.longitude)
85
- return self.cache[location]
86
- # Cache None results too
87
- self.cache[location] = None
88
- return None
89
- except (GeocoderTimedOut, GeocoderServiceError) as e:
90
- if attempt == max_retries - 1:
91
- print(f"Failed to geocode '{location}' after {max_retries} attempts: {e}")
92
- self.cache[location] = None
93
- return None
94
- time.sleep(2 ** attempt) # Exponential backoff
95
- except Exception as e:
96
- print(f"Error geocoding '{location}': {e}")
97
- self.cache[location] = None
98
- return None
99
- return None
100
-
101
- def process_locations(self, locations: str) -> List[Optional[Tuple[float, float]]]:
102
- if pd.isna(locations) or not locations:
103
- return []
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  try:
106
- # First try to intelligently parse
107
- import re
108
- pattern = r"([^,]+(?:,\s*[A-Za-z]+)?)"
109
- matches = re.findall(pattern, locations)
110
- location_list = [match.strip() for match in matches if match.strip()]
111
 
112
- # If regex finds nothing, fall back to simple comma splitting
113
- if not location_list:
114
- location_list = [loc.strip() for loc in locations.split(',') if loc.strip()]
115
-
116
- # For debugging
117
- print(f"Parsed '{locations}' into: {location_list}")
118
-
119
- return [self.geocode_location(loc) for loc in location_list]
120
- except Exception as e:
121
- print(f"Error parsing locations '{locations}': {e}")
122
- # Fall back to simple method
123
- location_list = [loc.strip() for loc in locations.split(',') if loc.strip()]
124
- return [self.geocode_location(loc) for loc in location_list]
125
 
126
- def create_reliable_map(df, location_col):
127
- """Create a map with multiple layer options and better error handling"""
128
-
129
- # Set default tile
130
- default_tile_name = "Toner"
131
-
132
- # Initialize map
133
  m = folium.Map(location=[20, 0], zoom_start=2, control_scale=True)
134
 
135
- # Add all tile layers with the appropriate one active, but no layer control
136
- for name, config in MAP_TILES.items():
137
- folium.TileLayer(
138
- tiles=config["url"],
139
- attr=f"{config['attr']} ({name})",
140
- name=name,
141
- overlay=False,
142
- control=False, # Disable tile layer in controls
143
- show=(name == default_tile_name) # Only show the default layer initially
144
- ).add_to(m)
145
 
146
- # Add plugins for better user experience
147
  Fullscreen().add_to(m)
148
  MeasureControl(position='topright', primary_length_unit='kilometers').add_to(m)
149
 
150
- # Add markers
151
  geocoder = SafeGeocoder()
152
  coords = []
153
-
154
- # Create marker cluster for better performance with many points
155
  marker_cluster = MarkerCluster(name="Locations").add_to(m)
156
-
157
- # Process each location
158
  processed_count = 0
 
159
  for idx, row in df.iterrows():
160
  if pd.isna(row[location_col]):
161
  continue
162
 
163
  location = str(row[location_col]).strip()
164
 
165
- # Get additional info if available
166
  additional_info = ""
167
  for col in df.columns:
168
  if col != location_col and not pd.isna(row[col]):
169
  additional_info += f"<br><b>{col}:</b> {row[col]}"
170
 
171
- # Parse multiple locations if comma-separated
172
  try:
173
  locations = [loc.strip() for loc in location.split(',') if loc.strip()]
174
  if not locations:
@@ -178,10 +163,8 @@ def create_reliable_map(df, location_col):
178
 
179
  # Process each location
180
  for loc in locations:
181
- # Geocode location
182
  point = geocoder.get_coords(loc)
183
  if point:
184
- # Create popup content
185
  popup_content = f"""
186
  <div style="min-width: 200px; max-width: 300px">
187
  <h4 style="font-family: 'Source Sans Pro', sans-serif; margin-bottom: 5px;">{loc}</h4>
@@ -191,7 +174,6 @@ def create_reliable_map(df, location_col):
191
  </div>
192
  """
193
 
194
- # Add marker
195
  folium.Marker(
196
  location=point,
197
  popup=folium.Popup(popup_content, max_width=300),
@@ -202,61 +184,11 @@ def create_reliable_map(df, location_col):
202
  coords.append(point)
203
  processed_count += 1
204
 
205
- # Layer control - removed as requested
206
- # folium.LayerControl(collapsed=False).add_to(m)
207
-
208
- # Set bounds if we have coordinates
209
  if coords:
210
  m.fit_bounds(coords)
211
 
212
- # Add better tile error handling with JavaScript
213
- m.get_root().html.add_child(folium.Element("""
214
- <script>
215
- // Wait for the map to be fully loaded
216
- document.addEventListener('DOMContentLoaded', function() {
217
- setTimeout(function() {
218
- // Get the map instance
219
- var maps = document.querySelectorAll('.leaflet-container');
220
- if (maps.length > 0) {
221
- var map = maps[0];
222
-
223
- // Add error handler for tiles
224
- var layers = map.querySelectorAll('.leaflet-tile-pane .leaflet-layer');
225
- for (var i = 0; i < layers.length; i++) {
226
- var layer = layers[i];
227
- var tiles = layer.querySelectorAll('.leaflet-tile');
228
-
229
- // Check if layer has no loaded tiles
230
- var loadedTiles = layer.querySelectorAll('.leaflet-tile-loaded');
231
- if (tiles.length > 0 && loadedTiles.length === 0) {
232
- // Force switch to OpenStreetMap if current layer failed
233
- var osmButton = document.querySelector('.leaflet-control-layers-list input[type="radio"]:nth-child(3)');
234
- if (osmButton) {
235
- osmButton.click();
236
- }
237
- console.log("Switched to fallback tile layer due to loading issues");
238
- }
239
- }
240
- }
241
- }, 3000); // Wait 3 seconds for tiles to load
242
- });
243
- </script>
244
-
245
- <style>
246
- .leaflet-popup-content {
247
- font-family: 'Source Sans Pro', sans-serif;
248
- }
249
- .leaflet-popup-content h4 {
250
- font-weight: 600;
251
- margin-bottom: 8px;
252
- }
253
- .leaflet-control-layers {
254
- font-family: 'Source Sans Pro', sans-serif;
255
- }
256
- </style>
257
- """))
258
-
259
- # Add custom CSS for better fonts
260
  custom_css = """
261
  <style>
262
  @import url('https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@400;600&display=swap');
@@ -269,79 +201,33 @@ def create_reliable_map(df, location_col):
269
 
270
  return m._repr_html_(), processed_count
271
 
272
- # SafeGeocoder with better error handling
273
- class SafeGeocoder:
274
- def __init__(self):
275
- user_agent = f"location_mapper_v1_{random.randint(1000, 9999)}"
276
- self.geolocator = Nominatim(user_agent=user_agent, timeout=10)
277
- self.cache = {} # Simple cache to avoid repeated requests
278
- self.last_request = 0
279
-
280
- def _respect_rate_limit(self):
281
- # Ensure at least 1 second between requests
282
- current_time = time.time()
283
- elapsed = current_time - self.last_request
284
- if elapsed < 1.0:
285
- time.sleep(1.0 - elapsed)
286
- self.last_request = time.time()
287
-
288
- def get_coords(self, location: str):
289
- if not location or pd.isna(location):
290
- return None
291
-
292
- # Convert to string if needed
293
- location = str(location).strip()
294
-
295
- # Check cache first
296
- if location in self.cache:
297
- return self.cache[location]
298
-
299
- try:
300
- self._respect_rate_limit()
301
- result = self.geolocator.geocode(location)
302
- if result:
303
- coords = (result.latitude, result.longitude)
304
- self.cache[location] = coords
305
- return coords
306
- self.cache[location] = None
307
- return None
308
- except Exception as e:
309
- print(f"Geocoding error for '{location}': {e}")
310
- self.cache[location] = None
311
- return None
312
-
313
  def process_excel(file, places_column):
314
- # Check if file is None
315
  if file is None:
316
  return None, "No file uploaded", None
317
 
318
  try:
319
- # Handle various file object types that Gradio might provide
320
  if hasattr(file, 'name'):
321
- # Gradio file object
322
  df = pd.read_excel(file.name)
323
  elif isinstance(file, bytes):
324
- # Raw bytes
325
  df = pd.read_excel(io.BytesIO(file))
326
  else:
327
- # Assume it's a filepath string
328
  df = pd.read_excel(file)
329
 
330
- # Print column names for debugging
331
  print(f"Columns in Excel file: {list(df.columns)}")
332
 
333
  if places_column not in df.columns:
334
  return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
335
 
336
  # Create map
337
- map_html, processed_count = create_reliable_map(df, places_column)
338
 
339
  # Save processed data
340
  with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
341
  processed_path = tmp.name
342
  df.to_excel(processed_path, index=False)
343
 
344
- # Generate stats
345
  total_locations = df[places_column].count()
346
  success_rate = (processed_count / total_locations * 100) if total_locations > 0 else 0
347
 
@@ -354,184 +240,165 @@ def process_excel(file, places_column):
354
  print(f"Error processing file: {e}\n{trace}")
355
  return None, f"Error processing file: {str(e)}", None
356
 
357
- def process_and_map(file, column):
358
- if file is None:
359
- return None, "Please upload an Excel file", None
360
-
361
- try:
362
- map_html, stats, processed_path = process_excel(file, column)
363
-
364
- if map_html and processed_path:
365
- # Create responsive container for the map
366
- responsive_html = f"""
367
- <div style="width:100%; height:70vh; margin:0; padding:0; border:1px solid #e0e0e0; border-radius:8px; overflow:hidden;">
368
- {map_html}
369
- </div>
370
- """
371
- return responsive_html, stats, processed_path
372
- else:
373
- return None, stats, None
374
- except Exception as e:
375
- import traceback
376
- trace = traceback.format_exc()
377
- print(f"Error in process_and_map: {e}\n{trace}")
378
- return None, f"Error: {str(e)}", None
379
 
380
- # NuExtract Functions
381
- def extract_info(template, text):
382
- try:
383
- # Format prompt according to NuExtract-1.5 requirements
384
- prompt = f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"
385
-
386
- # Call API
387
- payload = {
388
- "inputs": prompt,
389
- "parameters": {
390
- "max_new_tokens": 1000,
391
- "do_sample": False
392
- }
393
- }
394
-
395
- response = requests.post(API_URL, headers=headers, json=payload)
396
-
397
- # If the model is loading, inform the user
398
- if response.status_code == 503:
399
- response_json = response.json()
400
- if "error" in response_json and "loading" in response_json["error"]:
401
- estimated_time = response_json.get("estimated_time", "unknown")
402
- return f"⏳ Model is loading (ETA: {int(float(estimated_time)) if isinstance(estimated_time, (int, float, str)) else 'unknown'} seconds)", "Please try again in a few minutes"
403
-
404
- if response.status_code != 200:
405
- return f"❌ API Error: {response.status_code}", response.text
406
-
407
- # Process result
408
- result = response.json()
409
-
410
- # Handle different response formats
411
- try:
412
- if isinstance(result, list):
413
- if len(result) > 0:
414
- result_text = result[0].get("generated_text", "")
415
- else:
416
- return "❌ Empty result list", "{}"
417
- else:
418
- result_text = str(result)
419
-
420
- # Split at output marker if present
421
- if "<|output|>" in result_text:
422
- parts = result_text.split("<|output|>")
423
- if len(parts) > 1:
424
- json_text = parts[1].strip()
425
- else:
426
- json_text = result_text
427
- else:
428
- json_text = result_text
429
-
430
- # Try to parse as JSON
431
- try:
432
- extracted = json.loads(json_text)
433
- formatted = json.dumps(extracted, indent=2)
434
- except json.JSONDecodeError:
435
- return "❌ JSON parsing error", json_text
436
-
437
- return "βœ… Success", formatted
438
- except Exception as inner_e:
439
- return f"❌ Error processing result: {str(inner_e)}", "{}"
440
- except Exception as e:
441
- return f"❌ Error: {str(e)}", "{}"
442
-
443
- # Custom CSS for improved styling
444
  custom_css = """
445
  <style>
446
  @import url('https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@300;400;600;700&display=swap');
447
 
448
- :root {
449
- --primary-color: #2c6bb3;
450
- --secondary-color: #4e8fd1;
451
- --background-color: #f7f9fc;
452
- --text-color: #333333;
453
- --border-color: #e0e0e0;
454
- }
455
-
456
  body, .gradio-container {
457
  font-family: 'Source Sans Pro', sans-serif !important;
458
- background-color: var(--background-color);
459
- color: var(--text-color);
460
  }
461
 
462
  h1 {
463
  font-weight: 700 !important;
464
- color: var(--primary-color) !important;
465
  font-size: 2.5rem !important;
466
  margin-bottom: 1rem !important;
467
  }
468
 
469
  h2 {
470
  font-weight: 600 !important;
471
- color: var(--secondary-color) !important;
472
  font-size: 1.5rem !important;
473
  margin-top: 1rem !important;
474
  margin-bottom: 0.75rem !important;
475
  }
476
 
477
  .gradio-button.primary {
478
- background-color: var(--primary-color) !important;
479
- }
480
-
481
- .gradio-button.primary:hover {
482
- background-color: var(--secondary-color) !important;
483
- }
484
-
485
- .gradio-tab-nav button {
486
- font-family: 'Source Sans Pro', sans-serif !important;
487
- font-weight: 600 !important;
488
- }
489
-
490
- .gradio-tab-nav button.selected {
491
- color: var(--primary-color) !important;
492
- border-color: var(--primary-color) !important;
493
  }
494
 
495
  .info-box {
496
  background-color: #e8f4fd;
497
- border-left: 4px solid var(--primary-color);
498
  padding: 15px;
499
  margin: 15px 0;
500
  border-radius: 4px;
501
  }
502
 
503
- .stats-box {
504
- background-color: white;
505
- border: 1px solid var(--border-color);
506
- border-radius: 8px;
507
- padding: 15px;
508
- font-size: 1rem;
509
- line-height: 1.5;
510
- }
511
-
512
- .subtle-text {
513
- font-size: 0.9rem;
514
- color: #666;
515
- font-style: italic;
516
- }
517
-
518
  .file-upload-box {
519
- border: 2px dashed var(--border-color);
520
  border-radius: 8px;
521
  padding: 20px;
522
  text-align: center;
523
  transition: all 0.3s ease;
524
  }
525
-
526
- .file-upload-box:hover {
527
- border-color: var(--primary-color);
528
- }
529
-
530
  </style>
531
  """
532
 
533
- # Create the Gradio interface
534
- with gr.Blocks(css=custom_css) as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  gr.HTML("""
536
  <div style="text-align: center; margin-bottom: 1rem">
537
  <h1>Historical Data Analysis Tools</h1>
@@ -539,87 +406,14 @@ with gr.Blocks(css=custom_css) as demo:
539
  </div>
540
  """)
541
 
542
- with gr.Tabs():
543
  with gr.TabItem("πŸ” Text Extraction"):
544
- gr.HTML("""
545
- <div class="info-box">
546
- <h3 style="margin-top: 0;">Extract Structured Data from Text</h3>
547
- <p>Use NuExtract-1.5 to automatically extract structured information from historical texts. Define the JSON template for the data you want to extract.</p>
548
- </div>
549
- """)
550
-
551
- with gr.Row():
552
- with gr.Column():
553
- template = gr.Textbox(
554
- label="JSON Template",
555
- value='{"earthquake location": "", "dateline location": ""}',
556
- lines=5,
557
- placeholder="Define the fields you want to extract as a JSON template"
558
- )
559
- text = gr.Textbox(
560
- label="Text to Extract From",
561
- value="Neues Erdbeben in Japan. Aus Tokio wird berichtet, daß in Yokohama bei einem Erdbeben sechs Personen getâtet und 22 verwundet, in Tokio vier getâtet und 22 verwundet wurden. In Yokohama seien 6VV HÀuser zerstârt worden. Die telephonische und telegraphische Verbindung zwischen Tokio und Osaka ist unterbrochen worden. Der Trambahnverkehr in Tokio liegt still. Auch der Eisenbahnverkehr zwischen Tokio und Yokohama ist unterbrochen. In Sngamo, einer Vorstadt von Tokio sind BrÀnde ausgebrochen. Ein Eisenbahnzug stürzte in den Vajugawafluß zwischen Gotemba und Tokio. Sechs Züge wurden umgeworfen. Mit dem letzten japanischen Erdbeben sind seit eineinhalb Jahrtausenden bis heute in Japan 229 grâßere Erdbeben zu verzeichnen gewesen.",
562
- lines=8,
563
- placeholder="Enter the text you want to extract information from"
564
- )
565
- extract_btn = gr.Button("Extract Information", variant="primary", size="lg")
566
-
567
- with gr.Column():
568
- status = gr.Textbox(
569
- label="Status",
570
- elem_classes="stats-box"
571
- )
572
- output = gr.Textbox(
573
- label="Extracted Data",
574
- elem_classes="stats-box",
575
- lines=10
576
- )
577
 
578
- extract_btn.click(
579
- fn=extract_info,
580
- inputs=[template, text],
581
- outputs=[status, output]
582
- )
583
-
584
  with gr.TabItem("πŸ“ Location Mapping"):
585
- gr.HTML("""
586
- <div class="info-box">
587
- <h3 style="margin-top: 0;">Map Your Historical Locations</h3>
588
- <p>Upload an Excel file containing location data to create an interactive map visualization. The tool will geocode your locations and display them on a customizable map.</p>
589
- </div>
590
- """)
591
-
592
- with gr.Row():
593
- with gr.Column():
594
- template = gr.Textbox(
595
- label="JSON Template",
596
- value='{"earthquake location": "", "dateline location": ""}',
597
- lines=5,
598
- placeholder="Define the fields you want to extract as a JSON template"
599
- )
600
- text = gr.Textbox(
601
- label="Text to Extract From",
602
- value="Neues Erdbeben in Japan. Aus Tokio wird berichtet, daß in Yokohama bei einem Erdbeben sechs Personen getâtet und 22 verwundet, in Tokio vier getâtet und 22 verwundet wurden. In Yokohama seien 6VV HÀuser zerstârt worden. Die telephonische und telegraphische Verbindung zwischen Tokio und Osaka ist unterbrochen worden. Der Trambahnverkehr in Tokio liegt still. Auch der Eisenbahnverkehr zwischen Tokio und Yokohama ist unterbrochen. In Sngamo, einer Vorstadt von Tokio sind BrÀnde ausgebrochen. Ein Eisenbahnzug stürzte in den Vajugawafluß zwischen Gotemba und Tokio. Sechs Züge wurden umgeworfen. Mit dem letzten japanischen Erdbeben sind seit eineinhalb Jahrtausenden bis heute in Japan 229 grâßere Erdbeben zu verzeichnen gewesen.",
603
- lines=8,
604
- placeholder="Enter the text you want to extract information from"
605
- )
606
- extract_btn = gr.Button("Extract Information", variant="primary", size="lg")
607
-
608
- with gr.Column():
609
- status = gr.Textbox(
610
- label="Status",
611
- elem_classes="stats-box"
612
- )
613
- output = gr.JSON(
614
- label="Extracted Data",
615
- elem_classes="stats-box"
616
- )
617
-
618
- extract_btn.click(
619
- fn=extract_info,
620
- inputs=[template, text],
621
- outputs=[status, output]
622
- )
623
 
624
  gr.HTML("""
625
  <div style="text-align: center; margin-top: 2rem; padding-top: 1rem; border-top: 1px solid #eee; font-size: 0.9rem; color: #666;">
@@ -628,4 +422,4 @@ with gr.Blocks(css=custom_css) as demo:
628
  """)
629
 
630
  if __name__ == "__main__":
631
- demo.launch()
 
4
  import os
5
  import pandas as pd
6
  import folium
7
+ from folium.plugins import MeasureControl, Fullscreen, MarkerCluster
8
  from geopy.geocoders import Nominatim
9
  from geopy.exc import GeocoderTimedOut, GeocoderServiceError
10
  import time
 
18
 
19
  # Map Tile Providers with reliable sources
20
  MAP_TILES = {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  "Toner": {
22
  "url": "https://tiles.stadiamaps.com/tiles/stamen_toner/{z}/{x}/{y}.png",
23
+ "attr": "Stadia Maps"
 
24
  }
25
  }
26
 
 
28
  API_URL = "https://api-inference.huggingface.co/models/numind/NuExtract-1.5"
29
  headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}
30
 
31
class SafeGeocoder:
    """Rate-limited, caching wrapper around the Nominatim geocoder.

    Lookups are spaced at least ``MIN_REQUEST_INTERVAL`` seconds apart and
    results are memoised per (stripped) location string, so a place name that
    occurs in many rows is only requested once.
    """

    # Minimum number of seconds between two consecutive geocoding requests.
    MIN_REQUEST_INTERVAL = 1.0

    def __init__(self):
        # A randomised suffix keeps separate app instances from sharing one
        # user-agent string.
        user_agent = f"location_mapper_v1_{random.randint(1000, 9999)}"
        self.geolocator = Nominatim(user_agent=user_agent, timeout=10)
        self.cache = {}
        self.last_request = 0

    def _respect_rate_limit(self):
        """Sleep just long enough to keep requests MIN_REQUEST_INTERVAL apart."""
        elapsed = time.time() - self.last_request
        if elapsed < self.MIN_REQUEST_INTERVAL:
            time.sleep(self.MIN_REQUEST_INTERVAL - elapsed)
        self.last_request = time.time()

    def get_coords(self, location: str):
        """Return ``(latitude, longitude)`` for *location*, or ``None``.

        ``None`` is returned for missing/blank input, when the geocoder finds
        no match, and when the lookup raises. "No match" answers are cached;
        failed lookups are not, so a later occurrence of the same place can
        be retried instead of being stuck with a transient error.
        """
        if not location or pd.isna(location):
            return None

        location = str(location).strip()
        if not location:
            # Whitespace-only input: nothing to look up, do not spend a request.
            return None

        if location in self.cache:
            return self.cache[location]

        try:
            self._respect_rate_limit()
            result = self.geolocator.geocode(location)
        except Exception as e:
            # Deliberately not cached: timeouts and service errors are
            # transient and say nothing about whether the place exists.
            print(f"Geocoding error for '{location}': {e}")
            return None

        coords = (result.latitude, result.longitude) if result else None
        self.cache[location] = coords
        return coords
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
+ # NuExtract Functions
69
def extract_info(template, text):
    """Run NuExtract-1.5 on *text* using the JSON *template*.

    Args:
        template: JSON template (as a string) naming the fields to extract.
        text: Source text to extract the fields from.

    Returns:
        A ``(status, output)`` pair of strings. On success ``output`` is the
        extracted data as pretty-printed JSON; otherwise it carries the raw
        response text, a hint for the user, or ``"{}"``. No exception is
        propagated to the caller: every failure is reported through
        ``status``.
    """
    try:
        # Format prompt according to NuExtract-1.5 requirements
        prompt = f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"

        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": 1000,
                "do_sample": False,
            },
        }

        # Without a timeout a stalled connection would block the UI forever;
        # a timeout surfaces as an error status through the handler below.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=120)

        # If the model is loading, inform the user
        if response.status_code == 503:
            try:
                error_info = response.json()
            except ValueError:
                # Non-JSON 503 body: fall through to the generic API error.
                error_info = {}
            if isinstance(error_info, dict) and "loading" in str(error_info.get("error", "")):
                try:
                    eta = str(int(float(error_info.get("estimated_time"))))
                except (TypeError, ValueError):
                    # Missing or non-numeric estimate.
                    eta = "unknown"
                return f"⏳ Model is loading (ETA: {eta} seconds)", "Please try again in a few minutes"

        if response.status_code != 200:
            return f"❌ API Error: {response.status_code}", response.text

        result = response.json()

        # Expected shape is a list of {"generated_text": ...}; anything else is
        # stringified so the user can still see what came back.
        if isinstance(result, list) and result and isinstance(result[0], dict):
            result_text = result[0].get("generated_text", "")
        else:
            result_text = str(result)

        # The generated text may echo the prompt; keep what follows the marker.
        if "<|output|>" in result_text:
            json_text = result_text.split("<|output|>")[1].strip()
        else:
            json_text = result_text

        try:
            extracted = json.loads(json_text)
        except json.JSONDecodeError:
            return "❌ JSON parsing error", json_text

        return "✅ Success", json.dumps(extracted, indent=2)
    except Exception as e:
        # UI boundary: report the failure instead of crashing the Gradio callback.
        return f"❌ Error: {str(e)}", "{}"
 
 
 
 
 
 
 
 
 
 
120
 
121
+ def create_map(df, location_col):
122
+ # Initialize map with Toner style
 
 
 
 
 
123
  m = folium.Map(location=[20, 0], zoom_start=2, control_scale=True)
124
 
125
+ # Add the single tile layer without controls
126
+ folium.TileLayer(
127
+ tiles=MAP_TILES["Toner"]["url"],
128
+ attr=MAP_TILES["Toner"]["attr"],
129
+ name="Toner",
130
+ overlay=False,
131
+ control=False
132
+ ).add_to(m)
 
 
133
 
134
+ # Add plugins
135
  Fullscreen().add_to(m)
136
  MeasureControl(position='topright', primary_length_unit='kilometers').add_to(m)
137
 
138
+ # Process markers
139
  geocoder = SafeGeocoder()
140
  coords = []
 
 
141
  marker_cluster = MarkerCluster(name="Locations").add_to(m)
 
 
142
  processed_count = 0
143
+
144
  for idx, row in df.iterrows():
145
  if pd.isna(row[location_col]):
146
  continue
147
 
148
  location = str(row[location_col]).strip()
149
 
150
+ # Get additional info
151
  additional_info = ""
152
  for col in df.columns:
153
  if col != location_col and not pd.isna(row[col]):
154
  additional_info += f"<br><b>{col}:</b> {row[col]}"
155
 
156
+ # Parse locations
157
  try:
158
  locations = [loc.strip() for loc in location.split(',') if loc.strip()]
159
  if not locations:
 
163
 
164
  # Process each location
165
  for loc in locations:
 
166
  point = geocoder.get_coords(loc)
167
  if point:
 
168
  popup_content = f"""
169
  <div style="min-width: 200px; max-width: 300px">
170
  <h4 style="font-family: 'Source Sans Pro', sans-serif; margin-bottom: 5px;">{loc}</h4>
 
174
  </div>
175
  """
176
 
 
177
  folium.Marker(
178
  location=point,
179
  popup=folium.Popup(popup_content, max_width=300),
 
184
  coords.append(point)
185
  processed_count += 1
186
 
187
+ # Set bounds
 
 
 
188
  if coords:
189
  m.fit_bounds(coords)
190
 
191
+ # Add custom font CSS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  custom_css = """
193
  <style>
194
  @import url('https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@400;600&display=swap');
 
201
 
202
  return m._repr_html_(), processed_count
203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  def process_excel(file, places_column):
 
205
  if file is None:
206
  return None, "No file uploaded", None
207
 
208
  try:
209
+ # Handle file
210
  if hasattr(file, 'name'):
 
211
  df = pd.read_excel(file.name)
212
  elif isinstance(file, bytes):
 
213
  df = pd.read_excel(io.BytesIO(file))
214
  else:
 
215
  df = pd.read_excel(file)
216
 
 
217
  print(f"Columns in Excel file: {list(df.columns)}")
218
 
219
  if places_column not in df.columns:
220
  return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
221
 
222
  # Create map
223
+ map_html, processed_count = create_map(df, places_column)
224
 
225
  # Save processed data
226
  with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
227
  processed_path = tmp.name
228
  df.to_excel(processed_path, index=False)
229
 
230
+ # Stats
231
  total_locations = df[places_column].count()
232
  success_rate = (processed_count / total_locations * 100) if total_locations > 0 else 0
233
 
 
240
  print(f"Error processing file: {e}\n{trace}")
241
  return None, f"Error processing file: {str(e)}", None
242
 
243
+ # Create separate interfaces for each tab to avoid conflicts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
245
+ # CSS for improved styling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  custom_css = """
247
  <style>
248
  @import url('https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@300;400;600;700&display=swap');
249
 
 
 
 
 
 
 
 
 
250
  body, .gradio-container {
251
  font-family: 'Source Sans Pro', sans-serif !important;
252
+ color: #333333;
 
253
  }
254
 
255
  h1 {
256
  font-weight: 700 !important;
257
+ color: #2c6bb3 !important;
258
  font-size: 2.5rem !important;
259
  margin-bottom: 1rem !important;
260
  }
261
 
262
  h2 {
263
  font-weight: 600 !important;
264
+ color: #4e8fd1 !important;
265
  font-size: 1.5rem !important;
266
  margin-top: 1rem !important;
267
  margin-bottom: 0.75rem !important;
268
  }
269
 
270
  .gradio-button.primary {
271
+ background-color: #ff7518 !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  }
273
 
274
  .info-box {
275
  background-color: #e8f4fd;
276
+ border-left: 4px solid #2c6bb3;
277
  padding: 15px;
278
  margin: 15px 0;
279
  border-radius: 4px;
280
  }
281
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  .file-upload-box {
283
+ border: 2px dashed #e0e0e0;
284
  border-radius: 8px;
285
  padding: 20px;
286
  text-align: center;
287
  transition: all 0.3s ease;
288
  }
 
 
 
 
 
289
  </style>
290
  """
291
 
292
+ # Text Extraction tab as a separate Blocks interface
293
+ with gr.Blocks(css=custom_css) as extraction_interface:
294
+ gr.HTML("""
295
+ <div class="info-box">
296
+ <h3 style="margin-top: 0;">Extract Structured Data from Text</h3>
297
+ <p>Use NuExtract-1.5 to automatically extract structured information from historical texts. Define the JSON template for the data you want to extract.</p>
298
+ </div>
299
+ """)
300
+
301
+ with gr.Row():
302
+ with gr.Column():
303
+ template = gr.Textbox(
304
+ label="JSON Template",
305
+ value='{"earthquake location": "", "dateline location": ""}',
306
+ lines=5
307
+ )
308
+ text = gr.Textbox(
309
+ label="Text to Extract From",
310
+ value="Neues Erdbeben in Japan. Aus Tokio wird berichtet, daß in Yokohama bei einem Erdbeben sechs Personen getâtet und 22 verwundet, in Tokio vier getâtet und 22 verwundet wurden. In Yokohama seien 6VV HÀuser zerstârt worden. Die telephonische und telegraphische Verbindung zwischen Tokio und Osaka ist unterbrochen worden. Der Trambahnverkehr in Tokio liegt still. Auch der Eisenbahnverkehr zwischen Tokio und Yokohama ist unterbrochen. In Sngamo, einer Vorstadt von Tokio sind BrÀnde ausgebrochen. Ein Eisenbahnzug stürzte in den Vajugawafluß zwischen Gotemba und Tokio. Sechs Züge wurden umgeworfen. Mit dem letzten japanischen Erdbeben sind seit eineinhalb Jahrtausenden bis heute in Japan 229 grâßere Erdbeben zu verzeichnen gewesen.",
311
+ lines=8
312
+ )
313
+ extract_btn = gr.Button("Extract Information", variant="primary")
314
+
315
+ with gr.Column():
316
+ status = gr.Textbox(label="Status")
317
+ output = gr.Textbox(label="Output", lines=10)
318
+
319
+ extract_btn.click(
320
+ fn=extract_info,
321
+ inputs=[template, text],
322
+ outputs=[status, output]
323
+ )
324
+
325
+ # Mapping tab as a separate Blocks interface
326
+ with gr.Blocks(css=custom_css) as mapping_interface:
327
+ gr.HTML("""
328
+ <div class="info-box">
329
+ <h3 style="margin-top: 0;">Map Your Historical Locations</h3>
330
+ <p>Upload an Excel file containing location data to create an interactive map visualization. The tool will geocode your locations and display them on a map.</p>
331
+ </div>
332
+ """)
333
+
334
+ with gr.Row():
335
+ with gr.Column():
336
+ excel_file = gr.File(
337
+ label="Upload Excel File",
338
+ file_types=[".xlsx", ".xls"],
339
+ elem_classes="file-upload-box"
340
+ )
341
+ places_column = gr.Textbox(
342
+ label="Location Column Name",
343
+ value="dateline_locations",
344
+ placeholder="e.g., 'dateline_locations', 'earthquake_locations', or 'place_of_distribution'"
345
+ )
346
+ process_btn = gr.Button("Generate Map", variant="primary")
347
+
348
+ with gr.Column():
349
+ map_output = gr.HTML(
350
+ label="Interactive Map",
351
+ value="""
352
+ <div style="text-align:center; height:70vh; display:flex; align-items:center; justify-content:center;
353
+ background-color:#f5f5f5; border:1px solid #e0e0e0; border-radius:8px;">
354
+ <div>
355
+ <img src="https://cdn-icons-png.flaticon.com/512/854/854878.png" width="100">
356
+ <p style="margin-top:20px; color:#666;">Your map will appear here after processing</p>
357
+ </div>
358
+ </div>
359
+ """
360
+ )
361
+ stats_output = gr.Textbox(
362
+ label="Location Statistics",
363
+ lines=2
364
+ )
365
+ processed_file = gr.File(
366
+ label="Download Processed Data",
367
+ visible=True,
368
+ interactive=False
369
+ )
370
+
371
+ def process_and_map(file, column):
372
+ if file is None:
373
+ return None, "Please upload an Excel file", None
374
+
375
+ try:
376
+ map_html, stats, processed_path = process_excel(file, column)
377
+
378
+ if map_html and processed_path:
379
+ # Create responsive container for the map
380
+ responsive_html = f"""
381
+ <div style="width:100%; height:70vh; margin:0; padding:0; border:1px solid #e0e0e0; border-radius:8px; overflow:hidden;">
382
+ {map_html}
383
+ </div>
384
+ """
385
+ return responsive_html, stats, processed_path
386
+ else:
387
+ return None, stats, None
388
+ except Exception as e:
389
+ import traceback
390
+ trace = traceback.format_exc()
391
+ print(f"Error in process_and_map: {e}\n{trace}")
392
+ return None, f"Error: {str(e)}", None
393
+
394
+ process_btn.click(
395
+ fn=process_and_map,
396
+ inputs=[excel_file, places_column],
397
+ outputs=[map_output, stats_output, processed_file]
398
+ )
399
+
400
+ # Main app with proper tab separation
401
+ with gr.Blocks(css=custom_css, title="Historical Data Analysis") as demo:
402
  gr.HTML("""
403
  <div style="text-align: center; margin-bottom: 1rem">
404
  <h1>Historical Data Analysis Tools</h1>
 
406
  </div>
407
  """)
408
 
409
+ with gr.Tabs() as tabs:
410
  with gr.TabItem("πŸ” Text Extraction"):
411
+ # Instead of duplicating content, use the interface
412
+ extraction_interface.render()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
 
 
 
 
 
 
 
414
  with gr.TabItem("πŸ“ Location Mapping"):
415
+ # Instead of duplicating content, use the interface
416
+ mapping_interface.render()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
 
418
  gr.HTML("""
419
  <div style="text-align: center; margin-top: 2rem; padding-top: 1rem; border-top: 1px solid #eee; font-size: 0.9rem; color: #666;">
 
422
  """)
423
 
424
  if __name__ == "__main__":
425
+ demo.launch()