RDF Validation Deployment committed on
Commit e344fcd · 1 Parent(s): 2cc7244

Initial deployment of RDF validation app to new mcp4rdf space

MonographDCTAP/Monograph_AdminMetadata.tsv ADDED
@@ -0,0 +1,3 @@
1
+ shapeID shapeLabel target propertyID propertyLabel valueShape mandatory severity valueNodeType repeatable note
2
+ big:AdminMetadata Admin Metadata bf:AdminMetadata bf:creationDate Date Cataloged or Updated/Changed true Violation literal false
3
+ big:AdminMetadata Admin Metadata bf:AdminMetadata bf:assigner Cataloging institution true Violation IRI; bnode false
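For orientation, the snippet below (an illustrative addition, not part of the committed files) reads this DCTAP module with pandas the same way validator.py does and prints each property's requirement level; run it from the repository root so the relative path resolves.

    import pandas as pd

    # Load the Admin Metadata DCTAP module (tab-separated).
    df = pd.read_csv("MonographDCTAP/Monograph_AdminMetadata.tsv", sep="\t")
    for _, row in df.iterrows():
        flag = "required" if str(row["mandatory"]).strip().lower() == "true" else "optional"
        print(f"{row['propertyID']}: {flag}, severity={row['severity']}, repeatable={row['repeatable']}")

For the two rows above, both bf:creationDate and bf:assigner come back as required with Violation severity.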
MonographDCTAP/Monograph_Instance_Print.tsv ADDED
@@ -0,0 +1,23 @@
1
+ shapeID shapeLabel target propertyID propertyLabel valueShape mandatory severity valueNodeType repeatable note
2
+ big:Monograph:Instance:Print Instance (Monograph) Print bf:Print bf:instanceOf Instance of big:Monograph:Work true Violation IRI; bnode true
3
+ big:Monograph:Instance:Print Instance (Monograph) Print bf:Print bf:title Instance Title big:Title true Violation IRI; bnode true
4
+ big:Monograph:Instance:Print Instance (Monograph) Print bf:Print bf:editionStatement Edition Statement true Warning literal true
5
+ big:Monograph:Instance:Print Instance (Monograph) Print bf:Print bf:provisionActivity Provision Activity--Publication Information big:ProvisionActivity true Violation IRI; bnode true
6
+ big:Monograph:Instance:Print Instance (Monograph) Print bf:Print bf:seriesStatement Series Statement true Warning literal true
7
+ big:Monograph:Instance:Print Instance (Monograph) Print bf:Print bf:identifiedBy Identifiers true Warning IRI; bnode true e.g., ISBN
8
+ big:Monograph:Instance:Print Instance (Monograph) Print bf:Print bf:issuance Mode of Issuance true Violation IRI; bnode false
9
+ big:Monograph:Instance:Print Instance (Monograph) Print bf:Print bf:media Media type true Violation IRI; bnode true
10
+ big:Monograph:Instance:Print Instance (Monograph) Print bf:Print bf:carrier Carrier type true Violation IRI; bnode true
11
+ big:Monograph:Instance:Print Instance (Monograph) Print bf:Print bf:adminMetadata Administrative metadata true Violation IRI; bnode true *adminMetadata should be at the Work and Instance levels but the requirements are the same for both
12
+ big:Title Instance Title bf:Title bf:mainTitle Main Title true Violation literal false Move to Title Sheet and add other title types
13
+ big:ProvisionActivity Provision Activity bf:ProvisionActivity bf:agent Agent big:Agent true Warning IRI; bnode true prefer use of bf:agent in Provision Activity but if no bf:agent exists, then use bf:simpleAgent (below)
14
+ big:Agent bf:Agent ; bf:Person ; bf:Family ; bf:Organization ; bf:Jurisdiction ; bf:Meeting rdfs:label Agent Label true Warning literal true
15
+ big:ProvisionActivity Provision Activity bf:ProvisionActivity bflc:simpleAgent Agent Simple Label true Warning literal true prefer use of bf:agent (above) in Provision Activity but if no bf:agent exists, then use bf:simpleAgent
16
+ big:ProvisionActivity Provision Activity bf:ProvisionActivity; bf:Distribution; bf:Manufacture; bf:Production; bf:Publication; bf:Modification bf:date Date true Warning literal true Pull both types of dates if they exist as the formatting may differ
17
+ big:ProvisionActivity Provision Activity bf:ProvisionActivity; bf:Distribution; bf:Manufacture; bf:Production; bf:Publication; bf:Modification bflc:simpleDate Simple Date true Warning literal true Pull both types of dates if they exist as the formatting may differ
18
+ big:ProvisionActivity Provision Activity bf:ProvisionActivity; bf:Distribution; bf:Manufacture; bf:Production; bf:Publication; bf:Modification bf:place Place big:Place true Warning IRI; bnode true Pull both bf:place and bflc:simplePlace; LC uses bf:place to indicate the country of publication
19
+ big:Place bf:Place rdfs:label Place Label true Warning literal true
20
+ big:ProvisionActivity Provision Activity bf:ProvisionActivity; bf:Distribution; bf:Manufacture; bf:Production; bf:Publication; bf:Modification bflc:simplePlace Place Simple Label true Warning literal true
21
+
22
+
23
+
MonographDCTAP/Monograph_Prefixes.tsv ADDED
@@ -0,0 +1,5 @@
1
+ Vocabulary Prefix Namespace
2
+ BIBFRAME bf: http://id.loc.gov/ontologies/bibframe/
3
+ BIBFRAME LC Extension Ontology bflc: http://id.loc.gov/ontologies/bflc/
4
+ Resource Description Framework Schema rdfs: http://www.w3.org/2000/01/rdf-schema#
5
+ BIBFRAME Interoperability Group Shapes big: https://example.org/
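These prefixes are what let the compact identifiers in the DCTAP sheets (bf:creationDate, big:Title, and so on) expand to full IRIs. A minimal sketch of that expansion, mirroring the split-on-first-colon logic of _prop_id_to_uri in validator.py (illustrative only):

    # Prefix table from Monograph_Prefixes.tsv (also hardcoded as FIXED_PREFIXES in validator.py).
    PREFIXES = {
        "bf": "http://id.loc.gov/ontologies/bibframe/",
        "bflc": "http://id.loc.gov/ontologies/bflc/",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
        "big": "https://example.org/",
    }

    def expand(curie: str) -> str:
        # Split on the first colon only, so shape IDs like big:Monograph:Work keep their tail intact.
        prefix, _, local = curie.partition(":")
        return PREFIXES[prefix] + local if prefix in PREFIXES else curie

    print(expand("bf:creationDate"))     # http://id.loc.gov/ontologies/bibframe/creationDate
    print(expand("big:Monograph:Work"))  # https://example.org/Monograph:Work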
MonographDCTAP/Monograph_Work_Text.tsv ADDED
@@ -0,0 +1,16 @@
1
+ shapeID shapeLabel target propertyID propertyLabel valueShape mandatory severity valueNodeType repeatable note
2
+ big:Monograph:Work Work (Monograph) Text bf:Text ; bf:Monograph bf:title Work Title big:Title true Violation IRI ; bnode true Change to 'See Title Sheet' per AdminMetadata
3
+ big:Monograph:Work Work (Monograph) Text bf:Text ; bf:Monograph bf:contribution Contribution big:Contribution true Warning IRI ; bnode true Required if applicable; should be included if present
4
+ big:Monograph:Work Work (Monograph) Text bf:Text ; bf:Monograph bf:genreForm Form/Genre of Work true Warning IRI ; bnode true From discussion - consider Work subclasses as sufficient
5
+ big:Monograph:Work Work (Monograph) Text bf:Text ; bf:Monograph bf:originDate Date of Work true Warning literal true
6
+ big:Monograph:Work Work (Monograph) Text bf:Text ; bf:Monograph bf:originPlace Place of Origin of the Work true Warning IRI ; bnode false
7
+ big:Monograph:Work Work (Monograph) Text bf:Text ; bf:Monograph bf:language Language true Violation IRI true
8
+ big:Monograph:Work Work (Monograph) Text bf:Text ; bf:Monograph bf:subject Subject of the Work true Warning IRI ; bnode true
9
+ big:Monograph:Work Work (Monograph) Text bf:Text ; bf:Monograph bf:classification Classification numbers true Warning IRI ; bnode true
10
+ big:Monograph:Work Work (Monograph) Text bf:Text ; bf:Monograph bf:content Content Type true Violation IRI ; bnode true *Thought to make this false, leveraging the subclass information for the same - will this fulfill this need? If not, how to address missing data here? Uncontrolled labels. Essential for differentiation/identification.
11
+ big:Monograph:Work Work (Monograph) Text bf:Text ; bf:Monograph bf:adminMetadata Administrative metadata true Violation IRI ; bnode true See AdminMetadata Sheet
12
+ big:Title Monograph Title bf:Title bf:mainTitle Main Title true Violation literal false Move to Title Sheet and add other title types
13
+ big:Contribution Contribution bf:Contribution; bf:PrimaryContribution bf:agent Agent big:Agent true Warning IRI ; bnode true
14
+ big:Contribution Contribution bf:Contribution; bf:PrimaryContribution bf:role Role big:Role true Warning IRI ; bnode true
15
+ big:Agent Agent bf:Agent ; bf:Person ; bf:Family ; bf:Organization ; bf:Jurisdiction ; bf:Meeting rdfs:label Agent Label true Warning literal true
16
+ big:Role Role bf:Role rdfs:label Role Label true Warning literal true
app.py ADDED
@@ -0,0 +1,703 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Hugging Face Gradio App for RDF Validation with MCP Server and Anthropic AI
4
+
5
+ This app serves both as a web interface and can expose MCP server functionality.
6
+ Deploy this on Hugging Face Spaces with your Anthropic API key.
7
+ """
8
+
9
+ import gradio as gr
10
+ import os
11
+ import json
12
+ import sys
13
+ import asyncio
14
+ import logging
15
+ import requests
16
+ from typing import Any, Dict, List, Optional
17
+ import threading
18
+ import time
19
+
20
+ # CRITICAL: FORCE OVERRIDE ALL ENVIRONMENT VARIABLES THAT COULD INTERFERE
21
+ print("🔧 FORCING ENVIRONMENT VARIABLE OVERRIDES...")
22
+
23
+ # Remove any HF environment variables that could cause URL concatenation
24
+ problematic_env_vars = [
25
+ 'HF_API_URL',
26
+ 'HF_INFERENCE_URL',
27
+ 'HF_ENDPOINT_URL',
28
+ 'HF_MODEL',
29
+ 'HUGGINGFACE_API_URL',
30
+ 'HUGGINGFACE_INFERENCE_URL'
31
+ ]
32
+
33
+ for var in problematic_env_vars:
34
+ if var in os.environ:
35
+ old_value = os.environ[var]
36
+ del os.environ[var]
37
+ print(f"🗑️ Removed environment variable: {var} = {old_value}")
38
+
39
+ print("✅ Environment variables cleaned")
40
+
41
+ # Add current directory to path
42
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
43
+
44
+ # Import our validation logic
45
+ try:
46
+ from validator import validate_rdf
47
+ VALIDATOR_AVAILABLE = True
48
+ except ImportError:
49
+ VALIDATOR_AVAILABLE = False
50
+ print("⚠️ Warning: validator.py not found. Some features may be limited.")
51
+
52
+ # Optional: Check if OpenAI and requests are available
53
+ try:
54
+ from openai import OpenAI
55
+ OPENAI_AVAILABLE = True
56
+ except ImportError:
57
+ OPENAI_AVAILABLE = False
58
+ print("💡 Install 'openai' package for AI-powered corrections: pip install openai")
59
+
60
+ try:
61
+ import requests
62
+ HF_INFERENCE_AVAILABLE = True
63
+ except ImportError:
64
+ HF_INFERENCE_AVAILABLE = False
65
+ print("💡 Install 'requests' package for AI-powered corrections: pip install requests")
66
+
67
+ # Set up logging
68
+ logging.basicConfig(level=logging.INFO)
69
+ logger = logging.getLogger(__name__)
70
+
71
+ # Configuration - ABSOLUTELY HARDCODED VALUES (NO ENV VARS ALLOWED)
72
+ HF_API_KEY = os.getenv('HF_API_KEY', '') # Only this one env var is allowed
73
+ # FORCE HARDCODED VALUES - IGNORE ALL OTHER ENVIRONMENT VARIABLES
74
+ HF_ENDPOINT_URL = "https://evxgv66ksxjlfrts.us-east-1.aws.endpoints.huggingface.cloud/v1/"
75
+ HF_MODEL = "lmstudio-community/Llama-3.3-70B-Instruct-GGUF" # Correct model name for your endpoint
76
+
77
+ print(f"🔐 FORCED hardcoded endpoint: {HF_ENDPOINT_URL}")
78
+ print(f"🔐 FORCED hardcoded model: {HF_MODEL}")
79
+ print(f"🔑 HF_API_KEY configured: {'Yes' if HF_API_KEY else 'No'}")
80
+
81
+ # EXTRA PROTECTION: Override any modules that might have cached env vars
82
+ import sys
83
+ if 'requests' in sys.modules:
84
+ print("🔄 Requests module detected - ensuring no cached env vars")
85
+ if 'httpx' in sys.modules:
86
+ print("🔄 HTTPX module detected - ensuring no cached env vars")
87
+
88
+ # OpenAI client configuration for the endpoint
89
+ def get_openai_client():
90
+ """Get configured OpenAI client for HF Inference Endpoint"""
91
+ if not HF_API_KEY:
92
+ print("❌ No HF_API_KEY available for OpenAI client")
93
+ return None
94
+
95
+ print(f"🔗 Creating OpenAI client with:")
96
+ print(f" base_url: {HF_ENDPOINT_URL}")
97
+ print(f" api_key: {'***' + HF_API_KEY[-4:] if len(HF_API_KEY) > 4 else 'HIDDEN'}")
98
+
99
+ return OpenAI(
100
+ base_url=HF_ENDPOINT_URL,
101
+ api_key=HF_API_KEY,
102
+ timeout=120.0 # Increase timeout for cold starts
103
+ )
104
+
105
+ # Sample RDF data for examples
106
+ SAMPLE_VALID_RDF = '''<?xml version="1.0" encoding="UTF-8"?>
107
+ <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
108
+ xmlns:bf="http://id.loc.gov/ontologies/bibframe/"
109
+ xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
110
+
111
+ <bf:Work rdf:about="http://example.org/work/1">
112
+ <rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Text"/>
113
+ <bf:title>
114
+ <bf:Title>
115
+ <bf:mainTitle>Sample Monograph Title</bf:mainTitle>
116
+ </bf:Title>
117
+ </bf:title>
118
+ <bf:creator>
119
+ <bf:Agent>
120
+ <rdfs:label>Sample Author</rdfs:label>
121
+ </bf:Agent>
122
+ </bf:creator>
123
+ </bf:Work>
124
+
125
+ </rdf:RDF>'''
126
+
127
+ SAMPLE_INVALID_RDF = '''<?xml version="1.0" encoding="UTF-8"?>
128
+ <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
129
+ <!-- Missing namespace declarations -->
130
+ <!-- Missing required properties -->
131
+ <bf:Work rdf:about="http://example.org/work/1">
132
+ <bf:title>Incomplete Title</bf:title>
133
+ <!-- Missing rdf:type -->
134
+ <!-- Missing proper title structure -->
135
+ </bf:Work>
136
+ </rdf:RDF>'''
137
+
138
+ # MCP Server Tools (can be used independently)
139
+ def validate_rdf_tool(rdf_content: str, template: str = "monograph") -> dict:
140
+ """
141
+ Validate RDF/XML content against SHACL templates.
142
+
143
+ This tool validates RDF/XML data against predefined SHACL shapes to ensure
144
+ compliance with metadata standards like BIBFRAME. Returns detailed validation
145
+ results with conformance status and specific violation information.
146
+
147
+ Args:
148
+ rdf_content (str): The RDF/XML content to validate
149
+ template (str): Validation template to use ('monograph' or 'custom')
150
+
151
+ Returns:
152
+ dict: Validation results with conformance status and detailed feedback
153
+ """
154
+ if not rdf_content:
155
+ return {"error": "No RDF/XML content provided", "conforms": False}
156
+
157
+ if not VALIDATOR_AVAILABLE:
158
+ return {
159
+ "error": "Validator not available - ensure validator.py is present",
160
+ "conforms": False
161
+ }
162
+
163
+ try:
164
+ conforms, results_text = validate_rdf(rdf_content.encode('utf-8'), template)
165
+
166
+ return {
167
+ "conforms": conforms,
168
+ "results": results_text,
169
+ "template": template,
170
+ "status": "✅ Valid RDF" if conforms else "❌ Invalid RDF"
171
+ }
172
+
173
+ except Exception as e:
174
+ logger.error(f"Validation error: {str(e)}")
175
+ return {
176
+ "error": f"Validation failed: {str(e)}",
177
+ "conforms": False
178
+ }
179
+
180
+ def get_ai_suggestions(validation_results: str, rdf_content: str) -> str:
181
+ """
182
+ Generate AI-powered fix suggestions for invalid RDF/XML.
183
+
184
+ This tool analyzes validation results and provides actionable suggestions
185
+ for fixing RDF/XML validation errors using AI or rule-based analysis.
186
+
187
+ Args:
188
+ validation_results (str): The validation error messages
189
+ rdf_content (str): The original RDF/XML content that failed validation
190
+
191
+ Returns:
192
+ str: Detailed suggestions for fixing the RDF validation issues
193
+ """
194
+
195
+ if not OPENAI_AVAILABLE:
196
+ return generate_manual_suggestions(validation_results)
197
+
198
+ # Get API key dynamically at runtime
199
+ current_api_key = os.getenv('HF_API_KEY', '')
200
+ if not current_api_key:
201
+ return f"""
202
+ 🔑 **AI suggestions disabled**: Please set your Hugging Face API key as a Secret in your Space settings.
203
+
204
+ {generate_manual_suggestions(validation_results)}
205
+ """
206
+
207
+ try:
208
+ # Use OpenAI client with your Hugging Face Inference Endpoint
209
+ print("🔍 Attempting to get OpenAI client for suggestions...")
210
+ client = get_openai_client()
211
+ if not client:
212
+ print("❌ OpenAI client is None for suggestions.")
213
+ return f"""
214
+ 🔑 **AI suggestions disabled**: HF_API_KEY not configured or client creation failed.
215
+
216
+ {generate_manual_suggestions(validation_results)}
217
+ """
218
+ print(f"✅ OpenAI client obtained for suggestions. Client timeout: {client.timeout}")
219
+
220
+ prompt = f"""You are an expert in RDF/XML and SHACL validation. Analyze the following validation results and provide clear, actionable suggestions for fixing the RDF issues.
221
+
222
+ Validation Results:
223
+ {validation_results}
224
+
225
+ Original RDF (first 1000 chars):
226
+ {rdf_content[:1000]}...
227
+
228
+ Please provide:
229
+ 1. A clear summary of what's wrong
230
+ 2. Specific step-by-step instructions to fix each issue
231
+ 3. Example corrections where applicable
232
+ 4. Best practices to prevent similar issues
233
+
234
+ Format your response in a helpful, structured way using markdown."""
235
+
236
+ # Make API call using OpenAI client
237
+ print(f"🔄 Making SUGGESTION API call to: {HF_ENDPOINT_URL} with model: {HF_MODEL}")
238
+ print(f"🔄 Client base_url: {client.base_url}")
239
+ print("⏳ Attempting client.chat.completions.create() for suggestions...")
240
+
241
+ chat_completion = client.chat.completions.create(
242
+ model=HF_MODEL,
243
+ messages=[
244
+ {
245
+ "role": "user",
246
+ "content": prompt
247
+ }
248
+ ],
249
+ max_tokens=1500,
250
+ temperature=0.7,
251
+ top_p=0.9
252
+ )
253
+
254
+ print(f"✅ client.chat.completions.create() returned for suggestions. Type: {type(chat_completion)}")
255
+ generated_text = chat_completion.choices[0].message.content
256
+ print("✅ Suggestion API call successful, content extracted.")
257
+ return f"🤖 **AI-Powered Suggestions:**\n\n{generated_text}"
258
+
259
+ except Exception as e:
260
+ logger.error(f"OpenAI/HF Inference Endpoint error (suggestions): {str(e)}", exc_info=True) # Added exc_info for full traceback
261
+ return f"""
262
+ ❌ **AI suggestions error**: {str(e)}
263
+
264
+ {generate_manual_suggestions(validation_results)}
265
+ """
266
+
267
+ def get_ai_correction(validation_results: str, rdf_content: str) -> str:
268
+ """
269
+ Generate AI-powered corrected RDF/XML based on validation errors.
270
+
271
+ This tool takes invalid RDF/XML and validation results, then generates
272
+ a corrected version that addresses all identified validation issues.
273
+
274
+ Args:
275
+ validation_results (str): The validation error messages
276
+ rdf_content (str): The original invalid RDF/XML content
277
+
278
+ Returns:
279
+ str: Corrected RDF/XML that should pass validation
280
+ """
281
+
282
+ if not OPENAI_AVAILABLE:
283
+ return generate_manual_correction_hints(validation_results, rdf_content)
284
+
285
+ # Get API key dynamically at runtime
286
+ current_api_key = os.getenv('HF_API_KEY', '')
287
+ if not current_api_key:
288
+ return f"""<!-- AI correction disabled: Set HF_API_KEY as a Secret in your Space settings -->
289
+
290
+ {generate_manual_correction_hints(validation_results, rdf_content)}"""
291
+
292
+ try:
293
+ # Use OpenAI client with your Hugging Face Inference Endpoint
294
+ print("🔍 Attempting to get OpenAI client for correction...")
295
+ client = get_openai_client()
296
+ if not client:
297
+ print("❌ OpenAI client is None for correction.")
298
+ return f"""<!-- AI correction disabled: HF_API_KEY not configured or client creation failed. -->
299
+
300
+ {generate_manual_correction_hints(validation_results, rdf_content)}"""
301
+ print(f"✅ OpenAI client obtained for correction. Client timeout: {client.timeout}")
302
+
303
+ prompt = f"""You are an expert in RDF/XML. Fix the following RDF/XML based on the validation errors provided.
304
+
305
+ Validation Errors:
306
+ {validation_results}
307
+
308
+ Original RDF/XML:
309
+ {rdf_content}
310
+
311
+ Please provide the corrected RDF/XML that addresses all validation issues.
312
+ - Return only the corrected XML without additional explanation
313
+ - Maintain the original structure as much as possible while fixing errors
314
+ - Ensure all namespace declarations are present
315
+ - Add any missing required properties
316
+ - Fix any syntax or structural issues"""
317
+
318
+ # Make API call using OpenAI client
319
+ print(f"🔄 Making CORRECTION API call to: {HF_ENDPOINT_URL} with model: {HF_MODEL}")
320
+ print(f"🔄 Client base_url: {client.base_url}")
321
+ print("⏳ Attempting client.chat.completions.create() for correction...")
322
+
323
+ chat_completion = client.chat.completions.create(
324
+ model=HF_MODEL,
325
+ messages=[
326
+ {
327
+ "role": "user",
328
+ "content": prompt
329
+ }
330
+ ],
331
+ max_tokens=2000,
332
+ temperature=0.3,
333
+ top_p=0.9
334
+ )
335
+
336
+ print(f"✅ client.chat.completions.create() returned for correction. Type: {type(chat_completion)}")
337
+ corrected_text = chat_completion.choices[0].message.content
338
+ print("✅ Correction API call successful, content extracted.")
339
+ return corrected_text
340
+
341
+ except Exception as e:
342
+ logger.error(f"OpenAI/HF Inference Endpoint error (correction): {str(e)}", exc_info=True) # Added exc_info for full traceback
343
+ return f"""<!-- AI correction error: {str(e)} -->
344
+
345
+ {generate_manual_correction_hints(validation_results, rdf_content)}"""
346
+
347
+ def generate_manual_suggestions(validation_results: str) -> str:
348
+ """Generate rule-based suggestions when AI is not available"""
349
+ suggestions = []
350
+
351
+ if "Constraint Violation" in validation_results:
352
+ suggestions.append("• Fix SHACL constraint violations by ensuring required properties are present")
353
+
354
+ if "Missing property" in validation_results or "missing" in validation_results.lower():
355
+ suggestions.append("• Add missing required properties (check template requirements)")
356
+
357
+ if "datatype" in validation_results.lower():
358
+ suggestions.append("• Correct data type mismatches (ensure proper literal types)")
359
+
360
+ if "namespace" in validation_results.lower() or "prefix" in validation_results.lower():
361
+ suggestions.append("• Add missing namespace declarations at the top of your RDF")
362
+
363
+ if "XML" in validation_results or "syntax" in validation_results.lower():
364
+ suggestions.append("• Fix XML syntax errors (check for unclosed tags, invalid characters)")
365
+
366
+ if not suggestions:
367
+ suggestions.append("• Review detailed validation results for specific issues")
368
+ suggestions.append("• Ensure your RDF follows the selected template requirements")
369
+
370
+ suggestions_text = "\n".join(suggestions)
371
+
372
+ return f"""
373
+ 📋 **Manual Analysis:**
374
+
375
+ {suggestions_text}
376
+
377
+ 💡 **General Tips:**
378
+ • Check namespace declarations at the top of your RDF
379
+ • Ensure all required properties are present
380
+ • Verify data types match expected formats
381
+ • Make sure XML structure is well-formed
382
+
383
+ 🔧 **Common Fixes:**
384
+ • Add missing namespace prefixes
385
+ • Include required properties like rdf:type
386
+ • Fix malformed URIs or literals
387
+ • Ensure proper XML syntax
388
+ """
389
+
390
+ def generate_manual_correction_hints(validation_results: str, rdf_content: str) -> str:
391
+ """Generate manual correction hints when AI is not available"""
392
+ return f"""<!-- Manual correction hints based on validation results -->
393
+ <!-- Set HF_API_KEY as a Secret in your Space settings for AI-powered corrections -->
394
+
395
+ {rdf_content}
396
+
397
+ <!--
398
+ VALIDATION ISSUES FOUND:
399
+ {validation_results[:500]}...
400
+
401
+ MANUAL CORRECTION STEPS:
402
+ 1. Add missing namespace declarations
403
+ 2. Include required properties (rdf:type, etc.)
404
+ 3. Fix XML syntax errors
405
+ 4. Ensure proper URI formats
406
+ 5. Validate data types
407
+ -->"""
408
+
409
+ def validate_rdf_interface(rdf_content: str, template: str, use_ai: bool = True):
410
+ """Main validation function for Gradio interface"""
411
+ if not rdf_content.strip():
412
+ return "❌ Error", "No RDF/XML data provided", "", ""
413
+
414
+ # Validate RDF
415
+ result = validate_rdf_tool(rdf_content, template)
416
+
417
+ if "error" in result:
418
+ return f"❌ Error: {result['error']}", "", "", ""
419
+
420
+ status = result["status"]
421
+ results_text = result["results"]
422
+
423
+ if result["conforms"]:
424
+ suggestions = "✅ No issues found! Your RDF/XML is valid according to the selected template."
425
+ corrected_rdf = "<!-- Already valid - no corrections needed -->\n" + rdf_content
426
+ else:
427
+ if use_ai:
428
+ suggestions = get_ai_suggestions(results_text, rdf_content)
429
+ corrected_rdf = get_ai_correction(results_text, rdf_content)
430
+ else:
431
+ suggestions = generate_manual_suggestions(results_text)
432
+ corrected_rdf = generate_manual_correction_hints(results_text, rdf_content)
433
+
434
+ return status, results_text, suggestions, corrected_rdf
435
+
436
+ def get_rdf_examples(example_type: str = "valid") -> str:
437
+ """
438
+ Retrieve example RDF/XML snippets for testing and learning.
439
+
440
+ This tool provides sample RDF/XML content that can be used to test
441
+ the validation system or learn proper RDF structure.
442
+
443
+ Args:
444
+ example_type (str): Type of example ('valid', 'invalid', or 'bibframe')
445
+
446
+ Returns:
447
+ str: RDF/XML example content
448
+ """
449
+ examples = {
450
+ "valid": SAMPLE_VALID_RDF,
451
+ "invalid": SAMPLE_INVALID_RDF,
452
+ "bibframe": '''<?xml version="1.0" encoding="UTF-8"?>
453
+ <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
454
+ xmlns:bf="http://id.loc.gov/ontologies/bibframe/"
455
+ xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
456
+
457
+ <bf:Instance rdf:about="http://example.org/instance/1">
458
+ <rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Print"/>
459
+ <bf:instanceOf rdf:resource="http://example.org/work/1"/>
460
+ <bf:title>
461
+ <bf:Title>
462
+ <bf:mainTitle>Example Book Title</bf:mainTitle>
463
+ </bf:Title>
464
+ </bf:title>
465
+ <bf:provisionActivity>
466
+ <bf:Publication>
467
+ <bf:date>2024</bf:date>
468
+ <bf:place>
469
+ <bf:Place>
470
+ <rdfs:label>New York</rdfs:label>
471
+ </bf:Place>
472
+ </bf:place>
473
+ </bf:Publication>
474
+ </bf:provisionActivity>
475
+ </bf:Instance>
476
+
477
+ </rdf:RDF>'''
478
+ }
479
+
480
+ return examples.get(example_type, examples["valid"])
481
+
482
+ # Create Gradio Interface
483
+ def create_interface():
484
+ """Create the main Gradio interface"""
485
+
486
+ # Check API key status dynamically
487
+ current_api_key = os.getenv('HF_API_KEY', '')
488
+ api_status = "🔑 AI features enabled" if (OPENAI_AVAILABLE and current_api_key) else "⚠️ AI features disabled (set HF_API_KEY)"
489
+
490
+ with gr.Blocks(
491
+ title="RDF Validation Server with AI",
492
+ theme=gr.themes.Soft(),
493
+ css="""
494
+ .status-box {
495
+ font-weight: bold;
496
+ padding: 10px;
497
+ border-radius: 5px;
498
+ }
499
+ .header-text {
500
+ text-align: center;
501
+ padding: 20px;
502
+ }
503
+ """
504
+ ) as demo:
505
+
506
+ # Header
507
+ debug_info = f"""
508
+ Debug Info:
509
+ - OPENAI_AVAILABLE: {OPENAI_AVAILABLE}
510
+ - HF_INFERENCE_AVAILABLE: {HF_INFERENCE_AVAILABLE}
511
+ - HF_API_KEY set: {'Yes' if current_api_key else 'No'}
512
+ - HF_API_KEY length: {len(current_api_key) if current_api_key else 0}
513
+ - HF_ENDPOINT_URL: {HF_ENDPOINT_URL}
514
+ - HF_MODEL: {HF_MODEL}
515
+ """
516
+
517
+ gr.HTML(f"""
518
+ <div class="header-text">
519
+ <h1>🔍 RDF Validation Server with AI</h1>
520
+ <p>Validate RDF/XML against SHACL schemas with AI-powered suggestions and corrections</p>
521
+ <p><strong>Status:</strong> {api_status}</p>
522
+ <details><summary>Debug Info</summary><pre>{debug_info}</pre></details>
523
+ </div>
524
+ """)
525
+
526
+ # Main interface
527
+ with gr.Row():
528
+ with gr.Column(scale=1):
529
+ gr.Markdown("### 📝 Input")
530
+
531
+ rdf_input = gr.Textbox(
532
+ label="RDF/XML Content",
533
+ placeholder="Paste your RDF/XML content here...",
534
+ lines=15,
535
+ show_copy_button=True
536
+ )
537
+
538
+ with gr.Row():
539
+ template_dropdown = gr.Dropdown(
540
+ label="Validation Template",
541
+ choices=["monograph", "custom"],
542
+ value="monograph",
543
+ info="Select the SHACL template to validate against"
544
+ )
545
+
546
+ use_ai_checkbox = gr.Checkbox(
547
+ label="Use AI Features",
548
+ value=True,
549
+ info="Enable AI-powered suggestions and corrections"
550
+ )
551
+
552
+ validate_btn = gr.Button("🔍 Validate RDF", variant="primary", size="lg")
553
+
554
+ # Results section
555
+ with gr.Row():
556
+ with gr.Column():
557
+ gr.Markdown("### 📊 Results")
558
+
559
+ status_output = gr.Textbox(
560
+ label="Validation Status",
561
+ interactive=False,
562
+ lines=1,
563
+ elem_classes=["status-box"]
564
+ )
565
+
566
+ results_output = gr.Textbox(
567
+ label="Detailed Validation Results",
568
+ interactive=False,
569
+ lines=8,
570
+ show_copy_button=True
571
+ )
572
+
573
+ suggestions_output = gr.Textbox(
574
+ label="💡 Fix Suggestions",
575
+ interactive=False,
576
+ lines=8,
577
+ show_copy_button=True
578
+ )
579
+
580
+ # Corrected RDF section
581
+ with gr.Row():
582
+ with gr.Column():
583
+ gr.Markdown("### 🛠️ AI-Generated Corrections")
584
+
585
+ corrected_output = gr.Textbox(
586
+ label="Corrected RDF/XML",
587
+ interactive=False,
588
+ lines=15,
589
+ show_copy_button=True,
590
+ placeholder="Corrected RDF will appear here after validation..."
591
+ )
592
+
593
+ # Examples and controls
594
+ with gr.Row():
595
+ gr.Markdown("### 📚 Examples & Tools")
596
+
597
+ with gr.Row():
598
+ example1_btn = gr.Button("✅ Valid RDF Example", variant="secondary")
599
+ example2_btn = gr.Button("❌ Invalid RDF Example", variant="secondary")
600
+ example3_btn = gr.Button("📖 BibFrame Example", variant="secondary")
601
+ clear_btn = gr.Button("🗑️ Clear All", variant="stop")
602
+
603
+ # Event handlers
604
+ validate_btn.click(
605
+ fn=validate_rdf_interface,
606
+ inputs=[rdf_input, template_dropdown, use_ai_checkbox],
607
+ outputs=[status_output, results_output, suggestions_output, corrected_output]
608
+ )
609
+
610
+ # Auto-validate on input change (fires on every edit; Gradio does not debounce this by default)
611
+ rdf_input.change(
612
+ fn=validate_rdf_interface,
613
+ inputs=[rdf_input, template_dropdown, use_ai_checkbox],
614
+ outputs=[status_output, results_output, suggestions_output, corrected_output]
615
+ )
616
+
617
+ # Example buttons
618
+ example1_btn.click(
619
+ lambda: get_rdf_examples("valid"),
620
+ outputs=[rdf_input]
621
+ )
622
+
623
+ example2_btn.click(
624
+ lambda: get_rdf_examples("invalid"),
625
+ outputs=[rdf_input]
626
+ )
627
+
628
+ example3_btn.click(
629
+ lambda: get_rdf_examples("bibframe"),
630
+ outputs=[rdf_input]
631
+ )
632
+
633
+ clear_btn.click(
634
+ lambda: ("", "", "", "", ""),
635
+ outputs=[rdf_input, status_output, results_output, suggestions_output, corrected_output]
636
+ )
637
+
638
+ # Footer with instructions
639
+ gr.Markdown("""
640
+ ---
641
+ ### 🚀 **Deployment Instructions for Hugging Face Spaces:**
642
+
643
+ 1. **Create a new Space** on [Hugging Face](https://huggingface.co/spaces)
644
+ 2. **Set up your Hugging Face Inference Endpoint** and get the endpoint URL
645
+ 3. **Set your tokens** in Space settings (use Secrets for security):
646
+ - Go to Settings → Repository secrets
647
+ - Add: `HF_API_KEY` = `your_huggingface_api_key_here`
648
+ - Endpoint is now hardcoded to your specific Inference Endpoint
649
+ 4. **Upload these files** to your Space repository
650
+ 5. **Install requirements**: The Space will auto-install from `requirements.txt`
651
+
652
+ ### 🔧 **MCP Server Mode:**
653
+ This app functions as both a web interface AND an MCP server for Claude Desktop and other MCP clients.
654
+
655
+ **Available MCP Tools (via SSE):**
656
+ - `validate_rdf_tool`: Validate RDF/XML against SHACL shapes
657
+ - `get_ai_suggestions`: Get AI-powered fix suggestions
658
+ - `get_ai_correction`: Generate corrected RDF/XML
659
+ - `get_rdf_examples`: Retrieve example RDF snippets
660
+
661
+ **MCP Connection:**
662
+ 1. When deployed on Hugging Face Spaces, the MCP server is available at:
663
+ `https://your-space-id.hf.space/gradio_api/mcp/sse`
664
+ 2. Use this URL in Claude Desktop's MCP configuration
665
+ 3. The app automatically exposes functions with proper docstrings as MCP tools
666
+
667
+ ### 💡 **Features:**
668
+ - ✅ Real-time RDF/XML validation against SHACL schemas
669
+ - 🤖 AI-powered error suggestions and corrections (with HF Inference Endpoint)
670
+ - 📚 Built-in examples and templates
671
+ - 🔄 Auto-validation as you type
672
+ - 📋 Copy results with one click
673
+
674
+ **Note:** AI features require a valid Hugging Face API key (HF_API_KEY) set as a Secret. Manual suggestions are provided as fallback.
675
+ """)
676
+
677
+ return demo
678
+
679
+ # Launch configuration
680
+ if __name__ == "__main__":
681
+ # Force verify environment is clean
682
+ print("🔍 FINAL CHECK: Verifying problematic environment variables are removed...")
683
+ for var in problematic_env_vars:
684
+ if var in os.environ:
685
+ print(f"⚠️ WARNING: {var} still exists! Value: {os.environ[var]}")
686
+ del os.environ[var]
687
+ print(f"🗑️ FORCE REMOVED: {var}")
688
+ else:
689
+ print(f"✅ {var} confirmed not in environment")
690
+
691
+ demo = create_interface()
692
+
693
+ # Configuration for different environments
694
+ port = int(os.getenv('PORT', 7860)) # Hugging Face uses PORT env variable
695
+
696
+ demo.launch(
697
+ server_name="0.0.0.0", # Important for external hosting
698
+ server_port=port, # Use environment PORT or default to 7860
699
+ share=False, # Don't create gradio.live links in production
700
+ show_error=True, # Show errors in the interface
701
+ show_api=True, # Enable API endpoints
702
+ allowed_paths=["."] # Allow serving files from current directory
703
+ )
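A quick way to smoke-test the validation tool outside Gradio is to call it directly (a sketch, assuming app.py and validator.py are importable from the repository root so the DCTAP TSV paths resolve; importing app also runs its startup prints):

    # Validate the bundled sample against the monograph template.
    from app import validate_rdf_tool, SAMPLE_VALID_RDF

    result = validate_rdf_tool(SAMPLE_VALID_RDF, template="monograph")
    print(result.get("status", result.get("error")))  # e.g. "✅ Valid RDF" or "❌ Invalid RDF"
    print(result.get("results", ""))                  # per-module SHACL report text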
electronic_MonographDCTAP/Monograph_Instance_Electronic.tsv ADDED
@@ -0,0 +1,24 @@
1
+ shapeID shapeLabel target propertyID propertyLabel valueShape mandatory severity valueNodeType repeatable note
2
+ big:Monograph:Instance:Electronic Instance (Monograph) Electronic bf:Electronic bf:instanceOf Instance of big:Monograph:Work true Violation IRI; bnode true
3
+ big:Monograph:Instance:Electronic Instance (Monograph) Electronic bf:Electronic bf:title Instance Title big:Title true Violation IRI; bnode true
4
+ big:Monograph:Instance:Electronic Instance (Monograph) Electronic bf:Electronic bf:editionStatement Edition Statement true Warning literal true
5
+ big:Monograph:Instance:Electronic Instance (Monograph) Electronic bf:Electronic bf:provisionActivity Provision Activity--Publication Information big:ProvisionActivity true Violation IRI; bnode true
6
+ big:Monograph:Instance:Electronic Instance (Monograph) Electronic bf:Electronic bf:seriesStatement Series Statement true Warning literal true
7
+ big:Monograph:Instance:Electronic Instance (Monograph) Electronic bf:Electronic bf:identifiedBy Identifiers true Warning IRI; bnode true e.g., ISBN
8
+ big:Monograph:Instance:Electronic Instance (Monograph) Electronic bf:Electronic bf:issuance Mode of Issuance true Violation IRI; bnode false
9
+ big:Monograph:Instance:Electronic Instance (Monograph) Electronic bf:Electronic bf:media Media type true Violation IRI; bnode true
10
+ big:Monograph:Instance:Electronic Instance (Monograph) Electronic bf:Electronic bf:carrier Carrier type true Violation IRI; bnode true
11
+ big:Monograph:Instance:Electronic Instance (Monograph) Electronic bf:Electronic bf:electronicLocator Uniform Resource Locator for resource true Warning IRI; bnode true
12
+ big:Monograph:Instance:Electronic Instance (Monograph) Electronic bf:Electronic bf:digitalCharacteristic Digital Characteristic big:DigitalCharacteristic true Warning IRI; bnode false
13
+ big:Monograph:Instance:Electronic Instance (Monograph) Electronic bf:Electronic bf:adminMetadata Administrative metadata true Violation IRI; bnode true *adminMetadata should be at the Work and Instance levels but the requirements are the same for both
14
+ big:Title Instance Title bf:Title bf:mainTitle Main Title true Violation literal false Move to Title Sheet and add other title types
15
+ big:ProvisionActivity Provision Activity bf:ProvisionActivity ; bf:Distribution ; bf:Manufacture ; bf:Production ; bf:Publication ; bf:Modification bf:agent Agent big:Agent true Warning IRI; bnode true prefer use of bf:agent in Provision Activity but if no bf:agent exists, then use bf:simpleAgent (below)
16
+ big:Agent bf:Agent ; bf:Person ; bf:Family ; bf:Organization ; bf:Jurisdiction ; bf:Meeting rdfs:label Agent Label true Warning literal true
17
+ big:ProvisionActivity Provision Activity bf:ProvisionActivity ; bf:Distribution ; bf:Manufacture ; bf:Production ; bf:Publication ; bf:Modification bflc:simpleAgent Agent Simple Label big:Agent true Warning literal true prefer use of bf:agent (above) in Provision Activity but if no bf:agent exists, then use bf:simpleAgent
18
+ big:ProvisionActivity Provision Activity bf:ProvisionActivity ; bf:Distribution ; bf:Manufacture ; bf:Production ; bf:Publication ; bf:Modification bf:date Date true Warning literal true Pull both types of dates if they exist as the formatting may differ
19
+ big:ProvisionActivity Provision Activity bf:ProvisionActivity ; bf:Distribution ; bf:Manufacture ; bf:Production ; bf:Publication ; bf:Modification bflc:simpleDate Simple Date true Warning literal true Pull both types of dates if they exist as the formatting may differ
20
+ big:ProvisionActivity Provision Activity bf:ProvisionActivity ; bf:Distribution ; bf:Manufacture ; bf:Production ; bf:Publication ; bf:Modification bf:place Place big:Place true Warning IRI; bnode true Pull both bf:place and bflc:simplePlace; LC uses bf:place to indicate the country of publication
21
+ big:Place bf:Place rdfs:label Place Label true Warning literal true
22
+ big:ProvisionActivity Provision Activity bf:ProvisionActivity ; bf:Distribution ; bf:Manufacture ; bf:Production ; bf:Publication ; bf:Modification bflc:simplePlace Place Simple Label true Warning literal true
23
+ big:DigitalCharacteristic Digital Characteristic bf:FileType bf:digitalCharacteristic File Type true Warning literal true
24
+ big:DigitalCharacteristic Digital Characteristic bf:EncodingFormat bf:digitalCharacteristic Encoding Format true Warning literal true
requirements.txt ADDED
@@ -0,0 +1,24 @@
1
+ # Requirements for Hugging Face Gradio App with MCP Server
2
+ # Core dependencies
3
+ gradio>=4.0.0
4
+ rdflib>=7.0.0
5
+ pySHACL>=0.25.0
6
+ pandas>=2.0.0
7
+
8
+ # AI integrations
9
+ huggingface_hub>=0.20.0
10
+ openai>=1.0.0
11
+
12
+ # MCP support (optional)
13
+ mcp>=0.9.0
14
+
15
+ # Web and utilities
16
+ flask>=2.3.0
17
+ flask-cors>=4.0.0
18
+ requests>=2.31.0
19
+ waitress>=2.1.0
20
+
21
+ # Additional utilities
22
+ python-dotenv>=1.0.0
23
+ aiofiles>=23.0.0
24
+ asyncio-mqtt>=0.13.0
validator.py ADDED
@@ -0,0 +1,313 @@
1
+ import os
2
+ import logging
3
+ import pandas as pd
4
+ import rdflib
5
+ from rdflib import Namespace, Literal, BNode, RDF, RDFS
6
+ from pyshacl import validate
7
+
8
+ # Set up basic logging (use DEBUG level to see detailed output)
9
+ logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s: %(message)s')
10
+
11
+ BASE_DIR = os.path.join(os.path.dirname(__file__), "MonographDCTAP")
12
+
13
+ TSV_FILES = [
14
+ "MonographDCTAP/Monograph_Work_Text.tsv",
15
+ "MonographDCTAP/Monograph_AdminMetadata.tsv",
16
+ "MonographDCTAP/Monograph_Instance_Print.tsv",
17
+ "electronic_MonographDCTAP/Monograph_Instance_Electronic.tsv",
18
+ ]
19
+ PREFIX_FILE = "./MonographDCTAP/Monograph_Prefixes.tsv"
20
+
21
+ # Add a global constant for fixed prefixes.
22
+ FIXED_PREFIXES = {
23
+ "bf": "http://id.loc.gov/ontologies/bibframe/",
24
+ "bflc": "http://id.loc.gov/ontologies/bflc/",
25
+ "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
26
+ "big": "https://example.org/"
27
+ }
28
+
29
+ # Replace load_prefixes() with a simplified function:
30
+ def load_prefixes(prefixes_file):
31
+ logging.info("Using hardcoded prefixes:")
32
+ for p, ns in FIXED_PREFIXES.items():
33
+ logging.info(f"{p} -> {ns}")
34
+ return FIXED_PREFIXES
35
+
36
+ # Optionally simplify register_prefixes and _bind_namespaces:
37
+ def register_prefixes(graph, prefixes):
38
+ for prefix, uri in prefixes.items():
39
+ graph.bind(prefix, Namespace(uri), override=True)
40
+
41
+ def _bind_namespaces(graph: rdflib.Graph):
42
+ # Hard-code the fixed namespaces as well.
43
+ graph.namespace_manager.bind("bf", Namespace(FIXED_PREFIXES["bf"]))
44
+ graph.namespace_manager.bind("bflc", Namespace(FIXED_PREFIXES["bflc"]))
45
+ graph.namespace_manager.bind("rdfs", Namespace(FIXED_PREFIXES["rdfs"]))
46
+ graph.namespace_manager.bind("big", Namespace(FIXED_PREFIXES["big"]))
47
+
48
+ def _prop_id_to_uri(property_id, prefixes):
49
+ if ":" in property_id:
50
+ prefix, suffix = property_id.split(":", 1)
51
+ ns = prefixes.get(prefix.strip())
52
+ if ns:
53
+ return rdflib.URIRef(ns + suffix.strip())
54
+ if property_id.startswith("http"):
55
+ return rdflib.URIRef(property_id)
56
+ return Literal(property_id)
57
+
58
+ def add_shape_from_row(graph, row, prefixes):
59
+ shape_uri = rdflib.URIRef(row['shapeID'])
60
+ logging.info(f"Processing shape: {shape_uri}")
61
+ if (shape_uri, RDF.type, rdflib.URIRef("http://www.w3.org/ns/shacl#NodeShape")) not in graph:
62
+ graph.add((shape_uri, RDF.type, rdflib.URIRef("http://www.w3.org/ns/shacl#NodeShape")))
63
+ graph.add((shape_uri, RDFS.label, Literal(row['shapeLabel'])))
64
+ logging.info(f"Added NodeShape: {shape_uri} with label {row['shapeLabel']}")
65
+ targets = [t.strip() for t in str(row['target']).split(";")]
66
+ for target in targets:
67
+ target_uri = _prop_id_to_uri(target, prefixes)
68
+ graph.add((shape_uri, rdflib.URIRef("http://www.w3.org/ns/shacl#targetClass"), target_uri))
69
+ logging.info(f"Added target '{target_uri}' to shape {shape_uri}")
70
+
71
+ # If the property is mandatory, add a SPARQLTarget to force evaluation of nodes missing the property.
72
+ if str(row['mandatory']).strip().lower() == "true":
73
+ property_uri = _prop_id_to_uri(row['propertyID'], prefixes)
74
+ target_uris = [ _prop_id_to_uri(t, prefixes) for t in targets ]
75
+ union_clause = " UNION ".join([f"{{ ?this a <{uri}> }}" for uri in target_uris])
76
+ query = f"SELECT ?this WHERE {{ {union_clause} FILTER NOT EXISTS {{ ?this <{property_uri}> ?o }} }}"
77
+ bnode = BNode()
78
+ sh = rdflib.URIRef("http://www.w3.org/ns/shacl#")
79
+ # Use RDF.type triple to mark the bnode as a SPARQLTarget
80
+ graph.add((bnode, RDF.type, rdflib.URIRef("http://www.w3.org/ns/shacl#SPARQLTarget")))
81
+ graph.add((bnode, rdflib.URIRef("http://www.w3.org/ns/shacl#select"), Literal(query)))
82
+ graph.add((shape_uri, rdflib.URIRef("http://www.w3.org/ns/shacl#target"), bnode))
83
+ logging.info(f"Added SPARQLTarget with query: {query} to shape {shape_uri}")
84
+
85
+ property_bnode = BNode()
86
+ graph.add((shape_uri, rdflib.URIRef("http://www.w3.org/ns/shacl#property"), property_bnode))
87
+ graph.add((property_bnode, RDF.type, rdflib.URIRef("http://www.w3.org/ns/shacl#PropertyShape")))
88
+ graph.add((property_bnode, RDFS.label, Literal(row['propertyLabel'])))
89
+ path_uri = _prop_id_to_uri(row['propertyID'], prefixes)
90
+ graph.add((property_bnode, rdflib.URIRef("http://www.w3.org/ns/shacl#path"), path_uri))
91
+ logging.info(f"Added property shape for property {row['propertyID']} with label {row['propertyLabel']}")
92
+ if str(row['mandatory']).strip().lower() == "true":
93
+ graph.add((property_bnode, rdflib.URIRef("http://www.w3.org/ns/shacl#minCount"), Literal(1)))
94
+ logging.info(f"Set minCount 1 for property {row['propertyID']}")
95
+ if str(row['repeatable']).strip().lower() == "false":
96
+ graph.add((property_bnode, rdflib.URIRef("http://www.w3.org/ns/shacl#maxCount"), Literal(1)))
97
+ logging.info(f"Set maxCount 1 for property {row['propertyID']}")
98
+ severity = str(row.get("severity", "")).strip()
99
+ if severity:
100
+ sev_ns = rdflib.URIRef("http://www.w3.org/ns/shacl#")
101
+ if severity == "Violation":
102
+ graph.add((property_bnode, rdflib.URIRef("http://www.w3.org/ns/shacl#severity"),
103
+ rdflib.URIRef(sev_ns + "Violation")))
104
+ elif severity == "Warning":
105
+ graph.add((property_bnode, rdflib.URIRef("http://www.w3.org/ns/shacl#severity"),
106
+ rdflib.URIRef(sev_ns + "Warning")))
107
+ else:
108
+ graph.add((property_bnode, rdflib.URIRef("http://www.w3.org/ns/shacl#severity"),
109
+ rdflib.URIRef(sev_ns + "Info")))
110
+ logging.info(f"Set severity {severity} for property {row['propertyID']}")
111
+ if pd.notna(row.get("valueShape")) and row["valueShape"].strip():
112
+ value_shape_uri = _prop_id_to_uri(row["valueShape"], prefixes)
113
+ graph.add((property_bnode, rdflib.URIRef("http://www.w3.org/ns/shacl#node"), value_shape_uri))
114
+ logging.info(f"Linked valueShape {value_shape_uri} for property {row['propertyID']}")
115
+ return graph
116
+
117
+ def build_shacl_graphs():
118
+ logging.info("Building individual SHACL graphs from TSV files")
119
+ module_graphs = {} # Initialize the dictionary for module graphs
120
+ prefixes = load_prefixes(PREFIX_FILE)
121
+ for tsv in TSV_FILES:
122
+ tsv_path = tsv # path is relative to the app's working directory
123
+ if not os.path.exists(tsv_path):
124
+ logging.error(f"TSV file not found: {tsv_path}")
125
+ logging.info(f"Processing TSV file: {tsv_path}")
126
+ graph = rdflib.Graph()
127
+ register_prefixes(graph, prefixes)
128
+ _bind_namespaces(graph) # Bind fixed namespaces for the SHACL graph
129
+ df = pd.read_csv(tsv_path, sep='\t') # note: comment='/' would truncate cells containing '/', e.g. 'Updated/Changed'
130
+ for _, row in df.iterrows():
131
+ if pd.isna(row.get("shapeID")):
132
+ continue
133
+ add_shape_from_row(graph, row, prefixes)
134
+ module_graphs[tsv] = graph
135
+ logging.info("Completed building individual SHACL graphs")
136
+ return module_graphs
137
+
138
+ def parse_results_text(results_text: str) -> str:
139
+ """
140
+ Parse and reformat raw results_text for nicer display.
141
+ Adjust the logic to suit your output format.
142
+ """
143
+ lines = results_text.strip().splitlines()
144
+ formatted_lines = []
145
+ for line in lines:
146
+ line = line.strip()
147
+ if line.startswith("==="):
148
+ # Start of a module section
149
+ formatted_lines.append("\n" + line)
150
+ elif line.startswith("Validation Result"):
151
+ # Start a new violation
152
+ formatted_lines.append("\n" + line)
153
+ else:
154
+ formatted_lines.append("\t" + line)
155
+ return "\n".join(formatted_lines)
156
+
157
+ def validate_rdf(rdf_data, template):
158
+ logging.info("Starting validation")
159
+ data_graph = rdflib.Graph()
160
+ logging.info("Parsing RDF data")
161
+ try:
162
+ data_graph.parse(data=rdf_data, format='xml')
163
+ except Exception as e:
164
+ logging.error(f"Error parsing RDF data: {e}")
165
+ raise e
166
+ logging.info(f"Data graph has {len(data_graph)} triples.")
167
+
168
+ # Bind known namespaces explicitly from the input RDF/XML
169
+ namespaces = {
170
+ "bf": "http://id.loc.gov/ontologies/bibframe/",
171
+ "bflc": "http://id.loc.gov/ontologies/bflc/",
172
+ "bfsimple": "http://id.loc.gov/ontologies/bfsimple/",
173
+ "cc": "http://creativecommons.org/ns#",
174
+ "datatypes": "http://id.loc.gov/datatypes/",
175
+ "dcterms": "http://purl.org/dc/terms/",
176
+ "foaf": "http://xmlns.com/foaf/0.1/",
177
+ "lcc": "http://id.loc.gov/ontologies/lcc#",
178
+ "lclocal": "http://id.loc.gov/ontologies/lclocal/",
179
+ "madsrdf": "http://www.loc.gov/mads/rdf/v1#",
180
+ "mnotetype": "http://id.loc.gov/vocabulary/mnotetype/",
181
+ "mstatus": "https://id.loc.gov/vocabulary/mstatus/",
182
+ "owl": "http://www.w3.org/2002/07/owl#",
183
+ "pmo": "http://performedmusicontology.org/ontology/",
184
+ "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
185
+ "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
186
+ "skos": "http://www.w3.org/2004/02/skos/core#",
187
+ "vartitletype": "http://id.loc.gov/vocabulary/vartitletype/",
188
+ "void": "http://rdfs.org/ns/void#",
189
+ "xsd": "http://www.w3.org/2001/XMLSchema#"
190
+ }
191
+ for prefix, uri in namespaces.items():
192
+ data_graph.bind(prefix, uri)
193
+
194
+ logging.info(f"Data graph has {len(data_graph)} triples.")
195
+ for s, p, o in list(data_graph)[:10]:
196
+ logging.debug(f"Parsed triple: {s} {p} {o}")
197
+ # New: Log the full RDF graph in turtle format
198
+ serialized_graph = data_graph.serialize(format='turtle')
199
+ logging.info("Full RDF graph:\n" + (serialized_graph.decode('utf-8') if isinstance(serialized_graph, bytes) else serialized_graph))
200
+
201
+ # Extra debugging: log all rdf:type values from the data graph
202
+ classes = set()
203
+ for s, o in data_graph.subject_objects(RDF.type):
204
+ classes.add(o)
205
+ logging.debug(f"Data graph contains these types: {list(classes)}")
206
+
207
+ # === Added debugging to check expected target class URIs ===
208
+ prefixes = load_prefixes(PREFIX_FILE)
209
+ # List your expected target class identifiers as they are used in your TSV
210
+ expected_targets = ["big:Agent", "big:Contribution"]
211
+ expanded_targets = [ _prop_id_to_uri(t, prefixes) for t in expected_targets ]
212
+ logging.debug(f"Expected target classes per TSV: {expanded_targets}")
213
+
214
+ if template.lower() == 'monograph':
215
+ logging.info("Using Monograph template; processing individual TSV modules")
216
+ module_graphs = build_shacl_graphs()
217
+
218
+ # Debug: inspect declared target classes in each module and query focus nodes.
219
+ for tsv, module in module_graphs.items():
220
+ logging.debug(f"Module {tsv} declared targets:")
221
+ for shape in module.subjects(RDF.type, rdflib.URIRef("http://www.w3.org/ns/shacl#NodeShape")):
222
+ for target in module.objects(shape, rdflib.URIRef("http://www.w3.org/ns/shacl#targetClass")):
223
+ logging.debug(f"Shape {shape} declares target: {target}")
224
+ q = f"SELECT ?x WHERE {{ ?x a <{target}> . }}"
225
+ matches = list(data_graph.query(q))
226
+ logging.debug(f"Found {len(matches)} focus node(s) for target {target}")
227
+ for match in matches:
228
+ logging.debug(f"Focus node: {match.x}")
229
+
230
+ all_results = []
231
+ overall_conforms = True
232
+ for tsv, graph in module_graphs.items():
233
+ shacl_text = graph.serialize(format='turtle')
234
+ logging.info(f"Module {tsv} SHACL shapes:")
235
+ logging.info(shacl_text.decode('utf-8') if isinstance(shacl_text, bytes) else shacl_text)
236
+ conforms, results_graph, results_text = validate(data_graph, shacl_graph=graph, inference='rdfs', debug=True)
237
+ # Override conform status if any violation has severity sh:Violation.
238
+ violation_query = """
239
+ PREFIX sh: <http://www.w3.org/ns/shacl#>
240
+ SELECT ?severity WHERE {
241
+ ?vr a sh:ValidationResult ;
242
+ sh:resultSeverity ?severity .
243
+ }
244
+ """
245
+ severities = [str(row.severity) for row in results_graph.query(violation_query)]
246
+ module_conforms = False if any("http://www.w3.org/ns/shacl#Violation" in s for s in severities) else True
247
+ logging.info(f"Module {tsv} - Overridden Conforms: {module_conforms}")
248
+
249
+ # Build a nicely formatted summary of the results.
250
+ query_formatted = """
251
+ PREFIX sh: <http://www.w3.org/ns/shacl#>
252
+ SELECT ?component ?severity ?sourceShape ?focus ?resultPath ?message
253
+ WHERE {
254
+ ?vr a sh:ValidationResult ;
255
+ sh:sourceConstraintComponent ?component ;
256
+ sh:resultSeverity ?severity ;
257
+ sh:sourceShape ?sourceShape ;
258
+ sh:focusNode ?focus ;
259
+ sh:resultPath ?resultPath ;
260
+ sh:resultMessage ?message .
261
+ }
262
+ ORDER BY ?component
263
+ """
264
+ formatted_results = ""
265
+ count = 0
266
+ for row in results_graph.query(query_formatted):
267
+ count += 1
268
+ formatted_results += f"Validation Result in {row.component}:\n"
269
+ formatted_results += f"\tSeverity: {row.severity}\n"
270
+ formatted_results += f"\tSource Shape: {row.sourceShape}\n"
271
+ formatted_results += f"\tFocus Node: {row.focus}\n"
272
+ formatted_results += f"\tResult Path: {row.resultPath}\n"
273
+ formatted_results += f"\tMessage: {row.message}\n"
274
+ formatted_results = f"Results ({count}):\n" + formatted_results
275
+
276
+ # Assemble module output.
277
+ module_output = (
278
+ f"\n=== Module: {tsv} ===\n"
279
+ f"Overridden Conforms: {module_conforms}\n"
280
+ f"{formatted_results}\n"
281
+ "------------------------\n"
282
+ )
283
+ all_results.append(module_output)
284
+ if not module_conforms:
285
+ overall_conforms = False
286
+ combined_results = "\n".join(all_results)
287
+ # Optionally, parse the combined results for easier display.
288
+ combined_results = parse_results_text(combined_results)
289
+ return overall_conforms, combined_results
290
+ else:
291
+ logging.info("Using default SHACL template")
292
+ shacl_text = """
293
+ @prefix sh: <http://www.w3.org/ns/shacl#> .
294
+ @prefix ex: <http://example.org/> .
+ @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
295
+ ex:DefaultShape a sh:NodeShape ;
296
+ sh:targetNode ex:SomeNode ;
297
+ sh:property [
298
+ sh:path ex:someProperty ;
299
+ sh:datatype xsd:string ;
300
+ ] .
301
+ """
302
+ shacl_graph = rdflib.Graph()
303
+ shacl_graph.parse(data=shacl_text, format='turtle')
304
+ conforms, results_graph, results_text = validate(data_graph, shacl_graph=shacl_graph, inference='rdfs', debug=True)
305
+ logging.info(f"Validation completed; Conforms: {conforms}")
306
+ logging.info("Results text:")
307
+ logging.info(results_text)
308
+ serialized_results = results_graph.serialize(format='turtle')
309
+ logging.info("Detailed results graph:")
310
+ logging.info(serialized_results.decode('utf-8') if isinstance(serialized_results, bytes) else serialized_results)
311
+ combined_results = (f"{results_text.strip()}\nDetailed Results:\n"
312
+ f"{serialized_results.decode('utf-8') if isinstance(serialized_results, bytes) else serialized_results}")
313
+ return conforms, combined_results
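To inspect the SHACL that these DCTAP modules actually generate, the per-module graphs can be built and serialized directly (an illustrative sketch; run from the repository root so the relative paths in TSV_FILES resolve):

    from validator import build_shacl_graphs

    # Build one SHACL graph per TSV module and print it as Turtle.
    for tsv, graph in build_shacl_graphs().items():
        print(f"=== {tsv} ({len(graph)} triples) ===")
        print(graph.serialize(format="turtle"))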