Spaces:

jimfhahn
/

mcp4rdf

Running

File size: 15,630 Bytes

e344fcd

import os
import logging
import pandas as pd
import rdflib
from rdflib import Namespace, Literal, BNode, RDF, RDFS
from pyshacl import validate

# Set up basic logging (use DEBUG level to see detailed output).
# This configures the root logger at import time, so importing this module
# changes logging for the whole process.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s: %(message)s')

# Directory "MonographDCTAP" located next to this module.
# NOTE(review): not referenced by any code visible in this file — confirm
# whether it is still needed.
BASE_DIR = os.path.join(os.path.dirname(__file__), "MonographDCTAP")

# TSV modules that build_shacl_graphs() converts into one SHACL graph each.
# NOTE(review): these are relative paths and are passed to os.path.exists /
# pd.read_csv as written, so they resolve against the process working
# directory rather than BASE_DIR.
TSV_FILES = [
    "MonographDCTAP/Monograph_Work_Text.tsv",
    "MonographDCTAP/Monograph_AdminMetadata.tsv",
    "MonographDCTAP/Monograph_Instance_Print.tsv",
    "electronic_MonographDCTAP/Monograph_Instance_Electronic.tsv",
]
# Passed to load_prefixes(), which accepts it but does not read the file.
PREFIX_FILE = "./MonographDCTAP/Monograph_Prefixes.tsv"

# Prefix -> namespace URI table used to expand "prefix:local" identifiers
# and to bind namespaces on the generated SHACL graphs.
FIXED_PREFIXES = {
    "bf": "http://id.loc.gov/ontologies/bibframe/",
    "bflc": "http://id.loc.gov/ontologies/bflc/",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "big": "https://example.org/"
}

# Simplified load_prefixes(): the prefixes file is not read; the hardcoded FIXED_PREFIXES table is returned.
def load_prefixes(prefixes_file):
    """Log and return the hardcoded prefix table.

    Args:
        prefixes_file: Accepted for interface compatibility; never read.

    Returns:
        The module-level FIXED_PREFIXES dict itself (not a copy).
    """
    logging.info("Using hardcoded prefixes:")
    for prefix in FIXED_PREFIXES:
        namespace = FIXED_PREFIXES[prefix]
        logging.info(f"{prefix} -> {namespace}")
    return FIXED_PREFIXES

# Helpers that bind prefixes/namespaces on a graph.
def register_prefixes(graph, prefixes):
    """Bind every entry of `prefixes` on `graph`, replacing existing bindings.

    Args:
        graph: Object exposing ``bind(prefix, namespace, override=...)``.
        prefixes: Mapping of prefix -> namespace URI string.
    """
    for name, namespace_uri in prefixes.items():
        namespace = Namespace(namespace_uri)
        graph.bind(name, namespace, override=True)
        
def _bind_namespaces(graph: rdflib.Graph):
    """Bind the fixed bf, bflc, rdfs and big namespaces on `graph`.

    The namespace URIs come from FIXED_PREFIXES; the prefixes are bound in
    the order listed below via the graph's namespace manager.
    """
    manager = graph.namespace_manager
    for prefix in ("bf", "bflc", "rdfs", "big"):
        manager.bind(prefix, Namespace(FIXED_PREFIXES[prefix]))

def _prop_id_to_uri(property_id, prefixes):
    """Turn a TSV identifier into an RDF term.

    Resolution order:
      1. ``prefix:local`` with a prefix found in `prefixes` (and a non-empty
         namespace) -> URIRef of namespace + local part, both stripped.
      2. Anything starting with ``http`` -> URIRef of the identifier as is.
      3. Otherwise -> Literal of the identifier.

    Args:
        property_id: Identifier string.
        prefixes: Mapping of prefix -> namespace URI string.

    Returns:
        An rdflib URIRef or Literal.
    """
    if ":" in property_id:
        head, _, local_name = property_id.partition(":")
        namespace = prefixes.get(head.strip())
        if namespace:
            return rdflib.URIRef(namespace + local_name.strip())
    if not property_id.startswith("http"):
        return Literal(property_id)
    return rdflib.URIRef(property_id)

def _cell_text(value):
    """Return a TSV cell as stripped text; missing cells (None or NaN) become ""."""
    if isinstance(value, str):
        return value.strip()
    if value is None or pd.isna(value):
        return ""
    return str(value).strip()


def add_shape_from_row(graph, row, prefixes):
    """Add the SHACL triples described by one TSV row to `graph`.

    Columns read from `row`: shapeID, shapeLabel, target, propertyID,
    propertyLabel, mandatory, repeatable, severity, valueShape.

    Empty cells are treated as "not provided". pandas reads them as NaN, and
    ``str(NaN)`` is the non-empty string "nan", so comparing the stringified
    cell directly would, for example, turn a blank severity into sh:Info and
    a blank target into the literal "nan".

    Args:
        graph: rdflib graph that receives the triples (mutated in place).
        row: Mapping-like row (pandas Series or dict) supporting ``[]`` and
            ``.get``.
        prefixes: Mapping of prefix -> namespace URI used to expand
            ``prefix:local`` identifiers.

    Returns:
        The same `graph`, for convenience.
    """
    sh = Namespace("http://www.w3.org/ns/shacl#")

    shape_uri = rdflib.URIRef(row['shapeID'])
    logging.info(f"Processing shape: {shape_uri}")

    # Declare the node shape only once, however many rows refer to it.
    if (shape_uri, RDF.type, sh.NodeShape) not in graph:
        graph.add((shape_uri, RDF.type, sh.NodeShape))
        shape_label = row.get('shapeLabel')
        if _cell_text(shape_label):
            graph.add((shape_uri, RDFS.label, Literal(shape_label)))
        logging.info(f"Added NodeShape: {shape_uri} with label {shape_label}")

    # "target" may hold several ';'-separated classes; blank entries are dropped.
    target_tokens = (part.strip() for part in _cell_text(row.get('target')).split(";"))
    target_uris = [_prop_id_to_uri(token, prefixes) for token in target_tokens if token]
    for target_uri in target_uris:
        graph.add((shape_uri, sh.targetClass, target_uri))
        logging.info(f"Added target '{target_uri}' to shape {shape_uri}")

    property_id = _cell_text(row.get('propertyID'))
    if not property_id:
        # Without a property there is no path to constrain.
        logging.warning(f"Row for shape {shape_uri} has no propertyID; skipping property shape")
        return graph
    path_uri = _prop_id_to_uri(property_id, prefixes)
    mandatory = _cell_text(row.get('mandatory')).lower() == "true"

    # If the property is mandatory, add a SPARQLTarget to force evaluation of
    # nodes missing the property.
    # NOTE(review): sh:target / sh:SPARQLTarget belong to SHACL Advanced
    # Features; confirm the validator is invoked with those enabled, otherwise
    # these triples are likely ignored.
    if mandatory and target_uris:
        union_clause = " UNION ".join(f"{{ ?this a <{uri}> }}" for uri in target_uris)
        query = f"SELECT ?this WHERE {{ {union_clause} FILTER NOT EXISTS {{ ?this <{path_uri}> ?o }} }}"
        sparql_target = BNode()
        graph.add((sparql_target, RDF.type, sh.SPARQLTarget))
        graph.add((sparql_target, sh.select, Literal(query)))
        graph.add((shape_uri, sh.target, sparql_target))
        logging.info(f"Added SPARQLTarget with query: {query} to shape {shape_uri}")

    # One anonymous property shape per row.
    property_bnode = BNode()
    graph.add((shape_uri, sh.property, property_bnode))
    graph.add((property_bnode, RDF.type, sh.PropertyShape))
    property_label = row.get('propertyLabel')
    if _cell_text(property_label):
        graph.add((property_bnode, RDFS.label, Literal(property_label)))
    graph.add((property_bnode, sh.path, path_uri))
    logging.info(f"Added property shape for property {property_id} with label {property_label}")

    if mandatory:
        graph.add((property_bnode, sh.minCount, Literal(1)))
        logging.info(f"Set minCount 1 for property {property_id}")
    if _cell_text(row.get('repeatable')).lower() == "false":
        graph.add((property_bnode, sh.maxCount, Literal(1)))
        logging.info(f"Set maxCount 1 for property {property_id}")

    # Only an explicitly provided severity is written; any value other than
    # "Violation" or "Warning" is mapped to sh:Info.
    severity = _cell_text(row.get("severity"))
    if severity:
        level = severity if severity in ("Violation", "Warning") else "Info"
        graph.add((property_bnode, sh.severity, sh[level]))
        logging.info(f"Set severity {severity} for property {property_id}")

    value_shape = _cell_text(row.get("valueShape"))
    if value_shape:
        value_shape_uri = _prop_id_to_uri(value_shape, prefixes)
        graph.add((property_bnode, sh.node, value_shape_uri))
        logging.info(f"Linked valueShape {value_shape_uri} for property {property_id}")
    return graph

def build_shacl_graphs():
    """Build one SHACL shapes graph per TSV module listed in TSV_FILES.

    Each TSV is read with pandas and every row that has a shapeID is passed
    to add_shape_from_row(). A TSV that does not exist is logged as an error
    and skipped, so the remaining modules are still built.

    Returns:
        dict mapping each processed TSV_FILES entry (the path string exactly
        as listed) to its rdflib.Graph of SHACL shapes.
    """
    logging.info("Building individual SHACL graphs from TSV files")
    module_graphs = {}
    prefixes = load_prefixes(PREFIX_FILE)
    for tsv_path in TSV_FILES:
        # Paths are used exactly as written in TSV_FILES; relative entries
        # resolve against the current working directory.
        if not os.path.exists(tsv_path):
            logging.error(f"TSV file not found: {tsv_path}")
            continue
        logging.info(f"Processing TSV file: {tsv_path}")

        graph = rdflib.Graph()
        register_prefixes(graph, prefixes)
        _bind_namespaces(graph)  # Bind fixed namespaces for the SHACL graph

        # NOTE(review): with comment='/', pandas stops parsing a line at the
        # first '/', so any cell containing '/' (e.g. a full URI) would be
        # truncated — confirm the TSVs only use '/' for comment lines.
        df = pd.read_csv(tsv_path, sep='\t', comment='/')
        for _, row in df.iterrows():
            # Rows without a shapeID cannot be attached to a shape.
            if pd.isna(row.get("shapeID")):
                continue
            add_shape_from_row(graph, row, prefixes)
        module_graphs[tsv_path] = graph
    logging.info("Completed building individual SHACL graphs")
    return module_graphs

def parse_results_text(results_text: str) -> str:
    """
    Re-indent a raw results report for display.

    Every line is stripped of surrounding whitespace. Lines that open a
    module section ("===") or a violation ("Validation Result") are preceded
    by an extra newline; every other line is prefixed with a tab. The pieces
    are joined with newlines.
    """
    heading_markers = ("===", "Validation Result")

    def decorate(raw_line):
        text = raw_line.strip()
        lead = "\n" if text.startswith(heading_markers) else "\t"
        return lead + text

    return "\n".join(decorate(raw) for raw in results_text.strip().splitlines())

def _as_text(serialized):
    """Return serializer output as str; serialize() may yield bytes or str."""
    return serialized.decode('utf-8') if isinstance(serialized, bytes) else serialized


def _format_validation_results(results_graph):
    """Render the sh:ValidationResult nodes of `results_graph` as a text summary.

    Only results carrying all of component, severity, source shape, focus
    node, result path and message match the query and are listed.
    """
    query_formatted = """
    PREFIX sh: <http://www.w3.org/ns/shacl#>
    SELECT ?component ?severity ?sourceShape ?focus ?resultPath ?message
    WHERE {
      ?vr a sh:ValidationResult ;
          sh:sourceConstraintComponent ?component ;
          sh:resultSeverity ?severity ;
          sh:sourceShape ?sourceShape ;
          sh:focusNode ?focus ;
          sh:resultPath ?resultPath ;
          sh:resultMessage ?message .
    }
    ORDER BY ?component
    """
    entries = [
        f"Validation Result in {row.component}:\n"
        f"\tSeverity: {row.severity}\n"
        f"\tSource Shape: {row.sourceShape}\n"
        f"\tFocus Node: {row.focus}\n"
        f"\tResult Path: {row.resultPath}\n"
        f"\tMessage: {row.message}\n"
        for row in results_graph.query(query_formatted)
    ]
    return f"Results ({len(entries)}):\n" + "".join(entries)


def validate_rdf(rdf_data, template):
    """Validate RDF/XML data against SHACL shapes and return a text report.

    With template "monograph" (case-insensitive) the data is validated
    separately against every graph from build_shacl_graphs(); a module fails
    only if it produced a result with severity sh:Violation, and the overall
    outcome is the conjunction of the modules. Any other template (including
    None) validates against a small built-in default shape.

    Args:
        rdf_data: RDF/XML document as a string.
        template: Template name selecting the shapes to apply.

    Returns:
        Tuple ``(conforms, report_text)``.

    Raises:
        Whatever rdflib raises when `rdf_data` cannot be parsed; the error is
        logged and re-raised unchanged.
    """
    from itertools import islice

    logging.info("Starting validation")
    data_graph = rdflib.Graph()
    logging.info("Parsing RDF data")
    try:
        data_graph.parse(data=rdf_data, format='xml')
    except Exception as e:
        logging.error(f"Error parsing RDF data: {e}")
        raise  # bare raise keeps the original traceback intact

    # Bind known namespaces explicitly from the input RDF/XML
    namespaces = {
        "bf": "http://id.loc.gov/ontologies/bibframe/",
        "bflc": "http://id.loc.gov/ontologies/bflc/",
        "bfsimple": "http://id.loc.gov/ontologies/bfsimple/",
        "cc": "http://creativecommons.org/ns#",
        "datatypes": "http://id.loc.gov/datatypes/",
        "dcterms": "http://purl.org/dc/terms/",
        "foaf": "http://xmlns.com/foaf/0.1/",
        "lcc": "http://id.loc.gov/ontologies/lcc#",
        "lclocal": "http://id.loc.gov/ontologies/lclocal/",
        "madsrdf": "http://www.loc.gov/mads/rdf/v1#",
        "mnotetype": "http://id.loc.gov/vocabulary/mnotetype/",
        "mstatus": "https://id.loc.gov/vocabulary/mstatus/",
        "owl": "http://www.w3.org/2002/07/owl#",
        "pmo": "http://performedmusicontology.org/ontology/",
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
        "skos": "http://www.w3.org/2004/02/skos/core#",
        "vartitletype": "http://id.loc.gov/vocabulary/vartitletype/",
        "void": "http://rdfs.org/ns/void#",
        "xsd": "http://www.w3.org/2001/XMLSchema#"
    }
    for prefix, uri in namespaces.items():
        data_graph.bind(prefix, uri)

    logging.info(f"Data graph has {len(data_graph)} triples.")
    # Only a sample of triples is logged; islice avoids copying the whole graph.
    for s, p, o in islice(data_graph, 10):
        logging.debug(f"Parsed triple: {s} {p} {o}")
    # Log the full RDF graph in turtle format
    logging.info("Full RDF graph:\n" + _as_text(data_graph.serialize(format='turtle')))

    # Extra debugging: log all rdf:type values from the data graph
    classes = {o for _, o in data_graph.subject_objects(RDF.type)}
    logging.debug(f"Data graph contains these types: {list(classes)}")

    # === Debugging aid: show how the expected target identifiers expand ===
    prefixes = load_prefixes(PREFIX_FILE)
    # List your expected target class identifiers as they are used in your TSV
    expected_targets = ["https:Agent", "big:Contribution"]
    expanded_targets = [_prop_id_to_uri(t, prefixes) for t in expected_targets]
    logging.debug(f"Expected target classes per TSV: {expanded_targets}")

    # A missing template falls through to the default shapes instead of
    # failing on None.lower().
    template_name = (template or "").lower()

    if template_name == 'monograph':
        logging.info("Using Monograph template; processing individual TSV modules")
        module_graphs = build_shacl_graphs()
        sh_node_shape = rdflib.URIRef("http://www.w3.org/ns/shacl#NodeShape")
        sh_target_class = rdflib.URIRef("http://www.w3.org/ns/shacl#targetClass")

        # Debug: inspect declared target classes in each module and query focus nodes.
        for tsv, module in module_graphs.items():
            logging.debug(f"Module {tsv} declared targets:")
            for shape in module.subjects(RDF.type, sh_node_shape):
                for target in module.objects(shape, sh_target_class):
                    logging.debug(f"Shape {shape} declares target: {target}")
                    q = f"SELECT ?x WHERE {{ ?x a <{target}> . }}"
                    matches = list(data_graph.query(q))
                    logging.debug(f"Found {len(matches)} focus node(s) for target {target}")
                    for match in matches:
                        logging.debug(f"Focus node: {match.x}")

        violation_query = """
        PREFIX sh: <http://www.w3.org/ns/shacl#>
        SELECT ?severity WHERE {
            ?vr a sh:ValidationResult ;
                sh:resultSeverity ?severity .
        }
        """
        all_results = []
        overall_conforms = True
        for tsv, graph in module_graphs.items():
            logging.info(f"Module {tsv} SHACL shapes:")
            logging.info(_as_text(graph.serialize(format='turtle')))
            # The validator's own conforms flag and text report are not used:
            # conformance is recomputed so that only sh:Violation results fail.
            _, results_graph, _ = validate(data_graph, shacl_graph=graph, inference='rdfs', debug=True)
            severities = [str(row.severity) for row in results_graph.query(violation_query)]
            module_conforms = not any("http://www.w3.org/ns/shacl#Violation" in s for s in severities)
            logging.info(f"Module {tsv} - Overridden Conforms: {module_conforms}")

            # Build a nicely formatted summary of the results.
            formatted_results = _format_validation_results(results_graph)

            # Assemble module output.
            all_results.append(
                f"\n=== Module: {tsv} ===\n"
                f"Overridden Conforms: {module_conforms}\n"
                f"{formatted_results}\n"
                "------------------------\n"
            )
            if not module_conforms:
                overall_conforms = False
        # Re-indent the combined report for easier display.
        combined_results = parse_results_text("\n".join(all_results))
        return overall_conforms, combined_results

    logging.info("Using default SHACL template")
    # The xsd prefix must be declared: the shape below uses xsd:string, and
    # Turtle requires every prefix to be declared before use.
    shacl_text = """
    @prefix sh: <http://www.w3.org/ns/shacl#> .
    @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
    @prefix ex: <http://example.org/> .
    ex:DefaultShape a sh:NodeShape ;
        sh:targetNode ex:SomeNode ;
        sh:property [
            sh:path ex:someProperty ;
            sh:datatype xsd:string ;
        ] .
    """
    shacl_graph = rdflib.Graph()
    shacl_graph.parse(data=shacl_text, format='turtle')
    conforms, results_graph, results_text = validate(data_graph, shacl_graph=shacl_graph, inference='rdfs', debug=True)
    logging.info(f"Validation completed; Conforms: {conforms}")
    logging.info("Results text:")
    logging.info(results_text)
    serialized_results = _as_text(results_graph.serialize(format='turtle'))
    logging.info("Detailed results graph:")
    logging.info(serialized_results)
    combined_results = (f"{results_text.strip()}\nDetailed Results:\n"
                        f"{serialized_results}")
    return conforms, combined_results