From dbb4092badd2264ea3d0c273f395fb213ef812a0 Mon Sep 17 00:00:00 2001 From: Lawon Lewis Date: Fri, 25 Oct 2024 09:59:50 +1000 Subject: [PATCH] update prefix generation function keep vowels but remove all punctuation characters as per #287 --- prez/services/curie_functions.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/prez/services/curie_functions.py b/prez/services/curie_functions.py index 485f9447..b390a90f 100755 --- a/prez/services/curie_functions.py +++ b/prez/services/curie_functions.py @@ -1,5 +1,6 @@ import logging import re +import string from urllib.parse import urlparse from aiocache import caches @@ -35,10 +36,10 @@ def namespace_registered(namespace): def valid_prefix(prefix: str): """For turtle serialization, as per https://www.w3.org/TR/turtle/#grammar-production-PN_PREFIX""" valid = True - PN_CHARS_BASE = r"([A-Z]|[a-z]|[\u00C0-\u00D6]|[\u00D8-\u00F6]|[\u00F8-\u02FF]|[\u0370-\u037D]|[\u037F-\u1FFF]|[\u200C-\u200D]|[\u2070-\u218F]|[\u2C00-\u2FEF]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]|[\U00010000-\U000EFFFF])" - PN_CHARS_U = rf"({PN_CHARS_BASE}|_)" - PN_CHARS = rf"({PN_CHARS_U}|-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])" - PN_PREFIX = rf"({PN_CHARS_BASE}(({PN_CHARS}|.)*{PN_CHARS})?)" + PN_CHARS_BASE = "([A-Z]|[a-z]|[\u00C0-\u00D6]|[\u00D8-\u00F6]|[\u00F8-\u02FF]|[\u0370-\u037D]|[\u037F-\u1FFF]|[\u200C-\u200D]|[\u2070-\u218F]|[\u2C00-\u2FEF]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]|[\U00010000-\U000EFFFF])" + PN_CHARS_U = f"({PN_CHARS_BASE}|_)" + PN_CHARS = f"({PN_CHARS_U}|-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])" + PN_PREFIX = f"({PN_CHARS_BASE}(({PN_CHARS}|.)*{PN_CHARS})?)" matches = re.match(PN_PREFIX, prefix) if not matches: valid = False @@ -57,20 +58,17 @@ def generate_new_prefix(uri): split_prefix_path = ns[:-1].rsplit("/", 1) if len(split_prefix_path) > 1: - to_generate_prefix_from = split_prefix_path[-1].lower() - # attempt to just use the last part of the path prior to the fragment or "identifier" - if len(to_generate_prefix_from) <= 6: - proposed_prefix = to_generate_prefix_from - if not prefix_registered(proposed_prefix): - prefix_graph.bind(proposed_prefix, ns) - return - # otherwise, remove vowels to reduce length - proposed_prefix = "".join( - [c for c in to_generate_prefix_from if c not in "aeiou!@#$%^&*()_+-=,."] + path_part = split_prefix_path[-1] + # generate a prefix using the last part of the path prior to the fragment or 'identifier' + # converts to lowercase and removes punctuation characters + proposed_prefix = ( + path_part + .lower() + .translate(str.maketrans("", "", string.punctuation)) ) if not valid_prefix(proposed_prefix): - # if we still can't get a nice prefix. use an ugly but valid one using a hash of the IRI - proposed_prefix = f"ns{hash(to_generate_prefix_from)}" + # if the generated prefix is not valid use an ugly but valid one by hashing the last part of the uri + proposed_prefix = f"ns{hash(path_part)}" if not prefix_registered(proposed_prefix): prefix_graph.bind(proposed_prefix, ns) return