Skip to content

Commit

Permalink
update prefix generation function
Browse files Browse the repository at this point in the history
keep vowels but remove all punctuation characters as per #287
  • Loading branch information
lalewis1 committed Oct 24, 2024
1 parent 6cfbf0c commit dbb4092
Showing 1 changed file with 14 additions and 16 deletions.
30 changes: 14 additions & 16 deletions prez/services/curie_functions.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import re
import string
from urllib.parse import urlparse

from aiocache import caches
Expand Down Expand Up @@ -35,10 +36,10 @@ def namespace_registered(namespace):
def valid_prefix(prefix: str):
"""For turtle serialization, as per https://www.w3.org/TR/turtle/#grammar-production-PN_PREFIX"""
valid = True
PN_CHARS_BASE = r"([A-Z]|[a-z]|[\u00C0-\u00D6]|[\u00D8-\u00F6]|[\u00F8-\u02FF]|[\u0370-\u037D]|[\u037F-\u1FFF]|[\u200C-\u200D]|[\u2070-\u218F]|[\u2C00-\u2FEF]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]|[\U00010000-\U000EFFFF])"
PN_CHARS_U = rf"({PN_CHARS_BASE}|_)"
PN_CHARS = rf"({PN_CHARS_U}|-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])"
PN_PREFIX = rf"({PN_CHARS_BASE}(({PN_CHARS}|.)*{PN_CHARS})?)"
PN_CHARS_BASE = "([A-Z]|[a-z]|[\u00C0-\u00D6]|[\u00D8-\u00F6]|[\u00F8-\u02FF]|[\u0370-\u037D]|[\u037F-\u1FFF]|[\u200C-\u200D]|[\u2070-\u218F]|[\u2C00-\u2FEF]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]|[\U00010000-\U000EFFFF])"
PN_CHARS_U = f"({PN_CHARS_BASE}|_)"
PN_CHARS = f"({PN_CHARS_U}|-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])"
PN_PREFIX = f"({PN_CHARS_BASE}(({PN_CHARS}|.)*{PN_CHARS})?)"
matches = re.match(PN_PREFIX, prefix)
if not matches:
valid = False
Expand All @@ -57,20 +58,17 @@ def generate_new_prefix(uri):

split_prefix_path = ns[:-1].rsplit("/", 1)
if len(split_prefix_path) > 1:
to_generate_prefix_from = split_prefix_path[-1].lower()
# attempt to just use the last part of the path prior to the fragment or "identifier"
if len(to_generate_prefix_from) <= 6:
proposed_prefix = to_generate_prefix_from
if not prefix_registered(proposed_prefix):
prefix_graph.bind(proposed_prefix, ns)
return
# otherwise, remove vowels to reduce length
proposed_prefix = "".join(
[c for c in to_generate_prefix_from if c not in "aeiou!@#$%^&*()_+-=,."]
path_part = split_prefix_path[-1]
# generate a prefix using the last part of the path prior to the fragment or 'identifier'
# converts to lowercase and removes punctuation characters
proposed_prefix = (
path_part
.lower()
.translate(str.maketrans("", "", string.punctuation))
)
if not valid_prefix(proposed_prefix):
# if we still can't get a nice prefix. use an ugly but valid one using a hash of the IRI
proposed_prefix = f"ns{hash(to_generate_prefix_from)}"
# if the generated prefix is not valid use an ugly but valid one by hashing the last part of the uri
proposed_prefix = f"ns{hash(path_part)}"
if not prefix_registered(proposed_prefix):
prefix_graph.bind(proposed_prefix, ns)
return
Expand Down

0 comments on commit dbb4092

Please sign in to comment.