From a5e647233ac41011511e8353c0acc688d25fdb67 Mon Sep 17 00:00:00 2001 From: Lawon Lewis Date: Wed, 20 Nov 2024 16:43:55 +1000 Subject: [PATCH] fix: filter out bnodes from fts search results --- .../query_generation/search_fuseki_fts.py | 117 +++++++++++++++--- 1 file changed, 97 insertions(+), 20 deletions(-) diff --git a/prez/services/query_generation/search_fuseki_fts.py b/prez/services/query_generation/search_fuseki_fts.py index ed89df7f..25c7da63 100644 --- a/prez/services/query_generation/search_fuseki_fts.py +++ b/prez/services/query_generation/search_fuseki_fts.py @@ -5,12 +5,18 @@ from rdflib.namespace import RDF, RDFS from sparql_grammar_pydantic import ( IRI, + AdditiveExpression, + BrackettedExpression, BuiltInCall, CollectionPath, + ConditionalAndExpression, + ConditionalOrExpression, + Constraint, ConstructQuery, ConstructTemplate, ConstructTriples, Expression, + Filter, GraphNodePath, GraphPatternNotTriples, GraphTerm, @@ -19,6 +25,8 @@ GroupOrUnionGraphPattern, LimitClause, LimitOffsetClauses, + MultiplicativeExpression, + NumericExpression, ObjectListPath, ObjectPath, OffsetClause, @@ -33,6 +41,7 @@ PropertyListPath, PropertyListPathNotEmpty, RDFLiteral, + RelationalExpression, SelectClause, SG_Path, SolutionModifier, @@ -41,6 +50,8 @@ TriplesNodePath, TriplesSameSubject, TriplesSameSubjectPath, + UnaryExpression, + ValueLogical, Var, VarOrTerm, VerbPath, @@ -59,27 +70,41 @@ class SearchQueryFusekiFTS(ConstructQuery): :param limit: sparql limit clause :param offset: sparql offset clause :param non_shacl_predicates: list of predicates to search over (must be indexed) + :param shacl_tssp_preds: list of triples same subject paths and search predicate + (typically generated from a ) + :param tss_list: list of triples same subject paths + (typically generated from a ) generates a query of the form .. code:: sparql - construct { + CONSTRUCT { + ?prof_101_node_1 ?fts_search_node . + ?focus_node ?prof_101_node_1 . ?hashID . ?hashID ?focus_node . ?hashID ?match . ?hashID ?pred . ?hashID ?weight } - where { - select ?focus_node ?pred ?match ?weight (URI(CONCAT("urn:hash:", SHA256(CONCAT(STR(?focus_node), STR(?pred), STR(?match), STR(?weight))))) as ?hashID) - where { - (?focus_node ?weight ?match ?g ?pred) text:query ( "") + WHERE { + SELECT ?focus_node ?pred ?match ?weight (URI(CONCAT("urn:hash:", SHA256(CONCAT(STR(?focus_node), STR(?pred), STR(?match), STR(?weight))))) AS ?hashID) + WHERE { + { + (?focus_node ?weight ?match ?g ?pred) ( "search+term") + } + UNION + { + (?fts_search_node ?weight ?match ?g ?pred) ( "search+term") . + ?prof_101_node_1 ?fts_search_node . + ?focus_node ?prof_101_node_1 + } } + ORDER BY DESC( ?weight ) + LIMIT + OFFSET } - order by desc(?weight) - limit - offset NOTE: By default the search phrase given by `term` will be split by whitespace and concatenated together with '+' as this @@ -98,6 +123,10 @@ def __init__( ) = None, tss_list: list[TriplesSameSubjectPath] | None = None, ): + if not any([bool(non_shacl_predicates), bool(shacl_tssp_preds)]): + raise ValueError( + "At least one of `non_shacl_predicates` and `shacl_tssp_preds` must be given" + ) limit += 1 # increase the limit by one, so we know if there are further pages of results. # join search terms with '+' for better results term = "+".join(term.split(" ")) @@ -235,21 +264,70 @@ def _generate_fts_triples_block( ) ggp_list = [] + bnode_filter = Filter( + constraint=Constraint( + content=BrackettedExpression( + expression=Expression( + conditional_or_expression=ConditionalOrExpression( + conditional_and_expressions=[ + ConditionalAndExpression( + value_logicals=[ + ValueLogical( + relational_expression=RelationalExpression( + left=NumericExpression( + additive_expression=AdditiveExpression( + base_expression=MultiplicativeExpression( + base_expression=UnaryExpression( + operator="!", + primary_expression=PrimaryExpression( + content=BuiltInCall( + function_name="isBLANK", + arguments=[ + sr_uri + ], + ) + ), + ) + ) + ) + ) + ) + ) + ] + ) + ] + ) + ) + ) + ) + ) if non_shacl_predicates: direct_preds_tb = _generate_fts_triples_block(non_shacl_predicates) direct_preds_ggp = GroupGraphPattern( - content=GroupGraphPatternSub(triples_block=direct_preds_tb) + content=GroupGraphPatternSub( + graph_patterns_or_triples_blocks=[ + direct_preds_tb, + GraphPatternNotTriples(content=bnode_filter), + ] + ) ) ggp_list.append(direct_preds_ggp) - for tssp_list, preds in shacl_tssp_preds: - path_preds_tb = _generate_fts_triples_block( - preds, Var(value="fts_search_node") - ) - path_preds_tb.triples_block = TriplesBlock.from_tssp_list(tssp_list) - path_preds_ggp = GroupGraphPattern( - content=GroupGraphPatternSub(triples_block=path_preds_tb) - ) - ggp_list.append(path_preds_ggp) + if shacl_tssp_preds: + for tssp_list, preds in shacl_tssp_preds: + path_preds_tb = _generate_fts_triples_block( + preds, Var(value="fts_search_node") + ) + path_preds_tb.triples_block = TriplesBlock.from_tssp_list(tssp_list) + path_preds_ggp = GroupGraphPattern( + content=GroupGraphPatternSub( + graph_patterns_or_triples_blocks=[ + path_preds_tb, + GraphPatternNotTriples(content=bnode_filter), + ] + ) + ) + ggp_list.append(path_preds_ggp) + gpnt = GraphPatternNotTriples( content=GroupOrUnionGraphPattern(group_graph_patterns=ggp_list) ) @@ -337,8 +415,6 @@ def _generate_fts_triples_block( ) ) ) - - # logger.debug(f"constructed Fuseki FTS query:\n{self}") super().__init__( construct_template=construct_template, where_clause=where_clause, @@ -394,3 +470,4 @@ def inner_select_gpnt(self): offset=0, non_shacl_predicates=[RDFS.label, RDFS.comment], ) + logger.debug(fts_query)