diff --git a/arango_rdf/main.py b/arango_rdf/main.py index 7879da3..dcae8b6 100644 --- a/arango_rdf/main.py +++ b/arango_rdf/main.py @@ -1456,33 +1456,41 @@ def migrate_unknown_resources( def migrate_edges_to_attributes( self, graph_name: str, - edge_collection_name: str, + edge_path: list[str], attribute_name: Optional[str] = None, edge_direction: str = "OUTBOUND", + max_depth: int = 1, sort_clause: Optional[str] = None, return_clause: Optional[str] = None, filter_clause: Optional[str] = None, + traversal_options: Optional[dict[str, Any]] = None, ) -> int: """RDF --> ArangoDB (PGT): Migrate all edges in the specified edge collection to attributes. This method is useful when combined with the **resource_collection_name** parameter of the :func:`rdf_to_arangodb_by_pgt` method. - NOTE: It is recommended to run this method with **edge_collection_name** set - to **"type"** after :func:`rdf_to_arangodb_by_pgt` if the user has set the + NOTE: It is recommended to run this method with **edge_path** set + to **["type"]** after :func:`rdf_to_arangodb_by_pgt` if the user has set the **resource_collection_name** parameter. :param graph_name: The name of the graph to migrate the edges from. :type graph_name: str - :param edge_collection_name: The name of the edge collection to migrate. - :type edge_collection_name: str + :param edge_path: The path of the edges to migrate. The first element is the + starting edge collection, the last element is the ending edge collection. + Can also include edge direction traversal + (e.g ["OUTBOUND type", "OUTBOUND subClassOf"]). + :type edge_path: list[str] + :param edge_direction: The default traversal direction of the edges to migrate. + Defaults to **OUTBOUND**. + :type edge_direction: str + :param max_depth: The maximum depth of the edge path to migrate. + Defaults to 1. + :type max_depth: int :param attribute_name: The name of the attribute to migrate the edges to. 
- Defaults to **edge_collection_name**, prefixed with the + Defaults to **edge_path[0]**, prefixed with the **rdf_attribute_prefix** parameter set in the constructor. :type attribute_name: Optional[str] - :param edge_direction: The direction of the edges to migrate. - Defaults to **OUTBOUND**. - :type edge_direction: str :param sort_clause: A SORT statement to order the traversed vertices. Defaults to f"v.{self.__rdf_attribute_prefix}label". If set to None, the vertex values will be ordered based on their traversal order. @@ -1495,6 +1503,9 @@ def migrate_edges_to_attributes( :param filter_clause: A FILTER statement to filter the traversed edges & target vertices. Defaults to None. :type filter_clause: Optional[str] + :param traversal_options: A dictionary of traversal options to pass to the + AQL query. Defaults to None. + :type traversal_options: Optional[dict[str, Any]] :return: The number of documents updated. :rtype: int """ @@ -1507,35 +1518,51 @@ def migrate_edges_to_attributes( graph = self.db.graph(graph_name) - target_e_d = {} + # Remove potential INBOUND/OUTBOUND/ANY prefix + # (e.g ["OUTBOUND type", "OUTBOUND subClassOf"]) + edge_path_cleaned = [e_col.split(" ")[-1] for e_col in edge_path] + start_edge_collection = edge_path_cleaned[0] + + start_node_collections = [] + all_e_ds = [] for e_d in graph.edge_definitions(): - if e_d["edge_collection"] == edge_collection_name: - target_e_d = e_d - break + if e_d["edge_collection"] == start_edge_collection: + start_node_collections = e_d["from_vertex_collections"] - if not target_e_d: - m = f"No edge definition found for '{edge_collection_name}' in graph '{graph_name}'. Cannot migrate edges to attributes." # noqa: E501 + if e_d["edge_collection"] in edge_path_cleaned: + all_e_ds.append(e_d) + + if not all_e_ds: + m = f"No edge definitions found for '{edge_path}' in graph '{graph_name}'. Cannot migrate edges to attributes." 
# noqa: E501 raise ValueError(m) - if not attribute_name: - attribute_name = f"{self.__rdf_attribute_prefix}{edge_collection_name}" + if attribute_name is None: + attribute_name = f"{self.__rdf_attribute_prefix}{start_edge_collection}" - if not sort_clause: + if sort_clause is None: sort_clause = f"v.{self.__rdf_label_attr}" - if not return_clause: + if return_clause is None: return_clause = f"v.{self.__rdf_label_attr}" - with_cols = set(target_e_d["to_vertex_collections"]) + if traversal_options is None: + traversal_options = { + "uniqueVertices": "path", + "uniqueEdges": "path", + } + + with_cols = {col for e_d in all_e_ds for col in e_d["to_vertex_collections"]} with_cols_str = "WITH " + ", ".join(with_cols) + e_cols = ", ".join(edge_path_cleaned) count = 0 - for v_col in target_e_d["from_vertex_collections"]: + for v_col in start_node_collections: query = f""" {with_cols_str} FOR doc IN @@v_col LET labels = ( - FOR v, e IN 1 {edge_direction} doc @@e_col + FOR v, e IN 1..{max_depth} {edge_direction} doc {e_cols} + OPTIONS {json.dumps(traversal_options)} {f"FILTER {filter_clause}" if filter_clause else ""} {f"SORT {sort_clause}" if sort_clause else ""} RETURN {return_clause} @@ -1544,9 +1571,7 @@ def migrate_edges_to_attributes( UPDATE doc WITH {{{attribute_name}: labels}} IN @@v_col """ - self.db.aql.execute( - query, bind_vars={"@v_col": v_col, "@e_col": edge_collection_name} - ) + self.db.aql.execute(query, bind_vars={"@v_col": v_col}) count += self.db.collection(v_col).count() diff --git a/docs/rdf_to_arangodb_lpg.rst b/docs/rdf_to_arangodb_lpg.rst index b7d0e4d..329be3f 100644 --- a/docs/rdf_to_arangodb_lpg.rst +++ b/docs/rdf_to_arangodb_lpg.rst @@ -21,6 +21,7 @@ Consider the following RDF graph: .. code-block:: turtle @prefix ex: . + @prefix rdfs: . ex:Alice a ex:Person ; ex:name "Alice" ; @@ -32,6 +33,8 @@ Consider the following RDF graph: ex:Alice ex:friend ex:Bob . + ex:Person rdfs:subClassOf ex:Human . 
+ Running the LPG transformation produces a graph with: * **2 vertices** in the ``Node`` collection (``ex:Alice`` & ``ex:Bob``) @@ -80,6 +83,26 @@ After the migration each vertex has an ``_type`` array property – ``["Person"]`` in this example – and the original ``rdf:type`` edges remain untouched. Delete them if you do not need them any more. +In addition to a single edge collection, the **edge_path** parameter can traverse a 2nd Order edge collection to apply +the same attribute (but at the 2nd Order) to the original target vertices. In PGT, a common use case is to +set **edge_path** to **["type", "subClassOf"]** (together with a **max_depth** greater than 1) +for inferring the **_type** attribute. + +In LPG, where all edges live in a single collection, this can be done with ``filter_clause`` and ``max_depth``: + +.. code-block:: python + + adbrdf.migrate_edges_to_attributes( + graph_name="DemoGraph", + edge_path=["Edge", "Edge"], + attribute_name="_type", + filter_clause="e._label IN ['type', 'subClassOf']", + max_depth=10, + ) + +After this migration, the ``_type`` attribute of ``ex:Alice`` and ``ex:Bob`` will be adjusted to ``["Person", "Human"]``.
+ + LPG Collection Mapping Process ============================== diff --git a/tests/test_main.py b/tests/test_main.py index a075cfb..d9fb8fa 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -5442,7 +5442,7 @@ def test_pgt_resource_collection_name_and_set_types_attribute() -> None: for node in db.collection("Node"): assert "_type" not in node - count = adbrdf.migrate_edges_to_attributes("Test", "type") + count = adbrdf.migrate_edges_to_attributes("Test", ["type"]) node_col = db.collection("Node") assert set(node_col.get(adbrdf.hash("http://example.com/Alice"))["_type"]) == { @@ -5475,7 +5475,7 @@ def test_pgt_resource_collection_name_and_set_types_attribute() -> None: for v in db.collection("Company"): assert "_type" not in v - count = adbrdf.migrate_edges_to_attributes("Test", "type", "foo") + count = adbrdf.migrate_edges_to_attributes("Test", ["type"], "foo") assert count == 3 for v in db.collection("Human"): @@ -5484,9 +5484,7 @@ def test_pgt_resource_collection_name_and_set_types_attribute() -> None: for v in db.collection("Company"): assert set(v["foo"]) == {"Organization", "Company"} - count = adbrdf.migrate_edges_to_attributes( - graph_name="Test", edge_collection_name="friend" - ) + count = adbrdf.migrate_edges_to_attributes(graph_name="Test", edge_path=["friend"]) alice = db.collection("Human").get(adbrdf.hash("http://example.com/Alice")) assert alice["_friend"] == ["Bob"] @@ -5497,7 +5495,7 @@ def test_pgt_resource_collection_name_and_set_types_attribute() -> None: assert count == 2 count = adbrdf.migrate_edges_to_attributes( - graph_name="Test", edge_collection_name="friend", edge_direction="ANY" + graph_name="Test", edge_path=["friend"], edge_direction="ANY" ) assert count == 2 @@ -5510,23 +5508,19 @@ def test_pgt_resource_collection_name_and_set_types_attribute() -> None: with pytest.raises(ValueError) as e: adbrdf.migrate_edges_to_attributes( - graph_name="Test", edge_collection_name="friend", edge_direction="INVALID" + graph_name="Test", 
edge_path=["friend"], edge_direction="INVALID" ) assert "Invalid edge direction: INVALID" in str(e.value) with pytest.raises(ValueError) as e: - adbrdf.migrate_edges_to_attributes( - graph_name="Test", edge_collection_name="INVALID" - ) + adbrdf.migrate_edges_to_attributes(graph_name="Test", edge_path=["INVALID"]) - m = "No edge definition found for 'INVALID' in graph 'Test'. Cannot migrate edges to attributes." # noqa: E501 + m = "No edge definitions found for '['INVALID']' in graph 'Test'. Cannot migrate edges to attributes." # noqa: E501 assert m in str(e.value) with pytest.raises(ValueError) as e: - adbrdf.migrate_edges_to_attributes( - graph_name="INVALID", edge_collection_name="friend" - ) + adbrdf.migrate_edges_to_attributes(graph_name="INVALID", edge_path=["friend"]) assert "Graph 'INVALID' does not exist" in str(e.value) @@ -5627,7 +5621,7 @@ def test_lpg() -> None: assert "_type" not in node adbrdf.migrate_edges_to_attributes( - "Test", "Edge", "_type", filter_clause="e._label == 'type'" + "Test", ["Edge"], "_type", filter_clause="e._label == 'type'" ) for node in db.collection("Node"): @@ -5702,3 +5696,121 @@ def import_rdf(graph_name: str, rdf_graph: RDFGraph) -> str: assert db.collection("Node").count() == 3 assert db.collection("Property").count() == 2 assert db.collection("knows").count() == 2 + + +def test_migrate_edges_to_attributes_max_depth() -> None: + db.delete_graph("Test", drop_collections=True, ignore_missing=True) + + g = RDFGraph() + g.parse( + data=""" + @prefix ex: . + @prefix rdfs: . + + ex:Alice a ex:Human . + + ex:Bob a ex:Person . + + ex:Charlie a ex:Animal . + + ex:Dana a ex:Entity . + + ex:Eve a ex:Human . + ex:Eve a ex:Person . + + ex:Fred a ex:Human . + ex:Fred a ex:Individual . + + ex:Human rdfs:subClassOf ex:Animal . + ex:Person rdfs:subClassOf ex:Individual . + ex:Animal rdfs:subClassOf ex:Entity . + ex:Individual rdfs:subClassOf ex:Entity . 
+ """, + format="turtle", + ) + + adbrdf.rdf_to_arangodb_by_pgt("Test", g, resource_collection_name="Node") + + assert db.collection("subClassOf").count() == 4 + + adbrdf.migrate_edges_to_attributes( + graph_name="Test", + edge_path=["type", "subClassOf"], + max_depth=1, + ) + + alice = db.collection("Node").get(adbrdf.hash("http://example.com/Alice")) + assert set(alice["_type"]) == {"Human"} + + bob = db.collection("Node").get(adbrdf.hash("http://example.com/Bob")) + assert set(bob["_type"]) == {"Person"} + + charlie = db.collection("Node").get(adbrdf.hash("http://example.com/Charlie")) + assert set(charlie["_type"]) == {"Animal"} + + dana = db.collection("Node").get(adbrdf.hash("http://example.com/Dana")) + assert set(dana["_type"]) == {"Entity"} + + eve = db.collection("Node").get(adbrdf.hash("http://example.com/Eve")) + assert set(eve["_type"]) == {"Human", "Person"} + + fred = db.collection("Node").get(adbrdf.hash("http://example.com/Fred")) + assert set(fred["_type"]) == {"Human", "Individual"} + + db.delete_graph("Test", drop_collections=True) + + adbrdf.rdf_to_arangodb_by_pgt("Test", g, resource_collection_name="Node") + + adbrdf.migrate_edges_to_attributes( + graph_name="Test", + edge_path=["type", "subClassOf"], + max_depth=2, + ) + + alice = db.collection("Node").get(adbrdf.hash("http://example.com/Alice")) + assert set(alice["_type"]) == {"Human", "Animal"} + + bob = db.collection("Node").get(adbrdf.hash("http://example.com/Bob")) + assert set(bob["_type"]) == {"Person", "Individual"} + + charlie = db.collection("Node").get(adbrdf.hash("http://example.com/Charlie")) + assert set(charlie["_type"]) == {"Animal", "Entity"} + + dana = db.collection("Node").get(adbrdf.hash("http://example.com/Dana")) + assert set(dana["_type"]) == {"Entity"} + + eve = db.collection("Node").get(adbrdf.hash("http://example.com/Eve")) + assert set(eve["_type"]) == {"Human", "Person", "Animal", "Individual"} + + fred = 
db.collection("Node").get(adbrdf.hash("http://example.com/Fred")) + assert set(fred["_type"]) == {"Human", "Individual", "Animal", "Entity"} + + db.delete_graph("Test", drop_collections=True) + + adbrdf.rdf_to_arangodb_by_pgt("Test", g, resource_collection_name="Node") + + adbrdf.migrate_edges_to_attributes( + graph_name="Test", + edge_path=["type", "subClassOf"], + max_depth=3, + ) + + alice = db.collection("Node").get(adbrdf.hash("http://example.com/Alice")) + assert set(alice["_type"]) == {"Human", "Animal", "Entity"} + + bob = db.collection("Node").get(adbrdf.hash("http://example.com/Bob")) + assert set(bob["_type"]) == {"Person", "Individual", "Entity"} + + charlie = db.collection("Node").get(adbrdf.hash("http://example.com/Charlie")) + assert set(charlie["_type"]) == {"Animal", "Entity"} + + dana = db.collection("Node").get(adbrdf.hash("http://example.com/Dana")) + assert set(dana["_type"]) == {"Entity"} + + eve = db.collection("Node").get(adbrdf.hash("http://example.com/Eve")) + assert set(eve["_type"]) == {"Human", "Person", "Animal", "Individual", "Entity"} + + fred = db.collection("Node").get(adbrdf.hash("http://example.com/Fred")) + assert set(fred["_type"]) == {"Human", "Individual", "Entity", "Animal"} + + db.delete_graph("Test", drop_collections=True)