From d62f6a60fc20fb8c821b8bf80e5be51361fa84a7 Mon Sep 17 00:00:00 2001 From: Stefaan Lippens Date: Fri, 20 Sep 2024 15:17:03 +0200 Subject: [PATCH] Issue #150/#155 initial process_id deny list support in DeepGraphSplitter --- src/openeo_aggregator/backend.py | 3 + .../partitionedjobs/crossbackend.py | 36 ++++++++-- tests/partitionedjobs/test_crossbackend.py | 72 ++++++++++++++++++- 3 files changed, 103 insertions(+), 8 deletions(-) diff --git a/src/openeo_aggregator/backend.py b/src/openeo_aggregator/backend.py index f09b720..398aaeb 100644 --- a/src/openeo_aggregator/backend.py +++ b/src/openeo_aggregator/backend.py @@ -983,6 +983,9 @@ def supporting_backends(node_id: str, node: dict) -> Union[List[str], None]: graph_splitter = DeepGraphSplitter( supporting_backends=supporting_backends, primary_backend=split_strategy.get("crossbackend", {}).get("primary_backend"), + # TODO: instead of this hardcoded deny-list, build it based on backend metadata inspection? + # TODO: make a config for this? + split_deny_list={"aggregate_spatial", "load_geojson", "load_url"}, ) else: raise ValueError(f"Invalid graph split strategy {graph_split_method!r}") diff --git a/src/openeo_aggregator/partitionedjobs/crossbackend.py b/src/openeo_aggregator/partitionedjobs/crossbackend.py index 2be30c6..076b021 100644 --- a/src/openeo_aggregator/partitionedjobs/crossbackend.py +++ b/src/openeo_aggregator/partitionedjobs/crossbackend.py @@ -30,6 +30,7 @@ import openeo from openeo import BatchJob +from openeo.util import deep_get from openeo_driver.jobregistry import JOB_STATUS from openeo_aggregator.constants import JOB_OPTION_FORCE_BACKEND @@ -51,6 +52,7 @@ SubGraphId = str NodeId = str BackendId = str +ProcessId = str # Annotation for a function that maps node information (id and its node dict) @@ -750,10 +752,17 @@ def next_nodes(node_id: NodeId) -> Iterable[NodeId]: return up, down - def produce_split_locations(self, limit: int = 20) -> Iterator[List[NodeId]]: + def produce_split_locations( + self, + limit: int = 20, + allow_split: Callable[[NodeId], bool] = lambda n: True, + ) -> Iterator[List[NodeId]]: """ Produce disjoint subgraphs that can be processed independently. + :param limit: maximum number of split locations to produce + :param allow_split: predicate to determine if a node can be split on (e.g. to deny splitting on certain nodes) + :return: iterator of node listings. Each node listing encodes a graph split (nodes ids where to split). A node listing is ordered with the following in mind: @@ -765,6 +774,8 @@ def produce_split_locations(self, limit: int = 20) -> Iterator[List[NodeId]]: the previous split. - etc """ + # TODO: allow_split: possible to make this backend-dependent in some way? + # Find nodes that have empty set of backend_candidates forsaken_nodes = self.find_forsaken_nodes() @@ -779,13 +790,11 @@ def produce_split_locations(self, limit: int = 20) -> Iterator[List[NodeId]]: articulation_points: Set[NodeId] = set(self.find_articulation_points()) _log.debug(f"_GraphViewer.produce_split_locations: {articulation_points=}") - # TODO: allow/deny lists of what openEO processes can be split on? E.g. only split raster cube paths - # Walk upstream from forsaken nodes to find articulation points, where we can cut split_options = [ n for n in self.walk_upstream_nodes(seeds=forsaken_nodes, include_seeds=False) - if n in articulation_points + if n in articulation_points and allow_split(n) ] _log.debug(f"_GraphViewer.produce_split_locations: {split_options=}") if not split_options: @@ -799,7 +808,7 @@ def produce_split_locations(self, limit: int = 20) -> Iterator[List[NodeId]]: assert not up.find_forsaken_nodes() # Recursively split downstream part if necessary if down.find_forsaken_nodes(): - down_splits = list(down.produce_split_locations(limit=max(limit - 1, 1))) + down_splits = list(down.produce_split_locations(limit=max(limit - 1, 1), allow_split=allow_split)) else: down_splits = [[]] @@ -834,11 +843,20 @@ def split_at_multiple(self, split_nodes: List[NodeId]) -> Dict[Union[NodeId, Non class DeepGraphSplitter(ProcessGraphSplitterInterface): """ More advanced graph splitting (compared to just splitting off `load_collection` nodes) + + :param split_deny_list: list of process ids that should not be split on """ - def __init__(self, supporting_backends: SupportingBackendsMapper, primary_backend: Optional[BackendId] = None): + def __init__( + self, + supporting_backends: SupportingBackendsMapper, + primary_backend: Optional[BackendId] = None, + split_deny_list: Iterable[ProcessId] = (), + ): self._supporting_backends_mapper = supporting_backends self._primary_backend = primary_backend + # TODO also support other deny mechanisms, e.g. callable instead of a deny list? + self._split_deny_list = set(split_deny_list) def _pick_backend(self, backend_candidates: Union[frozenset[BackendId], None]) -> BackendId: if backend_candidates is None: @@ -855,7 +873,11 @@ def split(self, process_graph: FlatPG) -> _PGSplitResult: flat_graph=process_graph, supporting_backends=self._supporting_backends_mapper ) - for split_nodes in graph.produce_split_locations(): + def allow_split(node_id: NodeId) -> bool: + process_id = deep_get(process_graph, node_id, "process_id", default=None) + return process_id not in self._split_deny_list + + for split_nodes in graph.produce_split_locations(allow_split=allow_split): _log.debug(f"DeepGraphSplitter.split: evaluating split nodes: {split_nodes=}") split_views = graph.split_at_multiple(split_nodes=split_nodes) diff --git a/tests/partitionedjobs/test_crossbackend.py b/tests/partitionedjobs/test_crossbackend.py index 0be8664..adf0e65 100644 --- a/tests/partitionedjobs/test_crossbackend.py +++ b/tests/partitionedjobs/test_crossbackend.py @@ -962,9 +962,36 @@ def test_produce_split_locations_merge_longer_triangle(self): ) assert list(graph.produce_split_locations(limit=4)) == [["bands2"], ["mask1"], ["lc2"], ["lc1"]] + def test_produce_split_locations_allow_split(self): + """Usage of custom allow_list predicate""" + flat = { + # lc1 lc2 + # | | + # bands1 bands2 + # \ / + # merge1 + "lc1": {"process_id": "load_collection", "arguments": {"id": "S1"}}, + "bands1": {"process_id": "filter_bands", "arguments": {"data": {"from_node": "lc1"}, "bands": ["B01"]}}, + "lc2": {"process_id": "load_collection", "arguments": {"id": "S2"}}, + "bands2": {"process_id": "filter_bands", "arguments": {"data": {"from_node": "lc2"}, "bands": ["B02"]}}, + "merge1": { + "process_id": "merge_cubes", + "arguments": {"cube1": {"from_node": "bands1"}, "cube2": {"from_node": "bands2"}}, + }, + } + graph = _GraphViewer.from_flat_graph( + flat, + supporting_backends=supporting_backends_from_node_id_dict({"lc1": ["b1"], "lc2": ["b2"]}), + ) + assert list(graph.produce_split_locations()) == [["bands1"], ["bands2"], ["lc1"], ["lc2"]] + assert list(graph.produce_split_locations(allow_split=lambda n: n not in {"bands1", "lc2"})) == [ + ["bands2"], + ["lc1"], + ] + class TestDeepGraphSplitter: - def test_simple_no_split(self): + def test_no_split(self): splitter = DeepGraphSplitter(supporting_backends=supporting_backends_from_node_id_dict({"lc1": ["b1"]})) flat = { "lc1": {"process_id": "load_collection", "arguments": {"id": "S2"}}, @@ -1150,3 +1177,46 @@ def test_split_with_primary_backend(self, primary_backend, secondary_graph): primary_backend_id=primary_backend, secondary_graphs=[secondary_graph], ) + + @pytest.mark.parametrize( + ["split_deny_list", "split_node", "primary_node_ids", "secondary_node_ids"], + [ + ({}, "temporal2", {"lc1", "bands1", "temporal2", "merge"}, {"lc2", "temporal2"}), + ({"filter_bands", "filter_temporal"}, "lc2", {"lc1", "lc2", "bands1", "temporal2", "merge"}, {"lc2"}), + ], + ) + def test_split_deny_list(self, split_deny_list, split_node, primary_node_ids, secondary_node_ids): + """ + Simple deep split use case: + two load_collections from different backends, with some additional filtering, merged. + """ + splitter = DeepGraphSplitter( + supporting_backends=supporting_backends_from_node_id_dict({"lc1": ["b1"], "lc2": ["b2"]}), + primary_backend="b1", + split_deny_list=split_deny_list, + ) + flat = { + # lc1 lc2 + # | | + # bands1 temporal2 + # \ / + # merge + "lc1": {"process_id": "load_collection", "arguments": {"id": "S1"}}, + "lc2": {"process_id": "load_collection", "arguments": {"id": "S2"}}, + "bands1": {"process_id": "filter_bands", "arguments": {"data": {"from_node": "lc1"}, "bands": ["B01"]}}, + "temporal2": { + "process_id": "filter_temporal", + "arguments": {"data": {"from_node": "lc2"}, "extent": ["2022", "2023"]}, + }, + "merge": { + "process_id": "merge_cubes", + "arguments": {"cube1": {"from_node": "bands1"}, "cube2": {"from_node": "temporal2"}}, + "result": True, + }, + } + result = splitter.split(flat) + assert result == _PGSplitResult( + primary_node_ids=primary_node_ids, + primary_backend_id="b1", + secondary_graphs=[_PGSplitSubGraph(split_node=split_node, node_ids=secondary_node_ids, backend_id="b2")], + )