diff --git a/nemo_curator/backends/base.py b/nemo_curator/backends/base.py index e302a37eb6..fdf76f3dc7 100644 --- a/nemo_curator/backends/base.py +++ b/nemo_curator/backends/base.py @@ -14,6 +14,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass +from pathlib import Path from typing import TYPE_CHECKING, Any from nemo_curator.core.utils import ignore_ray_head_node @@ -52,8 +53,22 @@ def __init__(self, config: dict[str, Any] | None = None, ignore_head_node: bool self.ignore_head_node = ignore_head_node or ignore_ray_head_node() @abstractmethod - def execute(self, stages: list["ProcessingStage"], initial_tasks: list[Task] | None = None) -> None: - """Execute the pipeline.""" + def execute( + self, + stages: list["ProcessingStage"], + initial_tasks: list[Task] | None = None, + checkpoint_path: str | Path | None = None, + ) -> None: + """Execute the pipeline. + + Args: + stages: Execution stages to run. + initial_tasks: Initial tasks. Empty list / ``EmptyTask`` is used when ``None``. + checkpoint_path: If provided, lineage records (parents, children, type, + completed flag) for every task that flows through the pipeline are + persisted to an LMDB file at this path. The file is owned by a + single Ray actor and is safe to place on NFS/Lustre. 
+ """ class BaseStageAdapter: diff --git a/nemo_curator/backends/ray_actor_pool/executor.py b/nemo_curator/backends/ray_actor_pool/executor.py index b69d9e52e3..0c2c42f968 100644 --- a/nemo_curator/backends/ray_actor_pool/executor.py +++ b/nemo_curator/backends/ray_actor_pool/executor.py @@ -14,6 +14,7 @@ import uuid from copy import deepcopy +from pathlib import Path from typing import TYPE_CHECKING import numpy as np @@ -25,6 +26,7 @@ from nemo_curator.backends.base import BaseExecutor from nemo_curator.backends.utils import RayStageSpecKeys, execute_setup_on_node, register_loguru_serializer from nemo_curator.tasks import EmptyTask, Task +from nemo_curator.utils.lineage_store import LINEAGE_ACTOR_NAME, LineageWriterActor from .adapter import RayActorPoolStageAdapter from .raft_adapter import RayActorPoolRAFTAdapter @@ -78,12 +80,19 @@ def __init__( self.show_progress = show_progress self.progress_interval = progress_interval - def execute(self, stages: list["ProcessingStage"], initial_tasks: list[Task] | None = None) -> list[Task]: # noqa: PLR0912 + def execute( # noqa: PLR0912, PLR0915, C901 + self, + stages: list["ProcessingStage"], + initial_tasks: list[Task] | None = None, + checkpoint_path: str | Path | None = None, + ) -> list[Task]: """Execute the pipeline stages using ActorPool. Args: stages: List of processing stages to execute initial_tasks: Initial tasks to process (can be None for empty start) + checkpoint_path: If provided, spawn a :class:`LineageWriterActor` that + records the task DAG to LMDB at this path for the duration of the run. 
Returns: List of final processed tasks @@ -93,10 +102,19 @@ def execute(self, stages: list["ProcessingStage"], initial_tasks: list[Task] | N session_id = uuid.uuid4().bytes + lineage_actor = None try: # Initialize Ray and register loguru serializer register_loguru_serializer() ray.init(ignore_reinit_error=True, runtime_env=_parse_runtime_env(self.config.get("runtime_env", {}))) + if checkpoint_path is not None: + absolute_checkpoint_path = str(Path(checkpoint_path).absolute()) + lineage_actor = LineageWriterActor.options( + name=LINEAGE_ACTOR_NAME, + lifetime="detached", + get_if_exists=True, + ).remote(path=absolute_checkpoint_path) + logger.info(f"Spawned LineageWriterActor; checkpoint at {absolute_checkpoint_path}") # Execute setup on node for all stages BEFORE processing begins execute_setup_on_node(stages, ignore_head_node=self.ignore_head_node) @@ -157,9 +175,15 @@ def execute(self, stages: list["ProcessingStage"], initial_tasks: list[Task] | N # Return final results directly - no need for ray.get() final_results = current_tasks or [] logger.info(f"\nPipeline completed. Final results: {len(final_results)} tasks") return final_results finally: + if lineage_actor is not None: + try: + ray.get(lineage_actor.close.remote()) + except Exception as e: # noqa: BLE001 + logger.warning(f"Failed to close LineageWriterActor: {e}") + ray.kill(lineage_actor) # Clean up all Ray resources including named actors logger.info("Shutting down Ray to clean up all resources...") ray.shutdown() diff --git a/nemo_curator/backends/ray_data/executor.py b/nemo_curator/backends/ray_data/executor.py index 19e78d8c2f..6441346042 100644 --- a/nemo_curator/backends/ray_data/executor.py +++ b/nemo_curator/backends/ray_data/executor.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License.
+from pathlib import Path from typing import TYPE_CHECKING, Any import ray @@ -21,6 +22,7 @@ from nemo_curator.backends.base import BaseExecutor from nemo_curator.backends.utils import execute_setup_on_node, register_loguru_serializer from nemo_curator.tasks import EmptyTask, Task +from nemo_curator.utils.lineage_store import LINEAGE_ACTOR_NAME, LineageWriterActor from .adapter import RayDataStageAdapter @@ -41,12 +43,19 @@ class RayDataExecutor(BaseExecutor): def __init__(self, config: dict[str, Any] | None = None, ignore_head_node: bool = False): super().__init__(config, ignore_head_node) - def execute(self, stages: list["ProcessingStage"], initial_tasks: list[Task] | None = None) -> list[Task]: + def execute( + self, + stages: list["ProcessingStage"], + initial_tasks: list[Task] | None = None, + checkpoint_path: str | Path | None = None, + ) -> list[Task]: """Execute the pipeline stages using Ray Data. Args: stages (list[ProcessingStage]): List of processing stages to execute initial_tasks (list[Task], optional): Initial tasks to process (can be None for empty start) + checkpoint_path (str | Path, optional): If provided, spawn a :class:`LineageWriterActor` + that records the task DAG to LMDB at this path for the duration of the run. Returns: list[Task]: List of final processed tasks @@ -60,6 +69,7 @@ def execute(self, stages: list["ProcessingStage"], initial_tasks: list[Task] | N # Initialize with initial tasks if provided, otherwise start with EmptyTask tasks: list[Task] = initial_tasks or [EmptyTask] output_tasks: list[Task] = [] + lineage_actor = None # When runtime_env with pip is used, Ray's pip plugin sets up per-stage virtualenvs # lazily on first task dispatch by cloning the current virtualenv. The NeMo Curator # container's /opt/venv is created with `uv venv --seed` so pip is available in clones. 
@@ -69,6 +79,14 @@ def execute(self, stages: list["ProcessingStage"], initial_tasks: list[Task] | N ray.init( ignore_reinit_error=True, runtime_env={"env_vars": {"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": ""}} ) + if checkpoint_path is not None: + absolute_checkpoint_path = str(Path(checkpoint_path).absolute()) + lineage_actor = LineageWriterActor.options( + name=LINEAGE_ACTOR_NAME, + lifetime="detached", + get_if_exists=True, + ).remote(path=absolute_checkpoint_path) + logger.info(f"Spawned LineageWriterActor; checkpoint at {absolute_checkpoint_path}") # Convert tasks to dataset current_dataset = self._tasks_to_dataset(tasks) @@ -97,6 +115,12 @@ def execute(self, stages: list["ProcessingStage"], initial_tasks: list[Task] | N output_tasks = self._dataset_to_tasks(current_dataset) logger.info(f"Pipeline completed. Final results: {len(output_tasks)} tasks") finally: + if lineage_actor is not None: + try: + ray.get(lineage_actor.close.remote()) + except Exception as e: # noqa: BLE001 + logger.warning(f"Failed to close LineageWriterActor: {e}") + ray.kill(lineage_actor) # This ensures we unset all the env vars set above during initialize and kill the pending actors. ray.shutdown() return output_tasks diff --git a/nemo_curator/backends/xenna/executor.py b/nemo_curator/backends/xenna/executor.py index aaf51c8383..8b6e703c73 100644 --- a/nemo_curator/backends/xenna/executor.py +++ b/nemo_curator/backends/xenna/executor.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from pathlib import Path from typing import Any import ray @@ -24,6 +25,7 @@ from nemo_curator.backends.xenna.adapter import create_named_xenna_stage_adapter from nemo_curator.stages.base import ProcessingStage from nemo_curator.tasks import EmptyTask, Task +from nemo_curator.utils.lineage_store import LINEAGE_ACTOR_NAME, LineageWriterActor class XennaExecutor(BaseExecutor): @@ -59,12 +61,20 @@ def __init__(self, config: dict[str, Any] | None = None, ignore_head_node: bool "autoscale_interval_s": 180, } - def execute(self, stages: list[ProcessingStage], initial_tasks: list[Task] | None = None) -> list[Task]: + def execute( + self, + stages: list[ProcessingStage], + initial_tasks: list[Task] | None = None, + checkpoint_path: str | Path | None = None, + ) -> list[Task]: """Execute the pipeline using Cosmos-Xenna. Args: stages (list[ProcessingStage]): The stages to run initial_tasks (list[Task], optional): The initial tasks to run. Empty list of Task is used if not provided. + checkpoint_path (str | Path, optional): If provided, spawn a :class:`LineageWriterActor` + that records the task DAG (parents, children, type, completed flag) to LMDB at + this path for the duration of the run. Returns: list[Task]: List of output tasks from the pipeline @@ -134,6 +144,7 @@ def execute(self, stages: list[ProcessingStage], initial_tasks: list[Task] | Non # Log pipeline configuration logger.info(f"Execution mode: {exec_mode.name}") + lineage_actor = None try: register_loguru_serializer() # Prevent Ray from overriding accelerator env vars when num_gpus=0, letting Xenna manage them instead. 
@@ -146,6 +157,14 @@ def execute(self, stages: list[ProcessingStage], initial_tasks: list[Task] | Non } }, ) + if checkpoint_path is not None: + absolute_checkpoint_path = str(Path(checkpoint_path).absolute()) + lineage_actor = LineageWriterActor.options( + name=LINEAGE_ACTOR_NAME, + lifetime="detached", + get_if_exists=True, + ).remote(path=absolute_checkpoint_path) + logger.info(f"Spawned LineageWriterActor; checkpoint at {absolute_checkpoint_path}") # Run the pipeline (this will re-initialize ray but that'll be a no-op and the ray.init above will take precedence) results = pipelines_v1.run_pipeline(pipeline_spec) logger.info(f"Pipeline completed successfully with {len(results) if results else 0} output tasks") @@ -153,6 +172,12 @@ def execute(self, stages: list[ProcessingStage], initial_tasks: list[Task] | Non logger.error(f"Pipeline execution failed: {e}") raise finally: + if lineage_actor is not None: + try: + ray.get(lineage_actor.close.remote()) + except Exception as e: # noqa: BLE001 + logger.warning(f"Failed to close LineageWriterActor: {e}") + ray.kill(lineage_actor) # This ensures we unset all the env vars set above during initialize and kill the pending actors. ray.shutdown() return results if results else [] diff --git a/nemo_curator/pipeline/pipeline.py b/nemo_curator/pipeline/pipeline.py index 246ffcffc1..0bdbe9f2fa 100644 --- a/nemo_curator/pipeline/pipeline.py +++ b/nemo_curator/pipeline/pipeline.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from pathlib import Path from typing import Any from loguru import logger from nemo_curator.backends.base import BaseExecutor -from nemo_curator.stages.base import CompositeStage, ProcessingStage +from nemo_curator.stages.base import CompositeStage, ProcessingStage, assign_root_lineage from nemo_curator.tasks import Task @@ -80,6 +81,15 @@ def build(self) -> None: self.stages = execution_stages self.decomposition_info = decomposition_info + # 3. Flag the terminal execution stage so its default process_batch can + # incrementally mark emitted leaves completed via the lineage store. + # Reset all stages first so re-builds and instances reused across + # pipelines do not leak a stale True. + for stage in self.stages: + stage._is_terminal_stage = False + if self.stages: + self.stages[-1]._is_terminal_stage = True + def _decompose_stages( self, stages: list[ProcessingStage | CompositeStage] ) -> tuple[list[ProcessingStage], dict[str, list[str]]]: @@ -174,18 +184,31 @@ def describe(self) -> str: return "\n".join(lines) - def run(self, executor: BaseExecutor | None = None, initial_tasks: list[Task] | None = None) -> list[Task] | None: + def run( + self, + executor: BaseExecutor | None = None, + initial_tasks: list[Task] | None = None, + checkpoint_path: str | Path | None = None, + ) -> list[Task] | None: """Run the pipeline. Args: executor (BaseExecutor): Executor to use initial_tasks (list[Task], optional): Initial tasks to start the pipeline with. Defaults to None. + checkpoint_path (str | Path, optional): If provided, a single LMDB file at this path + records lineage (parents, children, task type, completed flag) for every task that + flows through the pipeline, keyed by ``_udid``. Owned by one Ray actor, so the file + may live on NFS/Lustre. When omitted, no lineage is persisted. 
Returns: list[Task] | None: List of tasks """ self.build() + if checkpoint_path is not None: + checkpoint_path = Path(checkpoint_path).absolute() + checkpoint_path.parent.mkdir(parents=True, exist_ok=True) + if executor is None: from nemo_curator.backends.xenna import XennaExecutor @@ -212,4 +235,6 @@ def run(self, executor: BaseExecutor | None = None, initial_tasks: list[Task] | "The executor will schedule GPU stages on GPUs not held by Serve." ) - return executor.execute(self.stages, initial_tasks) + if initial_tasks: + assign_root_lineage(initial_tasks) + return executor.execute(self.stages, initial_tasks, checkpoint_path=checkpoint_path) diff --git a/nemo_curator/stages/base.py b/nemo_curator/stages/base.py index 5761dfeb18..9e8968951a 100644 --- a/nemo_curator/stages/base.py +++ b/nemo_curator/stages/base.py @@ -25,6 +25,7 @@ from nemo_curator.stages.resources import Resources from nemo_curator.tasks import Task +from nemo_curator.utils.lineage_store import are_completed, mark_leaves_completed, record_lineage if TYPE_CHECKING: from nemo_curator.backends.base import NodeInfo, WorkerMetadata @@ -35,6 +36,58 @@ _STAGE_REGISTRY: dict[str, type[ProcessingStage]] = {} +def assign_child_lineage( + parent_paths: list[str], + result: Task | list[Task] | None, +) -> list[Task]: + """Normalize a stage's ``process()`` result and assign deterministic lineage. + + Each surviving ``children[i]`` gets ``_lineage_path`` and ``_udid`` derived + from ``(parent_paths, i)`` so that the same pipeline run twice on the same + inputs produces byte-identical task IDs. Call this from any custom + ``process_batch`` override to keep outputs consistent with the rest of the + pipeline. + + Children whose ``_udid`` is already set are passed through unchanged. This + happens when a stage mutates and returns the same task instance it received + (e.g. an embedder that writes results onto the input task): the framework + must not treat such a task as a new child of itself. 
+ + Args: + parent_paths: One element per logical parent (typically + ``[task._lineage_path]`` for 1:N stages, or multiple paths for + join/aggregate stages). + result: Whatever ``process()`` (or your custom batch logic) returned for + this parent set — a single task, a list, or ``None``. + + Returns: + The normalized list of children with lineage assigned. May be empty. + """ + if result is None: + return [] + children = result if isinstance(result, list) else [result] + children = [c for c in children if c is not None] + for i, child in enumerate(children): + child._set_lineage(parent_paths, i) + return children + + +def assign_root_lineage(tasks: list[Task]) -> list[Task]: + """Assign deterministic root-level lineage to initial pipeline tasks. + + Each ``tasks[i]`` gets ``_lineage_path = str(i)`` and the corresponding + ``_udid``. Tasks whose ``_udid`` is already set are left untouched + (``_set_lineage`` early-returns), so calling this twice is a no-op. + + Without this step every root carries ``_lineage_path = ""``, and the + empty-string filter in ``_set_lineage`` collapses first-stage children of + different roots onto the same lineage path, producing identical ``_udid``. + """ + for i, task in enumerate(tasks): + task._set_lineage([], i) + return tasks + + class StageMeta(ABCMeta): """Metaclass that automatically registers concrete Stage subclasses. A class is considered *concrete* if it directly inherits from @@ -86,6 +139,9 @@ class ProcessingStage(ABC, Generic[X, Y], metaclass=StageMeta): resources = Resources(cpus=1.0) batch_size = 1 runtime_env: ClassVar[dict[str, Any] | None] = None + # Set by Pipeline.build() on the final execution stage so the default + # process_batch can incrementally mark leaves completed. Do not set manually. 
+ _is_terminal_stage: bool = False @property @final @@ -168,6 +224,23 @@ def process(self, task: X) -> Y | list[Y]: - None: If the task should be filtered out """ + def _filter_completed_tasks(self, tasks: list[X]) -> list[X]: + """Drop tasks whose ``_udid`` is already marked completed in the lineage + store. No-op when no :class:`LineageWriterActor` is registered. + + Tasks with empty ``_udid`` (sources / unassigned) are never filtered. + Order is preserved for survivors. + + Stages that override :meth:`process_batch` should call this themselves + at the top of their override — same contract as + :func:`nemo_curator.utils.lineage_store.record_lineage` and + :func:`nemo_curator.utils.lineage_store.mark_leaves_completed`. + """ + if len(tasks) == 0: + return tasks + flags = are_completed([t._udid for t in tasks]) + return [t for t, done in zip(tasks, flags, strict=True) if not done] + def process_batch(self, tasks: list[X]) -> list[Y]: """Process a batch of tasks and return results. Override this method to enable batch processing for your stage. @@ -179,12 +252,30 @@ def process_batch(self, tasks: list[X]) -> list[Y]: - Single task: For 1-to-1 transformations - List of tasks: For 1-to-many transformations - None: If the task should be filtered out - Note: The returned list should have the same length as the input list, - with each element corresponding to the result of processing the task - at the same index. + + Lineage contract: every emitted child must have its ``_lineage_path`` + and ``_udid`` set so the pipeline produces deterministic IDs. The + default implementation below delegates to + :func:`assign_child_lineage` per input task. 
If you override this + method, you are responsible for calling ``assign_child_lineage`` on + each chunk of outputs that share parentage, e.g.:: + + outputs = [] + for task in tasks: + raw = self.my_batched_process(task) + outputs.extend(assign_child_lineage([task._lineage_path], raw)) + return outputs + + In-place returns are supported: if ``process()`` mutates and returns the + same task it received, ``assign_child_lineage`` will preserve that + task's existing ``_lineage_path`` / ``_udid`` rather than treating it as + a new child of itself. + + Outputs that skip this step will carry empty ``_udid``/``_lineage_path``. """ # Default implementation: process tasks one by one # This is only used as a fallback if a stage doesn't override this method + tasks = self._filter_completed_tasks(tasks) results = [] for task in tasks: if not self.validate_input(task): @@ -192,10 +283,21 @@ def process_batch(self, tasks: list[X]) -> list[Y]: raise ValueError(msg) result = self.process(task) - if isinstance(result, list): - results.extend(result) - else: - results.append(result) + # Do not forget to call the assign_child_lineage if you have overridden + # the process_batch function. This function generates unique and + # deterministic keys. + children = assign_child_lineage([task._lineage_path], result) + # If you pass a checkpoint_path to the executor, call the record_lineage + # function to build the DAG for resumability. If your stage is the + # terminal stage in a pipeline AND you override process_batch, also + # call mark_leaves_completed([c._udid for c in children]) after + # record_lineage for incremental completion marking. Overrides should + # also call self._filter_completed_tasks(tasks) at the top to honor + # resumability — same contract as the other helpers.
+ record_lineage([task._udid], [c._udid for c in children]) + if self._is_terminal_stage and children: + mark_leaves_completed([c._udid for c in children]) + results.extend(children) return results def setup_on_node(self, node_info: NodeInfo | None = None, worker_metadata: WorkerMetadata | None = None) -> None: diff --git a/nemo_curator/tasks/tasks.py b/nemo_curator/tasks/tasks.py index b2836415c1..3bfd4d5d15 100644 --- a/nemo_curator/tasks/tasks.py +++ b/nemo_curator/tasks/tasks.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import hashlib import uuid from abc import ABC, abstractmethod from dataclasses import dataclass, field @@ -40,11 +41,31 @@ class Task(ABC, Generic[T]): _stage_perf: list[StagePerfStats] = field(default_factory=list) _metadata: dict[str, Any] = field(default_factory=dict) _uuid: str = field(init=False, default_factory=lambda: str(uuid.uuid4())) + # `_lineage_path` is the index-based path of this task through the pipeline + # DAG (e.g. "3_0_7" = 4th root task, then 1st child, then 8th grandchild). + # It is propagated to children and hashed into `_udid`, the deterministic + # task id. + _lineage_path: str = field(init=False, default="") + _udid: str = field(init=False, default="") def __post_init__(self) -> None: """Post-initialization hook.""" self.validate() + def _set_lineage(self, parent_lineage_paths: list[str], child_index: int) -> bool: + """Assign deterministic lineage to this task. + + Returns ``True`` if lineage was newly assigned, ``False`` if ``_udid`` + was already set — which signals the task was returned in place by an + earlier stage and its existing lineage must be preserved. 
+ """ + if self._udid: + return False + parts = [*[p for p in parent_lineage_paths if p], str(child_index)] + self._lineage_path = "_".join(parts) + self._udid = hashlib.sha256(self._lineage_path.encode()).hexdigest()[:32] + return True + @property @abstractmethod def num_items(self) -> int: diff --git a/nemo_curator/utils/lineage_store.py b/nemo_curator/utils/lineage_store.py new file mode 100644 index 0000000000..cd8d82f785 --- /dev/null +++ b/nemo_curator/utils/lineage_store.py @@ -0,0 +1,440 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""LMDB-backed lineage store for task DAG checkpointing. + +Stores, per task ``_udid``: + +- parent ``_udid``s +- child ``_udid``s +- ``task_type`` ("source" | "middle" | "leaf" | "source_leaf") +- ``completed`` flag — set incrementally by :func:`mark_leaves_completed` as + terminal-stage leaves finish, then rolled up to ancestors by + :meth:`LineageStore.mark_completed_and_propagate` (a parent is marked only + when all its children are completed). + +Architecture: + +- :class:`LineageStore` — direct LMDB owner. Used inside the writer actor for + the active pipeline, and also opened standalone (e.g., after a run finishes, + in a fresh process) to read records. +- :class:`LineageWriterActor` — named Ray actor that wraps a single + :class:`LineageStore`. The only writer during a pipeline run, which is what + lets the file live safely on NFS / Lustre. 
+- :func:`record_lineage` — write helper called from stages. No-op unless a + :class:`LineageWriterActor` is registered in the cluster. +""" + +from __future__ import annotations + +import hashlib +from collections import deque +from dataclasses import dataclass, field +from pathlib import Path + +import lmdb +import ray +from loguru import logger + +LINEAGE_ACTOR_NAME = "nemo_curator_lineage_writer" + +_IN_EDGES_DB = b"in_edges" +_OUT_EDGES_DB = b"out_edges" +_TASK_TYPE_DB = b"task_type" +_COMPLETED_DB = b"completed" + +_DEFAULT_MAP_SIZE = 1 << 34 # 16 GiB; sparse on Linux so effectively free + +_TYPE_SOURCE = b"source" +_TYPE_MIDDLE = b"middle" +_TYPE_LEAF = b"leaf" +_TYPE_SOURCE_LEAF = b"source_leaf" + + +def _classify(has_parent: bool, has_child: bool) -> bytes: + if has_parent and has_child: + return _TYPE_MIDDLE + if has_parent: + return _TYPE_LEAF + if has_child: + return _TYPE_SOURCE + return _TYPE_SOURCE_LEAF + + +def _udid_to_key(udid: str) -> bytes: + return udid.encode("ascii") + + +def _key_to_udid(key: bytes) -> str: + return key.decode("ascii") + + +def _path_to_udid(lineage_path: str) -> str: + """Mirror of the udid derivation in ``Task._set_lineage`` ([tasks.py:59]).""" + return hashlib.sha256(lineage_path.encode()).hexdigest()[:32] + + +@dataclass +class LineageRecord: + parents: list[str] = field(default_factory=list) + children: list[str] = field(default_factory=list) + task_type: str = "source_leaf" + completed: bool = False + + +class LineageStore: + """Direct LMDB owner for the lineage checkpoint. + + Not safe to use from multiple processes concurrently. The writer actor uses + one of these as its backing store during a pipeline run; tests and + post-pipeline inspection tools instantiate one directly to read records. 
+ """ + + def __init__(self, path: str | Path, map_size: int = _DEFAULT_MAP_SIZE): + self._path = str(Path(path).absolute()) + Path(self._path).parent.mkdir(parents=True, exist_ok=True) + self._env = lmdb.open( + self._path, + subdir=False, + lock=False, + max_dbs=4, + map_size=map_size, + metasync=False, + sync=True, + readahead=False, + ) + self._in_db = self._env.open_db(_IN_EDGES_DB, dupsort=True) + self._out_db = self._env.open_db(_OUT_EDGES_DB, dupsort=True) + self._type_db = self._env.open_db(_TASK_TYPE_DB) + self._completed_db = self._env.open_db(_COMPLETED_DB) + + @staticmethod + def _has_dup(txn: lmdb.Transaction, db: lmdb._Database, key: bytes) -> bool: + with txn.cursor(db=db) as cur: + return cur.set_key(key) + + def _record_emission_once(self, parent_udids: list[str], child_udids: list[str]) -> None: + parent_keys = [_udid_to_key(u) for u in parent_udids] + child_keys = [_udid_to_key(u) for u in child_udids] + with self._env.begin(write=True) as txn: + for child_key in child_keys: + for parent_key in parent_keys: + if parent_key == child_key: + # In-place return: don't add a node as its own parent/child. + continue + # In dupsort dbs, the default flags allow multiple distinct values + # per key and silently drop exact (key, value) duplicates. We + # deliberately do NOT pass overwrite=False — that maps to + # MDB_NOOVERWRITE which refuses any new value once the key has + # any existing value, blocking incremental parent attribution. 
+ txn.put(child_key, parent_key, db=self._in_db) + txn.put(parent_key, child_key, db=self._out_db) + + affected = {*child_keys, *parent_keys} + for udid_key in affected: + has_parent = self._has_dup(txn, self._in_db, udid_key) + has_child = self._has_dup(txn, self._out_db, udid_key) + txn.put(udid_key, _classify(has_parent, has_child), db=self._type_db, overwrite=True) + + def record_emission(self, parent_udids: list[str], child_udids: list[str]) -> None: + """Append edges for ``(parent, child)`` pairs and refresh ``task_type`` + for every affected udid. Idempotent under retries and incremental + parent attribution; no-op when ``child_udids`` is empty.""" + if not child_udids: + return + try: + self._record_emission_once(parent_udids, child_udids) + except lmdb.MapFullError: + new_size = self._env.info()["map_size"] * 2 + logger.warning(f"LMDB map full at {self._path}; growing to {new_size} bytes") + self._env.set_mapsize(new_size) + self._record_emission_once(parent_udids, child_udids) + + def mark_completed(self, udid: str) -> None: + with self._env.begin(write=True) as txn: + txn.put(_udid_to_key(udid), b"1", db=self._completed_db, overwrite=True) + + def is_completed(self, udid: str) -> bool: + with self._env.begin() as txn: + return txn.get(_udid_to_key(udid), db=self._completed_db) is not None + + def are_completed(self, udids: list[str]) -> list[bool]: + """Bulk variant of :meth:`is_completed`. Single read txn, snapshot-consistent. + Returns one bool per input udid, in the same order. 
Empty strings and + unknown udids return ``False``.""" + if not udids: + return [] + with self._env.begin() as txn: + return [ + bool(u) and txn.get(_udid_to_key(u), db=self._completed_db) is not None + for u in udids + ] + + @staticmethod + def _all_children_completed(txn: lmdb.Transaction, db: lmdb._Database, completed_db: lmdb._Database, key: bytes) -> bool: + with txn.cursor(db=db) as cur: + if not cur.set_key(key): + return True + for child_key in cur.iternext_dup(): + if txn.get(child_key, db=completed_db) is None: + return False + return True + + def _mark_completed_and_propagate_once(self, udids: list[str]) -> list[str]: + keys: deque[bytes] = deque(_udid_to_key(u) for u in udids) + visited: set[bytes] = set() + newly_marked: list[str] = [] + with self._env.begin(write=True) as txn: + while keys: + key = keys.popleft() + if key in visited: + continue + visited.add(key) + + if txn.get(key, db=self._type_db) is None: + continue + if txn.get(key, db=self._completed_db) is not None: + continue + if not self._all_children_completed(txn, self._out_db, self._completed_db, key): + continue + + txn.put(key, b"1", db=self._completed_db, overwrite=True) + newly_marked.append(_key_to_udid(key)) + + with txn.cursor(db=self._in_db) as cur: + if cur.set_key(key): + keys.extend(pk for pk in cur.iternext_dup() if pk not in visited) + return newly_marked + + def mark_completed_and_propagate(self, udids: list[str]) -> list[str]: + """Mark each udid completed iff all its children are completed, then walk + to parents and apply the same rule. Returns the udids whose ``completed`` + flag transitioned 0→1 in this call. + + Seeded from terminal-stage leaves via :func:`mark_leaves_completed`, + called from inside :meth:`ProcessingStage.process_batch` right after + :func:`record_lineage`. Stages that override ``process_batch`` are + responsible for calling :func:`mark_leaves_completed` themselves when + they are the terminal stage — same contract as :func:`record_lineage`. 
+ + The BFS stops along any branch whose current node is not yet eligible, + so partial fan-in (some siblings still pending) blocks the rollup + correctly. + + Raises ``ValueError`` if any input udid is the empty string — a missing + ``_udid`` on a task means a stage forgot to call + :func:`nemo_curator.stages.base.assign_child_lineage` and the caller + deserves a loud failure rather than a silent skip. Unknown-but-non-empty + udids are skipped silently.""" + if any(not u for u in udids): + msg = "mark_completed_and_propagate received an empty udid; tasks must have lineage assigned via assign_child_lineage" + raise ValueError(msg) + if not udids: + return [] + try: + return self._mark_completed_and_propagate_once(udids) + except lmdb.MapFullError: + new_size = self._env.info()["map_size"] * 2 + logger.warning(f"LMDB map full at {self._path}; growing to {new_size} bytes") + self._env.set_mapsize(new_size) + return self._mark_completed_and_propagate_once(udids) + + def get(self, udid: str) -> LineageRecord | None: + key = _udid_to_key(udid) + with self._env.begin() as txn: + task_type = txn.get(key, db=self._type_db) + if task_type is None: + return None + parents: list[str] = [] + with txn.cursor(db=self._in_db) as cur: + if cur.set_key(key): + parents = [_key_to_udid(v) for v in cur.iternext_dup()] + children: list[str] = [] + with txn.cursor(db=self._out_db) as cur: + if cur.set_key(key): + children = [_key_to_udid(v) for v in cur.iternext_dup()] + completed = txn.get(key, db=self._completed_db) is not None + return LineageRecord( + parents=parents, + children=children, + task_type=task_type.decode("ascii"), + completed=completed, + ) + + def iter_records(self) -> list[tuple[str, LineageRecord]]: + results: list[tuple[str, LineageRecord]] = [] + with self._env.begin() as txn, txn.cursor(db=self._type_db) as cur: + for key, _ in cur: + udid = _key_to_udid(key) + rec = self.get(udid) + if rec is not None: + results.append((udid, rec)) + return results + + def 
_traverse(self, udid: str, attr: str) -> dict[str, LineageRecord]: + start = self.get(udid) + if start is None: + return {} + result: dict[str, LineageRecord] = {} + queue: deque[str] = deque(getattr(start, attr)) + while queue: + neighbor = queue.popleft() + if neighbor == udid or neighbor in result: + continue + rec = self.get(neighbor) + if rec is None: + continue + result[neighbor] = rec + queue.extend(getattr(rec, attr)) + return result + + def get_all_parents(self, udid: str) -> dict[str, LineageRecord]: + """Return every ancestor of ``udid`` (transitive parents) keyed by udid. + + Excludes ``udid`` itself. Returns ``{}`` when ``udid`` is unknown or + has no parents. + """ + return self._traverse(udid, "parents") + + def get_all_children(self, udid: str) -> dict[str, LineageRecord]: + """Return every descendant of ``udid`` (transitive children) keyed by udid. + + Excludes ``udid`` itself. Returns ``{}`` when ``udid`` is unknown or + has no children. + """ + return self._traverse(udid, "children") + + def close(self) -> None: + if self._env is not None: + self._env.close() + self._env = None # type: ignore[assignment] + + +@ray.remote(num_cpus=0) +class LineageWriterActor: + """Singleton owner of the LMDB env, spawned by the executor when + ``Pipeline.run(checkpoint_path=...)`` is provided. Workers send lineage + events here via :func:`record_lineage`. 
Because it is the only process + that writes to the LMDB file, no cross-process file lock is required and + the file may safely live on NFS or Lustre.""" + + def __init__(self, path: str, map_size: int = _DEFAULT_MAP_SIZE): + self._store = LineageStore(path, map_size=map_size) + + def record_emission(self, parent_udids: list[str], child_udids: list[str]) -> None: + self._store.record_emission(parent_udids, child_udids) + + def mark_completed(self, udid: str) -> None: + self._store.mark_completed(udid) + + def is_completed(self, udid: str) -> bool: + return self._store.is_completed(udid) + + def are_completed(self, udids: list[str]) -> list[bool]: + return self._store.are_completed(udids) + + def mark_completed_and_propagate(self, udids: list[str]) -> list[str]: + return self._store.mark_completed_and_propagate(udids) + + def get(self, udid: str) -> LineageRecord | None: + return self._store.get(udid) + + def iter_records(self) -> list[tuple[str, LineageRecord]]: + return self._store.iter_records() + + def get_all_parents(self, udid: str) -> dict[str, LineageRecord]: + return self._store.get_all_parents(udid) + + def get_all_children(self, udid: str) -> dict[str, LineageRecord]: + return self._store.get_all_children(udid) + + def close(self) -> None: + self._store.close() + + +def record_lineage(parent_udids: list[str], child_udids: list[str]) -> None: + """Persist parent/child edges via the named :class:`LineageWriterActor`. + + No-op when Ray is not initialized or no such actor is registered. The + actor is spawned by the executor only when ``Pipeline.run`` is called + with ``checkpoint_path``, so the absence of the actor is what gates + recording. + + Intended to be called from inside ``process_batch`` right after + :func:`nemo_curator.stages.base.assign_child_lineage`. Pass the parent + tasks' ``_udid`` values (typically ``[task._udid]`` for 1:N stages, or one + udid per parent for joins) and the emitted children's ``_udid`` values. 
def mark_leaves_completed(udids: list[str]) -> None:
    """Forward freshly emitted terminal-stage leaves to the lineage actor.

    The non-empty udids are sent to the named :class:`LineageWriterActor`'s
    ``mark_completed_and_propagate`` method and the call is awaited.

    Does nothing when Ray is not initialized, when looking up the actor
    named ``LINEAGE_ACTOR_NAME`` raises ``ValueError``, or when no
    non-empty udid remains after filtering. Empty udids are dropped rather
    than reported, matching :func:`record_lineage`.

    Intended to be called from the terminal stage's ``process_batch``
    right after :func:`record_lineage`, with the emitted children's
    ``_udid`` values.
    """
    if ray.is_initialized():
        try:
            writer = ray.get_actor(LINEAGE_ACTOR_NAME)
        except ValueError:
            # No lineage actor registered: checkpointing is disabled.
            return

        assigned = list(filter(None, udids))
        if assigned:
            ray.get(writer.mark_completed_and_propagate.remote(assigned))
+ """ + if not udids: + return [] + if not ray.is_initialized(): + return [False] * len(udids) + try: + actor = ray.get_actor(LINEAGE_ACTOR_NAME) + except ValueError: + return [False] * len(udids) + return ray.get(actor.are_completed.remote(udids)) diff --git a/pyproject.toml b/pyproject.toml index a0ab3fac3e..d84c87913a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ dependencies = [ "fsspec", "hydra-core", "jieba==0.42.1", + "lmdb>=1.4", "loguru", "mecab-python3", "omegaconf", diff --git a/tests/pipelines/_resumability_runner.py b/tests/pipelines/_resumability_runner.py new file mode 100644 index 0000000000..54c06b95ad --- /dev/null +++ b/tests/pipelines/_resumability_runner.py @@ -0,0 +1,197 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Subprocess entry point for the SIGINT/resume integration test. + +Builds a 4-stage pipeline (fanout -> passthrough -> chunked-fanin -> slow_writer) +and walks it stage-by-stage with a real :class:`LineageWriterActor` writing to +``--checkpoint-path``. Prints one ``completed`` line per terminal-stage emission +to stdout so the parent test can pace SIGINT injection deterministically. + +Not a test itself; loaded as a subprocess from +:func:`test_resumable_after_sigint` in ``test_lineage_integration.py``. 
@dataclass
class _ChunkedFanIn(ProcessingStage[_SimpleTask, _SimpleTask]):
    """Fan-in stage that merges its input in groups of ``fanin_size``.

    Each group of consecutive input tasks becomes one output task whose data
    is the concatenation of the group's data. Because ``process_batch`` is
    overridden (multi-parent emission), this stage calls
    :meth:`_filter_completed_tasks`, :func:`assign_child_lineage` and
    :func:`record_lineage` itself.
    """

    fanin_size: int = 20
    name: str = "fanin"

    def inputs(self) -> tuple[list[str], list[str]]:
        return [], []

    def outputs(self) -> tuple[list[str], list[str]]:
        return [], []

    def process(self, task: _SimpleTask) -> _SimpleTask:
        """Always raises: this stage only works on whole batches."""
        del task
        msg = "ChunkedFanIn only supports batched execution"
        raise NotImplementedError(msg)

    def process_batch(self, tasks: list[_SimpleTask]) -> list[_SimpleTask]:
        """Merge the not-yet-completed tasks chunk by chunk, recording lineage."""
        pending = self._filter_completed_tasks(tasks)
        emitted: list[_SimpleTask] = []
        for offset in range(0, len(pending), self.fanin_size):
            group = pending[offset : offset + self.fanin_size]
            payload: list[int] = [item for member in group for item in member.data]
            merged = _SimpleTask(
                task_id=f"merged_{offset}",
                dataset_name=group[0].dataset_name,
                data=payload,
            )
            # Every member of the group is a parent of the merged task.
            children = assign_child_lineage([member._lineage_path for member in group], merged)
            record_lineage([member._udid for member in group], [child._udid for child in children])
            emitted.extend(children)
        return emitted
def main() -> int:
    """Run the resumability pipeline once against ``--checkpoint-path``.

    Parses the command line, initializes Ray, replaces any actor already
    registered under ``LINEAGE_ACTOR_NAME`` with a fresh
    :class:`LineageWriterActor`, then drives a fanout -> passthrough ->
    chunked-fanin -> slow-writer pipeline from a single root task.

    A ``KeyboardInterrupt`` during the run is reported as an
    ``interrupted`` line on stdout. The actor is closed and Ray is shut
    down on a best-effort basis in every case.

    Returns:
        ``0`` when the run finishes or is interrupted by ``KeyboardInterrupt``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--checkpoint-path", required=True)
    for flag, caster, fallback in (
        ("--n-tasks", int, 2000),
        ("--fanin-size", int, 20),
        ("--writer-sleep-s", float, 0.05),
    ):
        cli.add_argument(flag, type=caster, default=fallback)
    opts = cli.parse_args()

    ray.init(ignore_reinit_error=True, log_to_driver=False)

    # Drop a leftover actor from a previous run, if one is registered.
    with contextlib.suppress(ValueError):
        ray.kill(ray.get_actor(LINEAGE_ACTOR_NAME))
    writer_options = LineageWriterActor.options(name=LINEAGE_ACTOR_NAME, get_if_exists=True)
    actor = writer_options.remote(path=opts.checkpoint_path)

    stages = [
        _FanOut(times=opts.n_tasks),
        _Passthrough(),
        _ChunkedFanIn(fanin_size=opts.fanin_size),
        _SlowWriter(sleep_s=opts.writer_sleep_s),
    ]
    pipeline = Pipeline(name="resumable", stages=stages)
    pipeline.build()
    seed = _SimpleTask(task_id="r", dataset_name="d", data=[1])

    try:
        _drive(pipeline, [seed])
    except KeyboardInterrupt:
        sys.stdout.write("interrupted\n")
        sys.stdout.flush()
    finally:
        # Best-effort cleanup: neither step may mask the run's outcome.
        with contextlib.suppress(Exception):
            ray.get(actor.close.remote())
        with contextlib.suppress(Exception):
            ray.shutdown()

    return 0
a/tests/pipelines/test_lineage_integration.py b/tests/pipelines/test_lineage_integration.py new file mode 100644 index 0000000000..bdb761d612 --- /dev/null +++ b/tests/pipelines/test_lineage_integration.py @@ -0,0 +1,579 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""End-to-end lineage-checkpoint tests. + +Drive a pipeline through the default :meth:`ProcessingStage.process_batch` +(which calls :func:`assign_child_lineage` + :func:`record_lineage` separately) +while a real :class:`LineageWriterActor` is registered, and verify the +resulting on-disk DAG matches the topology. Without an actor, recording is a +true no-op. 
@dataclass
class _FanOut(ProcessingStage[_SimpleTask, _SimpleTask]):
    """Test stage that emits ``times`` copies of every input task.

    Copy ``i`` gets the task id ``"<parent id>_<i>"`` and reuses the
    parent's dataset name and data.
    """

    times: int = 3
    name: str = "fanout"

    def inputs(self) -> tuple[list[str], list[str]]:
        return [], []

    def outputs(self) -> tuple[list[str], list[str]]:
        return [], []

    def process(self, task: _SimpleTask) -> list[_SimpleTask]:
        copies: list[_SimpleTask] = []
        for index in range(self.times):
            copy_id = f"{task.task_id}_{index}"
            copies.append(_SimpleTask(task_id=copy_id, dataset_name=task.dataset_name, data=task.data))
        return copies
Lineage-wise + indistinguishable from a passthrough but named separately so the 4-stage + end-to-end test reads like a real ``passthrough → fanout → fanin → writer`` + pipeline.""" + + name: str = "writer" + + def inputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def outputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def process(self, task: _SimpleTask) -> _SimpleTask: + return _SimpleTask(task_id=f"{task.task_id}_w", dataset_name=task.dataset_name, data=task.data) + + +@dataclass +class _FailAfterN(ProcessingStage[_SimpleTask, _SimpleTask]): + """Test-only stage that emits children for the first ``fail_after`` inputs and + raises on the next one. Lets us drive a pipeline to a known partial-DAG state + so we can assert "no spurious completions" after a mid-run abort.""" + + fail_after: int = 1 + name: str = "fail_after_n" + _seen: int = 0 + + def inputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def outputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def process(self, task: _SimpleTask) -> _SimpleTask: + if self._seen >= self.fail_after: + msg = f"_FailAfterN exploding after {self.fail_after} inputs" + raise RuntimeError(msg) + self._seen += 1 + return _SimpleTask(task_id=f"{task.task_id}_x", dataset_name=task.dataset_name, data=task.data) + + +@dataclass +class _FanIn(ProcessingStage[_SimpleTask, _SimpleTask]): + """Override ``process_batch`` to combine the whole batch into one output. 
+ Demonstrates the multi-parent path of the lineage contract — separate + :func:`assign_child_lineage` and :func:`record_lineage` calls.""" + + name: str = "fanin" + + def inputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def outputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def process(self, task: _SimpleTask) -> _SimpleTask: + _ = task + msg = "FanIn only supports batched execution" + raise NotImplementedError(msg) + + def process_batch(self, tasks: list[_SimpleTask]) -> list[_SimpleTask]: + combined: list[int] = [] + for t in tasks: + combined.extend(t.data) + merged = _SimpleTask(task_id="merged", dataset_name=tasks[0].dataset_name, data=combined) + children = assign_child_lineage([t._lineage_path for t in tasks], merged) + record_lineage([t._udid for t in tasks], [c._udid for c in children]) + return children + + +def _drive(pipeline: Pipeline, initial_tasks: list[Task]) -> list[Task]: + current = initial_tasks + for stage in pipeline.stages: + current = BaseStageAdapter(stage).process_batch(current) + return current + + +def _kill_actor_if_present() -> None: + with contextlib.suppress(ValueError): + handle = ray.get_actor(LINEAGE_ACTOR_NAME) + ray.kill(handle) + + +@pytest.fixture +def actor(tmp_path: Path, shared_ray_client: None) -> tuple[object, Path]: # noqa: ARG001 + """Spawn a real :class:`LineageWriterActor` so ``record_lineage`` has somewhere + to write.""" + _kill_actor_if_present() + path = tmp_path / "lineage.mdb" + handle = LineageWriterActor.options( + name=LINEAGE_ACTOR_NAME, + get_if_exists=True, + ).remote(path=str(path)) + try: + yield handle, path + finally: + with contextlib.suppress(Exception): + ray.get(handle.close.remote()) + ray.kill(handle) + + +def test_fanout_passthrough_fanin_records_full_dag(actor: tuple[object, Path]) -> None: + """Drive a 4-stage pipeline and verify the on-disk DAG matches the topology. 
+ + Input ─▶ FanOut(3) ─▶ Passthrough ─▶ FanIn ─▶ Passthrough ─▶ Output + """ + actor_handle, _ = actor + pipeline = Pipeline( + name="fanout_fanin", + stages=[_FanOut(times=3), _Passthrough(name="pt1"), _FanIn(), _Passthrough(name="pt2")], + ) + pipeline.build() + root = _SimpleTask(task_id="r", dataset_name="d", data=[1]) + final = _drive(pipeline, [root]) + + records = dict(ray.get(actor_handle.iter_records.remote())) + + # 3 FanOut outputs, 3 Passthrough-1 outputs, 1 FanIn output, 1 Passthrough-2 output = 8 records. + assert len(records) == 8 + + fanout_paths = ["0", "1", "2"] + pt1_paths = ["0_0", "1_0", "2_0"] + fanin_path = "0_0_1_0_2_0_0" + pt2_path = "0_0_1_0_2_0_0_0" + + fanout_udids = {_path_to_udid(p) for p in fanout_paths} + pt1_udids = {_path_to_udid(p) for p in pt1_paths} + fanin_udid = _path_to_udid(fanin_path) + pt2_udid = _path_to_udid(pt2_path) + + # FanOut roots: source (no parents, have children at the next stage). + for u in fanout_udids: + rec = records[u] + assert rec.parents == [] + assert len(rec.children) == 1 + assert rec.task_type == "source" + + # PT1: each has 1 parent (a FanOut output) and 1 child (the FanIn). + for u in pt1_udids: + rec = records[u] + assert len(rec.parents) == 1 + assert rec.parents[0] in fanout_udids + assert rec.children == [fanin_udid] + assert rec.task_type == "middle" + + # FanIn: 3 parents (the PT1 tasks), 1 child (the PT2 output). + fanin_rec = records[fanin_udid] + assert set(fanin_rec.parents) == pt1_udids + assert fanin_rec.children == [pt2_udid] + assert fanin_rec.task_type == "middle" + + # PT2: 1 parent (the FanIn), 0 children → leaf. + pt2_rec = records[pt2_udid] + assert pt2_rec.parents == [fanin_udid] + assert pt2_rec.children == [] + assert pt2_rec.task_type == "leaf" + + # The final returned task should match the leaf in the store. 
+ assert len(final) == 1 + assert final[0]._udid == pt2_udid + + +def test_actor_exposes_transitive_traversal(actor: tuple[object, Path]) -> None: + """The actor surfaces ``get_all_parents`` / ``get_all_children`` for transitive + DAG inspection. Drives the same fanout/passthrough/fanin/passthrough pipeline + as :func:`test_fanout_passthrough_fanin_records_full_dag` and walks both + directions from the leaf and from one source.""" + actor_handle, _ = actor + pipeline = Pipeline( + name="fanout_fanin_traverse", + stages=[_FanOut(times=3), _Passthrough(name="pt1"), _FanIn(), _Passthrough(name="pt2")], + ) + pipeline.build() + root = _SimpleTask(task_id="r", dataset_name="d", data=[1]) + _drive(pipeline, [root]) + + fanout_udids = {_path_to_udid(p) for p in ["0", "1", "2"]} + pt1_udids = {_path_to_udid(p) for p in ["0_0", "1_0", "2_0"]} + fanin_udid = _path_to_udid("0_0_1_0_2_0_0") + pt2_udid = _path_to_udid("0_0_1_0_2_0_0_0") + + # From the final leaf, every upstream node should be reachable. + ancestors = ray.get(actor_handle.get_all_parents.remote(pt2_udid)) + assert set(ancestors.keys()) == fanout_udids | pt1_udids | {fanin_udid} + + # From one fanout root, descendants are its own pt1 + the shared fanin + pt2. 
def test_full_run_marks_entire_dag_completed(actor: tuple[object, Path]) -> None:
    """A successful ``passthrough → fanout → fanin → writer`` run completes every node.

    The terminal ``_Writer`` stage is expected to trigger
    :func:`mark_leaves_completed` from its default ``process_batch``, so by
    the time ``_drive`` returns the completion flag must have rolled up to
    every record in the store.
    """
    actor_handle = actor[0]
    stages = [_Passthrough(name="pt0"), _FanOut(times=3), _FanIn(), _Writer()]
    pipeline = Pipeline(name="four_stage", stages=stages)
    pipeline.build()
    _drive(pipeline, [_SimpleTask(task_id="r", dataset_name="d", data=[1])])

    stored = ray.get(actor_handle.iter_records.remote())
    records = dict(stored)
    incomplete = [udid for udid, rec in records.items() if not rec.completed]
    assert not incomplete
The actor persisted edges for the stages that did run (passthrough + the + partial fanout outputs the failing stage consumed before raising). + 2. No node is marked ``completed`` — the terminal stage never ran, so + incremental marking never fired, and no leaves exist to seed the rollup. + Companion test :func:`test_terminal_stage_partial_failure_marks_processed_leaves` + covers the case where the terminal stage itself fails mid-batch.""" + _kill_actor_if_present() + path = tmp_path / "lineage_midkill.mdb" + actor_handle = LineageWriterActor.options( + name=LINEAGE_ACTOR_NAME, + get_if_exists=True, + ).remote(path=str(path)) + + try: + pipeline = Pipeline( + name="four_stage_flaky", + stages=[ + _Passthrough(name="pt0"), + _FanOut(times=5), + _FailAfterN(fail_after=2), + _Writer(), + ], + ) + pipeline.build() + root = _SimpleTask(task_id="r", dataset_name="d", data=[1]) + + with pytest.raises(RuntimeError, match="exploding"): + _drive(pipeline, [root]) + + # Close the actor cleanly so we can re-open the LMDB file from this process. + ray.get(actor_handle.close.remote()) + finally: + ray.kill(actor_handle) + + # Re-open the LMDB store directly to inspect the partial DAG. + store = LineageStore(str(path)) + try: + records = dict(store.iter_records()) + + # We made progress: pt0 + at least 2 fanout outputs + 2 fail_after outputs were recorded. + assert len(records) > 0 + # NOT every node should be present (writer never ran for some fanout outputs). + # And critically, nothing must be marked completed — the terminal stage + # (_Writer) never ran, so mark_leaves_completed was never invoked. 
def test_incremental_marking_inside_terminal_process_batch(
    actor: tuple[object, Path],
) -> None:
    """Completion is marked by the terminal stage itself, not at end of pipeline.

    Drives ``passthrough → fanout → writer`` one stage at a time and
    inspects the store after each stage: nothing may be completed until the
    terminal ``_Writer`` stage has run, after which every recorded node
    must be completed.
    """
    actor_handle = actor[0]

    def snapshot() -> dict:
        # Current contents of the lineage store, keyed by udid.
        return dict(ray.get(actor_handle.iter_records.remote()))

    pipeline = Pipeline(
        name="incremental_marking",
        stages=[_Passthrough(name="pt0"), _FanOut(times=2), _Writer()],
    )
    pipeline.build()

    batch: list[Task] = [_SimpleTask(task_id="r", dataset_name="d", data=[1])]
    any_completed_after_stage: list[bool] = []
    for stage in pipeline.stages:
        batch = BaseStageAdapter(stage).process_batch(batch)
        flags = [rec.completed for rec in snapshot().values()]
        any_completed_after_stage.append(any(flags))

    # Passthrough and fanout only record edges; the writer triggers marking.
    assert any_completed_after_stage == [False, False, True]

    assert all(rec.completed for rec in snapshot().values())
Verify the partial completions + landed in LMDB even though the pipeline aborted — the resumability story.""" + actor_handle, path = actor + pipeline = Pipeline( + name="terminal_fails_midbatch", + stages=[_Passthrough(name="pt0"), _FanOut(times=5), _FailAfterN(fail_after=2)], + ) + pipeline.build() + root = _SimpleTask(task_id="r", dataset_name="d", data=[1]) + + with pytest.raises(RuntimeError, match="exploding"): + _drive(pipeline, [root]) + + # Lineage paths for this topology (root has empty lineage_path): + # pt0 output: "0" + # fanout(5) outputs: "0_0" ... "0_4" + # fail_after outputs: "0_0_0", "0_1_0" (only first two processed successfully) + pt0_udid = _path_to_udid("0") + fanout_udids = [_path_to_udid(f"0_{i}") for i in range(5)] + leaf_udids = [_path_to_udid("0_0_0"), _path_to_udid("0_1_0")] + + # Close the actor cleanly so we can re-open the LMDB file from this process. + ray.get(actor_handle.close.remote()) + store = LineageStore(str(path)) + try: + records = dict(store.iter_records()) + + # Leaves emitted before the crash are completed. + for udid in leaf_udids: + assert records[udid].completed, f"leaf {udid} should be completed" + + # The two fanout outputs whose only child reached the terminal stage roll + # up; the remaining three fanout outputs (no children produced) do not. + completed_fanouts = [u for u in fanout_udids if records[u].completed] + assert len(completed_fanouts) == 2 + + # pt0 has 5 fanout children but only 2 completed — the BFS gate blocks + # rollup at pt0, matching the partial-fan-in contract. 
+ assert not records[pt0_udid].completed + finally: + store.close() + + +def _launch_runner(checkpoint: Path, n_tasks: int, fanin_size: int, writer_sleep_s: float) -> subprocess.Popen[str]: + runner = Path(__file__).parent / "_resumability_runner.py" + repo_root = Path(__file__).resolve().parents[2] + env = {**os.environ, "PYTHONPATH": f"{repo_root}{os.pathsep}{os.environ.get('PYTHONPATH', '')}"} + return subprocess.Popen( # noqa: S603 + [ + sys.executable, + str(runner), + "--checkpoint-path", + str(checkpoint), + "--n-tasks", + str(n_tasks), + "--fanin-size", + str(fanin_size), + "--writer-sleep-s", + str(writer_sleep_s), + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + bufsize=1, + env=env, + ) + + +def _wait_for_completions_then_sigint(proc: subprocess.Popen[str], threshold: int, max_wait_s: float) -> None: + """Read stdout until ``threshold`` ``completed`` lines have been emitted, + then send SIGINT and wait for the runner to exit. Raises ``pytest.fail`` + on premature exit, no progress, or hang.""" + completed_seen = 0 + deadline = time.monotonic() + max_wait_s + while completed_seen < threshold: + line = proc.stdout.readline() + if line == "": + _, stderr_tail = proc.communicate(timeout=10) + pytest.fail( + f"runner exited before reaching threshold (saw {completed_seen} completions). 
" + f"stderr:\n{stderr_tail}" + ) + if line.strip() == "completed": + completed_seen += 1 + if time.monotonic() > deadline: + proc.kill() + proc.wait() + pytest.fail(f"runner only reached {completed_seen}/{threshold} completions in {max_wait_s}s") + + proc.send_signal(signal.SIGINT) + try: + proc.wait(timeout=60) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + pytest.fail("runner did not exit within 60s of SIGINT") + + +def _drain_proc(proc: subprocess.Popen[str]) -> None: + if proc.poll() is None: + proc.kill() + proc.wait() + with contextlib.suppress(Exception): + proc.stdout.close() + with contextlib.suppress(Exception): + proc.stderr.close() + + +def _read_records(checkpoint: Path) -> dict[str, object]: + _kill_actor_if_present() + store = LineageStore(str(checkpoint)) + try: + return dict(store.iter_records()) + finally: + store.close() + + +def test_resumable_after_sigint(tmp_path: Path, shared_ray_cluster: str) -> None: # noqa: ARG001 + """Drive a 4-stage 2000-task pipeline (fanout -> passthrough -> chunked-fanin + -> slow_writer) in a subprocess, SIGINT it mid-run, and verify partial + completion. Relaunch with the same checkpoint path and verify full + completion. 
The shared Ray cluster (autouse session fixture) is what the + runner subprocess connects to via ``RAY_ADDRESS``.""" + checkpoint = tmp_path / "lineage_resume.mdb" + n_tasks = 2000 + fanin_size = 20 + writer_sleep_s = 0.05 + expected_total = 2 * n_tasks + 2 * (n_tasks // fanin_size) # 4200 + threshold = 5 + + # --- Run 1: launch, interrupt after some leaves complete --- + _kill_actor_if_present() + proc = _launch_runner(checkpoint, n_tasks, fanin_size, writer_sleep_s) + try: + _wait_for_completions_then_sigint(proc, threshold=threshold, max_wait_s=120) + finally: + _drain_proc(proc) + + records = _read_records(checkpoint) + completed = [u for u, r in records.items() if r.completed] + assert len(completed) >= threshold, ( + f"expected at least {threshold} completions after SIGINT, found {len(completed)}" + ) + assert len(completed) < expected_total, ( + f"all {expected_total} nodes should not be completed after mid-run SIGINT (saw {len(completed)})" + ) + + # --- Run 2: relaunch with same checkpoint, run to natural completion --- + _kill_actor_if_present() + proc2 = _launch_runner(checkpoint, n_tasks, fanin_size, writer_sleep_s) + try: + _, stderr_tail = proc2.communicate(timeout=300) + except subprocess.TimeoutExpired: + proc2.kill() + proc2.wait() + pytest.fail("runner did not finish within 300s on resume") + assert proc2.returncode == 0, f"runner exited with {proc2.returncode}; stderr:\n{stderr_tail}" + + records = _read_records(checkpoint) + assert len(records) == expected_total, ( + f"expected {expected_total} recorded nodes after full run, found {len(records)}" + ) + unfinished = [u for u, r in records.items() if not r.completed] + assert not unfinished, f"unfinished after resume: {unfinished[:5]} ({len(unfinished)} total)" diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 87cc23e324..335dc8f347 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -12,13 +12,17 @@ # See the License for 
the specific language governing permissions and # limitations under the License. +import hashlib +from dataclasses import dataclass from unittest.mock import Mock, patch import pytest +from nemo_curator.backends.base import BaseStageAdapter from nemo_curator.pipeline.pipeline import Pipeline -from nemo_curator.stages.base import ProcessingStage +from nemo_curator.stages.base import ProcessingStage, assign_child_lineage, assign_root_lineage from nemo_curator.stages.resources import Resources +from nemo_curator.tasks import Task def test_pipeline_uses_xenna_executor_by_default(): @@ -69,3 +73,304 @@ def test_raises_when_ray_serve_active_with_xenna_and_gpu_stages() -> None: with pytest.raises(RuntimeError, match="Cannot run XennaExecutor"): pipeline.run(executor=mock_executor) + + +# --------------------------------------------------------------------------- +# Deterministic _udid / _lineage_path end-to-end +# --------------------------------------------------------------------------- + + +@dataclass +class _SimpleTask(Task[list[int]]): + @property + def num_items(self) -> int: + return len(self.data) if self.data is not None else 0 + + def validate(self) -> bool: + return True + + +@dataclass +class _Repeat(ProcessingStage[_SimpleTask, _SimpleTask]): + times: int = 3 + name: str = "repeat" + + def inputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def outputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def process(self, task: _SimpleTask) -> list[_SimpleTask]: + return [ + _SimpleTask( + task_id=f"{task.task_id}_{i}", + dataset_name=task.dataset_name, + data=task.data, + ) + for i in range(self.times) + ] + + +def _drive(pipeline: Pipeline, initial_tasks: list[Task]) -> list[Task]: + """Walk a built pipeline by hand, threading tasks through BaseStageAdapter + for each stage. 
This is what every real executor does internally; using it + here lets us exercise the determinism contract without needing Ray.""" + assign_root_lineage(initial_tasks) + current = initial_tasks + for stage in pipeline.stages: + current = BaseStageAdapter(stage).process_batch(current) + return current + + +def test_pipeline_udid_deterministic_across_runs(): + def run_once() -> tuple[list[str], list[str]]: + pipeline = Pipeline(name="det", stages=[_Repeat(times=2), _Repeat(times=3)]) + pipeline.build() + root = _SimpleTask(task_id="r", dataset_name="d", data=[1, 2]) + out = _drive(pipeline, [root]) + return [t._lineage_path for t in out], [t._udid for t in out] + + paths_a, udids_a = run_once() + paths_b, udids_b = run_once() + assert paths_a == paths_b + assert udids_a == udids_b + # Root index "0" is prepended by `assign_root_lineage`; subsequent fan-outs + # extend the path one segment at a time per the documented + # "{root_idx}_{child_idx}_{grandchild_idx}" shape. + assert paths_a == [f"0_{i}_{j}" for i in range(2) for j in range(3)] + assert udids_a == [hashlib.sha256(p.encode()).hexdigest()[:32] for p in paths_a] + + +# --------------------------------------------------------------------------- +# Fan-out / passthrough / fan-in topology with explicit expected _udid values +# --------------------------------------------------------------------------- + + +@dataclass +class _Passthrough(ProcessingStage[_SimpleTask, _SimpleTask]): + name: str = "passthrough" + + def inputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def outputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def process(self, task: _SimpleTask) -> _SimpleTask: + return _SimpleTask( + task_id=f"{task.task_id}_pt", + dataset_name=task.dataset_name, + data=task.data, + ) + + +@dataclass +class _FanOut(ProcessingStage[_SimpleTask, _SimpleTask]): + times: int = 3 + name: str = "fanout" + + def inputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def outputs(self) 
-> tuple[list[str], list[str]]: + return [], [] + + def process(self, task: _SimpleTask) -> list[_SimpleTask]: + return [ + _SimpleTask( + task_id=f"{task.task_id}_{i}", + dataset_name=task.dataset_name, + data=task.data, + ) + for i in range(self.times) + ] + + +@dataclass +class _FanIn(ProcessingStage[_SimpleTask, _SimpleTask]): + """Overrides `process_batch` to combine the whole batch into a single + output. Demonstrates the multi-parent path of `assign_child_lineage`.""" + + name: str = "fanin" + + def inputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def outputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def process(self, task: _SimpleTask) -> _SimpleTask: + _ = task + msg = "FanIn only supports batched execution" + raise NotImplementedError(msg) + + def process_batch(self, tasks: list[_SimpleTask]) -> list[_SimpleTask]: + combined: list[int] = [] + for t in tasks: + combined.extend(t.data) + merged = _SimpleTask( + task_id="merged", + dataset_name=tasks[0].dataset_name, + data=combined, + ) + return assign_child_lineage([t._lineage_path for t in tasks], merged) + + +def test_pipeline_udid_fanout_passthrough_fanin_passthrough(): + """End-to-end: a 4-stage pipeline that exercises 1:N, 1:1, N:1, 1:1 and + verifies the exact `_lineage_path` / `_udid` values at every step. 
+ + Pipeline topology: + + Input ─▶ FanOut(3) ─▶ Passthrough ─▶ FanIn ─▶ Passthrough ─▶ Output + + Starting from one root task assigned ``_lineage_path = "0"`` by + ``assign_root_lineage``, the framework should produce the following paths: + + After FanOut: ["0_0", "0_1", "0_2"] + After Passthrough: ["0_0_0", "0_1_0", "0_2_0"] + After FanIn: ["0_0_0_0_1_0_0_2_0_0"] (all 3 parents + idx 0) + After Passthrough: ["0_0_0_0_1_0_0_2_0_0_0"] + """ + pipeline = Pipeline( + name="fanout_fanin", + stages=[ + _FanOut(times=3), + _Passthrough(name="pt1"), + _FanIn(), + _Passthrough(name="pt2"), + ], + ) + pipeline.build() + + root = _SimpleTask(task_id="r", dataset_name="d", data=[1]) + assign_root_lineage([root]) + + # Drive stage-by-stage so we can inspect each intermediate set of tasks. + after_fanout = BaseStageAdapter(pipeline.stages[0]).process_batch([root]) + after_passthrough_1 = BaseStageAdapter(pipeline.stages[1]).process_batch(after_fanout) + after_fanin = BaseStageAdapter(pipeline.stages[2]).process_batch(after_passthrough_1) + after_passthrough_2 = BaseStageAdapter(pipeline.stages[3]).process_batch(after_fanin) + + # Expected paths at every level. + assert [t._lineage_path for t in after_fanout] == ["0_0", "0_1", "0_2"] + assert [t._lineage_path for t in after_passthrough_1] == ["0_0_0", "0_1_0", "0_2_0"] + assert [t._lineage_path for t in after_fanin] == ["0_0_0_0_1_0_0_2_0_0"] + assert [t._lineage_path for t in after_passthrough_2] == ["0_0_0_0_1_0_0_2_0_0_0"] + + # Expected _udid values are exactly sha256(lineage_path)[:32]. 
+ def udid(path: str) -> str: + return hashlib.sha256(path.encode()).hexdigest()[:32] + + assert [t._udid for t in after_fanout] == [udid("0_0"), udid("0_1"), udid("0_2")] + assert [t._udid for t in after_passthrough_1] == [udid("0_0_0"), udid("0_1_0"), udid("0_2_0")] + assert [t._udid for t in after_fanin] == [udid("0_0_0_0_1_0_0_2_0_0")] + assert [t._udid for t in after_passthrough_2] == [udid("0_0_0_0_1_0_0_2_0_0_0")] + + # Uniqueness: every task emitted anywhere in the pipeline has a distinct + # _udid (and a distinct _lineage_path). + all_tasks = [*after_fanout, *after_passthrough_1, *after_fanin, *after_passthrough_2] + all_udids = [t._udid for t in all_tasks] + all_paths = [t._lineage_path for t in all_tasks] + assert len(set(all_udids)) == len(all_udids) + assert len(set(all_paths)) == len(all_paths) + + # Determinism: running the same pipeline shape over the same input again + # yields byte-identical _udid and _lineage_path everywhere. + pipeline2 = Pipeline( + name="fanout_fanin", + stages=[ + _FanOut(times=3), + _Passthrough(name="pt1"), + _FanIn(), + _Passthrough(name="pt2"), + ], + ) + pipeline2.build() + second_run = _drive(pipeline2, [_SimpleTask(task_id="r", dataset_name="d", data=[1])]) + assert [t._lineage_path for t in second_run] == [t._lineage_path for t in after_passthrough_2] + assert [t._udid for t in second_run] == [t._udid for t in after_passthrough_2] + + +# --------------------------------------------------------------------------- +# In-place stages (process() returns the same task) preserve lineage +# --------------------------------------------------------------------------- + + +@dataclass +class _InPlace(ProcessingStage[_SimpleTask, _SimpleTask]): + """Mutates the input task and returns the same instance — the pattern used + by ImageEmbeddingStage and ~28 other stages across audio/image/video.""" + + name: str = "inplace" + + def inputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def outputs(self) -> tuple[list[str], 
list[str]]: + return [], [] + + def process(self, task: _SimpleTask) -> _SimpleTask: + task.data = [*(task.data or []), 0] + return task + + +def test_inplace_stage_preserves_lineage(): + pipeline = Pipeline( + name="inplace", + stages=[_Repeat(times=2), _InPlace(name="ip1"), _InPlace(name="ip2")], + ) + pipeline.build() + + root = _SimpleTask(task_id="r", dataset_name="d", data=[1]) + assign_root_lineage([root]) + after_fanout = BaseStageAdapter(pipeline.stages[0]).process_batch([root]) + after_ip1 = BaseStageAdapter(pipeline.stages[1]).process_batch(after_fanout) + after_ip2 = BaseStageAdapter(pipeline.stages[2]).process_batch(after_ip1) + + # Fan-out gave the children paths "0_0" and "0_1". The two in-place stages + # must NOT extend the lineage path — same instances come back unchanged. + assert [t._lineage_path for t in after_fanout] == ["0_0", "0_1"] + assert [t._lineage_path for t in after_ip1] == ["0_0", "0_1"] + assert [t._lineage_path for t in after_ip2] == ["0_0", "0_1"] + + def udid(path: str) -> str: + return hashlib.sha256(path.encode()).hexdigest()[:32] + + expected_udids = [udid("0_0"), udid("0_1")] + assert [t._udid for t in after_fanout] == expected_udids + assert [t._udid for t in after_ip1] == expected_udids + assert [t._udid for t in after_ip2] == expected_udids + + # Identity check: the in-place stages return the same task instances. + assert all(a is b for a, b in zip(after_fanout, after_ip1, strict=True)) + assert all(a is b for a, b in zip(after_ip1, after_ip2, strict=True)) + + +# --------------------------------------------------------------------------- +# Multiple root tasks must produce distinct _udid through a 1:1 first stage +# --------------------------------------------------------------------------- + + +def test_pipeline_udid_no_collision_across_multiple_roots(): + """Multiple root tasks through a 1:1 first stage must produce distinct _udid. 
+ + Without ``assign_root_lineage`` every root carries ``_lineage_path = ""``; + the empty-string filter in ``_set_lineage`` then collapses all first-stage + children onto the same path ("0"), so their ``_udid`` collides. + """ + pipeline = Pipeline(name="multi_root", stages=[_Passthrough(name="pt")]) + pipeline.build() + + roots = [ + _SimpleTask(task_id="r0", dataset_name="d", data=[1]), + _SimpleTask(task_id="r1", dataset_name="d", data=[2]), + _SimpleTask(task_id="r2", dataset_name="d", data=[3]), + ] + out = _drive(pipeline, roots) + + paths = [t._lineage_path for t in out] + udids = [t._udid for t in out] + assert paths == ["0_0", "1_0", "2_0"] + assert len(set(udids)) == len(udids) diff --git a/tests/pipelines/test_resumability_integration.py b/tests/pipelines/test_resumability_integration.py new file mode 100644 index 0000000000..bc68584f3c --- /dev/null +++ b/tests/pipelines/test_resumability_integration.py @@ -0,0 +1,336 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""End-to-end resumability test with three failure modes. 
+ +Drives an 8-stage pipeline twice through ``pipeline.run(executor, +checkpoint_path=...)`` on the same LMDB checkpoint, parametrized over both +``XennaExecutor`` and ``RayDataExecutor``, and asserts: + +* **transient** failures injected on run 1 are rescued on run 2, +* **always-fail** branches leave some records ``completed=False`` even after + resume, and +* **filter** decisions produce parquet files with the halved row count. + +Failure modes are keyed on each task's framework-assigned ``_udid`` so the +decisions are stable across runs. Transient drops are gated by a per-stage +``is_resume_run`` flag — flipping it between runs lets us assert resume +behavior without relying on RNG. +""" + +from __future__ import annotations + +import hashlib +from dataclasses import dataclass, field +from pathlib import Path +from typing import TYPE_CHECKING + +import pandas as pd +import pytest + +from nemo_curator.backends.ray_data import RayDataExecutor +from nemo_curator.backends.xenna import XennaExecutor + +if TYPE_CHECKING: + from nemo_curator.backends.base import BaseExecutor +from nemo_curator.pipeline.pipeline import Pipeline +from nemo_curator.stages.base import ProcessingStage, assign_child_lineage +from nemo_curator.tasks import Task, _EmptyTask +from nemo_curator.utils.lineage_store import ( + LineageRecord, + LineageStore, + mark_leaves_completed, + record_lineage, +) + +NUM_TASKS = 4 +ROWS_PER_TASK = 12 +FANOUT_FACTOR = 4 # fanout splits each parent into chunks of FANOUT_FACTOR rows + + +@dataclass +class _RowTask(Task[pd.DataFrame]): + data: pd.DataFrame = field(default_factory=pd.DataFrame) + + @property + def num_items(self) -> int: + return len(self.data) + + def validate(self) -> bool: + return True + + +def _bucket(udid: str) -> int: + return int(hashlib.sha256(udid.encode()).hexdigest(), 16) % 6 + + +def _decision(udid: str) -> str: + """Deterministic per-task outcome — stable across runs.""" + return {0: "always_fail", 1: "filter"}.get(_bucket(udid), 
"all") + + +def _is_transient(udid: str) -> bool: + """Drops ~1/6 of tasks on the run that has transients enabled.""" + return _bucket(udid) == 2 + + +@dataclass +class _LsStage(ProcessingStage[_EmptyTask, _RowTask]): + name: str = "1_ls" + num_tasks: int = NUM_TASKS + rows_per_task: int = ROWS_PER_TASK + + def inputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def outputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def process(self, _: _EmptyTask) -> list[_RowTask]: + return [ + _RowTask( + task_id=f"part_{i}", + dataset_name="demo", + data=pd.DataFrame({"row_idx": list(range(self.rows_per_task)), "src": [i] * self.rows_per_task}), + ) + for i in range(self.num_tasks) + ] + + +@dataclass +class _ReadMockStage(ProcessingStage[_RowTask, _RowTask]): + name: str = "2_read_mock" + + def inputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def outputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def process(self, task: _RowTask) -> _RowTask: + task.data = task.data.assign(value=list(range(len(task.data)))) + return task + + +def _apply_failure_modes(task: _RowTask, is_resume_run: bool) -> _RowTask | None: + """Shared 3-mode logic for the four passthrough stages.""" + d = _decision(task._udid) + if d == "always_fail": + return None + if not is_resume_run and _is_transient(task._udid): + return None + if d == "filter": + n = len(task.data) + task.data = task.data.iloc[: max(1, n // 2)].reset_index(drop=True) + return task + + +@dataclass +class _PassThroughStage(ProcessingStage[_RowTask, _RowTask]): + name: str = "passthrough" + is_resume_run: bool = False + + def inputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def outputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def process(self, task: _RowTask) -> _RowTask | None: + return _apply_failure_modes(task, self.is_resume_run) + + +@dataclass +class _PassThroughBatchedStage(ProcessingStage[_RowTask, _RowTask]): + """Overrides 
``process_batch`` — must call the four lineage helpers itself. + + See [_resumability_runner.py:109-121] for the same contract on a real + multi-parent stage. + """ + + name: str = "passthrough_batched" + is_resume_run: bool = False + batch_size: int = 1 + + def inputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def outputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def process(self, task: _RowTask) -> _RowTask: + msg = "_PassThroughBatchedStage only supports batched execution" + raise NotImplementedError(msg) + + def process_batch(self, tasks: list[_RowTask]) -> list[_RowTask]: + tasks = self._filter_completed_tasks(tasks) + results: list[_RowTask] = [] + for task in tasks: + result = _apply_failure_modes(task, self.is_resume_run) + children = assign_child_lineage([task._lineage_path], result) + record_lineage([task._udid], [c._udid for c in children]) + if self._is_terminal_stage and children: + mark_leaves_completed([c._udid for c in children]) + results.extend(children) + return results + + +@dataclass +class _FanOutStage(ProcessingStage[_RowTask, _RowTask]): + name: str = "5_fanout" + factor: int = FANOUT_FACTOR + + def inputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def outputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def process(self, task: _RowTask) -> list[_RowTask]: + rows = len(task.data) + n_out = max(1, rows // self.factor) + chunk = max(1, -(-rows // n_out)) # ceil(rows / n_out) + out: list[_RowTask] = [] + for i in range(n_out): + sub = task.data.iloc[i * chunk : (i + 1) * chunk].reset_index(drop=True) + if len(sub) == 0: + continue + out.append( + _RowTask( + task_id=f"{task.task_id}_fan{i}", + dataset_name=task.dataset_name, + data=sub, + ) + ) + return out + + +@dataclass +class _WriteParquetStage(ProcessingStage[_RowTask, _RowTask]): + name: str = "8_write" + out_dir: str = "" + + def inputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def outputs(self) -> 
tuple[list[str], list[str]]: + return [], [] + + def process(self, task: _RowTask) -> _RowTask: + # Filename keyed on _udid so file existence ↔ LMDB completion 1:1. + path = Path(self.out_dir) / f"{task._udid}.parquet" + task.data.to_parquet(path, index=False) + return task + + +def _build_pipeline(out_dir: Path, is_resume_run: bool, batched_size: int) -> Pipeline: + return Pipeline( + name="resumability_three_modes", + stages=[ + _LsStage(), + _ReadMockStage(), + _PassThroughStage(name="3_passthrough", is_resume_run=is_resume_run), + _PassThroughBatchedStage( + name="4_passthrough_batched", + is_resume_run=is_resume_run, + batch_size=batched_size, + ), + _FanOutStage(), + _PassThroughStage(name="6_passthrough", is_resume_run=is_resume_run), + _PassThroughBatchedStage( + name="7_passthrough_batched", + is_resume_run=is_resume_run, + batch_size=batched_size, + ), + _WriteParquetStage(out_dir=str(out_dir)), + ], + ) + + +def _read_lmdb(path: Path) -> dict[str, LineageRecord]: + """Open the checkpoint after the executor killed the writer actor.""" + store = LineageStore(str(path)) + try: + return dict(store.iter_records()) + finally: + store.close() + + +@pytest.mark.parametrize( + "executor_cls", + [ + pytest.param(XennaExecutor, id="xenna"), + pytest.param(RayDataExecutor, id="ray_data"), + ], +) +@pytest.mark.parametrize("batched_size", [1, 4], ids=["batch1", "batch4"]) +@pytest.mark.usefixtures("shared_ray_cluster") +def test_resumability_three_modes(executor_cls: type[BaseExecutor], batched_size: int, tmp_path: Path) -> None: + """Two runs against the same LMDB: transient on run 1, disabled on run 2. + + Verifies the three resumability properties documented at the top of the + module across: + + * Both ``XennaExecutor`` and ``RayDataExecutor``. + * ``batch_size`` 1 and 4 on the two ``process_batch``-override stages — + the ``batch4`` axis exercises the multi-task-per-call path that the + demo's finding #1 flagged as broken on RayData. 
+ """ + checkpoint = tmp_path / "lineage.mdb" + out_dir = tmp_path / "out" + out_dir.mkdir() + + # --- Run 1: transient drops active --- + _build_pipeline(out_dir, is_resume_run=False, batched_size=batched_size).run( + executor_cls(), checkpoint_path=str(checkpoint) + ) + run1 = _read_lmdb(checkpoint) + run1_files = {p.stem for p in out_dir.glob("*.parquet")} + + # --- Run 2: same checkpoint, transient disabled --- + _build_pipeline(out_dir, is_resume_run=True, batched_size=batched_size).run( + executor_cls(), checkpoint_path=str(checkpoint) + ) + run2 = _read_lmdb(checkpoint) + run2_files = {p.stem for p in out_dir.glob("*.parquet")} + + # --- Transient rescued on resume --- + run1_completed_leaves = {u for u, r in run1.items() if r.completed and r.task_type in ("leaf", "source_leaf")} + run2_completed_leaves = {u for u, r in run2.items() if r.completed and r.task_type in ("leaf", "source_leaf")} + assert run1_completed_leaves < run2_completed_leaves, ( + f"run 2 must complete strictly more leaves than run 1 " + f"(run1={len(run1_completed_leaves)} run2={len(run2_completed_leaves)})" + ) + + # --- Always-fail branches never converge --- + assert any(not r.completed for r in run2.values()), ( + "always-fail branches should leave some records incomplete after resume" + ) + + # --- Determinism + filename/udid coupling --- + assert run1_files <= run2_files, "resume must not lose parquet outputs" + assert run2_files == run2_completed_leaves, ( + f"parquet filenames (= _udid) must match the completed-leaf set; " + f"files - completed = {run2_files - run2_completed_leaves}, " + f"completed - files = {run2_completed_leaves - run2_files}" + ) + + # --- Filter recorded with halved rows --- + # Post-fanout chunk size = FANOUT_FACTOR rows. A filter at stage 6 or 7 + # halves that. Any leaf with fewer than FANOUT_FACTOR rows must be a + # filter-affected branch. 
+ row_counts = [len(pd.read_parquet(p)) for p in out_dir.glob("*.parquet")] + assert any(c < FANOUT_FACTOR for c in row_counts), ( + f"expected at least one filter-affected parquet with fewer than " + f"{FANOUT_FACTOR} rows; got row counts {sorted(set(row_counts))}" + ) diff --git a/tests/stages/common/test_base.py b/tests/stages/common/test_base.py index b553eba6e1..a0703db19d 100644 --- a/tests/stages/common/test_base.py +++ b/tests/stages/common/test_base.py @@ -12,11 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +import contextlib +from pathlib import Path + import pytest +import ray from nemo_curator.stages.base import CompositeStage, ProcessingStage from nemo_curator.stages.resources import Resources from nemo_curator.tasks import Task +from nemo_curator.utils.lineage_store import ( + LINEAGE_ACTOR_NAME, + LineageWriterActor, +) class MockTask(Task[dict]): @@ -655,3 +663,121 @@ def test_composite_stage_inputs_and_outputs(self): # outputs() should return the last stage's outputs assert composite.outputs() == composite.decompose()[-1].outputs() + + +# --------------------------------------------------------------------------- # +# Completed-task filtering tests. 
+# --------------------------------------------------------------------------- # + + +def _kill_lineage_actor_if_present() -> None: + with contextlib.suppress(ValueError): + handle = ray.get_actor(LINEAGE_ACTOR_NAME) + ray.kill(handle) + + +@pytest.fixture +def lineage_actor(tmp_path: Path, shared_ray_client: None) -> tuple[object, Path]: # noqa: ARG001 + _kill_lineage_actor_if_present() + path = tmp_path / "lineage_filter.mdb" + handle = LineageWriterActor.options( + name=LINEAGE_ACTOR_NAME, + get_if_exists=True, + ).remote(path=str(path)) + try: + yield handle, path + finally: + with contextlib.suppress(Exception): + ray.get(handle.close.remote()) + ray.kill(handle) + + +class CountingStage(ProcessingStage[MockTask, MockTask]): + """ProcessingStage that records every task it processed via ``process``. + Returns a *new* task each call so the default ``process_batch``'s + in-place ``assign_child_lineage`` doesn't clobber the input task's udid.""" + + name = "CountingStage" + resources = Resources(cpus=1.0) + batch_size = 1 + + def __init__(self) -> None: + self.seen_udids: list[str] = [] + + def process(self, task: MockTask) -> MockTask: + self.seen_udids.append(task._udid) + return MockTask(data=dict(task.data)) + + def inputs(self) -> tuple[list[str], list[str]]: + return [], [] + + def outputs(self) -> tuple[list[str], list[str]]: + return [], [] + + +def _make_lineage_task(parent_path: str, index: int) -> MockTask: + t = MockTask(data={}) + t._set_lineage([parent_path], index) + return t + + +class TestFilterCompletedTasks: + """Behavior of :meth:`ProcessingStage._filter_completed_tasks` and the + default :meth:`process_batch` filter integration.""" + + def test_filter_with_no_actor_returns_input_unchanged(self) -> None: + """No actor registered → filter is a no-op even if Ray itself isn't up.""" + stage = CountingStage() + tasks = [_make_lineage_task("", i) for i in range(3)] + assert stage._filter_completed_tasks(tasks) == tasks + + def 
test_filter_drops_completed_tasks(self, lineage_actor: tuple[object, Path]) -> None: + actor_handle, _ = lineage_actor + stage = CountingStage() + tasks = [_make_lineage_task("", i) for i in range(3)] + for t in tasks: + ray.get(actor_handle.record_emission.remote([], [t._udid])) + ray.get(actor_handle.mark_completed.remote(tasks[1]._udid)) + + survivors = stage._filter_completed_tasks(tasks) + assert [t._udid for t in survivors] == [tasks[0]._udid, tasks[2]._udid] + + def test_filter_preserves_empty_udid_tasks(self, lineage_actor: tuple[object, Path]) -> None: + """Tasks with empty ``_udid`` (source / unassigned) are never filtered.""" + _ = lineage_actor + stage = CountingStage() + tasks = [MockTask(data={}), MockTask(data={})] # _udid == "" by default + assert stage._filter_completed_tasks(tasks) == tasks + + def test_filter_empty_input(self, lineage_actor: tuple[object, Path]) -> None: + _ = lineage_actor + stage = CountingStage() + assert stage._filter_completed_tasks([]) == [] + + def test_process_batch_skips_completed_tasks(self, lineage_actor: tuple[object, Path]) -> None: + """Completed tasks must not reach ``process``; survivors are returned.""" + actor_handle, _ = lineage_actor + stage = CountingStage() + tasks = [_make_lineage_task("", i) for i in range(3)] + for t in tasks: + ray.get(actor_handle.record_emission.remote([], [t._udid])) + ray.get(actor_handle.mark_completed.remote(tasks[0]._udid)) + ray.get(actor_handle.mark_completed.remote(tasks[2]._udid)) + + results = stage.process_batch(tasks) + + assert stage.seen_udids == [tasks[1]._udid] + assert len(results) == 1 + + def test_process_batch_all_completed_returns_empty(self, lineage_actor: tuple[object, Path]) -> None: + """Marking every task completed makes ``process_batch`` a no-op.""" + actor_handle, _ = lineage_actor + stage = CountingStage() + tasks = [_make_lineage_task("", i) for i in range(3)] + for t in tasks: + ray.get(actor_handle.record_emission.remote([], [t._udid])) + 
ray.get(actor_handle.mark_completed.remote(t._udid)) + + results = stage.process_batch(tasks) + assert results == [] + assert stage.seen_udids == [] diff --git a/tests/tasks/test_tasks.py b/tests/tasks/test_tasks.py index e76959c8e6..53699282e3 100644 --- a/tests/tasks/test_tasks.py +++ b/tests/tasks/test_tasks.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import hashlib from dataclasses import dataclass +from nemo_curator.backends.base import BaseStageAdapter from nemo_curator.stages.base import ProcessingStage from nemo_curator.tasks import Task @@ -69,3 +71,75 @@ def test_fanout_tasks_have_unique_uuid(): assert len(output) == 3 uuids = [t._uuid for t in output] assert len(set(uuids)) == 3, f"Expected unique _uuid per task, got {uuids}" + + +def _sha256_32(s: str) -> str: + return hashlib.sha256(s.encode()).hexdigest()[:32] + + +def test_lineage_path_and_udid_format(): + # Empty parent → just the child index + task = SimpleTask(task_id="root", dataset_name="t", data=[]) + task._set_lineage([], 4) + assert task._lineage_path == "4" + assert task._udid == _sha256_32("4") + + # Single non-empty parent + child = SimpleTask(task_id="c", dataset_name="t", data=[]) + child._set_lineage(["3"], 0) + assert child._lineage_path == "3_0" + assert child._udid == _sha256_32("3_0") + + # Multi-parent join + grandchild = SimpleTask(task_id="g", dataset_name="t", data=[]) + grandchild._set_lineage(["3_0", "4_1"], 2) + assert grandchild._lineage_path == "3_0_4_1_2" + assert grandchild._udid == _sha256_32("3_0_4_1_2") + + +def test_fanout_udid_from_empty_root(): + # Driving through the adapter triggers the default process_batch which + # calls assign_child_lineage. Parent _lineage_path is "" (no lineage + # assigned yet), so children get indices as their root paths. 
+ task = _sample_task() + output = BaseStageAdapter(Repeat(times=3)).process_batch([task]) + + assert [t._lineage_path for t in output] == ["0", "1", "2"] + assert [t._udid for t in output] == [_sha256_32("0"), _sha256_32("1"), _sha256_32("2")] + # Original _uuid stays random and unique per task. + assert len({t._uuid for t in output}) == 3 + + +def test_set_lineage_is_idempotent(): + # First assignment fills in path/udid and returns True. + task = SimpleTask(task_id="t", dataset_name="d", data=[]) + assert task._set_lineage(["3"], 0) is True + assert task._lineage_path == "3_0" + assert task._udid == _sha256_32("3_0") + + # Re-assigning with different parent/index is a no-op and returns False. + # This is how the framework detects that a stage returned a task in place. + assert task._set_lineage(["7"], 4) is False + assert task._lineage_path == "3_0" + assert task._udid == _sha256_32("3_0") + + +def test_udid_deterministic_across_runs(): + # Same pipeline run twice over the same input must yield byte-identical + # _udid / _lineage_path sequences. (`_uuid` will differ because it's a + # fresh uuid4 each run; that's expected and not what _udid is for.) + def run_once() -> tuple[list[str], list[str]]: + task = _sample_task() + after_first = BaseStageAdapter(Repeat(times=2)).process_batch([task]) + after_second = BaseStageAdapter(Repeat(times=3)).process_batch(after_first) + return ( + [t._lineage_path for t in after_second], + [t._udid for t in after_second], + ) + + paths_a, udids_a = run_once() + paths_b, udids_b = run_once() + assert paths_a == paths_b + assert udids_a == udids_b + # Sanity: lineage paths follow the documented "{parent_idx}_{child_idx}" shape. 
+ assert paths_a == [f"{i}_{j}" for i in range(2) for j in range(3)] diff --git a/tests/utils/test_lineage_store.py b/tests/utils/test_lineage_store.py new file mode 100644 index 0000000000..145e80d16c --- /dev/null +++ b/tests/utils/test_lineage_store.py @@ -0,0 +1,614 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for :mod:`nemo_curator.utils.lineage_store`. + +The storage-layer tests use :class:`LineageStore` directly. The end-to-end +helper tests spawn a real :class:`LineageWriterActor` and verify that calling +:func:`assign_and_record_lineage` writes through the actor. 
+""" + +import contextlib +from dataclasses import dataclass +from pathlib import Path + +import pytest +import ray + +from nemo_curator.stages.base import assign_child_lineage +from nemo_curator.tasks import Task +from nemo_curator.utils.lineage_store import ( + LINEAGE_ACTOR_NAME, + LineageStore, + LineageWriterActor, + _classify, + _path_to_udid, + are_completed, + mark_leaves_completed, + record_lineage, +) + + +@dataclass +class _T(Task[None]): + @property + def num_items(self) -> int: + return 0 + + def validate(self) -> bool: + return True + + +def _make_child(parent_path: str, i: int) -> _T: + """Build a lineage-assigned task as if it were emitted by a stage.""" + t = _T(task_id=f"t{i}", dataset_name="ds", data=None) + t._set_lineage([parent_path], i) + return t + + +# --------------------------------------------------------------------------- # +# Direct LineageStore tests — no Ray required. +# --------------------------------------------------------------------------- # + + +@pytest.fixture +def store(tmp_path: Path) -> LineageStore: + path = tmp_path / "lineage.mdb" + s = LineageStore(str(path)) + try: + yield s + finally: + s.close() + + +def test_classify_truth_table() -> None: + assert _classify(False, False) == b"source_leaf" + assert _classify(False, True) == b"source" + assert _classify(True, False) == b"leaf" + assert _classify(True, True) == b"middle" + + +def test_get_returns_none_for_unknown(store: LineageStore) -> None: + assert store.get("doesnotexist" + "0" * 20) is None + + +def test_records_single_edge_with_types(store: LineageStore) -> None: + parent = "p" * 32 + child = "c" * 32 + store.record_emission([parent], [child]) + + p_rec = store.get(parent) + c_rec = store.get(child) + assert p_rec is not None + assert c_rec is not None + assert p_rec.children == [child] + assert p_rec.parents == [] + assert p_rec.task_type == "source" # provisional: no parents seen yet for `parent` + assert c_rec.parents == [parent] + assert c_rec.children == 
[] + assert c_rec.task_type == "leaf" + + +def test_emission_is_idempotent_under_retry(store: LineageStore) -> None: + parent = "p" * 32 + child = "c" * 32 + store.record_emission([parent], [child]) + store.record_emission([parent], [child]) # retry + + assert store.get(parent).children == [child] # no duplicate edge + assert store.get(child).parents == [parent] + + +def test_incremental_parent_attribution(store: LineageStore) -> None: + """Multiple calls for the same child accumulate parents.""" + p1 = "1" * 32 + p2 = "2" * 32 + child = "c" * 32 + store.record_emission([p1], [child]) + store.record_emission([p2], [child]) + + rec = store.get(child) + assert set(rec.parents) == {p1, p2} + assert rec.task_type == "leaf" + assert set(store.get(p1).children) == {child} + assert set(store.get(p2).children) == {child} + + +def test_type_promotes_monotonically_under_reordering(store: LineageStore) -> None: + """If a node first appears as a parent, it's provisionally `source`. When + its own parent-edge later arrives, it must promote to `middle`.""" + grandparent = "g" * 32 + parent = "p" * 32 + child = "c" * 32 + + # Out-of-order: child created from parent first; parent's own creation arrives later. + store.record_emission([parent], [child]) + assert store.get(parent).task_type == "source" + + store.record_emission([grandparent], [parent]) + assert store.get(parent).task_type == "middle" + assert store.get(grandparent).task_type == "source" + assert store.get(child).task_type == "leaf" + + +def test_source_leaf_classification(store: LineageStore) -> None: + """A task with no parents and no children is `source_leaf`.""" + orphan = "o" * 32 + # Emit a child with no real parents, AND no children of its own. 
+ store.record_emission([], [orphan]) + assert store.get(orphan).task_type == "source_leaf" + + +def test_completed_defaults_false_and_can_be_set(store: LineageStore) -> None: + udid = "x" * 32 + store.record_emission([], [udid]) + assert store.get(udid).completed is False + store.mark_completed(udid) + assert store.is_completed(udid) is True + assert store.get(udid).completed is True + # Idempotent. + store.mark_completed(udid) + assert store.is_completed(udid) is True + + +def test_iter_records_returns_all(store: LineageStore) -> None: + udids = ["a" * 32, "b" * 32, "c" * 32] + # a → b, b → c + store.record_emission([udids[0]], [udids[1]]) + store.record_emission([udids[1]], [udids[2]]) + + all_records = dict(store.iter_records()) + assert set(all_records.keys()) == set(udids) + assert all_records[udids[0]].task_type == "source" + assert all_records[udids[1]].task_type == "middle" + assert all_records[udids[2]].task_type == "leaf" + + +def test_get_all_parents_chain(store: LineageStore) -> None: + """a → b → c: ``get_all_parents(c)`` returns both ``a`` and ``b``.""" + a, b, c = "a" * 32, "b" * 32, "c" * 32 + store.record_emission([a], [b]) + store.record_emission([b], [c]) + + parents = store.get_all_parents(c) + assert set(parents.keys()) == {a, b} + assert parents[b].parents == [a] + assert parents[a].parents == [] + + +def test_get_all_children_chain(store: LineageStore) -> None: + """a → b → c: ``get_all_children(a)`` returns both ``b`` and ``c``.""" + a, b, c = "a" * 32, "b" * 32, "c" * 32 + store.record_emission([a], [b]) + store.record_emission([b], [c]) + + children = store.get_all_children(a) + assert set(children.keys()) == {b, c} + assert children[b].children == [c] + assert children[c].children == [] + + +def test_transitive_diamond_dedup(store: LineageStore) -> None: + """Diamond: a → {b, c} → d. 
``a`` appears once in ``get_all_parents(d)``.""" + a, b, c, d = "a" * 32, "b" * 32, "c" * 32, "d" * 32 + store.record_emission([a], [b]) + store.record_emission([a], [c]) + store.record_emission([b], [d]) + store.record_emission([c], [d]) + + assert set(store.get_all_parents(d).keys()) == {a, b, c} + assert set(store.get_all_children(a).keys()) == {b, c, d} + + +def test_transitive_unknown_returns_empty(store: LineageStore) -> None: + unknown = "u" * 32 + assert store.get_all_parents(unknown) == {} + assert store.get_all_children(unknown) == {} + + +def test_transitive_source_and_leaf_empty(store: LineageStore) -> None: + a, b = "a" * 32, "b" * 32 + store.record_emission([a], [b]) + # Pure source: no ancestors. + assert store.get_all_parents(a) == {} + # Pure leaf: no descendants. + assert store.get_all_children(b) == {} + + +def test_transitive_excludes_self(store: LineageStore) -> None: + a, b, c = "a" * 32, "b" * 32, "c" * 32 + store.record_emission([a], [b]) + store.record_emission([b], [c]) + for udid in (a, b, c): + assert udid not in store.get_all_parents(udid) + assert udid not in store.get_all_children(udid) + + +def test_record_emission_skips_self_loop_in_place_return(store: LineageStore) -> None: + """In-place return: parent and child udids match. 
No self-edge is recorded + and the node stays ``source_leaf`` rather than getting promoted to ``middle``.""" + a = "a" * 32 + store.record_emission([a], [a]) + + rec = store.get(a) + assert rec is not None + assert rec.parents == [] + assert rec.children == [] + assert rec.task_type == "source_leaf" + + +def test_record_emission_keeps_cross_edges_when_one_child_is_self(store: LineageStore) -> None: + """``parents=[a]``, ``children=[a, b]``: only ``a→a`` is dropped; ``a→b`` is kept.""" + a, b = "a" * 32, "b" * 32 + store.record_emission([a], [a, b]) + + a_rec = store.get(a) + b_rec = store.get(b) + assert a_rec is not None + assert b_rec is not None + assert a_rec.parents == [] + assert a_rec.children == [b] + assert a_rec.task_type == "source" + assert b_rec.parents == [a] + assert b_rec.children == [] + assert b_rec.task_type == "leaf" + + +def test_record_emission_keeps_cross_edges_in_multi_parent_self(store: LineageStore) -> None: + """``parents=[a, b]``, ``children=[a, c]``: edges are ``b→a``, ``a→c``, ``b→c``; + only ``a→a`` is dropped.""" + a, b, c = "a" * 32, "b" * 32, "c" * 32 + store.record_emission([a, b], [a, c]) + + a_rec = store.get(a) + b_rec = store.get(b) + c_rec = store.get(c) + assert a_rec is not None + assert b_rec is not None + assert c_rec is not None + assert a_rec.parents == [b] + assert set(a_rec.children) == {c} + assert b_rec.parents == [] + assert set(b_rec.children) == {a, c} + assert set(c_rec.parents) == {a, b} + assert c_rec.children == [] + + +def test_path_to_udid_matches_task_set_lineage() -> None: + """Mirror invariant: hashing a lineage path with ``_path_to_udid`` yields the same + ``_udid`` that ``Task._set_lineage`` would assign.""" + t = _T(task_id="t", dataset_name="ds", data=None) + t._set_lineage(["3_0"], 7) + assert t._lineage_path == "3_0_7" + assert _path_to_udid("3_0_7") == t._udid + + +# --------------------------------------------------------------------------- # +# BFS completion-propagation tests. 
+# --------------------------------------------------------------------------- # + + +def test_propagate_linear_chain(store: LineageStore) -> None: + """A→B→C→D: propagating from D rolls up to A.""" + a, b, c, d = "a" * 32, "b" * 32, "c" * 32, "d" * 32 + store.record_emission([a], [b]) + store.record_emission([b], [c]) + store.record_emission([c], [d]) + + newly = store.mark_completed_and_propagate([d]) + assert set(newly) == {a, b, c, d} + for udid in (a, b, c, d): + assert store.is_completed(udid) + + +def test_propagate_diamond_partial(store: LineageStore) -> None: + """A→{B,C}; propagating only from B does not mark A because C is still pending. + A second pass that completes C then rolls up to A.""" + a, b, c = "a" * 32, "b" * 32, "c" * 32 + store.record_emission([a], [b]) + store.record_emission([a], [c]) + + newly_first = store.mark_completed_and_propagate([b]) + assert newly_first == [b] + assert store.is_completed(b) + assert not store.is_completed(a) + assert not store.is_completed(c) + + newly_second = store.mark_completed_and_propagate([c]) + assert set(newly_second) == {a, c} + assert store.is_completed(a) + + +def test_propagate_diamond_full_batch(store: LineageStore) -> None: + """A→{B,C}→D: batch-propagate from D marks all four; A appears once (visited dedup).""" + a, b, c, d = "a" * 32, "b" * 32, "c" * 32, "d" * 32 + store.record_emission([a], [b]) + store.record_emission([a], [c]) + store.record_emission([b], [d]) + store.record_emission([c], [d]) + + newly = store.mark_completed_and_propagate([d]) + assert set(newly) == {a, b, c, d} + assert newly.count(a) == 1 + for udid in (a, b, c, d): + assert store.is_completed(udid) + + +def test_propagate_idempotent(store: LineageStore) -> None: + """Calling propagate twice returns empty the second time; state unchanged.""" + a, b = "a" * 32, "b" * 32 + store.record_emission([a], [b]) + + newly_first = store.mark_completed_and_propagate([b]) + assert set(newly_first) == {a, b} + + newly_second = 
store.mark_completed_and_propagate([b]) + assert newly_second == [] + assert store.is_completed(a) + assert store.is_completed(b) + + +def test_propagate_stops_at_incomplete_sibling(store: LineageStore) -> None: + """Fan-out A→{C1,C2,C3}: completing only C1 must not mark A because C2 and + C3 are still pending children. A second pass completing C2 still leaves A + blocked. Only after C3 is also completed does A get rolled up.""" + a, c1, c2, c3 = "a" * 32, "1" * 32, "2" * 32, "3" * 32 + store.record_emission([a], [c1]) + store.record_emission([a], [c2]) + store.record_emission([a], [c3]) + + first = store.mark_completed_and_propagate([c1]) + assert first == [c1] + assert store.is_completed(c1) + assert not store.is_completed(a) + assert not store.is_completed(c2) + assert not store.is_completed(c3) + + second = store.mark_completed_and_propagate([c2]) + assert set(second) == {c2} + assert not store.is_completed(a) + + third = store.mark_completed_and_propagate([c3]) + assert set(third) == {a, c3} + assert store.is_completed(a) + + +def test_propagate_unknown_udid_is_noop(store: LineageStore) -> None: + """An unknown but non-empty udid is silently skipped.""" + newly = store.mark_completed_and_propagate(["u" * 32]) + assert newly == [] + + +def test_propagate_empty_udid_raises(store: LineageStore) -> None: + """An empty udid means the caller forgot ``assign_child_lineage`` — raise loudly. + The known leaf in the same batch must NOT be marked, since the call aborts.""" + leaf = "x" * 32 + store.record_emission([], [leaf]) + with pytest.raises(ValueError, match="empty udid"): + store.mark_completed_and_propagate(["", leaf]) + assert not store.is_completed(leaf) + + +# --------------------------------------------------------------------------- # +# Bulk are_completed tests — single read txn over many udids. 
+# --------------------------------------------------------------------------- # + + +def test_are_completed_empty_input(store: LineageStore) -> None: + assert store.are_completed([]) == [] + + +def test_are_completed_all_completed(store: LineageStore) -> None: + udids = ["a" * 32, "b" * 32, "c" * 32] + for u in udids: + store.record_emission([], [u]) + store.mark_completed(u) + assert store.are_completed(udids) == [True, True, True] + + +def test_are_completed_none_completed(store: LineageStore) -> None: + udids = ["a" * 32, "b" * 32, "c" * 32] + for u in udids: + store.record_emission([], [u]) + assert store.are_completed(udids) == [False, False, False] + + +def test_are_completed_mixed_preserves_order(store: LineageStore) -> None: + udids = ["a" * 32, "b" * 32, "c" * 32, "d" * 32] + for u in udids: + store.record_emission([], [u]) + store.mark_completed(udids[0]) + store.mark_completed(udids[2]) + assert store.are_completed(udids) == [True, False, True, False] + + +def test_are_completed_unknown_udids_return_false(store: LineageStore) -> None: + """Never-recorded udids return False (no key in completed_db).""" + assert store.are_completed(["u" * 32, "v" * 32]) == [False, False] + + +def test_are_completed_empty_string_returns_false(store: LineageStore) -> None: + """Empty udid short-circuits to False without an LMDB lookup.""" + udid = "a" * 32 + store.record_emission([], [udid]) + store.mark_completed(udid) + assert store.are_completed(["", udid]) == [False, True] + + +# --------------------------------------------------------------------------- # +# Actor-routed tests — verify record_lineage → LineageWriterActor → LMDB. 
+# --------------------------------------------------------------------------- # + + +def _kill_actor_if_present() -> None: + """Make sure no leftover writer actor lingers between tests.""" + with contextlib.suppress(ValueError): + handle = ray.get_actor(LINEAGE_ACTOR_NAME) + ray.kill(handle) + + +@pytest.fixture +def actor(tmp_path: Path, shared_ray_client: None) -> tuple[object, Path]: # noqa: ARG001 + """Spawn a real :class:`LineageWriterActor` for the duration of the test.""" + _kill_actor_if_present() + path = tmp_path / "lineage_actor.mdb" + handle = LineageWriterActor.options( + name=LINEAGE_ACTOR_NAME, + get_if_exists=True, + ).remote(path=str(path)) + try: + yield handle, path + finally: + with contextlib.suppress(Exception): + ray.get(handle.close.remote()) + ray.kill(handle) + + +def test_record_lineage_filters_empty_parents(actor: tuple[object, Path]) -> None: + """``record_lineage`` should not record EmptyTask-style empty parent udids.""" + actor_handle, _ = actor + children = assign_child_lineage([""], [_T(task_id="c", dataset_name="ds", data=None) for _ in range(3)]) + record_lineage([""], [c._udid for c in children]) + assert [c._lineage_path for c in children] == ["0", "1", "2"] + + for i, c in enumerate(children): + rec = ray.get(actor_handle.get.remote(c._udid)) + assert rec is not None, f"child {i} not in store" + assert rec.parents == [] + assert rec.task_type == "source_leaf" + + +def test_record_lineage_propagates_lineage(actor: tuple[object, Path]) -> None: + """End-to-end through the actor: drive two stages with separate + :func:`assign_child_lineage` + :func:`record_lineage` calls and check the + DAG ends up in the store with correct types.""" + actor_handle, _ = actor + + # Stage 1: produce three children from an empty root. + parents = assign_child_lineage([""], [_T(task_id=f"p{i}", dataset_name="ds", data=None) for i in range(3)]) + record_lineage([""], [p._udid for p in parents]) + + # Stage 2: each parent produces two children. 
+ grandchildren = [] + for p in parents: + emitted = assign_child_lineage( + [p._lineage_path], + [_T(task_id="g", dataset_name="ds", data=None) for _ in range(2)], + ) + record_lineage([p._udid], [c._udid for c in emitted]) + grandchildren.extend(emitted) + + # Sources: 3 parents, classified `source` because they now have children. + for p in parents: + rec = ray.get(actor_handle.get.remote(p._udid)) + assert rec is not None + assert rec.task_type == "source" + assert len(rec.children) == 2 + assert rec.parents == [] + + # Leaves: 6 grandchildren, each with one parent. + for g in grandchildren: + rec = ray.get(actor_handle.get.remote(g._udid)) + assert rec is not None + assert rec.task_type == "leaf" + assert len(rec.parents) == 1 + assert rec.children == [] + + +def test_record_lineage_is_noop_without_actor(shared_ray_client: None) -> None: # noqa: ARG001 + """When no LineageWriterActor is registered, ``record_lineage`` must not raise + and must not write anything (no actor exists to write to).""" + _kill_actor_if_present() + # If this returned an error path it would raise. We don't have a record store + # to assert "no write" against directly, but the absence of an actor means + # there's literally nowhere for it to write — successful return is the assertion. 
+ child = _make_child("", 0) + record_lineage([""], [child._udid]) + + +def test_mark_completed_and_propagate_actor_passthrough(actor: tuple[object, Path]) -> None: + """Drive a small DAG via the actor and confirm propagation through ``ray.get``.""" + actor_handle, _ = actor + a, b, c = "a" * 32, "b" * 32, "c" * 32 + ray.get(actor_handle.record_emission.remote([a], [b])) + ray.get(actor_handle.record_emission.remote([b], [c])) + + newly = ray.get(actor_handle.mark_completed_and_propagate.remote([c])) + assert set(newly) == {a, b, c} + for udid in (a, b, c): + assert ray.get(actor_handle.is_completed.remote(udid)) + + +def test_mark_leaves_completed_routes_through_actor(actor: tuple[object, Path]) -> None: + """The :func:`mark_leaves_completed` helper looks up the named actor and forwards + to its ``mark_completed_and_propagate`` method, rolling completion up the DAG.""" + actor_handle, _ = actor + a, b, c = "a" * 32, "b" * 32, "c" * 32 + ray.get(actor_handle.record_emission.remote([a], [b])) + ray.get(actor_handle.record_emission.remote([b], [c])) + + mark_leaves_completed([c]) + + for udid in (a, b, c): + assert ray.get(actor_handle.is_completed.remote(udid)) + + +def test_mark_leaves_completed_noop_without_actor(shared_ray_client: None) -> None: # noqa: ARG001 + """When no LineageWriterActor is registered, :func:`mark_leaves_completed` must + return silently — mirrors the :func:`record_lineage` no-op contract.""" + _kill_actor_if_present() + # No actor, no destination — successful return is the assertion. 
+ mark_leaves_completed(["x" * 32]) + + +def test_mark_leaves_completed_filters_empty_udids(actor: tuple[object, Path]) -> None: + """Empty udids in the input list are filtered (parity with :func:`record_lineage`) + rather than triggering the underlying ``ValueError``; non-empty udids still get + marked.""" + actor_handle, _ = actor + leaf = "z" * 32 + ray.get(actor_handle.record_emission.remote([], [leaf])) + + # Mixing an empty udid in must not raise; the real leaf still gets marked. + mark_leaves_completed(["", leaf]) + assert ray.get(actor_handle.is_completed.remote(leaf)) + + +def test_module_are_completed_without_ray() -> None: + """No Ray initialized → bulk helper returns all False (filter no-op).""" + if ray.is_initialized(): + pytest.skip("ray already initialized by another test in this session") + assert are_completed(["a" * 32, "b" * 32]) == [False, False] + + +def test_module_are_completed_no_actor(shared_ray_client: None) -> None: # noqa: ARG001 + """Ray up but no LineageWriterActor registered → all False.""" + _kill_actor_if_present() + assert are_completed(["a" * 32, "b" * 32]) == [False, False] + + +def test_module_are_completed_with_actor(actor: tuple[object, Path]) -> None: + """Module-level helper routes through the registered actor and preserves order.""" + actor_handle, _ = actor + udids = ["a" * 32, "b" * 32, "c" * 32, "d" * 32] + for u in udids: + ray.get(actor_handle.record_emission.remote([], [u])) + ray.get(actor_handle.mark_completed.remote(udids[1])) + ray.get(actor_handle.mark_completed.remote(udids[3])) + + assert are_completed(udids) == [False, True, False, True] + + +def test_module_are_completed_empty_input(actor: tuple[object, Path]) -> None: # noqa: ARG001 + """Empty input short-circuits before any actor call.""" + assert are_completed([]) == [] diff --git a/uv.lock b/uv.lock index fd786a09ae..4470e57b85 100644 --- a/uv.lock +++ b/uv.lock @@ -1813,17 +1813,49 @@ dependencies = [ ] sdist = { url = 
"https://files.pythonhosted.org/packages/bd/d4/16916f3dc20a3f5455b63c35dcb260b3716f59ce27a93586804e70e431d5/cytoolz-1.1.0.tar.gz", hash = "sha256:13a7bf254c3c0d28b12e2290b82aed0f0977a4c2a2bf84854fcdc7796a29f3b0", size = 642510, upload-time = "2025-10-19T00:44:56.174Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/df/aa/365953926ee8b4f2e07df7200c0d73632155908c8867af14b2d19cc9f1f7/cytoolz-1.1.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:098d628a801dc142e9740126be5624eb7aef1d732bc7a5719f60a2095547b485", size = 2639311, upload-time = "2025-10-19T00:40:22.289Z" }, + { url = "https://files.pythonhosted.org/packages/7c/ee/62beaaee7df208f22590ad07ef8875519af49c52ca39d99460b14a00f15a/cytoolz-1.1.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:779ee4096ed7a82cffab89372ffc339631c285079dbf33dbe7aff1f6174985df", size = 2979532, upload-time = "2025-10-19T00:40:24.006Z" }, + { url = "https://files.pythonhosted.org/packages/c5/04/2211251e450bed111ada1194dc42c461da9aea441de62a01e4085ea6de9f/cytoolz-1.1.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f2ce18dd99533d077e9712f9faa852f389f560351b1efd2f2bdb193a95eddde2", size = 3018632, upload-time = "2025-10-19T00:40:26.175Z" }, { url = "https://files.pythonhosted.org/packages/ed/a2/4a3400e4d07d3916172bf74fede08020d7b4df01595d8a97f1e9507af5ae/cytoolz-1.1.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ac266a34437812cf841cecbfe19f355ab9c3dd1ef231afc60415d40ff12a76e4", size = 2788579, upload-time = "2025-10-19T00:40:27.878Z" }, + { url = "https://files.pythonhosted.org/packages/fe/82/bb88caa53a41f600e7763c517d50e2efbbe6427ea395716a92b83f44882a/cytoolz-1.1.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1920b9b9c13d60d0bb6cd14594b3bce0870022eccb430618c37156da5f2b7a55", size = 
2593024, upload-time = "2025-10-19T00:40:29.601Z" }, + { url = "https://files.pythonhosted.org/packages/d4/56/faec7696f235521b926ffdf92c102f5b029f072d28e1020364e55b084820/cytoolz-1.1.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:5ab2c97d8aaa522b038cca9187b1153347af22309e7c998b14750c6fdec7b1cb", size = 2654461, upload-time = "2025-10-19T00:40:32.884Z" }, + { url = "https://files.pythonhosted.org/packages/d9/b3/80b8183e7eee44f45bfa3cdd3ebdadf3dd43ffc686f96d442a6c4dded45d/cytoolz-1.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:7fc0f1e4e9bb384d26e73c6657bbc26abdae4ff66a95933c00f3d578be89181b", size = 2881589, upload-time = "2025-10-19T00:40:36.315Z" }, + { url = "https://files.pythonhosted.org/packages/8f/05/ac5ba5ddb88a3ba7ecea4bf192194a838af564d22ea7a4812cbb6bd106ce/cytoolz-1.1.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:dd3f894ff972da1994d06ac6157d74e40dda19eb31fe5e9b7863ca4278c3a167", size = 2589924, upload-time = "2025-10-19T00:40:38.317Z" }, + { url = "https://files.pythonhosted.org/packages/8e/cd/100483cae3849d24351c8333a815dc6adaf3f04912486e59386d86d9db9a/cytoolz-1.1.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0846f49cf8a4496bd42659040e68bd0484ce6af819709cae234938e039203ba0", size = 2868059, upload-time = "2025-10-19T00:40:40.025Z" }, { url = "https://files.pythonhosted.org/packages/34/6e/3a7c56b325772d39397fc3aafb4dc054273982097178b6c3917c6dad48de/cytoolz-1.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:16a3af394ade1973226d64bb2f9eb3336adbdea03ed5b134c1bbec5a3b20028e", size = 2721692, upload-time = "2025-10-19T00:40:41.621Z" }, { url = "https://files.pythonhosted.org/packages/fd/04/2ab98edeea90311e4029e1643e43d2027b54da61453292d9ea51a103ee87/cytoolz-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:ebf06d1c5344fb22fee71bf664234733e55db72d74988f2ecb7294b05e4db30c", size = 945831, upload-time = "2025-10-19T00:40:44.693Z" }, + { url = 
"https://files.pythonhosted.org/packages/0c/93/9c787f7c909e75670fff467f2504725d06d8c3f51d6dfe22c55a08c8ccd4/cytoolz-1.1.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:7d3e405e435320e08c5a1633afaf285a392e2d9cef35c925d91e2a31dfd7a688", size = 2679635, upload-time = "2025-10-19T00:40:57.799Z" }, + { url = "https://files.pythonhosted.org/packages/50/aa/9ee92c302cccf7a41a7311b325b51ebeff25d36c1f82bdc1bbe3f58dc947/cytoolz-1.1.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:923df8f5591e0d20543060c29909c149ab1963a7267037b39eee03a83dbc50a8", size = 2938352, upload-time = "2025-10-19T00:40:59.49Z" }, + { url = "https://files.pythonhosted.org/packages/6a/a3/3b58c5c1692c3bacd65640d0d5c7267a7ebb76204f7507aec29de7063d2f/cytoolz-1.1.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:25db9e4862f22ea0ae2e56c8bec9fc9fd756b655ae13e8c7b5625d7ed1c582d4", size = 3022121, upload-time = "2025-10-19T00:41:01.209Z" }, { url = "https://files.pythonhosted.org/packages/e1/93/c647bc3334355088c57351a536c2d4a83dd45f7de591fab383975e45bff9/cytoolz-1.1.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c7a98deb11ccd8e5d9f9441ef2ff3352aab52226a2b7d04756caaa53cd612363", size = 2857656, upload-time = "2025-10-19T00:41:03.456Z" }, + { url = "https://files.pythonhosted.org/packages/b2/c2/43fea146bf4141deea959e19dcddf268c5ed759dec5c2ed4a6941d711933/cytoolz-1.1.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:dce4ee9fc99104bc77efdea80f32ca5a650cd653bcc8a1d984a931153d3d9b58", size = 2551284, upload-time = "2025-10-19T00:41:05.347Z" }, + { url = "https://files.pythonhosted.org/packages/45/be/f8524bb9ad8812ad375e61238dcaa3177628234d1b908ad0b74e3657cafd/cytoolz-1.1.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = 
"sha256:3b5c5a192abda123ad45ef716ec9082b4cf7d95e9ada8291c5c2cc5558be858b", size = 2722884, upload-time = "2025-10-19T00:41:09.698Z" }, + { url = "https://files.pythonhosted.org/packages/d7/dd/88619f9c8d2b682562c0c886bbb7c35720cb83fda2ac9a41bdd14073d9bd/cytoolz-1.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:e7e29a1a03f00b4322196cfe8e2c38da9a6c8d573566052c586df83aacc5663c", size = 2839661, upload-time = "2025-10-19T00:41:13.053Z" }, + { url = "https://files.pythonhosted.org/packages/b8/8d/4478ebf471ee78dd496d254dc0f4ad729cd8e6ba8257de4f0a98a2838ef2/cytoolz-1.1.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:5291b117d71652a817ec164e7011f18e6a51f8a352cc9a70ed5b976c51102fda", size = 2547095, upload-time = "2025-10-19T00:41:16.054Z" }, + { url = "https://files.pythonhosted.org/packages/e6/68/f1dea33367b0b3f64e199c230a14a6b6f243c189020effafd31e970ca527/cytoolz-1.1.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:8caef62f846a9011676c51bda9189ae394cdd6bb17f2946ecaedc23243268320", size = 2870901, upload-time = "2025-10-19T00:41:17.727Z" }, { url = "https://files.pythonhosted.org/packages/4a/9a/33591c09dfe799b8fb692cf2ad383e2c41ab6593cc960b00d1fc8a145655/cytoolz-1.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:de425c5a8e3be7bb3a195e19191d28d9eb3c2038046064a92edc4505033ec9cb", size = 2765422, upload-time = "2025-10-19T00:41:20.075Z" }, { url = "https://files.pythonhosted.org/packages/ad/33/4c9bdf8390dc01d2617c7f11930697157164a52259b6818ddfa2f94f89f4/cytoolz-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:07156987f224c6dac59aa18fb8bf91e1412f5463961862716a3381bf429c8699", size = 947989, upload-time = "2025-10-19T00:41:23.288Z" }, { url = "https://files.pythonhosted.org/packages/d9/cb/efc1b29e211e0670a6953222afaac84dcbba5cb940b130c0e49858978040/cytoolz-1.1.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:26801c1a165e84786a99e03c9c9973356caaca002d66727b761fb1042878ef06", size = 992632, upload-time = 
"2025-10-19T00:41:30.612Z" }, + { url = "https://files.pythonhosted.org/packages/db/f5/0083608286ad1716eda7c41f868e85ac549f6fd6b7646993109fa0bdfd98/cytoolz-1.1.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:823df012ab90d2f2a0f92fea453528539bf71ac1879e518524cd0c86aa6df7b9", size = 2669312, upload-time = "2025-10-19T00:41:41.55Z" }, + { url = "https://files.pythonhosted.org/packages/47/a8/d16080b575520fe5da00cede1ece4e0a4180ec23f88dcdc6a2f5a90a7f7f/cytoolz-1.1.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2f1fcf9e7e7b3487883ff3f815abc35b89dcc45c4cf81c72b7ee457aa72d197b", size = 2922147, upload-time = "2025-10-19T00:41:43.252Z" }, + { url = "https://files.pythonhosted.org/packages/7e/bc/716c9c1243701e58cad511eb3937fd550e645293c5ed1907639c5d66f194/cytoolz-1.1.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4cdb3fa1772116827f263f25b0cdd44c663b6701346a56411960534a06c082de", size = 2981602, upload-time = "2025-10-19T00:41:45.354Z" }, { url = "https://files.pythonhosted.org/packages/14/bc/571b232996846b27f4ac0c957dc8bf60261e9b4d0d01c8d955e82329544e/cytoolz-1.1.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d1b5c95041741b81430454db65183e133976f45ac3c03454cfa8147952568529", size = 2830103, upload-time = "2025-10-19T00:41:47.959Z" }, + { url = "https://files.pythonhosted.org/packages/5b/55/c594afb46ecd78e4b7e1fb92c947ed041807875661ceda73baaf61baba4f/cytoolz-1.1.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b2079fd9f1a65f4c61e6278c8a6d4f85edf30c606df8d5b32f1add88cbbe2286", size = 2533802, upload-time = "2025-10-19T00:41:49.683Z" }, + { url = "https://files.pythonhosted.org/packages/e2/df/035a408df87f25cfe3611557818b250126cd2281b2104cd88395de205583/cytoolz-1.1.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = 
"sha256:06d1c79aa51e6a92a90b0e456ebce2288f03dd6a76c7f582bfaa3eda7692e8a5", size = 2707575, upload-time = "2025-10-19T00:41:53.305Z" }, + { url = "https://files.pythonhosted.org/packages/30/7a/2c3d60682b26058d435416c4e90d4a94db854de5be944dfd069ed1be648a/cytoolz-1.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:964b248edc31efc50a65e9eaa0c845718503823439d2fa5f8d2c7e974c2b5409", size = 2819605, upload-time = "2025-10-19T00:41:58.257Z" }, + { url = "https://files.pythonhosted.org/packages/45/92/19b722a1d83cc443fbc0c16e0dc376f8a451437890d3d9ee370358cf0709/cytoolz-1.1.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:c9ff2b3c57c79b65cb5be14a18c6fd4a06d5036fb3f33e973a9f70e9ac13ca28", size = 2533559, upload-time = "2025-10-19T00:42:00.324Z" }, + { url = "https://files.pythonhosted.org/packages/1d/15/fa3b7891da51115204416f14192081d3dea0eaee091f123fdc1347de8dd1/cytoolz-1.1.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:22290b73086af600042d99f5ce52a43d4ad9872c382610413176e19fc1d4fd2d", size = 2839171, upload-time = "2025-10-19T00:42:01.881Z" }, { url = "https://files.pythonhosted.org/packages/46/40/d3519d5cd86eebebf1e8b7174ec32dfb6ecec67b48b0cfb92bf226659b5a/cytoolz-1.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a2ade74fccd080ea793382968913ee38d7a35c921df435bbf0a6aeecf0d17574", size = 2743379, upload-time = "2025-10-19T00:42:03.809Z" }, { url = "https://files.pythonhosted.org/packages/d6/a4/fb7eb403c6a4c81e5a30363f34a71adcc8bf5292dc8ea32e2440aa5668f2/cytoolz-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9e2d3fe3b45c3eb7233746f7aca37789be3dceec3e07dcc406d3e045ea0f7bdc", size = 946461, upload-time = "2025-10-19T00:42:07.983Z" }, + { url = "https://files.pythonhosted.org/packages/9a/71/1d1103b819458679277206ad07d78ca6b31c4bb88d6463fd193e19bfb270/cytoolz-1.1.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4d96ff3d381423af1b105295f97de86d1db51732c9566eb37378bab6670c5010", size = 2807149, 
upload-time = "2025-10-19T00:42:20.964Z" }, + { url = "https://files.pythonhosted.org/packages/1a/d4/3d83a05a21e7d2ed2b9e6daf489999c29934b005de9190272b8a2e3735d0/cytoolz-1.1.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0ec96b3d537cdf47d4e76ded199f7440715f4c71029b45445cff92c1248808c2", size = 3111608, upload-time = "2025-10-19T00:42:22.684Z" }, + { url = "https://files.pythonhosted.org/packages/51/88/96f68354c3d4af68de41f0db4fe41a23b96a50a4a416636cea325490cfeb/cytoolz-1.1.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:208e2f2ef90a32b0acbff3303d90d89b13570a228d491d2e622a7883a3c68148", size = 3179373, upload-time = "2025-10-19T00:42:24.395Z" }, { url = "https://files.pythonhosted.org/packages/ce/50/ed87a5cd8e6f27ffbb64c39e9730e18ec66c37631db2888ae711909f10c9/cytoolz-1.1.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0d416a81bb0bd517558668e49d30a7475b5445f9bbafaab7dcf066f1e9adba36", size = 3003120, upload-time = "2025-10-19T00:42:26.18Z" }, + { url = "https://files.pythonhosted.org/packages/d3/a7/acde155b050d6eaa8e9c7845c98fc5fb28501568e78e83ebbf44f8855274/cytoolz-1.1.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f32e94c91ffe49af04835ee713ebd8e005c85ebe83e7e1fdcc00f27164c2d636", size = 2703225, upload-time = "2025-10-19T00:42:27.93Z" }, + { url = "https://files.pythonhosted.org/packages/89/7a/93e5f860926165538c85e1c5e1670ad3424f158df810f8ccd269da652138/cytoolz-1.1.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:bf069c5381d757debae891401b88b3a346ba3a28ca45ba9251103b282463fad8", size = 2862950, upload-time = "2025-10-19T00:42:31.803Z" }, + { url = "https://files.pythonhosted.org/packages/71/ca/adfa1fb7949478135a37755cb8e88c20cd6b75c22a05f1128f05f3ab2c60/cytoolz-1.1.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = 
"sha256:3e3872c21170f8341656f8692f8939e8800dcee6549ad2474d4c817bdefd62cd", size = 2979049, upload-time = "2025-10-19T00:42:35.377Z" }, + { url = "https://files.pythonhosted.org/packages/70/4c/7bf47a03a4497d500bc73d4204e2d907771a017fa4457741b2a1d7c09319/cytoolz-1.1.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:b9ddeff8e8fd65eb1fcefa61018100b2b627e759ea6ad275d2e2a93ffac147bf", size = 2699492, upload-time = "2025-10-19T00:42:37.133Z" }, + { url = "https://files.pythonhosted.org/packages/7e/e7/3d034b0e4817314f07aa465d5864e9b8df9d25cb260a53dd84583e491558/cytoolz-1.1.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:02feeeda93e1fa3b33414eb57c2b0aefd1db8f558dd33fdfcce664a0f86056e4", size = 2995646, upload-time = "2025-10-19T00:42:38.912Z" }, { url = "https://files.pythonhosted.org/packages/c1/62/be357181c71648d9fe1d1ce91cd42c63457dcf3c158e144416fd51dced83/cytoolz-1.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:d08154ad45349162b6c37f12d5d1b2e6eef338e657b85e1621e4e6a4a69d64cb", size = 2919481, upload-time = "2025-10-19T00:42:40.85Z" }, { url = "https://files.pythonhosted.org/packages/64/29/39c161e9204a9715321ddea698cbd0abc317e78522c7c642363c20589e71/cytoolz-1.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:1bb77bc6197e5cb19784b6a42bb0f8427e81737a630d9d7dda62ed31733f9e6c", size = 1004445, upload-time = "2025-10-19T00:42:44.855Z" }, { url = "https://files.pythonhosted.org/packages/f6/8a/606e4c7ed14aa6a86aee6ca84a2cb804754dc6c4905b8f94e09e49f1ce60/cytoolz-1.1.0-pp311-pypy311_pp73-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b7de5718e2113d4efccea3f06055758cdbc17388ecc3341ba4d1d812837d7c1a", size = 978877, upload-time = "2025-10-19T00:44:50.819Z" }, @@ -2912,12 +2944,18 @@ version = "3.3.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/c7/e5/40dbda2736893e3e53d25838e0f19a2b417dfc122b9989c91918db30b5d3/greenlet-3.3.0.tar.gz", hash = 
"sha256:a82bb225a4e9e4d653dd2fb7b8b2d36e4fb25bc0165422a11e48b88e9e6f78fb", size = 190651, upload-time = "2025-12-04T14:49:44.05Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/f2/89c5eb0faddc3ff014f1c04467d67dee0d1d334ab81fadbf3744847f8a8a/greenlet-3.3.0-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4243050a88ba61842186cb9e63c7dfa677ec146160b0efd73b855a3d9c7fcf32", size = 590338, upload-time = "2025-12-04T14:57:41.136Z" }, + { url = "https://files.pythonhosted.org/packages/80/d7/db0a5085035d05134f8c089643da2b44cc9b80647c39e93129c5ef170d8f/greenlet-3.3.0-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:670d0f94cd302d81796e37299bcd04b95d62403883b24225c6b5271466612f45", size = 601098, upload-time = "2025-12-04T15:07:11.898Z" }, { url = "https://files.pythonhosted.org/packages/dc/a6/e959a127b630a58e23529972dbc868c107f9d583b5a9f878fb858c46bc1a/greenlet-3.3.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6cb3a8ec3db4a3b0eb8a3c25436c2d49e3505821802074969db017b87bc6a948", size = 590206, upload-time = "2025-12-04T14:26:01.254Z" }, { url = "https://files.pythonhosted.org/packages/0a/5f/783a23754b691bfa86bd72c3033aa107490deac9b2ef190837b860996c9f/greenlet-3.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4449a736606bd30f27f8e1ff4678ee193bc47f6ca810d705981cfffd6ce0d8c5", size = 1615483, upload-time = "2025-12-04T14:27:28.083Z" }, { url = "https://files.pythonhosted.org/packages/1d/d5/c339b3b4bc8198b7caa4f2bd9fd685ac9f29795816d8db112da3d04175bb/greenlet-3.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:7652ee180d16d447a683c04e4c5f6441bae7ba7b17ffd9f6b3aff4605e9e6f71", size = 301164, upload-time = "2025-12-04T14:42:51.577Z" }, + { url = "https://files.pythonhosted.org/packages/a4/39/f1c8da50024feecd0793dbd5e08f526809b8ab5609224a2da40aad3a7641/greenlet-3.3.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = 
"sha256:e8e18ed6995e9e2c0b4ed264d2cf89260ab3ac7e13555b8032b25a74c6d18655", size = 607742, upload-time = "2025-12-04T14:57:42.349Z" }, + { url = "https://files.pythonhosted.org/packages/77/cb/43692bcd5f7a0da6ec0ec6d58ee7cddb606d055ce94a62ac9b1aa481e969/greenlet-3.3.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c024b1e5696626890038e34f76140ed1daf858e37496d33f2af57f06189e70d7", size = 622297, upload-time = "2025-12-04T15:07:13.552Z" }, { url = "https://files.pythonhosted.org/packages/75/b0/6bde0b1011a60782108c01de5913c588cf51a839174538d266de15e4bf4d/greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:047ab3df20ede6a57c35c14bf5200fcf04039d50f908270d3f9a7a82064f543b", size = 609885, upload-time = "2025-12-04T14:26:02.368Z" }, { url = "https://files.pythonhosted.org/packages/05/f5/49a9ac2dff7f10091935def9165c90236d8f175afb27cbed38fb1d61ab6b/greenlet-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83cd0e36932e0e7f36a64b732a6f60c2fc2df28c351bae79fbaf4f8092fe7614", size = 1636017, upload-time = "2025-12-04T14:27:29.688Z" }, { url = "https://files.pythonhosted.org/packages/6c/79/3912a94cf27ec503e51ba493692d6db1e3cd8ac7ac52b0b47c8e33d7f4f9/greenlet-3.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:a7a34b13d43a6b78abf828a6d0e87d3385680eaf830cd60d20d52f249faabf39", size = 301964, upload-time = "2025-12-04T14:36:58.316Z" }, + { url = "https://files.pythonhosted.org/packages/c2/ff/7c985128f0514271b8268476af89aee6866df5eec04ac17dcfbc676213df/greenlet-3.3.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7d2d9fd66bfadf230b385fdc90426fcd6eb64db54b40c495b72ac0feb5766c54", size = 610211, upload-time = "2025-12-04T14:57:43.968Z" }, + { url = "https://files.pythonhosted.org/packages/79/07/c47a82d881319ec18a4510bb30463ed6891f2ad2c1901ed5ec23d3de351f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:30a6e28487a790417d036088b3bcb3f3ac7d8babaa7d0139edbaddebf3af9492", size = 624311, upload-time = "2025-12-04T15:07:14.697Z" }, { url = "https://files.pythonhosted.org/packages/fd/8e/424b8c6e78bd9837d14ff7df01a9829fc883ba2ab4ea787d4f848435f23f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:087ea5e004437321508a8d6f20efc4cfec5e3c30118e1417ea96ed1d93950527", size = 612833, upload-time = "2025-12-04T14:26:03.669Z" }, { url = "https://files.pythonhosted.org/packages/1e/37/f31136132967982d698c71a281a8901daf1a8fbab935dce7c0cf15f942cc/greenlet-3.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5375d2e23184629112ca1ea89a53389dddbffcf417dad40125713d88eb5f96e8", size = 1636483, upload-time = "2025-12-04T14:27:30.804Z" }, { url = "https://files.pythonhosted.org/packages/7e/71/ba21c3fb8c5dce83b8c01f458a42e99ffdb1963aeec08fff5a18588d8fd7/greenlet-3.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:9ee1942ea19550094033c35d25d20726e4f1c40d59545815e1128ac58d416d38", size = 301833, upload-time = "2025-12-04T14:32:23.929Z" }, @@ -4336,6 +4374,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a0/ef/11292bb0b85cf4c93447cab5a29f64576ed14d3ab4280e35ddd23486594a/lm_format_enforcer-0.11.3-py3-none-any.whl", hash = "sha256:cf586350875def1ae7a8fba84fcbbfc8371424b6c9d05c1fcba70aa233fbf06f", size = 45418, upload-time = "2025-08-24T19:37:46.325Z" }, ] +[[package]] +name = "lmdb" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/21/44/d94934efaf8f887b6959f131fde740fcaa831edfd13eb5425574637cddd5/lmdb-2.2.0.tar.gz", hash = "sha256:53020e20305c043ea6e68089bc242d744fba6073cdb268332299ba6dda2886d4", size = 933189, upload-time = "2026-03-30T01:26:19.049Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/15/20/043bd8851979fb86a7fdb08b4337d319dbccf7f468632418527bad684945/lmdb-2.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:a72dba2c63f6d497f1dd1a1e46e30f14dfb8c1fddc5a51ed913993f5ac03736c", size = 112274, upload-time = "2026-03-30T01:25:40.919Z" }, + { url = "https://files.pythonhosted.org/packages/ad/d1/d8f61fda6f837dad050514544560385a0f12e8b94e91079f63632195acc6/lmdb-2.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5c807ce9c514354c4f2e76f97e69002048b7f4a3c97a3eaf82415bf7c5daed77", size = 111129, upload-time = "2026-03-30T01:25:42.31Z" }, + { url = "https://files.pythonhosted.org/packages/19/11/f25fc19a68d8218d1337894b323fae79a4cccdef0994ba1c2714e268a2cd/lmdb-2.2.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a073fada46565c373c8683c67c7c07cc0d3511fef7e122da7052bb5720d2af09", size = 321904, upload-time = "2026-03-30T01:25:43.436Z" }, + { url = "https://files.pythonhosted.org/packages/31/a0/1b95f1d53e207d7f4581950228ae891fd930f5d2aeda1501a95982c7b2a8/lmdb-2.2.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:856b322399dcc1992675b8cf5f56cd54e89d05ea86a89dc5f6fa6d671c7b48f2", size = 324208, upload-time = "2026-03-30T01:25:44.706Z" }, + { url = "https://files.pythonhosted.org/packages/8a/1a/6c5931ee1412a9d8c0c3859ed33bb64ed00ea8ef418413c56524e0372ef3/lmdb-2.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:beacb2aed281cc806cb9a91663ed1a772fecd7a125d16b694cfc7af94a9864be", size = 109793, upload-time = "2026-03-30T01:25:46.148Z" }, + { url = "https://files.pythonhosted.org/packages/2e/36/0ba441a4faddd32376270aabedf915d7a21f5fe031313e18c6998b0138d4/lmdb-2.2.0-cp311-cp311-win_arm64.whl", hash = "sha256:e36455ace4c50b5185e4660e19d63533fe5c07840598eeefaad783415a380bab", size = 103680, upload-time = "2026-03-30T01:25:47.222Z" }, + { url = "https://files.pythonhosted.org/packages/b8/a7/9604e594725e2d2d0482669cfd9cba23cc47bd288f076c7e93985e5c046c/lmdb-2.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8cc73de748070321986a3a26f51f3693bdd196c20e797d8d2ad0e860b5d2e26c", size = 
113096, upload-time = "2026-03-30T01:25:48.293Z" }, + { url = "https://files.pythonhosted.org/packages/05/cf/7b8e13c1253c77a2c41b7786659d64e97f758a13f1fafdb815cf76630eba/lmdb-2.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9b6fecb1e32c55f0a1f3585d637f221e20146bb3ea9997c50fdfa3a58c0c2e41", size = 111656, upload-time = "2026-03-30T01:25:49.36Z" }, + { url = "https://files.pythonhosted.org/packages/94/6a/f059c48e4f3321710825fdb1cdee50d32eea90e0c097441beec1b155788f/lmdb-2.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:547e083457b6a0936fe73821f35c019be817877f9a85488be818ec8383ef47a6", size = 329003, upload-time = "2026-03-30T01:25:50.47Z" }, + { url = "https://files.pythonhosted.org/packages/38/22/513c885f284eccd49fc8d1c0a9a9d5da6badd9efc600d482424118df2a67/lmdb-2.2.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd505c995a595403f69367cbf16bcd5c88cdd208c706d709ba9b1bc2f9a16f69", size = 333140, upload-time = "2026-03-30T01:25:51.68Z" }, + { url = "https://files.pythonhosted.org/packages/f1/9b/8b3c81009230ebbe340e59cf2996626800f291e034ed76535d754b2cf98c/lmdb-2.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:dacf737ad869c6e48e1471dfa4d3e7c6ce2d082a218c069e20c4a138804e5fd2", size = 109668, upload-time = "2026-03-30T01:25:53.091Z" }, + { url = "https://files.pythonhosted.org/packages/0b/68/368099745c1d82d079c490c62cdef5e99bc9a3e9132991e3b82967363d55/lmdb-2.2.0-cp312-cp312-win_arm64.whl", hash = "sha256:653f5e183b04b9124c505c519a3ff691038b4fb459c3211b1323c67bfba53f37", size = 103760, upload-time = "2026-03-30T01:25:54.374Z" }, + { url = "https://files.pythonhosted.org/packages/64/43/543af71e8fa4c56623bb89c358121ab806426f26685f11539fe5452deffa/lmdb-2.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:36e0cbe6b7d59f6e19b448942c5f9e91674f596a802743258f82e926a9a09632", size = 113550, upload-time = "2026-03-30T01:25:55.727Z" }, + { url = 
"https://files.pythonhosted.org/packages/22/2c/4702d36c0073737554b20d1d62e879a066df963482f8e514866588ddd82d/lmdb-2.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e5d7a9dfd279a5884806fd478244961e4483cc6d7eb769caed1d7019a8608c20", size = 112135, upload-time = "2026-03-30T01:25:56.809Z" }, + { url = "https://files.pythonhosted.org/packages/2f/43/d015fea326ed0a634107f29740b002170a462b6d2481e509105c685520f5/lmdb-2.2.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d0dbe7902b2cdb60bf6c893f307ef2b2a5039afd22f029515b86183f05ab1353", size = 332108, upload-time = "2026-03-30T01:25:57.907Z" }, + { url = "https://files.pythonhosted.org/packages/bb/c9/503e7f173994b514936badcbcb7fa9f89a07a3cfe596c6fb95b1b91b8d70/lmdb-2.2.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c576cdb163ae61a7ef6eecbc20a6025a4abe085491c1dc0c667d726f4926b53", size = 336017, upload-time = "2026-03-30T01:25:59.234Z" }, + { url = "https://files.pythonhosted.org/packages/3e/94/b3b064acfd2f8acf5aaa53fff2c43963dbc1932ba8b8df4e27d75bf6a34a/lmdb-2.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:746eebcd4c0aeaf0eb2f897028929d270c5bc80ef4918500eec16db6f26f3fcc", size = 109574, upload-time = "2026-03-30T01:26:00.324Z" }, + { url = "https://files.pythonhosted.org/packages/b9/10/dc7488d1effc339cd9470f9d22ec0fd7052a3d4fdfae87765ecd41cb2e59/lmdb-2.2.0-cp313-cp313-win_arm64.whl", hash = "sha256:006153aac9fb0415a5f3e8ac88789e5730dba3dd0743cd84c95e3951ff68bc3a", size = 103810, upload-time = "2026-03-30T01:26:01.559Z" }, +] + [[package]] name = "locket" version = "1.0.0" @@ -5085,6 +5149,7 @@ dependencies = [ { name = "fsspec" }, { name = "hydra-core" }, { name = "jieba" }, + { name = "lmdb" }, { name = "loguru" }, { name = "mecab-python3" }, { name = "omegaconf" }, @@ -5519,6 +5584,7 @@ requires-dist = [ { name = "jieba", specifier = "==0.42.1" }, { name = "justext", marker = "extra == 'text-cpu'" }, { name = 
"librosa", marker = "extra == 'audio-common'" }, + { name = "lmdb", specifier = ">=1.4" }, { name = "loguru" }, { name = "lxml", marker = "extra == 'text-cpu'" }, { name = "matplotlib", marker = "extra == 'interleaved-cpu'" }, @@ -8600,9 +8666,11 @@ version = "8.0.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/5e/eb/5a0d575de784f9a1f94e2b1288c6886f13f34185e13117ed530f32b6f8a8/pyyaml_ft-8.0.0.tar.gz", hash = "sha256:0c947dce03954c7b5d38869ed4878b2e6ff1d44b08a0d84dc83fdad205ae39ab", size = 141057, upload-time = "2025-06-10T15:32:15.613Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/5f/c2/e8825f4ff725b7e560d62a3609e31d735318068e1079539ebfde397ea03e/pyyaml_ft-8.0.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cec6c92b4207004b62dfad1f0be321c9f04725e0f271c16247d8b39c3bf3ea42", size = 786772, upload-time = "2025-06-10T15:31:54.712Z" }, { url = "https://files.pythonhosted.org/packages/35/be/58a4dcae8854f2fdca9b28d9495298fd5571a50d8430b1c3033ec95d2d0e/pyyaml_ft-8.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06237267dbcab70d4c0e9436d8f719f04a51123f0ca2694c00dd4b68c338e40b", size = 778723, upload-time = "2025-06-10T15:31:56.093Z" }, { url = "https://files.pythonhosted.org/packages/f0/69/ac02afe286275980ecb2dcdc0156617389b7e0c0a3fcdedf155c67be2b80/pyyaml_ft-8.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7d10175a746be65f6feb86224df5d6bc5c049ebf52b89a88cf1cd78af5a367a8", size = 799159, upload-time = "2025-06-10T15:31:59.675Z" }, { url = "https://files.pythonhosted.org/packages/4e/ac/c492a9da2e39abdff4c3094ec54acac9747743f36428281fb186a03fab76/pyyaml_ft-8.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:58e1015098cf8d8aec82f360789c16283b88ca670fe4275ef6c48c5e30b22a96", size = 158779, upload-time = "2025-06-10T15:32:01.029Z" }, + { url = 
"https://files.pythonhosted.org/packages/f9/66/28d82dbff7f87b96f0eeac79b7d972a96b4980c1e445eb6a857ba91eda00/pyyaml_ft-8.0.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dab0abb46eb1780da486f022dce034b952c8ae40753627b27a626d803926483b", size = 831650, upload-time = "2025-06-10T15:32:08.076Z" }, { url = "https://files.pythonhosted.org/packages/e8/df/161c4566facac7d75a9e182295c223060373d4116dead9cc53a265de60b9/pyyaml_ft-8.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd48d639cab5ca50ad957b6dd632c7dd3ac02a1abe0e8196a3c24a52f5db3f7a", size = 815755, upload-time = "2025-06-10T15:32:09.435Z" }, { url = "https://files.pythonhosted.org/packages/d5/d2/e369064aa51009eb9245399fd8ad2c562bd0bcd392a00be44b2a824ded7c/pyyaml_ft-8.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:3bb4b927929b0cb162fb1605392a321e3333e48ce616cdcfa04a839271373255", size = 835581, upload-time = "2025-06-10T15:32:12.897Z" }, { url = "https://files.pythonhosted.org/packages/c0/28/26534bed77109632a956977f60d8519049f545abc39215d086e33a61f1f2/pyyaml_ft-8.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:de04cfe9439565e32f178106c51dd6ca61afaa2907d143835d501d84703d3793", size = 171579, upload-time = "2025-06-10T15:32:14.34Z" }, @@ -8701,15 +8769,23 @@ source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/d3/28/9d808fe62375b9aab5ba92fa9b29371297b067c2790b2d7cda648b1e2f8d/rapidfuzz-3.14.3.tar.gz", hash = "sha256:2491937177868bc4b1e469087601d53f925e8d270ccc21e07404b4b5814b7b5f", size = 57863900, upload-time = "2025-11-01T11:54:52.321Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ed/69/309d8f3a0bb3031fd9b667174cc4af56000645298af7c2931be5c3d14bb4/rapidfuzz-3.14.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cfe8df315ab4e6db4e1be72c5170f8e66021acde22cd2f9d04d2058a9fd8162e", size = 3178495, upload-time = "2025-11-01T11:52:53.005Z" }, + { url = 
"https://files.pythonhosted.org/packages/10/b7/f9c44a99269ea5bf6fd6a40b84e858414b6e241288b9f2b74af470d222b1/rapidfuzz-3.14.3-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:769f31c60cd79420188fcdb3c823227fc4a6deb35cafec9d14045c7f6743acae", size = 1228443, upload-time = "2025-11-01T11:52:54.991Z" }, + { url = "https://files.pythonhosted.org/packages/f3/b6/983805a844d44670eaae63831024cdc97ada4e9c62abc6b20703e81e7f9b/rapidfuzz-3.14.3-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:834d1e818005ed0d4ae38f6b87b86fad9b0a74085467ece0727d20e15077c094", size = 2530120, upload-time = "2025-11-01T11:52:58.298Z" }, { url = "https://files.pythonhosted.org/packages/b4/cc/2c97beb2b1be2d7595d805682472f1b1b844111027d5ad89b65e16bdbaaa/rapidfuzz-3.14.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:948b00e8476a91f510dd1ec07272efc7d78c275d83b630455559671d4e33b678", size = 4283129, upload-time = "2025-11-01T11:53:00.188Z" }, { url = "https://files.pythonhosted.org/packages/cf/99/5fa23e204435803875daefda73fd61baeabc3c36b8fc0e34c1705aab8c7b/rapidfuzz-3.14.3-cp311-cp311-win_amd64.whl", hash = "sha256:ef6bf930b947bd0735c550683939a032090f1d688dfd8861d6b45307b96fd5c5", size = 1544259, upload-time = "2025-11-01T11:53:03.66Z" }, { url = "https://files.pythonhosted.org/packages/30/83/80d22997acd928eda7deadc19ccd15883904622396d6571e935993e0453a/rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6c5f545f454871e6af05753a0172849c82feaf0f521c5ca62ba09e1b382d6382", size = 3154947, upload-time = "2025-11-01T11:53:12.093Z" }, + { url = "https://files.pythonhosted.org/packages/5b/cf/9f49831085a16384695f9fb096b99662f589e30b89b4a589a1ebc1a19d34/rapidfuzz-3.14.3-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:07aa0b5d8863e3151e05026a28e0d924accf0a7a3b605da978f0359bb804df43", size = 1223872, upload-time = "2025-11-01T11:53:13.664Z" }, + { url = 
"https://files.pythonhosted.org/packages/da/86/280038b6b0c2ccec54fb957c732ad6b41cc1fd03b288d76545b9cf98343f/rapidfuzz-3.14.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:6de00eb84c71476af7d3110cf25d8fe7c792d7f5fa86764ef0b4ca97e78ca3ed", size = 2521398, upload-time = "2025-11-01T11:53:17.146Z" }, { url = "https://files.pythonhosted.org/packages/fa/7b/05c26f939607dca0006505e3216248ae2de631e39ef94dd63dbbf0860021/rapidfuzz-3.14.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d7843a1abf0091773a530636fdd2a49a41bcae22f9910b86b4f903e76ddc82dc", size = 4259416, upload-time = "2025-11-01T11:53:19.34Z" }, { url = "https://files.pythonhosted.org/packages/b8/63/d06ecce90e2cf1747e29aeab9f823d21e5877a4c51b79720b2d3be7848f8/rapidfuzz-3.14.3-cp312-cp312-win_amd64.whl", hash = "sha256:b5100fd6bcee4d27f28f4e0a1c6b5127bc8ba7c2a9959cad9eab0bf4a7ab3329", size = 1538989, upload-time = "2025-11-01T11:53:22.428Z" }, { url = "https://files.pythonhosted.org/packages/32/00/ec8597a64f2be301ce1ee3290d067f49f6a7afb226b67d5f15b56d772ba5/rapidfuzz-3.14.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43e38c1305cffae8472572a0584d4ffc2f130865586a81038ca3965301f7c97c", size = 3156759, upload-time = "2025-11-01T11:53:30.777Z" }, + { url = "https://files.pythonhosted.org/packages/61/d5/b41eeb4930501cc899d5a9a7b5c9a33d85a670200d7e81658626dcc0ecc0/rapidfuzz-3.14.3-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:e195a77d06c03c98b3fc06b8a28576ba824392ce40de8c708f96ce04849a052e", size = 1222067, upload-time = "2025-11-01T11:53:32.334Z" }, + { url = "https://files.pythonhosted.org/packages/15/ce/4f3ab4c401c5a55364da1ffff8cc879fc97b4e5f4fa96033827da491a973/rapidfuzz-3.14.3-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:a2135b138bcdcb4c3742d417f215ac2d8c2b87bde15b0feede231ae95f09ec41", size = 2526123, upload-time = "2025-11-01T11:53:35.779Z" }, { url = 
"https://files.pythonhosted.org/packages/c1/4b/54f804975376a328f57293bd817c12c9036171d15cf7292032e3f5820b2d/rapidfuzz-3.14.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:33a325ed0e8e1aa20c3e75f8ab057a7b248fdea7843c2a19ade0008906c14af0", size = 4262874, upload-time = "2025-11-01T11:53:37.866Z" }, { url = "https://files.pythonhosted.org/packages/07/75/fde1f334b0cec15b5946d9f84d73250fbfcc73c236b4bc1b25129d90876b/rapidfuzz-3.14.3-cp313-cp313-win_amd64.whl", hash = "sha256:e6b5e3036976f0fde888687d91be86d81f9ac5f7b02e218913c38285b756be6c", size = 1537011, upload-time = "2025-11-01T11:53:40.92Z" }, { url = "https://files.pythonhosted.org/packages/88/74/f50ea0e24a5880a9159e8fd256b84d8f4634c2f6b4f98028bdd31891d907/rapidfuzz-3.14.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:89acb8cbb52904f763e5ac238083b9fc193bed8d1f03c80568b20e4cef43a519", size = 3165563, upload-time = "2025-11-01T11:53:49.216Z" }, + { url = "https://files.pythonhosted.org/packages/e8/7a/e744359404d7737049c26099423fc54bcbf303de5d870d07d2fb1410f567/rapidfuzz-3.14.3-cp313-cp313t-manylinux_2_31_armv7l.whl", hash = "sha256:7d9af908c2f371bfb9c985bd134e295038e3031e666e4b2ade1e7cb7f5af2f1a", size = 1214727, upload-time = "2025-11-01T11:53:50.883Z" }, + { url = "https://files.pythonhosted.org/packages/70/17/6c0b2b2bff9c8b12e12624c07aa22e922b0c72a490f180fa9183d1ef2c75/rapidfuzz-3.14.3-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:152555187360978119e98ce3e8263d70dd0c40c7541193fc302e9b7125cf8f58", size = 2507596, upload-time = "2025-11-01T11:53:53.835Z" }, { url = "https://files.pythonhosted.org/packages/c3/d1/87852a7cbe4da7b962174c749a47433881a63a817d04f3e385ea9babcd9e/rapidfuzz-3.14.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:52619d25a09546b8db078981ca88939d72caa6b8701edd8b22e16482a38e799f", size = 4273595, upload-time = "2025-11-01T11:53:55.961Z" }, { url = 
"https://files.pythonhosted.org/packages/0b/0c/71ef356adc29e2bdf74cd284317b34a16b80258fa0e7e242dd92cc1e6d10/rapidfuzz-3.14.3-cp313-cp313t-win_amd64.whl", hash = "sha256:656e52b054d5b5c2524169240e50cfa080b04b1c613c5f90a2465e84888d6f15", size = 1576797, upload-time = "2025-11-01T11:53:59.455Z" }, { url = "https://files.pythonhosted.org/packages/22/20/9d30b4a1ab26aac22fff17d21dec7e9089ccddfe25151d0a8bb57001dc3d/rapidfuzz-3.14.3-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1e6eefec45625c634926a9fd46c9e4f31118ac8f3156fff9494422cee45207e6", size = 3101472, upload-time = "2025-11-01T11:54:47.255Z" }, @@ -11546,16 +11622,32 @@ version = "3.6.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/02/84/30869e01909fb37a6cc7e18688ee8bf1e42d57e7e0777636bd47524c43c7/xxhash-3.6.0.tar.gz", hash = "sha256:f0162a78b13a0d7617b2845b90c763339d1f1d82bb04a4b07f4ab535cc5e05d6", size = 85160, upload-time = "2025-10-02T14:37:08.097Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/c4/ef/3a9b05eb527457d5db13a135a2ae1a26c80fecd624d20f3e8dcc4cb170f3/xxhash-3.6.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6812c25fe0d6c36a46ccb002f40f27ac903bf18af9f6dd8f9669cb4d176ab18f", size = 212384, upload-time = "2025-10-02T14:34:19.182Z" }, + { url = "https://files.pythonhosted.org/packages/0f/18/ccc194ee698c6c623acbf0f8c2969811a8a4b6185af5e824cd27b9e4fd3e/xxhash-3.6.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4ccbff013972390b51a18ef1255ef5ac125c92dc9143b2d1909f59abc765540e", size = 445749, upload-time = "2025-10-02T14:34:20.659Z" }, { url = "https://files.pythonhosted.org/packages/a5/86/cf2c0321dc3940a7aa73076f4fd677a0fb3e405cb297ead7d864fd90847e/xxhash-3.6.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:297b7fbf86c82c550e12e8fb71968b3f033d27b874276ba3624ea868c11165a8", size = 193880, upload-time = "2025-10-02T14:34:22.431Z" }, + { url = "https://files.pythonhosted.org/packages/67/74/b044fcd6b3d89e9b1b665924d85d3f400636c23590226feb1eb09e1176ce/xxhash-3.6.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:08d45aef063a4531b785cd72de4887766d01dc8f362a515693df349fdb825e0c", size = 210867, upload-time = "2025-10-02T14:34:27.203Z" }, + { url = "https://files.pythonhosted.org/packages/bc/fd/3ce73bf753b08cb19daee1eb14aa0d7fe331f8da9c02dd95316ddfe5275e/xxhash-3.6.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:929142361a48ee07f09121fe9e96a84950e8d4df3bb298ca5d88061969f34d7b", size = 414012, upload-time = "2025-10-02T14:34:28.409Z" }, { url = "https://files.pythonhosted.org/packages/ba/b3/5a4241309217c5c876f156b10778f3ab3af7ba7e3259e6d5f5c7d0129eb2/xxhash-3.6.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:51312c768403d8540487dbbfb557454cfc55589bbde6424456951f7fcd4facb3", size = 191409, upload-time = "2025-10-02T14:34:29.696Z" }, { url = "https://files.pythonhosted.org/packages/65/79/9d24d7f53819fe301b231044ea362ce64e86c74f6e8c8e51320de248b3e5/xxhash-3.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:26734cdc2d4ffe449b41d186bbeac416f704a482ed835d375a5c0cb02bc63fef", size = 31481, upload-time = "2025-10-02T14:34:32.062Z" }, + { url = "https://files.pythonhosted.org/packages/38/86/fb6b6130d8dd6b8942cc17ab4d90e223653a89aa32ad2776f8af7064ed13/xxhash-3.6.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2aa5ee3444c25b69813663c9f8067dcfaa2e126dc55e8dddf40f4d1c25d7effa", size = 212163, upload-time = "2025-10-02T14:34:39.872Z" }, + { url = "https://files.pythonhosted.org/packages/ee/dc/e84875682b0593e884ad73b2d40767b5790d417bde603cceb6878901d647/xxhash-3.6.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:f7f99123f0e1194fa59cc69ad46dbae2e07becec5df50a0509a808f90a0f03f0", size = 445411, upload-time = "2025-10-02T14:34:41.569Z" }, { url = "https://files.pythonhosted.org/packages/11/4f/426f91b96701ec2f37bb2b8cec664eff4f658a11f3fa9d94f0a887ea6d2b/xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:49e03e6fe2cac4a1bc64952dd250cf0dbc5ef4ebb7b8d96bce82e2de163c82a2", size = 193883, upload-time = "2025-10-02T14:34:43.249Z" }, + { url = "https://files.pythonhosted.org/packages/58/ca/faa05ac19b3b622c7c9317ac3e23954187516298a091eb02c976d0d3dd45/xxhash-3.6.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:843b52f6d88071f87eba1631b684fcb4b2068cd2180a0224122fe4ef011a9374", size = 210655, upload-time = "2025-10-02T14:34:47.571Z" }, + { url = "https://files.pythonhosted.org/packages/d4/7a/06aa7482345480cc0cb597f5c875b11a82c3953f534394f620b0be2f700c/xxhash-3.6.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7d14a6cfaf03b1b6f5f9790f76880601ccc7896aff7ab9cd8978a939c1eb7e0d", size = 414001, upload-time = "2025-10-02T14:34:49.273Z" }, { url = "https://files.pythonhosted.org/packages/23/07/63ffb386cd47029aa2916b3d2f454e6cc5b9f5c5ada3790377d5430084e7/xxhash-3.6.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:418daf3db71e1413cfe211c2f9a528456936645c17f46b5204705581a45390ae", size = 191431, upload-time = "2025-10-02T14:34:50.798Z" }, { url = "https://files.pythonhosted.org/packages/13/5d/0d125536cbe7565a83d06e43783389ecae0c0f2ed037b48ede185de477c0/xxhash-3.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:c0f2ab8c715630565ab8991b536ecded9416d615538be8ecddce43ccf26cbc7c", size = 31534, upload-time = "2025-10-02T14:34:53.276Z" }, + { url = "https://files.pythonhosted.org/packages/84/7a/c2b3d071e4bb4a90b7057228a99b10d51744878f4a8a6dd643c8bd897620/xxhash-3.6.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = 
"sha256:ba284920194615cb8edf73bf52236ce2e1664ccd4a38fdb543506413529cc546", size = 212241, upload-time = "2025-10-02T14:35:02.207Z" }, + { url = "https://files.pythonhosted.org/packages/81/5f/640b6eac0128e215f177df99eadcd0f1b7c42c274ab6a394a05059694c5a/xxhash-3.6.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4b54219177f6c6674d5378bd862c6aedf64725f70dd29c472eaae154df1a2e89", size = 445471, upload-time = "2025-10-02T14:35:03.61Z" }, { url = "https://files.pythonhosted.org/packages/5e/1e/3c3d3ef071b051cc3abbe3721ffb8365033a172613c04af2da89d5548a87/xxhash-3.6.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:42c36dd7dbad2f5238950c377fcbf6811b1cdb1c444fab447960030cea60504d", size = 193936, upload-time = "2025-10-02T14:35:05.013Z" }, + { url = "https://files.pythonhosted.org/packages/d7/fd/2c0a00c97b9e18f72e1f240ad4e8f8a90fd9d408289ba9c7c495ed7dc05c/xxhash-3.6.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:6f2580ffab1a8b68ef2b901cde7e55fa8da5e4be0977c68f78fc80f3c143de42", size = 210689, upload-time = "2025-10-02T14:35:09.438Z" }, + { url = "https://files.pythonhosted.org/packages/93/86/5dd8076a926b9a95db3206aba20d89a7fc14dd5aac16e5c4de4b56033140/xxhash-3.6.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:40c391dd3cd041ebc3ffe6f2c862f402e306eb571422e0aa918d8070ba31da11", size = 414068, upload-time = "2025-10-02T14:35:11.162Z" }, { url = "https://files.pythonhosted.org/packages/af/3c/0bb129170ee8f3650f08e993baee550a09593462a5cddd8e44d0011102b1/xxhash-3.6.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f205badabde7aafd1a31e8ca2a3e5a763107a71c397c4481d6a804eb5063d8bd", size = 191495, upload-time = "2025-10-02T14:35:12.971Z" }, { url = "https://files.pythonhosted.org/packages/86/15/9bc32671e9a38b413a76d24722a2bf8784a132c043063a8f5152d390b0f9/xxhash-3.6.0-cp313-cp313-win_amd64.whl", hash = 
"sha256:757320d45d2fbcce8f30c42a6b2f47862967aea7bf458b9625b4bbe7ee390392", size = 31542, upload-time = "2025-10-02T14:35:15.21Z" }, + { url = "https://files.pythonhosted.org/packages/bc/68/c4c80614716345d55071a396cf03d06e34b5f4917a467faf43083c995155/xxhash-3.6.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3ed0df1b11a79856df5ffcab572cbd6b9627034c1c748c5566fa79df9048a7c5", size = 214833, upload-time = "2025-10-02T14:35:23.32Z" }, + { url = "https://files.pythonhosted.org/packages/7e/e9/ae27c8ffec8b953efa84c7c4a6c6802c263d587b9fc0d6e7cea64e08c3af/xxhash-3.6.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0e4edbfc7d420925b0dd5e792478ed393d6e75ff8fc219a6546fb446b6a417b1", size = 448348, upload-time = "2025-10-02T14:35:25.111Z" }, { url = "https://files.pythonhosted.org/packages/d7/6b/33e21afb1b5b3f46b74b6bd1913639066af218d704cc0941404ca717fc57/xxhash-3.6.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fba27a198363a7ef87f8c0f6b171ec36b674fe9053742c58dd7e3201c1ab30ee", size = 196070, upload-time = "2025-10-02T14:35:26.586Z" }, + { url = "https://files.pythonhosted.org/packages/0d/98/e8de5baa5109394baf5118f5e72ab21a86387c4f89b0e77ef3e2f6b0327b/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:f01375c0e55395b814a679b3eea205db7919ac2af213f4a6682e01220e5fe292", size = 213304, upload-time = "2025-10-02T14:35:31.222Z" }, + { url = "https://files.pythonhosted.org/packages/7b/1d/71056535dec5c3177eeb53e38e3d367dd1d16e024e63b1cee208d572a033/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:d706dca2d24d834a4661619dcacf51a75c16d65985718d6a7d73c1eeeb903ddf", size = 416930, upload-time = "2025-10-02T14:35:32.517Z" }, { url = "https://files.pythonhosted.org/packages/dc/6c/5cbde9de2cd967c322e651c65c543700b19e7ae3e0aae8ece3469bf9683d/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = 
"sha256:5f059d9faeacd49c0215d66f4056e1326c80503f51a1532ca336a385edadd033", size = 193787, upload-time = "2025-10-02T14:35:33.827Z" }, { url = "https://files.pythonhosted.org/packages/ad/e6/e8cf858a2b19d6d45820f072eff1bea413910592ff17157cabc5f1227a16/xxhash-3.6.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b1e420ef35c503869c4064f4a2f2b08ad6431ab7b229a05cce39d74268bca6b8", size = 31799, upload-time = "2025-10-02T14:35:36.165Z" }, { url = "https://files.pythonhosted.org/packages/62/b2/5ac99a041a29e58e95f907876b04f7067a0242cb85b5f39e726153981503/xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c6dc31591899f5e5666f04cc2e529e69b4072827085c1ef15294d91a004bc1bd", size = 32481, upload-time = "2025-10-02T14:37:05.869Z" },