diff --git a/doc/source/ray-core/doc_code/anti_pattern_out_of_band_object_ref_serialization_hex.py b/doc/source/ray-core/doc_code/anti_pattern_out_of_band_object_ref_serialization_hex.py
new file mode 100644
index 000000000000..9afa321e2ffc
--- /dev/null
+++ b/doc/source/ray-core/doc_code/anti_pattern_out_of_band_object_ref_serialization_hex.py
@@ -0,0 +1,55 @@
+# __anti_pattern_start__
+import gc
+
+import ray
+
+ray.init()
+
+
+@ray.remote
+def consumer_broken(obj_ref_hex: str, timeout: float):
+    # Anti-pattern: rebuild an ObjectRef from a hex string.
+    # The hex was produced out of band (e.g. via .hex()) and Ray's distributed
+    # reference counting never saw it. The underlying object may have already
+    # been garbage-collected, in which case ray.wait reports the ref as
+    # "not ready" until the timeout expires.
+    obj_ref = ray.ObjectRef(bytes.fromhex(obj_ref_hex))
+    ready, not_ready = ray.wait([obj_ref], timeout=timeout, fetch_local=False)
+    return len(ready), len(not_ready)
+
+
+@ray.remote
+def consumer_correct(obj_ref_hex: str, keep_alive, timeout: float):
+    # Recommended pattern: wait on the live ObjectRef that the caller passed
+    # (inside a list, so Ray ref-counts it without auto-dereferencing the
+    # top-level argument). Treat the hex string as an identifier only.
+    (obj_ref,) = keep_alive
+    assert obj_ref.hex() == obj_ref_hex
+    ready, not_ready = ray.wait([obj_ref], timeout=timeout, fetch_local=False)
+    return len(ready), len(not_ready)
+
+
+# Anti-pattern in action: the driver puts the value, serializes the ObjectRef
+# to a hex string, then drops the only live reference before invoking the
+# consumer. Ray's distributed reference counter sees zero references, so the
+# object is eligible for collection by the time the consumer rebuilds it.
+inner_ref = ray.put(42)
+inner_ref_hex = inner_ref.hex()
+del inner_ref
+gc.collect()
+broken_result = ray.get(consumer_broken.remote(inner_ref_hex, 5.0))
+# Likely (0, 1) -- the object was unreachable from the consumer's perspective.
+print(f"broken: ready={broken_result[0]}, not_ready={broken_result[1]}")
+
+# Correct usage: pass the live ObjectRef inside a container so Ray's ref
+# counter keeps the underlying object alive and the task can wait on it
+# directly. Wrapping in a list prevents Ray from auto-dereferencing the
+# top-level argument. The hex string rides along only as an identifier.
+inner_ref = ray.put(42)
+inner_ref_hex = inner_ref.hex()
+correct_result = ray.get(
+    consumer_correct.remote(inner_ref_hex, [inner_ref], 5.0)
+)
+# Expect (1, 0) -- the object is ready.
+print(f"correct: ready={correct_result[0]}, not_ready={correct_result[1]}")
+# __anti_pattern_end__
diff --git a/doc/source/ray-core/objects/serialization.rst b/doc/source/ray-core/objects/serialization.rst
index 51a416cd9c8a..c215c8b6e548 100644
--- a/doc/source/ray-core/objects/serialization.rst
+++ b/doc/source/ray-core/objects/serialization.rst
@@ -42,6 +42,10 @@ This code example demonstrates how to serialize an `ObjectRef`, store it in exte
 
 .. literalinclude:: /ray-core/doc_code/object_ref_serialization.py
 
+.. seealso::
+
+    Avoid converting an ``ObjectRef`` to its hex string and reconstructing it elsewhere with ``ray.ObjectRef(bytes.fromhex(...))``. The hex form carries no reference, so the underlying object can be garbage-collected before the consumer fetches it. See :ref:`ray-out-of-band-object-ref-serialization` for the full pattern and the recommended fix.
+ Numpy Arrays ~~~~~~~~~~~~ diff --git a/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst b/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst index 6a24ea50b1aa..bab66fc4b5ab 100644 --- a/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst +++ b/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst @@ -9,14 +9,22 @@ Ray's ``ray.ObjectRef`` is distributed reference counted. Ray pins the underlyin When all references to the pinned object are gone, Ray garbage collects the pinned object and cleans it up from the system. However, if user code serializes ``ray.ObjectRef``, Ray can't keep track of the reference. +Out-of-band serialization can happen through any channel that strips the +``ObjectRef`` of its in-process semantics: ``pickle`` and ``ray.cloudpickle``, +or string forms produced by ``ObjectRef.hex()`` and reconstructed with +``ray.ObjectRef(bytes.fromhex(...))``. Both routes have the same root problem +and the same recommended fix: pass the ``ObjectRef`` itself as a remote-task +argument or return value, so Ray sees the reference and keeps the distributed +reference count correct end to end. + To avoid incorrect behavior, if ``ray.cloudpickle`` serializes ``ray.ObjectRef``, Ray pins the object for the lifetime of a worker. "Pin" means that object can't be evicted from the object store until the corresponding owner worker dies. It's prone to Ray object leaks, which can lead to disk spilling. See :ref:`this page <serialize-object-ref>` for more details. To detect if this pattern exists in your code, you can set an environment variable ``RAY_allow_out_of_band_object_ref_serialization=0``. If Ray detects that ``ray.cloudpickle`` serialized ``ray.ObjectRef``, it raises an exception with helpful messages. 
-Code example ------------- +Code example: pickle and cloudpickle +------------------------------------ **Anti-pattern:** @@ -24,3 +32,28 @@ Code example :language: python :start-after: __anti_pattern_start__ :end-before: __anti_pattern_end__ + +Code example: hex string round-trip +----------------------------------- + +A common variant is to call ``ObjectRef.hex()``, send the string somewhere +out of band (a remote task argument, a database row, a Redis key, an HTTP +request), and reconstruct the reference on the other side with +``ray.ObjectRef(bytes.fromhex(...))``. + +The hex form is just bytes; it carries no reference count. From Ray's +perspective the reference disappeared the moment it was converted to a +string, so the underlying object becomes eligible for garbage collection. +By the time the consumer rebuilds the ``ObjectRef`` and calls ``ray.wait`` +or ``ray.get``, the object may be gone. ``ray.wait`` then keeps the ref in the +*not ready* list until the timeout fires, which surfaces as +"``ray.wait`` is broken". + +**Recommended pattern:** pass the ``ObjectRef`` itself as a task argument +or return value. Wrap the ref in a list (``f.remote([obj_ref])``) when the +task needs the reference itself rather than the resolved value. + +.. literalinclude:: ../doc_code/anti_pattern_out_of_band_object_ref_serialization_hex.py + :language: python + :start-after: __anti_pattern_start__ + :end-before: __anti_pattern_end__