From 005a72ddfa080b80c37daa3059e62ad822480041 Mon Sep 17 00:00:00 2001 From: gaurav0107 Date: Wed, 27 May 2026 10:51:55 +0530 Subject: [PATCH 1/4] [core][docs] Document hex round-trip ObjectRef anti-pattern Issue #47923 reports that ray.wait "never completes" when a user passes an ObjectRef hex string out of band and reconstructs it on the other side with ray.ObjectRef(bytes.fromhex(...)). The maintainer ruling on the bug was that this is the expected behavior for an out-of-band serialization path, and that we should document the wrong usage and the right way to do it. Extend the existing out-of-band ObjectRef serialization anti-pattern doc with a new section covering the hex string round-trip variant, plus a self-contained code example under doc_code/ showing both the broken pattern and the recommended pattern (passing the ObjectRef directly as a remote-task argument so Ray's distributed reference counting stays intact). Docs-only change. No source code changes; no new .rst files added (only edits to an existing one), so the rst-lint check is satisfied. Signed-off-by: gaurav0107 --- ...ut_of_band_object_ref_serialization_hex.py | 48 +++++++++++++++++++ .../out-of-band-object-ref-serialization.rst | 26 ++++++++++ 2 files changed, 74 insertions(+) create mode 100644 doc/source/ray-core/doc_code/anti_pattern_out_of_band_object_ref_serialization_hex.py diff --git a/doc/source/ray-core/doc_code/anti_pattern_out_of_band_object_ref_serialization_hex.py b/doc/source/ray-core/doc_code/anti_pattern_out_of_band_object_ref_serialization_hex.py new file mode 100644 index 000000000000..2f9a1cf22fea --- /dev/null +++ b/doc/source/ray-core/doc_code/anti_pattern_out_of_band_object_ref_serialization_hex.py @@ -0,0 +1,48 @@ +# __anti_pattern_start__ +import ray + +ray.init() + + +@ray.remote +class Producer: + def make(self): + # Returns an ObjectRef. The producer task's result IS the ObjectRef + # the caller will eventually consume. 
+ return ray.put(42) + + +@ray.remote +def consumer_broken(obj_ref_hex: str, timeout: float): + # Anti-pattern: rebuild an ObjectRef from a hex string. + # The hex was produced out of band (e.g. via .hex()) and Ray's distributed + # reference counting never saw it. The underlying object may have already + # been garbage-collected, so this ray.wait never sees the ref become ready + # and reports it as "not ready" once the timeout expires. + obj_ref = ray.ObjectRef(bytes.fromhex(obj_ref_hex)) + ready, not_ready = ray.wait([obj_ref], timeout=timeout, fetch_local=False) + return len(ready), len(not_ready) + + +@ray.remote +def consumer_correct(obj_ref, timeout: float): + # Recommended pattern: pass the ObjectRef directly as a task argument. + # Ray sees the reference, keeps the underlying object pinned, and ray.wait + # behaves as documented. + ready, not_ready = ray.wait([obj_ref], timeout=timeout, fetch_local=False) + return len(ready), len(not_ready) + + +# Anti-pattern in action: the producer's ObjectRef is serialized to a hex +# string and passed by value, so Ray loses track of it. +producer = Producer.remote() +inner_ref = ray.get(producer.make.remote()) +broken_result = ray.get(consumer_broken.remote(inner_ref.hex(), 5.0)) +# Likely (0, 1) -- the object was unreachable from the consumer's perspective. +print(f"broken: ready={broken_result[0]}, not_ready={broken_result[1]}") + +# Correct usage: pass the ObjectRef itself. Ray tracks the reference end to end. +correct_result = ray.get(consumer_correct.remote(inner_ref, 5.0)) +# Expect (1, 0) -- the object is ready. 
+print(f"correct: ready={correct_result[0]}, not_ready={correct_result[1]}") +# __anti_pattern_end__ diff --git a/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst b/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst index 6a24ea50b1aa..cc40cb36f98a 100644 --- a/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst +++ b/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst @@ -24,3 +24,29 @@ Code example :language: python :start-after: __anti_pattern_start__ :end-before: __anti_pattern_end__ + +Anti-pattern: round-trip an ``ObjectRef`` through its hex string +---------------------------------------------------------------- + +A common variant of this anti-pattern is to call ``ObjectRef.hex()``, send the +string somewhere out of band (a remote task argument, a database row, a Redis +key, an HTTP request), and reconstruct the reference on the other side with +``ray.ObjectRef(bytes.fromhex(...))``. + +The hex form is just bytes; it carries no reference count. From Ray's +perspective the reference disappeared the moment it was converted to a string, +so the underlying object becomes eligible for garbage collection. By the time +the consumer rebuilds the ``ObjectRef`` and calls ``ray.wait`` or ``ray.get`` +on it, the object is gone. ``ray.wait`` then returns the ref in the *not +ready* list forever (until the timeout fires), which surfaces as "``ray.wait`` +is broken". + +**Recommended pattern:** pass the ``ObjectRef`` itself as a task argument or +return value. Ray serializes ``ObjectRef`` arguments specially and keeps the +distributed reference count correct end to end. Wrap a single ref in a list +(``f.remote([obj_ref])``) if your task signature expects a collection. + +.. 
literalinclude:: ../doc_code/anti_pattern_out_of_band_object_ref_serialization_hex.py + :language: python + :start-after: __anti_pattern_start__ + :end-before: __anti_pattern_end__ From e69dcba1e2ea06b3851a4ae365d0a3684e5998c8 Mon Sep 17 00:00:00 2001 From: gaurav0107 Date: Wed, 27 May 2026 10:56:06 +0530 Subject: [PATCH 2/4] [core][docs] Tighten section structure for out-of-band ObjectRef anti-patterns Restructure the file so the two anti-patterns (pickle/cloudpickle and hex string round-trip) live as parallel "Code example" sub-sections under the shared introduction, instead of one being inline and the other being its own H2. Add a brief paragraph in the intro that names both serialization channels up front, so readers can recognize either path as the same root cause and the same recommended fix. No content removed; rewording only. Signed-off-by: gaurav0107 --- .../out-of-band-object-ref-serialization.rst | 43 +++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst b/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst index cc40cb36f98a..bab66fc4b5ab 100644 --- a/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst +++ b/doc/source/ray-core/patterns/out-of-band-object-ref-serialization.rst @@ -9,14 +9,22 @@ Ray's ``ray.ObjectRef`` is distributed reference counted. Ray pins the underlyin When all references to the pinned object are gone, Ray garbage collects the pinned object and cleans it up from the system. However, if user code serializes ``ray.ObjectRef``, Ray can't keep track of the reference. +Out-of-band serialization can happen through any channel that strips the +``ObjectRef`` of its in-process semantics: ``pickle`` and ``ray.cloudpickle``, +or string forms produced by ``ObjectRef.hex()`` and reconstructed with +``ray.ObjectRef(bytes.fromhex(...))``. 
Both routes have the same root problem +and the same recommended fix: pass the ``ObjectRef`` itself as a remote-task +argument or return value, so Ray sees the reference and keeps the distributed +reference count correct end to end. + To avoid incorrect behavior, if ``ray.cloudpickle`` serializes ``ray.ObjectRef``, Ray pins the object for the lifetime of a worker. "Pin" means that object can't be evicted from the object store until the corresponding owner worker dies. It's prone to Ray object leaks, which can lead to disk spilling. See :ref:`this page <serialization-object-ref>` for more details. To detect if this pattern exists in your code, you can set an environment variable ``RAY_allow_out_of_band_object_ref_serialization=0``. If Ray detects that ``ray.cloudpickle`` serialized ``ray.ObjectRef``, it raises an exception with helpful messages. -Code example ------------- +Code example: pickle and cloudpickle +------------------------------------ **Anti-pattern:** @@ -25,26 +33,25 @@ Code example :start-after: __anti_pattern_start__ :end-before: __anti_pattern_end__ -Anti-pattern: round-trip an ``ObjectRef`` through its hex string ----------------------------------------------------------------- +Code example: hex string round-trip +----------------------------------- -A common variant of this anti-pattern is to call ``ObjectRef.hex()``, send the -string somewhere out of band (a remote task argument, a database row, a Redis -key, an HTTP request), and reconstruct the reference on the other side with +A common variant is to call ``ObjectRef.hex()``, send the string somewhere +out of band (a remote task argument, a database row, a Redis key, an HTTP +request), and reconstruct the reference on the other side with ``ray.ObjectRef(bytes.fromhex(...))``. The hex form is just bytes; it carries no reference count. From Ray's -perspective the reference disappeared the moment it was converted to a string, -so the underlying object becomes eligible for garbage collection. 
By the time -the consumer rebuilds the ``ObjectRef`` and calls ``ray.wait`` or ``ray.get`` -on it, the object is gone. ``ray.wait`` then returns the ref in the *not -ready* list forever (until the timeout fires), which surfaces as "``ray.wait`` -is broken". - -**Recommended pattern:** pass the ``ObjectRef`` itself as a task argument or -return value. Ray serializes ``ObjectRef`` arguments specially and keeps the -distributed reference count correct end to end. Wrap a single ref in a list -(``f.remote([obj_ref])``) if your task signature expects a collection. +perspective the reference disappeared the moment it was converted to a +string, so the underlying object becomes eligible for garbage collection. +By the time the consumer rebuilds the ``ObjectRef`` and calls ``ray.wait`` +or ``ray.get``, the object is gone. ``ray.wait`` then keeps the ref in the +*not ready* list until the timeout fires, which surfaces as +"``ray.wait`` is broken". + +**Recommended pattern:** pass the ``ObjectRef`` itself as a task argument +or return value. Wrap a single ref in a list (``f.remote([obj_ref])``) if +the task signature expects a collection. .. literalinclude:: ../doc_code/anti_pattern_out_of_band_object_ref_serialization_hex.py :language: python From d12b1711b33b077bab6820a33560e43fa120e78c Mon Sep 17 00:00:00 2001 From: gaurav0107 Date: Wed, 27 May 2026 11:00:23 +0530 Subject: [PATCH 3/4] [core][docs] Cross-link serialization page to hex round-trip anti-pattern Add a seealso directive in the Serializing-ObjectRefs section pointing to the out-of-band ObjectRef serialization anti-pattern doc, so readers who land on the serialization page (which already covers cloudpickle of ObjectRefs) can also discover the closely related hex string round-trip pitfall and its recommended fix. 
Signed-off-by: gaurav0107 --- doc/source/ray-core/objects/serialization.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/source/ray-core/objects/serialization.rst b/doc/source/ray-core/objects/serialization.rst index 51a416cd9c8a..c215c8b6e548 100644 --- a/doc/source/ray-core/objects/serialization.rst +++ b/doc/source/ray-core/objects/serialization.rst @@ -42,6 +42,10 @@ This code example demonstrates how to serialize an `ObjectRef`, store it in exte .. literalinclude:: /ray-core/doc_code/object_ref_serialization.py +.. seealso:: + + Avoid converting an ``ObjectRef`` to its hex string and reconstructing it elsewhere with ``ray.ObjectRef(bytes.fromhex(...))``. The hex form carries no reference, so the underlying object can be garbage-collected before the consumer fetches it. See :ref:`ray-out-of-band-object-ref-serialization` for the full pattern and the recommended fix. + Numpy Arrays ~~~~~~~~~~~~ From 336a336f229d4164d7877978dcdd47dacf7b2b09 Mon Sep 17 00:00:00 2001 From: gaurav0107 Date: Fri, 5 Jun 2026 01:22:43 +0530 Subject: [PATCH 4/4] [core][docs] Address review feedback on out-of-band ObjectRef hex example - Move ray.put into the driver instead of returning it from a Producer actor task (the worker should not own the object the driver consumes). - Drop the driver-side ObjectRef before invoking the broken consumer so the example actually demonstrates the failure mode (with the driver still holding the ref, distributed ref counting kept the object alive and the anti-pattern looked benign). - Pass the hex string (and a keep-alive list containing the ObjectRef) to consumer_correct so Ray does not auto-dereference the top-level ObjectRef argument; the task reconstructs and ray.waits on the rebuilt ref while the list keeps the underlying object pinned. 
Signed-off-by: gaurav0107 --- ...ut_of_band_object_ref_serialization_hex.py | 45 +++++++++++-------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/doc/source/ray-core/doc_code/anti_pattern_out_of_band_object_ref_serialization_hex.py b/doc/source/ray-core/doc_code/anti_pattern_out_of_band_object_ref_serialization_hex.py index 2f9a1cf22fea..9afa321e2ffc 100644 --- a/doc/source/ray-core/doc_code/anti_pattern_out_of_band_object_ref_serialization_hex.py +++ b/doc/source/ray-core/doc_code/anti_pattern_out_of_band_object_ref_serialization_hex.py @@ -1,17 +1,11 @@ # __anti_pattern_start__ +import gc + import ray ray.init() -@ray.remote -class Producer: - def make(self): - # Returns an ObjectRef. The producer task's result IS the ObjectRef - # the caller will eventually consume. - return ray.put(42) - - @ray.remote def consumer_broken(obj_ref_hex: str, timeout: float): # Anti-pattern: rebuild an ObjectRef from a hex string. @@ -25,24 +19,37 @@ def consumer_broken(obj_ref_hex: str, timeout: float): @ray.remote -def consumer_correct(obj_ref, timeout: float): - # Recommended pattern: pass the ObjectRef directly as a task argument. - # Ray sees the reference, keeps the underlying object pinned, and ray.wait - # behaves as documented. +def consumer_correct(obj_ref_hex: str, keep_alive, timeout: float): + # Recommended pattern: still reconstruct from hex on the consumer side, but + # have the caller pass the live ObjectRef alongside (here inside a list, so + # Ray ref-counts it without auto-dereferencing the top-level argument). + # That keeps the underlying object pinned for the duration of the task. + obj_ref = ray.ObjectRef(bytes.fromhex(obj_ref_hex)) ready, not_ready = ray.wait([obj_ref], timeout=timeout, fetch_local=False) return len(ready), len(not_ready) -# Anti-pattern in action: the producer's ObjectRef is serialized to a hex -# string and passed by value, so Ray loses track of it. 
-producer = Producer.remote() -inner_ref = ray.get(producer.make.remote()) -broken_result = ray.get(consumer_broken.remote(inner_ref.hex(), 5.0)) +# Anti-pattern in action: the driver puts the value, serializes the ObjectRef +# to a hex string, then drops the only live reference before invoking the +# consumer. Ray's distributed reference counter sees zero references, so the +# object is eligible for collection by the time the consumer rebuilds it. +inner_ref = ray.put(42) +inner_ref_hex = inner_ref.hex() +del inner_ref +gc.collect() +broken_result = ray.get(consumer_broken.remote(inner_ref_hex, 5.0)) # Likely (0, 1) -- the object was unreachable from the consumer's perspective. print(f"broken: ready={broken_result[0]}, not_ready={broken_result[1]}") -# Correct usage: pass the ObjectRef itself. Ray tracks the reference end to end. -correct_result = ray.get(consumer_correct.remote(inner_ref, 5.0)) +# Correct usage: pass the hex string (so the task can reconstruct the ref) and +# also pass the live ObjectRef inside a container so Ray's ref counter keeps +# the underlying object alive for the duration of the task. Wrapping in a list +# prevents Ray from auto-dereferencing the top-level argument. +inner_ref = ray.put(42) +inner_ref_hex = inner_ref.hex() +correct_result = ray.get( + consumer_correct.remote(inner_ref_hex, [inner_ref], 5.0) +) # Expect (1, 0) -- the object is ready. print(f"correct: ready={correct_result[0]}, not_ready={correct_result[1]}") # __anti_pattern_end__