pytorch
diff --git a/docs/source/reference/llms.rst
Lines changed: 160 additions & 9 deletions b/docs/source/reference/llms.rst
Lines changed: 160 additions & 9 deletions
diff --git a/examples/collectors/weight_sync_standalone.py
Lines changed: 2 additions & 2 deletions b/examples/collectors/weight_sync_standalone.py
Lines changed: 2 additions & 2 deletions
diff --git a/sota-implementations/expert-iteration/ei_utils.py
Lines changed: 24 additions & 26 deletions b/sota-implementations/expert-iteration/ei_utils.py
Lines changed: 24 additions & 26 deletions
diff --git a/sota-implementations/expert-iteration/expert-iteration-async.py
Lines changed: 26 additions & 13 deletions b/sota-implementations/expert-iteration/expert-iteration-async.py
Lines changed: 26 additions & 13 deletions
@@ -633,7 +633,7 @@ Collectors
 .. _Collectors:
 
 TorchRL offers specialized collector classes (:class:`~torchrl.collectors.llm.LLMCollector` and :class:`~torchrl.collectors.llm.RayLLMCollector`) 
-that are tailored for LLM use cases. We also provide dedicated updaters for some inference engines.
+that are tailored for LLM use cases. We also provide weight synchronization schemes for vLLM inference engines.
 
 See :ref:`ref_collectors` for more details on the collector API. In brief, the idea of a collector is to isolate the inference part of the pipeline
 in a dedicated class. 
@@ -649,8 +649,126 @@ Collectors are defined by the following parameters and features:
   In other cases, the collector can be iterated over to collect data.
 - **Steps**: A collector is built with a certain number of steps budget, as well as a number of steps to be
   included in each batch yield during collection.
-- **Weight Updater**: Weight updaters are the classes that update the policy weights. Isolating the weight update
-  in a dedicated class allows to easily implement different weight update strategies depending on the policy specification.
+- **Weight Synchronization Schemes**: Weight sync schemes handle the synchronization of weights between the training model
+  and the inference engine. The new scheme-based approach provides flexible, high-performance weight updates for vLLM and
+  other inference backends.
+
+vLLM Weight Synchronization Schemes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+TorchRL provides two weight synchronization schemes for vLLM engines, offering different trade-offs between
+performance and simplicity:
+
+**1. NCCL-Based Synchronization** (:class:`~torchrl.weight_update.llm.VLLMWeightSyncScheme`)
+
+Uses NCCL collectives for high-bandwidth GPU-to-GPU weight transfers. Best for:
+
+- High-frequency weight updates
+- Large models where transfer speed is critical
+- Setups with GPU interconnect (NVLink, InfiniBand)
+
+**2. Double-Buffer Synchronization** (:class:`~torchrl.weight_update.llm.VLLMDoubleBufferSyncScheme`)
+
+Uses memory-mapped file storage for asynchronous weight transfers. Best for:
+
+- Simpler setup without NCCL coordination
+- Distributed setups with shared filesystems (NFS)
+- Cases where update frequency is lower
+
+**Usage Example with NCCL:**
+
+.. code-block:: python
+
+    from torchrl.collectors.llm import RayLLMCollector
+    from torchrl.weight_update.llm import VLLMWeightSyncScheme, get_model_metadata
+    from torchrl.modules.llm import AsyncVLLM, vLLMWrapper
+    
+    # Create vLLM engine
+    vllm_engine = AsyncVLLM.from_pretrained(
+        "Qwen/Qwen2.5-7B",
+        num_devices=2,
+        num_replicas=2,
+    )
+    policy = vLLMWrapper(vllm_engine, input_mode="history")
+    
+    # Create NCCL weight sync scheme
+    weight_sync_scheme = VLLMWeightSyncScheme(
+        master_address="localhost",
+        master_port=29500,
+        gpus_per_replica=2,  # tp_size × dp_size × pp_size
+        num_replicas=2,
+        strategy="state_dict"
+    )
+    
+    # Create collector with weight sync scheme
+    collector = RayLLMCollector(
+        env=make_env,
+        policy=policy,
+        dialog_turns_per_batch=256,
+        total_dialog_turns=10000,
+        weight_sync_schemes={"policy": weight_sync_scheme},
+        track_policy_version=True,
+    )
+    
+    # During training, get the sender and update weights
+    sender = collector._weight_senders["policy"]
+    sender.register_model(training_model)
+    
+    # Initialize collective group (must be called before first update)
+    metadata = get_model_metadata(training_model)
+    sender.init_all_workers_group(metadata, vllm_engine=vllm_engine)
+    
+    # Update weights during training
+    for i, data in enumerate(collector):
+        # ... training step ...
+        if i % 10 == 0:
+            sender.update_weights()  # Broadcasts via NCCL
+
+**Usage Example with Double-Buffer:**
+
+.. code-block:: python
+
+    from torchrl.collectors.llm import RayLLMCollector
+    from torchrl.weight_update.llm import VLLMDoubleBufferSyncScheme
+    from torchrl.modules.llm import AsyncVLLM, vLLMWrapper
+    
+    # Create vLLM engine
+    vllm_engine = AsyncVLLM.from_pretrained(
+        "Qwen/Qwen2.5-7B",
+        num_devices=2,
+        num_replicas=1,
+    )
+    policy = vLLMWrapper(vllm_engine, input_mode="history")
+    
+    # Create double-buffer weight sync scheme
+    weight_sync_scheme = VLLMDoubleBufferSyncScheme(
+        remote_addr="/tmp/weights",  # Or "/mnt/shared/weights" for NFS
+        num_threads=128,
+        strategy="state_dict"
+    )
+    
+    # Create collector with weight sync scheme
+    collector = RayLLMCollector(
+        env=make_env,
+        policy=policy,
+        dialog_turns_per_batch=256,
+        total_dialog_turns=10000,
+        weight_sync_schemes={"policy": weight_sync_scheme},
+        track_policy_version=True,
+    )
+    
+    # During training, get the sender and register the training model
+    sender = collector._weight_senders["policy"]
+    sender.register_model(training_model)
+    
+    # No initialization needed for double-buffer scheme!
+    
+    # Update weights during training
+    for i, data in enumerate(collector):
+        # ... training step ...
+        if i % 10 == 0:
+            sender.update_weights()  # Writes to shared storage
+            # vLLM workers can poll and apply: receiver.poll_and_apply()
 
 Policy Version Tracking
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -662,19 +780,52 @@ transform, or a boolean to the collector constructor.
 
     >>> from torchrl.envs.llm.transforms import PolicyVersion
     >>> from torchrl.collectors.llm import LLMCollector
-    >>> from torchrl.collectors.llm.weight_update import vLLMUpdater
+    >>> from torchrl.weight_update.llm import VLLMWeightSyncScheme, get_model_metadata
     >>> env = make_env() # place your code here
     >>> policy = make_policy() # place your code here
-    >>> collector = LLMCollector(env, policy=policy, weight_updater=vLLMUpdater(), track_policy_version=True)
-    >>> # init the updater
-    >>> collector.weight_updater.init(...)
-    >>> # the version is incremented after each weight update
-    >>> collector.update_policy_weights_(state_dict=...)
+    >>> scheme = VLLMWeightSyncScheme(master_port=29500, gpus_per_replica=1, num_replicas=1)
+    >>> collector = LLMCollector(env, policy=policy, weight_sync_schemes={"policy": scheme}, track_policy_version=True)
+    >>> # Get the sender and register model
+    >>> sender = collector._weight_senders["policy"]
+    >>> sender.register_model(training_model)
+    >>> # Initialize the collective group
+    >>> metadata = get_model_metadata(training_model)
+    >>> sender.init_all_workers_group(metadata, vllm_engine=policy.model)
+    >>> # Update weights
+    >>> sender.update_weights()
     >>> print(collector.policy_version_tracker.version)
     >>> # the policy version is written in the data
     >>> for data in collector:
     ...     print(data["policy_version"])
 
+.. currentmodule:: torchrl.weight_update.llm
+
+.. autosummary::
+    :toctree: generated/
+    :template: rl_template.rst
+
+    VLLMWeightSyncScheme
+    VLLMWeightSender
+    VLLMWeightReceiver
+    VLLMCollectiveTransport
+    VLLMDoubleBufferSyncScheme
+    VLLMDoubleBufferWeightSender
+    VLLMDoubleBufferWeightReceiver
+    VLLMDoubleBufferTransport
+    get_model_metadata
+
+Legacy Weight Updaters (Deprecated)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. deprecated:: 0.11
+    The ``vLLMUpdater`` and ``vLLMUpdaterV2`` classes are deprecated in favor of the new weight synchronization schemes
+    (:class:`~torchrl.weight_update.llm.VLLMWeightSyncScheme` and :class:`~torchrl.weight_update.llm.VLLMDoubleBufferSyncScheme`).
+    These schemes provide better performance, more flexibility, and cleaner integration with collectors.
+    The legacy updaters will be removed in a future release.
+
+    The legacy weight updaters (``vLLMUpdater`` and ``vLLMUpdaterV2``) are still available but are no longer recommended.
+    Please migrate to the new weight synchronization schemes shown above.
+
 .. currentmodule:: torchrl.collectors.llm
 
 .. autosummary::
 
@@ -129,7 +129,7 @@ def example_multiprocess_sync():
         print(
             f"Main: Worker completed. Worker's weight sum: {model_state['weight_sum']:.4f}"
         )
-        print("✓ Weight synchronization successful!")
+        print("Weight synchronization successful!")
 
 
 def example_shared_memory_sync():
@@ -179,7 +179,7 @@ def example_shared_memory_sync():
         print(
             f"Main: Worker completed. Worker's weight sum: {model_state['weight_sum']:.4f}"
         )
-        print("✓ Shared memory synchronization successful!")
+        print("Shared memory synchronization successful!")
 
 
 def main():
 
@@ -15,10 +15,10 @@
 from torch import device as torch_device, dtype as torch_dtype
 
 from torchrl._utils import logger as torchrl_logger
-from torchrl.collectors.llm.weight_update.vllm import vLLMUpdater
 from torchrl.envs.llm import RetrieveLogProb
 from torchrl.envs.llm.datasets.ifeval import IFEvalEnv
 from torchrl.modules.llm import TransformersWrapper, vLLMWrapper
+from torchrl.weight_update.llm import VLLMWeightSyncScheme
 from transformers.models.auto.modeling_auto import AutoModelForCausalLM
 from transformers.tokenization_utils import PreTrainedTokenizer
 
@@ -479,42 +479,40 @@ def get_hf_model(
         torch.set_default_dtype(original_dtype)
 
 
-def make_weight_updater(
-    policy_training=None,
+def make_weight_sync_scheme(
     master_address=None,
     master_port=None,
-    model_metadata=None,
-    vllm_tp_size=None,
-) -> vLLMUpdater:
-    """Creates a vLLM weight updater for the policy.
+    vllm_tp_size=1,
+) -> VLLMWeightSyncScheme:
+    """Creates a vLLM weight synchronization scheme using NCCL collectives.
 
-    This function can be used in two ways:
-    1. Synchronous mode (expert-iteration-sync.py): Pass policy_training to get an initialized updater with metadata
-    2. Async mode (expert-iteration-async.py): Pass master_address, master_port, model_metadata, and remote_actor
+    This function creates a weight sync scheme that uses NCCL for high-performance
+    GPU-to-GPU weight transfers from the training model to vLLM inference workers.
 
     Args:
-        policy_training (Optional[TransformersWrapper]): The training policy model. Required for sync mode.
-        master_address (Optional[str]): Ray master address for async mode.
-        master_port (Optional[int]): Ray master port for async mode.
-        model_metadata (Optional[dict]): Model metadata for async mode. If not provided but policy_training is,
-            it will be extracted from the policy.
-        vllm_tp_size (Optional[int]): vLLM tensor parallel size. If not provided, will be set to 1.
+        master_address (Optional[str]): Address of the master node for distributed init.
+            Defaults to "localhost".
+        master_port (Optional[int]): Port of the master node for distributed init.
+            If None, will auto-assign.
+        vllm_tp_size (int): vLLM tensor parallel size (gpus_per_replica). Defaults to 1.
 
     Returns:
-        vLLMUpdater: An instance of the weight updater configured to update
-            the vLLM worker's weights.
+        VLLMWeightSyncScheme: A weight sync scheme configured for the vLLM engine.
     """
-    if model_metadata is None and policy_training is not None:
-        # Extract metadata from training policy
-        model_metadata = {
-            k: (v.dtype, v.shape) for k, v in policy_training.model.state_dict().items()
-        }
+    if master_address is None:
+        master_address = "localhost"
+
+    torchrl_logger.info(
+        f"Creating VLLMWeightSyncScheme with tp_size={vllm_tp_size}, "
+        f"master_address={master_address}, master_port={master_port}"
+    )
 
-    return vLLMUpdater(
+    return VLLMWeightSyncScheme(
         master_address=master_address,
         master_port=master_port,
-        model_metadata=model_metadata,
-        vllm_tp_size=vllm_tp_size,
+        gpus_per_replica=vllm_tp_size,
+        num_replicas=1,  # For expert iteration, typically 1 replica
+        strategy="state_dict",
     )
 
 
 
@@ -13,9 +13,9 @@
 import hydra
 
 from torchrl import torchrl_logger
-from torchrl.collectors.llm.weight_update.vllm import vLLMUpdater
 from torchrl.data.llm.history import History
 from torchrl.record.loggers.wandb import WandbLogger
+from torchrl.weight_update.llm import get_model_metadata
 
 try:
     import ray
@@ -33,7 +33,7 @@
     get_train_model,
     log_training_metrics,
     make_env,
-    make_weight_updater,
+    make_weight_sync_scheme,
     RemoteDataLogger,
 )
 from omegaconf import DictConfig
@@ -115,26 +115,39 @@ def train(
     if cfg.model.compile:
         loss_fn = torch.compile(loss_fn)
 
-    # Get metadata
-    model_metadata = vLLMUpdater.get_model_metadata(policy_training)
+    # Get vLLM engine from the inference policy
+    # Note: In expert iteration, the inference policy is typically created in get_inference_model
+    # We need to get the vLLM engine from the collector's policy or create it
+    # For now, we'll use the approach similar to GRPO with explicit scheme creation
 
-    # Create weight updater with remote LLM
-    weight_updater: vLLMUpdater = make_weight_updater(
+    # Create weight sync scheme
+    weight_sync_scheme = make_weight_sync_scheme(
         master_address="localhost",  # Since we're running locally
         master_port=None,  # Will auto-assign an open port
-        model_metadata=model_metadata,
         vllm_tp_size=cfg.inference_model.num_devices
         if cfg.inference_model.num_devices is not None
         else len(cfg.inference_model.get("devices", [1])),
     )
-    collector.weight_updater = weight_updater
 
-    # Initialize the weight updater
-    weight_updater.init(model_metadata=model_metadata)
+    # Set up weight sender
+    torchrl_logger.info("Setting up weight synchronization scheme...")
+    sender = weight_sync_scheme.create_sender()
+    sender.register_model(policy_training)
 
-    # First update the weights
+    # Get vLLM engine reference from collector's policy
+    # The collector has the policy which wraps the vLLM engine
+    vllm_engine = collector.policy.model if hasattr(collector, "policy") else None
+    if vllm_engine is None:
+        raise RuntimeError("Could not get vLLM engine from collector policy")
+
+    # Initialize collective group
+    torchrl_logger.info("Initializing collective group...")
+    metadata = get_model_metadata(policy_training)
+    sender.init_all_workers_group(metadata, vllm_engine=vllm_engine)
+
+    # First weight update
     with timeit("update_policy_weights"):
-        weight_updater.push_weights(policy_training)
+        sender.update_weights()
     timeit.print(prefix="First update_policy_weights_ time")
     timeit.reset()
 
@@ -329,7 +342,7 @@ def train(
         if step % cfg.train.weight_update_frequency == 0:
             with timeit("update_policy_weights"):
                 torchrl_logger.info("Updating policy weights...")
-                weight_updater.push_weights(policy_training)
+                sender.update_weights()
                 # TODO: do we need this? Does it interfere with other processes?
                 # torch.cuda.empty_cache()
                 gc.collect()
Original file line number	Diff line number	Diff line change
`@@ -129,7 +129,7 @@ def example_multiprocess_sync():`
`129`	`129`	`print(`
`130`	`130`	`f"Main: Worker completed. Worker's weight sum: {model_state['weight_sum']:.4f}"`
`131`	`131`	`)`
`132`		`- print("✓ Weight synchronization successful!")`
	`132`	`+ print("Weight synchronization successful!")`
`133`	`133`
`134`	`134`
`135`	`135`	`def example_shared_memory_sync():`
`@@ -179,7 +179,7 @@ def example_shared_memory_sync():`
`179`	`179`	`print(`
`180`	`180`	`f"Main: Worker completed. Worker's weight sum: {model_state['weight_sum']:.4f}"`
`181`	`181`	`)`
`182`		`- print("✓ Shared memory synchronization successful!")`
	`182`	`+ print("Shared memory synchronization successful!")`
`183`	`183`
`184`	`184`
`185`	`185`	`def main():`