DeepSpeed stage 3 and mixed precision cause an error
🐛 Bug
Using strategy="deepspeed_stage_3" together with precision=16 causes a runtime error when calling trainer.fit().
To Reproduce
import os

import torch
from torch.utils.data import DataLoader, Dataset
from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam
from pytorch_lightning import LightningModule, Trainer


class RandomDataset(Dataset):
    """Random tensors standing in for real data."""

    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class BoringModel(LightningModule):
    """Minimal LightningModule: a single Linear layer."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("valid_loss", loss)

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return FusedAdam(self.layer.parameters(), lr=0.1)
        # return torch.optim.Adam(self.parameters(), lr=0.1)


def run():
    train_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    val_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    test_data = DataLoader(RandomDataset(32, 64), batch_size=2)

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=os.getcwd(),
        limit_train_batches=1,
        limit_val_batches=1,
        num_sanity_val_steps=0,
        max_epochs=1,
        enable_model_summary=False,
        logger=False,
        enable_checkpointing=False,
        gpus=4,
        precision=16,
        strategy="deepspeed_stage_3",
    )
    trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)
    trainer.test(model, dataloaders=test_data)


if __name__ == "__main__":
    run()
I get the following error:
Traceback (most recent call last):
File "bug.py", line 69, in <module>
run()
File "bug.py", line 64, in run
trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 735, in fit
self._call_and_handle_interrupt(
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 682, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 770, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1182, in _run
self._pre_dispatch()
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1217, in _pre_dispatch
self.accelerator.pre_dispatch(self)
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 136, in pre_dispatch
self.training_type_plugin.pre_dispatch()
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/deepspeed.py", line 397, in pre_dispatch
self.init_deepspeed()
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/deepspeed.py", line 474, in init_deepspeed
self._initialize_deepspeed_train(model)
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/deepspeed.py", line 507, in _initialize_deepspeed_train
model, deepspeed_optimizer = self._setup_model_and_optimizer(model, optimizer, scheduler)
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/deepspeed.py", line 431, in _setup_model_and_optimizer
deepspeed_engine, deepspeed_optimizer, _, _ = deepspeed.initialize(
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/deepspeed/__init__.py", line 131, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 223, in __init__
self._configure_optimizer(optimizer, model_parameters)
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 905, in _configure_optimizer
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1152, in _configure_zero_optimizer
optimizer = FP16_DeepSpeedZeroOptimizer_Stage3(
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 905, in __init__
self.create_reduce_and_remove_grad_hooks()
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 1885, in create_reduce_and_remove_grad_hooks
param.all_gather()
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 590, in all_gather
return self._all_gather(param_list, async_op=async_op, hierarchy=hierarchy)
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 699, in _all_gather
ret_value = self._allgather_params_coalesced(all_gather_list, hierarchy)
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 947, in _allgather_params_coalesced
h = dist._all_gather_base(allgather_params[param_idx],
File "/home/kirill.trapeznikov/miniconda3/envs/semafor_nlg/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 2070, in _all_gather_base
work = group._allgather_base(output_tensor, input_tensor)
RuntimeError: output tensor must have the same type as input tensor
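For context on the final frame: torch.distributed's _all_gather_base requires the gather buffer to have the same dtype as the input shard (and to be world_size times its size). Under stage 3 with precision=16, a parameter shard and its gather buffer apparently end up with different dtypes, tripping that check. Below is a minimal sketch of the contract with illustrative names; this mirrors the preconditions, not the actual C++ check:

import torch

def check_allgather_buffers(output, shard, world_size):
    # Illustrative mirror of the preconditions _all_gather_base enforces.
    if output.dtype != shard.dtype:
        raise RuntimeError("output tensor must have the same type as input tensor")
    if output.numel() != world_size * shard.numel():
        raise RuntimeError("output tensor size must equal world_size times input size")

shard = torch.randn(8, dtype=torch.float16)             # e.g. an fp16 parameter shard
gather_buf = torch.empty(4 * 8, dtype=torch.float32)    # buffer created with the wrong dtype
check_allgather_buffers(gather_buf, shard, world_size=4)  # raises the error from the traceback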
Expected behavior
it should work, right?
Environment
* CUDA:
- GPU:
- GeForce RTX 2080 Ti
- GeForce RTX 2080 Ti
- GeForce RTX 2080 Ti
- GeForce RTX 2080 Ti
- GeForce RTX 2080 Ti
- GeForce RTX 2080 Ti
- GeForce RTX 2080 Ti
- GeForce RTX 2080 Ti
- available: True
- version: 11.3
* Packages:
- numpy: 1.21.1
- pyTorch_debug: False
- pyTorch_version: 1.10.0+cu113
- pytorch-lightning: 1.5.1
- tqdm: 4.62.0
* System:
- OS: Linux
- architecture:
- 64bit
- ELF
- processor: x86_64
- python: 3.8.11
- version: #1 SMP Wed Feb 3 15:06:38 UTC 2021
- Any other relevant information: deepspeed 0.5.6
Comments
Ahhh, huge thanks @tjruwase! I recall having this in place because some internal DeepSpeed assertions were raised if the model was partially partitioned. I've removed the code and confirmed all tests are passing.
@ktrapeznikov #10655 should fix this issue.
@SeanNaren, the problem seems to be due to Lightning calling zero.Init() here on an already-constructed model. In particular, zero.Init() is meant for constructing massive models that are too large for a single device. For already-constructed models, the stage 3 optimizer will automatically set up the required partitioning, as seen here. Hope that helps.
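To make the distinction concrete, here is a minimal sketch of the two paths, assuming a distributed launch (e.g. via the deepspeed launcher) and using a tiny Linear as a stand-in for a massive model; the config keys are a bare illustration, not a complete setup:

import deepspeed
import torch

# Path 1: build a model too large for one device. Wrapping construction
# in zero.Init() shards each parameter across ranks as it is created,
# so the full model never materializes on a single device.
with deepspeed.zero.Init():
    huge_model = torch.nn.Linear(32, 2)  # stands in for a massive model

# Path 2: an already-constructed model should NOT go through zero.Init();
# handing it to deepspeed.initialize() with a stage-3 config lets the
# stage-3 optimizer set up the partitioning itself.
small_model = torch.nn.Linear(32, 2)
engine, optimizer, _, _ = deepspeed.initialize(
    model=small_model,
    model_parameters=small_model.parameters(),
    config_params={
        "train_batch_size": 2,
        "fp16": {"enabled": True},
        "zero_optimization": {"stage": 3},
    },
)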