OutOfMemoryError                          Traceback (most recent call last)
Cell In[10], line 3
      1 torch.cuda.empty_cache()
----> 3 model = AutoModelForCausalLM.from_pretrained(
      4     model_name,
      5     quantization_config=bnb_config,
      6     device_map="auto",                    # will automatically offload layers
      7     offload_folder="/workspace/offload",  # temp folder on disk for offloading
      8 )

File /usr/local/lib/python3.12/dist-packages/transformers/models/auto/auto_factory.py:604, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
    602     if model_class.config_class == config.sub_configs.get("text_config", None):
    603         config = config.get_text_config()
--> 604     return model_class.from_pretrained(
    605         pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
    606     )
    607 raise ValueError(
    608     f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
    609     f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping)}."
    610 )

File /usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py:277, in restore_default_dtype.<locals>._wrapper(*args, **kwargs)
    275 old_dtype = torch.get_default_dtype()
    276 try:
--> 277     return func(*args, **kwargs)
    278 finally:
    279     torch.set_default_dtype(old_dtype)

File /usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py:5051, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, weights_only, *model_args, **kwargs)
   5041 if dtype_orig is not None:
   5042     torch.set_default_dtype(dtype_orig)
   5044 (
   5045     model,
   5046     missing_keys,
   5047     unexpected_keys,
   5048     mismatched_keys,
   5049     offload_index,
   5050     error_msgs,
-> 5051 ) = cls._load_pretrained_model(
   5052     model,
   5053     state_dict,
   5054     checkpoint_files,
   5055     pretrained_model_name_or_path,
   5056     ignore_mismatched_sizes=ignore_mismatched_sizes,
   5057     sharded_metadata=sharded_metadata,
   5058     device_map=device_map,
   5059     disk_offload_folder=offload_folder,
   5060     dtype=dtype,
   5061     hf_quantizer=hf_quantizer,
   5062     keep_in_fp32_regex=keep_in_fp32_regex,
   5063     device_mesh=device_mesh,
   5064     key_mapping=key_mapping,
   5065     weights_only=weights_only,
   5066 )
   5067 # make sure token embedding weights are still tied if needed
   5068 model.tie_weights()

File /usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py:5471, in PreTrainedModel._load_pretrained_model(cls, model, state_dict, checkpoint_files, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, device_map, disk_offload_folder, dtype, hf_quantizer, keep_in_fp32_regex, device_mesh, key_mapping, weights_only)
   5468     args_list = logging.tqdm(args_list, desc="Loading checkpoint shards")
   5470 for args in args_list:
-> 5471     _error_msgs, disk_offload_index = load_shard_file(args)
   5472     error_msgs += _error_msgs
   5474 # Save offloaded index if needed

File /usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py:847, in load_shard_file(args)
    845 # Skip it with fsdp on ranks other than 0
    846 elif not (is_fsdp_enabled() and not is_local_dist_rank_0() and not is_quantized):
--> 847     disk_offload_index = _load_state_dict_into_meta_model(
    848         model,
    849         state_dict,
    850         shard_file,
    851         reverse_key_renaming_mapping,
    852         device_map=device_map,
    853         disk_offload_folder=disk_offload_folder,
    854         disk_offload_index=disk_offload_index,
    855         hf_quantizer=hf_quantizer,
    856         keep_in_fp32_regex=keep_in_fp32_regex,
    857         device_mesh=device_mesh,
    858     )
    860 return error_msgs, disk_offload_index

File /usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py:120, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    117 @functools.wraps(func)
    118 def decorate_context(*args, **kwargs):
    119     with ctx_factory():
--> 120         return func(*args, **kwargs)

File /usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py:770, in _load_state_dict_into_meta_model(model, state_dict, shard_file, reverse_renaming_mapping, device_map, disk_offload_folder, disk_offload_index, hf_quantizer, keep_in_fp32_regex, device_mesh)
    767         if is_fsdp_enabled():
    768             param_device = "cpu" if is_local_dist_rank_0() else "meta"
--> 770         _load_parameter_into_model(model, param_name, param.to(param_device))
    772     else:
    773         # TODO naming is stupid it loads it as well
    774         hf_quantizer.create_quantized_param(model, param, param_name, param_device)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.50 GiB. GPU 1 has a total capacity of 94.97 GiB of which 600.19 MiB is free. Including non-PyTorch memory, this process has 94.38 GiB memory in use. Of the allocated memory 93.46 GiB is allocated by PyTorch, and 284.63 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
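Two mitigations follow directly from this traceback: the allocator hint it prints (PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True), and capping per-GPU usage with the max_memory argument of from_pretrained so that accelerate offloads layers that do not fit to CPU RAM and then to the offload_folder on disk, instead of filling GPU 1 as device_map="auto" did here. The sketch below is a minimal, untested illustration: model_name and bnb_config are the same objects as in the failing cell, while the specific max_memory budgets are assumptions to be tuned. Note that torch.cuda.empty_cache() only releases cached-but-unallocated blocks; if a model loaded in an earlier cell is still referenced, its ~93 GiB stays allocated, so drop that reference first.

    import gc
    import os

    # Must be set before the CUDA context is created (i.e. at the top of the
    # notebook, before torch touches the GPU); it has no effect afterwards.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    import torch
    from transformers import AutoModelForCausalLM

    # empty_cache() alone cannot free memory that live references keep allocated.
    # del old_model   # hypothetical name for a model loaded in an earlier cell
    gc.collect()
    torch.cuda.empty_cache()

    model = AutoModelForCausalLM.from_pretrained(
        model_name,                           # same checkpoint as the failing cell
        quantization_config=bnb_config,       # same BitsAndBytesConfig as before
        device_map="auto",
        # Cap what accelerate may place on each GPU; layers that do not fit are
        # spilled to CPU RAM, then to offload_folder on disk. The budgets below
        # are illustrative for ~95 GiB cards; leave headroom for activations.
        max_memory={0: "85GiB", 1: "85GiB", "cpu": "120GiB"},
        offload_folder="/workspace/offload",
    )

Lowering the max_memory budgets further simply pushes more layers to CPU and disk, trading inference speed for a load that actually completes.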