OutOfMemoryError                          Traceback (most recent call last)
Cell In[10], line 3
      1 torch.cuda.empty_cache()
----> 3 model = AutoModelForCausalLM.from_pretrained(
      4     model_name,
      5     quantization_config=bnb_config,
      6     device_map="auto",                    # will automatically offload layers
      7     offload_folder="/workspace/offload",  # temp folder on disk for offloading
      8 )

File /usr/local/lib/python3.12/dist-packages/transformers/models/auto/auto_factory.py:604, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
    602     if model_class.config_class == config.sub_configs.get("text_config", None):
    603         config = config.get_text_config()
--> 604     return model_class.from_pretrained(
    605         pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
    606     )
    607 raise ValueError(
    608     f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
    609     f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping)}."
    610 )

File /usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py:277, in restore_default_dtype.<locals>._wrapper(*args, **kwargs)
    275 old_dtype = torch.get_default_dtype()
    276 try:
--> 277     return func(*args, **kwargs)
    278 finally:
    279     torch.set_default_dtype(old_dtype)

File /usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py:5051, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, weights_only, *model_args, **kwargs)
   5041 if dtype_orig is not None:
   5042     torch.set_default_dtype(dtype_orig)
   5044 (
   5045     model,
   5046     missing_keys,
   5047     unexpected_keys,
   5048     mismatched_keys,
   5049     offload_index,
   5050     error_msgs,
-> 5051 ) = cls._load_pretrained_model(
   5052     model,
   5053     state_dict,
   5054     checkpoint_files,
   5055     pretrained_model_name_or_path,
   5056     ignore_mismatched_sizes=ignore_mismatched_sizes,
   5057     sharded_metadata=sharded_metadata,
   5058     device_map=device_map,
   5059     disk_offload_folder=offload_folder,
   5060     dtype=dtype,
   5061     hf_quantizer=hf_quantizer,
   5062     keep_in_fp32_regex=keep_in_fp32_regex,
   5063     device_mesh=device_mesh,
   5064     key_mapping=key_mapping,
   5065     weights_only=weights_only,
   5066 )
   5067 # make sure token embedding weights are still tied if needed
   5068 model.tie_weights()

File /usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py:5471, in PreTrainedModel._load_pretrained_model(cls, model, state_dict, checkpoint_files, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, device_map, disk_offload_folder, dtype, hf_quantizer, keep_in_fp32_regex, device_mesh, key_mapping, weights_only)
   5468     args_list = logging.tqdm(args_list, desc="Loading checkpoint shards")
   5470 for args in args_list:
-> 5471     _error_msgs, disk_offload_index = load_shard_file(args)
   5472     error_msgs += _error_msgs
   5474 # Save offloaded index if needed

File /usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py:847, in load_shard_file(args)
    845 # Skip it with fsdp on ranks other than 0
    846 elif not (is_fsdp_enabled() and not is_local_dist_rank_0() and not is_quantized):
--> 847     disk_offload_index = _load_state_dict_into_meta_model(
    848         model,
    849         state_dict,
    850         shard_file,
    851         reverse_key_renaming_mapping,
    852         device_map=device_map,
    853         disk_offload_folder=disk_offload_folder,
    854         disk_offload_index=disk_offload_index,
    855         hf_quantizer=hf_quantizer,
    856         keep_in_fp32_regex=keep_in_fp32_regex,
    857         device_mesh=device_mesh,
    858     )
    860 return error_msgs, disk_offload_index

File /usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py:120, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    117 @functools.wraps(func)
    118 def decorate_context(*args, **kwargs):
    119     with ctx_factory():
--> 120         return func(*args, **kwargs)

File /usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py:770, in _load_state_dict_into_meta_model(model, state_dict, shard_file, reverse_renaming_mapping, device_map, disk_offload_folder, disk_offload_index, hf_quantizer, keep_in_fp32_regex, device_mesh)
    767         if is_fsdp_enabled():
    768             param_device = "cpu" if is_local_dist_rank_0() else "meta"
--> 770         _load_parameter_into_model(model, param_name, param.to(param_device))
    772     else:
    773         # TODO naming is stupid it loads it as well
    774         hf_quantizer.create_quantized_param(model, param, param_name, param_device)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.50 GiB. GPU 1 has a total capacity of 94.97 GiB of which 600.19 MiB is free. Including non-PyTorch memory, this process has 94.38 GiB memory in use. Of the allocated memory 93.46 GiB is allocated by PyTorch, and 284.63 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
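Two mitigations follow directly from this traceback: the allocator hint it prints (PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True), and capping per-GPU usage with the max_memory argument of from_pretrained so that accelerate offloads layers that do not fit to CPU RAM and then to the offload_folder on disk, instead of filling GPU 1 as device_map="auto" did here. The sketch below is a minimal, untested illustration: model_name and bnb_config are the same objects as in the failing cell, while the specific max_memory budgets are assumptions to be tuned. Note that torch.cuda.empty_cache() only releases cached-but-unallocated blocks; if a model loaded in an earlier cell is still referenced, its ~93 GiB stays allocated, so drop that reference first.

    import gc
    import os

    # Must be set before the CUDA context is created (i.e. at the top of the
    # notebook, before torch touches the GPU); it has no effect afterwards.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    import torch
    from transformers import AutoModelForCausalLM

    # empty_cache() alone cannot free memory that live references keep allocated.
    # del old_model   # hypothetical name for a model loaded in an earlier cell
    gc.collect()
    torch.cuda.empty_cache()

    model = AutoModelForCausalLM.from_pretrained(
        model_name,                           # same checkpoint as the failing cell
        quantization_config=bnb_config,       # same BitsAndBytesConfig as before
        device_map="auto",
        # Cap what accelerate may place on each GPU; layers that do not fit are
        # spilled to CPU RAM, then to offload_folder on disk. The budgets below
        # are illustrative for ~95 GiB cards; leave headroom for activations.
        max_memory={0: "85GiB", 1: "85GiB", "cpu": "120GiB"},
        offload_folder="/workspace/offload",
    )

Lowering the max_memory budgets further simply pushes more layers to CPU and disk, trading inference speed for a load that actually completes.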