-
Notifications
You must be signed in to change notification settings - Fork 1k
Open
Labels
Description
🐛 Bug
After the TPU VM upgrade, transformers gets stuck while loading/downloading the model
It seems tqdm is broken in the new VM.
To Reproduce
Select TPU accelerator
import torch
import torch_xla
from transformers import AutoTokenizer, AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
Expected behavior
Transformers downloads the model.
Current behavior
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File /usr/local/lib/python3.12/site-packages/traitlets/traitlets.py:632, in TraitType.get(self, obj, cls)
631 try:
--> 632 value = obj._trait_values[self.name]
633 except KeyError:
634 # Check for a dynamic initializer.
KeyError: 'layout'
During handling of the above exception, another exception occurred:
LookupError Traceback (most recent call last)
Cell In[1], line 5
2 import torch_xla
3 from transformers import AutoTokenizer, AutoModelForCausalLM
----> 5 model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
File /usr/local/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py:604, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
602 if model_class.config_class == config.sub_configs.get("text_config", None):
603 config = config.get_text_config()
--> 604 return model_class.from_pretrained(
605 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
606 )
607 raise ValueError(
608 f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
609 f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping)}."
610 )
File /usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py:288, in restore_default_dtype.<locals>._wrapper(*args, **kwargs)
286 old_dtype = torch.get_default_dtype()
287 try:
--> 288 return func(*args, **kwargs)
289 finally:
290 torch.set_default_dtype(old_dtype)
File /usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py:5030, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, weights_only, *model_args, **kwargs)
5020 if (
5021 gguf_file
5022 and device_map is not None
5023 and ((isinstance(device_map, dict) and "disk" in device_map.values()) or "disk" in device_map)
5024 ):
5025 raise RuntimeError(
5026 "One or more modules is configured to be mapped to disk. Disk offload is not supported for models "
5027 "loaded from GGUF files."
5028 )
-> 5030 checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
5031 pretrained_model_name_or_path=pretrained_model_name_or_path,
5032 subfolder=subfolder,
5033 variant=variant,
5034 gguf_file=gguf_file,
5035 from_tf=from_tf,
5036 from_flax=from_flax,
5037 use_safetensors=use_safetensors,
5038 cache_dir=cache_dir,
5039 force_download=force_download,
5040 proxies=proxies,
5041 local_files_only=local_files_only,
5042 token=token,
5043 user_agent=user_agent,
5044 revision=revision,
5045 commit_hash=commit_hash,
5046 is_remote_code=cls._auto_class is not None,
5047 transformers_explicit_filename=transformers_explicit_filename,
5048 )
5050 is_sharded = sharded_metadata is not None
5051 is_quantized = hf_quantizer is not None
File /usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py:1308, in _get_resolved_checkpoint_files(pretrained_model_name_or_path, subfolder, variant, gguf_file, from_tf, from_flax, use_safetensors, cache_dir, force_download, proxies, local_files_only, token, user_agent, revision, commit_hash, is_remote_code, transformers_explicit_filename)
1306 sharded_metadata = None
1307 if is_sharded:
-> 1308 checkpoint_files, sharded_metadata = get_checkpoint_shard_files(
1309 pretrained_model_name_or_path,
1310 resolved_archive_file,
1311 cache_dir=cache_dir,
1312 force_download=force_download,
1313 proxies=proxies,
1314 local_files_only=local_files_only,
1315 token=token,
1316 user_agent=user_agent,
1317 revision=revision,
1318 subfolder=subfolder,
1319 _commit_hash=commit_hash,
1320 )
1321 else:
1322 checkpoint_files = [resolved_archive_file] if pretrained_model_name_or_path is not None else None
File /usr/local/lib/python3.12/site-packages/transformers/utils/hub.py:1119, in get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, cache_dir, force_download, proxies, resume_download, local_files_only, token, user_agent, revision, subfolder, _commit_hash, **deprecated_kwargs)
1115 return shard_filenames, sharded_metadata
1117 # At this stage pretrained_model_name_or_path is a model identifier on the Hub. Try to get everything from cache,
1118 # or download the files
-> 1119 cached_filenames = cached_files(
1120 pretrained_model_name_or_path,
1121 shard_filenames,
1122 cache_dir=cache_dir,
1123 force_download=force_download,
1124 proxies=proxies,
1125 resume_download=resume_download,
1126 local_files_only=local_files_only,
1127 token=token,
1128 user_agent=user_agent,
1129 revision=revision,
1130 subfolder=subfolder,
1131 _commit_hash=_commit_hash,
1132 )
1134 return cached_filenames, sharded_metadata
File /usr/local/lib/python3.12/site-packages/transformers/utils/hub.py:566, in cached_files(path_or_repo_id, filenames, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)
563 # Any other Exception type should now be re-raised, in order to provide helpful error messages and break the execution flow
564 # (EntryNotFoundError will be treated outside this block and correctly re-raised if needed)
565 elif not isinstance(e, EntryNotFoundError):
--> 566 raise e
568 resolved_files = [
569 _get_cache_file_to_return(path_or_repo_id, filename, cache_dir, revision) for filename in full_filenames
570 ]
571 # If there are any missing file and the flag is active, raise
File /usr/local/lib/python3.12/site-packages/transformers/utils/hub.py:493, in cached_files(path_or_repo_id, filenames, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)
478 hf_hub_download(
479 path_or_repo_id,
480 filenames[0],
(...) 490 local_files_only=local_files_only,
491 )
492 else:
--> 493 snapshot_download(
494 path_or_repo_id,
495 allow_patterns=full_filenames,
496 repo_type=repo_type,
497 revision=revision,
498 cache_dir=cache_dir,
499 user_agent=user_agent,
500 force_download=force_download,
501 proxies=proxies,
502 resume_download=resume_download,
503 token=token,
504 local_files_only=local_files_only,
505 )
507 except Exception as e:
508 # We cannot recover from them
509 if isinstance(e, RepositoryNotFoundError) and not isinstance(e, GatedRepoError):
File /usr/local/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py:114, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
111 if check_use_auth_token:
112 kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
--> 114 return fn(*args, **kwargs)
File /usr/local/lib/python3.12/site-packages/huggingface_hub/_snapshot_download.py:332, in snapshot_download(repo_id, repo_type, revision, cache_dir, local_dir, library_name, library_version, user_agent, proxies, etag_timeout, force_download, token, local_files_only, allow_patterns, ignore_patterns, max_workers, tqdm_class, headers, endpoint, local_dir_use_symlinks, resume_download)
330 _inner_hf_hub_download(file)
331 else:
--> 332 thread_map(
333 _inner_hf_hub_download,
334 filtered_repo_files,
335 desc=tqdm_desc,
336 max_workers=max_workers,
337 # User can use its own tqdm class or the default one from `huggingface_hub.utils`
338 tqdm_class=tqdm_class or hf_tqdm,
339 )
341 if local_dir is not None:
342 return str(os.path.realpath(local_dir))
File /usr/local/lib/python3.12/site-packages/tqdm/contrib/concurrent.py:69, in thread_map(fn, *iterables, **tqdm_kwargs)
55 """
56 Equivalent of `list(map(fn, *iterables))`
57 driven by `concurrent.futures.ThreadPoolExecutor`.
(...) 66 [default: max(32, cpu_count() + 4)].
67 """
68 from concurrent.futures import ThreadPoolExecutor
---> 69 return _executor_map(ThreadPoolExecutor, fn, *iterables, **tqdm_kwargs)
File /usr/local/lib/python3.12/site-packages/tqdm/contrib/concurrent.py:51, in _executor_map(PoolExecutor, fn, *iterables, **tqdm_kwargs)
47 with ensure_lock(tqdm_class, lock_name=lock_name) as lk:
48 # share lock in case workers are already using `tqdm`
49 with PoolExecutor(max_workers=max_workers, initializer=tqdm_class.set_lock,
50 initargs=(lk,)) as ex:
---> 51 return list(tqdm_class(ex.map(fn, *iterables, chunksize=chunksize), **kwargs))
File /usr/local/lib/python3.12/site-packages/tqdm/notebook.py:250, in tqdm_notebook.__iter__(self)
248 try:
249 it = super().__iter__()
--> 250 for obj in it:
251 # return super(tqdm...) will not catch exception
252 yield obj
253 # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt
File /usr/local/lib/python3.12/site-packages/tqdm/std.py:1181, in tqdm.__iter__(self)
1178 time = self._time
1180 try:
-> 1181 for obj in iterable:
1182 yield obj
1183 # Update and possibly print the progressbar.
1184 # Note: does not call self.update(1) for speed optimisation.
File /usr/local/lib/python3.12/concurrent/futures/_base.py:619, in Executor.map.<locals>.result_iterator()
616 while fs:
617 # Careful not to keep a reference to the popped future
618 if timeout is None:
--> 619 yield _result_or_cancel(fs.pop())
620 else:
621 yield _result_or_cancel(fs.pop(), end_time - time.monotonic())
File /usr/local/lib/python3.12/concurrent/futures/_base.py:317, in _result_or_cancel(***failed resolving arguments***)
315 try:
316 try:
--> 317 return fut.result(timeout)
318 finally:
319 fut.cancel()
File /usr/local/lib/python3.12/concurrent/futures/_base.py:456, in Future.result(self, timeout)
454 raise CancelledError()
455 elif self._state == FINISHED:
--> 456 return self.__get_result()
457 else:
458 raise TimeoutError()
File /usr/local/lib/python3.12/concurrent/futures/_base.py:401, in Future.__get_result(self)
399 if self._exception:
400 try:
--> 401 raise self._exception
402 finally:
403 # Break a reference cycle with the exception in self._exception
404 self = None
File /usr/local/lib/python3.12/concurrent/futures/thread.py:59, in _WorkItem.run(self)
56 return
58 try:
---> 59 result = self.fn(*self.args, **self.kwargs)
60 except BaseException as exc:
61 self.future.set_exception(exc)
File /usr/local/lib/python3.12/site-packages/huggingface_hub/_snapshot_download.py:306, in snapshot_download.<locals>._inner_hf_hub_download(repo_file)
305 def _inner_hf_hub_download(repo_file: str):
--> 306 return hf_hub_download(
307 repo_id,
308 filename=repo_file,
309 repo_type=repo_type,
310 revision=commit_hash,
311 endpoint=endpoint,
312 cache_dir=cache_dir,
313 local_dir=local_dir,
314 local_dir_use_symlinks=local_dir_use_symlinks,
315 library_name=library_name,
316 library_version=library_version,
317 user_agent=user_agent,
318 proxies=proxies,
319 etag_timeout=etag_timeout,
320 resume_download=resume_download,
321 force_download=force_download,
322 token=token,
323 headers=headers,
324 )
File /usr/local/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py:114, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
111 if check_use_auth_token:
112 kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
--> 114 return fn(*args, **kwargs)
File /usr/local/lib/python3.12/site-packages/huggingface_hub/file_download.py:1010, in hf_hub_download(repo_id, filename, subfolder, repo_type, revision, library_name, library_version, cache_dir, local_dir, user_agent, force_download, proxies, etag_timeout, token, local_files_only, headers, endpoint, resume_download, force_filename, local_dir_use_symlinks)
990 return _hf_hub_download_to_local_dir(
991 # Destination
992 local_dir=local_dir,
(...) 1007 local_files_only=local_files_only,
1008 )
1009 else:
-> 1010 return _hf_hub_download_to_cache_dir(
1011 # Destination
1012 cache_dir=cache_dir,
1013 # File info
1014 repo_id=repo_id,
1015 filename=filename,
1016 repo_type=repo_type,
1017 revision=revision,
1018 # HTTP info
1019 endpoint=endpoint,
1020 etag_timeout=etag_timeout,
1021 headers=hf_headers,
1022 proxies=proxies,
1023 token=token,
1024 # Additional options
1025 local_files_only=local_files_only,
1026 force_download=force_download,
1027 )
File /usr/local/lib/python3.12/site-packages/huggingface_hub/file_download.py:1171, in _hf_hub_download_to_cache_dir(cache_dir, repo_id, filename, repo_type, revision, endpoint, etag_timeout, headers, proxies, token, local_files_only, force_download)
1168 # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)
1170 with WeakFileLock(lock_path):
-> 1171 _download_to_tmp_and_move(
1172 incomplete_path=Path(blob_path + ".incomplete"),
1173 destination_path=Path(blob_path),
1174 url_to_download=url_to_download,
1175 proxies=proxies,
1176 headers=headers,
1177 expected_size=expected_size,
1178 filename=filename,
1179 force_download=force_download,
1180 etag=etag,
1181 xet_file_data=xet_file_data,
1182 )
1183 if not os.path.exists(pointer_path):
1184 _create_symlink(blob_path, pointer_path, new_blob=True)
File /usr/local/lib/python3.12/site-packages/huggingface_hub/file_download.py:1723, in _download_to_tmp_and_move(incomplete_path, destination_path, url_to_download, proxies, headers, expected_size, filename, force_download, etag, xet_file_data)
1721 if xet_file_data is not None and is_xet_available():
1722 logger.debug("Xet Storage is enabled for this repo. Downloading file from Xet Storage..")
-> 1723 xet_get(
1724 incomplete_path=incomplete_path,
1725 xet_file_data=xet_file_data,
1726 headers=headers,
1727 expected_size=expected_size,
1728 displayed_filename=filename,
1729 )
1730 else:
1731 if xet_file_data is not None and not constants.HF_HUB_DISABLE_XET:
File /usr/local/lib/python3.12/site-packages/huggingface_hub/file_download.py:615, in xet_get(incomplete_path, xet_file_data, headers, expected_size, displayed_filename, _tqdm_bar)
612 if len(displayed_filename) > 40:
613 displayed_filename = f"{displayed_filename[:40]}(…)"
--> 615 progress_cm = _get_progress_bar_context(
616 desc=displayed_filename,
617 log_level=logger.getEffectiveLevel(),
618 total=expected_size,
619 initial=0,
620 name="huggingface_hub.xet_get",
621 _tqdm_bar=_tqdm_bar,
622 )
624 with progress_cm as progress:
626 def progress_updater(progress_bytes: float):
File /usr/local/lib/python3.12/site-packages/huggingface_hub/utils/tqdm.py:299, in _get_progress_bar_context(desc, log_level, total, initial, unit, unit_scale, name, _tqdm_bar)
294 return nullcontext(_tqdm_bar)
295 # ^ `contextlib.nullcontext` mimics a context manager that does nothing
296 # Makes it easier to use the same code path for both cases but in the later
297 # case, the progress bar is not closed when exiting the context manager.
--> 299 return tqdm(
300 unit=unit,
301 unit_scale=unit_scale,
302 total=total,
303 initial=initial,
304 desc=desc,
305 disable=is_tqdm_disabled(log_level=log_level),
306 name=name,
307 )
File /usr/local/lib/python3.12/site-packages/huggingface_hub/utils/tqdm.py:225, in tqdm.__init__(self, *args, **kwargs)
223 if are_progress_bars_disabled(name):
224 kwargs["disable"] = True
--> 225 super().__init__(*args, **kwargs)
File /usr/local/lib/python3.12/site-packages/tqdm/notebook.py:234, in tqdm_notebook.__init__(self, *args, **kwargs)
232 unit_scale = 1 if self.unit_scale is True else self.unit_scale or 1
233 total = self.total * unit_scale if self.total else self.total
--> 234 self.container = self.status_printer(self.fp, total, self.desc, self.ncols)
235 self.container.pbar = proxy(self)
236 self.displayed = False
File /usr/local/lib/python3.12/site-packages/tqdm/notebook.py:110, in tqdm_notebook.status_printer(_, total, desc, ncols)
108 raise ImportError(WARN_NOIPYW)
109 if total:
--> 110 pbar = IProgress(min=0, max=total)
111 else: # No total? Show info style bar with no progress tqdm status
112 pbar = IProgress(min=0, max=1)
File /usr/local/lib/python3.12/site-packages/ipywidgets/widgets/widget_float.py:26, in _Float.__init__(self, value, **kwargs)
24 if value is not None:
25 kwargs['value'] = value
---> 26 super().__init__(**kwargs)
File /usr/local/lib/python3.12/site-packages/ipywidgets/widgets/widget_description.py:35, in DescriptionWidget.__init__(self, *args, **kwargs)
33 kwargs.setdefault('tooltip', kwargs['description_tooltip'])
34 del kwargs['description_tooltip']
---> 35 super().__init__(*args, **kwargs)
File /usr/local/lib/python3.12/site-packages/ipywidgets/widgets/widget.py:506, in Widget.__init__(self, **kwargs)
503 super().__init__(**kwargs)
505 Widget._call_widget_constructed(self)
--> 506 self.open()
File /usr/local/lib/python3.12/site-packages/ipywidgets/widgets/widget.py:525, in Widget.open(self)
523 """Open a comm to the frontend if one isn't already open."""
524 if self.comm is None:
--> 525 state, buffer_paths, buffers = _remove_buffers(self.get_state())
527 args = dict(target_name='jupyter.widget',
528 data={'state': state, 'buffer_paths': buffer_paths},
529 buffers=buffers,
530 metadata={'version': __protocol_version__}
531 )
532 if self._model_id is not None:
File /usr/local/lib/python3.12/site-packages/ipywidgets/widgets/widget.py:615, in Widget.get_state(self, key, drop_defaults)
613 for k in keys:
614 to_json = self.trait_metadata(k, 'to_json', self._trait_to_json)
--> 615 value = to_json(getattr(self, k), self)
616 if not drop_defaults or not self._compare(value, traits[k].default_value):
617 state[k] = value
File /usr/local/lib/python3.12/site-packages/traitlets/traitlets.py:687, in TraitType.__get__(self, obj, cls)
685 return self
686 else:
--> 687 return t.cast(G, self.get(obj, cls))
File /usr/local/lib/python3.12/site-packages/traitlets/traitlets.py:635, in TraitType.get(self, obj, cls)
632 value = obj._trait_values[self.name]
633 except KeyError:
634 # Check for a dynamic initializer.
--> 635 default = obj.trait_defaults(self.name)
636 if default is Undefined:
637 warn(
638 "Explicit using of Undefined as the default value "
639 "is deprecated in traitlets 5.0, and may cause "
(...) 642 stacklevel=2,
643 )
File /usr/local/lib/python3.12/site-packages/traitlets/traitlets.py:1897, in HasTraits.trait_defaults(self, *names, **metadata)
1894 raise TraitError(f"'{n}' is not a trait of '{type(self).__name__}' instances")
1896 if len(names) == 1 and len(metadata) == 0:
-> 1897 return t.cast(Sentinel, self._get_trait_default_generator(names[0])(self))
1899 trait_names = self.trait_names(**metadata)
1900 trait_names.extend(names)
File /usr/local/lib/python3.12/site-packages/traitlets/traitlets.py:602, in TraitType.default(self, obj)
600 return t.cast(G, self.default_value)
601 elif hasattr(self, "make_dynamic_default"):
--> 602 return t.cast(G, self.make_dynamic_default())
603 else:
604 # Undefined will raise in TraitType.get
605 return t.cast(G, self.default_value)
File /usr/local/lib/python3.12/site-packages/ipywidgets/widgets/trait_types.py:409, in InstanceDict.make_dynamic_default(self)
408 def make_dynamic_default(self):
--> 409 return self.klass(*(self.default_args or ()),
410 **(self.default_kwargs or {}))
File /usr/local/lib/python3.12/site-packages/ipywidgets/widgets/widget_layout.py:86, in Layout.__init__(self, **kwargs)
83 for side in ['top', 'right', 'bottom', 'left']:
84 kwargs.setdefault(f'border_{side}', border)
---> 86 super().__init__(**kwargs)
File /usr/local/lib/python3.12/site-packages/ipywidgets/widgets/widget.py:506, in Widget.__init__(self, **kwargs)
503 super().__init__(**kwargs)
505 Widget._call_widget_constructed(self)
--> 506 self.open()
File /usr/local/lib/python3.12/site-packages/ipywidgets/widgets/widget.py:535, in Widget.open(self)
532 if self._model_id is not None:
533 args['comm_id'] = self._model_id
--> 535 self.comm = comm.create_comm(**args)
File /usr/local/lib/python3.12/site-packages/ipywidgets/comm.py:33, in create_comm(*args, **kwargs)
31 return Comm(*args, **kwargs)
32 else:
---> 33 return comm.create_comm(*args, **kwargs)
File /usr/local/lib/python3.12/site-packages/ipykernel/ipkernel.py:52, in _create_comm(*args, **kwargs)
50 def _create_comm(*args, **kwargs):
51 """Create a new Comm."""
---> 52 return BaseComm(*args, **kwargs)
File /usr/local/lib/python3.12/site-packages/comm/base_comm.py:65, in BaseComm.__init__(self, target_name, data, metadata, buffers, comm_id, primary, target_module, topic, _open_data, _close_data, **kwargs)
61 self._closed = True
63 if self.primary:
64 # I am primary, open my peer.
---> 65 self.open(data=data, metadata=metadata, buffers=buffers)
66 else:
67 self._closed = False
File /usr/local/lib/python3.12/site-packages/comm/base_comm.py:103, in BaseComm.open(self, data, metadata, buffers)
101 comm_manager.register_comm(self)
102 try:
--> 103 self.publish_msg(
104 "comm_open",
105 data=data,
106 metadata=metadata,
107 buffers=buffers,
108 target_name=self.target_name,
109 target_module=self.target_module,
110 )
111 self._closed = False
112 except Exception:
File /usr/local/lib/python3.12/site-packages/ipykernel/comm/comm.py:42, in BaseComm.publish_msg(self, msg_type, data, metadata, buffers, **keys)
34 self.kernel = Kernel.instance()
36 assert self.kernel.session is not None
37 self.kernel.session.send(
38 self.kernel.iopub_socket,
39 msg_type,
40 content,
41 metadata=json_clean(metadata),
---> 42 parent=self.kernel.get_parent(),
43 ident=self.topic,
44 buffers=buffers,
45 )
File /usr/local/lib/python3.12/site-packages/ipykernel/kernelbase.py:797, in Kernel.get_parent(self, channel)
795 if channel == "control":
796 return self._control_parent
--> 797 return self._shell_parent.get()
LookupError: <ContextVar name='shell_parent' at 0x786cedd48f90>