Commit b252643: rm Chameleon's slow tokenizer

Parent commit: ae5d537
4 files changed (+8, -32 lines):

- src/transformers/models/auto/tokenization_auto.py
- src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py
- src/transformers/models/chameleon/processing_chameleon.py
- tests/models/chameleon/test_processor_chameleon.py

src/transformers/models/auto/tokenization_auto.py (1 addition, 4 deletions)

```diff
@@ -109,10 +109,7 @@
         ("canine", ("CanineTokenizer", None)),
         (
             "chameleon",
-            (
-                "LlamaTokenizer" if is_sentencepiece_available() else None,
-                "LlamaTokenizerFast" if is_tokenizers_available() else None,
-            ),
+            (None, "LlamaTokenizerFast" if is_tokenizers_available() else None),
         ),
         ("chinese_clip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
         (
```
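With the slow entry gone, `AutoTokenizer` can only resolve the fast tokenizer for Chameleon checkpoints. A minimal sketch of the resulting behavior, assuming only the mapping change above and reusing the checkpoint id from the tests further down:

```py
from transformers import AutoTokenizer

# The "chameleon" entry now maps to (None, "LlamaTokenizerFast"), i.e. there is
# no slow tokenizer class, so resolution always lands on the fast one.
tokenizer = AutoTokenizer.from_pretrained("leloy/Anole-7b-v0.1-hf")
print(type(tokenizer).__name__)  # expected: LlamaTokenizerFast
```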

src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py (6 additions, 6 deletions)

````diff
@@ -24,7 +24,7 @@
 
 from transformers import (
     ChameleonConfig,
-    ChameleonForCausalLM,
+    ChameleonForConditionalGeneration,
     ChameleonImageProcessor,
     ChameleonProcessor,
 )
@@ -49,10 +49,10 @@
 Thereafter, models can be loaded via:
 
 ```py
-from transformers import ChameleonForCausalLM, LlamaTokenizer
+from transformers import ChameleonForConditionalGeneration, LlamaTokenizerFast
 
-model = ChameleonForCausalLM.from_pretrained("/output/path")
-tokenizer = LlamaTokenizer.from_pretrained("/output/path")
+model = ChameleonForConditionalGeneration.from_pretrained("/output/path")
+tokenizer = LlamaTokenizerFast.from_pretrained("/output/path")
 ```
 
 Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
@@ -372,7 +372,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim):
         vocabulary_map=vocabulary_map,
     )
     with init_empty_weights():
-        model = ChameleonForCausalLM(config)
+        model = ChameleonForConditionalGeneration(config)
 
     model.load_state_dict(state_dict, assign=True, strict=False)
     model.save_pretrained(model_path, safe_serialization=True)
@@ -397,7 +397,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim):
     # taken from https://github.com/facebookresearch/chameleon/blob/7a72f40aa5f462965c8374f25257f55b65b25ff4/data/prompts_for_human_evaluations.jsonl
     print("Loading the checkpoint in a Chameleon model...")
     print("*" * 100)
-    model = ChameleonForCausalLM.from_pretrained(
+    model = ChameleonForConditionalGeneration.from_pretrained(
         model_path, attn_implementation="eager", torch_dtype=torch.bfloat16, device_map="auto"
     )
     processor = ChameleonProcessor.from_pretrained(model_path)
````
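The same rename applies when loading a converted checkpoint. A minimal sketch mirroring the updated docstring; `"/output/path"` is a placeholder for wherever the conversion script saved the model:

```py
from transformers import ChameleonForConditionalGeneration, ChameleonProcessor

# Load a converted checkpoint; the processor bundles the image processor and
# the (fast) tokenizer that the script saved alongside the weights.
model = ChameleonForConditionalGeneration.from_pretrained("/output/path")
processor = ChameleonProcessor.from_pretrained("/output/path")
```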

src/transformers/models/chameleon/processing_chameleon.py (1 addition, 1 deletion)

```diff
@@ -68,7 +68,7 @@ class ChameleonProcessor(ProcessorMixin):
     """
 
     attributes = ["image_processor", "tokenizer"]
-    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    tokenizer_class = "LlamaTokenizerFast"
     image_processor_class = "ChameleonImageProcessor"
 
     def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = "<image>"):
```
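`ProcessorMixin` reads `tokenizer_class` to decide which class to instantiate when loading a processor: a tuple names a (slow, fast) pair of candidates, while a plain string pins a single class. A small sketch of the effect, again assuming the checkpoint id used by the tests below:

```py
from transformers import ChameleonProcessor

# With tokenizer_class pinned to "LlamaTokenizerFast", loading the processor
# can only ever produce the fast tokenizer.
processor = ChameleonProcessor.from_pretrained("leloy/Anole-7b-v0.1-hf")
assert type(processor.tokenizer).__name__ == "LlamaTokenizerFast"
```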

tests/models/chameleon/test_processor_chameleon.py (0 additions, 21 deletions)

```diff
@@ -2,7 +2,6 @@
 import unittest
 
 from transformers import ChameleonProcessor
-from transformers.models.auto.processing_auto import processor_class_from_name
 
 from ...test_processing_common import ProcessorTesterMixin
 
@@ -11,26 +10,6 @@ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     from_pretrained_id = "leloy/Anole-7b-v0.1-hf"
     processor_class = ChameleonProcessor
 
-    def get_component(self, attribute, **kwargs):
-        assert attribute in self.processor_class.attributes
-        if attribute != "tokenizer":
-            return super().get_component(attribute, **kwargs)
-        # We use the fast tokenizer by default as the slow tokenizer expects the vocab file to be present in the loading directory.
-        # This vocab file is neither in the official repo nor does it get saved when we save the processor in `setUp` below.
-        component_class_name = getattr(self.processor_class, f"{attribute}_class")
-        if isinstance(component_class_name, tuple):
-            if "_fast" in component_class_name[0]:
-                component_class_name = component_class_name[0]
-            else:
-                component_class_name = component_class_name[1]
-
-        component_class = processor_class_from_name(component_class_name)
-        component = component_class.from_pretrained(self.tmpdirname, **kwargs)  # noqa
-        if attribute == "tokenizer" and not component.pad_token:
-            component.pad_token = "[TEST_PAD]"
-
-        return component
-
     def setUp(self):
         self.tmpdirname = tempfile.mkdtemp()
         processor = self.processor_class.from_pretrained(self.from_pretrained_id)
```
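The deleted override existed only to pick the fast class out of the old (slow, fast) tuple and to patch in a pad token; with `tokenizer_class` now a single string, the mixin's default component lookup resolves it directly. A quick sketch of why the tuple branch is dead (whether the default path also sets a pad token is not shown in this diff):

```py
from transformers import ChameleonProcessor

# tokenizer_class is a plain string after this commit, so the deleted
# isinstance(..., tuple) disambiguation has nothing left to choose between.
assert isinstance(ChameleonProcessor.tokenizer_class, str)
assert ChameleonProcessor.tokenizer_class == "LlamaTokenizerFast"
```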
