16
16
Processor class for Chameleon.
17
17
"""
18
18
19
- from typing import List , Optional , Union
19
+ import sys
20
+ from typing import List , Union
20
21
21
22
import numpy as np
22
23
23
24
from ...feature_extraction_utils import BatchFeature
24
25
from ...image_utils import ImageInput
25
- from ...processing_utils import ProcessorMixin
26
- from ...tokenization_utils_base import PaddingStrategy , PreTokenizedInput , TextInput , TruncationStrategy
26
+ from ...processing_utils import ProcessingKwargs , ProcessorMixin , TextKwargs
27
+ from ...tokenization_utils_base import PreTokenizedInput , TextInput
27
28
from ...utils import TensorType , is_vision_available
28
29
30
+ if sys .version_info >= (3 , 11 ):
31
+ from typing import Unpack
32
+ else :
33
+ from typing_extensions import Unpack
29
34
30
35
if is_vision_available ():
31
36
import PIL
32
37
33
38
39
class ChameleonTextKwargs(TextKwargs, total=False):
    """Text kwargs accepted by the Chameleon processor on top of the base ``TextKwargs``.

    Attributes:
        return_for_text_completion (`bool`, *optional*):
            Chameleon-specific flag forwarded through the processor's text kwargs.
            NOTE(review): appears to control whether the prompt is prepared for plain
            text completion (popped before tokenization in ``__call__``) — confirm
            against the processor's ``__call__`` implementation.
    """

    return_for_text_completion: bool
41
+
42
+
43
class ChameleonProcessorKwargs(ProcessingKwargs, total=False):
    """Keyword-argument schema for the Chameleon processor.

    Overrides the ``text_kwargs`` slot with :class:`ChameleonTextKwargs` and
    declares the processor defaults consumed by ``_merge_kwargs``: no padding,
    zero stride, text-completion mode off, and PyTorch tensors returned.
    """

    text_kwargs: ChameleonTextKwargs

    _defaults = {
        "text_kwargs": {
            "padding": False,
            "stride": 0,
            "return_for_text_completion": False,
        },
        "common_kwargs": {
            "return_tensors": TensorType.PYTORCH,
        },
    }
55
+
56
+
34
57
class ChameleonProcessor (ProcessorMixin ):
35
58
r"""
36
59
Constructs a Chameleon processor which wraps a Chameleon image processor and a Chameleon tokenizer into a single
@@ -65,11 +88,7 @@ def __call__(
65
88
self ,
66
89
text : Union [TextInput , PreTokenizedInput , List [TextInput ], List [PreTokenizedInput ]] = None ,
67
90
images : ImageInput = None ,
68
- padding : Union [bool , str , PaddingStrategy ] = False ,
69
- truncation : Union [bool , str , TruncationStrategy ] = None ,
70
- max_length : int = None ,
71
- return_tensors : Optional [Union [str , TensorType ]] = TensorType .PYTORCH ,
72
- return_for_text_completion : bool = False ,
91
+ ** kwargs : Unpack [ChameleonProcessorKwargs ],
73
92
) -> BatchFeature :
74
93
"""
75
94
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
@@ -86,26 +105,6 @@ def __call__(
86
105
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
87
106
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
88
107
tensor. Both channels-first and channels-last formats are supported.
89
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
90
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
91
- index) among:
92
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
93
- sequence if provided).
94
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
95
- acceptable input length for the model if that argument is not provided.
96
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
97
- lengths).
98
- max_length (`int`, *optional*):
99
- Maximum length of the returned list and optionally padding length (see above).
100
- truncation (`bool`, *optional*):
101
- Activates truncation to cut input sequences longer than `max_length` to `max_length`.
102
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
103
- If set, will return tensors of a particular framework. Acceptable values are:
104
-
105
- - `'tf'`: Return TensorFlow `tf.constant` objects.
106
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
107
- - `'np'`: Return NumPy `np.ndarray` objects.
108
- - `'jax'`: Return JAX `jnp.ndarray` objects.
109
108
110
109
Returns:
111
110
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
@@ -120,6 +119,15 @@ def __call__(
120
119
text = [text ]
121
120
elif not isinstance (text , list ) and not isinstance (text [0 ], str ):
122
121
raise ValueError ("Invalid input text. Please provide a string, or a list of strings" )
122
+ if text is None and images is None :
123
+ raise ValueError ("You must provide either text or images" )
124
+
125
+ output_kwargs = self ._merge_kwargs (
126
+ ChameleonProcessorKwargs ,
127
+ tokenizer_init_kwargs = self .tokenizer .init_kwargs ,
128
+ ** kwargs ,
129
+ )
130
+ return_for_text_completion = output_kwargs ["text_kwargs" ].pop ("return_for_text_completion" , False )
123
131
124
132
# Replace the image token with the expanded image token sequence
125
133
prompt_strings = []
@@ -130,19 +138,12 @@ def __call__(
130
138
sample += self .tokenizer .sep_token # special Chameleon treatment to add sep for chat mode
131
139
prompt_strings .append (sample )
132
140
133
- data = self .tokenizer (
134
- prompt_strings ,
135
- return_tensors = return_tensors ,
136
- padding = padding ,
137
- truncation = truncation ,
138
- max_length = max_length ,
139
- )
141
+ data = self .tokenizer (prompt_strings , ** output_kwargs ["text_kwargs" ])
140
142
141
143
if images is not None :
142
- pixel_values = self .image_processor (images , return_tensors = return_tensors )["pixel_values" ]
143
- data ["pixel_values" ] = pixel_values
144
+ data ["pixel_values" ] = self .image_processor (images , ** output_kwargs ["images_kwargs" ])["pixel_values" ]
144
145
145
- return BatchFeature (data = data , tensor_type = return_tensors )
146
+ return BatchFeature (data = data , tensor_type = output_kwargs [ "common_kwargs" ][ " return_tensors" ] )
146
147
147
148
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
148
149
def batch_decode (self , * args , ** kwargs ):
0 commit comments