@@ -39,7 +39,8 @@ function canSetUnknownToken(test)
39
39
tok = bert .tokenizer .internal .WordPieceTokenizer(enc ,' UnknownToken' ,unk );
40
40
test .verifyEqual(tok .Unk ,unk )
41
41
str = " blah" ;
42
- act_out = tok .tokenize(str );
42
+ ustr = textanalytics .unicode .UTF32(str );
43
+ act_out = tok .tokenize(ustr );
43
44
exp_out = unk ;
44
45
test .verifyEqual(act_out ,exp_out );
45
46
end
@@ -50,7 +51,8 @@ function canSetMaxTokenLength(test)
50
51
tok = bert .tokenizer .internal .WordPieceTokenizer(enc ,' MaxTokenLength' ,maxLen );
51
52
test .verifyEqual(tok .MaxChar ,maxLen );
52
53
str = " foo" ;
53
- act_out = tok .tokenize(str );
54
+ ustr = textanalytics .unicode .UTF32(str );
55
+ act_out = tok .tokenize(ustr );
54
56
exp_out = tok .Unk ;
55
57
test .verifyEqual(act_out ,exp_out );
56
58
end
@@ -59,7 +61,9 @@ function canTokenize(test)
59
61
enc = wordEncoding([" foo" ," bar" ," ##foo" ]);
60
62
tok = bert .tokenizer .internal .WordPieceTokenizer(enc );
61
63
str = " foo bar foobar barba bafoobar barfoo" ;
62
- act_out = tok .tokenize(str );
64
+ wsTok = bert .tokenizer .internal .WhitespaceTokenizer ;
65
+ ustr = textanalytics .unicode .UTF32(wsTok .tokenize(str ));
66
+ act_out = tok .tokenize(ustr );
63
67
exp_out = [" foo" ," bar" ,tok .Unk ,tok .Unk ,tok .Unk ," bar" ," ##foo" ];
64
68
test .verifyEqual(act_out ,exp_out );
65
69
end
0 commit comments