-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathQWenTokenizer.cs
72 lines (64 loc) · 2.11 KB
/
QWenTokenizer.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
using Microsoft.DeepDev;
using Microsoft.KernelMemory.AI;
namespace Cnblogs.KernelMemory.AI.DashScope;
/// <summary>
/// <see cref="ITextTokenizer"/> implementation for QWen, backed by a single shared tokenizer
/// that is built from the embedded BPE file.
/// </summary>
public class QWenTokenizer : ITextTokenizer
{
    /// <summary>
    /// Id given to the first special token; every following special token gets the next consecutive id.
    /// </summary>
    private const int FirstSpecialTokenId = 151643;

    /// <summary>
    /// Number of numbered "extra" special tokens registered after the named ones.
    /// </summary>
    private const int ExtraTokenCount = 205;

    /// <summary>
    /// Pattern string supplied as the third argument when the tokenizer is built.
    /// </summary>
    private const string Pattern =
        @"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+";

    // Declared before Tokenizer on purpose: static initializers run in textual order,
    // and the tokenizer is built from this map.
    private static readonly Dictionary<string, int> SpecialTokens = CreateSpecialTokens();

    private static readonly ITokenizer Tokenizer = TokenizerBuilder.CreateTokenizer(
        DashScopeEmbeddedResource.ReadBpeFile(),
        SpecialTokens,
        Pattern);

    /// <summary>
    /// Converts text into token ids.
    /// </summary>
    /// <remarks>
    /// Calls the tokenizer with <c>false</c> as its second argument, whereas the counting and
    /// token-listing members rely on that argument's default value.
    /// NOTE(review): presumably this turns off special-token handling - confirm against the tokenizer library.
    /// </remarks>
    /// <param name="text">The text to be encoded.</param>
    /// <returns>The token ids produced by the tokenizer for <paramref name="text"/>.</returns>
    public static List<int> Encode(string text) => Tokenizer.Encode(text, false);

    /// <summary>
    /// Converts token ids back into text.
    /// </summary>
    /// <param name="tokens">The token ids to be decoded.</param>
    /// <returns>The string the tokenizer produces for <paramref name="tokens"/>.</returns>
    public static string Decode(int[] tokens) => Tokenizer.Decode(tokens);

    /// <inheritdoc />
    public int CountTokens(string text) => CountTokensStatic(text);

    /// <inheritdoc />
    public IReadOnlyList<string> GetTokens(string text) => GetTokensStatic(text);

    /// <summary>
    /// Counts the tokens in a piece of text without needing an instance.
    /// </summary>
    /// <param name="text">The text to be tokenized.</param>
    /// <returns>The number of token ids the tokenizer produces for <paramref name="text"/>.</returns>
    public static int CountTokensStatic(string text)
    {
        var tokenIds = Tokenizer.Encode(text);
        return tokenIds.Count;
    }

    /// <summary>
    /// Splits text into its tokens, rendered as strings, without needing an instance.
    /// </summary>
    /// <param name="text">The text to tokenize.</param>
    /// <returns>
    /// One string per token id, in encoding order; each id is decoded on its own.
    /// </returns>
    public static IReadOnlyList<string> GetTokensStatic(string text)
    {
        var tokenIds = Tokenizer.Encode(text);
        var pieces = new List<string>();
        foreach (var tokenId in tokenIds)
        {
            // Decode a one-element array so every id yields its own string.
            pieces.Add(Tokenizer.Decode([tokenId]));
        }

        return pieces;
    }

    /// <summary>
    /// Builds the special-token map: the named tokens first, then the numbered extra tokens,
    /// with ids assigned consecutively starting at <see cref="FirstSpecialTokenId"/>.
    /// </summary>
    /// <returns>A dictionary from special-token text to its id.</returns>
    private static Dictionary<string, int> CreateSpecialTokens()
    {
        string[] namedTokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"];

        var map = new Dictionary<string, int>();
        var nextId = FirstSpecialTokenId;

        foreach (var token in namedTokens)
        {
            map.Add(token, nextId);
            nextId++;
        }

        for (var extra = 0; extra < ExtraTokenCount; extra++)
        {
            map.Add($"<|extra_{extra}|>", nextId);
            nextId++;
        }

        return map;
    }
}