Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,081 changes: 1,057 additions & 24 deletions Cargo.lock

Large diffs are not rendered by default.

9 changes: 8 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,19 @@ crate-type = ["staticlib"]

[dependencies]
libc = "0.2.162"
tokenizers = {version = "0.20.2" }
tokenizers = { version = "0.20.0", features = ["http"] }
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason for the version downgrade? Is something broken in 0.20.2?

tiktoken-rs = "0.7.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.140"
base64 = "0.22"
rustc-hash = "1.1.0"
minijinja = { version = "2.2.0", features = ["json", "loop_controls"] }
minijinja-contrib = { version = "2.0.2", features = ["pycompat"] }
chrono = "0.4"
tracing = "0.1"
anyhow = "1.0"
thiserror = "1.0.48"
utoipa = { version = "4.2.0", features = ["axum_extras"] }

[dev-dependencies]
criterion = { version = "0.5.1", features = ["html_reports"] }
Expand Down
2 changes: 1 addition & 1 deletion example/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# syntax=docker/dockerfile:1.3

FROM golang:1.21 as builder-go
FROM golang:1.21.5 as builder-go
ARG TARGETPLATFORM
ARG VERSION=v0.6.0
WORKDIR /workspace
Expand Down
72 changes: 72 additions & 0 deletions example/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,72 @@ func tiktoken() error {
return nil
}

// chatTemplateDeepSeek builds a chat template from the DeepSeek-R1
// tokenizer_config.json fixture, applies it to a fixed four-message
// conversation, and prints the result to stdout.
//
// An error from creating or applying the template is printed and returned.
func chatTemplateDeepSeek() error {
	// Conversation to render: a JSON array of role/content messages.
	conversation := `[{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "Hello!"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "What can you do?"
}
]`

	chat, err := tokenizers.NewChatTemplate("test/data/deepseek-ai/DeepSeek-R1/tokenizer_config.json")
	if err != nil {
		fmt.Printf("NewChatTemplate error: %v\n", err)
		return err
	}
	defer chat.Close()

	// The two remaining arguments are passed as empty strings.
	rendered, err := chat.ApplyChatTemplate(conversation, "", "")
	if err != nil {
		fmt.Printf("Failed to apply chat template: %v", err)
		return err
	}

	fmt.Println(rendered)
	return nil
}

// chatTemplateQwen3 builds a chat template from the Qwen3-235B-A22B
// tokenizer_config.json fixture, applies it to a fixed four-message
// conversation, and prints the result to stdout.
//
// An error from creating or applying the template is printed and returned,
// so the caller's error check can actually observe a failure.
func chatTemplateQwen3() error {
	template := "test/data/Qwen/Qwen3-235B-A22B/tokenizer_config.json"
	ct, err := tokenizers.NewChatTemplate(template)
	if err != nil {
		fmt.Printf("Failed to create chat template: %v", err)
		return err
	}
	defer ct.Close()

	// Conversation to render: a JSON array of role/content messages.
	messagesStr := `[{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "hello!"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "What can you do?"
}
]`

	result, err := ct.ApplyChatTemplate(messagesStr, "", "")
	if err != nil {
		fmt.Printf("Failed to apply chat template: %v", err)
		// Propagate the failure instead of printing a meaningless result
		// and reporting success.
		return err
	}
	fmt.Println(result)
	return nil
}

func main() {
if err := simple(); err != nil {
log.Fatal(err)
Expand All @@ -154,4 +220,10 @@ func main() {
if err := tiktoken(); err != nil {
log.Fatal(err)
}
if err := chatTemplateDeepSeek(); err != nil {
log.Fatal(err)
}
if err := chatTemplateQwen3(); err != nil {
log.Fatal(err)
}
}
7 changes: 6 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,15 @@ module github.com/daulet/tokenizers

go 1.18

require github.com/stretchr/testify v1.8.2
require (
github.com/stretchr/testify v1.8.2
github.com/tidwall/gjson v1.18.0
)

require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/tidwall/match v1.1.1 // indirect
github.com/tidwall/pretty v1.2.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8=
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs=
github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
Expand Down
7 changes: 4 additions & 3 deletions release/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# syntax=docker/dockerfile:1.3
# syntax=docker/dockerfile:1.4

FROM rust:1.87 as builder-rust
ARG TARGETPLATFORM
Expand All @@ -9,7 +9,7 @@ COPY ./Cargo.toml ./Cargo.toml
COPY ./Cargo.lock ./Cargo.lock
RUN cargo build --release

FROM golang:1.21 as builder-go
FROM golang:1.21.5 as builder-go
ARG DOCKER_TARGETPLATFORM
WORKDIR /workspace
COPY ./release/go.mod .
Expand All @@ -21,4 +21,5 @@ COPY --from=builder-rust \
/workspace/target/release/libtokenizers.a \
./tokenizers/lib/${DOCKER_TARGETPLATFORM}/
COPY ./test/data ./test/data
RUN go run -ldflags="-extldflags '-L./tokenizers/lib/${DOCKER_TARGETPLATFORM}'" .
RUN go mod tidy
CMD go run -ldflags="-extldflags '-L./tokenizers/lib/${DOCKER_TARGETPLATFORM}'" .
10 changes: 10 additions & 0 deletions release/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
github.com/daulet/tokenizers v1.22.1 h1:3wzAFIxfgRuqGKka8xdkeTbctDmmqOOs12GofqdorpM=
github.com/daulet/tokenizers v1.22.1/go.mod h1:tGnMdZthXdcWY6DGD07IygpwJqiPvG85FQUnhs/wSCs=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8=
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
77 changes: 75 additions & 2 deletions release/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,72 @@ import (
"github.com/daulet/tokenizers/release/tokenizers"
)

// chatTemplateDeepSeek builds a chat template from the DeepSeek-R1
// tokenizer_config.json fixture, applies it to a fixed four-message
// conversation, and prints the labelled result to stdout.
//
// An error from creating or applying the template is printed and returned.
func chatTemplateDeepSeek() error {
	// Conversation to render: a JSON array of role/content messages.
	conversation := `[{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "How are you?"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "What can you do?"
}
]`

	chat, err := tokenizers.NewChatTemplate("test/data/deepseek-ai/DeepSeek-R1/tokenizer_config.json")
	if err != nil {
		fmt.Printf("NewChatTemplate error: %v\n", err)
		return err
	}
	defer chat.Close()

	// The two remaining arguments are passed as empty strings.
	rendered, err := chat.ApplyChatTemplate(conversation, "", "")
	if err != nil {
		fmt.Printf("Failed to apply chat template: %v", err)
		return err
	}

	fmt.Printf("apply chat_template for DeepSeek: %v", rendered)
	return nil
}

// chatTemplateQwen3 builds a chat template from the Qwen3-235B-A22B
// tokenizer_config.json fixture, applies it to a fixed four-message
// conversation, and prints the labelled result to stdout.
//
// An error from creating or applying the template is printed and returned,
// so the caller's error check can actually observe a failure.
func chatTemplateQwen3() error {
	template := "test/data/Qwen/Qwen3-235B-A22B/tokenizer_config.json"
	ct, err := tokenizers.NewChatTemplate(template)
	if err != nil {
		fmt.Printf("Failed to create chat template: %v", err)
		return err
	}
	defer ct.Close()

	// Conversation to render: a JSON array of role/content messages.
	messagesStr := `[{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "How are you?"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "What can you do?"
}
]`

	result, err := ct.ApplyChatTemplate(messagesStr, "", "")
	if err != nil {
		fmt.Printf("Failed to apply chat template: %v", err)
		// Propagate the failure instead of printing a meaningless result
		// and reporting success.
		return err
	}
	fmt.Printf("apply chat_template for qwen3: %v", result)
	return nil
}

func main() {
tk, err := tokenizers.FromFile("./test/data/bert-base-uncased.json")
if err != nil {
Expand All @@ -15,10 +81,17 @@ func main() {
defer tk.Close()
fmt.Println("Vocab size:", tk.VocabSize())
// Vocab size: 30522
fmt.Println(tk.Encode("brown fox jumps over the lazy dog", false))
// [2829 4419 14523 2058 1996 13971 3899] [brown fox jumps over the lazy dog]
fmt.Println(tk.Encode("brown fox jumps over the lazy dog", true))
// [101 2829 4419 14523 2058 1996 13971 3899 102] [[CLS] brown fox jumps over the lazy dog [SEP]]
fmt.Println(tk.Decode([]uint32{2829, 4419, 14523, 2058, 1996, 13971, 3899}, true))
fmt.Println(tk.Decode([]uint32{111308, 3837, 35946, 101922, 30534, 100134, 104811}, true))
// brown fox jumps over the lazy dog

if err = chatTemplateDeepSeek(); err != nil {
panic(err)
}
if err = chatTemplateQwen3(); err != nil {
panic(err)
}

}
Loading
Loading