
Commit 444e05a

Tokenization notebook: change title (#178) and minor refactoring
1 parent fb3ea3a

1 file changed (+4, -3)

notebooks/document-chunking/tokenization.ipynb (+4, -3)
@@ -7,7 +7,7 @@
   "id": "s49gpkvZ7q53"
  },
  "source": [
-  "# Tokenization for Semantic Search (ELSER and E5)\n",
+  "# Calculating tokens for Semantic Search (ELSER and E5)\n",
   "\n",
   "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/search/tokenization.ipynb)\n",
   "\n",
@@ -217,9 +217,10 @@
   "outputs": [],
   "source": [
   "SEMANTIC_SEARCH_TOKEN_LIMIT = 510  # 512 minus space for the 2 special tokens\n",
+  "ELSER_TOKEN_OVERLAP = 0.5  # 50% token overlap between chunks is recommended for ELSER\n",
   "\n",
-  "def chunk(tokens, chunk_size=SEMANTIC_SEARCH_TOKEN_LIMIT):\n",
-  "    step_size = round(chunk_size * .5)  # 50% token overlap between chunks is recommended for ELSER\n",
+  "def chunk(tokens, chunk_size=SEMANTIC_SEARCH_TOKEN_LIMIT, overlap_ratio=ELSER_TOKEN_OVERLAP):\n",
+  "    step_size = round(chunk_size * overlap_ratio)\n",
   "\n",
   "    for i in range(0, len(tokens), step_size):\n",
   "        yield tokens[i:i+chunk_size]"

0 commit comments