39
39
from .utils import chunk_list , read_doc_config
40
40
41
41
42
- Chunk = namedtuple ("Chunk" , "text source package_name" )
43
- Embedding = namedtuple ("Embedding" , "text source package_name embedding" )
42
+ Chunk = namedtuple ("Chunk" , "text source_page_url source_page_title package_name" )
43
+ Embedding = namedtuple ("Embedding" , "text source_page_url source_page_title package_name embedding" )
44
44
45
45
MEILI_INDEX = "docs-embed"
46
46
MEILI_INDEX_TEMP = "docs-embed-temp"
@@ -98,7 +98,8 @@ def get_chunks(self, page_info, chunk_len_chars, prefix=[]):
98
98
chunks .append (
99
99
Chunk (
100
100
text = prefix_str .strip () + "\n \n " + chunk_str .strip (),
101
- source = f"{ page_info ['page' ]} #{ self .anchor } " ,
101
+ source_page_url = f"https://huggingface.co/docs/{ page_info ['package_name' ]} /{ page_info ['page' ]} #{ self .anchor } " ,
102
+ source_page_title = get_page_title (page_info ["page" ]),
102
103
package_name = page_info ["package_name" ],
103
104
)
104
105
)
@@ -109,7 +110,8 @@ def get_chunks(self, page_info, chunk_len_chars, prefix=[]):
109
110
chunks .append (
110
111
Chunk (
111
112
text = prefix_str .strip () + "\n \n " + chunk_str .strip (),
112
- source = f"{ page_info ['page' ]} #{ self .anchor } " ,
113
+ source_page_url = f"https://huggingface.co/docs/{ page_info ['package_name' ]} /{ page_info ['page' ]} #{ self .anchor } " ,
114
+ source_page_title = get_page_title (page_info ["page" ]),
113
115
package_name = page_info ["package_name" ],
114
116
)
115
117
)
@@ -198,7 +200,8 @@ def create_autodoc_chunks(content, package, return_anchors=False, page_info=None
198
200
object_doc_chunks = [
199
201
Chunk (
200
202
text = od ["doc" ],
201
- source = f"{ page_info ['page' ]} #{ od ['anchor_name' ]} " ,
203
+ source_page_url = f"https://huggingface.co/docs/{ page_info ['package_name' ]} /{ page_info ['page' ]} #{ od ['anchor_name' ]} " ,
204
+ source_page_title = get_page_title (page_info ["page" ]),
202
205
package_name = page_info ["package_name" ],
203
206
)
204
207
for od in object_docs
@@ -270,6 +273,19 @@ def clean_md(text):
270
273
return text .strip ()
271
274
272
275
276
def get_page_title(path: str) -> str:
    """
    Given a path to a doc page, generate the doc page title.

    The title is the last "/"-separated segment of the path, with underscores
    replaced by spaces and only the first character upper-cased
    (``str.capitalize`` lower-cases the remaining characters).

    Example: "api/schedulers/lms_discrete" -> "Lms discrete"

    Args:
        path: Slash-separated path of the doc page.

    Returns:
        The human-readable page title, or an empty string when the path
        contains no segment (e.g. "" or "/").
    """
    # Strip trailing slashes first, so that "api/schedulers/" yields
    # "Schedulers" instead of an empty title.
    last_part = path.rstrip("/").split("/")[-1]
    # Replace underscores with spaces, then capitalize the first letter.
    return last_part.replace("_", " ").capitalize()
287
+
288
+
273
289
_re_autodoc_all = re .compile (r"(\[\[autodoc\]\]\s+[\w\.]+(?:\n\s+-\s+\w+)*\b)" , re .DOTALL )
274
290
275
291
@@ -366,7 +382,13 @@ def chunks_to_embeddings(client, chunks) -> List[Embedding]:
366
382
inference_output = client .feature_extraction (texts , truncate = True )
367
383
inference_output = inference_output .tolist ()
368
384
embeddings = [
369
- Embedding (text = c .text , source = c .source , package_name = c .package_name , embedding = embed )
385
+ Embedding (
386
+ text = c .text ,
387
+ source_page_url = c .source_page_url ,
388
+ source_page_title = c .source_page_title ,
389
+ package_name = c .package_name ,
390
+ embedding = embed ,
391
+ )
370
392
for c , embed in zip (chunks , inference_output )
371
393
]
372
394
return embeddings
@@ -461,8 +483,6 @@ def build_embeddings(
461
483
is_python_module = is_python_module ,
462
484
)
463
485
464
- # return
465
-
466
486
# Step 2: create embeddings
467
487
embeddings = call_embedding_inference (chunks , hf_ie_name , hf_ie_namespace , hf_ie_token )
468
488
0 commit comments