Skip to content

Commit 181d6e9

Browse files
committed
better source page metadata
1 parent a105d1f commit 181d6e9

File tree

2 files changed

+30
-9
lines changed

2 files changed

+30
-9
lines changed

src/doc_builder/build_embeddings.py

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@
3939
from .utils import chunk_list, read_doc_config
4040

4141

42-
Chunk = namedtuple("Chunk", "text source package_name")
43-
Embedding = namedtuple("Embedding", "text source package_name embedding")
42+
Chunk = namedtuple("Chunk", "text source_page_url source_page_title package_name")
43+
Embedding = namedtuple("Embedding", "text source_page_url source_page_title package_name embedding")
4444

4545
MEILI_INDEX = "docs-embed"
4646
MEILI_INDEX_TEMP = "docs-embed-temp"
@@ -98,7 +98,8 @@ def get_chunks(self, page_info, chunk_len_chars, prefix=[]):
9898
chunks.append(
9999
Chunk(
100100
text=prefix_str.strip() + "\n\n" + chunk_str.strip(),
101-
source=f"{page_info['page']}#{self.anchor}",
101+
source_page_url=f"https://huggingface.co/docs/{page_info['package_name']}/{page_info['page']}#{self.anchor}",
102+
source_page_title=get_page_title(page_info["page"]),
102103
package_name=page_info["package_name"],
103104
)
104105
)
@@ -109,7 +110,8 @@ def get_chunks(self, page_info, chunk_len_chars, prefix=[]):
109110
chunks.append(
110111
Chunk(
111112
text=prefix_str.strip() + "\n\n" + chunk_str.strip(),
112-
source=f"{page_info['page']}#{self.anchor}",
113+
source_page_url=f"https://huggingface.co/docs/{page_info['package_name']}/{page_info['page']}#{self.anchor}",
114+
source_page_title=get_page_title(page_info["page"]),
113115
package_name=page_info["package_name"],
114116
)
115117
)
@@ -198,7 +200,8 @@ def create_autodoc_chunks(content, package, return_anchors=False, page_info=None
198200
object_doc_chunks = [
199201
Chunk(
200202
text=od["doc"],
201-
source=f"{page_info['page']}#{od['anchor_name']}",
203+
source_page_url=f"https://huggingface.co/docs/{page_info['package_name']}/{page_info['page']}#{od['anchor_name']}",
204+
source_page_title=get_page_title(page_info["page"]),
202205
package_name=page_info["package_name"],
203206
)
204207
for od in object_docs
@@ -270,6 +273,19 @@ def clean_md(text):
270273
return text.strip()
271274

272275

276+
def get_page_title(path: str):
    """
    Build a human-readable page title from the path of a doc page.

    Only the final "/"-separated segment of `path` is used. Underscores in that
    segment become spaces, then `str.capitalize` upper-cases the first character
    and lower-cases every other character.
    Example: "api/schedulers/lms_discrete" -> "Lms discrete"
    """
    # rpartition yields the text after the last "/" (or the whole path if there is none)
    _, _, page_slug = path.rpartition("/")
    # Splitting on "_" and re-joining with " " swaps every underscore for a space
    words = page_slug.split("_")
    return " ".join(words).capitalize()
287+
288+
273289
_re_autodoc_all = re.compile(r"(\[\[autodoc\]\]\s+[\w\.]+(?:\n\s+-\s+\w+)*\b)", re.DOTALL)
274290

275291

@@ -366,7 +382,13 @@ def chunks_to_embeddings(client, chunks) -> List[Embedding]:
366382
inference_output = client.feature_extraction(texts, truncate=True)
367383
inference_output = inference_output.tolist()
368384
embeddings = [
369-
Embedding(text=c.text, source=c.source, package_name=c.package_name, embedding=embed)
385+
Embedding(
386+
text=c.text,
387+
source_page_url=c.source_page_url,
388+
source_page_title=c.source_page_title,
389+
package_name=c.package_name,
390+
embedding=embed,
391+
)
370392
for c, embed in zip(chunks, inference_output)
371393
]
372394
return embeddings
@@ -461,8 +483,6 @@ def build_embeddings(
461483
is_python_module=is_python_module,
462484
)
463485

464-
# return
465-
466486
# Step 2: create embeddings
467487
embeddings = call_embedding_inference(chunks, hf_ie_name, hf_ie_namespace, hf_ie_token)
468488

src/doc_builder/meilisearch_helper.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,8 @@ def add_embeddings_to_db(client: Client, index_name: str, embeddings):
9090
{
9191
"id": hash_text_sha1(e.text),
9292
"text": e.text,
93-
"source": e.source,
93+
"source_page_url": e.source_page_url,
94+
"source_page_title": e.source_page_title,
9495
"library": e.package_name,
9596
"_vectors": {VECOR_NAME: e.embedding},
9697
}

0 commit comments

Comments
 (0)