Skip to content

Parallel mdx building #594

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/doc_builder/autodoc.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,6 @@ def regex_closure(object_doc, regex):


def get_signature_component_svelte(name, anchor, signature, object_doc, source_link=None, is_getset_desc=False):
print("some things svelte")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(this one was polluting the CI runs :) )

"""
Returns the svelte `Docstring` component string.

Expand Down
159 changes: 107 additions & 52 deletions src/doc_builder/build_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import re
import shutil
import zlib
from multiprocessing import Pool
from pathlib import Path

import yaml
Expand Down Expand Up @@ -156,9 +157,94 @@ def resolve_autodoc(content, package, return_anchors=False, page_info=None, vers
return (new_content, anchors, source_files, errors) if return_anchors else new_content


def _process_single_mdx_file(file_info: tuple) -> dict:
    """
    Worker function to process a single documentation source file with multiprocessing.

    Args:
        file_info (tuple):
            Tuple containing file information
            (file_path, doc_folder, output_dir, page_info, version_tag_suffix).
            Paths are passed as strings so the tuple is cheap to pickle.

    Returns:
        dict: Dictionary with the processed results for this file, under the keys
            `file` (str path of the source file), `new_anchors`, `errors` and
            `source_files` (all `None` when the file was copied verbatim or skipped).
            Any exception raised during processing is captured and reported through
            `errors` instead of propagating, so one bad file does not kill the pool.
    """
    file_path, doc_folder, output_dir, page_info, version_tag_suffix = file_info
    package_name = page_info["package_name"]

    file_path = Path(file_path)
    doc_folder = Path(doc_folder)
    output_dir = Path(output_dir)

    result = {
        "file": str(file_path),
        "new_anchors": None,
        "errors": None,
        "source_files": None,
    }

    try:
        # Module objects can't be pickled, so each worker re-imports the package
        # from its name instead of receiving it from the parent process.
        package = importlib.import_module(package_name) if package_name else None

        # Work on a per-file copy so mutations never leak between files.
        file_page_info = page_info.copy()
        file_page_info["path"] = file_path

        if file_path.suffix in [".md", ".mdx", ".rst"]:
            # Markdown and reST sources follow the same pipeline; only the reader
            # encoding and the markup converter differ between the two formats.
            is_rst = file_path.suffix == ".rst"
            dest_file = output_dir / (file_path.with_suffix(".mdx").relative_to(doc_folder))
            # as_posix() keeps the page key platform-independent (forward slashes
            # even on Windows), for md and rst alike.
            file_page_info["page"] = file_path.with_suffix(".html").relative_to(doc_folder).as_posix()
            os.makedirs(dest_file.parent, exist_ok=True)

            # utf-8-sig strips a potential BOM from Markdown sources.
            read_encoding = "utf-8" if is_rst else "utf-8-sig"
            with open(file_path, "r", encoding=read_encoding) as reader:
                content = reader.read()

            convert = convert_rst_to_mdx if is_rst else convert_md_to_mdx
            content = convert(content, file_page_info)
            content = resolve_open_in_colab(content, file_page_info)
            content, new_anchors, source_files, errors = resolve_autodoc(
                content, package, return_anchors=True, page_info=file_page_info, version_tag_suffix=version_tag_suffix
            )

            with open(dest_file, "w", encoding="utf-8") as writer:
                writer.write(content)

            result["new_anchors"] = new_anchors
            result["errors"] = errors
            result["source_files"] = source_files

        elif file_path.is_file() and "__" not in str(file_path):
            # __ is a reserved svelte file/folder prefix; any other asset is
            # copied verbatim to the output tree.
            dest_file = output_dir / (file_path.relative_to(doc_folder))
            os.makedirs(dest_file.parent, exist_ok=True)
            shutil.copy(file_path, dest_file)

    except Exception as e:
        # Report the failure through the result instead of raising: the parent
        # process aggregates all errors and raises once every file is processed.
        result["errors"] = [f"There was an error when converting {file_path} to the MDX format.\n{str(e)}"]

    return result


def build_mdx_files(package, doc_folder, output_dir, page_info, version_tag_suffix):
"""
Build the MDX files for a given package.
Build the MDX files for a given package. Uses multiprocessing to process files in parallel.

Args:
package (`types.ModuleType`): The package where to look for objects to document.
Expand All @@ -181,66 +267,35 @@ def build_mdx_files(package, doc_folder, output_dir, page_info, version_tag_suff

all_files = list(doc_folder.glob("**/*"))
all_errors = []
for file in tqdm(all_files, desc="Building the MDX files"):
new_anchors = None
errors = None
page_info["path"] = file
try:
if file.suffix in [".md", ".mdx"]:
dest_file = output_dir / (file.with_suffix(".mdx").relative_to(doc_folder))
page_info["page"] = file.with_suffix(".html").relative_to(doc_folder).as_posix()
os.makedirs(dest_file.parent, exist_ok=True)
with open(file, "r", encoding="utf-8-sig") as reader:
content = reader.read()
content = convert_md_to_mdx(content, page_info)
content = resolve_open_in_colab(content, page_info)
content, new_anchors, source_files, errors = resolve_autodoc(
content, package, return_anchors=True, page_info=page_info, version_tag_suffix=version_tag_suffix
)
if source_files is not None:
source_files_mapping[source_files] = str(file)
with open(dest_file, "w", encoding="utf-8") as writer:
writer.write(content)
# Make sure we clean up for next page.
del page_info["page"]
elif file.suffix in [".rst"]:
dest_file = output_dir / (file.with_suffix(".mdx").relative_to(doc_folder))
page_info["page"] = file.with_suffix(".html").relative_to(doc_folder)
os.makedirs(dest_file.parent, exist_ok=True)
with open(file, "r", encoding="utf-8") as reader:
content = reader.read()
content = convert_rst_to_mdx(content, page_info)
content = resolve_open_in_colab(content, page_info)
content, new_anchors, source_files, errors = resolve_autodoc(
content, package, return_anchors=True, page_info=page_info, version_tag_suffix=version_tag_suffix
)
if source_files is not None:
source_files_mapping[source_files] = str(file)
with open(dest_file, "w", encoding="utf-8") as writer:
writer.write(content)
# Make sure we clean up for next page.
del page_info["page"]
elif file.is_file() and "__" not in str(file):
# __ is a reserved svelte file/folder prefix
dest_file = output_dir / (file.relative_to(doc_folder))
os.makedirs(dest_file.parent, exist_ok=True)
shutil.copy(file, dest_file)

except Exception as e:
raise type(e)(f"There was an error when converting {file} to the MDX format.\n" + e.args[0]) from e
# Prepare arguments for multiprocessing
file_args = [(str(file), str(doc_folder), str(output_dir), page_info, version_tag_suffix) for file in all_files]

if new_anchors is not None:
page_name = str(file.with_suffix("").relative_to(doc_folder))
for anchor in new_anchors:
# Use multiprocessing to process files in parallel
with Pool() as pool:
results = list(
tqdm(pool.imap(_process_single_mdx_file, file_args), total=len(file_args), desc="Building the MDX files")
)

# Process results and collect mappings
for result in results:
file_path = Path(result["file"])

if result["new_anchors"] is not None:
page_name = str(file_path.with_suffix("").relative_to(doc_folder))
for anchor in result["new_anchors"]:
if isinstance(anchor, tuple):
anchor_mapping.update(
{a: f"{page_name}#{anchor[0]}" for a in anchor[1:] if a not in anchor_mapping}
)
anchor = anchor[0]
anchor_mapping[anchor] = page_name

if errors is not None:
all_errors.extend(errors)
if result["errors"]:
all_errors.extend(result["errors"])

if result["source_files"] is not None:
source_files_mapping[result["source_files"]] = str(file_path)

if len(all_errors) > 0:
raise ValueError(
Expand Down