Skip to content

Commit

Permalink
Shift to DocumentConverter class
Browse files Browse the repository at this point in the history
  • Loading branch information
gagb committed Dec 13, 2024
1 parent 8f16f32 commit 7979eec
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 66 deletions.
140 changes: 75 additions & 65 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -845,6 +845,75 @@ def _get_mlm_description(self, local_path, extension, client, model, prompt=None
return response.choices[0].message.content


class GitHubIssueConverter(DocumentConverter):
    """Converts GitHub issues to Markdown."""

    # Hostnames accepted as pointing at GitHub. The parsed hostname is compared
    # for equality so that "github.com" appearing elsewhere in the URL (path,
    # query, or a look-alike domain) is never treated as a match.
    _GITHUB_HOSTS = ("github.com", "www.github.com")

    def convert(self, issue_url, github_token) -> Union[None, DocumentConverterResult]:
        """
        Convert a GitHub issue URL to Markdown, or decline if it is not one.
        Args:
            issue_url: The candidate URL. Falsy values are declined.
            github_token: A GitHub token passed to the GitHub client.
        Returns:
            DocumentConverterResult for a recognised issue URL, otherwise None.
        Raises:
            ValueError: If the URL is a GitHub issue URL but no token was given.
            ImportError: If the PyGithub library is not installed.
        """
        # Bail if not a valid GitHub issue URL
        if not issue_url or self._parse_issue_url(issue_url) is None:
            return None

        if not github_token:
            raise ValueError("GitHub token is not set. Cannot convert GitHub issue.")

        return self._convert_github_issue(issue_url, github_token)

    @classmethod
    def _parse_issue_url(cls, issue_url):
        """Return (owner, repo_name, issue_number) for an issue URL, else None."""
        parsed_url = urlparse(issue_url)

        # Validate the host itself rather than searching the URL for a substring.
        if (parsed_url.hostname or "").lower() not in cls._GITHUB_HOSTS:
            return None

        path_parts = parsed_url.path.strip("/").split("/")
        if len(path_parts) < 4 or path_parts[2] != "issues":
            return None

        owner, repo_name, _, issue_number = path_parts[:4]
        # Reject non-numeric issue numbers here so that convert() declines the
        # URL instead of failing later inside int().
        if not issue_number.isdecimal():
            return None

        return owner, repo_name, int(issue_number)

    def _convert_github_issue(
        self, issue_url: str, github_token: str
    ) -> DocumentConverterResult:
        """
        Convert a GitHub issue to a markdown document.
        Args:
            issue_url (str): The URL of the GitHub issue to convert.
            github_token (str): A GitHub token with access to the repository.
        Returns:
            DocumentConverterResult: The result containing the issue title and markdown content.
        Raises:
            ImportError: If the PyGithub library is not installed.
            ValueError: If the provided URL is not a valid GitHub issue URL.
        """
        if not IS_GITHUB_ISSUE_CAPABLE:
            raise ImportError(
                "PyGithub is not installed. Please install it to use this feature."
            )

        # Parse the issue URL
        parsed = self._parse_issue_url(issue_url)
        if parsed is None:
            raise ValueError("Invalid GitHub issue URL")
        owner, repo_name, issue_number = parsed

        # Authenticate with GitHub
        g = Github(github_token)
        repository = g.get_repo(f"{owner}/{repo_name}")
        issue = repository.get_issue(issue_number)

        # Convert issue details to markdown. An issue or comment without a
        # description has body None; render that as empty text, not "None".
        issue_body = issue.body or ""
        parts = [
            f"# {issue.title}\n\n{issue_body}\n\n",
            f"**State:** {issue.state}\n",
            f"**Created at:** {issue.created_at}\n",
            f"**Updated at:** {issue.updated_at}\n",
            "**Comments:**\n",
        ]
        for comment in issue.get_comments():
            comment_body = comment.body or ""
            parts.append(
                f"- {comment.user.login} ({comment.created_at}): {comment_body}\n"
            )

        return DocumentConverterResult(
            title=issue.title,
            text_content="".join(parts),
        )


class FileConversionException(BaseException):
    """Exception type for file conversion failures.

    The purpose is taken from the class name; the raise sites are not visible
    in this part of the file.

    NOTE(review): this derives from ``BaseException`` rather than ``Exception``,
    so ``except Exception`` handlers do not catch it - confirm that is intended.
    """

Expand Down Expand Up @@ -897,6 +966,12 @@ def convert(
- source: can be a string representing a path or url, or a requests.response object
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
"""
# Handle GitHub issue URLs directly
if isinstance(source, str) and "github.com" in source and "/issues/" in source:

Check failure

Code scanning / CodeQL

Incomplete URL substring sanitization High

The string "github.com" may be at an arbitrary position in the sanitized URL.
github_token = kwargs.get("github_token", os.getenv("GITHUB_TOKEN"))
if not github_token:
raise ValueError("GitHub token is required for GitHub issue conversion.")
return GitHubIssueConverter().convert(issue_url=source, github_token=github_token)

# Local path or url
if isinstance(source, str):
Expand Down Expand Up @@ -1107,68 +1182,3 @@ def _guess_ext_magic(self, path):
def register_page_converter(self, converter: DocumentConverter) -> None:
    """Register a page text converter.

    The converter is inserted at index 0 of ``self._page_converters``, ahead
    of every converter registered before it.

    Args:
        converter: The converter instance to register.
    """
    registered = self._page_converters
    registered.insert(0, converter)

def convert_github_issue(
    self, issue_url: str, github_token: str
) -> DocumentConverterResult:
    """
    Convert a GitHub issue to a markdown document.
    Args:
        issue_url (str): The URL of the GitHub issue to convert.
        github_token (str): A GitHub token with access to the repository.
    Returns:
        DocumentConverterResult: The result containing the issue title and markdown content.
    Raises:
        ImportError: If the PyGithub library is not installed.
        ValueError: If the provided URL is not a valid GitHub issue URL.
    Example:
        # Example markdown format
        # Issue Title
        Issue body content...
        **State:** open
        **Created at:** 2023-10-01 12:34:56
        **Updated at:** 2023-10-02 12:34:56
        **Comments:**
        - user1 (2023-10-01 13:00:00): Comment content...
        - user2 (2023-10-01 14:00:00): Another comment...
    """
    if not IS_GITHUB_ISSUE_CAPABLE:
        raise ImportError(
            "PyGithub is not installed. Please install it to use this feature."
        )

    # Parse the issue URL. The hostname is compared for equality so that a
    # URL on another host with an issue-shaped path is rejected instead of
    # being fetched from GitHub with the caller's token.
    parsed_url = urlparse(issue_url)
    hostname = (parsed_url.hostname or "").lower()
    path_parts = parsed_url.path.strip("/").split("/")
    if (
        hostname not in ("github.com", "www.github.com")
        or len(path_parts) < 4
        or path_parts[2] != "issues"
    ):
        raise ValueError("Invalid GitHub issue URL")

    owner, repo_name, _, issue_number = path_parts[:4]

    # Authenticate with GitHub
    g = Github(github_token)
    repository = g.get_repo(f"{owner}/{repo_name}")
    issue = repository.get_issue(int(issue_number))

    # Convert issue details to markdown. An issue or comment without a
    # description has body None; render that as empty text, not "None".
    issue_body = issue.body or ""
    parts = [
        f"# {issue.title}\n\n{issue_body}\n\n",
        f"**State:** {issue.state}\n",
        f"**Created at:** {issue.created_at}\n",
        f"**Updated at:** {issue.updated_at}\n",
        "**Comments:**\n",
    ]
    for comment in issue.get_comments():
        comment_body = comment.body or ""
        parts.append(
            f"- {comment.user.login} ({comment.created_at}): {comment_body}\n"
        )

    return DocumentConverterResult(
        title=issue.title,
        text_content="".join(parts),
    )
2 changes: 1 addition & 1 deletion tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def test_markitdown_exiftool() -> None:
)
def test_markitdown_github_issue() -> None:
markitdown = MarkItDown()
result = markitdown.convert_github_issue(GITHUB_ISSUE_URL, GITHUB_TOKEN)
result = markitdown.convert(GITHUB_ISSUE_URL, github_token=GITHUB_TOKEN)
print(result.text_content)
assert "User-Defined Functions" in result.text_content
assert "closed" in result.text_content
Expand Down

0 comments on commit 7979eec

Please sign in to comment.