diff --git a/llama_hub/github_repo/README.md b/llama_hub/github_repo/README.md index 1bb2e8e1d9..809f3a6dac 100644 --- a/llama_hub/github_repo/README.md +++ b/llama_hub/github_repo/README.md @@ -39,6 +39,54 @@ for doc in docs: print(doc.extra_info) ``` +### Azure DevOps + +```bash +export AZURE_DEVOPS_BASE_URL='...' +export AZURE_DEVOPS_USERNAME='...' +export AZURE_DEVOPS_PASSWORD='...' +``` + +```python +import os + +from llama_index import download_loader +download_loader("GithubRepositoryReader") + +from llama_hub.github_repo import GithubRepositoryReader, AzureDevOpsAdapter + +# Example: https://dev.azure.com/ahmetkarapinar/testProject/_git/testProject/commit/08633d3844192a69ab5011c20201dba3aced0a41?refName=refs%2Fheads%2Fmaster +# 'ahmetkarapinar' is organization id +# 'testProject' is project id +# 'testProject' is repository id +# '08633d3844192a69ab5011c20201dba3aced0a41' commit sha +# 'master' branch name + + +azure_devops_adapter = AzureDevOpsAdapter( + base_url=os.environ["AZURE_DEVOPS_BASE_URL"], # Ex. 'https://dev.azure.com/YOURORG' + username=os.environ["AZURE_DEVOPS_USERNAME"], + password=os.environ["AZURE_DEVOPS_PASSWORD"], +) + +loader = GithubRepositoryReader( + github_client = azure_devops_adapter, + owner = "", # Azure DevOps project name or project id + repo = "", # Azure DevOps repository id + filter_directories = (["llama_index", "docs"], GithubRepositoryReader.FilterType.INCLUDE), + filter_file_extensions = ([".py"], GithubRepositoryReader.FilterType.INCLUDE), + verbose = True, + concurrent_requests = 10, +) + +docs = loader.load_data(branch="main") +# alternatively, load from a specific commit: +# docs = loader.load_data(commit_sha="a6c89159bf8e7086bea2f4305cff3f0a4102e370") + +for doc in docs: + print(doc.extra_info) +``` + ## Examples This loader designed to be used as a way to load data into [Llama Index](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. 
diff --git a/llama_hub/github_repo/azure_devops.py b/llama_hub/github_repo/azure_devops.py
new file mode 100644
index 0000000000..324e2afeca
--- /dev/null
+++ b/llama_hub/github_repo/azure_devops.py
@@ -0,0 +1,268 @@
+"""
+Azure DevOps Client Adapter for BaseGithubClient.
+
+This class is used to interact with Azure DevOps repositories. It uses the azure-devops package.
+The implementation is merely a workaround to use the same code for Github and Azure DevOps.
+"""
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+from llama_hub.github_repo.github_client import (
+    BaseGithubClient,
+    GitBlobResponseModel,
+    GitBranchResponseModel,
+    GitCommitResponseModel,
+    GitTreeResponseModel,
+)
+
+if TYPE_CHECKING:
+    # The azure-devops types are only used in annotations. Importing them
+    # eagerly would make this module fail to import when the package is
+    # missing, so the ImportError raised in AzureDevOpsAdapter.__init__
+    # could never be reached.
+    from azure.devops.v7_0.git.git_client import GitClient
+    from azure.devops.v7_0.git.models import GitTreeRef
+    from azure.devops.v7_0.git.models import GitTreeEntryRef
+    from azure.devops.v7_0.git.models import GitBlobRef
+    from azure.devops.v7_0.git.models import GitCommit
+    from azure.devops.v7_0.git.models import GitBranchStats
+
+
+class AzureDevOpsAdapter(BaseGithubClient):
+    """
+    Azure DevOps adapter.
+
+    This class is used to interact with Azure DevOps repositories. It uses the azure-devops package.
+    Each method is same as the corresponding method in BaseGithubClient. All of the Azure DevOps specific
+    response models are converted to the corresponding Github response models.
+
+    The azure-devops client is called directly (not awaited) inside the ``async`` methods,
+    so each call runs to completion before the coroutine yields.
+
+    Args:
+        - `base_url (str)`: Azure DevOps base url. Example: 'https://dev.azure.com/YOURORG'
+        - `username (str)`: Azure DevOps username. You can leave this blank if you are using a PAT. ex: ''
+        - `password (str)`: Azure DevOps password. Personal Access Token (PAT) is recommended.
+
+    Raises:
+        - `ImportError`: If azure-devops package is not installed.
+        - `ValueError`: If base_url, username or password is not provided.
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        try:
+            from azure.devops.connection import Connection
+            from msrest.authentication import BasicAuthentication
+        except ImportError as err:
+            raise ImportError(
+                "Please install azure-devops package to use Azure DevOps adapter"
+            ) from err
+
+        base_url = kwargs.get("base_url")
+        username = kwargs.get("username")
+        password = kwargs.get("password")
+        if base_url is None:
+            raise ValueError(
+                "Azure DevOps base_url is required. Example: 'https://dev.azure.com/YOURORG'"
+            )
+        if username is None:
+            raise ValueError(
+                "Azure DevOps username is required. You can leave this blank if you are using a PAT. ex: ''"
+            )
+        if password is None:
+            raise ValueError(
+                "Azure DevOps password is required. Personal Access Token (PAT) is recommended."
+            )
+
+        self.connection = Connection(
+            base_url=base_url,
+            creds=BasicAuthentication(
+                username=username,
+                password=password,
+            ),
+        )
+        self._git_client: GitClient = self.connection.clients.get_git_client()
+
+    def get_all_endpoints(self) -> Dict[str, str]:
+        raise NotImplementedError
+
+    async def request(
+        self,
+        endpoint: str,
+        method: str,
+        headers: Dict[str, Any] = {},
+        **kwargs: Any,
+    ) -> Any:
+        raise NotImplementedError
+
+    async def get_tree(
+        self,
+        owner: str,
+        repo: str,
+        tree_sha: str,
+    ) -> GitTreeResponseModel:
+        """
+        Get the tree for a given sha.
+
+        Args:
+            - `owner (str)`: Project name or project id.
+            - `repo (str)`: repository id.
+            - `tree_sha (str)`: sha of the tree.
+
+        Returns:
+            - `tree (GitTreeResponseModel)`: Tree response model.
+        """
+        _git_tree_response: GitTreeRef = self._git_client.get_tree(
+            repository_id=repo,
+            sha1=tree_sha,
+            project=owner,
+        )
+
+        git_tree_object_list: List[GitTreeResponseModel.GitTreeObject] = []
+        tree_entry: GitTreeEntryRef
+        # guard against a response that carries no tree_entries at all
+        for tree_entry in _git_tree_response.tree_entries or []:
+            git_tree_object_list.append(
+                GitTreeResponseModel.GitTreeObject(
+                    path=tree_entry.relative_path,
+                    mode=tree_entry.mode,
+                    type=tree_entry.git_object_type,
+                    sha=tree_entry.object_id,
+                    url=tree_entry.url,
+                    size=tree_entry.size,
+                )
+            )
+        return GitTreeResponseModel(
+            sha=_git_tree_response.object_id,
+            url=_git_tree_response.url,
+            tree=git_tree_object_list,
+            truncated=False,
+        )
+
+    async def get_blob(
+        self,
+        owner: str,
+        repo: str,
+        file_sha: str,
+    ) -> GitBlobResponseModel:
+        """
+        Get the blob for a given sha.
+
+        Args:
+            - `owner (str)`: Project name or project id.
+            - `repo (str)`: repository id.
+            - `file_sha (str)`: sha of the blob.
+
+        Returns:
+            - `blob (GitBlobResponseModel)`: Blob response model.
+        """
+        _git_blob_response: GitBlobRef = self._git_client.get_blob(
+            repository_id=repo,
+            sha1=file_sha,
+            project=owner,
+            download=False,
+            resolve_lfs=False,
+        )
+
+        _git_blob_content_iterator = self._git_client.get_blob_content(
+            repository_id=repo,
+            sha1=file_sha,
+            project=owner,
+            download=False,
+            resolve_lfs=False,
+        )
+
+        # join the chunks once instead of re-copying the buffer on every `+=`
+        _git_blob_content: bytes = b"".join(_git_blob_content_iterator)
+
+        return GitBlobResponseModel(
+            content=_git_blob_content,
+            size=len(_git_blob_content),
+            encoding="utf-8",
+            sha=_git_blob_response.object_id,
+            url=_git_blob_response.url,
+            node_id=None,
+        )
+
+    async def get_commit(
+        self,
+        owner: str,
+        repo: str,
+        commit_sha: str,
+    ) -> GitCommitResponseModel:
+        """
+        Get the commit for a given sha.
+
+        Args:
+            - `owner (str)`: Project name or project id.
+            - `repo (str)`: repository id.
+            - `commit_sha (str)`: sha of the commit.
+
+        Returns:
+            - `commit (GitCommitResponseModel)`: Commit response model.
+        """
+        _git_commit_response: GitCommit = self._git_client.get_commit(
+            repository_id=repo,
+            commit_id=commit_sha,
+            project=owner,
+        )
+
+        return GitCommitResponseModel(
+            url=_git_commit_response.url,
+            sha=_git_commit_response.commit_id,
+            commit=GitCommitResponseModel.Commit(
+                tree=GitCommitResponseModel.Commit.Tree(
+                    sha=_git_commit_response.tree_id,
+                ),
+            ),
+        )
+
+    async def get_branch(
+        self,
+        owner: str,
+        repo: str,
+        branch: Optional[str] = None,
+        branch_name: Optional[str] = None,
+    ) -> GitBranchResponseModel:
+        """
+        Get the branch for a given branch name.
+
+        Args:
+            - `owner (str)`: Project name or project id.
+            - `repo (str)`: repository id.
+            - `branch (str)`: branch name.
+            - `branch_name (str)`: alternative way to pass the branch name; used when `branch` is not given.
+
+        Returns:
+            - `branch (GitBranchResponseModel)`: Branch response model.
+
+        Raises:
+            - `ValueError`: If neither `branch` nor `branch_name` is provided.
+        """
+        name = branch or branch_name
+        if not name:
+            raise ValueError("Either branch or branch_name must be provided.")
+
+        _git_branch_response: GitBranchStats = self._git_client.get_branch(
+            repository_id=repo, project=owner, name=name
+        )
+
+        # get the latest commit for the branch
+        _git_commit_response: GitCommit = self._git_client.get_commit(
+            repository_id=repo,
+            commit_id=_git_branch_response.commit.commit_id,
+            project=owner,
+        )
+
+        return GitBranchResponseModel(
+            name=_git_branch_response.name,
+            commit=GitBranchResponseModel.Commit(
+                commit=GitBranchResponseModel.Commit.Commit(
+                    tree=GitBranchResponseModel.Commit.Commit.Tree(
+                        sha=_git_commit_response.tree_id,
+                    ),
+                ),
+            ),
+            _links=None,
+        )
diff --git a/llama_hub/github_repo/base.py b/llama_hub/github_repo/base.py index d2097b799b..4e529ac652 100644 --- a/llama_hub/github_repo/base.py +++ b/llama_hub/github_repo/base.py @@ -18,6 +18,7 @@ from llama_index.readers.base import BaseReader from llama_index.readers.file.base import DEFAULT_FILE_READER_CLS from llama_index.readers.schema.base import Document +from llama_hub.github_repo import github_client from llama_hub.github_repo.github_client import ( BaseGithubClient, @@ -247,7 +248,7 @@ 
def _load_data_from_branch(self, branch: str) -> List[Document]: :return: list of documents """ branch_data: GitBranchResponseModel = self._loop.run_until_complete( - self._github_client.get_branch(self._owner, self._repo, branch) + self._github_client.get_branch(self._owner, self._repo, branch, branch) ) tree_sha = branch_data.commit.commit.tree.sha @@ -393,7 +394,7 @@ async def _generate_documents( async for blob_data, full_path in buffered_iterator: print_if_verbose(self._verbose, f"generating document for {full_path}") assert ( - blob_data.encoding == "base64" + blob_data.encoding == "base64" or blob_data.encoding == "utf-8" ), f"blob encoding {blob_data.encoding} not supported" decoded_bytes = None try: @@ -403,7 +404,13 @@ async def _generate_documents( print_if_verbose( self._verbose, f"could not decode {full_path} as base64" ) - continue + # tried to decode the content that was base64 encoded but failed + # continue + if blob_data.encoding == "base64": + continue + # if the content was not base64 encoded and we failed to decode it + # as base64, then we assume it is raw text + decoded_bytes = blob_data.content if self._use_parser: document = self._parse_supported_file( @@ -547,7 +554,7 @@ def wrapper(*args: Any, **kwargs: Any) -> None: verbose=True, filter_directories=( ["docs"], - GithubRepositoryReader.FilterType.INCLUDE, + GithubRepositoryReader.FilterType.EXCLUDE, ), filter_file_extensions=( [ @@ -557,7 +564,7 @@ def wrapper(*args: Any, **kwargs: Any) -> None: ".gif", ".svg", ".ico", - "json", + ".json", ".ipynb", ], GithubRepositoryReader.FilterType.EXCLUDE, @@ -584,6 +591,6 @@ def load_data_from_branch() -> None: load_data_from_branch() - # input("Press enter to load github repository from commit sha...") + input("Press enter to load github repository from commit sha...") - # load_data_from_commit() + load_data_from_commit() diff --git a/llama_hub/github_repo/requirements.txt b/llama_hub/github_repo/requirements.txt index 79228389fc..7dd09db67d 100644 
--- a/llama_hub/github_repo/requirements.txt +++ b/llama_hub/github_repo/requirements.txt @@ -1 +1,2 @@ -httpx \ No newline at end of file +httpx +azure-devops \ No newline at end of file