From 072ad16d51f1d6035c1de23af772b3aa442dabaa Mon Sep 17 00:00:00 2001
From: Maharshi Basu
Date: Thu, 12 Sep 2024 09:25:10 +0530
Subject: [PATCH 1/2] chore(github util): add scraper for github issue labels

Signed-off-by: Maharshi Basu
---
 github_app/github_helper/utils.py | 39 +++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/github_app/github_helper/utils.py b/github_app/github_helper/utils.py
index 644ea266..4cf492df 100644
--- a/github_app/github_helper/utils.py
+++ b/github_app/github_helper/utils.py
@@ -5,6 +5,8 @@
 import logging
 import hmac
 import hashlib
+from bs4 import BeautifulSoup
+from typing import List, Tuple
 
 logger = logging.getLogger(__name__)
 
@@ -91,3 +93,40 @@ def is_github_signature_valid(headers, body):
 
     mac = hmac.new(github_secret, msg=body, digestmod=hashlib.sha256)
     return hmac.compare_digest(mac.hexdigest(), signature)
+
+def scrape_labels(labels_url) -> List[Tuple[str, str]]:
+    """
+    Scrape the label names and descriptions of a repository
+    """
+    response = requests.get(labels_url)
+    if response.status_code == 200:
+        html_content = response.text
+    else:
+        logger.error(
+                f"Unable to fetch labels page with error: {response.status_code} url: {labels_url}"
+        )
+    soup = BeautifulSoup(html_content, 'html.parser')
+    label_elem = soup.find_all('div', class_='js-label-preview')
+    label_with_desc = []
+    for label in label_elem:
+        labels = label.find_all('span', class_='IssueLabel')
+        for l in labels:
+            name = l.text.strip()
+            desc_elem = l.find_text('div')
+            desc_str = desc_elem.text.strip()
+            label_desc = desc_str if desc_str else "N/A"
+            label_with_desc.append((name, label_desc))
+    return label_with_desc
+
+def scrape_labels_all_pages(base_label_url) -> List[Tuple[str, str]]:
+    all_labels = []
+    page_number = 1
+    while True:
+        url = f"{base_label_url}?page={page_number}"
+        _labels = scrape_labels(url)
+        if not _labels:
+            break
+        all_labels += _labels
+        page_number += 1
+    return all_labels
+

From 23324d1c21367a1d4f37b6f47038e3dba44db1c2 Mon Sep 17 00:00:00 2001
From: Maharshi Basu
Date: Thu, 12 Sep 2024 09:33:55 +0530
Subject: [PATCH 2/2] chore(lint): make flake8 happy

Signed-off-by: Maharshi Basu
---
 github_app/github_helper/utils.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/github_app/github_helper/utils.py b/github_app/github_helper/utils.py
index 4cf492df..c0b819d7 100644
--- a/github_app/github_helper/utils.py
+++ b/github_app/github_helper/utils.py
@@ -94,6 +94,7 @@ def is_github_signature_valid(headers, body):
     mac = hmac.new(github_secret, msg=body, digestmod=hashlib.sha256)
     return hmac.compare_digest(mac.hexdigest(), signature)
 
+
 def scrape_labels(labels_url) -> List[Tuple[str, str]]:
     """
     Scrape the label names and descriptions of a repository
@@ -103,21 +104,22 @@ def scrape_labels(labels_url) -> List[Tuple[str, str]]:
         html_content = response.text
     else:
         logger.error(
-                f"Unable to fetch labels page with error: {response.status_code} url: {labels_url}"
+            f"Unable to fetch labels page with error: {response.status_code} url: {labels_url}"
         )
     soup = BeautifulSoup(html_content, 'html.parser')
     label_elem = soup.find_all('div', class_='js-label-preview')
     label_with_desc = []
     for label in label_elem:
         labels = label.find_all('span', class_='IssueLabel')
-        for l in labels:
-            name = l.text.strip()
-            desc_elem = l.find_text('div')
+        for label in labels:
+            name = label.text.strip()
+            desc_elem = label.find_text('div')
             desc_str = desc_elem.text.strip()
             label_desc = desc_str if desc_str else "N/A"
             label_with_desc.append((name, label_desc))
     return label_with_desc
 
+
 def scrape_labels_all_pages(base_label_url) -> List[Tuple[str, str]]:
     all_labels = []
     page_number = 1
@@ -129,4 +131,3 @@ def scrape_labels_all_pages(base_label_url) -> List[Tuple[str, str]]:
         all_labels += _labels
         page_number += 1
     return all_labels
-
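
Note on the code added in this series: as written, the helpers would fail at runtime. BeautifulSoup tags provide find_next() and find_next_sibling() but no find_text(); html_content is left unbound whenever the labels page does not return 200; and the flake8 rename in PATCH 2/2 reuses the name "label" for both loop variables, shadowing the outer row. The snippet below is an untested sketch of the same scraping approach with those spots reworked. The selectors ('js-label-preview', 'IssueLabel') and the find_next('div') lookup for the description text are assumptions about GitHub's labels-page markup, not verified details of it.

import logging
from typing import List, Tuple

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def scrape_labels(labels_url: str) -> List[Tuple[str, str]]:
    """Scrape the label names and descriptions from a repository's labels page."""
    response = requests.get(labels_url)
    if response.status_code != 200:
        logger.error(
            f"Unable to fetch labels page with error: {response.status_code} url: {labels_url}"
        )
        return []  # bail out early instead of leaving html_content unbound
    soup = BeautifulSoup(response.text, "html.parser")
    label_with_desc = []
    # assumed markup: each label row is a div.js-label-preview containing a span.IssueLabel
    for row in soup.find_all("div", class_="js-label-preview"):
        for tag in row.find_all("span", class_="IssueLabel"):
            name = tag.text.strip()
            # find_next() is a real bs4 call; the original find_text() does not exist
            desc_elem = tag.find_next("div")
            desc_str = desc_elem.text.strip() if desc_elem else ""
            label_with_desc.append((name, desc_str or "N/A"))
    return label_with_desc


def scrape_labels_all_pages(base_label_url: str) -> List[Tuple[str, str]]:
    """Walk ?page=N until a page yields no labels."""
    all_labels: List[Tuple[str, str]] = []
    page_number = 1
    while True:
        _labels = scrape_labels(f"{base_label_url}?page={page_number}")
        if not _labels:
            break
        all_labels += _labels
        page_number += 1
    return all_labels

Returning an empty list on a non-200 response also lets scrape_labels_all_pages stop paginating cleanly instead of raising a NameError mid-loop.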