misc update and tidelift scraping #91

Draft · wants to merge 5 commits into main
183 changes: 183 additions & 0 deletions tools/all_repos.py
@@ -0,0 +1,183 @@
# https://packaging.python.org/en/latest/specifications/inline-script-metadata/
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "asks",
#     "beautifulsoup4",
#     "requests",
#     "rich",
#     "trio",
# ]
# ///
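# The header above is PEP 723 inline script metadata: a compatible runner
# resolves the dependency block on the fly, so (assuming `uv` is installed)
# the script can be run directly with:
#
#     GH_TOKEN=<your token> uv run tools/all_repos.py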
"""GitHub Organization Activity Tracker

This module tracks and reports the last activity of members across GitHub organizations.
It implements disk-based caching to minimize API requests and respect rate limits.
"""

import os

import asks
import requests
import trio
from bs4 import BeautifulSoup
from rich import print


def get_packages(url):
    # Send a GET request to the webpage with a custom user agent
    headers = {"User-Agent": "python/request/jupyter"}
    response = requests.get(url, headers=headers, allow_redirects=True)

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        exit(1)

    if "A required part of this site couldn’t load" in response.text:
        print("Fastly is blocking us. Status code: 403")
        exit(1)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all <h3> tags and accumulate their text in a list
    h3_tags = [h3.get_text(strip=True) for h3 in soup.find_all("h3")]

    # Sort the list of <h3> contents
    h3_tags.sort()

    if not h3_tags:
        print("No packages found")
        exit(1)
    return h3_tags
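# NOTE: the scrape above assumes PyPI org/user pages render one <h3> per
# package (as they did at the time of writing), roughly like:
#
#     <a href="/project/ipykernel/"><h3>ipykernel</h3></a>
#
# If that markup changes, get_packages() will exit with "No packages found".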


default_orgs = [
    "binder-examples",
    "binderhub-ci-repos",
    "ipython",
    "jupyter",
    "jupyter-attic",
    "jupyter-book",
    "jupyter-governance",
    "jupyter-incubator",
    "jupyter-resources",
    "jupyter-server",
    "jupyter-standard",
    "jupyter-standards",
    "jupyter-widgets",
    "jupyter-xeus",
    "jupytercon",
    "jupyterhub",
    "jupyterlab",
    "voila-dashboards",
    "voila-gallery",
    "pickleshare",
]

token = os.getenv("GH_TOKEN")
if not token:
    print("[red]Error: GH_TOKEN environment variable not set[/red]")
    exit(1)

headers = {
    "Authorization": f"token {token}",
    "Accept": "application/vnd.github.v3+json",
}
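# The calls below only read public org/repo listings, so a token with plain
# public read access should be enough; authenticating mainly buys the higher
# GitHub API rate limit (5,000 requests/hour rather than 60 for anonymous
# calls).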


async def list_repos(orgs):
    results = []
    async with trio.open_nursery() as nursery:
        for org in orgs:

            async def _loc(results, org):
                results.append(await list_repos_for_org(org))

            nursery.start_soon(_loc, results, org)
    # The nursery only exits once every task has finished, so results is
    # fully populated by the time we iterate over it.
    for org_repos in results:
        for org, repo in org_repos:
            yield org, repo


async def list_repos_for_org(org):
    reps = []
    # Page through the org's repository list, 100 per page (at most 9 pages,
    # i.e. up to 900 repositories per org).
    for p in range(1, 10):
        response = await asks.get(
            f"https://api.github.com/orgs/{org}/repos?per_page=100&page={p}",
            headers=headers,
        )
        response.raise_for_status()
        repos = response.json()
        for repo in repos:
            reps.append((org, repo["name"]))
        if len(repos) < 100:
            break
    return reps


async def main():
    packages = get_packages("https://pypi.org/org/jupyter/")
    print(f"Found {len(packages)} packages in the PyPI jupyter org")

    # Map normalized (lowercase, underscores) names back to the PyPI names.
    pypi_names = {p.lower().replace("-", "_"): p for p in packages}

    todo = []
    async for org, repo in list_repos(default_orgs):
        lowname = repo.lower().replace("-", "_")
        if lowname in pypi_names:
            print(
                f"{org}/{repo}".ljust(40),
                f"https://pypi.org/project/{pypi_names[lowname]}",
                " in jupyter org",
            )
            del pypi_names[lowname]
        else:
            todo.append((org, repo))

    print()
    print("check potentially matching PyPI names:")

    targets = []
    async with trio.open_nursery() as nursery:
        for org, repo in todo:

            async def _loc(targets, org, repo):
                targets.append(
                    (
                        org,
                        repo,
                        (
                            await asks.get(f"https://pypi.org/pypi/{repo}/json")
                        ).status_code,
                    )
                )

            nursery.start_soon(_loc, targets, org, repo)

    corg = ""
    for org, repo, status in sorted(targets):
        if org != corg:
            print()
            corg = org
        if status == 200:
            print(
                f"https://github.com/{org}/{repo}".ljust(70),
                f"{status} for https://pypi.org/project/{repo}",
            )

    print()
    print("repos with no PyPI package:")
    corg = ""
    for org, repo, status in sorted(targets):
        if org != corg:
            print()
            corg = org
        if status != 200:
            print(f"https://github.com/{org}/{repo}")

    print()
    print("Packages with no repos.")
    print(pypi_names)


trio.run(main)
15 changes: 12 additions & 3 deletions tools/private_sec_report.py
@@ -72,11 +72,18 @@ async def get_private_report(session, org, repo):
     ) as repo_response:
         repo_info = await repo_response.json()
         archived = repo_info.get("archived", False)
+        private = repo_info.get("private", False)
     async with session.get(private_report_url, headers=headers) as response:
         if response.status == 200:
-            return org, repo, (await response.json()).get("enabled", False), archived
+            return (
+                org,
+                repo,
+                (await response.json()).get("enabled", False),
+                archived,
+                private,
+            )
         else:
-            return org, repo, False, archived
+            return org, repo, False, archived, private


 async def main():

@@ -90,14 +97,16 @@ async def main():

     results = await asyncio.gather(*tasks)
     prev_org = None
-    for org, repo, enabled, archived in results:
+    for org, repo, enabled, archived, private in results:
         if org != prev_org:
             print()
             print(f"[bold]{org}[/bold]")
         prev_org = org
         if enabled:
             print(f" [green]{repo}: {enabled}[/green]")
         else:
+            if private:
+                print(f" [yellow]{org}/{repo}: {enabled} (private)[/yellow]")
             if archived:
                 print(f" [yellow]{org}/{repo}: {enabled} (archived)[/yellow]")
             elif f"{org}/{repo}" in ignore_repos:
119 changes: 119 additions & 0 deletions tools/tide.py
@@ -0,0 +1,119 @@
# https://packaging.python.org/en/latest/specifications/inline-script-metadata/
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "requests",
#     "rich",
#     "beautifulsoup4",
# ]
# ///
import sys

import requests
from bs4 import BeautifulSoup
from rich import print
from rich.table import Table


def get_packages(url):
    # Send a GET request to the webpage with a custom user agent
    headers = {"User-Agent": "python/request/jupyter"}
    response = requests.get(url, headers=headers, allow_redirects=True)

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        exit(1)

    if "A required part of this site couldn’t load" in response.text:
        print(f"Fastly is blocking us for {url}. Status code: 403")
        exit(1)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all <h3> tags and accumulate their text in a list
    h3_tags = [h3.get_text(strip=True) for h3 in soup.find_all("h3")]

    # Sort the list of <h3> contents
    h3_tags.sort()

    if not h3_tags:
        print("No packages found")
        exit(1)
    return h3_tags


def get_tidelift_data(packages):
    packages_data = [{"platform": "pypi", "name": h3} for h3 in packages]

    data = {"packages": packages_data}
    res = requests.post(
        "https://tidelift.com/api/depci/estimate/bulk_estimates", json=data
    )

    res.raise_for_status()
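    # The bulk_estimates endpoint is not publicly documented; judging from the
    # parsing below, it returns a JSON list of objects shaped roughly like:
    #
    #     {"name": "ipython", "lifted": true, "estimated_money": "123.0"}
    #
    # Packages Tidelift does not recognize are simply absent from the list.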

    # Collecting all package data for aligned printing
    package_data = []
    response_data = res.json()

    for package in response_data:
        name = package["name"]
        lifted = package["lifted"]
        estimated_money = package["estimated_money"]
        package_data.append((name, lifted, estimated_money))

    # Packages missing from the response get placeholder (None, None) rows.
    package_names = {p["name"] for p in response_data}
    for package in packages:
        if package not in package_names:
            package_data.append((package, None, None))

    # Print the collected data as an aligned table
    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Package Name")
    table.add_column("Estimated Money")
    table.add_column("Lifted")

    def maybefloat(x):
        if x is None:
            return 0
        try:
            return float(x)
        except (TypeError, ValueError):
            return 0

    # Sort lifted=True first, then None (unknown), then False; within each
    # group, by descending estimated amount, then by name.
    package_data.sort(
        key=lambda x: (x[1] is not True, x[1] is not None, -maybefloat(x[2]), x[0])
    )
    for name, lifted, estimated_money in package_data:
        if lifted:
            table.add_row(name, "-- need login --", f"[green]{lifted}[/green]")
        else:
            table.add_row(name, str(estimated_money), f"[red]{lifted}[/red]")

    print(table)
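# A hypothetical example of the resulting order: rows ("a", False, "5"),
# ("b", True, "10"), ("c", None, None) would print as b (lifted) first,
# then c (unknown), then a (not lifted).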


if __name__ == "__main__":
    # Collect package names from a PyPI org page, a PyPI user page, or an
    # explicit list given on the command line.
    args = sys.argv[1:]
    packages = []
    while args:
        if args[0] == "--org":
            url = f"https://pypi.org/org/{args[1]}/"
            packages += get_packages(url)
            args = args[2:]
        elif args[0] == "--user":
            url = f"https://pypi.org/user/{args[1]}/"
            packages += get_packages(url)
            args = args[2:]
        elif args[0] == "--packages":
            packages += args[1:]
            args = []
        else:
            print(
                "Invalid argument. Please use either --org ORG, --user USER, "
                "or --packages PACKAGE1 PACKAGE2 ..."
            )
            exit(1)
    get_tidelift_data(packages)
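# Example invocations (assuming a PEP 723-aware runner such as `uv`):
#
#     uv run tools/tide.py --org jupyter
#     uv run tools/tide.py --packages ipython notebook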