Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Package releases for #76 #77

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ Imports:
withr
Remotes:
cboettig/codemeta
RoxygenNote: 7.1.2
RoxygenNote: 7.3.2
URL: https://github.com/ropensci-org/makeregistry
BugReports: https://github.com/ropensci-org/makeregistry/issues
Suggests:
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ export(build_ropensci_packages_json)
export(create_codemetas)
export(create_registry)
export(make_registry)
export(registry_pkg_versions)
export(track_repos)
importFrom(crul,HttpClient)
importFrom(ghql,GraphqlClient)
Expand Down
256 changes: 256 additions & 0 deletions R/pkg-versions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
# Functions to extract version numbers of packages, both from CRAN and latest
# GitHub releases.

check_pkg_version_path <- function () {
cm_path <- normalizePath (file.path ("raw_cm.json"), mustWork = FALSE)

if (!file.exists (cm_path)) {
stop (
"This function can only be run in the ",
"output directory of 'roregistry', which ",
"must contain the 'raw_cm.json' file.",
call. = FALSE
)
}

return (cm_path)
}

#' Extract data on releases of packages, both on CRAN and GitHub.
#'
#' @param CRAN_only If `TRUE`, return only those rows of `repo_data` which
#' correspond to packages which are or were on CRAN; otherwise return all data.
#' @return A `data.frame` with one row for each package, and columns containing
#' information on the package including dates and version numbers of GitHub and
#' CRAN releases (if any).
#'
#' @export
registry_pkg_versions <- function (CRAN_only = TRUE) {

cm_path <- check_pkg_version_path ()

# Code from list-packages.R to get all pkgs:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not use packages.json?

hosted_packages <- get_hosted_packages()
other_packages <- get_other_packages()
packages <- c(hosted_packages, other_packages)
packages <- packages[order(purrr::map_chr(packages, "package"))]
packages <- lapply (packages, function (i) i [c ("package", "url", "branch")])
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not purrr? since it' used on the line above

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, no reason really, purrr throughout would be better

packages <- data.frame (do.call (rbind, lapply (packages, unlist)))

ptn <- "^https\\:\\/\\/github\\.com\\/|\\/.*$"
packages$org <- gsub (ptn, "", packages$url)
packages$repo <- gsub ("^.*\\/", "", packages$url)
# All "official" packages are in "ropensci" only:
index <- which (grepl ("ropensci|r-lib", packages$org) & packages$org != "ropensci")
packages <- packages [-index, ]
packages_other <- packages [which (!packages$org == "ropensci"), c ("org", "repo")]

packages_other_dat <- apply (packages_other, 1, function (i) {
get_pkg_release_data (org = as.character (i [1]), repo = as.character (i [2]))
})
packages_other_dat <- do.call (rbind, packages_other_dat)

rbind (get_pkg_releases_data ("ropensci"), packages_other_dat) |>
add_pkg_name (cm_path) |>
add_CRAN_version (CRAN_only = CRAN_only)
}

#' Main function to extract and collate data on package releases, both on
#' GitHub and CRAN.
#'
#' @param org Name of GitHub organization for which data are to be extracted.
#' @param n Number of entries returned from a single queries. Queries are run
#' until all data are extracted, so this parameter has no effect, and should
#' not be modified.
#' @return A `data.frame` of package data.
#'
#' @noRd
get_pkg_releases_data <- function (org = "ropensci", n = 100L) {

q <- get_releases_query (org = org, n = n)
dat <- gh::gh_gql (query = q)
page_info <- dat$data$repositoryOwner$repositories$pageInfo
repo_data <- dat$data$repositoryOwner$repositories$nodes
while (page_info$hasNextPage) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and why use the V4 API for this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All releases because at least initially we need all that information that we currently don't have. And v4 API because we can grab the whole org in one call; with the REST api, we'd have to do each repo as a separate call.

q <- get_releases_query (org = org, n = n, end_cursor = page_info$endCursor)
dat <- gh::gh_gql (query = q)
page_info <- dat$data$repositoryOwner$repositories$pageInfo
repo_data <- c (repo_data, dat$data$repositoryOwner$repositories$nodes)
}

return (repo_data_to_df (repo_data))
}

get_pkg_release_data <- function (org = "ropensci", repo = "") {

q <- get_single_release_query (org = org, repo = repo)
dat <- gh::gh_gql (query = q)

repo_data_to_df (list (dat$data$repository))
}

#' Convert GitHub graphql list data to a flat `data.frame`.
#'
#' @param repo_data A list returned directly from `gh::gh_gql`.
#' @return The input data converted into a flat `data.frame`.
#'
#' @noRd
repo_data_to_df <- function (repo_data) {

do.call (rbind, lapply (repo_data, function (i) {

nodes <- i$releases$nodes

if (length (nodes) == 0L) {
node1 <- list (
author = list (email = NA_character_, login = NA_character_),
name = NA_character_,
publishedAt = NA_character_,
isLatest = NA_character_,
isPrerelease = NA_character_,
tagName = NA_character_
)
} else {
node1 <- nodes [[1]]
}

data.frame (
name = i$name,
url = i$url,
author_email = node1$author$email,
author_login = node1$author$login,
name = null_na (node1$name),
publishedAt = null_na (node1$publishedAt),
isLatest = null_na (node1$isLatest),
isPrerelease = null_na (node1$isPrerelease),
tagName = null_na (node1$tagNam)
)
}))
}

#' Construct the GitHub graphql query to extract data on releases for an entire
#' GitHub organization.
#'
#' @param org Name of GitHub organization for which data are to be extracted.
#' @param n Number of entries returned from a single queries. Queries are run
#' until all data are extracted, so this parameter has no effect, and should
#' not be modified.
#' @return A complex nested list of results.
#'
#' @noRd
get_releases_query <- function (org = "ropensci", n = 100, end_cursor = NULL) {

after_txt <- ""
if (!is.null (end_cursor)) {
after_txt <- paste0 (", after:\"", end_cursor, "\"")
}

q <- paste0 ("{
repositoryOwner(login:\"", org, "\") {
... on Organization {
repositories (first: ", n, after_txt, "){
pageInfo {
hasNextPage
endCursor
}
nodes {
name
url
releases(first: 1, orderBy: {field: CREATED_AT, direction: DESC}) {
nodes {
author {
email
login
}
name
createdAt
isLatest
isPrerelease
publishedAt
tagName
}
}
}
}
}
}
}")

return (q)
}

#' Modified version of `get_releases_query` for single repo only.
#'
#' @param org Name of GitHub organization for which data are to be extracted.
#' @param repo Name of GitHub repository for which data are to be extracted.
#' @return A complex nested list of results.
#'
#' @noRd
get_single_release_query <- function (org = "ropensci", repo = "") {

q <- paste0 ("{
repository(owner:\"", org, "\", name:\"", repo, "\") {
name
url
releases(first: 1, orderBy: {field: CREATED_AT, direction: DESC}) {
nodes {
author {
email
login
}
name
createdAt
isLatest
isPrerelease
publishedAt
tagName
}
}
}
}")

return (q)
}

#' Add actual package names to `repo_data`, which need not necessarily be
#' identical to repository names.
#'
#' @param repo_data `data.frame` of repository data.
#' @param cm_path Result of `check_pkg_version_path` function, as path to
#' `raw_cm.json` file stored in `roregistry` repository.
#' @return Input `data.frame` with additional column of "pkg_name" appended.
#'
#' @noRd
add_pkg_name <- function (repo_data, cm_path) {

identifier <- codeRepository <- NULL # suppress no visible binding msgs

cm <- jsonlite::read_json (cm_path, simplifyVector = TRUE) |>
dplyr::select (identifier, codeRepository) |>
dplyr::rename (pkg_name = identifier, url = codeRepository)
dplyr::left_join (repo_data, cm, by = "url")
}

#' Add current CRAN version identifier to `repo_data`.
#'
#' @param repo_data `data.frame` of repository data.
#' @param CRAN_only If `TRUE`, return only those rows of `repo_data` which
#' correspond to packages which are or were on CRAN; otherwise return all data.
#'
#' @return Input `data.frame` with additional column of "Version" appended, and
#' potentially including only those rows which describe packages on CRAN.
#'
#' @noRd
add_CRAN_version <- function (repo_data, CRAN_only = TRUE) {

Package <- Version <- NULL # suppress no visible binding msgs

ap <- data.frame (utils::available.packages ()) |>
dplyr::select (Package, Version) |>
dplyr::rename (pkg_name = Package)
repo_data <- dplyr::left_join (repo_data, ap, by = "pkg_name")
if (CRAN_only) {
repo_data <- dplyr::filter (repo_data, !is.na (Version))
}
return (repo_data)
}
2 changes: 2 additions & 0 deletions R/zzz.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,5 @@ is_package <- function(path, old_cm){
}
return(FALSE) # not a package in other cases (no old registry, or no entry in it)
}

null_na <- function(x) ifelse(is.null(x), NA_character_, x[1L])
20 changes: 20 additions & 0 deletions man/registry_pkg_versions.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading