diff --git a/DESCRIPTION b/DESCRIPTION index 406791b3..6e414dd7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: manynet Title: Many Ways to Make, Modify, Mark, and Measure Myriad Networks -Version: 1.6.7 -Date: 2025-11-05 +Version: 1.7.0 +Date: 2025-11-19 Description: Many tools for making, modifying, marking, measuring, and motifs and memberships of many different types of networks. All functions operate with matrices, edge lists, and 'igraph', 'network', and 'tidygraph' objects, diff --git a/NAMESPACE b/NAMESPACE index 07bd5079..8bc24682 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -353,6 +353,7 @@ export(create_star) export(create_tree) export(create_wheel) export(create_windmill) +export(delete_changes) export(delete_nodes) export(delete_ties) export(describe_changes) @@ -446,6 +447,7 @@ export(net_factions) export(net_harmonic) export(net_hazard) export(net_heterophily) +export(net_homophily) export(net_immunity) export(net_indegree) export(net_independence) @@ -571,6 +573,7 @@ export(node_fluid) export(node_harmonic) export(node_heterophily) export(node_hierarchy) +export(node_homophily) export(node_hub) export(node_in_adopter) export(node_in_automorphic) diff --git a/NEWS.md b/NEWS.md index 7d42993e..bc839c98 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,39 @@ +# manynet 1.7.0 + +## Making + +- Improved vector printing to be more succinct and suggest `print_all()` +- Improved printing methods for changing mnet objects + +## Modifying + +- Added `delete_changes()` for deleting all changes from a changing network +- Improved `to_waves()` to work with networks that are changing, longitudinal, or both +- Fixed `as_diffusion()` to not trim off final wave in the report +- Fixed `as_diffusion()` to return correct E and R compartments for waning diffusion models + +## Marking + +- Improved `is_longitudinal()` to only check for tie waves and not nodal changes + +## Measuring + +- Improved `net_diversity()` and `node_diversity()` + - No longer offers a cluster option, which can be obtained using `over_membership()` + - Added more methods for calculating diversity, including Teachman's, coefficient of variation, and the Gini coefficient + - Improved `net_diversity()` and `node_diversity()` to use and declare methods appropriate for the vector of attributes +- Added `net_homophily()` and `node_homophily()` for measuring homophily according to different methods including + - Krackhardt's EI index of heterophily as well as its inverse as a measure of homophily + - Yule's Q as a further measure of homophily + - Geary's C as a measure of homophily for numeric attributes +- Updated documentation for richness, diversity, and homophily measures +- Fixed `node_is_latent()`, `node_is_infected()`, `node_is_recoverd()` to work with changing networks +- Improved `node_is_exposed()` to work with changing networks and now accepts a time argument + +## Tutorials + +- Fixed description of `to_directed()` and `to_redirected()` in the data tutorial + # manynet 1.6.7 ## Making diff --git a/R/class_marks.R b/R/class_marks.R index 3dda4091..a690066f 100644 --- a/R/class_marks.R +++ b/R/class_marks.R @@ -55,9 +55,9 @@ print_tblvec <- function(y, names, n){ body <- pillar::tbl_format_body(tibs, setup)[c(TRUE, FALSE, TRUE)] if(setup$extra_cols_total > 0){ print(body) - cat(pillar::style_subtle(paste("# ... with", + cat(pillar::style_subtle(paste("# ... and", setup$extra_cols_total, - "more values from this nodeset unprinted.", - "Use `print(..., n = Inf)` to print all values."))) + "more values from this nodeset.", + "Use `print_all(...)` to print all values."))) } else print(body) } diff --git a/R/class_models.R b/R/class_models.R index 52c85980..35d0b2a7 100644 --- a/R/class_models.R +++ b/R/class_models.R @@ -17,6 +17,7 @@ print.diff_model <- function(x, ..., verbose = FALSE){ if(!verbose){ x$n <- NULL x$s <- NULL + x$S_new <- NULL x$I_new <- NULL x$E_new <- NULL x$R_new <- NULL diff --git a/R/class_networks.R b/R/class_networks.R index a82f11bb..2d4448e2 100644 --- a/R/class_networks.R +++ b/R/class_networks.R @@ -180,9 +180,10 @@ describe_ties <- function(x){ #' @export describe_changes <- function(x){ if(is_longitudinal(x)){ - paste(" over", max(tie_attribute(x, "wave")), "waves") + waves <- tie_attribute(x, "wave") + if(is.null(waves)) waves <- as_changelist(x)$time + paste(" over", max(waves), "waves") } else if (is_dynamic(x)){ - if("time" %in% net_tie_attributes(x)){ paste(" from", min(tie_attribute(x, "time"), na.rm = TRUE), "to", max(tie_attribute(x, "time"), na.rm = TRUE)) diff --git a/R/manip_as.R b/R/manip_as.R index 8c463f01..e5e54a04 100644 --- a/R/manip_as.R +++ b/R/manip_as.R @@ -1001,24 +1001,26 @@ as_diffusion.diff_model <- function(.data, twomode = FALSE, events) { #' @export as_diffusion.mnet <- function(.data, twomode = FALSE, events) { events <- as_changelist(.data) - nodes <- net_nodes(.data) + nodes <- c(net_nodes(.data)) sumchanges <- events %>% dplyr::group_by(time) %>% - dplyr::reframe(I_new = sum(value == "I"), + dplyr::reframe(S_new = sum(value == "S"), E_new = sum(value == "E"), + I_new = sum(value == "I"), R_new = sum(value == "R")) - report <- dplyr::tibble(time = seq_len(max(events$time)) - 1, + report <- dplyr::tibble(time = 0:max(events$time), n = nodes) %>% dplyr::left_join(sumchanges, by = dplyr::join_by(time)) report[is.na(report)] <- 0 - report$R <- cumsum(report$R_new) - report$I <- cumsum(report$I_new) - report$R - report$E <- ifelse(report$E_new == 0 & - cumsum(report$E_new) == max(cumsum(report$E_new)), - report$E_new, cumsum(report$E_new)) - report$E <- ifelse(report$R + report$I + report$E > report$n, - report$n - (report$R + report$I), - report$E) - report$S <- report$n - report$R - report$I - report$E + + if(all(report$E_new == 0)){ + report$S = report$n + cumsum(report$S_new - report$I_new) + report$E = rep(0, nrow(report)) + } else { + report$S = report$n + cumsum(report$S_new - report$E_new) # susceptible decreases as they become exposed + report$E = cumsum(report$E_new) - cumsum(report$I_new) # exposed become infectious + } + report$I = cumsum(report$I_new) - cumsum(report$R_new) # infectious recover + report$R = cumsum(report$R_new) - cumsum(report$S_new) # recovered accumulate report$s <- vapply(report$time, function(t){ twin <- dplyr::filter(events, events$time <= t) infected <- dplyr::filter(twin, twin$value == "I")$node @@ -1028,11 +1030,11 @@ as_diffusion.mnet <- function(.data, twomode = FALSE, events) { expos[recovered] <- F sum(expos) }, numeric(1) ) - if (any(report$R + report$I + report$E + report$S != report$n)) { + if (any((report$R + report$I + report$E + report$S) != report$n)) { snet_abort("Oops, something is wrong") } report <- dplyr::select(report, - dplyr::any_of(c("time", "n", "S", "s", "E", "E_new", + dplyr::any_of(c("time", "n", "S", "s", "S_new", "E", "E_new", "I", "I_new", "R", "R_new"))) # make_diff_model(events, report, .data) class(report) <- c("diff_model", class(report)) diff --git a/R/manip_nodes.R b/R/manip_nodes.R index 3dafb6f6..d36df743 100644 --- a/R/manip_nodes.R +++ b/R/manip_nodes.R @@ -327,6 +327,12 @@ add_changes <- function(.data, changes){ .data %>% mutate_nodes(diffusion = "S") } +#' @rdname manip_changes +#' @export +delete_changes <- function(.data){ + igraph::delete_graph_attr(.data, "changes") +} + #' @rdname manip_changes #' @export mutate_changes <- function(.data, ...) UseMethod("mutate_changes") diff --git a/R/manip_split.R b/R/manip_split.R index b0c0fe67..d560f49f 100644 --- a/R/manip_split.R +++ b/R/manip_split.R @@ -182,21 +182,86 @@ to_waves <- function(.data, attribute = "wave", panels = NULL, #' @export to_waves.tbl_graph <- function(.data, attribute = "wave", panels = NULL, cumulative = FALSE) { - wp <- unique(tie_attribute(.data, attribute)) - if(!is.null(panels)) - wp <- intersect(panels, wp) - if(length(wp) > 1) { - out <- lapply(wp, function(l){ - filter_ties(.data, !!as.name(attribute) == l) + out <- NULL + if(is_changing(.data) && is_longitudinal(.data)){ + cl <- as_changelist(.data) + el <- as_edgelist(.data) + + # Get all unique times in order + times <- sort(unique(cl$time)) + if(!is.null(panels)) + times <- intersect(panels, times) + + waves <- lapply(times, function(t) { + # Latest changes by time t + changes <- cl %>% + dplyr::filter(time <= t) %>% + dplyr::group_by(node) %>% + dplyr::reframe(var = var, + latest_value = value[which.max(time)], + .groups = "drop") + for(v in unique(changes$var)){ + upd <- rep(NA, net_nodes(.data)) + upd[changes[var = v,]$node] <- changes[var = v,]$latest_value + old <- node_attribute(.data, v) + if(inherits(old, "logi")) old <- as.logical(old) + out <- add_node_attribute(.data, v, dplyr::coalesce(upd, old)) + } + out <- delete_changes(out) + out <- filter_ties(out, wave == t) + out }) - names(out) <- wp - } else { - out <- filter_ties(.data, !!as.name(attribute) == wp) - } - if (isTRUE(cumulative)) { - out <- cumulative_ties(out, attribute) + names(waves) <- paste("Wave", times) + out <- waves + } else if(is_changing(.data)){ + cl <- as_changelist(.data) + if(!attribute %in% names(cl) && "time" %in% names(cl)){ + attribute <- "time" + } + # Get all unique times in order + times <- sort(unique(cl$time)) + if(!is.null(panels)) + times <- intersect(panels, times) + + # Iterate over times + waves <- lapply(times, function(t) { + # Latest changes by time t + changes <- cl %>% + dplyr::filter(time <= t) %>% + dplyr::group_by(node) %>% + dplyr::reframe(var = var, + latest_value = value[which.max(time)], + .groups = "drop") + for(v in unique(changes$var)){ + upd <- rep(NA, net_nodes(.data)) + upd[changes[var = v,]$node] <- changes[var = v,]$latest_value + old <- node_attribute(.data, v) + if(inherits(old, "logi")) old <- as.logical(old) + out <- add_node_attribute(.data, v, dplyr::coalesce(upd, old)) + } + out <- delete_changes(out) + out + }) + names(waves) <- paste("Wave", times) + out <- waves + } else if(is_longitudinal(.data)){ + wp <- unique(tie_attribute(.data, attribute)) + if(!is.null(panels)) + wp <- intersect(panels, wp) + if(length(wp) > 1) { + out <- lapply(wp, function(l){ + filter_ties(.data, !!as.name(attribute) == l) + }) + names(out) <- wp + } else { + out <- filter_ties(.data, !!as.name(attribute) == wp) + } + if (isTRUE(cumulative)) { + out <- .cumulative_ties(out, attribute) + } + out <- out[order(names(out))] } - out[order(names(out))] + if(is.null(out)) .data else out } #' @export @@ -218,7 +283,7 @@ to_waves.data.frame <- function(.data, attribute = "wave", panels = NULL, out <- .data[,attribute == wp] } if (isTRUE(cumulative)) { - out <- cumulative_ties(out, attribute) + out <- .cumulative_ties(out, attribute) } out } @@ -240,12 +305,12 @@ to_waves.diff_model <- function(.data, attribute = "t", panels = NULL, Recovered = node_is_recovered(diff, time = k)) } if (isTRUE(cumulative)) { - out <- cumulative_ties(out, attribute) + out <- .cumulative_ties(out, attribute) } out } -cumulative_ties <- function(x, attribute) { +.cumulative_ties <- function(x, attribute) { edges <- to <- from <- NULL thisRequires("zoo") thisRequires("purrr") diff --git a/R/manynet-utils.R b/R/manynet-utils.R index 8252b1ac..8e1cd337 100644 --- a/R/manynet-utils.R +++ b/R/manynet-utils.R @@ -6,7 +6,8 @@ utils::globalVariables(c(".data", "obs", "nodes","event","exposure", "student","students","colleges", "node","value","var","active","time", - "A","B","C","D")) + "A","B","C","D", + "n")) # Helper function for declaring available methods available_methods <- function(fun_vctr) { diff --git a/R/mark_net.R b/R/mark_net.R index c1343561..0a9941d8 100644 --- a/R/mark_net.R +++ b/R/mark_net.R @@ -105,9 +105,10 @@ is_list <- function(.data) { is_longitudinal <- function(.data) { if(is_manynet(.data)) { ig <- as_igraph(.data) - catts <- names(igraph::graph_attr(ig, "changes")) + # catts <- names(igraph::graph_attr(ig, "changes")) tatts <- igraph::edge_attr_names(ig) - return("time" %in% catts | "wave" %in% tatts | "panel" %in% tatts) + return(#"time" %in% catts | + "wave" %in% tatts | "panel" %in% tatts) } else if(is_list(.data)){ all(lapply(.data, net_nodes)==net_nodes(.data[[1]])) } diff --git a/R/mark_nodes.R b/R/mark_nodes.R index 0f3d6f58..3fd359f4 100644 --- a/R/mark_nodes.R +++ b/R/mark_nodes.R @@ -202,8 +202,21 @@ NULL #' node_is_latent(play_diffusion(create_tree(6), latency = 1), time = 1) #' @export node_is_latent <- function(.data, time = 0){ - if(inherits(.data, "diff_model")){ - event <- nodes <- n <- NULL + if(is_changing(.data)){ + t <- time + latent <- as_changelist(.data) %>% + dplyr::filter(time <= t & value %in% c("E", "I")) %>% + dplyr::group_by(node) %>% + dplyr::mutate(n = dplyr::n()) %>% + dplyr::filter(n == 1 & value == "E") + if (is_labelled(.data)) { + out <- seq_len(net_nodes(.data)) %in% latent$node + names(out) <- node_names(.data) + } else { + out <- seq_len(net_nodes(.data)) %in% latent$node + } + make_node_mark(out, .data) + } else if(inherits(.data, "diff_model")){ latent <- summary(.data) %>% dplyr::filter(t <= time & event %in% c("E", "I")) %>% group_by(nodes) %>% @@ -231,8 +244,21 @@ node_is_latent <- function(.data, time = 0){ #' node_is_infected(play_diffusion(create_tree(6)), time = 1) #' @export node_is_infected <- function(.data, time = 0) { - if(inherits(.data, "diff_model")){ - event <- nodes <- n <- NULL + if(is_changing(.data)){ + t <- time + infected <- as_changelist(.data) %>% + dplyr::filter(time <= t & value %in% c("I", "R")) %>% + dplyr::group_by(node) %>% + dplyr::mutate(n = dplyr::n()) %>% + dplyr::filter(n == 1 & value == "I") + if (is_labelled(.data)) { + out <- seq_len(net_nodes(.data)) %in% infected$node + names(out) <- node_names(.data) + } else { + out <- seq_len(net_nodes(.data)) %in% infected$node + } + make_node_mark(out, .data) + } else if(inherits(.data, "diff_model")){ infected <- summary(.data) %>% dplyr::filter(t <= time & event %in% c("I", "R")) %>% group_by(nodes) %>% @@ -259,8 +285,21 @@ node_is_infected <- function(.data, time = 0) { #' node_is_recovered(play_diffusion(create_tree(6), recovery = 0.5), time = 3) #' @export node_is_recovered <- function(.data, time = 0){ - if(inherits(.data, "diff_model")){ - event <- nodes <- n <- NULL + if(is_changing(.data)){ + t <- time + recovered <- as_changelist(.data) %>% + dplyr::filter(time <= t & value %in% c("R")) %>% + dplyr::group_by(node) %>% + dplyr::mutate(n = dplyr::n()) %>% + dplyr::filter(n == 1 & value == "R") + if (is_labelled(.data)) { + out <- seq_len(net_nodes(.data)) %in% recovered$node + names(out) <- node_names(.data) + } else { + out <- seq_len(net_nodes(.data)) %in% recovered$node + } + make_node_mark(out, .data) + } else if(inherits(.data, "diff_model")){ recovered <- summary(.data) %>% dplyr::filter(t <= time & event == "R") %>% group_by(nodes) %>% @@ -300,13 +339,17 @@ node_is_recovered <- function(.data, time = 0){ #' (expos <- node_is_exposed(manynet::create_tree(14), mark = c(1,3))) #' which(expos) #' @export -node_is_exposed <- function(.data, mark){ - event <- nodes <- NULL - if (missing(mark) && inherits(.data, "diff_model")){ - mark <- summary(.data) %>% - dplyr::filter(t == 0 & event == "I") %>% - dplyr::select(nodes) %>% unlist() - .data <- attr(.data, "network") +node_is_exposed <- function(.data, mark, time = 0){ + if (missing(mark)){ + if(is_changing(.data)){ + t <- time + return(make_node_mark(node_exposure(.data, time = t)>0, .data)) + } else if(inherits(.data, "diff_model")){ + mark <- summary(.data) %>% + dplyr::filter(t == 0 & event == "I") %>% + dplyr::select(nodes) %>% unlist() + .data <- attr(.data, "network") + } } if(is.logical(mark)) mark <- which(mark) out <- rep(F, manynet::net_nodes(.data)) diff --git a/R/measure_diffusion.R b/R/measure_diffusion.R index 4e508810..a0558062 100644 --- a/R/measure_diffusion.R +++ b/R/measure_diffusion.R @@ -215,7 +215,7 @@ net_immunity <- function(.data, normalized = TRUE){ #' @rdname measure_diffusion_infection #' @examples #' smeg <- generate_smallworld(15, 0.025) -#' smeg_diff <- play_diffusion(smeg, recovery = 0.2) +#' smeg_diff <- play_diffusion(smeg) #' net_infection_complete(smeg_diff) #' @export net_infection_complete <- function(.data){ @@ -426,8 +426,8 @@ node_thresholds <- function(.data, normalized = TRUE, lag = 1){ } #' @rdname measure_diffusion_node -#' @section Infection length: -#' `node_infection_length()` measures the average length of time that nodes +#' @section Recovery: +#' `node_recovery()` measures the average length of time that nodes #' that become infected remain infected in a compartmental model with recovery. #' Infections that are not concluded by the end of the study period are #' calculated as infinite. diff --git a/R/measure_heterogeneity.R b/R/measure_heterogeneity.R index 23ddf072..7c57659c 100644 --- a/R/measure_heterogeneity.R +++ b/R/measure_heterogeneity.R @@ -9,8 +9,7 @@ #' in a network attribute. #' - `node_richness()` measures the number of unique categories #' of an attribute to which each node is connected. -#' - `net_diversity()` measures the heterogeneity of ties across a network -#' or within clusters by node attributes. +#' - `net_diversity()` measures the heterogeneity of ties across a network. #' - `node_diversity()` measures the heterogeneity of each node's #' local neighbourhood. #' - `net_heterophily()` measures how embedded nodes in the network @@ -24,12 +23,27 @@ #' @inheritParams mark_is #' @param attribute Name of a nodal attribute or membership vector #' to use as categories for the diversity measure. -#' @param clusters A nodal cluster membership vector or name of a vertex attribute. +#' @param method Which method to use for `net_diversity()`. +#' Either "blau" (Blau's index) or "teachman" (Teachman's index) for +#' categorical attributes, or "variation" (coefficient of variation) +#' or "gini" (Gini coefficient) for numeric attributes. +#' Default is "blau". +#' If an incompatible method is chosen for the attribute type, +#' a suitable alternative will be used instead with a message. #' @name measure_heterogeneity #' @family measures NULL -#' @rdname measure_heterogeneity +#' @rdname measure_heterogeneity +#' @section Richness: +#' Richness is a simple count of the number of different categories +#' present for a given attribute. +#' @references +#' ## On richness +#' Magurran, Anne E. 1988. +#' _Ecological Diversity and Its Measurement_. +#' Princeton: Princeton University Press. +#' \doi{10.1007/978-94-015-7358-0} #' @examples #' net_richness(ison_networkers) #' @export @@ -52,12 +66,12 @@ node_richness <- function(.data, attribute){ } #' @rdname measure_heterogeneity -#' @section net_diversity: +#' @section Diversity: #' Blau's index (1977) uses a formula known also in other disciplines #' by other names #' (Gini-Simpson Index, Gini impurity, Gini's diversity index, #' Gibbs-Martin index, and probability of interspecific encounter (PIE)): -#' \deqn{1 - \sum\limits_{i = 1}^k {p_i^2 }}, +#' \deqn{1 - \sum\limits_{i = 1}^k {p_i^2 }} #' where \eqn{p_i} is the proportion of group members in \eqn{i}th category #' and \eqn{k} is the number of categories for an attribute of interest. #' This index can be interpreted as the probability that two members @@ -65,13 +79,58 @@ node_richness <- function(.data, attribute){ #' This index finds its minimum value (0) when there is no variety, #' i.e. when all individuals are classified in the same category. #' The maximum value depends on the number of categories and -#' whether nodes can be evenly distributed across categories. +#' whether nodes can be evenly distributed across categories. +#' +#' Teachman's index (1980) is based on information theory +#' and is calculated as: +#' \deqn{- \sum\limits_{i = 1}^k {p_i \log(p_i)}} +#' where \eqn{p_i} is the proportion of group members in \eqn{i}th category +#' and \eqn{k} is the number of categories for an attribute of interest. +#' This index finds its minimum value (0) when there is no variety, +#' i.e. when all individuals are classified in the same category. +#' The maximum value depends on the number of categories and +#' whether nodes can be evenly distributed across categories. +#' It thus shares similar properties to Blau's index, +#' but includes also a notion of richness that tends to give more weight to +#' rare categories and thus tends to highlight imbalances more. +#' +#' The coefficient of variation (CV) is a standardised measure of dispersion +#' of a probability distribution or frequency distribution. +#' It is defined as the ratio of the standard deviation \eqn{\sigma} +#' to the mean \eqn{\mu}: +#' \deqn{CV = \frac{\sigma}{\mu}} +#' It is often expressed as a percentage. +#' The CV is useful because the standard deviation of data must always be understood +#' in the context of the mean of the data. +#' The CV is particularly useful when comparing the degree of variation +#' from one data series to another, +#' even if the means are drastically different from each other. +#' +#' The Gini coefficient is a measure of statistical dispersion +#' that is intended to represent the income or wealth distribution +#' of a nation's residents, +#' and is commonly used as a measure of inequality. +#' It is defined as a ratio with values between 0 and 1, +#' where 0 corresponds with perfect equality +#' (everyone has the same income) and 1 corresponds with perfect inequality +#' (one person has all the income, and everyone else has zero income). +#' The Gini coefficient can be calculated from the Lorenz curve, +#' which plots the proportion of the total income of the population +#' that is cumulatively earned by the bottom x% of the population. +#' The Gini coefficient is defined as the area between the line of equality +#' and the Lorenz curve, +#' divided by the total area under the line of equality. #' @references #' ## On diversity #' Blau, Peter M. 1977. #' _Inequality and heterogeneity_. #' New York: Free Press. #' +#' Teachman, Jay D. 1980. +#' Analysis of population diversity: Measures of qualitative variation. +#' _Sociological Methods & Research_, 8:341-362. +#' \doi{10.1177/004912418000800305} +#' #' Page, Scott E. 2010. #' _Diversity and Complexity_. #' Princeton: Princeton University Press. @@ -79,29 +138,45 @@ node_richness <- function(.data, attribute){ #' @examples #' marvel_friends <- to_unsigned(ison_marvel_relationships, "positive") #' net_diversity(marvel_friends, "Gender") -#' net_diversity(marvel_friends, "Attractive") -#' net_diversity(marvel_friends, "Gender", "Rich") +#' net_diversity(marvel_friends, "Appearances") #' @export -net_diversity <- function(.data, attribute, clusters = NULL){ +net_diversity <- function(.data, attribute, + method = c("blau","teachman","variation","gini")){ if(missing(.data)) {expect_nodes(); .data <- .G()} # nocov blau <- function(features) { 1 - sum((table(features)/length(features))^2) } + teachman <- function(features) { + p <- table(features)/length(features) + -sum(p * log(p)) + } + cv <- function(values) { + stats::sd(values, na.rm = TRUE) / mean(values, na.rm = TRUE) } + gini <- function(values) { + x <- sort(values) + n <- length(x) + G <- sum(x * (1:n)) + return((2 * G) / (n * sum(x)) - (n + 1) / n) + } attr <- manynet::node_attribute(.data, attribute) - if (is.null(clusters)) { - blauout <- blau(attr) - } else if (is.numeric(clusters) && is.vector(clusters)) { - blauout <- vapply(unique(clusters), - function(i) blau(attr[clusters == i]), - numeric(1)) - names(blauout) <- paste0("Cluster ", unique(clusters)) - } else if (is.character(clusters)) { - clu <- manynet::node_attribute(.data, clusters) - blauout <- vapply(unique(clu), - function(i) blau(attr[clu == i]), - numeric(1)) - names(blauout) <- paste0("Cluster ", unique(clu)) - blauout <- blauout[order(names(blauout))] - } else snet_abort("`clusters` must be the name of a nodal variable in the object.") - make_network_measure(blauout, .data, call = deparse(sys.call())) + method <- match.arg(method) + if(is.numeric(attr) && method %in% c("blau","teachman")){ + snet_info("{.val {method}} index is not appropriate for numeric attributes.") + snet_info("Using {.val variation} coefficient instead", + "({.val gini} coefficient also available).") + method <- "variation" + } + if(is.character(attr) && method %in% c("variation","gini")){ + snet_info("{.val {method}} coefficient is not appropriate for categorical attributes.") + snet_info("Using {.val blau} index instead", + "({.val teachman} index also available).") + method <- "blau" + } + + out <- switch(method, + blau = blau(attr), + teachman = teachman(attr), + variation = cv(attr), + gini = gini(attr)) + make_network_measure(out, .data, call = deparse(sys.call())) } #' @rdname measure_heterogeneity @@ -109,18 +184,33 @@ net_diversity <- function(.data, attribute, clusters = NULL){ #' node_diversity(marvel_friends, "Gender") #' node_diversity(marvel_friends, "Attractive") #' @export -node_diversity <- function(.data, attribute){ +node_diversity <- function(.data, attribute, + method = c("blau","teachman","variation","gini")){ if(missing(.data)) {expect_nodes(); .data <- .G()} # nocov + attr <- manynet::node_attribute(.data, attribute) + method <- match.arg(method) + if(is.numeric(attr) && method %in% c("blau","teachman")){ + snet_info("{.val {method}} index is not appropriate for numeric attributes.") + snet_info("Using {.val variation} coefficient instead", + "({.val gini} coefficient also available).") + method <- "variation" + } + if(is.character(attr) && method %in% c("variation","gini")){ + snet_info("{.val {method}} coefficient is not appropriate for categorical attributes.") + snet_info("Using {.val blau} index instead", + "({.val teachman} index also available).") + method <- "blau" + } out <- vapply(igraph::ego(manynet::as_igraph(.data)), function(x) net_diversity( igraph::induced_subgraph(manynet::as_igraph(.data), x), - attribute), + attribute, method = method), FUN.VALUE = numeric(1)) make_node_measure(out, .data) } #' @rdname measure_heterogeneity -#' @section net_heterophily: +#' @section Homophily: #' Given a partition of a network into a number of mutually exclusive groups then #' The E-I index is the number of ties between (or _external_) nodes #' grouped in some mutually exclusive categories @@ -185,12 +275,137 @@ node_heterophily <- function(.data, attribute){ make_node_measure(ei, .data) } +#' @rdname measure_heterogeneity +#' @examples +#' net_homophily(marvel_friends, "Gender") +#' @export +net_homophily <- function(.data, attribute, + method = c("ie","ei","yule","geary")){ + if(missing(.data)) {expect_nodes(); .data <- .G()} # nocov + # mode <- attr_mode(.data, attribute) + # if(is_twomode(.data) && !is.null(mode)){ + # if(mode){ + # snet_info("Attribute only present on second mode of two-mode network.") + # snet_info("Calculating homophily on first mode instead.") + # attribute <- manynet::node_attribute(.data, attribute)[ + # !manynet::node_is_mode(.data)] + # } else { + # snet_info("Attribute only present on first mode of two-mode network.") + # snet_info("Calculating homophily on second mode instead.") + # attribute <- manynet::node_attribute(.data, attribute)[ + # manynet::node_is_mode(.data)] + # } + # .data <- manynet::to_mode(.data, mode = mode) + # } + if (length(attribute) == 1 && is.character(attribute)) { + attribute <- manynet::node_attribute(.data, attribute) + } + method <- match.arg(method) + if(is.numeric(attribute) && method %in% c("ie","ei","yule")){ + snet_info("{.val {method}} index is not appropriate for numeric attributes.") + snet_info("Using {.val geary}'s C instead.") + method <- "geary" + } + if(!is.numeric(attribute) && method == "geary"){ + snet_info("{.val {method}} index is not appropriate for categorical attributes.") + snet_info("Using {.val ie} index instead.") + method <- "ie" + } + + m <- manynet::as_matrix(to_unweighted(.data)) + + ei <- function(m, attribute){ + same <- outer(attribute, attribute, "==") + nInternal <- sum(m * same, na.rm = TRUE) + nExternal <- sum(m, na.rm = TRUE) - nInternal + (nExternal - nInternal) / sum(m, na.rm = TRUE) + } + + yule <- function(m, attribute){ + same <- outer(attribute, attribute, "==") + a <- sum(m * same, na.rm = TRUE) + b <- sum(m * (!same), na.rm = TRUE) + c <- sum((1 - m) * same, na.rm = TRUE) + d <- sum((1 - m) * (!same), na.rm = TRUE) + (a*d - b*c)/(a*d + b*c) + } + + geary <- function(m, attribute){ + # identify valid nodes + valid <- !is.na(attribute) + attr_valid <- attribute[valid] + m_valid <- m[valid, valid, drop = FALSE] + + # recompute n and mean on valid nodes + n <- length(attr_valid) + xbar <- mean(attr_valid, na.rm = TRUE) + + # pairwise squared differences + diffsq <- (outer(attr_valid, attr_valid, "-"))^2 + + # weight sum + W <- sum(m_valid, na.rm = TRUE) + + # numerator and denominator + num <- (n - 1) * sum(m_valid * diffsq, na.rm = TRUE) + den <- 2 * W * sum((attr_valid - xbar)^2, na.rm = TRUE) + + if (den == 0) return(NA_real_) + num / den + } + + res <- switch(match.arg(method), + ie = -ei(m, attribute), + ei = ei(m, attribute), + yule = yule(m, attribute), + geary = geary(m, attribute)) + + make_network_measure(res, .data, call = deparse(sys.call())) +} + + +attr_mode <- function(.data, attribute){ + if(is_twomode(.data)){ + miss <- is.na(node_attribute(.data, attribute)) + mode <- node_is_mode(.data) + if(all(miss[mode])) return(FALSE) # attribute only on first (FALSE) mode + if(all(miss[!mode])) return(TRUE) # attribute only on second (TRUE) mode + } else NULL +} + +#' @rdname measure_heterogeneity +#' @export +node_homophily <- function(.data, attribute, + method = c("ie","ei","yule","geary")){ + if(missing(.data)) {expect_nodes(); .data <- .G()} # nocov + if (length(attribute) == 1 && is.character(attribute)) { + attribute <- manynet::node_attribute(.data, attribute) + } + method <- match.arg(method) + if(is.numeric(attribute) && method %in% c("ie","ei","yule")){ + snet_info("{.val {method}} index is not appropriate for numeric attributes.") + snet_info("Using {.val geary}'s C instead.") + method <- "geary" + } + if(!is.numeric(attribute) && method == "geary"){ + snet_info("{.val {method}} index is not appropriate for categorical attributes.") + snet_info("Using {.val ie} index instead.") + method <- "ie" + } + out <- vapply(igraph::ego(manynet::as_igraph(.data)), + function(x) net_homophily( + igraph::induced_subgraph(manynet::as_igraph(.data), x), + attribute, method = method), + FUN.VALUE = numeric(1)) + make_node_measure(out, .data) +} + #' @rdname measure_heterogeneity #' @importFrom igraph assortativity_degree #' @references #' ## On assortativity #' Newman, Mark E.J. 2002. -#' "Assortative Mxing in Networks". +#' "Assortative mixing in networks". #' _Physical Review Letters_, 89(20): 208701. #' \doi{10.1103/physrevlett.89.208701} #' @examples @@ -207,7 +422,7 @@ net_assortativity <- function(.data){ #' @references #' ## On spatial autocorrelation #' Moran, Patrick Alfred Pierce. 1950. -#' "Notes on Continuous Stochastic Phenomena". +#' "Notes on continuous stochastic phenomena". #' _Biometrika_ 37(1): 17-23. #' \doi{10.2307/2332142} #' @examples diff --git a/cran-comments.md b/cran-comments.md index 7c5065eb..7166b318 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -7,4 +7,6 @@ ## R CMD check results -0 errors | 0 warnings | 0 notes \ No newline at end of file +0 errors | 0 warnings | 0 notes + +- Required for versions of autograph and migraph that fix errors \ No newline at end of file diff --git a/inst/tutorials/tutorial1/data.Rmd b/inst/tutorials/tutorial1/data.Rmd index 2ce2c3fa..9cc8f8f4 100644 --- a/inst/tutorials/tutorial1/data.Rmd +++ b/inst/tutorials/tutorial1/data.Rmd @@ -592,7 +592,8 @@ For example: - `to_unnamed()` removes/anonymises all vertex/node labels - `to_named()` adds some random (U.S.) childrens' names, which can be useful for identifying particular nodes - `to_undirected()` replaces directed ties with an undirected tie (if an arc in either direction is present) -- `to_redirected()` replaces undirected ties with directed ties (arcs) or, if already directed, swaps arcs' direction +- `to_directed()` replaces undirected ties with directed ties (arcs) or, if already directed, does nothing +- `to_redirected()` swaps the direction of directed ties (arcs) or, if undirected, does nothing - `to_unweighted()` binarises or dichotomises a network around a particular threshold (by default `1`) - `to_unsigned()` returns just the "positive" or "negative" ties from a signed network, respectively - `to_uniplex()` reduces a multigraph or multiplex network to one with a single set of edges or ties diff --git a/man/manip_changes.Rd b/man/manip_changes.Rd index 4d99324a..a8bae34c 100644 --- a/man/manip_changes.Rd +++ b/man/manip_changes.Rd @@ -3,6 +3,7 @@ \name{manip_changes} \alias{manip_changes} \alias{add_changes} +\alias{delete_changes} \alias{mutate_changes} \alias{filter_changes} \alias{select_changes} @@ -12,6 +13,8 @@ \usage{ add_changes(.data, changes) +delete_changes(.data) + mutate_changes(.data, ...) filter_changes(.data, ..., .by = NULL) diff --git a/man/mark_diff.Rd b/man/mark_diff.Rd index c52dd719..64e46f80 100644 --- a/man/mark_diff.Rd +++ b/man/mark_diff.Rd @@ -14,7 +14,7 @@ node_is_infected(.data, time = 0) node_is_recovered(.data, time = 0) -node_is_exposed(.data, mark) +node_is_exposed(.data, mark, time = 0) } \arguments{ \item{.data}{An object of a manynet-consistent class: diff --git a/man/measure_diffusion_infection.Rd b/man/measure_diffusion_infection.Rd index 387295ca..5998b0f7 100644 --- a/man/measure_diffusion_infection.Rd +++ b/man/measure_diffusion_infection.Rd @@ -42,7 +42,7 @@ highest infection rate is observed. } \examples{ smeg <- generate_smallworld(15, 0.025) - smeg_diff <- play_diffusion(smeg, recovery = 0.2) + smeg_diff <- play_diffusion(smeg) net_infection_complete(smeg_diff) net_infection_total(smeg_diff) net_infection_peak(smeg_diff) diff --git a/man/measure_diffusion_node.Rd b/man/measure_diffusion_node.Rd index 4178db22..60755615 100644 --- a/man/measure_diffusion_node.Rd +++ b/man/measure_diffusion_node.Rd @@ -79,9 +79,9 @@ That is, \eqn{theta} is now a proportion, and works regardless of whether \eqn{w} is weighted or not. } -\section{Infection length}{ +\section{Recovery}{ -\code{node_infection_length()} measures the average length of time that nodes +\code{node_recovery()} measures the average length of time that nodes that become infected remain infected in a compartmental model with recovery. Infections that are not concluded by the end of the study period are calculated as infinite. diff --git a/man/measure_heterogeneity.Rd b/man/measure_heterogeneity.Rd index ac64226b..c575dc3c 100644 --- a/man/measure_heterogeneity.Rd +++ b/man/measure_heterogeneity.Rd @@ -8,6 +8,8 @@ \alias{node_diversity} \alias{net_heterophily} \alias{node_heterophily} +\alias{net_homophily} +\alias{node_homophily} \alias{net_assortativity} \alias{net_spatial} \title{Measures of network diversity} @@ -16,14 +18,26 @@ net_richness(.data, attribute) node_richness(.data, attribute) -net_diversity(.data, attribute, clusters = NULL) +net_diversity( + .data, + attribute, + method = c("blau", "teachman", "variation", "gini") +) -node_diversity(.data, attribute) +node_diversity( + .data, + attribute, + method = c("blau", "teachman", "variation", "gini") +) net_heterophily(.data, attribute) node_heterophily(.data, attribute) +net_homophily(.data, attribute, method = c("ie", "ei", "yule", "geary")) + +node_homophily(.data, attribute, method = c("ie", "ei", "yule", "geary")) + net_assortativity(.data) net_spatial(.data, attribute) @@ -41,7 +55,13 @@ net_spatial(.data, attribute) \item{attribute}{Name of a nodal attribute or membership vector to use as categories for the diversity measure.} -\item{clusters}{A nodal cluster membership vector or name of a vertex attribute.} +\item{method}{Which method to use for \code{net_diversity()}. +Either "blau" (Blau's index) or "teachman" (Teachman's index) for +categorical attributes, or "variation" (coefficient of variation) +or "gini" (Gini coefficient) for numeric attributes. +Default is "blau". +If an incompatible method is chosen for the attribute type, +a suitable alternative will be used instead with a message.} } \description{ These functions offer ways to measure the heterogeneity of an attribute @@ -52,8 +72,7 @@ across this attribute: in a network attribute. \item \code{node_richness()} measures the number of unique categories of an attribute to which each node is connected. -\item \code{net_diversity()} measures the heterogeneity of ties across a network -or within clusters by node attributes. +\item \code{net_diversity()} measures the heterogeneity of ties across a network. \item \code{node_diversity()} measures the heterogeneity of each node's local neighbourhood. \item \code{net_heterophily()} measures how embedded nodes in the network @@ -65,13 +84,19 @@ of nodes with the same attribute. (global Moran's I) in a network. } } -\section{net_diversity}{ +\section{Richness}{ + +Richness is a simple count of the number of different categories +present for a given attribute. +} + +\section{Diversity}{ Blau's index (1977) uses a formula known also in other disciplines by other names (Gini-Simpson Index, Gini impurity, Gini's diversity index, Gibbs-Martin index, and probability of interspecific encounter (PIE)): -\deqn{1 - \sum\limits_{i = 1}^k {p_i^2 }}, +\deqn{1 - \sum\limits_{i = 1}^k {p_i^2 }} where \eqn{p_i} is the proportion of group members in \eqn{i}th category and \eqn{k} is the number of categories for an attribute of interest. This index can be interpreted as the probability that two members @@ -80,9 +105,49 @@ This index finds its minimum value (0) when there is no variety, i.e. when all individuals are classified in the same category. The maximum value depends on the number of categories and whether nodes can be evenly distributed across categories. + +Teachman's index (1980) is based on information theory +and is calculated as: +\deqn{- \sum\limits_{i = 1}^k {p_i \log(p_i)}} +where \eqn{p_i} is the proportion of group members in \eqn{i}th category +and \eqn{k} is the number of categories for an attribute of interest. +This index finds its minimum value (0) when there is no variety, +i.e. when all individuals are classified in the same category. +The maximum value depends on the number of categories and +whether nodes can be evenly distributed across categories. +It thus shares similar properties to Blau's index, +but includes also a notion of richness that tends to give more weight to +rare categories and thus tends to highlight imbalances more. + +The coefficient of variation (CV) is a standardised measure of dispersion +of a probability distribution or frequency distribution. +It is defined as the ratio of the standard deviation \eqn{\sigma} +to the mean \eqn{\mu}: +\deqn{CV = \frac{\sigma}{\mu}} +It is often expressed as a percentage. +The CV is useful because the standard deviation of data must always be understood +in the context of the mean of the data. +The CV is particularly useful when comparing the degree of variation +from one data series to another, +even if the means are drastically different from each other. + +The Gini coefficient is a measure of statistical dispersion +that is intended to represent the income or wealth distribution +of a nation's residents, +and is commonly used as a measure of inequality. +It is defined as a ratio with values between 0 and 1, +where 0 corresponds with perfect equality +(everyone has the same income) and 1 corresponds with perfect inequality +(one person has all the income, and everyone else has zero income). +The Gini coefficient can be calculated from the Lorenz curve, +which plots the proportion of the total income of the population +that is cumulatively earned by the bottom x\% of the population. +The Gini coefficient is defined as the area between the line of equality +and the Lorenz curve, +divided by the total area under the line of equality. } -\section{net_heterophily}{ +\section{Homophily}{ Given a partition of a network into a number of mutually exclusive groups then The E-I index is the number of ties between (or \emph{external}) nodes @@ -98,24 +163,37 @@ net_richness(ison_networkers) node_richness(ison_networkers, "Discipline") marvel_friends <- to_unsigned(ison_marvel_relationships, "positive") net_diversity(marvel_friends, "Gender") -net_diversity(marvel_friends, "Attractive") -net_diversity(marvel_friends, "Gender", "Rich") +net_diversity(marvel_friends, "Appearances") node_diversity(marvel_friends, "Gender") node_diversity(marvel_friends, "Attractive") net_heterophily(marvel_friends, "Gender") net_heterophily(marvel_friends, "Attractive") node_heterophily(marvel_friends, "Gender") node_heterophily(marvel_friends, "Attractive") +net_homophily(marvel_friends, "Gender") net_assortativity(ison_networkers) net_spatial(ison_lawfirm, "age") } \references{ +\subsection{On richness}{ + +Magurran, Anne E. 1988. +\emph{Ecological Diversity and Its Measurement}. +Princeton: Princeton University Press. +\doi{10.1007/978-94-015-7358-0} +} + \subsection{On diversity}{ Blau, Peter M. 1977. \emph{Inequality and heterogeneity}. New York: Free Press. +Teachman, Jay D. 1980. +Analysis of population diversity: Measures of qualitative variation. +\emph{Sociological Methods & Research}, 8:341-362. +\doi{10.1177/004912418000800305} + Page, Scott E. 2010. \emph{Diversity and Complexity}. Princeton: Princeton University Press. @@ -138,7 +216,7 @@ McPherson, Miller, Lynn Smith-Lovin, and James M. Cook. 2001. \subsection{On assortativity}{ Newman, Mark E.J. 2002. -"Assortative Mxing in Networks". +"Assortative mixing in networks". \emph{Physical Review Letters}, 89(20): 208701. \doi{10.1103/physrevlett.89.208701} } @@ -146,7 +224,7 @@ Newman, Mark E.J. 2002. \subsection{On spatial autocorrelation}{ Moran, Patrick Alfred Pierce. 1950. -"Notes on Continuous Stochastic Phenomena". +"Notes on continuous stochastic phenomena". \emph{Biometrika} 37(1): 17-23. \doi{10.2307/2332142} } diff --git a/tests/testthat/test-measure_heterogeneity.R b/tests/testthat/test-measure_heterogeneity.R index 0c121fd3..52478d55 100644 --- a/tests/testthat/test-measure_heterogeneity.R +++ b/tests/testthat/test-measure_heterogeneity.R @@ -1,9 +1,9 @@ #*************** Test the heterogeneity family of functions ******************# -test_that("diversity function works", { +test_that("diversity functions works", { expect_equal(as.numeric(net_diversity(ison_marvel_relationships, "Gender")), 0.306, tolerance = 0.001) - expect_equal(as.numeric(net_diversity(ison_marvel_relationships, "Gender", "Rich")), - c(0.3367,0.1653), tolerance = 0.001) + expect_equal(top3(node_diversity(ison_lawfirm, "gender")), + c(0.285, 0.375,0), tolerance = 0.01) }) test_that("heterophily function works", { @@ -26,11 +26,6 @@ test_that("richness function works", { expect_s3_class(node_richness(ison_networkers, "type"), "node_measure") }) -test_that("node_diversity works", { - expect_equal(top3(node_diversity(ison_lawfirm, "gender")), - c(0.285, 0.375,0), tolerance = 0.01) -}) - test_that("net_spatial works", { expect_values(net_spatial(ison_lawfirm, "age"), 0.126) }) \ No newline at end of file