diff --git a/.gitignore b/.gitignore index cb58c8a..f8745a1 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ inst/doc misc vignettes/.*R vignettes/.*html +.DS_Store diff --git a/DESCRIPTION b/DESCRIPTION index e73b6d2..b113edd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: blocking Type: Package Title: Various Blocking Methods for Entity Resolution -Version: 1.0.1 +Version: 1.0.2 Authors@R: c(person(given = "Maciej", family = "Beręsewicz", @@ -19,7 +19,7 @@ LazyData: true URL: https://github.com/ncn-foreigners/blocking, https://ncn-foreigners.ue.poznan.pl/blocking/ BugReports: https://github.com/ncn-foreigners/blocking/issues Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Imports: text2vec, tokenizers, diff --git a/NAMESPACE b/NAMESPACE index 6afe324..0ecf1f9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +S3method(logLik,est_block_error) S3method(print,blocking) S3method(print,est_block_error) export(blocking) @@ -35,6 +36,7 @@ importFrom(mlpack,lsh) importFrom(readr,read_table) importFrom(rnndescent,rnnd_build) importFrom(rnndescent,rnnd_query) +importFrom(stats,AIC) importFrom(stats,dist) importFrom(stats,dpois) importFrom(stats,runif) diff --git a/NEWS.md b/NEWS.md index 4b40110..ff5177b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # development +# version 1.0.2 + ++ Updated `est_block_error` function. + # version 1.0.1 + Fixed CRAN errors. diff --git a/R/blocking.R b/R/blocking.R index 76a2697..19d04c0 100644 --- a/R/blocking.R +++ b/R/blocking.R @@ -127,16 +127,16 @@ blocking <- function(x, "lsh" = NULL, "kd" = NULL) - stopifnot("Only character, dense or sparse (dgCMatrix) matrix x is supported" = + stopifnot("Only character, dense or sparse (dgCMatrix) matrix x is supported." = is.character(x) | is.matrix(x) | inherits(x, "Matrix")) if (!is.null(ann_write)) { - stopifnot("Path provided in the `ann_write` is incorrect" = file.exists(ann_write) ) + stopifnot("Path provided in the `ann_write` is incorrect." = file.exists(ann_write) ) } if (ann == "nnd") { - stopifnot("Distance for NND should be `euclidean, cosine, manhatan, hamming`" = + stopifnot("Distance for NND should be `euclidean, cosine, manhatan, hamming`." = distance %in% c("euclidean", "cosine","manhatan", "hamming")) } @@ -145,15 +145,18 @@ blocking <- function(x, } if (ann == "hnsw") { - stopifnot("Distance for HNSW should be `l2, euclidean, cosine, ip`" = + stopifnot("Distance for HNSW should be `l2, euclidean, cosine, ip`." = distance %in% c("l2", "euclidean", "cosine", "ip")) } if (ann == "annoy") { - stopifnot("Distance for Annoy should be `euclidean, manhatan, hamming, angular`" = + stopifnot("Distance for Annoy should be `euclidean, manhatan, hamming, angular`." = distance %in% c("euclidean", "manhatan", "hamming", "angular")) } + stopifnot("Algorithm should be `nnd, hnsw, annoy, lsh, kd`." = + ann %in% c("nnd", "hnsw", "annoy", "lsh", "kd")) + if (!is.null(y)) { deduplication <- FALSE y_default <- FALSE @@ -167,15 +170,15 @@ blocking <- function(x, if (!is.null(true_blocks)) { - stopifnot("`true_blocks` should be a data.frame" = is.data.frame(true_blocks)) + stopifnot("`true_blocks` should be a data.frame." = is.data.frame(true_blocks)) if (deduplication == FALSE) { - stopifnot("`true blocks` should be a data.frame with columns: x, y, block" = + stopifnot("`true blocks` should be a data.frame with columns: x, y, block." = length(colnames(true_blocks)) == 3, all(colnames(true_blocks) == c("x", "y", "block"))) } if (deduplication) { - stopifnot("`true blocks` should be a data.frame with columns: x, block" = + stopifnot("`true blocks` should be a data.frame with columns: x, block." = length(colnames(true_blocks)) == 2, all(colnames(true_blocks) == c("x", "block"))) } diff --git a/R/est_block_error.R b/R/est_block_error.R index 38d5d4a..2d33194 100644 --- a/R/est_block_error.R +++ b/R/est_block_error.R @@ -1,5 +1,6 @@ #' @importFrom stats dpois #' @importFrom stats runif +#' @importFrom stats AIC #' #' @title Estimate errors due to blocking in record linkage #' @@ -11,25 +12,34 @@ #' @param x Reference data (required if `n` and `N` are not provided). #' @param y Query data (required if `n` is not provided). #' @param blocking_result `data.frame` or `data.table` containing blocking results (required if `n` is not provided). +#' It must contain a column named `y` storing the indices of the records in the query data set. #' @param n Integer vector of numbers of accepted pairs formed by each record in the query data set #' with records in the reference data set, based on blocking criteria (if `NULL`, derived from `blocking_result`). #' @param N Total number of records in the reference data set (if `NULL`, derived as `length(x)`). -#' @param G Number of classes in the finite mixture model. +#' @param G Integer or vector of integers. Number of classes in the finite mixture model. +#' If `G` is a vector, the optimal number of classes is selected from the provided values +#' based on the Akaike Information Criterion (AIC). #' @param alpha Numeric vector of initial class proportions (length `G`; if `NULL`, initialized as `rep(1/G, G)`). #' @param p Numeric vector of initial matching probabilities in each class of the mixture model -#' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.5, 1)`). +#' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.5, 1)` or `rep(runif(1, 0.5, 1), G)`, +#' depending on the parameter `equal_p`). #' @param lambda Numeric vector of initial Poisson distribution parameters for non-matching records in each class of the mixture model #' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.1, 2)`). -#' @param tol Convergence tolerance for the EM algorithm (default `10^(-6)`). -#' @param maxiter Maximum number of iterations for the EM algorithm (default `1000`). +#' @param equal_p Logical, indicating whether the matching probabilities +#' `p` should be constrained to be equal across all latent classes (default `FALSE`). +#' @param tol Convergence tolerance for the EM algorithm (default `10^(-4)`). +#' @param maxiter Maximum number of iterations for the EM algorithm (default `100`). #' @param sample_size Bootstrap sample (from `n`) size used for calculations (if `NULL`, uses all data). #' #' @details -#' Consider a large finite population that comprises of \eqn{N} individuals, and two duplicate-free data sources: a register and a file. +#' Consider a large finite population that comprises of \eqn{N} individuals, and two duplicate-free data sources: +#' a register (reference data `x`) and a file (query data `y`). #' Assume that the register has no undercoverage, -#' i.e. each record from the file corresponds to exactly one record from the same individual in the register. +#' i.e., each record from the file corresponds to exactly one record from the same individual in the register. #' Let \eqn{n_i} denote the number of register records which form an accepted (by the blocking criteria) pair with -#' record \eqn{i} on the file. Assume that:\cr +#' record \eqn{i} on the file, for \eqn{i=1,2,\ldots,m}, where \eqn{m} is the number of records in the file. +#' Let \eqn{v_i} denote record \eqn{i} from the file. +#' Assume that:\cr #' \itemize{ #' \item two matched records are neighbours with a probability that is bounded away from \eqn{0} regardless of \eqn{N}, #' \item two unmatched records are accidental neighbours with a probability of \eqn{O(\frac{1}{N})}. @@ -71,14 +81,20 @@ #' } #' where \eqn{E[p(v_i)] = \sum_{g=1}^G\alpha_gp_g} and \eqn{E[\lambda(v_i)] = \sum_{g=1}^G\alpha_g\lambda_g}. #' +#' @note +#' The matching probabilities \eqn{p_g} can be constrained to be equal across all latent classes +#' by setting `equal_p = TRUE`. #' -#' -#' @returns Returns a list containing:\cr +#' @returns Returns an object of class `est_block_error`, with a list containing:\cr #' \itemize{ #' \item{`FPR` -- estimated false positive rate,} #' \item{`FNR` -- estimated false negative rate,} +#' \item{`G` -- number of classes used in the optimal model,} +#' \item{`log_lik` -- final log-likelihood value,} +#' \item{`equal_p` -- logical, indicating whether the matching probabilities were constrained,} #' \item{`iter` -- number of the EM algorithm iterations performed,} -#' \item{`convergence` -- logical, indicating whether the EM algorithm converged within `maxiter` iterations.} +#' \item{`convergence` -- logical, indicating whether the EM algorithm converged within `maxiter` iterations,} +#' \item{`AIC` -- Akaike Information Criterion value in the optimal model.} #' } #' #' @references @@ -92,15 +108,15 @@ #' ## an example proposed by Dasylva and Goussanou (2021) #' ## we obtain results very close to those reported in the paper #' -#' set.seed(111) +#' set.seed(11) #' #' neighbors <- rep(0:5, c(1659, 53951, 6875, 603, 62, 5)) #' #' errors <- est_block_error(n = neighbors, #' N = 63155, -#' G = 2, +#' G = 1:3, #' tol = 10^(-3), -#' maxiter = 50) +#' equal_p = TRUE) #' #' errors #' @@ -114,6 +130,7 @@ est_block_error <- function(x = NULL, alpha = NULL, p = NULL, lambda = NULL, + equal_p = FALSE, tol = 10^(-4), maxiter = 100, sample_size = NULL) { @@ -135,6 +152,29 @@ est_block_error <- function(x = NULL, n <- sample(n, size = sample_size, replace = TRUE) } + if (length(G) > 1) { + + G_cand <- sort(G) + results_list <- list() + aic_values <- numeric(length(G_cand)) + + for (i in seq_along(G_cand)) { + + fit <- est_block_error(n = n, N = N, G = G_cand[i], + alpha = NULL, p = NULL, lambda = NULL, + equal_p = equal_p, tol = tol, maxiter = maxiter) + results_list[[i]] <- fit + aic_values[i] <- fit$AIC + + } + + best_idx <- which.min(aic_values) + best_model <- results_list[[best_idx]] + + return(best_model) + + } + convergence <- FALSE m <- length(n) @@ -143,7 +183,13 @@ est_block_error <- function(x = NULL, } if (is.null(p)) { - p <- runif(G, min = 0.5, max = 1) + if (equal_p) { + p <- rep(runif(1, min = 0.5, max = 1), G) + } else { + p <- runif(G, min = 0.5, max = 1) + } + } else if (equal_p && length(p) == G) { + p <- rep(mean(p), G) } if (is.null(lambda)) { @@ -192,7 +238,11 @@ est_block_error <- function(x = NULL, ## M alpha <- 1 / m * colSums(probs_c_n) - p <- colSums(E_c_n_M) / (m * alpha) + if (equal_p) { + p <- rep(sum(E_c_n_M) / m, G) + } else { + p <- colSums(E_c_n_M) / (m * alpha) + } lambda <- colSums(E_c_n_U) / (m * alpha) ## check @@ -215,13 +265,20 @@ est_block_error <- function(x = NULL, FNR <- 1 - sum(alpha * p) FPR <- sum(alpha * lambda) / (N - 1) - return(structure( + res <- structure( list( - FPR = FPR, - FNR = FNR, - iter = l, - convergence = convergence - ), - class = "est_block_error")) + FPR = FPR, + FNR = FNR, + G = G, + log_lik = log_lik_new, + equal_p = equal_p, + iter = l, + convergence = convergence + ), + class = "est_block_error") + + res$AIC <- AIC(res) + + return(res) } diff --git a/R/methods.R b/R/methods.R index 0815c90..310f482 100644 --- a/R/methods.R +++ b/R/methods.R @@ -34,8 +34,9 @@ print.blocking <- function(x, ...) { #' @exportS3Method print.est_block_error <- function(x, ...) { - cat("FPR: ", x$FPR, "\n") - cat("FNR: ", x$FNR, "\n") + cat("Estimated FPR: ", x$FPR, "\n") + cat("Estimated FNR: ", x$FNR, "\n") + cat("Number of classes in the model: ", x$G, "\n") cat("========================================================\n") @@ -46,3 +47,20 @@ print.est_block_error <- function(x, ...) { } } +#' @method logLik est_block_error +#' @exportS3Method +logLik.est_block_error <- function(object, ...) { + + val <- object$log_lik + if (object$equal_p) { + k <- 2 * object$G + } else { + k <- 3 * object$G - 1 + } + + attr(val, "df") <- k + class(val) <- "logLik" + + val + +} diff --git a/data/RLdata500.rda b/data/RLdata500.rda index 4190fe2..c23fa9b 100644 Binary files a/data/RLdata500.rda and b/data/RLdata500.rda differ diff --git a/data/census.rda b/data/census.rda index bd45fdb..ed6ab8d 100644 Binary files a/data/census.rda and b/data/census.rda differ diff --git a/data/cis.rda b/data/cis.rda index b67cbea..7b604ae 100644 Binary files a/data/cis.rda and b/data/cis.rda differ diff --git a/data/foreigners.rda b/data/foreigners.rda index a7e0944..a6b624a 100644 Binary files a/data/foreigners.rda and b/data/foreigners.rda differ diff --git a/man/est_block_error.Rd b/man/est_block_error.Rd index c327dbe..324c793 100644 --- a/man/est_block_error.Rd +++ b/man/est_block_error.Rd @@ -14,6 +14,7 @@ est_block_error( alpha = NULL, p = NULL, lambda = NULL, + equal_p = FALSE, tol = 10^(-4), maxiter = 100, sample_size = NULL @@ -24,36 +25,47 @@ est_block_error( \item{y}{Query data (required if \code{n} is not provided).} -\item{blocking_result}{\code{data.frame} or \code{data.table} containing blocking results (required if \code{n} is not provided).} +\item{blocking_result}{\code{data.frame} or \code{data.table} containing blocking results (required if \code{n} is not provided). +It must contain a column named \code{y} storing the indices of the records in the query data set.} \item{n}{Integer vector of numbers of accepted pairs formed by each record in the query data set with records in the reference data set, based on blocking criteria (if \code{NULL}, derived from \code{blocking_result}).} \item{N}{Total number of records in the reference data set (if \code{NULL}, derived as \code{length(x)}).} -\item{G}{Number of classes in the finite mixture model.} +\item{G}{Integer or vector of integers. Number of classes in the finite mixture model. +If \code{G} is a vector, the optimal number of classes is selected from the provided values +based on the Akaike Information Criterion (AIC).} \item{alpha}{Numeric vector of initial class proportions (length \code{G}; if \code{NULL}, initialized as \code{rep(1/G, G)}).} \item{p}{Numeric vector of initial matching probabilities in each class of the mixture model -(length \code{G}; if \code{NULL}, randomly initialized from \code{runif(G, 0.5, 1)}).} +(length \code{G}; if \code{NULL}, randomly initialized from \code{runif(G, 0.5, 1)} or \code{rep(runif(1, 0.5, 1), G)}, +depending on the parameter \code{equal_p}).} \item{lambda}{Numeric vector of initial Poisson distribution parameters for non-matching records in each class of the mixture model (length \code{G}; if \code{NULL}, randomly initialized from \code{runif(G, 0.1, 2)}).} -\item{tol}{Convergence tolerance for the EM algorithm (default \code{10^(-6)}).} +\item{equal_p}{Logical, indicating whether the matching probabilities +\code{p} should be constrained to be equal across all latent classes (default \code{FALSE}).} -\item{maxiter}{Maximum number of iterations for the EM algorithm (default \code{1000}).} +\item{tol}{Convergence tolerance for the EM algorithm (default \code{10^(-4)}).} + +\item{maxiter}{Maximum number of iterations for the EM algorithm (default \code{100}).} \item{sample_size}{Bootstrap sample (from \code{n}) size used for calculations (if \code{NULL}, uses all data).} } \value{ -Returns a list containing:\cr +Returns an object of class \code{est_block_error}, with a list containing:\cr \itemize{ \item{\code{FPR} -- estimated false positive rate,} \item{\code{FNR} -- estimated false negative rate,} +\item{\code{G} -- number of classes used in the optimal model,} +\item{\code{log_lik} -- final log-likelihood value,} +\item{\code{equal_p} -- logical, indicating whether the matching probabilities were constrained,} \item{\code{iter} -- number of the EM algorithm iterations performed,} -\item{\code{convergence} -- logical, indicating whether the EM algorithm converged within \code{maxiter} iterations.} +\item{\code{convergence} -- logical, indicating whether the EM algorithm converged within \code{maxiter} iterations,} +\item{\code{AIC} -- Akaike Information Criterion value in the optimal model.} } } \description{ @@ -62,11 +74,14 @@ as proposed by Dasylva and Goussanou (2021). Assumes duplicate-free data sources complete coverage of the reference data set and blocking decisions based solely on record pairs. } \details{ -Consider a large finite population that comprises of \eqn{N} individuals, and two duplicate-free data sources: a register and a file. +Consider a large finite population that comprises of \eqn{N} individuals, and two duplicate-free data sources: +a register (reference data \code{x}) and a file (query data \code{y}). Assume that the register has no undercoverage, -i.e. each record from the file corresponds to exactly one record from the same individual in the register. +i.e., each record from the file corresponds to exactly one record from the same individual in the register. Let \eqn{n_i} denote the number of register records which form an accepted (by the blocking criteria) pair with -record \eqn{i} on the file. Assume that:\cr +record \eqn{i} on the file, for \eqn{i=1,2,\ldots,m}, where \eqn{m} is the number of records in the file. +Let \eqn{v_i} denote record \eqn{i} from the file. +Assume that:\cr \itemize{ \item two matched records are neighbours with a probability that is bounded away from \eqn{0} regardless of \eqn{N}, \item two unmatched records are accidental neighbours with a probability of \eqn{O(\frac{1}{N})}. @@ -108,19 +123,23 @@ As \eqn{N \to \infty}, the error rates and the model parameters are related as f } where \eqn{E[p(v_i)] = \sum_{g=1}^G\alpha_gp_g} and \eqn{E[\lambda(v_i)] = \sum_{g=1}^G\alpha_g\lambda_g}. } +\note{ +The matching probabilities \eqn{p_g} can be constrained to be equal across all latent classes +by setting \code{equal_p = TRUE}. +} \examples{ ## an example proposed by Dasylva and Goussanou (2021) ## we obtain results very close to those reported in the paper -set.seed(111) +set.seed(11) neighbors <- rep(0:5, c(1659, 53951, 6875, 603, 62, 5)) errors <- est_block_error(n = neighbors, N = 63155, - G = 2, + G = 1:3, tol = 10^(-3), - maxiter = 50) + equal_p = TRUE) errors diff --git a/vignettes/v1-deduplication.Rmd b/vignettes/v1-deduplication.Rmd index b92967b..92f4846 100644 --- a/vignettes/v1-deduplication.Rmd +++ b/vignettes/v1-deduplication.Rmd @@ -21,6 +21,7 @@ knitr::opts_chunk$set( message = FALSE, comment = "#>" ) +data.table::setDTthreads(1) ``` # Setup @@ -36,6 +37,7 @@ Read the `RLdata500` data (taken from the [RecordLinkage](https://CRAN.R-project ```{r} data(RLdata500) +setDT(RLdata500) head(RLdata500) ``` This dataset contains `r nrow(RLdata500)` rows with `r NROW(unique(RLdata500$ent_id))` entities. diff --git a/vignettes/v2-reclin.Rmd b/vignettes/v2-reclin.Rmd index 193bd1d..034099f 100644 --- a/vignettes/v2-reclin.Rmd +++ b/vignettes/v2-reclin.Rmd @@ -23,6 +23,7 @@ knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) +data.table::setDTthreads(1) ``` # Setup @@ -58,6 +59,8 @@ all records in the cis). ```{r} data(census) data(cis) +setDT(census) +setDT(cis) ``` + `census` object has `r nrow(census)` rows and `r ncol(census)` columns, diff --git a/vignettes/v3-integration.Rmd b/vignettes/v3-integration.Rmd index 1b0300f..eca4f63 100644 --- a/vignettes/v3-integration.Rmd +++ b/vignettes/v3-integration.Rmd @@ -22,12 +22,14 @@ knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) +data.table::setDTthreads(1) ``` # Setup ```{r setup} library(blocking) +library(data.table) library(reclin2) ``` @@ -38,6 +40,8 @@ In the example we will use the same dataset as in the *Blocking records for reco ```{r} data(census) data(cis) +setDT(census) +setDT(cis) census[, x:=1:.N] cis[, y:=1:.N] ```