1 change: 1 addition & 0 deletions .gitignore
@@ -8,3 +8,4 @@ inst/doc
misc
vignettes/.*R
vignettes/.*html
.DS_Store
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Package: blocking
Type: Package
Title: Various Blocking Methods for Entity Resolution
Version: 1.0.1
Version: 1.0.2
Authors@R:
c(person(given = "Maciej",
family = "Beręsewicz",
@@ -19,7 +19,7 @@ LazyData: true
URL: https://github.com/ncn-foreigners/blocking, https://ncn-foreigners.ue.poznan.pl/blocking/
BugReports: https://github.com/ncn-foreigners/blocking/issues
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
RoxygenNote: 7.3.3
Imports:
text2vec,
tokenizers,
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

S3method(logLik,est_block_error)
S3method(print,blocking)
S3method(print,est_block_error)
export(blocking)
@@ -35,6 +36,7 @@ importFrom(mlpack,lsh)
importFrom(readr,read_table)
importFrom(rnndescent,rnnd_build)
importFrom(rnndescent,rnnd_query)
importFrom(stats,AIC)
importFrom(stats,dist)
importFrom(stats,dpois)
importFrom(stats,runif)
4 changes: 4 additions & 0 deletions NEWS.md
@@ -1,5 +1,9 @@
# development

# version 1.0.2

+ Updated `est_block_error()`: added AIC-based selection of the number of mixture classes `G`, an `equal_p` option constraining the matching probabilities to be equal across classes, and a `logLik()` method for the fitted object.

# version 1.0.1

+ Fixed CRAN errors.
19 changes: 11 additions & 8 deletions R/blocking.R
@@ -127,16 +127,16 @@ blocking <- function(x,
"lsh" = NULL,
"kd" = NULL)

stopifnot("Only character, dense or sparse (dgCMatrix) matrix x is supported" =
stopifnot("Only character, dense or sparse (dgCMatrix) matrix x is supported." =
is.character(x) | is.matrix(x) | inherits(x, "Matrix"))


if (!is.null(ann_write)) {
stopifnot("Path provided in the `ann_write` is incorrect" = file.exists(ann_write) )
stopifnot("Path provided in the `ann_write` is incorrect." = file.exists(ann_write) )
}

if (ann == "nnd") {
stopifnot("Distance for NND should be `euclidean, cosine, manhatan, hamming`" =
stopifnot("Distance for NND should be `euclidean, cosine, manhatan, hamming`." =
distance %in% c("euclidean", "cosine","manhatan", "hamming"))
}

@@ -145,15 +145,18 @@ blocking <- function(x,
}

if (ann == "hnsw") {
stopifnot("Distance for HNSW should be `l2, euclidean, cosine, ip`" =
stopifnot("Distance for HNSW should be `l2, euclidean, cosine, ip`." =
distance %in% c("l2", "euclidean", "cosine", "ip"))
}

if (ann == "annoy") {
stopifnot("Distance for Annoy should be `euclidean, manhatan, hamming, angular`" =
stopifnot("Distance for Annoy should be `euclidean, manhatan, hamming, angular`." =
distance %in% c("euclidean", "manhatan", "hamming", "angular"))
}

stopifnot("Algorithm should be `nnd, hnsw, annoy, lsh, kd`." =
ann %in% c("nnd", "hnsw", "annoy", "lsh", "kd"))

if (!is.null(y)) {
deduplication <- FALSE
y_default <- FALSE
Expand All @@ -167,15 +170,15 @@ blocking <- function(x,

if (!is.null(true_blocks)) {

stopifnot("`true_blocks` should be a data.frame" = is.data.frame(true_blocks))
stopifnot("`true_blocks` should be a data.frame." = is.data.frame(true_blocks))

if (deduplication == FALSE) {
stopifnot("`true blocks` should be a data.frame with columns: x, y, block" =
stopifnot("`true blocks` should be a data.frame with columns: x, y, block." =
length(colnames(true_blocks)) == 3,
all(colnames(true_blocks) == c("x", "y", "block")))
}
if (deduplication) {
stopifnot("`true blocks` should be a data.frame with columns: x, block" =
stopifnot("`true blocks` should be a data.frame with columns: x, block." =
length(colnames(true_blocks)) == 2,
all(colnames(true_blocks) == c("x", "block")))
}
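A minimal sketch (editor's illustration, not part of this diff) of how the newly added algorithm check is expected to surface; the toy vector and the named-argument call are hypothetical and only assume that `blocking()` accepts `x` and `ann` as used in the hunks above.

## sketch only: an unsupported `ann` should now be caught by the added stopifnot()
library(blocking)
x_toy <- c("john smith", "jon smyth", "anna kowalska")
try(blocking(x = x_toy, ann = "kdtree"))
## supported values remain: "nnd", "hnsw", "annoy", "lsh", "kd"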
101 changes: 79 additions & 22 deletions R/est_block_error.R
@@ -1,5 +1,6 @@
#' @importFrom stats dpois
#' @importFrom stats runif
#' @importFrom stats AIC
#'
#' @title Estimate errors due to blocking in record linkage
#'
@@ -11,25 +12,34 @@
#' @param x Reference data (required if `n` and `N` are not provided).
#' @param y Query data (required if `n` is not provided).
#' @param blocking_result `data.frame` or `data.table` containing blocking results (required if `n` is not provided).
#' It must contain a column named `y` storing the indices of the records in the query data set.
#' @param n Integer vector of numbers of accepted pairs formed by each record in the query data set
#' with records in the reference data set, based on blocking criteria (if `NULL`, derived from `blocking_result`).
#' @param N Total number of records in the reference data set (if `NULL`, derived as `length(x)`).
#' @param G Number of classes in the finite mixture model.
#' @param G Integer or vector of integers. Number of classes in the finite mixture model.
#' If `G` is a vector, the optimal number of classes is selected from the provided values
#' based on the Akaike Information Criterion (AIC).
#' @param alpha Numeric vector of initial class proportions (length `G`; if `NULL`, initialized as `rep(1/G, G)`).
#' @param p Numeric vector of initial matching probabilities in each class of the mixture model
#' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.5, 1)`).
#' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.5, 1)` or `rep(runif(1, 0.5, 1), G)`,
#' depending on the parameter `equal_p`).
#' @param lambda Numeric vector of initial Poisson distribution parameters for non-matching records in each class of the mixture model
#' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.1, 2)`).
#' @param tol Convergence tolerance for the EM algorithm (default `10^(-6)`).
#' @param maxiter Maximum number of iterations for the EM algorithm (default `1000`).
#' @param equal_p Logical, indicating whether the matching probabilities
#' `p` should be constrained to be equal across all latent classes (default `FALSE`).
#' @param tol Convergence tolerance for the EM algorithm (default `10^(-4)`).
#' @param maxiter Maximum number of iterations for the EM algorithm (default `100`).
#' @param sample_size Size of the bootstrap sample drawn (with replacement) from `n` and used in the calculations (if `NULL`, all data are used).
#'
#' @details
#' Consider a large finite population that comprises of \eqn{N} individuals, and two duplicate-free data sources: a register and a file.
#' Consider a large finite population that comprises \eqn{N} individuals, and two duplicate-free data sources:
#' a register (reference data `x`) and a file (query data `y`).
#' Assume that the register has no undercoverage,
#' i.e. each record from the file corresponds to exactly one record from the same individual in the register.
#' i.e., each record from the file corresponds to exactly one record from the same individual in the register.
#' Let \eqn{n_i} denote the number of register records which form an accepted (by the blocking criteria) pair with
#' record \eqn{i} on the file. Assume that:\cr
#' record \eqn{i} on the file, for \eqn{i=1,2,\ldots,m}, where \eqn{m} is the number of records in the file.
#' Let \eqn{v_i} denote record \eqn{i} from the file.
#' Assume that:\cr
#' \itemize{
#' \item two matched records are neighbours with a probability that is bounded away from \eqn{0} regardless of \eqn{N},
#' \item two unmatched records are accidental neighbours with a probability of \eqn{O(\frac{1}{N})}.
@@ -71,14 +81,20 @@
#' }
#' where \eqn{E[p(v_i)] = \sum_{g=1}^G\alpha_gp_g} and \eqn{E[\lambda(v_i)] = \sum_{g=1}^G\alpha_g\lambda_g}.
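#'
#' For illustration only (hypothetical values, not estimates produced by the package):
#' with \eqn{G = 2}, \eqn{\alpha = (0.9, 0.1)}, \eqn{p = (0.95, 0.80)} and \eqn{\lambda = (0.2, 1.5)},
#' the estimated error rates become
#' \eqn{FNR = 1 - E[p(v_i)] = 1 - (0.9 \cdot 0.95 + 0.1 \cdot 0.80) = 0.065} and
#' \eqn{FPR = E[\lambda(v_i)]/(N - 1) = (0.9 \cdot 0.2 + 0.1 \cdot 1.5)/(N - 1) = 0.33/(N - 1)}.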
#'
#' @note
#' The matching probabilities \eqn{p_g} can be constrained to be equal across all latent classes
#' by setting `equal_p = TRUE`.
#'
#'
#' @returns Returns a list containing:\cr
#' @returns Returns an object of class `est_block_error`, with a list containing:\cr
#' \itemize{
#' \item{`FPR` -- estimated false positive rate,}
#' \item{`FNR` -- estimated false negative rate,}
#' \item{`G` -- number of classes used in the optimal model,}
#' \item{`log_lik` -- final log-likelihood value,}
#' \item{`equal_p` -- logical, indicating whether the matching probabilities were constrained,}
#' \item{`iter` -- number of the EM algorithm iterations performed,}
#' \item{`convergence` -- logical, indicating whether the EM algorithm converged within `maxiter` iterations.}
#' \item{`convergence` -- logical, indicating whether the EM algorithm converged within `maxiter` iterations,}
#' \item{`AIC` -- Akaike Information Criterion value in the optimal model.}
#' }
#'
#' @references
@@ -92,15 +108,15 @@
#' ## an example proposed by Dasylva and Goussanou (2021)
#' ## we obtain results very close to those reported in the paper
#'
#' set.seed(111)
#' set.seed(11)
#'
#' neighbors <- rep(0:5, c(1659, 53951, 6875, 603, 62, 5))
#'
#' errors <- est_block_error(n = neighbors,
#' N = 63155,
#' G = 2,
#' G = 1:3,
#' tol = 10^(-3),
#' maxiter = 50)
#' equal_p = TRUE)
#'
#' errors
#'
@@ -114,6 +130,7 @@ est_block_error <- function(x = NULL,
alpha = NULL,
p = NULL,
lambda = NULL,
equal_p = FALSE,
tol = 10^(-4),
maxiter = 100,
sample_size = NULL) {
@@ -135,6 +152,29 @@
n <- sample(n, size = sample_size, replace = TRUE)
}

if (length(G) > 1) {

G_cand <- sort(G)
results_list <- list()
aic_values <- numeric(length(G_cand))

for (i in seq_along(G_cand)) {

fit <- est_block_error(n = n, N = N, G = G_cand[i],
alpha = NULL, p = NULL, lambda = NULL,
equal_p = equal_p, tol = tol, maxiter = maxiter)
results_list[[i]] <- fit
aic_values[i] <- fit$AIC

}

best_idx <- which.min(aic_values)
best_model <- results_list[[best_idx]]

return(best_model)

}

convergence <- FALSE
m <- length(n)

@@ -143,7 +183,13 @@
}

if (is.null(p)) {
p <- runif(G, min = 0.5, max = 1)
if (equal_p) {
p <- rep(runif(1, min = 0.5, max = 1), G)
} else {
p <- runif(G, min = 0.5, max = 1)
}
} else if (equal_p && length(p) == G) {
p <- rep(mean(p), G)
}

if (is.null(lambda)) {
@@ -192,7 +238,11 @@
## M

alpha <- 1 / m * colSums(probs_c_n)
p <- colSums(E_c_n_M) / (m * alpha)
if (equal_p) {
p <- rep(sum(E_c_n_M) / m, G)
} else {
p <- colSums(E_c_n_M) / (m * alpha)
}
lambda <- colSums(E_c_n_U) / (m * alpha)

## check
@@ -215,13 +265,20 @@
FNR <- 1 - sum(alpha * p)
FPR <- sum(alpha * lambda) / (N - 1)

return(structure(
res <- structure(
list(
FPR = FPR,
FNR = FNR,
iter = l,
convergence = convergence
),
class = "est_block_error"))
FPR = FPR,
FNR = FNR,
G = G,
log_lik = log_lik_new,
equal_p = equal_p,
iter = l,
convergence = convergence
),
class = "est_block_error")

res$AIC <- AIC(res)

return(res)

}
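A usage sketch (editor's illustration, not part of this diff) exercising the new AIC-based selection of `G` and the extended return value; the data follow the roxygen example above.

## sketch only: fit models with G = 1, 2, 3 classes and keep the AIC-best one
library(blocking)
set.seed(11)
neighbors <- rep(0:5, c(1659, 53951, 6875, 603, 62, 5))

fit <- est_block_error(n = neighbors, N = 63155, G = 1:3,
                       tol = 10^(-3), equal_p = TRUE)

fit$G          ## number of classes selected by AIC
fit$log_lik    ## log-likelihood of the selected model
fit$AIC        ## its AIC, computed through the new logLik() method
c(fit$FPR, fit$FNR)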
22 changes: 20 additions & 2 deletions R/methods.R
@@ -34,8 +34,9 @@ print.blocking <- function(x, ...) {
#' @exportS3Method
print.est_block_error <- function(x, ...) {

cat("FPR: ", x$FPR, "\n")
cat("FNR: ", x$FNR, "\n")
cat("Estimated FPR: ", x$FPR, "\n")
cat("Estimated FNR: ", x$FNR, "\n")
cat("Number of classes in the model: ", x$G, "\n")

cat("========================================================\n")

@@ -46,3 +47,20 @@ print.est_block_error <- function(x, ...) {
}
}

#' @method logLik est_block_error
#' @exportS3Method
logLik.est_block_error <- function(object, ...) {

val <- object$log_lik
if (object$equal_p) {
k <- 2 * object$G
} else {
k <- 3 * object$G - 1
}

attr(val, "df") <- k
class(val) <- "logLik"

val

}
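A short sketch (editor's illustration, not part of this diff) of the degrees of freedom behind the AIC: with `G` classes the free parameters are `G - 1` mixing proportions, `G` matching probabilities (a single shared value when `equal_p = TRUE`) and `G` Poisson rates, i.e. `3G - 1` or `2G`, which is what this `logLik()` method reports and what `stats::AIC()` then uses.

## sketch only: df bookkeeping and the AIC identity
library(blocking)
set.seed(11)
neighbors <- rep(0:5, c(1659, 53951, 6875, 603, 62, 5))
fit <- est_block_error(n = neighbors, N = 63155, G = 2)

ll <- logLik(fit)
attr(ll, "df")                                                  ## 3 * 2 - 1 = 5
all.equal(AIC(fit), -2 * as.numeric(ll) + 2 * attr(ll, "df"))   ## TRUE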
Binary file modified data/RLdata500.rda
Binary file modified data/census.rda
Binary file modified data/cis.rda
Binary file modified data/foreigners.rda