diff --git a/.Rbuildignore b/.Rbuildignore index 7b8ddbe..a406ea5 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -9,3 +9,4 @@ playground ^docs$ ^pkgdown$ check +misc diff --git a/.Rproj.user/E3DB6272/build_options b/.Rproj.user/E3DB6272/build_options deleted file mode 100644 index 4089afd..0000000 --- a/.Rproj.user/E3DB6272/build_options +++ /dev/null @@ -1,7 +0,0 @@ -auto_roxygenize_for_build_and_reload="0" -auto_roxygenize_for_build_package="1" -auto_roxygenize_for_check="1" -live_preview_website="0" -makefile_args="" -preview_website="1" -website_output_format="all" diff --git a/.Rproj.user/E3DB6272/copilot_options b/.Rproj.user/E3DB6272/copilot_options deleted file mode 100644 index cd9e9bc..0000000 --- a/.Rproj.user/E3DB6272/copilot_options +++ /dev/null @@ -1,2 +0,0 @@ -copilot_enabled="0" -copilot_indexing_enabled="0" diff --git a/.Rproj.user/E3DB6272/pcs/files-pane.pper b/.Rproj.user/E3DB6272/pcs/files-pane.pper deleted file mode 100644 index b09a3cc..0000000 --- a/.Rproj.user/E3DB6272/pcs/files-pane.pper +++ /dev/null @@ -1,13 +0,0 @@ -{ - "sortOrder": [ - { - "columnIndex": 2, - "ascending": true - }, - { - "columnIndex": 4, - "ascending": false - } - ], - "path": "~/git/nauka/ncn-foreigners/software/blocking/inst/tinytest" -} \ No newline at end of file diff --git a/.Rproj.user/E3DB6272/pcs/source-pane.pper b/.Rproj.user/E3DB6272/pcs/source-pane.pper deleted file mode 100644 index c755c58..0000000 --- a/.Rproj.user/E3DB6272/pcs/source-pane.pper +++ /dev/null @@ -1,4 +0,0 @@ -{ - "activeTab": 2, - "activeTabSourceWindow0": 0 -} \ No newline at end of file diff --git a/.Rproj.user/E3DB6272/pcs/windowlayoutstate.pper b/.Rproj.user/E3DB6272/pcs/windowlayoutstate.pper deleted file mode 100644 index 8e07871..0000000 --- a/.Rproj.user/E3DB6272/pcs/windowlayoutstate.pper +++ /dev/null @@ -1,14 +0,0 @@ -{ - "left": { - "splitterpos": 235, - "topwindowstate": "NORMAL", - "panelheight": 768, - "windowheight": 806 - }, - "right": { - "splitterpos": 477, - "topwindowstate": "NORMAL", - "panelheight": 768, - "windowheight": 806 - } -} \ No newline at end of file diff --git a/.Rproj.user/E3DB6272/pcs/workbench-pane.pper b/.Rproj.user/E3DB6272/pcs/workbench-pane.pper deleted file mode 100644 index f398270..0000000 --- a/.Rproj.user/E3DB6272/pcs/workbench-pane.pper +++ /dev/null @@ -1,5 +0,0 @@ -{ - "TabSet1": 3, - "TabSet2": 3, - "TabZoom": {} -} \ No newline at end of file diff --git a/.Rproj.user/E3DB6272/sources/prop/67301661 b/.Rproj.user/E3DB6272/sources/prop/67301661 deleted file mode 100644 index 839e0df..0000000 --- a/.Rproj.user/E3DB6272/sources/prop/67301661 +++ /dev/null @@ -1,6 +0,0 @@ -{ - "source_window_id": "", - "Source": "Source", - "cursorPosition": "75,41", - "scrollLine": "55" -} \ No newline at end of file diff --git a/.Rproj.user/E3DB6272/sources/prop/AB62ED0C b/.Rproj.user/E3DB6272/sources/prop/AB62ED0C deleted file mode 100644 index 125adf8..0000000 --- a/.Rproj.user/E3DB6272/sources/prop/AB62ED0C +++ /dev/null @@ -1,6 +0,0 @@ -{ - "source_window_id": "", - "Source": "Source", - "cursorPosition": "9,39", - "scrollLine": "0" -} \ No newline at end of file diff --git a/.Rproj.user/E3DB6272/sources/prop/AE0C7A4A b/.Rproj.user/E3DB6272/sources/prop/AE0C7A4A deleted file mode 100644 index 17d8c10..0000000 --- a/.Rproj.user/E3DB6272/sources/prop/AE0C7A4A +++ /dev/null @@ -1,6 +0,0 @@ -{ - "source_window_id": "", - "Source": "Source", - "cursorPosition": "23,23", - "scrollLine": "0" -} \ No newline at end of file diff --git a/.Rproj.user/E3DB6272/sources/prop/B8117F7C 
b/.Rproj.user/E3DB6272/sources/prop/B8117F7C deleted file mode 100644 index f19986a..0000000 --- a/.Rproj.user/E3DB6272/sources/prop/B8117F7C +++ /dev/null @@ -1,6 +0,0 @@ -{ - "source_window_id": "", - "Source": "Source", - "cursorPosition": "17,18", - "scrollLine": "0" -} \ No newline at end of file diff --git a/.Rproj.user/E3DB6272/sources/prop/INDEX b/.Rproj.user/E3DB6272/sources/prop/INDEX deleted file mode 100644 index 67c6bfa..0000000 --- a/.Rproj.user/E3DB6272/sources/prop/INDEX +++ /dev/null @@ -1,37 +0,0 @@ -%2FUsers%2Fberenz%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FNEWS.md="5EE4624C" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2F.Rbuildignore="9CECB563" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2F.github%2Fworkflows%2Ftest-coverage.yaml="91E826D3" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2F.gitignore="0955E4E5" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FDESCRIPTION="B8117F7C" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FNAMESPACE="AE0C7A4A" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FNEWS.md="306F4DA5" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FR%2Fblock_for_reclin.R="67301661" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FR%2Fblocking.R="54DEFD94" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FR%2Fcontrols.R="36158F40" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FR%2Fhello.R="AB62ED0C" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FR%2Fmethod_annoy.R="8ADC1E6D" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FR%2Fmethod_hnsw.R="0AB1AEAE" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FR%2Fmethod_lsh.R="331179A8" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FR%2Fmethod_mlpack.R="60B3716C" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FR%2Fmethod_nnd.R="2B283ECD" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FR%2Fmethods.R="984FE5E2" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FR%2Freclin2_pair_ann.R="CB258DE3" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FREADME.Rmd="9F11C3D9" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2FREADME.md="225CD148" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2F_pkgdown.yml="131FF339" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Finst%2Ftinytest%2Ffiles%2Findex-colnames.txt="457A9029" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Finst%2Ftinytest%2Ftest_annoy.R="769DD9A0" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Finst%2Ftinytest%2Ftest_blocking.R="E20BF99F" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Finst%2Ftinytest%2Ftest_data.R="C5E30237" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Finst%2Ftinytest%2Ftest_hnsw.R="0120AD07" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Finst%2Ftinytest%2Ftest_mlpack.R="A5DC68DD" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Finst%2Ftinytest%2Ftest_print.R="1F5D2174" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Finst%2Ftinytest%2Ftest_reclin2.R="C455613E" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Fplayground%2Ftesting-package.R="B39F586D" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Ftest.txt="5026DCC6" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Ftests%2Ftinytest.R="49387E76" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Fvignettes%2Fv1-deduplication.Rmd="0DDC09E2" 
-~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Fvignettes%2Fv2-reclin.Rmd="59CC71B7" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Fvignettes%2Fv3-evaluation.Rmd="412A89DC" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Fvignettes%2Fv4-integration.Rmd="F09D0DE8" -~%2Fgit%2Fnauka%2Fncn-foreigners%2Fsoftware%2Fblocking%2Fvignettes%2Fv5-bigdata.Rmd="738E79A5" diff --git a/.Rproj.user/shared/notebooks/patch-chunk-names b/.Rproj.user/shared/notebooks/patch-chunk-names deleted file mode 100644 index e69de29..0000000 diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths index dcf341f..735ecf5 100644 --- a/.Rproj.user/shared/notebooks/paths +++ b/.Rproj.user/shared/notebooks/paths @@ -1,28 +1,20 @@ -/Users/berenz/Downloads/Symulacje - MC, MM, BP.Rmd="6F10509D" -/Users/berenz/Downloads/run_splink_benchmarks_in_ec2-0.0.3/README.md="CA7B3BF6" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/.gitignore="C912F95E" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/DESCRIPTION="019D16E4" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/controls.R="5BC637B7" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_annoy.R="684202BA" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_hnsw.R="A4FAA5A3" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_mlpack.R="B6A90565" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_nnd.R="87049873" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/methods.R="B7F84C4B" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/reclin2_pair_ann.R="1D89EE3E" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/README.Rmd="CBB944CE" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/index-colnames.txt="0350B51E" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_annoy.R="4302FC18" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_blocking.R="DABEA252" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_data.R="9D1011B0" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_hnsw.R="2E19A832" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_mlpack.R="51D2EAA1" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_print.R="AA7835F7" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_reclin2.R="E3E08D07" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_true_blocks.R="8B9CECC7" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/misc/hnsw-nndesc.Rmd="F39A0093" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/tests/tinytest.R="D6BBCDC1" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v1-deduplication.Rmd="9D34DD44" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v2-reclin.Rmd="289A4D2F" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v3-evaluation.Rmd="E778A54F" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v4-integration.Rmd="E3EFC8F1" -/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v5-bigdata.Rmd="335CBF49" +C:/Users/48607/Desktop/Grant/blocking_30_05/DESCRIPTION="BADE5BA2" +C:/Users/48607/Desktop/Grant/blocking_30_05/R/blocking.R="44766414" +C:/Users/48607/Desktop/Grant/blocking_30_05/R/controls.R="A7B8B8D2" 
+C:/Users/48607/Desktop/Grant/blocking_30_05/R/est_block_error.R="1DB588BC"
+C:/Users/48607/Desktop/Grant/blocking_30_05/R/eval.R="30BB8D2A"
+C:/Users/48607/Desktop/Grant/blocking_30_05/R/method_annoy.R="38099BF2"
+C:/Users/48607/Desktop/Grant/blocking_30_05/R/method_hnsw.R="ECE2EFAA"
+C:/Users/48607/Desktop/Grant/blocking_30_05/R/method_mlpack.R="7020DB20"
+C:/Users/48607/Desktop/Grant/blocking_30_05/R/method_nnd.R="1937EFF4"
+C:/Users/48607/Desktop/Grant/blocking_30_05/R/methods.R="2B7AFCBF"
+C:/Users/48607/Desktop/Grant/blocking_30_05/R/sentence_to_vector.R="71CCE4B2"
+C:/Users/48607/Desktop/Grant/blocking_30_05/README.md="3BCBDFED"
+C:/Users/48607/Desktop/Grant/blocking_30_05/inst/WORDLIST="5610708B"
+C:/Users/48607/Desktop/Grant/blocking_30_05/inst/tinytest/test_annoy.R="A935531D"
+C:/Users/48607/Desktop/Grant/blocking_30_05/inst/tinytest/test_hnsw.R="13EE8820"
+C:/Users/48607/Desktop/Grant/blocking_30_05/inst/tinytest/test_true_blocks.R="7D5E11CB"
+C:/Users/48607/Desktop/Grant/blocking_30_05/tests/tinytest.R="2AFE54EE"
+C:/Users/48607/Desktop/Grant/blocking_30_05/vignettes/v2-reclin.Rmd="7E043D0D"
+C:/Users/48607/Desktop/Grant/blocking_30_05/vignettes/v3-integration.Rmd="EE91B56E"
+C:/Users/48607/Desktop/Grant/blocking_test_3/vignettes/v2-reclin.Rmd="F7A0D8BC"
diff --git a/.gitignore b/.gitignore
index a83549c..cb58c8a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@ playground
 docs
 inst/doc
 misc
+vignettes/*.R
+vignettes/*.html
diff --git a/DESCRIPTION b/DESCRIPTION
index 8ff08af..c8a44d2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,35 +1,44 @@
 Package: blocking
 Type: Package
-Title: Blocking records for record linkage / entity resolution
+Title: Deduplication / Entity Resolution with Record Blocking
 Version: 0.1.0
 Authors@R:
     c(person(given = "Maciej",
              family = "Beręsewicz",
              role = c("aut", "cre"),
              email = "maciej.beresewicz@ue.poznan.pl",
-             comment = c(ORCID = "0000-0002-8281-4301")))
+             comment = c(ORCID = "0000-0002-8281-4301")),
+      person(given = "Adam",
+             family = "Struzik",
+             role = c("aut", "ctr"),
+             email = "adastr5@st.amu.edu.pl"))
 Description: An R package that uses various approximate nearest neighbours algorithms and graphs
     to block records for data deduplication / record linkage / entity resolution.
License: GPL-3 Encoding: UTF-8 LazyData: true -URL: https://github.com/ncn-foreigners/blocking, https://ncn-foreigners.github.io/blocking/ -BugReports: https://github.com/ncn-foreigners/blocking +URL: https://github.com/ncn-foreigners/blocking, https://ncn-foreigners.ue.poznan.pl/blocking/ +BugReports: https://github.com/ncn-foreigners/blocking/issues Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Imports: - text2vec, - tokenizers, - RcppHNSW, - RcppAnnoy, - mlpack, - rnndescent, - igraph, - data.table, - RcppAlgos, - methods + text2vec, + tokenizers, + RcppHNSW, + RcppAnnoy, + mlpack, + rnndescent, + igraph, + data.table, + RcppAlgos, + methods, + readr, + utils, + Matrix Suggests: tinytest, reclin2, knitr, rmarkdown VignetteBuilder: knitr +Depends: + R (>= 4.1.0) diff --git a/NAMESPACE b/NAMESPACE index 3af2e01..cb9fe15 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,11 +1,21 @@ # Generated by roxygen2: do not edit by hand S3method(print,blocking) +S3method(print,est_block_error) export(blocking) +export(control_annoy) +export(control_hnsw) +export(control_kd) +export(control_lsh) +export(control_nnd) export(controls_ann) export(controls_txt) +export(est_block_error) export(pair_ann) import(data.table) +importFrom(Matrix,colSums) +importFrom(Matrix,rowSums) +importFrom(Matrix,sparseMatrix) importFrom(RcppAlgos,comboGeneral) importFrom(RcppAnnoy,AnnoyAngular) importFrom(RcppAnnoy,AnnoyEuclidean) @@ -25,12 +35,20 @@ importFrom(igraph,make_clusters) importFrom(methods,new) importFrom(mlpack,knn) importFrom(mlpack,lsh) +importFrom(readr,read_table) importFrom(rnndescent,rnnd_build) importFrom(rnndescent,rnnd_query) +importFrom(stats,dist) +importFrom(stats,dpois) +importFrom(stats,runif) +importFrom(stats,setNames) importFrom(text2vec,create_dtm) importFrom(text2vec,create_vocabulary) importFrom(text2vec,itoken) importFrom(text2vec,itoken_parallel) +importFrom(text2vec,space_tokenizer) importFrom(text2vec,vocab_vectorizer) +importFrom(utils,download.file) importFrom(utils,setTxtProgressBar) importFrom(utils,txtProgressBar) +importFrom(utils,unzip) diff --git a/R/blocking.R b/R/blocking.R index 4f70322..be76863 100644 --- a/R/blocking.R +++ b/R/blocking.R @@ -1,4 +1,4 @@ -#' Imports +#' #' @importFrom text2vec itoken #' @importFrom text2vec itoken_parallel #' @importFrom text2vec create_vocabulary @@ -10,18 +10,25 @@ #' @importFrom igraph make_clusters #' @importFrom igraph compare #' @importFrom RcppAlgos comboGeneral +#' @importFrom stats dist +#' @importFrom readr read_table +#' @importFrom utils download.file +#' @importFrom utils unzip #' #' #' @title Block records based on text data. #' -#' @author Maciej Beręsewicz +#' @author Maciej Beręsewicz, Adam Struzik #' #' @description -#' Function creates shingles (strings with 2 characters, default), applies approximate nearest neighbour (ANN) algorithms via the [rnndescent], RcppHNSW, [RcppAnnoy] and [mlpack] packages, -#' and creates blocks using graphs via [igraph]. +#' Function creates shingles (strings with 2 characters, default) or vectors using a given model (e.g., GloVe), +#' applies approximate nearest neighbour (ANN) algorithms via the \link[rnndescent]{rnndescent}, \link[RcppHNSW]{RcppHNSW}, \link[RcppAnnoy]{RcppAnnoy} and \link[mlpack]{mlpack} packages, +#' and creates blocks using graphs via \link[igraph]{igraph}. 
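## Aside (not part of the patch): for intuition about the default representation,
## the 2-character shingles that blocking() builds can be inspected directly with
## the tokenizers package, which the function already uses internally:
library(tokenizers)
tokenize_character_shingles("jankowalski", n = 2, lowercase = TRUE, strip_non_alphanum = TRUE)
#> [[1]]
#>  [1] "ja" "an" "nk" "ko" "ow" "wa" "al" "ls" "sk" "ki"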
#' #' @param x reference data (a character vector or a matrix), #' @param y query data (a character vector or a matrix), if not provided NULL by default and thus deduplication is performed, +#' @param representation method of representing input data (possible \code{c("shingles", "vectors")}; default \code{"shingles"}), +#' @param model a matrix containing word embeddings (e.g., GloVe), required only when \code{representation = "vectors"}, #' @param deduplication whether deduplication should be applied (default TRUE as y is set to NULL), #' @param on variables for ANN search (currently not supported), #' @param on_blocking variables for blocking records before ANN search (currently not supported), @@ -29,33 +36,35 @@ #' @param distance distance metric (default \code{cosine}, more options are possible see details), #' @param ann_write writing an index to file. Two files will be created: 1) an index, 2) and text file with column names, #' @param ann_colnames file with column names if \code{x} or \code{y} are indices saved on the disk (currently not supported), -#' @param true_blocks matrix with true blocks to calculate evaluation metrics (standard metrics based on confusion matrix as well as all metrics from [igraph::compare()] are returned). +#' @param true_blocks matrix with true blocks to calculate evaluation metrics (standard metrics based on confusion matrix as well as all metrics from \link[igraph]{compare} are returned). #' @param verbose whether log should be provided (0 = none, 1 = main, 2 = ANN algorithm verbose used), #' @param graph whether a graph should be returned (default FALSE), #' @param seed seed for the algorithms (for reproducibility), #' @param n_threads number of threads used for the ANN algorithms and adding data for index and query, -#' @param control_txt list of controls for text data (passed only to [text2vec::itoken_parallel] or [text2vec::itoken]), +#' @param control_txt list of controls for text data (passed only to \link[text2vec]{itoken_parallel} or \link[text2vec]{itoken}), used only when \code{representation = "shingles"}, #' @param control_ann list of controls for the ANN algorithms. 
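## Aside (not part of the patch): control_txt and control_ann are plain lists built
## by the helper constructors introduced later in this diff, so a call setting both
## might look like this sketch (parameter values are illustrative only):
result <- blocking(x = df_example$txt,
                   ann = "hnsw",
                   control_txt = controls_txt(n_shingles = 2L),
                   control_ann = controls_ann(k_search = 10,
                                              hnsw = control_hnsw(M = 16, ef_c = 100, ef_s = 100)))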
#' -#' @returns Returns a list with containing:\cr +#' @returns Returns a list containing:\cr #' \itemize{ #' \item{\code{result} -- \code{data.table} with indices (rows) of x, y, block and distance between points} #' \item{\code{method} -- name of the ANN algorithm used,} #' \item{\code{deduplication} -- information whether deduplication was applied,} +#' \item{\code{representation} -- information whether shingles or vectors were used,} #' \item{\code{metrics} -- metrics for quality assessment, if \code{true_blocks} is provided,} +#' \item{\code{confusion} -- confusion matrix, if \code{true_blocks} is provided,} #' \item{\code{colnames} -- variable names (colnames) used for search,} #' \item{\code{graph} -- \code{igraph} class object.} #' } #' #' @examples -#' #' ## an example using RcppHNSW +#' #' df_example <- data.frame(txt = c("jankowalski", "kowalskijan", "kowalskimjan", #' "kowaljan", "montypython", "pythonmonty", "cyrkmontypython", "monty")) #' #' result <- blocking(x = df_example$txt, #' ann = "hnsw", -#' control_ann = controls_ann(hnsw = list(M = 5, ef_c = 10, ef_s = 10))) +#' control_ann = controls_ann(hnsw = control_hnsw(M = 5, ef_c = 10, ef_s = 10))) #' #' result #' @@ -65,9 +74,65 @@ #' ann = "lsh") #' #' result_lsh +#' +#' ## an example using GloVe and RcppAnnoy +#' \dontrun{ +#' options(timeout = 500) +#' utils::download.file("https://nlp.stanford.edu/data/glove.6B.zip", destfile = "glove.6B.zip") +#' utils::unzip("glove.6B.zip") +#' +#' glove_6B_50d <- readr::read_table("glove.6B.50d.txt", +#' col_names = FALSE, +#' show_col_types = FALSE) +#' data.table::setDT(glove_6B_50d) +#' +#' glove_vectors <- glove_6B_50d[,-1] +#' glove_vectors <- as.matrix(glove_vectors) +#' rownames(glove_vectors) <- glove_6B_50d$X1 +#' +#' ## spaces between words are required +#' df_example_spaces <- data.frame(txt = c("jan kowalski", "kowalski jan", "kowalskim jan", +#' "kowal jan", "monty python", "python monty", "cyrk monty python", "monty")) +#' +#' result_annoy <- blocking(x = df_example_spaces$txt, +#' ann = "annoy", +#' representation = "vectors", +#' model = glove_vectors) +#' +#' result_annoy +#' } +#' +#' ## an example with the NN descent algorithm and true blocks +#' +#' data(census) +#' data(cis) +#' +#' set.seed(2024) +#' census <- census[sample(nrow(census), floor(nrow(census) / 2)), ] +#' set.seed(2024) +#' cis <- cis[sample(nrow(cis), floor(nrow(cis) / 2)), ] +#' +#' census[, txt:=paste0(pername1, pername2, sex, +#' dob_day, dob_mon, dob_year, enumcap, enumpc)] +#' cis[, txt:=paste0(pername1, pername2, sex, +#' dob_day, dob_mon, dob_year, enumcap, enumpc)] +#' +#' matches <- merge(x = census[, .(x=1:.N, person_id)], +#' y = cis[, .(y = 1:.N, person_id)], +#' by = "person_id") +#' matches[, block:=1:.N] +#' +#' result_true_blocks <- blocking(x = census$txt, y = cis$txt, verbose = 1, +#' true_blocks = matches[, .(x, y, block)], +#' seed = 2024) +#' +#' result_true_blocks +#' #' @export blocking <- function(x, y = NULL, + representation = c("shingles", "vectors"), + model, deduplication = TRUE, on = NULL, on_blocking = NULL, @@ -84,6 +149,7 @@ blocking <- function(x, control_ann = controls_ann()) { ## defaults + if (missing(representation)) representation <- "shingles" if (missing(verbose)) verbose <- 0 if (missing(ann)) ann <- "nnd" if (missing(distance)) distance <- switch(ann, @@ -101,6 +167,15 @@ blocking <- function(x, stopifnot("Path provided in the `ann_write` is incorrect" = file.exists(ann_write) ) } + if (ann == "nnd") { + stopifnot("Distance for NND should be `euclidean, 
cosine, manhattan, hamming`" =
+                distance %in% c("euclidean", "cosine", "manhattan", "manhatan", "hamming"))
+  }
+
+  ## tolerate the common misspelling "manhatan" and normalise it
+  if ((ann == "nnd") && (distance == "manhatan")) {
+    distance <- "manhattan"
+  }
+
   if (ann == "hnsw") {
     stopifnot("Distance for HNSW should be `l2, euclidean, cosine, ip`" =
                 distance %in% c("l2", "euclidean", "cosine", "ip"))
@@ -147,46 +222,33 @@ blocking <- function(x,
 
   if (verbose %in% 1:2) cat("===== creating tokens =====\n")
 
-  ## tokens for x
-  if (.Platform$OS.type == "unix") {
-    x_tokens <- text2vec::itoken_parallel(
-      iterable = x,
-      tokenizer = function(x) tokenizers::tokenize_character_shingles(x,
-                                                                      n = control_txt$n_shingles,
-                                                                      lowercase = control_txt$lowercase,
-                                                                      strip_non_alphanum = control_txt$strip_non_alphanum),
-      n_chunks = control_txt$n_chunks,
-      progressbar = verbose)
-  } else {
-    x_tokens <- text2vec::itoken(
-      iterable = x,
-      tokenizer = function(x) tokenizers::tokenize_character_shingles(x,
-                                                                      n = control_txt$n_shingles,
-                                                                      lowercase = control_txt$lowercase,
-                                                                      strip_non_alphanum = control_txt$strip_non_alphanum),
-      n_chunks = control_txt$n_chunks,
-      progressbar = verbose)
-  }
+  ## vectors
 
-  x_voc <- text2vec::create_vocabulary(x_tokens)
-  x_vec <- text2vec::vocab_vectorizer(x_voc)
-  x_dtm <- text2vec::create_dtm(x_tokens, x_vec)
+  if (representation == "vectors") {
+    x_embeddings <- sentence_to_vector(x, model)
 
-  if (is.null(y_default)) {
-    y_dtm <- x_dtm
-  } else {
+    if (is.null(y_default)) {
+      y_embeddings <- x_embeddings
+    } else {
+      y_embeddings <- sentence_to_vector(y, model)
+    }
+  } else {
+
+    ## shingles
+
+    ## tokens for x
     if (.Platform$OS.type == "unix") {
-      y_tokens <- text2vec::itoken_parallel(
-        iterable = y,
-        tokenizer = function(x) tokenizers::tokenize_character_shingles(x,
-                                                                        n = control_txt$n_shingles,
-                                                                        lowercase = control_txt$lowercase,
-                                                                        strip_non_alphanum = control_txt$strip_non_alphanum),
-        n_chunks = control_txt$n_chunks,
-        progressbar = verbose)
+      x_tokens <- text2vec::itoken_parallel(
+        iterable = x,
+        tokenizer = function(x) tokenizers::tokenize_character_shingles(x,
+                                                                        n = control_txt$n_shingles,
+                                                                        lowercase = control_txt$lowercase,
+                                                                        strip_non_alphanum = control_txt$strip_non_alphanum),
+        n_chunks = control_txt$n_chunks,
+        progressbar = verbose)
     } else {
-      y_tokens <- text2vec::itoken(
-        iterable = y,
+      x_tokens <- text2vec::itoken(
+        iterable = x,
         tokenizer = function(x) tokenizers::tokenize_character_shingles(x,
                                                                         n = control_txt$n_shingles,
                                                                         lowercase = control_txt$lowercase,
@@ -194,55 +256,93 @@ blocking <- function(x,
         n_chunks = control_txt$n_chunks,
         progressbar
= verbose) + } + y_voc <- text2vec::create_vocabulary(y_tokens) + y_vec <- text2vec::vocab_vectorizer(y_voc) + y_dtm <- text2vec::create_dtm(y_tokens, y_vec) + + } } + + } - colnames_xy <- intersect(colnames(x_dtm), colnames(y_dtm)) + if (representation == "shingles"){ + colnames_xy <- intersect(colnames(x_dtm), colnames(y_dtm)) + } if (verbose %in% 1:2) { - cat(sprintf("===== starting search (%s, x, y: %d, %d, t: %d) =====\n", - ann, nrow(x_dtm), nrow(y_dtm), length(colnames_xy))) + + if (representation == "shingles") { + cat(sprintf("===== starting search (%s, x, y: %d, %d, t: %d) =====\n", + ann, nrow(x_dtm), nrow(y_dtm), length(colnames_xy))) + } else { + cat("===== starting search =====") + } + } x_df <- switch(ann, - "nnd" = method_nnd(x = x_dtm[, colnames_xy], - y = y_dtm[, colnames_xy], + "nnd" = method_nnd(x = if (representation == "shingles") x_dtm[, colnames_xy] else x_embeddings, + y = if (representation == "shingles") y_dtm[, colnames_xy] else y_embeddings, k = k, distance = distance, deduplication = deduplication, verbose = if (verbose == 2) TRUE else FALSE, n_threads = n_threads, control = control_ann), - "hnsw" = method_hnsw(x = x_dtm[, colnames_xy], - y = y_dtm[, colnames_xy], + "hnsw" = method_hnsw(x = if (representation == "shingles") x_dtm[, colnames_xy] else x_embeddings, + y = if (representation == "shingles") y_dtm[, colnames_xy] else y_embeddings, k = k, distance = distance, verbose = if (verbose == 2) TRUE else FALSE, n_threads = n_threads, path = ann_write, control = control_ann), - "lsh" = method_mlpack(x = x_dtm[, colnames_xy], - y = y_dtm[, colnames_xy], + "lsh" = method_mlpack(x = if (representation == "shingles") x_dtm[, colnames_xy] else x_embeddings, + y = if (representation == "shingles") y_dtm[, colnames_xy] else y_embeddings, algo = "lsh", k = k, verbose = if (verbose == 2) TRUE else FALSE, seed = seed, path = ann_write, control = control_ann), - "kd" = method_mlpack(x = x_dtm[, colnames_xy], - y = y_dtm[, colnames_xy], + "kd" = method_mlpack(x = if (representation == "shingles") x_dtm[, colnames_xy] else x_embeddings, + y = if (representation == "shingles") y_dtm[, colnames_xy] else y_embeddings, algo = "kd", k = k, verbose = if (verbose == 2) TRUE else FALSE, seed = seed, path = ann_write, control = control_ann), - "annoy" = method_annoy(x = x_dtm[, colnames_xy], - y = y_dtm[, colnames_xy], + "annoy" = method_annoy(x = if (representation == "shingles") x_dtm[, colnames_xy] else x_embeddings, + y = if (representation == "shingles") y_dtm[, colnames_xy] else y_embeddings, k = k, distance = distance, verbose = if (verbose == 2) TRUE else FALSE, @@ -255,7 +355,21 @@ blocking <- function(x, ## remove duplicated pairs - if (deduplication) x_df <- x_df[y > x] + if (deduplication) { + setDT(x_df) + setorder(x_df, x) + x_df[, "pair" := sapply(seq_len(.N), function(i) paste(sort(c(x[i], y[i])), collapse = "_"))] + x_df <- x_df[, .SD[dist == min(dist)], by = "pair"] + x_df <- x_df[, .SD[1], by = "pair"] + x_df[, "pair" := NULL] + x_df <- x_df[x != y] + } else { + # x_df <- x_df[order(dist)] + # x_df <- x_df[!duplicated(y), ] + # x_df <- x_df[!duplicated(x), ] + } + + x_df[, x := as.integer(x)] if (deduplication) { x_df[, `:=`("query_g", paste0("q", y))] @@ -268,7 +382,7 @@ blocking <- function(x, x_gr <- igraph::graph_from_data_frame(x_df[, c("query_g", "index_g")], directed = F) x_block <- igraph::components(x_gr, "weak")$membership - x_df[, `:=`(block, x_block[names(x_block) %in% x_df$query_g])] + x_df[, `:=`("block", x_block[names(x_block) %in% x_df$query_g])] 
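## Aside (not part of the patch): the statements above are the heart of the blocking
## step: candidate pairs form an undirected graph, and its weakly connected components
## become the blocks. A self-contained sketch of the same idea on toy pairs:
library(igraph)
pairs <- data.frame(query_g = c("q1", "q2", "q3"),
                    index_g = c("i1", "i1", "i2"))
g <- igraph::graph_from_data_frame(pairs, directed = FALSE)
igraph::components(g, "weak")$membership
#> q1 q2 q3 i1 i2
#>  1  1  2  1  2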
## if true are given if (!is.null(true_blocks)) { @@ -277,60 +391,78 @@ blocking <- function(x, if (!deduplication) { - pairs_to_eval <- x_df[y %in% true_blocks$y, c("x", "y", "block")] - pairs_to_eval[true_blocks, on = c("x", "y"), both := 0L] - pairs_to_eval[is.na(both), both := -1L] - - true_blocks[pairs_to_eval, on = c("x", "y"), both := 0L] - true_blocks[is.na(both), both := 1L] - true_blocks[, block:=block+max(pairs_to_eval$block)] - - pairs_to_eval <- rbind(pairs_to_eval, true_blocks[both == 1L, .(x,y,block, both)]) - - pairs_to_eval[, row_id := 1:.N] - pairs_to_eval[, x2:=x+max(y)] - - pairs_to_eval_long <- melt(pairs_to_eval[, .(y, x2, row_id, block, both)], id.vars = c("row_id", "block", "both")) - pairs_to_eval_long[both == 0L, ":="(block_id = .GRP, true_id = .GRP), block] - - block_id_max <- max(pairs_to_eval_long$block_id, na.rm = TRUE) - pairs_to_eval_long[both == -1L, block_id:= block_id_max + .GRP, row_id] - block_id_max <- max(pairs_to_eval_long$block_id, na.rm = TRUE) - pairs_to_eval_long[both == 1L & is.na(block_id), block_id := block_id_max + rleid(row_id)] - - true_id_max <- max(pairs_to_eval_long$true_id, na.rm = TRUE) - pairs_to_eval_long[both == 1L, true_id:= true_id_max + .GRP, row_id] - true_id_max <- max(pairs_to_eval_long$true_id, na.rm = TRUE) - pairs_to_eval_long[both == -1L & is.na(true_id), true_id := true_id_max + rleid(row_id)] + eval <- eval_reclin(x_df, true_blocks) + eval_metrics <- unlist(get_metrics(TP = eval$TP, + FP = eval$FP, + FN = eval$FN, + TN = eval$TN)) + confusion <- get_confusion(TP = eval$TP, + FP = eval$FP, + FN = eval$FN, + TN = eval$TN) } else { #true_blocks <- data.frame(x=1:NROW(identity.RLdata500), block = identity.RLdata500) - pairs_to_eval_long <- melt(x_df[, .(x,y,block)], id.vars = c("block")) - pairs_to_eval_long <- unique(pairs_to_eval_long[, .(block_id=block, x=value)]) - pairs_to_eval_long[true_blocks, on = "x", true_id := i.block] - - } - - candidate_pairs <- RcppAlgos::comboGeneral(nrow(pairs_to_eval_long), 2, nThreads=n_threads) + # pairs_to_eval_long <- melt(x_df[, .(x,y,block)], id.vars = c("block")) + # pairs_to_eval_long <- unique(pairs_to_eval_long[, .(block_id=block, x=value)]) + # pairs_to_eval_long[true_blocks, on = "x", true_id := i.block] - same_block <- pairs_to_eval_long$block_id[candidate_pairs[, 1]] == pairs_to_eval_long$block_id[candidate_pairs[,2]] - same_truth <- pairs_to_eval_long$true_id[candidate_pairs[,1]] == pairs_to_eval_long$true_id[candidate_pairs[,2]] + eval <- eval_dedup(x_df, true_blocks) + eval_metrics <- unlist(get_metrics(TP = eval$TP, + FP = eval$FP, + FN = eval$FN, + TN = eval$TN)) + confusion <- get_confusion(TP = eval$TP, + FP = eval$FP, + FN = eval$FN, + TN = eval$TN) - confusion <- table(same_block, same_truth) + } - fp <- confusion[2, 1] - fn <- confusion[1, 2] - tp <- confusion[2, 2] - tn <- confusion[1, 1] - recall <- tp/(fn + tp) + # if (deduplication) { + # candidate_pairs <- RcppAlgos::comboGeneral(nrow(pairs_to_eval_long), 2, nThreads=n_threads) + # + # same_block <- pairs_to_eval_long$block_id[candidate_pairs[, 1]] == pairs_to_eval_long$block_id[candidate_pairs[,2]] + # same_truth <- pairs_to_eval_long$true_id[candidate_pairs[,1]] == pairs_to_eval_long$true_id[candidate_pairs[,2]] + # + # confusion <- table(same_block, same_truth) + # + # fp <- confusion[2, 1] + # fn <- confusion[1, 2] + # tp <- confusion[2, 2] + # tn <- confusion[1, 1] + # recall <- tp/(fn + tp) + # + # eval_metrics <- c(recall = tp / (fn + tp), precision = tp / (tp + fp), + # fpr = fp / (fp + tn), fnr = fn / 
(fn + tp), + # accuracy = (tp + tn) / (tp + tn + fn + fp), + # specificity = tn / (tn + fp)) + # } + + # candidate_pairs <- RcppAlgos::comboGeneral(nrow(pairs_to_eval_long), 2, nThreads=n_threads) + # + # same_block <- pairs_to_eval_long$block_id[candidate_pairs[, 1]] == pairs_to_eval_long$block_id[candidate_pairs[,2]] + # same_truth <- pairs_to_eval_long$true_id[candidate_pairs[,1]] == pairs_to_eval_long$true_id[candidate_pairs[,2]] + # + # confusion <- table(same_block, same_truth) + # + # fp <- confusion[2, 1] + # fn <- confusion[1, 2] + # tp <- confusion[2, 2] + # tn <- confusion[1, 1] + # recall <- tp/(fn + tp) + # + # eval_metrics <- c(recall = tp / (fn + tp), precision = tp / (tp + fp), + # fpr = fp / (fp + tn), fnr = fn / (fn + tp), + # accuracy = (tp + tn) / (tp + tn + fn + fp), + # specificity = tn / (tn + fp)) - eval_metrics <- c(recall = tp / (fn + tp), precision = tp / (tp + fp), - fpr = fp / (fp + tn), fnr = fn / (fn + tp), - accuracy = (tp + tn) / (tp + tn + fn + fp), - specificity = tn / (tn + fp)) + } + if (deduplication){ + x_df[, `:=`(x = pmin(x, y), y = pmax(x, y))] } setorderv(x_df, c("x", "y", "block")) @@ -340,9 +472,10 @@ blocking <- function(x, result = x_df[, c("x", "y", "block", "dist")], method = ann, deduplication = deduplication, + representation = representation, metrics = if (is.null(true_blocks)) NULL else eval_metrics, confusion = if (is.null(true_blocks)) NULL else confusion, - colnames = colnames_xy, + colnames = if (exists("colnames_xy", where = environment())) colnames_xy else NULL, graph = if (graph) { igraph::graph_from_data_frame(x_df[, c("x", "y")], directed = F) } else NULL @@ -350,3 +483,4 @@ blocking <- function(x, class = "blocking" ) } + diff --git a/R/controls.R b/R/controls.R index f02d5b9..fd70076 100644 --- a/R/controls.R +++ b/R/controls.R @@ -1,64 +1,225 @@ +#' @title Controls for HNSW +#' +#' @description +#' Controls for HNSW algorithm used in the package (see [RcppHNSW::hnsw_build()] and [RcppHNSW::hnsw_search()] for details). +#' +#' @param M Controls the number of bi-directional links created for each element during index construction. +#' @param ef_c Size of the dynamic list used during construction. +#' @param ef_s Size of the dynamic list used during search. +#' @param grain_size Minimum amount of work to do (rows in the dataset to add) per thread. +#' @param byrow If \code{TRUE} (the default), this indicates that the items in the dataset to be indexed are stored in each row. +#' Otherwise, the items are stored in the columns of the dataset. +#' @param ... Additional arguments. +#' +#' @returns Returns a list with parameters. +#' +#' @export +control_hnsw <- function(M = 25, + ef_c = 200, + ef_s = 200, + grain_size = 1, + byrow = TRUE, + ...){ + append(list(M = M, + ef_c = ef_c, + ef_s = ef_s, + grain_size = grain_size, + byrow = byrow), + list(...)) +} + +#' @title Controls for NND +#' +#' @description +#' Controls for NND algorithm used in the package (see \link[rnndescent]{rnnd_build} and \link[rnndescent]{rnnd_query} for details). +#' +#' @param k_build Number of nearest neighbors to build the index for. +#' @param use_alt_metric If \code{TRUE}, use faster metrics that maintain the ordering of distances internally (e.g. squared Euclidean distances if using \code{metric = "euclidean"}), +#' then apply a correction at the end. +#' @param init Name of the initialization strategy or initial data neighbor graph to optimize. +#' @param n_trees The number of trees to use in the RP forest. +#' Only used if \code{init = "tree"}. 
+#' @param leaf_size The maximum number of items that can appear in a leaf. +#' Only used if \code{init = "tree"}. +#' @param max_tree_depth The maximum depth of the tree to build (default = 200). +#' Only used if \code{init = "tree"}. +#' @param margin A character string specifying the method used to assign points to one side of the hyperplane or the other. +#' @param n_iters Number of iterations of nearest neighbor descent to carry out. +#' @param delta The minimum relative change in the neighbor graph allowed before early stopping. Should be a value between 0 and 1. The smaller the value, the smaller the amount of progress between iterations is allowed. +#' @param max_candidates Maximum number of candidate neighbors to try for each item in each iteration. +#' @param low_memory If \code{TRUE}, use a lower memory, but more computationally expensive approach to index construction. If set to \code{FALSE}, you should see a noticeable speed improvement, especially when using a smaller number of threads, so this is worth trying if you have the memory to spare. +#' @param n_search_trees The number of trees to keep in the search forest as part of index preparation. The default is 1. +#' @param pruning_degree_multiplier How strongly to truncate the final neighbor list for each item. +#' @param diversify_prob The degree of diversification of the search graph by removing unnecessary edges through occlusion pruning. +#' @param weight_by_degree If \code{TRUE}, then candidates for the local join are weighted according to their in-degree, +#' so that if there are more than \code{max_candidates} in a candidate list, candidates with a smaller degree are favored for retention. +#' @param prune_reverse If \code{TRUE}, prune the reverse neighbors of each item before the reverse graph diversification step using \code{pruning_degree_multiplier}. +#' @param progress Determines the type of progress information logged during the nearest neighbor descent stage. +#' @param obs set to \code{C} to indicate that the input data orientation stores each observation as a column. +#' The default \code{R} means that observations are stored in each row. +#' @param max_search_fraction Maximum fraction of the reference data to search. +#' @param epsilon Controls trade-off between accuracy and search cost. +#' @param ... Additional arguments. +#' +#' @returns Returns a list with parameters. 
+#' +#' @export +control_nnd <- function(k_build = 30, + use_alt_metric = FALSE, + init = "tree", + n_trees = NULL, + leaf_size = NULL, + max_tree_depth = 200, + margin = "auto", + n_iters = NULL, + delta = 0.001, + max_candidates = NULL, + low_memory = TRUE, + n_search_trees = 1, + pruning_degree_multiplier = 1.5, + diversify_prob = 1, + weight_by_degree = FALSE, + prune_reverse = FALSE, + progress = "bar", + obs = "R", + max_search_fraction = 1, + epsilon = 0.1, + ...){ + append(list(k_build = k_build, + use_alt_metric = use_alt_metric, + init = init, + n_trees = n_trees, + leaf_size = leaf_size, + max_tree_depth = max_tree_depth, + margin = margin, + n_iters = n_iters, + delta = delta, + max_candidates = max_candidates, + low_memory = low_memory, + n_search_trees = n_search_trees, + pruning_degree_multiplier = pruning_degree_multiplier, + diversify_prob = diversify_prob, + weight_by_degree = weight_by_degree, + prune_reverse = prune_reverse, + progress = progress, + obs = obs, + max_search_fraction = max_search_fraction, + epsilon = epsilon), + list(...)) +} + +#' @title Controls for LSH +#' +#' @description +#' Controls for LSH algorithm used in the package (see \link[mlpack]{lsh} for details). +#' +#' @param bucket_size The size of a bucket in the second level hash. +#' @param hash_width The hash width for the first-level hashing in the LSH preprocessing. +#' @param num_probes Number of additional probes for multiprobe LSH. +#' @param projections The number of hash functions for each table. +#' @param tables The number of hash tables to be used. +#' @param ... Additional arguments. +#' +#' @returns Returns a list with parameters. +#' +#' @export +control_lsh <- function(bucket_size = 10, + hash_width = 6, + num_probes = 5, + projections = 10, + tables = 30, + ...){ + append(list(bucket_size = bucket_size, + hash_width = hash_width, + num_probes = num_probes, + projections = projections, + tables = tables), + list(...)) +} + +#' @title Controls for Annoy +#' +#' @description +#' Controls for Annoy algorithm used in the package (see \link[RcppAnnoy]{RcppAnnoy} for details). +#' +#' @param n_trees An integer specifying the number of trees to build in the Annoy index. +#' @param build_on_disk A logical value indicating whether to build the Annoy index on disk instead of in memory. +#' @param ... Additional arguments. +#' +#' @returns Returns a list with parameters. +#' +#' @export +control_annoy <- function(n_trees = 250, + build_on_disk = FALSE, + ...){ + append(list(n_trees = n_trees, + build_on_disk = build_on_disk), + list(...)) +} + +#' @title Controls for KD +#' +#' @description +#' Controls for KD algorithm used in the package (see \link[mlpack]{knn} for details). +#' +#' @param algorithm Type of neighbor search: \code{'naive'}, \code{'single_tree'}, \code{'dual_tree'}, \code{'greedy'}. +#' @param epsilon If specified, will do approximate nearest neighbor search with given relative error. +#' @param leaf_size Leaf size for tree building +#' (used for kd-trees, vp trees, random projection trees, UB trees, R trees, R* trees, X trees, Hilbert R trees, R+ trees, R++ trees, spill trees, and octrees). +#' @param random_basis Before tree-building, project the data onto a random orthogonal basis. +#' @param rho Balance threshold (only valid for spill trees). +#' @param tau Overlapping size (only valid for spill trees). 
+#' @param tree_type Type of tree to use: \code{'kd'}, \code{'vp'}, \code{'rp'}, \code{'max-rp'}, \code{'ub'}, \code{'cover'}, \code{'r'}, \code{'r-star'}, +#' \code{'x'}, \code{'ball'}, \code{'hilbert-r'}, \code{'r-plus'}, \code{'r-plus-plus'}, \code{'spill'}, \code{'oct'}. +#' @param ... Additional arguments. +#' +#' @returns Returns a list with parameters. +#' +#' @export +control_kd <- function(algorithm = "dual_tree", + epsilon = 0, + leaf_size = 20, + random_basis = FALSE, + rho = 0.7, + tau = 0, + tree_type = "kd", + ...){ + append(list(algorithm = algorithm, + epsilon = epsilon, + leaf_size = leaf_size, + random_basis = random_basis, + rho = rho, + tau = tau, + tree_type = tree_type), + list(...)) +} + #' @title Controls for approximate nearest neighbours algorithms #' #' @author Maciej Beręsewicz #' #' @description -#' Controls for ANN algorithms used in the package +#' Controls for ANN algorithms used in the package. #' #' @param sparse whether sparse data should be used as an input for algorithms, #' @param k_search number of neighbours to search, -#' @param nnd list of parameters for [rnndescent::rnnd_build()] and [rnndescent::rnnd_query()], -#' @param hnsw list of parameters for [RcppHNSW::hnsw_build()] and [RcppHNSW::hnsw_search()], -#' @param lsh list of parameters for [mlpack::lsh()] function, -#' @param kd list of kd parameters for [mlpack::knn()] function, -#' @param annoy list of parameters for [RcppAnnoy] package. +#' @param nnd parameters for \link[rnndescent]{rnnd_build} and \link[rnndescent]{rnnd_query} (should be inside [control_nnd] function), +#' @param hnsw parameters for \link[RcppHNSW]{hnsw_build} and \link[RcppHNSW]{hnsw_search} (should be inside [control_hnsw] function), +#' @param lsh parameters for \link[mlpack]{lsh} function (should be inside [control_lsh] function), +#' @param kd kd parameters for \link[mlpack]{knn} function (should be inside [control_kd] function), +#' @param annoy parameters for \link[RcppAnnoy]{RcppAnnoy} package (should be inside [control_annoy] function). #' -#' @returns Returns a list with parameters +#' @returns Returns a list with parameters. #' #' @export controls_ann <- function( sparse = FALSE, k_search = 30, - nnd = list(k_build = 30, - use_alt_metric = FALSE, - init = "tree", - n_trees = NULL, - leaf_size = NULL, - max_tree_depth = 200, - margin = "auto", - n_iters = NULL, - delta = 0.001, - max_candidates = NULL, - low_memory = TRUE, - n_search_trees = 1, - pruning_degree_multiplier = 1.5, - diversify_prob = 1, - weight_by_degree = FALSE, - prune_reverse = FALSE, - progress = "bar", - obs = "R", - ## - max_search_fraction = 1, - epsilon = 0.1), - hnsw = list(M = 25, - ef_c = 200, - ef_s = 200, - grain_size = 1, - byrow = TRUE), - lsh = list(bucket_size = 500, - hash_width = 10, - num_probes = 0, - projections = 10, - tables = 30), - kd = list(algorithm = "dual_tree", - epsilon = 0, - leaf_size = 20, - random_basis = FALSE, - rho = 0.7, - tau = 0, - tree_type = "kd"), - annoy = list(n_trees = 250, - build_on_disk = FALSE) + nnd = control_nnd(), + hnsw = control_hnsw(), + lsh = control_lsh(), + kd = control_kd(), + annoy = control_annoy() ) { list(sparse = sparse, @@ -75,7 +236,7 @@ controls_ann <- function( #' @author Maciej Beręsewicz #' #' @description -#' Controls for text data used in the \code{blocking} functions, passed to [tokenizers::tokenize_character_shingles]. 
+#' Controls for text data used in the \code{blocking} function (if \code{representation = shingles}), passed to \link[tokenizers]{tokenize_character_shingles}.
 #'
 #' @param n_shingles length of shingles (default `2L`),
 #' @param n_chunks passed to (default `10L`),
diff --git a/R/data.R b/R/data.R
new file mode 100644
index 0000000..01ccee5
--- /dev/null
+++ b/R/data.R
@@ -0,0 +1,146 @@
+#' @title RLdata500 dataset from the RecordLinkage package
+#'
+#' @description
+#'
+#' This data is taken from the \pkg{RecordLinkage} R package developed by Murat Sariyar and Andreas Borg.
+#' The package is licensed under the GPL-3 license.
+#'
+#' The `RLdata500` table contains artificial personal data.
+#' Some records have been duplicated with randomly generated errors. `RLdata500` contains fifty duplicates.
+#'
+#' @format A `data.table` with 500 records. Each row represents one record, with the following columns:
+#'
+#' \itemize{
+#' \item{`fname_c1` -- first name, first component,}
+#' \item{`fname_c2` -- first name, second component,}
+#' \item{`lname_c1` -- last name, first component,}
+#' \item{`lname_c2` -- last name, second component,}
+#' \item{`by` -- year of birth,}
+#' \item{`bm` -- month of birth,}
+#' \item{`bd` -- day of birth,}
+#' \item{`rec_id` -- record id,}
+#' \item{`ent_id` -- entity id.}
+#' }
+#'
+#' @references
+#' Sariyar M., Borg A. (2022). RecordLinkage: Record Linkage Functions for Linking and Deduplicating Data Sets.
+#' R package version 0.4-12.4, \url{https://CRAN.R-project.org/package=RecordLinkage}
+#'
+#' @docType data
+#' @keywords datasets
+#' @name RLdata500
+#' @rdname RLdata500
+#' @examples
+#'
+#' data("RLdata500")
+#' head(RLdata500)
+#'
+"RLdata500"
+
+
+#' @title Fictional census data
+#'
+#' @description
+#' This data set was created by Paula McLeod, Dick Heasman and Ian Forbes, ONS,
+#' for the ESSnet DI on-the-job training course, Southampton, 25-28 January 2011.
+#' It contains fictional data representing some observations from a decennial Census.
+#'
+#' @format A `data.table` with 25343 records. Each row represents one record, with the following columns:
+#' \itemize{
+#' \item{`person_id` -- a unique number for each person, consisting of postcode, house number and person number,}
+#' \item{`pername1` -- forename,}
+#' \item{`pername2` -- surname,}
+#' \item{`sex` -- gender (M/F),}
+#' \item{`dob_day` -- day of birth,}
+#' \item{`dob_mon` -- month of birth,}
+#' \item{`dob_year` -- year of birth,}
+#' \item{`hse_num` -- house number, a numeric label for each house within a street,}
+#' \item{`enumcap` -- an address consisting of house number and street name,}
+#' \item{`enumpc` -- postcode,}
+#' \item{`str_nam` -- street name of the person's household,}
+#' \item{`cap_add` -- full address, consisting of house number, street name and postcode,}
+#' \item{`census_id` -- person ID with "CENS" added in front.}
+#' }
+#'
+#' @references
+#' McLeod, P., Heasman, D., Forbes, I. (2011). Simulated data for the ESSnet DI on-the-job training course,
+#' Southampton, 25-28 January 2011.
+#' \url{https://wayback.archive-it.org/12090/20231221144450/https://cros-legacy.ec.europa.eu/content/job-training_en} +#' +#' @docType data +#' @keywords datasets +#' @name census +#' @rdname census +#' @examples +#' +#' data("census") +#' head(census) +#' +"census" + +#' @title Fictional customer data +#' +#' @description +#' This data set was created by Paula McLeod, Dick Heasman and Ian Forbes, ONS, +#' for the ESSnet DI on-the-job training course, Southampton, 25-28 January 2011. +#' It contains fictional observations from Customer Information System, +#' which is combined administrative data from the tax and benefit systems. +#' +#' @format A `data.table` with 24613 records. Each row represents one record, with the following columns: +#' \itemize{ +#' \item{`person_id` -- a unique number for each person, consisting of postcode, house number and person number,} +#' \item{`pername1` -- forename,} +#' \item{`pername2` -- surname,} +#' \item{`sex` -- gender (M/F),} +#' \item{`dob_day` -- day of birth,} +#' \item{`dob_mon` -- month of birth,} +#' \item{`dob_year` -- year of birth,} +#' \item{`enumcap` -- an address consisting of house number and street name,} +#' \item{`enumpc` -- postcode,} +#' \item{`cis_id` -- person ID with "CIS" added in front.} +#' } +#' +#' @references +#' McLeod, P., Heasman, D., Forbes, I. (2011). Simulated data for the ESSnet DI on-the-job training course, +#' Southampton, 25-28 January 2011. +#' \url{https://wayback.archive-it.org/12090/20231221144450/https://cros-legacy.ec.europa.eu/content/job-training_en} +#' +#' @docType data +#' @keywords datasets +#' @name cis +#' @rdname cis +#' @examples +#' +#' data("cis") +#' head(cis) +#' +"cis" + +#' Fictional 2024 population of foreigners in Poland +#' +#' @description +#' A fictional data set of the foreign population in Poland, +#' generated based on publicly available information +#' while maintaining the distributions from administrative registers. +#' +#' @format A `data.table` with 110000 records. Each row represents one record, with the following columns: +#' \itemize{ +#' \item{`fname` -- first name,} +#' \item{`sname` -- second name,} +#' \item{`surname` -- surname,} +#' \item{`date` -- date of birth,} +#' \item{`region` -- region (county),} +#' \item{`country` -- country,} +#' \item{`true_id` -- person ID.} +#' } +#' +#' @docType data +#' @keywords datasets +#' @name foreigners +#' @rdname foreigners +#' @examples +#' +#' data("foreigners") +#' head(foreigners) +#' +"foreigners" diff --git a/R/est_block_error.R b/R/est_block_error.R new file mode 100644 index 0000000..7d42f11 --- /dev/null +++ b/R/est_block_error.R @@ -0,0 +1,306 @@ +#' @importFrom stats dpois +#' @importFrom stats runif +#' +#' @title Estimate errors due to blocking in record linkage +#' +#' @description +#' Function computes estimators for false positive rate (FPR) and false negative rate (FNR) due to blocking in record linkage, +#' as proposed by Dasylva and Goussanou (2021). Assumes duplicate-free data sources, +#' complete coverage of the reference data set and blocking decisions based solely on record pairs. +#' +#' @param x Reference data (required if `n` and `N` are not provided). +#' @param y Query data (required if `n` is not provided). +#' @param blocking_result `data.frame` or `data.table` containing blocking results (required if `n` is not provided). 
+#' @param n Integer vector of numbers of accepted pairs formed by each record in the query data set
+#' with records in the reference data set, based on blocking criteria (if `NULL`, derived from `blocking_result`).
+#' @param N Total number of records in the reference data set (if `NULL`, derived as `length(x)`).
+#' @param G Number of classes in the finite mixture model.
+#' @param alpha Numeric vector of initial class proportions (length `G`; if `NULL`, initialized as `rep(1/G, G)`).
+#' @param p Numeric vector of initial matching probabilities in each class of the mixture model
+#' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.5, 1)`).
+#' @param lambda Numeric vector of initial Poisson distribution parameters for non-matching records in each class of the mixture model
+#' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.1, 2)`).
+#' @param tol Convergence tolerance for the EM algorithm (default `10^(-6)`).
+#' @param maxiter Maximum number of iterations for the EM algorithm (default `1000`).
+#' @param sample_size Size of the bootstrap sample (drawn from `n`) used for the calculations (if `NULL`, all data are used).
+#'
+#' @details
+#' Consider a large finite population that consists of \eqn{N} individuals, and two duplicate-free data sources: a register and a file.
+#' Assume that the register has no undercoverage,
+#' i.e. each record from the file corresponds to exactly one record from the same individual in the register.
+#' Let \eqn{n_i} denote the number of register records which form an accepted (by the blocking criteria) pair with
+#' record \eqn{i} on the file. Assume that:\cr
+#' \itemize{
+#' \item two matched records are neighbours with a probability that is bounded away from \eqn{0} regardless of \eqn{N},
+#' \item two unmatched records are accidental neighbours with a probability of \eqn{O(\frac{1}{N})}.
+#' }
+#' The finite mixture model \eqn{n_i \sim \sum_{g=1}^G \alpha_g(\text{Bernoulli}(p_g) \ast \text{Poisson}(\lambda_g))} is assumed.
+#' When \eqn{G} is fixed, the unknown model parameters are given by the vector \eqn{\psi = [(\alpha_g, p_g, \lambda_g)]_{1 \leq g \leq G}}
+#' that may be estimated with the Expectation-Maximization (EM) procedure.
+#'
+#' Let \eqn{n_i = n_{i|M} + n_{i|U}}, where \eqn{n_{i|M}} is the number of matched neighbours
+#' and \eqn{n_{i|U}} is the number of unmatched neighbours, and let \eqn{c_{ig}} denote
+#' the indicator that record \eqn{i} is from class \eqn{g}.
+#' For the E-step of the EM procedure, the equations are as follows:
+#' \deqn{
+#' \begin{aligned}
+#' P(n_i | c_{ig} = 1) &= I(n_i = 0)(1-p_g)e^{-\lambda_g}+I(n_i > 0)\Bigl(p_g+(1-p_g)\frac{\lambda_g}{n_i}\Bigr)\frac{e^{-\lambda_g}\lambda_g^{n_i-1}}{(n_i-1)!}, \\
+#' P(c_{ig} = 1 | n_i) &= \frac{\alpha_gP(n_i | c_{ig} = 1)}{\sum_{g'=1}^G\alpha_{g'}P(n_i | c_{ig'} = 1)}, \\
+#' P(n_{i|M} = 1 | n_i,c_{ig} = 1) &= \frac{p_gn_i}{p_gn_i + (1-p_g)\lambda_g}, \\
+#' P(n_{i|U} = n_i | n_i,c_{ig} = 1) &= I(n_i = 0) + I(n_i > 0)\frac{(1-p_g)\lambda_g}{p_gn_i + (1-p_g)\lambda_g}, \\
+#' P(n_{i|U} = n_i-1 | n_i,c_{ig} = 1) &= \frac{p_gn_i}{p_gn_i + (1-p_g)\lambda_g}, \\
+#' E[c_{ig}n_{i|M} | n_i] &= P(c_{ig} = 1 | n_i)P(n_{i|M} = 1 | n_i,c_{ig} = 1), \\
+#' E[n_{i|U} | n_i,c_{ig} = 1] &= \Bigl(\frac{p_g(n_i-1) + (1-p_g)\lambda_g}{p_gn_i + (1-p_g)\lambda_g}\Bigr)n_i, \\
+#' E[c_{ig}n_{i|U} | n_i] &= P(c_{ig} = 1 | n_i)E[n_{i|U} | n_i,c_{ig} = 1].
+#' \end{aligned} +#' } +#' The M-step is given by following equations +#' \deqn{ +#' \begin{aligned} +#' \hat{p}_g &= \frac{\sum_{i=1}^mE[c_{ig}n_{i|M} | n_i;\psi]}{\sum_{i=1}^mE[c_{ig} | n_i; \psi]}, \\ +#' \hat{\lambda}_g &= \frac{\sum_{i=1}^mE[c_{ig}n_{i|U} | n_i; \psi]}{\sum_{i=1}^mE[c_{ig} | n_i; \psi]}, \\ +#' \hat{\alpha}_g &= \frac{1}{m}\sum_{i=1}^mE[c_{ig} | n_i; \psi]. +#' \end{aligned} +#' } +#' As \eqn{N \to \infty}, the error rates and the model parameters are related as follows +#' \deqn{ +#' \begin{aligned} +#' \text{FNR} &\xrightarrow{p} 1 - E[p(v_i)], \\ +#' (N-1)\text{FPR} &\xrightarrow{p} E[\lambda(v_i)], +#' \end{aligned} +#' } +#' where \eqn{E[p(v_i)] = \sum_{g=1}^G\alpha_gp_g} and \eqn{E[\lambda(v_i)] = \sum_{g=1}^G\alpha_g\lambda_g}. +#' +#' +#' +#' @returns Returns a list containing:\cr +#' \itemize{ +#' \item{`FPR` -- estimated false positive rate,} +#' \item{`FNR` -- estimated false negative rate,} +#' \item{`iter` -- number of the EM algorithm iterations performed,} +#' \item{`convergence` -- logical, indicating whether the EM algorithm converged within `maxiter` iterations.} +#' } +#' +#' @references +#' Dasylva, A., Goussanou, A. (2021). Estimating the false negatives due to blocking in record linkage. +#' Survey Methodology, Statistics Canada, Catalogue No. 12-001-X, Vol. 47, No. 2. +#' +#' Dasylva, A., Goussanou, A. (2022). On the consistent estimation of linkage errors without training data. +#' Jpn J Stat Data Sci 5, 181–216. \doi{10.1007/s42081-022-00153-3} +#' +#' @examples +#' ## an example proposed by Dasylva and Goussanou (2021) +#' +#' set.seed(111) +#' +#' neighbors <- rep(0:5, c(1659, 53951, 6875, 603, 62, 5)) +#' +#' errors <- est_block_error(n = neighbors, +#' N = 63155, +#' G = 2, +#' tol = 10^(-3), +#' maxiter = 50) +#' +#' errors +#' +#' @export +est_block_error <- function(x = NULL, + y = NULL, + blocking_result = NULL, + n = NULL, + N = NULL, + G, + alpha = NULL, + p = NULL, + lambda = NULL, + tol = 10^(-6), + maxiter = 1000, + sample_size = NULL) { + + if (is.null(n)) { + stopifnot("`x` should be a vector" = is.vector(x)) + stopifnot("`y` should be a vector" = is.vector(y)) + stopifnot("`blocking_result` should be a data.frame or a data.table" = is.data.frame(blocking_result) | is.data.table(blocking_result)) + stopifnot("`blocking_result` should contain a column named `y`" = "y" %in% colnames(blocking_result)) + + n <- as.integer(table( + factor(blocking_result$y, levels = 1:length(y)) + )) + + N <- length(x) + } + + if (is.numeric(sample_size)) { + n <- sample(n, size = sample_size, replace = TRUE) + } + + convergence <- FALSE + m <- length(n) + + if (is.null(alpha)) { + alpha <- rep(1/G, G) + } + + if (is.null(p)) { + p <- runif(G, min = 0.5, max = 1) + } + + if (is.null(lambda)) { + lambda <- runif(G, min = 0.1, max = 2) + } + + for (l in 1:maxiter) { + + ## E + + # probs_n_c <- matrix(0, m, G) + # + # for (i in 1:m) { + # for (g in 1:G) { + # if (n[i] == 0) { + # probs_n_c[i, g] <- (1 - p[g]) * exp(-lambda[g]) + # } else { + # probs_n_c[i, g] <- (p[g] + (1 - p[g]) * lambda[g] / n[i]) * + # exp(-lambda[g]) * lambda[g]^(n[i] - 1) / factorial(n[i] - 1) + # } + # } + # } + + probs_n_c <- mapply( + function(x, y) { + ifelse( + n == 0, + (1 - x) * exp(-y), + (x + (1 - x) * y / n) * dpois(n - 1, y) + ) + }, + p, lambda, + SIMPLIFY = TRUE) + + # probs_c_n <- matrix(0, m, G) + # + # for (i in 1:m) { + # for (g in 1:G) { + # probs_c_n[i, g] <- + # alpha[g] * probs_n_c[i, g] / sum(alpha * as.vector(probs_n_c[i, ])) + # } + # } + + # probs_c_n <- 
t( + # t(probs_n_c * alpha) * as.vector(1 / (probs_n_c %*% alpha)) + # ) + + probs_c_n <- probs_n_c * alpha / as.vector(probs_n_c %*% alpha) + + # probs_n_M <- matrix(0, m, G) + # + # for (i in 1:m) { + # for (g in 1:G) { + # probs_n_M[i, g] <- + # p[g] * n[i] / (p[g] * n[i] + (1 - p[g]) * lambda[g]) + # } + # } + + # probs_n_M <- mapply( + # function(x, y) { + # x * n / (x * n + (1 - x) * y) + # }, + # p, lambda, + # SIMPLIFY = TRUE + # ) + + n_mat <- matrix(n, nrow = m, ncol = G) + p_mat <- matrix(p, nrow = m, ncol = G, byrow = TRUE) + lambda_mat <- matrix(lambda, nrow = m, ncol = G, byrow = TRUE) + + probs_n_M <- n_mat * p_mat / (n_mat * p_mat + (1 - p_mat) * lambda_mat) + + # probs_n_U <- matrix(0, m, G) + # + # for (i in 1:m) { + # for (g in 1:G) { + # if (n[i] == 0) { + # probs_n_U[i, g] <- 1 + # } else { + # probs_n_U[i, g] <- (1 - p[g]) * lambda[g] / (p[g] * n[i] + (1 - p[g]) * lambda[g]) + # } + # } + # } + + probs_n_U <- mapply( + function(x, y) { + ifelse(n == 0, + 1, + (1 - x) * y / (x * n + (1 - x) * y)) + }, + p, lambda, + SIMPLIFY = TRUE + ) + + # E_c_n_M <- matrix(0, m, G) + # + # for (i in 1:m) { + # for (g in 1:G) { + # E_c_n_M[i, g] <- probs_c_n[i, g] * probs_n_M[i, g] + # } + # } + + E_c_n_M <- probs_c_n * probs_n_M + + # E_n_U <- matrix(0, m, G) + # + # for (i in 1:m) { + # for (g in 1:G) { + # E_n_U[i, g] <- + # ((p[g] * (n[i] - 1) + (1 - p[g]) * lambda[g]) / (p[g] * n[i] + (1 - p[g]) * lambda[g])) * n[i] + # } + # } + + E_n_U<- ((p_mat * (n_mat - 1) + (1 - p_mat) * lambda_mat) / (p_mat * n_mat + (1 - p_mat) * lambda_mat)) * n_mat + + # E_c_n_U <- matrix(0, m, G) + # + # for (i in 1:m) { + # for (g in 1:G) { + # E_c_n_U[i, g] <- probs_c_n[i, g] * E_n_U[i, g] + # } + # } + + E_c_n_U <- probs_c_n * E_n_U + + ## M + + alpha <- 1 / m * colSums(probs_c_n) + p <- colSums(E_c_n_M) / (m * alpha) + lambda <- colSums(E_c_n_U) / (m * alpha) + + ## check + + if (l >= 2) { + log_lik_old <- log_lik_new + log_lik_new <- sum(log(probs_n_c %*% as.matrix(alpha))) + } else { + log_lik_new <- sum(log(probs_n_c %*% as.matrix(alpha))) + next + } + + if (abs(log_lik_new - log_lik_old) <= tol) { + convergence <- TRUE + break + } + + } + + FNR <- 1 - sum(alpha * p) + FPR <- sum(alpha * lambda) / (N - 1) + + return(structure( + list( + FPR = FPR, + FNR = FNR, + iter = l, + convergence = convergence + ), + class = "est_block_error")) + +} diff --git a/R/eval.R b/R/eval.R new file mode 100644 index 0000000..9d9d92d --- /dev/null +++ b/R/eval.R @@ -0,0 +1,183 @@ +#' @importFrom Matrix sparseMatrix +#' @importFrom Matrix rowSums +#' @importFrom Matrix colSums +#' +#' @title Evaluation for record linkage +#' +#' @description +#' Function calculates TP, FP, FN and TN for record linkage. +#' +#' @param pred_df Output from the blocking algorithm. +#' @param true_df Ground-truth links (may be subset). +#' +#' @returns +#' Returns a list containing TP, FP, FN and TN. 
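Before the definition below, a toy run may help fix ideas about what these four counts mean for record linkage. The data are hypothetical and the helper is internal (hence the `:::` access); a sketch assuming the package is installed with the changes in this diff:

```r
library(data.table)

# hypothetical truth: x1 links to y1 (block 1) and x2 to y2 (block 2);
# the blocking step, however, lumped all four records into one block
pred_df <- data.table(x = c(1, 2), y = c(1, 2), block = c(1, 1))
true_df <- data.table(x = c(1, 2), y = c(1, 2), block = c(1, 2))

blocking:::eval_reclin(pred_df, true_df)
# both true pairs are co-blocked (TP = 2); the two cross pairs
# (x1, y2) and (x2, y1) are false positives, so FP = 2, FN = 0, TN = 0
```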
+#' +eval_reclin <- function(pred_df, true_df) { + + pred_x_map <- unique(pred_df[, c("x", "block"), with = FALSE]) + pred_y_map <- unique(pred_df[, c("y", "block"), with = FALSE]) + + true_x <- unique(true_df[, c("x", "block"), with = FALSE]) + true_y <- unique(true_df[, c("y", "block"), with = FALSE]) + + pred_x <- true_x + pred_x$block <- pred_x_map$block[match(true_x$x, pred_x_map$x)] + + pred_y <- true_y + pred_y$block <- pred_y_map$block[match(true_y$y, pred_y_map$y)] + + max_block <- max(c(pred_df$block, true_df$block), na.rm = TRUE) + count_na <- sum(is.na(pred_x$block)) ## 0 when pred_x has no missing blocks + if (count_na > 0) { + pred_x$block[is.na(pred_x$block)] <- seq(max_block + 1, length.out = count_na) + } + if (any(is.na(pred_y$block))) { + pred_y$block[is.na(pred_y$block)] <- seq(max_block + 1 + count_na, + length.out = sum(is.na(pred_y$block))) + } + + # n_pred <- length(unique(c(pred_x$block, pred_y$block))) + # n_true <- length(unique(c(true_x$block, true_y$block))) + n1 <- max(c(pred_x$block, pred_y$block)) + n2 <- max(c(true_x$block, true_y$block)) + + # cx <- Matrix::sparseMatrix(i = pred_x$block, j = true_x$block, x = 1, dims = c(n_pred, n_true)) + # cy <- Matrix::sparseMatrix(i = pred_y$block, j = true_y$block, x = 1, dims = c(n_pred, n_true)) + cx <- Matrix::sparseMatrix(i = pred_x$block, j = true_x$block, x = 1, dims = c(n1, n2)) + cy <- Matrix::sparseMatrix(i = pred_y$block, j = true_y$block, x = 1, dims = c(n1, n2)) + + TP <- sum(cx * cy) + + row_sum_x <- Matrix::rowSums(cx) + row_sum_y <- Matrix::rowSums(cy) + pred_pairs <- sum(row_sum_x * row_sum_y) ## rows index predicted blocks + + col_sum_x <- Matrix::colSums(cx) + col_sum_y <- Matrix::colSums(cy) + true_pairs <- sum(col_sum_x * col_sum_y) ## columns index true blocks + + FP <- pred_pairs - TP + FN <- true_pairs - TP + NX <- nrow(true_x) + NY <- nrow(true_y) + TN <- NX * NY - TP - FP - FN + + return(list(TP = TP, + FP = FP, + FN = FN, + TN = TN)) +} + +#' @title Evaluation for deduplication +#' +#' @description +#' Function calculates TP, FP, FN and TN for deduplication. +#' +#' @param pred_df Output from the blocking algorithm. +#' @param true_df Ground-truth links (may be subset). +#' +#' @returns +#' Returns a list containing TP, FP, FN and TN.
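The deduplication variant below works on record-level block labels rather than x/y pairs, counting the pairs within a block as n(n-1)/2. A sketch with made-up labels, chaining the result into `get_metrics()` and `get_confusion()` (all three helpers are internal):

```r
library(data.table)

# hypothetical truth: records 1-3 form one entity, 4-5 another, 6 is a singleton;
# the blocking step found the pairs (1,2), (2,3) and (4,5)
pred_df <- data.table(x = c(1, 2, 4), y = c(2, 3, 5), block = c(1, 1, 2))
true_df <- data.table(x = 1:6, block = c(1, 1, 1, 2, 2, 3))

counts <- blocking:::eval_dedup(pred_df, true_df)
counts  # 3 + 1 = 4 within-block pairs, all of them true: TP = 4, FP = FN = 0

do.call(blocking:::get_metrics, counts)
do.call(blocking:::get_confusion, counts)
```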
+#' +eval_dedup <- function(pred_df, true_df) { + + pred_lbl <- melt(pred_df, + id.vars = "block", + measure.vars = c("x", "y"), + value.name = "rec")[, c("rec", "block"), with = FALSE] + pred_lbl <- pred_lbl[!duplicated(pred_lbl[["rec"]])] + + true_lbl <- true_df[, c("x", "block"), with = FALSE] + setnames(true_lbl, "x", "rec") + + setkey(pred_lbl, "rec") + setkey(true_lbl, "rec") + + pred_lbl <- true_lbl[pred_lbl] + pred_lbl <- pred_lbl[!is.na(pred_lbl$block)] + + if (any(is.na(pred_lbl$i.block))) { + max_block <- max(c(pred_lbl$i.block, true_lbl$block), na.rm = TRUE) + pred_lbl$i.block[is.na(pred_lbl$i.block)] <- seq(max_block + 1, + length.out = sum(is.na(pred_lbl$i.block))) + } + + grouped <- pred_lbl[, list(N = .N), by = c("block", "i.block")] + + TP <- sum(grouped$N * (grouped$N - 1) / 2) + + row_sum <- grouped[, list(row_sum = sum(N)), by = "i.block"] + col_sum <- grouped[, list(col_sum = sum(N)), by = "block"] + + pred_pairs <- sum(row_sum$row_sum * (row_sum$row_sum - 1) / 2) + true_pairs <- sum(col_sum$col_sum * (col_sum$col_sum - 1) / 2) + + FP <- pred_pairs - TP + FN <- true_pairs - TP + + N <- nrow(pred_lbl) + total_pairs <- N * (N - 1) / 2 + TN <- total_pairs - TP - FP - FN + + return(list(TP = TP, + FP = FP, + FN = FN, + TN = TN + )) +} + +#' @title Metrics for evaluating deduplication and record linkage +#' +#' @description +#' Function calculates standard evaluation metrics. +#' +#' @param TP number of true positives +#' @param FP number of false positives +#' @param FN number of false negatives +#' @param TN number of true negatives +#' +#' @returns +#' Returns a list containing evaluation metrics. +#' +get_metrics <- function(TP, FP, FN, TN) { + + recall <- if (TP + FN != 0) TP / (TP + FN) else 0 + precision <- if (TP + FP != 0) TP / (TP + FP) else 0 + fpr <- if (FP + TN != 0) FP / (FP + TN) else 0 + fnr <- if (FN + TP != 0) FN / (FN + TP) else 0 + accuracy <- if (TP + FP + FN + TN != 0) (TP + TN) / (TP + FP + FN + TN) else 0 + specificity <- if (TN + FP != 0) TN / (TN + FP) else 0 + f1_score <- if (precision + recall != 0) 2 * (precision * recall) / (precision + recall) else 0 + + return(list(recall = recall, + precision = precision, + fpr = fpr, + fnr = fnr, + accuracy = accuracy, + specificity = specificity, + f1_score = f1_score)) +} + +#' @title Confusion matrix +#' +#' @description +#' Function creates a confusion matrix from raw counts. +#' +#' @param TP number of true positives +#' @param FP number of false positives +#' @param FN number of false negatives +#' @param TN number of true negatives +#' +#' @returns +#' Returns a confusion matrix. +#' +get_confusion <- function(TP, FP, FN, TN) { + + cm <- matrix(c(TP, FP, FN, TN), nrow = 2) + colnames(cm) <- c("Predicted Positive", + "Predicted Negative") + rownames(cm) <- c("Actual Positive", + "Actual Negative") + return(cm) +} diff --git a/R/method_annoy.R b/R/method_annoy.R index 316c495..2df7e40 100644 --- a/R/method_annoy.R +++ b/R/method_annoy.R @@ -1,4 +1,4 @@ -#' Imports +#' #' @importFrom RcppAnnoy AnnoyAngular #' @importFrom RcppAnnoy AnnoyEuclidean #' @importFrom RcppAnnoy AnnoyHamming @@ -6,7 +6,7 @@ #' @importFrom methods new #' @importFrom data.table data.table #' -#' @title An internal function to use Annoy algorithm via the [RcppAnnoy] package. +#' @title An internal function to use Annoy algorithm via the \link[RcppAnnoy]{RcppAnnoy} package.
#' @author Maciej Beręsewicz #' #' @param x deduplication or reference data, @@ -16,10 +16,10 @@ #' @param verbose if TRUE, log messages to the console, #' @param seed seed for the pseudo-random numbers algorithm, #' @param path path to write the index, -#' @param control controls for \code{new} or \code{build} methods for [RcppAnnoy]. +#' @param control controls for \code{new} or \code{build} methods for \link[RcppAnnoy]{RcppAnnoy}. #' #' @description -#' See details of the [RcppAnnoy] package. +#' See details of the \link[RcppAnnoy]{RcppAnnoy} package. #' #' @@ -37,10 +37,10 @@ method_annoy <- function(x, ncols <- ncol(x) l_ind <- switch(distance, - "euclidean" = methods::new(RcppAnnoy::AnnoyManhattan, ncols), + "euclidean" = methods::new(RcppAnnoy::AnnoyEuclidean, ncols), "manhatan" = methods::new(RcppAnnoy::AnnoyManhattan, ncols), - "hamming" = methods::new(RcppAnnoy::AnnoyHamming, ncols), - "angular" = methods::new(RcppAnnoy::AnnoyAngular, ncols) + "hamming" = methods::new(RcppAnnoy::AnnoyHamming, ncols), + "angular" = methods::new(RcppAnnoy::AnnoyAngular, ncols) ) l_ind$setSeed(seed) diff --git a/R/method_hnsw.R b/R/method_hnsw.R index 3c4bb3b..4524f30 100644 --- a/R/method_hnsw.R +++ b/R/method_hnsw.R @@ -22,7 +22,7 @@ #' @param control controls for the HNSW algorithm. #' #' @description -#' See details of [RcppHNSW::hnsw_build] and [RcppHNSW::hnsw_search]. +#' See details of \link[RcppHNSW]{hnsw_build} and \link[RcppHNSW]{hnsw_search}. #' #' method_hnsw <- function(x, diff --git a/R/method_mlpack.R b/R/method_mlpack.R index 0859a63..6608488 100644 --- a/R/method_mlpack.R +++ b/R/method_mlpack.R @@ -3,7 +3,7 @@ #' @importFrom mlpack knn #' @importFrom data.table data.table #' -#' @title An internal function to use the LSH and KD-tree algorithm via the [mlpack] package. +#' @title An internal function to use the LSH and KD-tree algorithm via the \link[mlpack]{mlpack} package. #' @author Maciej Beręsewicz #' #' @param x deduplication or reference data, @@ -16,7 +16,7 @@ #' @param control controls for the \code{lsh} or \code{kd} algorithms. #' #' @description -#' See details of [mlpack::lsh] and [mlpack::knn] +#' See details of \link[mlpack]{lsh} and \link[mlpack]{knn}. #' method_mlpack <- function(x, y, diff --git a/R/method_nnd.R b/R/method_nnd.R index 11424ab..54b4c60 100644 --- a/R/method_nnd.R +++ b/R/method_nnd.R @@ -3,7 +3,7 @@ #' @importFrom rnndescent rnnd_query #' @importFrom data.table data.table #' -#' @title An internal function to use the NN descent algorithm via the [rnndescent] package. +#' @title An internal function to use the NN descent algorithm via the \link[rnndescent]{rnndescent} package. #' @author Maciej Beręsewicz #' #' @param x deduplication or reference data, @@ -16,7 +16,7 @@ #' @param control controls for the NN descent algorithm. #' #' @description -#' See details of [rnndescent::rnnd_build] and [rnndescent::rnnd_query]. +#' See details of \link[rnndescent]{rnnd_build} and \link[rnndescent]{rnnd_query}. #' #' method_nnd <- function(x, diff --git a/R/methods.R b/R/methods.R index 8796737..a2dc3d7 100644 --- a/R/methods.R +++ b/R/methods.R @@ -1,6 +1,6 @@ #' @method print blocking #' @exportS3Method -print.blocking <- function(x,...) { +print.blocking <- function(x, ...) { blocks_tab <- table(x$result$block) block_ids <- rep(as.numeric(names(blocks_tab)), blocks_tab+1) @@ -9,7 +9,9 @@ print.blocking <- function(x,...) 
{ cat("========================================================\n") cat("Blocking based on the", x$method, "method.\n") cat("Number of blocks: ", length(unique(block_ids)), ".\n",sep="") - cat("Number of columns used for blocking: ", NROW(x$colnames), ".\n",sep="") + if (x$representation == "shingles") { + cat("Number of columns used for blocking: ", NROW(x$colnames), ".\n",sep="") + } cat("Reduction ratio: ", sprintf("%.4f", rr), ".\n",sep="") cat("========================================================\n") @@ -27,3 +29,19 @@ print.blocking <- function(x,...) { } invisible(x) } + +#' @method print est_block_error +#' @exportS3Method +print.est_block_error <- function(x, ...) { + + cat("FPR: ", x$FPR, "\n") + cat("FNR: ", x$FNR, "\n") + + cat("========================================================\n") + + if (x$convergence) { + cat("EM algorithm converged successfully within", x$iter, "iterations.") + } else { + cat("EM algorithm did not converge within", x$iter, "iterations.") + } +} diff --git a/R/reclin2_pair_ann.R b/R/reclin2_pair_ann.R index 6770c51..4e518ab 100644 --- a/R/reclin2_pair_ann.R +++ b/R/reclin2_pair_ann.R @@ -1,12 +1,11 @@ -#' Imports -#' +#' @importFrom stats setNames #' @import data.table #' -#' @title Integration with the reclin2 package +#' @title Integration with the \pkg{reclin2} package #' @author Maciej Beręsewicz #' #' @description -#' Function for the integration with the `reclin2` package. The function is based on [reclin2::pair_minsim()] and reuses some of its source code. +#' Function for the integration with the \pkg{reclin2} package. The function is based on \link[reclin2]{pair_minsim} and reuses some of its source code. #' #' @param x reference data (a data.frame or a data.table), #' @param y query data (a data.frame or a data.table, default NULL), @@ -15,10 +14,11 @@ #' @param deduplication whether deduplication should be performed (default TRUE), #' @param keep_block whether to keep the block variable in the set, #' @param add_xy whether to add x and y, -#' @param ... arguments passed to [blocking::blocking()] function. +#' @param ... arguments passed to [blocking] function. #' #' -#' @returns Returns a [data.table] with two columns \code{.x} and \code{.y}. Columns \code{.x} and \code{.y} are row numbers from data.frames x and y respectively. Returning data.table is also of a class \code{pairs} which allows for integration with the [reclin2::compare_pairs()] package. +#' @returns Returns a \link[data.table]{data.table} with two columns \code{.x} and \code{.y}. Columns \code{.x} and \code{.y} are row numbers from data.frames x and y respectively. +#' Returned `data.table` is also of a class \code{pairs} which allows for integration with the \link[reclin2]{compare_pairs} function. #' #' @examples #' @@ -51,28 +51,36 @@ pair_ann <- function(x, add_xy = TRUE, ...) { + stopifnot("Only data.frame or data.table is supported" = + is.data.frame(x) | is.data.table(x)) + stopifnot("Only one `on` is currently supported" = NROW(on) == 1) if (!is.null(y)) deduplication <- FALSE + if (!is.null(y)){ + stopifnot("Only data.frame or data.table is supported" = + is.data.frame(y) | data.table::is.data.table(y)) + } + y <- if (deduplication) x else y x <- data.table::as.data.table(x) y <- data.table::as.data.table(y) - block_result <- blocking::blocking(x = x[, ..on][[1]], - y = if (deduplication) NULL else y[, ..on][[1]], + block_result <- blocking::blocking(x = x[[on]], + y = if (deduplication) NULL else y[[on]], deduplication = deduplication, ...) 
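A note on the join rewritten below: `on = setNames("x", ".x")` builds the named vector `c(".x" = "x")`, i.e. match column `.x` of `a` against column `x` of the blocking result. A minimal illustration of the idiom on toy tables (names invented for the example):

```r
library(data.table)

a   <- data.table(.x = 1:3, txt = c("a", "b", "c"))
res <- data.table(x = c(1, 3), block = c(10, 20))

# the two spellings are equivalent
identical(a[res, on = setNames("x", ".x")],
          a[res, on = c(".x" = "x")])  # TRUE
```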
- a <- x[, ..on] + a <- x[, c(on), with = FALSE] a[, `:=`(".x", .I)] - a <- a[unique(block_result$result[,.(".x"=x, block)]), on = ".x"] + a <- a[unique(block_result$result[, c("x", "block"), with = FALSE]), on = setNames("x", ".x")] a[, `:=`((on), NULL)] - b <- y[, `..on`] + b <- y[, c(on), with = FALSE] b[, `:=`(".y", .I)] - b <- b[unique(block_result$result[,.(".y"=y, block)]), on = ".y"] + b <- b[unique(block_result$result[, c("y", "block"), with = FALSE]), on = setNames("y", ".y")] b[, `:=`((on), NULL)] pairs <- merge(a, b, @@ -81,7 +89,7 @@ pair_ann <- function(x, all.y = FALSE, allow.cartesian = TRUE) - if (deduplication) pairs <- pairs[.y > .x] + if (deduplication) pairs <- pairs[pairs[[".y"]] > pairs[[".x"]]] data.table::setkey(pairs, NULL) data.table::setattr(pairs, "class", c("pairs", class(pairs))) diff --git a/R/sentence_to_vector.R b/R/sentence_to_vector.R new file mode 100644 index 0000000..57c4365 --- /dev/null +++ b/R/sentence_to_vector.R @@ -0,0 +1,36 @@ +#' +#' @importFrom text2vec space_tokenizer +#' +#' @title Sentence to vector +#' +#' @description +#' Function creates a matrix with word embeddings using a given model. +#' +#' @param sentences a character vector, +#' @param model a matrix containing word embeddings (e.g., GloVe). +#' +sentence_to_vector <- function(sentences, model) { + tokens <- text2vec::space_tokenizer(tolower(sentences)) + + dim <- ncol(model) + result <- matrix(0, nrow = length(sentences), ncol = dim) + + for (i in seq_along(sentences)) { + words <- tokens[[i]] + + valid_words <- words[words %in% rownames(model)] + + if (length(valid_words) > 0) { + word_vectors <- model[valid_words, , drop = FALSE] + result[i, ] <- colMeans(word_vectors) + } + else { + result[i, ] <- 0 + } + } + + return(result) +} + + + diff --git a/README.md b/README.md index 60cf47d..d3978de 100644 --- a/README.md +++ b/README.md @@ -190,7 +190,7 @@ pair_ann(x = df_base, y = df_example, on = "txt", deduplication = FALSE) |> See section `Data Integration (Statistical Matching and Record Linkage)` in [the Official Statistics Task -View](https://cran.r-project.org/web/views/OfficialStatistics.html). +View](https://CRAN.R-project.org/view=OfficialStatistics). 
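The `sentence_to_vector()` helper added above averages the embedding vectors of the words found in each sentence and returns zeros when nothing matches. A self-contained sketch with a tiny made-up embedding matrix standing in for GloVe (the helper is internal, hence `:::`):

```r
# three-dimensional toy "embeddings"; illustrative values only
model <- rbind(
  monty  = c(0.1, 0.2, 0.3),
  python = c(0.4, 0.5, 0.6),
  circus = c(0.7, 0.8, 0.9)
)

blocking:::sentence_to_vector(c("Monty Python", "python circus", "no match"), model)
# rows 1-2 are the means of the matched word vectors;
# row 3 is all zeros because neither token occurs in rownames(model)
```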
Packages that allow blocking: diff --git a/blocking.Rproj b/blocking.Rproj index 526ad2d..3a6722b 100644 --- a/blocking.Rproj +++ b/blocking.Rproj @@ -1,4 +1,5 @@ Version: 1.0 +ProjectId: 0bfbaf6a-1248-437d-a72d-d01d92f9019b RestoreWorkspace: Default SaveWorkspace: Default diff --git a/data/RLdata500.rda b/data/RLdata500.rda new file mode 100644 index 0000000..9068aa2 Binary files /dev/null and b/data/RLdata500.rda differ diff --git a/data/census.rda b/data/census.rda new file mode 100644 index 0000000..6bded1b Binary files /dev/null and b/data/census.rda differ diff --git a/data/cis.rda b/data/cis.rda new file mode 100644 index 0000000..a6ea49c Binary files /dev/null and b/data/cis.rda differ diff --git a/data/foreigners.rda b/data/foreigners.rda new file mode 100644 index 0000000..a7e0944 Binary files /dev/null and b/data/foreigners.rda differ diff --git a/index-colnames.txt b/index-colnames.txt deleted file mode 100644 index 000f1fb..0000000 --- a/index-colnames.txt +++ /dev/null @@ -1,28 +0,0 @@ -cy -ij -im -km -lj -mj -nk -nm -rk -yr -yp -ho -ki -ls -py -sk -th -yt -al -an -ja -ko -mo -nt -ow -ty -wa -on diff --git a/index.annoy b/index.annoy deleted file mode 100644 index edfb810..0000000 Binary files a/index.annoy and /dev/null differ diff --git a/index.hnsw b/index.hnsw deleted file mode 100644 index dd31590..0000000 Binary files a/index.hnsw and /dev/null differ diff --git a/inst/WORDLIST b/inst/WORDLIST new file mode 100644 index 0000000..eeba3bc --- /dev/null +++ b/inst/WORDLIST @@ -0,0 +1 @@ +deduplication diff --git a/inst/tinytest/index.hnsw b/inst/tinytest/index.hnsw index dd31590..80e2fbf 100644 Binary files a/inst/tinytest/index.hnsw and b/inst/tinytest/index.hnsw differ diff --git a/inst/tinytest/test_annoy.R b/inst/tinytest/test_annoy.R index d9d52be..c28f5ac 100644 --- a/inst/tinytest/test_annoy.R +++ b/inst/tinytest/test_annoy.R @@ -18,11 +18,12 @@ expect_equal( list(x = c(1, 1, 1, 2, 2, 2, 2, 3), y = c(5L, 6L, 7L, 1L, 2L, 3L, 4L, 8L), block = c(2, 2, 2, 1, 1, 1, 1, 3), - dist = c(0, 1, 0, 1, 0, 1, 4, 5)), + dist = c(0, 1, 0, 1, 0, 1, 2, 2.236068)), row.names = c(NA, -8L), class = c("data.table", "data.frame")), method = "annoy", deduplication = FALSE, + representation = "shingles", metrics = NULL, confusion = NULL, colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", @@ -45,11 +46,12 @@ expect_equal( list(x = c(1, 1, 1, 2, 2, 2, 2, 3), y = c(5L, 6L, 7L, 1L, 2L, 3L, 4L, 8L), block = c(2, 2, 2, 1, 1, 1, 1, 3), - dist = c(0, 1, 0, 1, 0, 1, 4, 4)), + dist = c(0, 1, 0, 1, 0, 1, 2, 2)), row.names = c(NA, -8L), class = c("data.table", "data.frame")), method = "annoy", deduplication = FALSE, + representation = "shingles", metrics = NULL, confusion = NULL, colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "ow", @@ -77,28 +79,32 @@ expect_error( expect_true({ + tmp_dir <- tempdir() blocking(x = mat_y, ann = "annoy", distance = "euclidean", - ann_write = ".") - file.exists("./index.annoy") & - file.exists("./index-colnames.txt") + ann_write = file.path(tmp_dir)) + file.exists(file.path(tmp_dir, "index.annoy")) & + file.exists(file.path(tmp_dir, "index-colnames.txt")) }) expect_true({ + tmp_dir <- tempdir() + sub_dir <- file.path(tmp_dir, "sub") + dir.create(sub_dir, showWarnings = FALSE) blocking(x = mat_y, ann = "annoy", distance = "euclidean", - ann_write = "./") - file.exists("./index.annoy") & - file.exists("./index-colnames.txt") + ann_write = file.path(sub_dir)) + file.exists(file.path(sub_dir, "index.annoy")) & + 
file.exists(file.path(sub_dir, "index-colnames.txt")) }) ## testing reading saved index expect_equal({ - ncols <- length(readLines("./index-colnames.txt")) - ann_annoy <- methods::new(RcppAnnoy::AnnoyManhattan, ncols) - ann_annoy$load("./index.annoy") + ncols <- length(readLines(file.path(tmp_dir, "index-colnames.txt"))) + ann_annoy <- methods::new(RcppAnnoy::AnnoyEuclidean, ncols) + ann_annoy$load(file.path(tmp_dir, "index.annoy")) ann_annoy$getNItems() }, 8) diff --git a/inst/tinytest/test_hnsw.R b/inst/tinytest/test_hnsw.R index d62a0c5..a7ee0e9 100644 --- a/inst/tinytest/test_hnsw.R +++ b/inst/tinytest/test_hnsw.R @@ -24,6 +24,7 @@ expect_equal( class = c("data.table", "data.frame")), method = "hnsw", deduplication = FALSE, + representation = "shingles", metrics = NULL, confusion = NULL, colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "ow", @@ -56,6 +57,7 @@ expect_equal( class = c("data.table", "data.frame")), method = "hnsw", deduplication = FALSE, + representation = "shingles", metrics = NULL, confusion = NULL, colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "nt", diff --git a/inst/tinytest/test_mlpack.R b/inst/tinytest/test_mlpack.R index 8a9a83d..767e963 100644 --- a/inst/tinytest/test_mlpack.R +++ b/inst/tinytest/test_mlpack.R @@ -13,6 +13,7 @@ expect_equal( class = c("data.table", "data.frame")), method = "lsh", deduplication = FALSE, + representation = "shingles", metrics = NULL, confusion = NULL, colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "ow", @@ -34,6 +35,7 @@ expect_equal( class = c("data.table", "data.frame")), method = "kd", deduplication = FALSE, + representation = "shingles", metrics = NULL, confusion = NULL, colnames =c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "ow", @@ -90,6 +92,7 @@ expect_equal( class = c("data.table", "data.frame")), method = "kd", deduplication = FALSE, + representation = "shingles", metrics = NULL, confusion = NULL, colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "nt", "ow", diff --git a/inst/tinytest/test_true_blocks.R b/inst/tinytest/test_true_blocks.R index 0f4eef3..a0eddcf 100644 --- a/inst/tinytest/test_true_blocks.R +++ b/inst/tinytest/test_true_blocks.R @@ -10,7 +10,7 @@ expect_equal( blocking(x = df_example$txt, true_blocks = data.frame(x = 1:8, block = rep(1:2, each=4)))$metrics, c(recall = 1, precision = 1, fpr = 0, fnr = 0, accuracy = 1, - specificity = 1) + specificity = 1, f1_score = 1) ) diff --git a/man/RLdata500.Rd b/man/RLdata500.Rd new file mode 100644 index 0000000..90c8ca2 --- /dev/null +++ b/man/RLdata500.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{RLdata500} +\alias{RLdata500} +\title{RLdata500 dataset from the RecordLinkage package} +\format{ +A \code{data.table} with 500 records. Each row represents one record, with the following columns: + +\itemize{ +\item{\code{fname_c1} -- first name, first component,} +\item{\code{fname_c2} -- first name, second component,} +\item{\code{lname_c1} -- last name, first component,} +\item{\code{lname_c2} -- last name, second component,} +\item{\code{by} -- year of birth,} +\item{\code{bm} -- month of birth,} +\item{\code{bd} -- day of birth,} +\item{\code{rec_id} -- record id,} +\item{\code{ent_id} -- entity id.} +} +} +\usage{ +RLdata500 +} +\description{ +This data is taken from the \pkg{RecordLinkage} R package developed by Murat Sariyar and Andreas Borg. +The package is licensed under the GPL-3 license.
+ +The \code{RLdata500} table contains artificial personal data. +Some records have been duplicated with randomly generated errors. \code{RLdata500} contains fifty duplicates. +} +\examples{ + +data("RLdata500") +head(RLdata500) + +} +\references{ +Sariyar M., Borg A. (2022). RecordLinkage: Record Linkage Functions for Linking and Deduplicating Data Sets. +R package version 0.4-12.4, \url{https://CRAN.R-project.org/package=RecordLinkage} +} +\keyword{datasets} diff --git a/man/blocking.Rd b/man/blocking.Rd index d40fc40..572d66c 100644 --- a/man/blocking.Rd +++ b/man/blocking.Rd @@ -7,6 +7,8 @@ blocking( x, y = NULL, + representation = c("shingles", "vectors"), + model, deduplication = TRUE, on = NULL, on_blocking = NULL, @@ -28,6 +30,10 @@ blocking( \item{y}{query data (a character vector or a matrix), if not provided NULL by default and thus deduplication is performed,} +\item{representation}{method of representing input data (possible \code{c("shingles", "vectors")}; default \code{"shingles"}),} + +\item{model}{a matrix containing word embeddings (e.g., GloVe), required only when \code{representation = "vectors"},} + \item{deduplication}{whether deduplication should be applied (default TRUE as y is set to NULL),} \item{on}{variables for ANN search (currently not supported),} @@ -42,7 +48,7 @@ blocking( \item{ann_colnames}{file with column names if \code{x} or \code{y} are indices saved on the disk (currently not supported),} -\item{true_blocks}{matrix with true blocks to calculate evaluation metrics (standard metrics based on confusion matrix as well as all metrics from \code{\link[igraph:compare]{igraph::compare()}} are returned).} +\item{true_blocks}{matrix with true blocks to calculate evaluation metrics (standard metrics based on confusion matrix as well as all metrics from \link[igraph]{compare} are returned).} \item{verbose}{whether log should be provided (0 = none, 1 = main, 2 = ANN algorithm verbose used),} @@ -52,37 +58,37 @@ blocking( \item{n_threads}{number of threads used for the ANN algorithms and adding data for index and query,} -\item{control_txt}{list of controls for text data (passed only to \link[text2vec:itoken]{text2vec::itoken_parallel} or \link[text2vec:itoken]{text2vec::itoken}),} +\item{control_txt}{list of controls for text data (passed only to \link[text2vec]{itoken_parallel} or \link[text2vec]{itoken}), used only when \code{representation = "shingles"},} \item{control_ann}{list of controls for the ANN algorithms.} } \value{ -Returns a list with containing:\cr +Returns a list containing:\cr \itemize{ \item{\code{result} -- \code{data.table} with indices (rows) of x, y, block and distance between points} \item{\code{method} -- name of the ANN algorithm used,} \item{\code{deduplication} -- information whether deduplication was applied,} +\item{\code{representation} -- information whether shingles or vectors were used,} \item{\code{metrics} -- metrics for quality assessment, if \code{true_blocks} is provided,} +\item{\code{confusion} -- confusion matrix, if \code{true_blocks} is provided,} \item{\code{colnames} -- variable names (colnames) used for search,} \item{\code{graph} -- \code{igraph} class object.} } } \description{ -Function creates shingles (strings with 2 characters, default), applies approximate nearest neighbour (ANN) algorithms via the \link{rnndescent}, RcppHNSW, \link{RcppAnnoy} and \link{mlpack} packages, -and creates blocks using graphs via \link{igraph}. 
-} -\details{ -Imports +Function creates shingles (strings with 2 characters, default) or vectors using a given model (e.g., GloVe), +applies approximate nearest neighbour (ANN) algorithms via the \link[rnndescent]{rnndescent}, \link[RcppHNSW]{RcppHNSW}, \link[RcppAnnoy]{RcppAnnoy} and \link[mlpack]{mlpack} packages, +and creates blocks using graphs via \link[igraph]{igraph}. } \examples{ - ## an example using RcppHNSW + df_example <- data.frame(txt = c("jankowalski", "kowalskijan", "kowalskimjan", "kowaljan", "montypython", "pythonmonty", "cyrkmontypython", "monty")) result <- blocking(x = df_example$txt, ann = "hnsw", - control_ann = controls_ann(hnsw = list(M = 5, ef_c = 10, ef_s = 10))) + control_ann = controls_ann(hnsw = control_hnsw(M = 5, ef_c = 10, ef_s = 10))) result @@ -92,7 +98,61 @@ result_lsh <- blocking(x = df_example$txt, ann = "lsh") result_lsh + +## an example using GloVe and RcppAnnoy +\dontrun{ +options(timeout = 500) +utils::download.file("https://nlp.stanford.edu/data/glove.6B.zip", destfile = "glove.6B.zip") +utils::unzip("glove.6B.zip") + +glove_6B_50d <- readr::read_table("glove.6B.50d.txt", + col_names = FALSE, + show_col_types = FALSE) +data.table::setDT(glove_6B_50d) + +glove_vectors <- glove_6B_50d[,-1] +glove_vectors <- as.matrix(glove_vectors) +rownames(glove_vectors) <- glove_6B_50d$X1 + +## spaces between words are required +df_example_spaces <- data.frame(txt = c("jan kowalski", "kowalski jan", "kowalskim jan", +"kowal jan", "monty python", "python monty", "cyrk monty python", "monty")) + +result_annoy <- blocking(x = df_example_spaces$txt, + ann = "annoy", + representation = "vectors", + model = glove_vectors) + +result_annoy +} + +## an example with the NN descent algorithm and true blocks + +data(census) +data(cis) + +set.seed(2024) +census <- census[sample(nrow(census), floor(nrow(census) / 2)), ] +set.seed(2024) +cis <- cis[sample(nrow(cis), floor(nrow(cis) / 2)), ] + +census[, txt:=paste0(pername1, pername2, sex, + dob_day, dob_mon, dob_year, enumcap, enumpc)] +cis[, txt:=paste0(pername1, pername2, sex, + dob_day, dob_mon, dob_year, enumcap, enumpc)] + +matches <- merge(x = census[, .(x=1:.N, person_id)], + y = cis[, .(y = 1:.N, person_id)], + by = "person_id") +matches[, block:=1:.N] + +result_true_blocks <- blocking(x = census$txt, y = cis$txt, verbose = 1, + true_blocks = matches[, .(x, y, block)], + seed = 2024) + +result_true_blocks + } \author{ -Maciej Beręsewicz +Maciej Beręsewicz, Adam Struzik } diff --git a/man/census.Rd b/man/census.Rd new file mode 100644 index 0000000..afb7a26 --- /dev/null +++ b/man/census.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{census} +\alias{census} +\title{Fictional census data} +\format{ +A \code{data.table} with 25343 records. 
Each row represents one record, with the following columns: +\itemize{ +\item{\code{person_id} -- a unique number for each person, consisting of postcode, house number and person number,} +\item{\code{pername1} -- forename,} +\item{\code{pername2} -- surname,} +\item{\code{sex} -- gender (M/F),} +\item{\code{dob_day} -- day of birth,} +\item{\code{dob_mon} -- month of birth,} +\item{\code{dob_year} -- year of birth,} +\item{\code{hse_num} -- house number, a numeric label for each house within a street,} +\item{\code{enumcap} -- an address consisting of house number and street name,} +\item{\code{enumpc} -- postcode,} +\item{\code{str_nam} -- street name of person's household's street,} +\item{\code{cap_add} -- full address, consisting of house number, street name and postcode,} +\item{\code{census_id} -- person ID with "CENS" added in front.} +} +} +\usage{ +census +} +\description{ +This data set was created by Paula McLeod, Dick Heasman and Ian Forbes, ONS, +for the ESSnet DI on-the-job training course, Southampton, 25-28 January 2011. +It contains fictional data representing some observations from a decennial Census. +} +\examples{ + +data("census") +head(census) + +} +\references{ +McLeod, P., Heasman, D., Forbes, I. (2011). Simulated data for the ESSnet DI on-the-job training course, +Southampton, 25-28 January 2011. +\url{https://wayback.archive-it.org/12090/20231221144450/https://cros-legacy.ec.europa.eu/content/job-training_en} +} +\keyword{datasets} diff --git a/man/cis.Rd b/man/cis.Rd new file mode 100644 index 0000000..e3b42f9 --- /dev/null +++ b/man/cis.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{cis} +\alias{cis} +\title{Fictional customer data} +\format{ +A \code{data.table} with 24613 records. Each row represents one record, with the following columns: +\itemize{ +\item{\code{person_id} -- a unique number for each person, consisting of postcode, house number and person number,} +\item{\code{pername1} -- forename,} +\item{\code{pername2} -- surname,} +\item{\code{sex} -- gender (M/F),} +\item{\code{dob_day} -- day of birth,} +\item{\code{dob_mon} -- month of birth,} +\item{\code{dob_year} -- year of birth,} +\item{\code{enumcap} -- an address consisting of house number and street name,} +\item{\code{enumpc} -- postcode,} +\item{\code{cis_id} -- person ID with "CIS" added in front.} +} +} +\usage{ +cis +} +\description{ +This data set was created by Paula McLeod, Dick Heasman and Ian Forbes, ONS, +for the ESSnet DI on-the-job training course, Southampton, 25-28 January 2011. +It contains fictional observations from Customer Information System, +which is combined administrative data from the tax and benefit systems. +} +\examples{ + +data("cis") +head(cis) + +} +\references{ +McLeod, P., Heasman, D., Forbes, I. (2011). Simulated data for the ESSnet DI on-the-job training course, +Southampton, 25-28 January 2011. +\url{https://wayback.archive-it.org/12090/20231221144450/https://cros-legacy.ec.europa.eu/content/job-training_en} +} +\keyword{datasets} diff --git a/man/control_annoy.Rd b/man/control_annoy.Rd new file mode 100644 index 0000000..7c3d168 --- /dev/null +++ b/man/control_annoy.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/controls.R +\name{control_annoy} +\alias{control_annoy} +\title{Controls for Annoy} +\usage{ +control_annoy(n_trees = 250, build_on_disk = FALSE, ...) 
+} +\arguments{ +\item{n_trees}{An integer specifying the number of trees to build in the Annoy index.} + +\item{build_on_disk}{A logical value indicating whether to build the Annoy index on disk instead of in memory.} + +\item{...}{Additional arguments.} +} +\value{ +Returns a list with parameters. +} +\description{ +Controls for Annoy algorithm used in the package (see \link[RcppAnnoy]{RcppAnnoy} for details). +} diff --git a/man/control_hnsw.Rd b/man/control_hnsw.Rd new file mode 100644 index 0000000..ce5b6d7 --- /dev/null +++ b/man/control_hnsw.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/controls.R +\name{control_hnsw} +\alias{control_hnsw} +\title{Controls for HNSW} +\usage{ +control_hnsw(M = 25, ef_c = 200, ef_s = 200, grain_size = 1, byrow = TRUE, ...) +} +\arguments{ +\item{M}{Controls the number of bi-directional links created for each element during index construction.} + +\item{ef_c}{Size of the dynamic list used during construction.} + +\item{ef_s}{Size of the dynamic list used during search.} + +\item{grain_size}{Minimum amount of work to do (rows in the dataset to add) per thread.} + +\item{byrow}{If \code{TRUE} (the default), this indicates that the items in the dataset to be indexed are stored in each row. +Otherwise, the items are stored in the columns of the dataset.} + +\item{...}{Additional arguments.} +} +\value{ +Returns a list with parameters. +} +\description{ +Controls for HNSW algorithm used in the package (see \code{\link[RcppHNSW:hnsw_build]{RcppHNSW::hnsw_build()}} and \code{\link[RcppHNSW:hnsw_search]{RcppHNSW::hnsw_search()}} for details). +} diff --git a/man/control_kd.Rd b/man/control_kd.Rd new file mode 100644 index 0000000..ebec1ba --- /dev/null +++ b/man/control_kd.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/controls.R +\name{control_kd} +\alias{control_kd} +\title{Controls for KD} +\usage{ +control_kd( + algorithm = "dual_tree", + epsilon = 0, + leaf_size = 20, + random_basis = FALSE, + rho = 0.7, + tau = 0, + tree_type = "kd", + ... +) +} +\arguments{ +\item{algorithm}{Type of neighbor search: \code{'naive'}, \code{'single_tree'}, \code{'dual_tree'}, \code{'greedy'}.} + +\item{epsilon}{If specified, will do approximate nearest neighbor search with given relative error.} + +\item{leaf_size}{Leaf size for tree building +(used for kd-trees, vp trees, random projection trees, UB trees, R trees, R* trees, X trees, Hilbert R trees, R+ trees, R++ trees, spill trees, and octrees).} + +\item{random_basis}{Before tree-building, project the data onto a random orthogonal basis.} + +\item{rho}{Balance threshold (only valid for spill trees).} + +\item{tau}{Overlapping size (only valid for spill trees).} + +\item{tree_type}{Type of tree to use: \code{'kd'}, \code{'vp'}, \code{'rp'}, \code{'max-rp'}, \code{'ub'}, \code{'cover'}, \code{'r'}, \code{'r-star'}, +\code{'x'}, \code{'ball'}, \code{'hilbert-r'}, \code{'r-plus'}, \code{'r-plus-plus'}, \code{'spill'}, \code{'oct'}.} + +\item{...}{Additional arguments.} +} +\value{ +Returns a list with parameters. +} +\description{ +Controls for KD algorithm used in the package (see \link[mlpack]{knn} for details). 
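A sketch of how these controls are intended to be passed through `controls_ann()` to `blocking()` (toy data; assuming `ann = "kd"` selects the mlpack KD-tree backend, as the test files suggest):

```r
library(blocking)

df <- data.frame(txt = c("jankowalski", "kowalskijan", "montypython", "pythonmonty"))

# vp-tree instead of the default kd-tree; see mlpack::knn for the options
result_kd <- blocking(x = df$txt,
                      ann = "kd",
                      control_ann = controls_ann(kd = control_kd(tree_type = "vp")))
result_kd
```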
+} diff --git a/man/control_lsh.Rd b/man/control_lsh.Rd new file mode 100644 index 0000000..8b50c37 --- /dev/null +++ b/man/control_lsh.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/controls.R +\name{control_lsh} +\alias{control_lsh} +\title{Controls for LSH} +\usage{ +control_lsh( + bucket_size = 10, + hash_width = 6, + num_probes = 5, + projections = 10, + tables = 30, + ... +) +} +\arguments{ +\item{bucket_size}{The size of a bucket in the second level hash.} + +\item{hash_width}{The hash width for the first-level hashing in the LSH preprocessing.} + +\item{num_probes}{Number of additional probes for multiprobe LSH.} + +\item{projections}{The number of hash functions for each table.} + +\item{tables}{The number of hash tables to be used.} + +\item{...}{Additional arguments.} +} +\value{ +Returns a list with parameters. +} +\description{ +Controls for LSH algorithm used in the package (see \link[mlpack]{lsh} for details). +} diff --git a/man/control_nnd.Rd b/man/control_nnd.Rd new file mode 100644 index 0000000..343a1bd --- /dev/null +++ b/man/control_nnd.Rd @@ -0,0 +1,85 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/controls.R +\name{control_nnd} +\alias{control_nnd} +\title{Controls for NND} +\usage{ +control_nnd( + k_build = 30, + use_alt_metric = FALSE, + init = "tree", + n_trees = NULL, + leaf_size = NULL, + max_tree_depth = 200, + margin = "auto", + n_iters = NULL, + delta = 0.001, + max_candidates = NULL, + low_memory = TRUE, + n_search_trees = 1, + pruning_degree_multiplier = 1.5, + diversify_prob = 1, + weight_by_degree = FALSE, + prune_reverse = FALSE, + progress = "bar", + obs = "R", + max_search_fraction = 1, + epsilon = 0.1, + ... +) +} +\arguments{ +\item{k_build}{Number of nearest neighbors to build the index for.} + +\item{use_alt_metric}{If \code{TRUE}, use faster metrics that maintain the ordering of distances internally (e.g. squared Euclidean distances if using \code{metric = "euclidean"}), +then apply a correction at the end.} + +\item{init}{Name of the initialization strategy or initial data neighbor graph to optimize.} + +\item{n_trees}{The number of trees to use in the RP forest. +Only used if \code{init = "tree"}.} + +\item{leaf_size}{The maximum number of items that can appear in a leaf. +Only used if \code{init = "tree"}.} + +\item{max_tree_depth}{The maximum depth of the tree to build (default = 200). +Only used if \code{init = "tree"}.} + +\item{margin}{A character string specifying the method used to assign points to one side of the hyperplane or the other.} + +\item{n_iters}{Number of iterations of nearest neighbor descent to carry out.} + +\item{delta}{The minimum relative change in the neighbor graph allowed before early stopping. Should be a value between 0 and 1. The smaller the value, the smaller the amount of progress between iterations is allowed.} + +\item{max_candidates}{Maximum number of candidate neighbors to try for each item in each iteration.} + +\item{low_memory}{If \code{TRUE}, use a lower memory, but more computationally expensive approach to index construction. If set to \code{FALSE}, you should see a noticeable speed improvement, especially when using a smaller number of threads, so this is worth trying if you have the memory to spare.} + +\item{n_search_trees}{The number of trees to keep in the search forest as part of index preparation. 
The default is 1.} + +\item{pruning_degree_multiplier}{How strongly to truncate the final neighbor list for each item.} + +\item{diversify_prob}{The degree of diversification of the search graph by removing unnecessary edges through occlusion pruning.} + +\item{weight_by_degree}{If \code{TRUE}, then candidates for the local join are weighted according to their in-degree, +so that if there are more than \code{max_candidates} in a candidate list, candidates with a smaller degree are favored for retention.} + +\item{prune_reverse}{If \code{TRUE}, prune the reverse neighbors of each item before the reverse graph diversification step using \code{pruning_degree_multiplier}.} + +\item{progress}{Determines the type of progress information logged during the nearest neighbor descent stage.} + +\item{obs}{set to \code{C} to indicate that the input data orientation stores each observation as a column. +The default \code{R} means that observations are stored in each row.} + +\item{max_search_fraction}{Maximum fraction of the reference data to search.} + +\item{epsilon}{Controls trade-off between accuracy and search cost.} + +\item{...}{Additional arguments.} +} +\value{ +Returns a list with parameters. +} +\description{ +Controls for NND algorithm used in the package (see \link[rnndescent]{rnnd_build} and \link[rnndescent]{rnnd_query} for details). +} diff --git a/man/controls_ann.Rd b/man/controls_ann.Rd index e321a63..5582266 100644 --- a/man/controls_ann.Rd +++ b/man/controls_ann.Rd @@ -7,18 +7,11 @@ controls_ann( sparse = FALSE, k_search = 30, - nnd = list(k_build = 30, use_alt_metric = FALSE, init = "tree", n_trees = NULL, - leaf_size = NULL, max_tree_depth = 200, margin = "auto", n_iters = NULL, delta = - 0.001, max_candidates = NULL, low_memory = TRUE, n_search_trees = 1, - pruning_degree_multiplier = 1.5, diversify_prob = 1, weight_by_degree = FALSE, - prune_reverse = FALSE, progress = "bar", obs = "R", max_search_fraction = 1, epsilon - = 0.1), - hnsw = list(M = 25, ef_c = 200, ef_s = 200, grain_size = 1, byrow = TRUE), - lsh = list(bucket_size = 500, hash_width = 10, num_probes = 0, projections = 10, tables - = 30), - kd = list(algorithm = "dual_tree", epsilon = 0, leaf_size = 20, random_basis = FALSE, - rho = 0.7, tau = 0, tree_type = "kd"), - annoy = list(n_trees = 250, build_on_disk = FALSE) + nnd = control_nnd(), + hnsw = control_hnsw(), + lsh = control_lsh(), + kd = control_kd(), + annoy = control_annoy() ) } \arguments{ @@ -26,21 +19,21 @@ controls_ann( \item{k_search}{number of neighbours to search,} -\item{nnd}{list of parameters for \code{\link[rnndescent:rnnd_build]{rnndescent::rnnd_build()}} and \code{\link[rnndescent:rnnd_query]{rnndescent::rnnd_query()}},} +\item{nnd}{parameters for \link[rnndescent]{rnnd_build} and \link[rnndescent]{rnnd_query} (should be inside \link{control_nnd} function),} -\item{hnsw}{list of parameters for \code{\link[RcppHNSW:hnsw_build]{RcppHNSW::hnsw_build()}} and \code{\link[RcppHNSW:hnsw_search]{RcppHNSW::hnsw_search()}},} +\item{hnsw}{parameters for \link[RcppHNSW]{hnsw_build} and \link[RcppHNSW]{hnsw_search} (should be inside \link{control_hnsw} function),} -\item{lsh}{list of parameters for \code{\link[mlpack:lsh]{mlpack::lsh()}} function,} +\item{lsh}{parameters for \link[mlpack]{lsh} function (should be inside \link{control_lsh} function),} -\item{kd}{list of kd parameters for \code{\link[mlpack:knn]{mlpack::knn()}} function,} +\item{kd}{kd parameters for \link[mlpack]{knn} function (should be inside \link{control_kd} function),} 
-\item{annoy}{list of parameters for \link{RcppAnnoy} package.} +\item{annoy}{parameters for \link[RcppAnnoy]{RcppAnnoy} package (should be inside \link{control_annoy} function).} } \value{ -Returns a list with parameters +Returns a list with parameters. } \description{ -Controls for ANN algorithms used in the package +Controls for ANN algorithms used in the package. } \author{ Maciej Beręsewicz diff --git a/man/controls_txt.Rd b/man/controls_txt.Rd index 0e71a13..aefef33 100644 --- a/man/controls_txt.Rd +++ b/man/controls_txt.Rd @@ -24,7 +24,7 @@ controls_txt( Returns a list with parameters. } \description{ -Controls for text data used in the \code{blocking} functions, passed to \link[tokenizers:shingle-tokenizers]{tokenizers::tokenize_character_shingles}. +Controls for text data used in the \code{blocking} function (if \code{representation = shingles}), passed to \link[tokenizers]{tokenize_character_shingles}. } \author{ Maciej Beręsewicz diff --git a/man/est_block_error.Rd b/man/est_block_error.Rd new file mode 100644 index 0000000..a35d6ef --- /dev/null +++ b/man/est_block_error.Rd @@ -0,0 +1,133 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/est_block_error.R +\name{est_block_error} +\alias{est_block_error} +\title{Estimate errors due to blocking in record linkage} +\usage{ +est_block_error( + x = NULL, + y = NULL, + blocking_result = NULL, + n = NULL, + N = NULL, + G, + alpha = NULL, + p = NULL, + lambda = NULL, + tol = 10^(-6), + maxiter = 1000, + sample_size = NULL +) +} +\arguments{ +\item{x}{Reference data (required if \code{n} and \code{N} are not provided).} + +\item{y}{Query data (required if \code{n} is not provided).} + +\item{blocking_result}{\code{data.frame} or \code{data.table} containing blocking results (required if \code{n} is not provided).} + +\item{n}{Integer vector of numbers of accepted pairs formed by each record in the query data set +with records in the reference data set, based on blocking criteria (if \code{NULL}, derived from \code{blocking_result}).} + +\item{N}{Total number of records in the reference data set (if \code{NULL}, derived as \code{length(x)}).} + +\item{G}{Number of classes in the finite mixture model.} + +\item{alpha}{Numeric vector of initial class proportions (length \code{G}; if \code{NULL}, initialized as \code{rep(1/G, G)}).} + +\item{p}{Numeric vector of initial matching probabilities in each class of the mixture model +(length \code{G}; if \code{NULL}, randomly initialized from \code{runif(G, 0.5, 1)}).} + +\item{lambda}{Numeric vector of initial Poisson distribution parameters for non-matching records in each class of the mixture model +(length \code{G}; if \code{NULL}, randomly initialized from \code{runif(G, 0.1, 2)}).} + +\item{tol}{Convergence tolerance for the EM algorithm (default \code{10^(-6)}).} + +\item{maxiter}{Maximum number of iterations for the EM algorithm (default \code{1000}).} + +\item{sample_size}{Bootstrap sample (from \code{n}) size used for calculations (if \code{NULL}, uses all data).} +} +\value{ +Returns a list containing:\cr +\itemize{ +\item{\code{FPR} -- estimated false positive rate,} +\item{\code{FNR} -- estimated false negative rate,} +\item{\code{iter} -- number of the EM algorithm iterations performed,} +\item{\code{convergence} -- logical, indicating whether the EM algorithm converged within \code{maxiter} iterations.} +} +} +\description{ +Function computes estimators for false positive rate (FPR) and false negative rate (FNR) due to blocking in record linkage, 
+as proposed by Dasylva and Goussanou (2021). Assumes duplicate-free data sources, +complete coverage of the reference data set and blocking decisions based solely on record pairs. +} +\details{ +Consider a large finite population that comprises \eqn{N} individuals, and two duplicate-free data sources: a register and a file. +Assume that the register has no undercoverage, +i.e. each record from the file corresponds to exactly one record from the same individual in the register. +Let \eqn{n_i} denote the number of register records which form an accepted (by the blocking criteria) pair with +record \eqn{i} on the file. Assume that:\cr +\itemize{ +\item two matched records are neighbours with a probability that is bounded away from \eqn{0} regardless of \eqn{N}, +\item two unmatched records are accidental neighbours with a probability of \eqn{O(\frac{1}{N})}. +} +The finite mixture model \eqn{n_i \sim \sum_{g=1}^G \alpha_g(\text{Bernoulli}(p_g) \ast \text{Poisson}(\lambda_g))} is assumed. +When \eqn{G} is fixed, the unknown model parameters are given by the vector \eqn{\psi = [(\alpha_g, p_g, \lambda_g)]_{1 \leq g \leq G}} +that may be estimated with the Expectation-Maximization (EM) procedure. + +Let \eqn{n_i = n_{i|M} + n_{i|U}}, where \eqn{n_{i|M}} is the number of matched neighbours +and \eqn{n_{i|U}} is the number of unmatched neighbours, and let \eqn{c_{ig}} denote +the indicator that record \eqn{i} is from class \eqn{g}. +For the E-step of the EM procedure, the equations are as follows +\deqn{ +\begin{aligned} +P(n_i | c_{ig} = 1) &= I(n_i = 0)(1-p_g)e^{-\lambda_g}+I(n_i > 0)\Bigl(p_g+(1-p_g)\frac{\lambda_g}{n_i}\Bigr)\frac{e^{-\lambda_g}\lambda_g^{n_i-1}}{(n_i-1)!}, \\ +P(c_{ig} = 1 | n_i) &= \frac{\alpha_gP(n_i | c_{ig} = 1)}{\sum_{g'=1}^G\alpha_{g'}P(n_i | c_{ig'} = 1)}, \\ +P(n_{i|M} = 1 | n_i,c_{ig} = 1) &= \frac{p_gn_i}{p_gn_i + (1-p_g)\lambda_g}, \\ +P(n_{i|U} = n_i | n_i,c_{ig} = 1) &= I(n_i = 0) + I(n_i > 0)\frac{(1-p_g)\lambda_g}{p_gn_i + (1-p_g)\lambda_g}, \\ +P(n_{i|U} = n_i-1 | n_i,c_{ig} = 1) &= \frac{p_gn_i}{p_gn_i + (1-p_g)\lambda_g}, \\ +E[c_{ig}n_{i|M} | n_i] &= P(c_{ig} = 1 | n_i)P(n_{i|M} = 1 | n_i,c_{ig} = 1), \\ +E[n_{i|U} | n_i,c_{ig} = 1] &= \Bigl(\frac{p_g(n_i-1) + (1-p_g)\lambda_g}{p_gn_i + (1-p_g)\lambda_g}\Bigr)n_i, \\ +E[c_{ig}n_{i|U} | n_i] &= P(c_{ig} = 1 | n_i)E[n_{i|U} | n_i,c_{ig} = 1]. +\end{aligned} +} +The M-step is given by the following equations +\deqn{ +\begin{aligned} +\hat{p}_g &= \frac{\sum_{i=1}^mE[c_{ig}n_{i|M} | n_i;\psi]}{\sum_{i=1}^mE[c_{ig} | n_i; \psi]}, \\ +\hat{\lambda}_g &= \frac{\sum_{i=1}^mE[c_{ig}n_{i|U} | n_i; \psi]}{\sum_{i=1}^mE[c_{ig} | n_i; \psi]}, \\ +\hat{\alpha}_g &= \frac{1}{m}\sum_{i=1}^mE[c_{ig} | n_i; \psi]. +\end{aligned} +} +As \eqn{N \to \infty}, the error rates and the model parameters are related as follows +\deqn{ +\begin{aligned} +\text{FNR} &\xrightarrow{p} 1 - E[p(v_i)], \\ +(N-1)\text{FPR} &\xrightarrow{p} E[\lambda(v_i)], +\end{aligned} +} +where \eqn{E[p(v_i)] = \sum_{g=1}^G\alpha_gp_g} and \eqn{E[\lambda(v_i)] = \sum_{g=1}^G\alpha_g\lambda_g}. +} +\examples{ +## an example proposed by Dasylva and Goussanou (2021) + +set.seed(111) + +neighbors <- rep(0:5, c(1659, 53951, 6875, 603, 62, 5)) + +errors <- est_block_error(n = neighbors, + N = 63155, + G = 2, + tol = 10^(-3), + maxiter = 50) + +errors + +} +\references{ +Dasylva, A., Goussanou, A. (2021). Estimating the false negatives due to blocking in record linkage. +Survey Methodology, Statistics Canada, Catalogue No. 12-001-X, Vol. 47, No. 2.
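Two quick numeric sanity checks on the model above, with illustrative (made-up) parameter values rather than output from a real fit: the Bernoulli-Poisson convolution density sums to one, and the limiting relations turn fitted mixture parameters into error-rate estimates exactly as in the closing lines of `est_block_error()`:

```r
# density of n_i within class g, i.e. Bernoulli(p) convolved with Poisson(lambda)
p <- 0.9; lambda <- 0.3
dens <- function(n) {
  ifelse(n == 0,
         (1 - p) * exp(-lambda),
         (p + (1 - p) * lambda / n) * dpois(n - 1, lambda))
}
sum(dens(0:50))  # numerically 1

# error rates from hypothetical fitted parameters (G = 2)
alpha <- c(0.9, 0.1); p_hat <- c(0.95, 0.60); l_hat <- c(0.02, 0.50)
N <- 63155
c(FNR = 1 - sum(alpha * p_hat),        # 1 - E[p(v_i)]
  FPR = sum(alpha * l_hat) / (N - 1))  # E[lambda(v_i)] / (N - 1)
```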
+ +Dasylva, A., Goussanou, A. (2022). On the consistent estimation of linkage errors without training data. +Jpn J Stat Data Sci 5, 181–216. \doi{10.1007/s42081-022-00153-3} +} diff --git a/man/eval_dedup.Rd b/man/eval_dedup.Rd new file mode 100644 index 0000000..d36e358 --- /dev/null +++ b/man/eval_dedup.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/eval.R +\name{eval_dedup} +\alias{eval_dedup} +\title{Evaluation for deduplication} +\usage{ +eval_dedup(pred_df, true_df) +} +\arguments{ +\item{pred_df}{Output from the blocking algorithm.} + +\item{true_df}{Ground-truth links (may be subset).} +} +\value{ +Returns a list containing TP, FP, FN and TN. +} +\description{ +Function calculates TP, FP, FN and TN for deduplication. +} diff --git a/man/eval_reclin.Rd b/man/eval_reclin.Rd new file mode 100644 index 0000000..178474e --- /dev/null +++ b/man/eval_reclin.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/eval.R +\name{eval_reclin} +\alias{eval_reclin} +\title{Evaluation for record linkage} +\usage{ +eval_reclin(pred_df, true_df) +} +\arguments{ +\item{pred_df}{Output from the blocking algorithm.} + +\item{true_df}{Ground-truth links (may be subset).} +} +\value{ +Returns a list containing TP, FP, FN and TN. +} +\description{ +Function calculates TP, FP, FN and TN for record linkage. +} diff --git a/man/foreigners.Rd b/man/foreigners.Rd new file mode 100644 index 0000000..f0dc3cc --- /dev/null +++ b/man/foreigners.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{foreigners} +\alias{foreigners} +\title{Fictional 2024 population of foreigners in Poland} +\format{ +A \code{data.table} with 110000 records. Each row represents one record, with the following columns: +\itemize{ +\item{\code{fname} -- first name,} +\item{\code{sname} -- second name,} +\item{\code{surname} -- surname,} +\item{\code{date} -- date of birth,} +\item{\code{region} -- region (county),} +\item{\code{country} -- country,} +\item{\code{true_id} -- person ID.} +} +} +\usage{ +foreigners +} +\description{ +A fictional data set of the foreign population in Poland, +generated based on publicly available information +while maintaining the distributions from administrative registers. +} +\examples{ + +data("foreigners") +head(foreigners) + +} +\keyword{datasets} diff --git a/man/get_confusion.Rd b/man/get_confusion.Rd new file mode 100644 index 0000000..64de796 --- /dev/null +++ b/man/get_confusion.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/eval.R +\name{get_confusion} +\alias{get_confusion} +\title{Confusion matrix} +\usage{ +get_confusion(TP, FP, FN, TN) +} +\arguments{ +\item{TP}{number of true positives} + +\item{FP}{number of false positives} + +\item{FN}{number of false negatives} + +\item{TN}{number of true negatives} +} +\value{ +Returns a confusion matrix. +} +\description{ +Function creates a confusion matrix from raw counts. +} diff --git a/man/get_metrics.Rd b/man/get_metrics.Rd new file mode 100644 index 0000000..19ce854 --- /dev/null +++ b/man/get_metrics.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/eval.R +\name{get_metrics} +\alias{get_metrics} +\title{Metrics for evaluating deduplication and record linkage} +\usage{ +get_metrics(TP, FP, FN, TN) +} +\arguments{ +\item{TP}{number of true positives} + +\item{FP}{number of false positives} + +\item{FN}{number of false negatives} + +\item{TN}{number of true negatives} +} +\value{ +Returns a list containing evaluation metrics. 
+} +\description{ +Function calculates standard evaluation metrics. +} diff --git a/man/method_annoy.Rd b/man/method_annoy.Rd index 433e2bc..6aa62a1 100644 --- a/man/method_annoy.Rd +++ b/man/method_annoy.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/method_annoy.R \name{method_annoy} \alias{method_annoy} -\title{An internal function to use Annoy algorithm via the \link{RcppAnnoy} package.} +\title{An internal function to use Annoy algorithm via the \link[RcppAnnoy]{RcppAnnoy} package.} \usage{ method_annoy(x, y, k, distance, verbose, path, seed, control) } @@ -21,13 +21,10 @@ method_annoy(x, y, k, distance, verbose, path, seed, control) \item{seed}{seed for the pseudo-random numbers algorithm,} -\item{control}{controls for \code{new} or \code{build} methods for \link{RcppAnnoy}.} +\item{control}{controls for \code{new} or \code{build} methods for \link[RcppAnnoy]{RcppAnnoy}.} } \description{ -See details of the \link{RcppAnnoy} package. -} -\details{ -Imports +See details of the \link[RcppAnnoy]{RcppAnnoy} package. } \author{ Maciej Beręsewicz diff --git a/man/method_hnsw.Rd b/man/method_hnsw.Rd index 6b2cd5f..baa0d47 100644 --- a/man/method_hnsw.Rd +++ b/man/method_hnsw.Rd @@ -24,7 +24,7 @@ method_hnsw(x, y, k, distance, verbose, n_threads, path, control) \item{control}{controls for the HNSW algorithm.} } \description{ -See details of \link[RcppHNSW:hnsw_build]{RcppHNSW::hnsw_build} and \link[RcppHNSW:hnsw_search]{RcppHNSW::hnsw_search}. +See details of \link[RcppHNSW]{hnsw_build} and \link[RcppHNSW]{hnsw_search}. } \author{ Maciej Beręsewicz diff --git a/man/method_mlpack.Rd b/man/method_mlpack.Rd index fe01e60..faeed7a 100644 --- a/man/method_mlpack.Rd +++ b/man/method_mlpack.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/method_mlpack.R \name{method_mlpack} \alias{method_mlpack} -\title{An internal function to use the LSH and KD-tree algorithm via the \link{mlpack} package.} +\title{An internal function to use the LSH and KD-tree algorithm via the \link[mlpack]{mlpack} package.} \usage{ method_mlpack(x, y, algo = c("lsh", "kd"), k, verbose, seed, path, control) } @@ -24,7 +24,7 @@ method_mlpack(x, y, algo = c("lsh", "kd"), k, verbose, seed, path, control) \item{control}{controls for the \code{lsh} or \code{kd} algorithms.} } \description{ -See details of \link[mlpack:lsh]{mlpack::lsh} and \link[mlpack:knn]{mlpack::knn} +See details of \link[mlpack]{lsh} and \link[mlpack]{knn}. } \author{ Maciej Beręsewicz diff --git a/man/method_nnd.Rd b/man/method_nnd.Rd index aadcae4..5eddf17 100644 --- a/man/method_nnd.Rd +++ b/man/method_nnd.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/method_nnd.R \name{method_nnd} \alias{method_nnd} -\title{An internal function to use the NN descent algorithm via the \link{rnndescent} package.} +\title{An internal function to use the NN descent algorithm via the \link[rnndescent]{rnndescent} package.} \usage{ method_nnd(x, y, k, distance, deduplication, verbose, n_threads, control) } @@ -24,7 +24,7 @@ method_nnd(x, y, k, distance, deduplication, verbose, n_threads, control) \item{control}{controls for the NN descent algorithm.} } \description{ -See details of \link[rnndescent:rnnd_build]{rnndescent::rnnd_build} and \link[rnndescent:rnnd_query]{rnndescent::rnnd_query}. +See details of \link[rnndescent]{rnnd_build} and \link[rnndescent]{rnnd_query}. 
 }
 \author{
 Maciej Beręsewicz
diff --git a/man/pair_ann.Rd b/man/pair_ann.Rd
index f5e240b..486034b 100644
--- a/man/pair_ann.Rd
+++ b/man/pair_ann.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/reclin2_pair_ann.R
 \name{pair_ann}
 \alias{pair_ann}
-\title{Integration with the reclin2 package}
+\title{Integration with the \pkg{reclin2} package}
 \usage{
 pair_ann(
   x,
@@ -30,16 +30,14 @@ pair_ann(
 
 \item{add_xy}{whether to add x and y,}
 
-\item{...}{arguments passed to \code{\link[=blocking]{blocking()}} function.}
+\item{...}{arguments passed to the \link{blocking} function.}
 }
 \value{
-Returns a \link{data.table} with two columns \code{.x} and \code{.y}. Columns \code{.x} and \code{.y} are row numbers from data.frames x and y respectively. Returning data.table is also of a class \code{pairs} which allows for integration with the \code{\link[reclin2:compare_pairs]{reclin2::compare_pairs()}} package.
+Returns a \link[data.table]{data.table} with two columns \code{.x} and \code{.y}. Columns \code{.x} and \code{.y} are row numbers from data.frames \code{x} and \code{y}, respectively.
+The returned \code{data.table} is also of class \code{pairs}, which allows for integration with the \link[reclin2]{compare_pairs} function.
 }
 \description{
-Function for the integration with the \code{reclin2} package. The function is based on \code{\link[reclin2:pair_minsim]{reclin2::pair_minsim()}} and reuses some of its source code.
-}
-\details{
-Imports
+Function for the integration with the \pkg{reclin2} package. The function is based on \link[reclin2]{pair_minsim} and reuses some of its source code.
 }
 \examples{
diff --git a/man/sentence_to_vector.Rd b/man/sentence_to_vector.Rd
new file mode 100644
index 0000000..997b3d9
--- /dev/null
+++ b/man/sentence_to_vector.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sentence_to_vector.R
+\name{sentence_to_vector}
+\alias{sentence_to_vector}
+\title{Sentence to vector}
+\usage{
+sentence_to_vector(sentences, model)
+}
+\arguments{
+\item{sentences}{a character vector,}
+
+\item{model}{a matrix containing word embeddings (e.g., GloVe).}
+}
+\description{
+Creates a matrix of word embeddings for the given sentences using the supplied model.
+}
diff --git a/vignettes/.gitignore b/vignettes/.gitignore
deleted file mode 100644
index 097b241..0000000
--- a/vignettes/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*.html
-*.R
diff --git a/vignettes/v1-deduplication.Rmd b/vignettes/v1-deduplication.Rmd
index 28b903b..4a75eee 100644
--- a/vignettes/v1-deduplication.Rmd
+++ b/vignettes/v1-deduplication.Rmd
@@ -25,49 +25,47 @@ knitr::opts_chunk$set(
 
 # Setup
 
-Read required packages
+Read required packages.
 
 ```{r setup}
 library(blocking)
-library(reclin2)
 library(data.table)
 ```
 
-Read the `RLdata500` data from the [RecordLinkage](https://CRAN.R-project.org/package=RecordLinkage) package from the [dblink](https://github.com/cleanzr/dblink) Github repository.
+Read the `RLdata500` data (taken from the [RecordLinkage](https://CRAN.R-project.org/package=RecordLinkage) package).
 
 ```{r}
-df <- read.csv("https://raw.githubusercontent.com/cleanzr/dblink/dc3dd0daf55f8a303863423817a0f0042b3c275a/examples/RLdata500.csv")
-setDT(df)
-head(df)
+data(RLdata500)
+head(RLdata500)
 ```
 
-This dataset contains `r nrow(df)` with `r NROW(unique(df$ent_id))` entities.
+This dataset contains `r nrow(RLdata500)` rows with `r NROW(unique(RLdata500$ent_id))` entities.
 
 # Blocking for deduplication
 
 Now we create a new column that concatenates the information in each row.
 ```{r}
-df[, id_count :=.N, ent_id] ## how many times given unit occurs
-df[is.na(fname_c2), fname_c2:=""]
-df[is.na(lname_c2), lname_c2:=""]
-df[, bm:=sprintf("%02d", bm)] ## add leading zeros to month
-df[, bd:=sprintf("%02d", bd)] ## add leading zeros to month
-df[, txt:=tolower(paste0(fname_c1,fname_c2,lname_c1,lname_c2,by,bm,bd))]
-head(df)
+RLdata500[, id_count :=.N, ent_id] ## how many times given unit occurs
+RLdata500[is.na(fname_c2), fname_c2:=""]
+RLdata500[is.na(lname_c2), lname_c2:=""]
+RLdata500[, bm:=sprintf("%02d", bm)] ## add leading zeros to month
+RLdata500[, bd:=sprintf("%02d", bd)] ## add leading zeros to day
+RLdata500[, txt:=tolower(paste0(fname_c1,fname_c2,lname_c1,lname_c2,by,bm,bd))]
+head(RLdata500)
 ```
 
 In the next step we use the newly created column in the `blocking` function. If we specify verbose, we get information about the progress.
 
 ```{r}
 set.seed(2024)
-df_blocks <- blocking(x = df$txt, ann = "nnd", verbose = 1, graph = TRUE)
+df_blocks <- blocking(x = RLdata500$txt, ann = "nnd", verbose = 1, graph = TRUE)
 ```
 
 Results are as follows:
 
-+ based in `rnndescent` we have created `r NROW(unique(df_blocks$result$block))` blocks,
++ based on `rnndescent` we have created `r NROW(unique(df_blocks$result$block))` blocks,
 + it was based on `r NROW(unique(df_blocks$colnames))` columns (2 character shingles),
-+ we have 93 blocks of 2 elements, 38 blocks of 3 elements, ..., 3 block of 6 elements.
++ we have 45 blocks of 2 elements, 33 blocks of 3 elements, ..., 1 block of 17 elements.
 
 ```{r}
 df_blocks
@@ -75,26 +73,28 @@ df_blocks
 
 Structure of the object is as follows:
 
-+ `result` - a data.table with identifiers and block IDs,
-+ `method` - the method used,
++ `result` -- a `data.table` with identifiers and block IDs,
++ `method` -- the method used,
 + `deduplication` -- whether deduplication was applied,
-+ `metrics` - standard metrics and based on the `igraph::compare` methods for comparing graphs (here NULL),
-+ `colnames` - column names used for the comparison,
++ `representation` -- whether shingles or vectors were used,
++ `metrics` -- standard metrics and metrics based on the `igraph::compare` methods for comparing graphs (here NULL),
++ `confusion` -- confusion matrix (here NULL),
++ `colnames` -- column names used for the comparison,
 + `graph` -- an `igraph` object mainly for visualisation.
 
 ```{r}
 str(df_blocks,1)
 ```
 
-Plot connections
+Plot connections.
 
 ```{r}
 plot(df_blocks$graph, vertex.size=1, vertex.label = NA)
 ```
 
-The resulting data.table has three columns:
+The resulting `data.table` has four columns:
 
-+ `x` -- reference dataset (i.e. `df`) -- this may not contain all units of `df`,
-+ `y` - query (each row of `df`) -- this may not contain all units of `df`,
++ `x` -- reference dataset (i.e. `RLdata500`) -- this may not contain all units of `RLdata500`,
++ `y` -- query (each row of `RLdata500`) -- this may not contain all units of `RLdata500`,
 + `block` -- the block ID,
 + `dist` -- distance between objects.
@@ -113,17 +113,17 @@ head(df_block_melted_rec_block)
 
 We add block information to the final dataset.
 
 ```{r}
-df[df_block_melted_rec_block, on = "rec_id", block_id := i.block]
-head(df)
+RLdata500[df_block_melted_rec_block, on = "rec_id", block_id := i.block]
+head(RLdata500)
 ```
 
 We can check in how many blocks the same entities (`ent_id`) are observed. In our example, all the same entities are in the same blocks.
 ```{r}
-df[, .(uniq_blocks = uniqueN(block_id)), .(ent_id)][, .N, uniq_blocks]
+RLdata500[, .(uniq_blocks = uniqueN(block_id)), .(ent_id)][, .N, uniq_blocks]
 ```
 
-We can visualise the distances between units stored in the `df_blocks$result` data set. Clearly we a mixture of two groups: matches (close to 0) and non-matches (close to 1).
+We can visualise the distances between units stored in the `df_blocks$result` data set. Clearly we have a mixture of two groups: matches (close to 0) and non-matches (close to 1).
 
 ```{r}
 hist(df_blocks$result$dist, xlab = "Distances", ylab = "Frequency", breaks = "fd",
@@ -133,8 +133,8 @@ hist(df_blocks$result$dist, xlab = "Distances", ylab = "Frequency", breaks = "fd"
 Finally, we can visualise the result based on the information whether block contains matches or not.
 
 ```{r}
-df_for_density <- copy(df_block_melted[block %in% df$block_id])
-df_for_density[, match:= block %in% df[id_count == 2]$block_id]
+df_for_density <- copy(df_block_melted[block %in% RLdata500$block_id])
+df_for_density[, match:= block %in% RLdata500[id_count == 2]$block_id]
 
 plot(density(df_for_density[match==FALSE]$dist), col = "blue", xlim = c(0, 0.8),
      main = "Distribution of distances between\nclusters type (match=red, non-match=blue)")
diff --git a/vignettes/v2-reclin.Rmd b/vignettes/v2-reclin.Rmd
index d61a39f..eebd25b 100644
--- a/vignettes/v2-reclin.Rmd
+++ b/vignettes/v2-reclin.Rmd
@@ -31,9 +31,7 @@ Read required packages
 
 ```{r setup}
 library(blocking)
-library(reclin2)
 library(data.table)
-library(rnndescent)
 ```
 
 # Data
@@ -46,9 +44,9 @@ persons up to the date 31 December 2011. Any years of birth captured as 2012
 are therefore in error. Note that in the fictional Census data set, dates of
 birth between 27 March 2011 and 31 December 2011 are not necessarily in error.
 
-Census: A fictional data set to represent some observations from a
+census: A fictional data set to represent some observations from a
 decennial Census
-CIS: Fictional observations from Customer Information System, which is
+cis: Fictional observations from Customer Information System, which is
 combined administrative data from the tax and benefit systems
 
 In the dataset census all records contain a person_id. For some of the records
@@ -58,12 +56,12 @@ all records in the cis).
 ```
 
 ```{r}
-census <- fread("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/census.csv")
-cis <- fread("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/cis.csv")
+data(census)
+data(cis)
 ```
 
-+ `census` object has `r nrow(census)` rows and `r ncol(census)`,
-+ `cis` object has `r nrow(census)` rows and `r ncol(census)`.
++ `census` object has `r nrow(census)` rows and `r ncol(census)` columns,
++ `cis` object has `r nrow(cis)` rows and `r ncol(cis)` columns.
 
 Census data
 
 ```{r}
 head(census)
 ```
 
 CIS data
 
 ```{r}
 head(cis)
 ```
 
-We need to create new columns that concatanates variables from `pername1` to `enumpc`. In the first step we replace `NA`s with `''`.
+We randomly select `r as.integer(floor(nrow(census) / 2))` records from `census` and `r as.integer(floor(nrow(cis) / 2))` records from `cis`.
 ```{r}
-census[, ":="(dob_day=as.character(dob_day), dob_mon=as.character(dob_mon), dob_year=as.character(dob_year))]
-cis[, ":="(dob_day=as.character(dob_day), dob_mon=as.character(dob_mon),dob_year=as.character(dob_year))]
+set.seed(2024)
+census <- census[sample(nrow(census), floor(nrow(census) / 2)), ]
+set.seed(2024)
+cis <- cis[sample(nrow(cis), floor(nrow(cis) / 2)), ]
+```
 
-census[is.na(dob_day), dob_day := ""]
-census[is.na(dob_mon), dob_mon := ""]
-census[is.na(dob_year), dob_year := ""]
-cis[is.na(dob_day), dob_day := ""]
-cis[is.na(dob_mon), dob_mon := ""]
-cis[is.na(dob_year), dob_year := ""]
+We need to create new columns that concatenate variables from `pername1` to `enumpc`.
 
+```{r}
 census[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)]
 cis[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)]
 ```
@@ -101,8 +98,7 @@ cis[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)]
 
 The goal of this exercise is to link units from the CIS dataset to the CENSUS dataset.
 
 ```{r}
-set.seed(2024)
-result1 <- blocking(x = census$txt, y = cis$txt, verbose = 1, n_threads = 8)
+result1 <- blocking(x = census$txt, y = cis$txt, verbose = 1, seed = 2024)
 ```
 
 Distribution of distances for each pair.
@@ -117,16 +113,16 @@ Example pairs.
 
 ```{r}
 head(result1$result, n= 10)
 ```
 
-Let's take a look at the first pair. Obviously there is a typo in the `pername1`, but all the other variables are the same, so it appears to be a match.
+Let's take a look at the first pair. Obviously there is a typo in `pername1`, but all the other variables are the same, so it appears to be a match.
 
 ```{r}
-cbind(t(census[1, 1:9]), t(cis[8152, 1:9]))
+cbind(t(census[1, c(1:7, 9:10)]), t(cis[2606, 1:9]))
 ```
 
 ## Assessing the quality
 
-For some records, we have information about the correct linkage. We can use this information to evaluate our approach, but note that the information for evaluating quality is described in detail in the other vignette.
+For some records, we have information about the correct linkage. We can use this information to evaluate our approach.
 
 ```{r}
 matches <- merge(x = census[, .(x=1:.N, person_id)],
@@ -139,9 +135,9 @@ head(matches)
 
 So in our example we have `r nrow(matches)` pairs.
 
 ```{r}
-set.seed(2024)
 result2 <- blocking(x = census$txt, y = cis$txt, verbose = 1,
-                    true_blocks = matches[, .(x, y, block)], n_threads = 8)
+                    true_blocks = matches[, .(x, y, block)],
+                    seed = 2024)
 ```
 
 Let's see how our approach handled this problem.
@@ -150,21 +146,21 @@ result2
 ```
 
-It seems that the default parameters of the NND method result in an FNR of `r sprintf("%.2f",result2$metrics["fnr"]*100)`%. We can see if increasing the number of `k` (and thus `max_candidates`) as suggested in the [Nearest Neighbor Descent
+It seems that the default parameters of the NND method result in an FNR of `r sprintf("%.2f",result2$metrics["fnr"]*100)`%. We can see if increasing the `epsilon` parameter as suggested in the [Nearest Neighbor Descent
 ](https://jlmelville.github.io/rnndescent/articles/nearest-neighbor-descent.html)
 vignette will help.
 ```{r}
-set.seed(2024)
 ann_control_pars <- controls_ann()
 ann_control_pars$nnd$epsilon <- 0.2
 
 result3 <- blocking(x = census$txt, y = cis$txt, verbose = 1,
-                    true_blocks = matches[, .(x, y, block)], n_threads = 8,
-                    control_ann = ann_control_pars)
+                    true_blocks = matches[, .(x, y, block)],
+                    control_ann = ann_control_pars,
+                    seed = 2024)
 ```
 
-Changing the `epsilon` search parameter from 0.1 to 0.2 decreased the FDR to `r sprintf("%.1f",result3$metrics["fnr"]*100)`%.
+Changing the `epsilon` search parameter from 0.1 to 0.2 decreased the FNR to `r sprintf("%.2f",result3$metrics["fnr"]*100)`%.
 
 ```{r}
 result3
@@ -174,7 +170,7 @@ Finally, compare the NND and HNSW algorithm for this example.
 
 ```{r}
 result4 <- blocking(x = census$txt, y = cis$txt, verbose = 1,
-                    true_blocks = matches[, .(x, y, block)], n_threads = 8,
+                    true_blocks = matches[, .(x, y, block)],
                     ann = "hnsw", seed = 2024)
 ```
diff --git a/vignettes/v3-evaluation.Rmd b/vignettes/v3-evaluation.Rmd
deleted file mode 100644
index de1597e..0000000
--- a/vignettes/v3-evaluation.Rmd
+++ /dev/null
@@ -1,33 +0,0 @@
----
-title: "Evaluation of blocking procedures"
-author: "Maciej Beręsewicz"
-output:
-  html_vignette:
-    df_print: kable
-    toc: true
-    number_sections: true
-    fig_width: 6
-    fig_height: 4
-vignette: >
-  %\VignetteIndexEntry{Evaluation of blocking procedures}
-  %\VignetteEngine{knitr::rmarkdown}
-  %\VignetteEncoding{UTF-8}
----
-```{r, include = FALSE}
-knitr::opts_chunk$set(
-  collapse = TRUE,
-  comment = "#>"
-)
-```
-
-# Setup
-
-```{r setup}
-library(blocking)
-```
-
-# Methodology
-
-The package implements the following measures of assessing the quality of blocking
-
-#
diff --git a/vignettes/v4-integration.Rmd b/vignettes/v3-integration.Rmd
similarity index 78%
rename from vignettes/v4-integration.Rmd
rename to vignettes/v3-integration.Rmd
index 78d4f3c..9761aff 100644
--- a/vignettes/v4-integration.Rmd
+++ b/vignettes/v3-integration.Rmd
@@ -36,16 +36,8 @@ library(reclin2)
 
 In the example we will use the same dataset as in the *Blocking records for record linkage* vignette.
 
 ```{r}
-census <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/census.csv")
-cis <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/cis.csv")
-setDT(census)
-setDT(cis)
-census[is.na(dob_day), dob_day := ""]
-census[is.na(dob_mon), dob_mon := ""]
-census[is.na(dob_year), dob_year := ""]
-cis[is.na(dob_day), dob_day := ""]
-cis[is.na(dob_mon), dob_mon := ""]
-cis[is.na(dob_year), dob_year := ""]
+data(census)
+data(cis)
 census[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)]
 cis[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)]
 census[, x:=1:.N]
@@ -54,13 +46,14 @@ cis[, y:=1:.N]
 
 # Integration with the `reclin2` package
 
-The package contains function `pair_ann` which aims at integration with `reclin2` package. This function works as follows
+The package contains the function `pair_ann`, which aims at integration with the `reclin2` package. This function works as follows.
 
 ```{r}
 pair_ann(x = census[1:1000],
          y = cis[1:1000],
         on = "txt",
-        deduplication = FALSE)
+        deduplication = FALSE) |>
+  head()
 ```
 
 Which provides you information on the total number of pairs. This can be further included in the pipeline of the `reclin2` package.
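A minimal sketch of what such a continued pipeline could look like is given below. It follows the usual `reclin2` workflow (`compare_pairs()`, `problink_em()`, `select_threshold()`, `link()`); the compared columns, the Jaro-Winkler comparator and the weight threshold of 8 are illustrative placeholders, not part of this package, and exact behaviour may vary across `reclin2` versions.

```r
library(blocking)
library(reclin2)
library(data.table)

## same preparation as in the v3-integration vignette
data(census)
data(cis)
census[, txt := paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)]
cis[, txt := paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)]
census[, x := 1:.N]
cis[, y := 1:.N]

## candidate pairs restricted to ANN-based blocks
pairs <- pair_ann(x = census[1:1000], y = cis[1:1000], on = "txt",
                  deduplication = FALSE)

## compare selected fields within the candidate pairs
pairs <- compare_pairs(pairs, on = c("pername1", "pername2", "sex"),
                       default_comparator = cmp_jarowinkler(0.9))

## Fellegi-Sunter weights estimated with the EM algorithm
m <- problink_em(~ pername1 + pername2 + sex, data = pairs)
pairs <- predict(m, pairs = pairs, add = TRUE)

## keep pairs above a (placeholder) weight threshold and link the datasets
pairs <- select_threshold(pairs, "threshold", score = "weights", threshold = 8)
head(link(pairs, selection = "threshold"))
```
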
diff --git a/vignettes/v5-bigdata.Rmd b/vignettes/v5-bigdata.Rmd
deleted file mode 100644
index d27b88a..0000000
--- a/vignettes/v5-bigdata.Rmd
+++ /dev/null
@@ -1,25 +0,0 @@
----
-title: "Dealing with large datasets"
-author: "Maciej Beręsewicz"
-output:
-  html_vignette:
-    df_print: kable
-    toc: true
-    number_sections: true
-    fig_width: 6
-    fig_height: 4
-vignette: >
-  %\VignetteIndexEntry{Dealing with large datasets}
-  %\VignetteEngine{knitr::rmarkdown}
-  %\VignetteEncoding{UTF-8}
----
-```{r, include = FALSE}
-knitr::opts_chunk$set(
-  collapse = TRUE,
-  comment = "#>"
-)
-```
-
-```{r setup}
-library(blocking)
-```
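The evaluation helpers documented in `man/eval_dedup.Rd`, `man/eval_reclin.Rd`, `man/get_confusion.Rd` and `man/get_metrics.Rd` above compose in a straightforward way: `eval_reclin()`/`eval_dedup()` reduce predicted and true links to raw counts, which `get_confusion()` and `get_metrics()` then summarise. A minimal sketch is given below; the element names `TP`, `FP`, `FN` and `TN` follow the documented return value, the counts themselves are made-up placeholders, and it assumes the helpers are exported.

```r
library(blocking)

## eval_reclin()/eval_dedup() turn predicted and true links into raw counts;
## here made-up counts stand in for their output
counts <- list(TP = 90, FP = 10, FN = 5, TN = 895)

## 2x2 confusion matrix from the raw counts
get_confusion(counts$TP, counts$FP, counts$FN, counts$TN)

## standard evaluation metrics (e.g. the FNR reported in the v2-reclin vignette)
get_metrics(counts$TP, counts$FP, counts$FN, counts$TN)
```
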