diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths index 735ecf5..b438093 100644 --- a/.Rproj.user/shared/notebooks/paths +++ b/.Rproj.user/shared/notebooks/paths @@ -1,20 +1,17 @@ -C:/Users/48607/Desktop/Grant/blocking_30_05/DESCRIPTION="BADE5BA2" -C:/Users/48607/Desktop/Grant/blocking_30_05/R/blocking.R="44766414" -C:/Users/48607/Desktop/Grant/blocking_30_05/R/controls.R="A7B8B8D2" -C:/Users/48607/Desktop/Grant/blocking_30_05/R/est_block_error.R="1DB588BC" -C:/Users/48607/Desktop/Grant/blocking_30_05/R/eval.R="30BB8D2A" -C:/Users/48607/Desktop/Grant/blocking_30_05/R/method_annoy.R="38099BF2" -C:/Users/48607/Desktop/Grant/blocking_30_05/R/method_hnsw.R="ECE2EFAA" -C:/Users/48607/Desktop/Grant/blocking_30_05/R/method_mlpack.R="7020DB20" -C:/Users/48607/Desktop/Grant/blocking_30_05/R/method_nnd.R="1937EFF4" -C:/Users/48607/Desktop/Grant/blocking_30_05/R/methods.R="2B7AFCBF" -C:/Users/48607/Desktop/Grant/blocking_30_05/R/sentence_to_vector.R="71CCE4B2" -C:/Users/48607/Desktop/Grant/blocking_30_05/README.md="3BCBDFED" -C:/Users/48607/Desktop/Grant/blocking_30_05/inst/WORDLIST="5610708B" -C:/Users/48607/Desktop/Grant/blocking_30_05/inst/tinytest/test_annoy.R="A935531D" -C:/Users/48607/Desktop/Grant/blocking_30_05/inst/tinytest/test_hnsw.R="13EE8820" -C:/Users/48607/Desktop/Grant/blocking_30_05/inst/tinytest/test_true_blocks.R="7D5E11CB" -C:/Users/48607/Desktop/Grant/blocking_30_05/tests/tinytest.R="2AFE54EE" -C:/Users/48607/Desktop/Grant/blocking_30_05/vignettes/v2-reclin.Rmd="7E043D0D" -C:/Users/48607/Desktop/Grant/blocking_30_05/vignettes/v3-integration.Rmd="EE91B56E" -C:/Users/48607/Desktop/Grant/blocking_test_3/vignettes/v2-reclin.Rmd="F7A0D8BC" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/.github/workflows/pkgdown.yaml="48AB106D" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/.github/workflows/test-coverage.yaml="E824CBD4" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/.gitignore="DF69E985" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/DESCRIPTION="CB8B0A33" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/NAMESPACE="EBD0CE51" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/blocking.R="25D2A128" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/controls.R="9BA1FC11" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/data.R="2F3A9433" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/eval.R="AACD4DF9" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/README.Rmd="610BE353" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/man/cis.Rd="485EF5A5" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/man/est_block_error.Rd="73E6957E" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/man/sentence_to_vector.Rd="A6CD2A86" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/tests/tinytest.R="AADD0AFB" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v1-deduplication.Rmd="DCCF1C6F" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v2-reclin.Rmd="7253B478" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v3-integration.Rmd="A9B0ECDA" diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 27d4528..960234c 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -15,7 +15,7 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v2 with: @@ -44,7 +44,7 @@ jobs: - name: Upload test results if: failure() - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: coverage-test-failures path: ${{ runner.temp }}/package diff --git a/DESCRIPTION b/DESCRIPTION index c8a44d2..63de70d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: blocking Type: Package Title: Deduplication / Entity Resolution with Record Blocking -Version: 0.1.0 +Version: 1.0.0 Authors@R: c(person(given = "Maciej", family = "Beręsewicz", diff --git a/R/eval.R b/R/eval.R index 9e4ed6f..46ccdb6 100644 --- a/R/eval.R +++ b/R/eval.R @@ -14,6 +14,7 @@ #' Returns a list containing TP, FP, FN and TN. #' #' @keywords internal +#' @noRd eval_reclin <- function(pred_df, true_df) { pred_x_map <- unique(pred_df[, c("x", "block"), with = FALSE]) @@ -82,6 +83,7 @@ eval_reclin <- function(pred_df, true_df) { #' Returns a list containing TP, FP, FN and TN. #' #' @keywords internal +#' @noRd eval_dedup <- function(pred_df, true_df) { pred_lbl <- melt(pred_df, @@ -143,6 +145,7 @@ eval_dedup <- function(pred_df, true_df) { #' Returns a list containing evaluation metrics. #' #' @keywords internal +#' @noRd get_metrics <- function(TP, FP, FN, TN) { recall <- if (TP + FN != 0) TP / (TP + FN) else 0 @@ -176,6 +179,7 @@ get_metrics <- function(TP, FP, FN, TN) { #' Returns a confusion matrix. #' #' @keywords internal +#' @noRd get_confusion <- function(TP, FP, FN, TN) { cm <- matrix(c(TP, FP, FN, TN), nrow = 2) diff --git a/man/eval_dedup.Rd b/man/eval_dedup.Rd deleted file mode 100644 index d36e358..0000000 --- a/man/eval_dedup.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/eval.R -\name{eval_dedup} -\alias{eval_dedup} -\title{Evaluation for deduplication} -\usage{ -eval_dedup(pred_df, true_df) -} -\arguments{ -\item{pred_df}{Output from the blocking algorithm.} - -\item{true_df}{Ground-truth links (may be subset).} -} -\value{ -Returns a list containing TP, FP, FN and TN. -} -\description{ -Function calculates TP, FP, FN and TN for deduplication. -} diff --git a/man/eval_reclin.Rd b/man/eval_reclin.Rd deleted file mode 100644 index 178474e..0000000 --- a/man/eval_reclin.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/eval.R -\name{eval_reclin} -\alias{eval_reclin} -\title{Evaluation for record linkage} -\usage{ -eval_reclin(pred_df, true_df) -} -\arguments{ -\item{pred_df}{Output from the blocking algorithm.} - -\item{true_df}{Ground-truth links (may be subset).} -} -\value{ -Returns a list containing TP, FP, FN and TN. -} -\description{ -Function calculates TP, FP, FN and TN for record linkage. -} diff --git a/man/get_confusion.Rd b/man/get_confusion.Rd deleted file mode 100644 index 64de796..0000000 --- a/man/get_confusion.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/eval.R -\name{get_confusion} -\alias{get_confusion} -\title{Confusion matrix} -\usage{ -get_confusion(TP, FP, FN, TN) -} -\arguments{ -\item{TP}{TP} - -\item{FP}{FP} - -\item{FN}{FN} - -\item{TN}{TN} -} -\value{ -Returns a confusion matrix. -} -\description{ -Function creates a confusion matrix from raw counts. -} diff --git a/man/get_metrics.Rd b/man/get_metrics.Rd deleted file mode 100644 index 19ce854..0000000 --- a/man/get_metrics.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/eval.R -\name{get_metrics} -\alias{get_metrics} -\title{Metrics for evaluating dedupliaction and record linkage} -\usage{ -get_metrics(TP, FP, FN, TN) -} -\arguments{ -\item{TP}{TP} - -\item{FP}{FP} - -\item{FN}{FN} - -\item{TN}{TN} -} -\value{ -Returns a list containing evaluation metrics. -} -\description{ -Function calculates standard evaluation metrics. -} diff --git a/man/method_annoy.Rd b/man/method_annoy.Rd index 6aa62a1..c9765dd 100644 --- a/man/method_annoy.Rd +++ b/man/method_annoy.Rd @@ -29,3 +29,4 @@ See details of the \link[RcppAnnoy]{RcppAnnoy} package. \author{ Maciej Beręsewicz } +\keyword{internal} diff --git a/man/method_hnsw.Rd b/man/method_hnsw.Rd index baa0d47..b5773b8 100644 --- a/man/method_hnsw.Rd +++ b/man/method_hnsw.Rd @@ -29,3 +29,4 @@ See details of \link[RcppHNSW]{hnsw_build} and \link[RcppHNSW]{hnsw_search}. \author{ Maciej Beręsewicz } +\keyword{internal} diff --git a/man/method_mlpack.Rd b/man/method_mlpack.Rd index faeed7a..1d68795 100644 --- a/man/method_mlpack.Rd +++ b/man/method_mlpack.Rd @@ -29,3 +29,4 @@ See details of \link[mlpack]{lsh} and \link[mlpack]{knn}. \author{ Maciej Beręsewicz } +\keyword{internal} diff --git a/man/method_nnd.Rd b/man/method_nnd.Rd index 5eddf17..74b0b6e 100644 --- a/man/method_nnd.Rd +++ b/man/method_nnd.Rd @@ -29,3 +29,4 @@ See details of \link[rnndescent]{rnnd_build} and \link[rnndescent]{rnnd_query}. \author{ Maciej Beręsewicz } +\keyword{internal} diff --git a/man/sentence_to_vector.Rd b/man/sentence_to_vector.Rd index 997b3d9..a7eb1e0 100644 --- a/man/sentence_to_vector.Rd +++ b/man/sentence_to_vector.Rd @@ -14,3 +14,4 @@ sentence_to_vector(sentences, model) \description{ Function creates a matrix with word embeddings using a given model. } +\keyword{internal}