Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: blocking
Type: Package
Title: Various Blocking Methods for Entity Resolution
Version: 1.0.9001
Version: 1.0.1
Authors@R:
c(person(given = "Maciej",
family = "Beręsewicz",
Expand Down Expand Up @@ -29,7 +29,6 @@ Imports:
rnndescent,
igraph,
data.table,
RcppAlgos,
methods,
readr,
utils,
Expand Down
6 changes: 1 addition & 5 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ import(data.table)
importFrom(Matrix,colSums)
importFrom(Matrix,rowSums)
importFrom(Matrix,sparseMatrix)
importFrom(RcppAlgos,comboGeneral)
importFrom(RcppAnnoy,AnnoyAngular)
importFrom(RcppAnnoy,AnnoyEuclidean)
importFrom(RcppAnnoy,AnnoyHamming)
Expand All @@ -27,9 +26,7 @@ importFrom(RcppHNSW,HnswL2)
importFrom(RcppHNSW,hnsw_build)
importFrom(RcppHNSW,hnsw_search)
importFrom(data.table,data.table)
importFrom(igraph,compare)
importFrom(igraph,components)
importFrom(igraph,graph_from_adjacency_matrix)
importFrom(igraph,graph_from_data_frame)
importFrom(igraph,make_clusters)
importFrom(methods,new)
Expand All @@ -48,7 +45,6 @@ importFrom(text2vec,itoken)
importFrom(text2vec,itoken_parallel)
importFrom(text2vec,space_tokenizer)
importFrom(text2vec,vocab_vectorizer)
importFrom(tokenizers,tokenize_character_shingles)
importFrom(utils,download.file)
importFrom(utils,setTxtProgressBar)
importFrom(utils,txtProgressBar)
importFrom(utils,unzip)
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# development

# version 1.0.1

+ Fixed CRAN errors
+ [Awesome Official Statistics](http://www.awesomeofficialstatistics.org) badge added
+ Removed unnecessary dependency on the `RcppAlgos` package

# version 1.0.0

+ Added support for word embeddings.
Expand Down
6 changes: 2 additions & 4 deletions R/blocking.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,10 @@
#' @importFrom text2vec create_vocabulary
#' @importFrom text2vec vocab_vectorizer
#' @importFrom text2vec create_dtm
#' @importFrom igraph graph_from_adjacency_matrix
#' @importFrom tokenizers tokenize_character_shingles
#' @importFrom igraph components
#' @importFrom igraph graph_from_data_frame
#' @importFrom igraph make_clusters
#' @importFrom igraph compare
#' @importFrom RcppAlgos comboGeneral
#' @importFrom stats dist
#' @importFrom readr read_table
#' @importFrom utils download.file
Expand All @@ -36,7 +34,7 @@
#' @param distance distance metric (default \code{cosine}, more options are possible see details),
#' @param ann_write writing an index to file. Two files will be created: 1) an index, and 2) a text file with column names,
#' @param ann_colnames file with column names if \code{x} or \code{y} are indices saved on the disk (currently not supported),
#' @param true_blocks matrix with true blocks to calculate evaluation metrics (standard metrics based on confusion matrix as well as all metrics from \link[igraph]{compare} are returned).
#' @param true_blocks matrix with true blocks to calculate evaluation metrics (standard metrics based on confusion matrix are returned).
#' @param verbose whether a log should be provided (0 = none, 1 = main, 2 = verbose output of the ANN algorithm is used),
#' @param graph whether a graph should be returned (default FALSE),
#' @param seed seed for the algorithms (for reproducibility),
Expand Down
2 changes: 0 additions & 2 deletions R/method_hnsw.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
#' @importFrom RcppHNSW HnswIp
#' @importFrom data.table data.table
#' @importFrom methods new
#' @importFrom utils setTxtProgressBar
#' @importFrom utils txtProgressBar
#'
#' @title An internal function to use HNSW algorithm via the RcppHNSW package.
#' @author Maciej Beręsewicz
Expand Down
1 change: 1 addition & 0 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ output: github_document
[![CRAN status](https://www.r-pkg.org/badges/version/blocking)](https://CRAN.R-project.org/package=blocking)
[![CRAN downloads](https://cranlogs.r-pkg.org/badges/grand-total/blocking)](https://cran.r-project.org/package=blocking)
[![CRAN downloads](https://cranlogs.r-pkg.org/badges/blocking)](https://cran.r-project.org/package=blocking)
[![Mentioned in Awesome Official Statistics](https://awesome.re/mentioned-badge.svg)](http://www.awesomeofficialstatistics.org)


<!-- badges: end -->
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ status](https://www.r-pkg.org/badges/version/blocking)](https://CRAN.R-project.o
downloads](https://cranlogs.r-pkg.org/badges/grand-total/blocking)](https://cran.r-project.org/package=blocking)
[![CRAN
downloads](https://cranlogs.r-pkg.org/badges/blocking)](https://cran.r-project.org/package=blocking)
[![Mentioned in Awesome Official
Statistics](https://awesome.re/mentioned-badge.svg)](http://www.awesomeofficialstatistics.org)

<!-- badges: end -->

Expand Down
28 changes: 0 additions & 28 deletions inst/tinytest/index-colnames.txt

This file was deleted.

Binary file removed inst/tinytest/index.annoy
Binary file not shown.
Binary file removed inst/tinytest/index.hnsw
Binary file not shown.
70 changes: 35 additions & 35 deletions inst/tinytest/test_annoy.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,43 +72,43 @@ expect_equal(


## file saving
expect_error(
blocking(x = mat_y,
ann = "annoy",
distance = "euclidean",
ann_write = "./plik")
)


expect_true({
tmp_dir <- tempdir()
blocking(x = mat_y,
ann = "annoy",
distance = "euclidean",
ann_write = file.path(tmp_dir))
file.exists(file.path(tmp_dir, "index.annoy")) &
file.exists(file.path(tmp_dir, "index-colnames.txt"))
})

expect_true({
tmp_dir <- tempdir()
sub_dir <- file.path(tmp_dir, "sub")
dir.create(sub_dir, showWarnings = FALSE)
blocking(x = mat_y,
ann = "annoy",
distance = "euclidean",
ann_write = file.path(sub_dir))
file.exists(file.path(sub_dir, "index.annoy")) &
file.exists(file.path(sub_dir, "index-colnames.txt"))
})
# expect_error(
# blocking(x = mat_y,
# ann = "annoy",
# distance = "euclidean",
# ann_write = "./plik")
# )


# expect_true({
# tmp_dir <- tempdir()
# blocking(x = mat_y,
# ann = "annoy",
# distance = "euclidean",
# ann_write = file.path(tmp_dir))
# file.exists(file.path(tmp_dir, "index.annoy")) &
# file.exists(file.path(tmp_dir, "index-colnames.txt"))
# })

# expect_true({
# tmp_dir <- tempdir()
# sub_dir <- file.path(tmp_dir, "sub")
# dir.create(sub_dir, showWarnings = FALSE)
# blocking(x = mat_y,
# ann = "annoy",
# distance = "euclidean",
# ann_write = file.path(sub_dir))
# file.exists(file.path(sub_dir, "index.annoy")) &
# file.exists(file.path(sub_dir, "index-colnames.txt"))
# })

## testing reading saved index
expect_equal({
ncols <- length(readLines(file.path(tmp_dir, "index-colnames.txt")))
ann_annoy <- methods::new(RcppAnnoy::AnnoyEuclidean, ncols)
ann_annoy$load(file.path(tmp_dir, "index.annoy"))
ann_annoy$getNItems()
}, 8)
# expect_equal({
# ncols <- length(readLines(file.path(tmp_dir, "index-colnames.txt")))
# ann_annoy <- methods::new(RcppAnnoy::AnnoyEuclidean, ncols)
# ann_annoy$load(file.path(tmp_dir, "index.annoy"))
# ann_annoy$getNItems()
# }, 8)

## test verbose
expect_stdout(
Expand Down
68 changes: 34 additions & 34 deletions inst/tinytest/test_hnsw.R
Original file line number Diff line number Diff line change
Expand Up @@ -71,34 +71,34 @@ expect_equal(

## testing saving

expect_error({
blocking(x = mat_y,
ann = "hnsw",
ann_write = "./plik")
})

expect_true({
blocking(x = mat_y,
ann = "hnsw",
ann_write = ".")
file.exists("./index.hnsw") &
file.exists("./index-colnames.txt")
})

expect_true({
blocking(x = mat_y,
ann = "hnsw",
ann_write = "./")
file.exists("./index.hnsw") &
file.exists("./index-colnames.txt")
})


expect_equal({
ncols <- length(readLines("./index-colnames.txt"))
ann_hnsw <- methods::new(RcppHNSW::HnswCosine, ncols, "./index.hnsw")
ann_hnsw$size()
}, 8)
# expect_error({
# blocking(x = mat_y,
# ann = "hnsw",
# ann_write = "./plik")
# })
#
# expect_true({
# blocking(x = mat_y,
# ann = "hnsw",
# ann_write = ".")
# file.exists("./index.hnsw") &
# file.exists("./index-colnames.txt")
# })
#
# expect_true({
# blocking(x = mat_y,
# ann = "hnsw",
# ann_write = "./")
# file.exists("./index.hnsw") &
# file.exists("./index-colnames.txt")
# })
#
#
# expect_equal({
# ncols <- length(readLines("./index-colnames.txt"))
# ann_hnsw <- methods::new(RcppHNSW::HnswCosine, ncols, "./index.hnsw")
# ann_hnsw$size()
# }, 8)


## check verbose
Expand All @@ -108,12 +108,12 @@ expect_stdout(
verbose = 2)
)

expect_stdout(
blocking(x = mat_y,
ann = "hnsw",
verbose = 2,
ann_write = ".")
)
# expect_stdout(
# blocking(x = mat_y,
# ann = "hnsw",
# verbose = 2,
# ann_write = ".")
# )


### checks sparse data
Expand Down
2 changes: 1 addition & 1 deletion man/blocking.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.