Skip to content

Commit

Permalink
No issue: Fix file leaks in tests
Browse files Browse the repository at this point in the history
  • Loading branch information
reckart committed Aug 3, 2024
1 parent 5b8cacf commit 55942bb
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 75 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
* @see org.dkpro.core.api.embeddings.binary.BinaryVectorizer
* @see org.dkpro.core.api.embeddings.text.TextFormatVectorizer
*/
public interface Vectorizer
public interface Vectorizer extends AutoCloseable
{
/**
* Get the vector for a token. If the token is unknown, implementing classes should return the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,13 @@ public class BinaryVectorizer
private final int maxVectorsPerPartition;
private Locale locale;
private float[] unknownVector;
private RandomAccessFile file;

private BinaryVectorizer(Header aHeader, RandomAccessFile file, String[] aWords,
private BinaryVectorizer(Header aHeader, RandomAccessFile aFile, String[] aWords,
long vectorStartOffset, float[] aUnk)
throws IOException
{
file = aFile;
header = aHeader;
words = aWords;

Expand All @@ -69,7 +71,7 @@ private BinaryVectorizer(Header aHeader, RandomAccessFile file, String[] aWords,
}

parts = new FloatBuffer[neededPartitions];
FileChannel channel = file.getChannel();
FileChannel channel = aFile.getChannel();
for (int i = 0; i < neededPartitions; i++) {
long start = vectorStartOffset + ((long) i * maxPartitionSizeBytes);
long length = maxPartitionSizeBytes;
Expand All @@ -80,6 +82,13 @@ private BinaryVectorizer(Header aHeader, RandomAccessFile file, String[] aWords,
parts[i] = channel.map(FileChannel.MapMode.READ_ONLY, start, length).asFloatBuffer();
}
}

/**
 * Closes the {@link RandomAccessFile} that was handed to the constructor, if there is one.
 * <p>
 * NOTE(review): the vector partitions are obtained via {@code FileChannel.map(...)} on this
 * file's channel. The JDK documents that a mapping stays valid after its channel has been
 * closed, so this call presumably does not release the mapped regions themselves - confirm
 * whether callers rely on that.
 *
 * @throws IOException
 *             if closing the underlying file fails.
 */
@Override
public void close()
    throws IOException
{
    // Nothing to release when no file handle is held.
    if (file == null) {
        return;
    }

    file.close();
}

/**
* Load a binary embeddings file and return a new {@link BinaryVectorizer} object.
Expand All @@ -91,7 +100,7 @@ private BinaryVectorizer(Header aHeader, RandomAccessFile file, String[] aWords,
public static BinaryVectorizer load(File f)
throws IOException
{
RandomAccessFile file = new RandomAccessFile(f, "rw");
var file = new RandomAccessFile(f, "rw");

// Load header
Header header = Header.read(file);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,21 +53,21 @@ public void setUp()
public void testConvertWordVectorsToBinary()
throws Exception
{
File binaryTarget = writeBinaryFile(vectors);

BinaryVectorizer vec = BinaryVectorizer.load(binaryTarget);

assertThat(vec.contains("t1")).isTrue();
assertThat(vec.contains("t2")).isTrue();
assertThat(vec.dimensions()).isEqualTo(3);
assertThat(vec.size()).isEqualTo(2);
assertThat(vec.isCaseless()).isTrue();

for (String word : vectors.keySet()) {
float[] orig = vectors.get(word);
float[] conv = vec.vectorize(word);

assertThat(conv).containsExactly(orig);
var binaryTarget = writeBinaryFile(vectors);
try (var vec = BinaryVectorizer.load(binaryTarget)) {
assertThat(vec.contains("t1")).isTrue();
assertThat(vec.contains("t2")).isTrue();
assertThat(vec.dimensions()).isEqualTo(3);
assertThat(vec.size()).isEqualTo(2);
assertThat(vec.isCaseless()).isTrue();

for (var word : vectors.keySet()) {
var orig = vectors.get(word);
var conv = vec.vectorize(word);

assertThat(conv).containsExactly(orig);
}
}
}

Expand All @@ -76,41 +76,42 @@ public void testConvertWordVectorsToBinaryCaseSensitive()
throws Exception
{
vectors.put("T1", new float[] { 0.1f, 0.2f, 0.3f });
File binaryTarget = writeBinaryFile(vectors);

BinaryVectorizer vec = BinaryVectorizer.load(binaryTarget);

assertTrue(vec.contains("t1"));
assertTrue(vec.contains("t2"));
assertTrue(vec.contains("T1"));
assertFalse(vec.contains("T2"));
assertEquals(3, vec.dimensions());
assertEquals(3, vec.size());
assertFalse(vec.isCaseless());

for (String word : vectors.keySet()) {
float[] orig = vectors.get(word);
float[] conv = vec.vectorize(word);

assertTrue(Arrays.equals(orig, conv), "Vectors differ for " + word);
var binaryTarget = writeBinaryFile(vectors);

try (var vec = BinaryVectorizer.load(binaryTarget)) {
assertTrue(vec.contains("t1"));
assertTrue(vec.contains("t2"));
assertTrue(vec.contains("T1"));
assertFalse(vec.contains("T2"));
assertEquals(3, vec.dimensions());
assertEquals(3, vec.size());
assertFalse(vec.isCaseless());

for (var word : vectors.keySet()) {
var orig = vectors.get(word);
var conv = vec.vectorize(word);

assertTrue(Arrays.equals(orig, conv), "Vectors differ for " + word);
}
}
}

@Test
public void testRandomVector()
throws IOException
{
File binaryTarget = writeBinaryFile(vectors);
var binaryTarget = writeBinaryFile(vectors);

BinaryVectorizer vec = BinaryVectorizer.load(binaryTarget);
float[] randVector = VectorizerUtils.randomVector(3);

float[] unk1 = vec.vectorize("unk1");
float[] unk2 = vec.vectorize("unk2");
assertTrue(Arrays.equals(randVector, unk1));
assertTrue(Arrays.equals(randVector, unk2));
assertTrue(
Arrays.equals(unk1, unk2), "Vectors or unknown words should always be the same.");
try (var vec = BinaryVectorizer.load(binaryTarget)) {
var randVector = VectorizerUtils.randomVector(3);

var unk1 = vec.vectorize("unk1");
var unk2 = vec.vectorize("unk2");
assertTrue(Arrays.equals(randVector, unk1));
assertTrue(Arrays.equals(randVector, unk2));
assertTrue(
Arrays.equals(unk1, unk2), "Vectors or unknown words should always be the same.");
}
}

/**
Expand All @@ -122,7 +123,7 @@ public void testRandomVector()
private File writeBinaryFile(Map<String, float[]> vectors)
throws IOException
{
File binaryTarget = new File(tempDir, "binaryTarget");
var binaryTarget = new File(tempDir, "binaryTarget");
convertWordVectorsToBinary(vectors, binaryTarget);
return binaryTarget;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Map;

import org.dkpro.core.api.embeddings.binary.BinaryVectorizer;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
Expand All @@ -35,13 +33,12 @@ public class TextFormatVectorizerUtilsTest
@Test
public void testReadEmbeddingFileTxt() throws IOException, URISyntaxException
{
File modelFile = new File("src/test/resources/dummy.vec");
int expectedSize = 699;
int expectedDimensions = 50;
boolean hasHeader = false;
var modelFile = new File("src/test/resources/dummy.vec");
var expectedSize = 699;
var expectedDimensions = 50;
var hasHeader = false;

Map<String, float[]> embeddings = TextFormatVectorizerUtils.readEmbeddingFileTxt(modelFile,
hasHeader);
var embeddings = TextFormatVectorizerUtils.readEmbeddingFileTxt(modelFile, hasHeader);

assertEquals(expectedSize, embeddings.size());
embeddings.values().forEach(vector -> assertEquals(expectedDimensions, vector.length));
Expand All @@ -50,13 +47,12 @@ public void testReadEmbeddingFileTxt() throws IOException, URISyntaxException
@Test
public void testReadEmbeddingFileTxtWithHeader() throws IOException, URISyntaxException
{
File modelFile = new File("src/test/resources/dummy_with_header.vec");
int expectedSize = 699;
int expectedDimensions = 50;
boolean hasHeader = true;
var modelFile = new File("src/test/resources/dummy_with_header.vec");
var expectedSize = 699;
var expectedDimensions = 50;
var hasHeader = true;

Map<String, float[]> embeddings = TextFormatVectorizerUtils.readEmbeddingFileTxt(modelFile,
hasHeader);
var embeddings = TextFormatVectorizerUtils.readEmbeddingFileTxt(modelFile, hasHeader);

assertEquals(expectedSize, embeddings.size());
embeddings.values().forEach(vector -> assertEquals(expectedDimensions, vector.length));
Expand All @@ -65,13 +61,12 @@ public void testReadEmbeddingFileTxtWithHeader() throws IOException, URISyntaxEx
@Test
public void testReadEmbeddingFileTxtCompressed() throws IOException, URISyntaxException
{
File modelFile = new File("src/test/resources/embeddings.gz");
int expectedSize = 699;
int expectedDimensions = 50;
boolean hasHeader = false;
var modelFile = new File("src/test/resources/embeddings.gz");
var expectedSize = 699;
var expectedDimensions = 50;
var hasHeader = false;

Map<String, float[]> embeddings = TextFormatVectorizerUtils.readEmbeddingFileTxt(modelFile,
hasHeader);
var embeddings = TextFormatVectorizerUtils.readEmbeddingFileTxt(modelFile, hasHeader);

assertEquals(expectedSize, embeddings.size());
embeddings.values().forEach(vector -> assertEquals(expectedDimensions, vector.length));
Expand All @@ -80,17 +75,17 @@ public void testReadEmbeddingFileTxtCompressed() throws IOException, URISyntaxEx
@Test
public void testConvertMalletEmbeddingsToBinary(@TempDir File tempDir) throws IOException
{
File modelFile = new File("src/test/resources/dummy.vec");
File targetFile = new File(tempDir, "binary");
var modelFile = new File("src/test/resources/dummy.vec");
var targetFile = new File(tempDir, "binary");

Map<String, float[]> embeddings = TextFormatVectorizerUtils.readEmbeddingFileTxt(modelFile,
false);
var embeddings = TextFormatVectorizerUtils.readEmbeddingFileTxt(modelFile, false);
TextFormatVectorizerUtils.convertMalletEmbeddingsToBinary(modelFile, targetFile);
BinaryVectorizer vec = BinaryVectorizer.load(targetFile);

for (String token : embeddings.keySet()) {
assertTrue(Arrays.equals(embeddings.get(token), vec.vectorize(token)),
"Arrays to not match for token " + token);
try (var vec = BinaryVectorizer.load(targetFile)) {
for (var token : embeddings.keySet()) {
assertTrue(Arrays.equals(embeddings.get(token), vec.vectorize(token)),
"Arrays to not match for token " + token);
}
}
}

Expand Down

0 comments on commit 55942bb

Please sign in to comment.