Skip to content

Commit

Permalink
Loading the list of files from a file now tries a few charsets before giving up
Browse files Browse the repository at this point in the history
  • Loading branch information
torakiki committed Jul 27, 2022
1 parent fac3163 commit 577c73f
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 7 deletions.
28 changes: 22 additions & 6 deletions pdfsam-service/src/main/java/org/pdfsam/pdf/PdfListParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,29 @@

import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.function.Function;
import java.util.stream.Stream;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* @author Andrea Vacondio
*/
class PdfListParser implements Function<Path, List<File>> {
private static final Logger LOG = LoggerFactory.getLogger(PdfListParser.class);

/**
* Given a Path to a text/csv file, parses it and returns the list of PDF files contained in the parsed file
Expand All @@ -50,13 +59,20 @@ public List<File> apply(Path listFile) {
if (isNull(listFile)) {
return Collections.emptyList();
}
try {
return Files.lines(listFile).filter(StringUtils::isNoneBlank).map(PdfListParser::parseLine)
.map(String::trim).filter(s -> s.toUpperCase().endsWith("PDF")).map(Paths::get)
.filter(Files::exists).filter(not(Files::isDirectory)).map(Path::toFile).collect(toList());
} catch (IOException e) {
throw new RuntimeException(e);
List<Charset> charsets = List.of(StandardCharsets.UTF_8, StandardCharsets.ISO_8859_1, Charset.defaultCharset());
for (Charset charset : charsets) {
try {
return Files.lines(listFile, charset).filter(StringUtils::isNoneBlank).map(PdfListParser::parseLine)
.map(String::trim).filter(s -> s.toUpperCase().endsWith("PDF")).map(Paths::get)
.filter(Files::exists).filter(not(Files::isDirectory)).map(Path::toFile).collect(toList());
} catch (UncheckedIOException e) {
LOG.warn("Unable to read lines from " + listFile + " using charset " + charset, e);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
throw new RuntimeException("Unable to read lines from " + listFile);

}

private static String parseLine(String line) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
Expand Down Expand Up @@ -67,6 +68,20 @@ public void apply() throws IOException {
assertThat(parsed, hasItems(file1, file2, file3));
}

@Test
public void applyNonUTFCharset() throws IOException {
    // Both file names contain accented characters and the list is written as ISO-8859-1, not UTF-8.
    File accentedLower = tmp.newFile("file1è.pdf");
    File accentedUpper = tmp.newFile("file2à.PDF");
    Path listFile = tmp.newFile().toPath();
    // The second entry keeps a trailing comma after the path.
    List<String> entries = List.of(accentedLower.getAbsolutePath(), accentedUpper.getAbsolutePath() + ",");
    Files.write(listFile, entries, StandardCharsets.ISO_8859_1);

    List<File> result = new PdfListParser().apply(listFile);

    // Both entries are expected to be returned by the parser.
    assertEquals(2, result.size());
    assertThat(result, hasItems(accentedLower, accentedUpper));
}

@Test
public void filenameWithQuotes() throws IOException {
File file1 = tmp.newFile("file\"with quotes.pdf");
Expand Down Expand Up @@ -153,7 +168,6 @@ public void filePathsAreTrimmed() throws IOException {
assertThat(parsed, hasItems(file1));
}


@Test
public void weiredLinesDontBlowUp() throws IOException {
List<String> lines = new ArrayList<>();
Expand Down

0 comments on commit 577c73f

Please sign in to comment.