Skip to content

Commit 97a6770

Browse files
committed
refs 116 updates due to version 1.0 spec
1 parent e5b0c5d commit 97a6770

File tree

18 files changed

+183
-42
lines changed

18 files changed

+183
-42
lines changed

src/main/java/gov/loc/repository/bagit/conformance/BagLinter.java

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@
2525
import gov.loc.repository.bagit.domain.Version;
2626
import gov.loc.repository.bagit.exceptions.InvalidBagMetadataException;
2727
import gov.loc.repository.bagit.exceptions.InvalidBagitFileFormatException;
28+
import gov.loc.repository.bagit.exceptions.MaliciousPathException;
2829
import gov.loc.repository.bagit.exceptions.UnparsableVersionException;
30+
import gov.loc.repository.bagit.exceptions.UnsupportedAlgorithmException;
2931
import gov.loc.repository.bagit.exceptions.conformance.BagitVersionIsNotAcceptableException;
3032
import gov.loc.repository.bagit.exceptions.conformance.FetchFileNotAllowedException;
3133
import gov.loc.repository.bagit.exceptions.conformance.MetatdataValueIsNotAcceptableException;
@@ -39,6 +41,8 @@
3941

4042
/**
4143
* Responsible for checking a bag and providing insight into how it may cause problems.
44+
* This class is only to be used on VALID bags; using it on un-validated bags may result in
45+
* exceptions being thrown (like {@link java.io.IOException} )
4246
*/
4347
public final class BagLinter {
4448
private static final Logger logger = LoggerFactory.getLogger(BagLinter.class);
@@ -57,7 +61,7 @@ private BagLinter(){
5761
* @param jsonProfile the input stream to the json string describing the profile
5862
* @param bag the bag to check against the profile
5963
*
60-
* @throws IOException if there is a problem reading the profile
64+
* @throws IOException if there is a problem reading the profile or some of the bag files
6165
* @throws JsonMappingException if there is a problem mapping the profile to the {@link BagitProfile}
6266
* @throws JsonParseException if there is a problem parsing the json while mapping to java object
6367
*
@@ -88,8 +92,10 @@ public static void checkAgainstProfile(final InputStream jsonProfile, final Bag
8892
* @throws InvalidBagMetadataException if the bag metadata does not conform to the bagit specification
8993
* @throws UnparsableVersionException if there is an error reading the bagit version
9094
* @throws IOException if there was an error reading a file
95+
* @throws UnsupportedAlgorithmException if there is an error while reading one of the manifests due to the algorithm being unsupported
96+
* @throws MaliciousPathException if the path is crafted to be malicious (overwrite non bag files)
9197
*/
92-
public static Set<BagitWarning> lintBag(final Path rootDir) throws IOException, UnparsableVersionException, InvalidBagMetadataException, InvalidBagitFileFormatException{
98+
public static Set<BagitWarning> lintBag(final Path rootDir) throws IOException, UnparsableVersionException, InvalidBagMetadataException, InvalidBagitFileFormatException, MaliciousPathException, UnsupportedAlgorithmException{
9399
return lintBag(rootDir, Collections.emptyList());
94100
}
95101

@@ -107,8 +113,10 @@ public static Set<BagitWarning> lintBag(final Path rootDir) throws IOException,
107113
* @throws InvalidBagMetadataException if the bag metadata does not conform to the bagit specification
108114
* @throws UnparsableVersionException if there is an error reading the bagit version
109115
* @throws IOException if there was an error reading a file
116+
* @throws UnsupportedAlgorithmException if there is an error while reading one of the manifests due to the algorithm being unsupported
117+
* @throws MaliciousPathException if the path is crafted to be malicious (overwrite non bag files)
110118
*/
111-
public static Set<BagitWarning> lintBag(final Path rootDir, final Collection<BagitWarning> warningsToIgnore) throws IOException, UnparsableVersionException, InvalidBagMetadataException, InvalidBagitFileFormatException{
119+
public static Set<BagitWarning> lintBag(final Path rootDir, final Collection<BagitWarning> warningsToIgnore) throws IOException, UnparsableVersionException, InvalidBagMetadataException, InvalidBagitFileFormatException, MaliciousPathException, UnsupportedAlgorithmException{
112120
final Set<BagitWarning> warnings = new HashSet<>();
113121

114122
//@Incubating
@@ -128,7 +136,7 @@ public static Set<BagitWarning> lintBag(final Path rootDir, final Collection<Bag
128136
VersionChecker.checkVersion(bagitInfo.getKey(), warnings, warningsToIgnore);
129137

130138
logger.info(messages.getString("checking_manifest_problems"));
131-
ManifestChecker.checkManifests(bagitDir, bagitInfo.getValue(), warnings, warningsToIgnore);
139+
ManifestChecker.checkManifests(bagitInfo.getKey(), bagitDir, bagitInfo.getValue(), warnings, warningsToIgnore);
132140

133141
logger.info(messages.getString("checking_metadata_problems"));
134142
MetadataChecker.checkBagMetadata(bagitDir, bagitInfo.getValue(), warnings, warningsToIgnore);

src/main/java/gov/loc/repository/bagit/conformance/BagitWarning.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ public enum BagitWarning {
2020
OS_SPECIFIC_FILES("os_specific_files"),
2121
PAYLOAD_OXUM_MISSING("payload_oxum_missing"),
2222
TAG_FILES_ENCODING("tag_files_encoding"),
23-
WEAK_CHECKSUM_ALGORITHM("weak_checksum_algorithm");
23+
WEAK_CHECKSUM_ALGORITHM("weak_checksum_algorithm"),
24+
MANIFEST_SETS_DIFFER("manifest_file_sets_differ_between_algorithms");
2425

2526
private final String messageBundleKey;
2627
private static final ResourceBundle messages = ResourceBundle.getBundle("MessageBundle");

src/main/java/gov/loc/repository/bagit/conformance/ManifestChecker.java

Lines changed: 120 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,23 +7,32 @@
77
import java.nio.file.Files;
88
import java.nio.file.Path;
99
import java.text.Normalizer;
10+
import java.util.ArrayList;
1011
import java.util.Collection;
1112
import java.util.HashSet;
13+
import java.util.List;
1214
import java.util.ResourceBundle;
1315
import java.util.Set;
1416

1517
import org.slf4j.Logger;
1618
import org.slf4j.LoggerFactory;
1719
import org.slf4j.helpers.MessageFormatter;
1820

21+
import gov.loc.repository.bagit.domain.Manifest;
22+
import gov.loc.repository.bagit.domain.Version;
1923
import gov.loc.repository.bagit.exceptions.InvalidBagitFileFormatException;
24+
import gov.loc.repository.bagit.exceptions.MaliciousPathException;
25+
import gov.loc.repository.bagit.exceptions.UnsupportedAlgorithmException;
26+
import gov.loc.repository.bagit.hash.StandardBagitAlgorithmNameToSupportedAlgorithmMapping;
27+
import gov.loc.repository.bagit.reader.ManifestReader;
2028
import gov.loc.repository.bagit.util.PathUtils;
2129

2230
/**
2331
* Part of the BagIt conformance suite.
2432
* This checker checks for various problems related to the manifests in a bag.
2533
*/
26-
@SuppressWarnings({"PMD.UseLocaleWithCaseConversions"})
34+
//TODO refactor to remove PMD warnings!
35+
@SuppressWarnings({"PMD.UseLocaleWithCaseConversions", "PMD.TooManyMethods", "PMD.GodClass"})
2736
public final class ManifestChecker {
2837
private static final Logger logger = LoggerFactory.getLogger(ManifestChecker.class);
2938
private static final ResourceBundle messages = ResourceBundle.getBundle("MessageBundle");
@@ -34,46 +43,79 @@ public final class ManifestChecker {
3443
private static final String TRASHES_FILE = "\\.(_.)?[Tt][Rr][Aa][Ss][Hh][Ee][Ss]";
3544
private static final String FS_EVENTS_FILE = "\\.[Ff][Ss][Ee][Vv][Ee][Nn][Tt][Ss][Dd]";
3645
private static final String OS_FILES_REGEX = ".*data/(" + THUMBS_DB_FILE + "|" + DS_STORE_FILE + "|" + SPOTLIGHT_FILE + "|" + TRASHES_FILE + "|" + FS_EVENTS_FILE + ")";
46+
private static final Version VERSION_1_0 = new Version(1,0);
3747

3848
private ManifestChecker(){
3949
//intentionally left empty
4050
}
4151

42-
/*
52+
/**
4353
* Check for all the manifest specific potential problems
54+
*
55+
* @param version the version of the bag we are checking
56+
* @param bagitDir the directory where the manifests are stored
57+
* @param encoding the encoding of the manifests
58+
* @param warnings the set of warnings that will be appended to while checking
59+
* @param warningsToIgnore the set of warnings to ignore
60+
*
61+
* @throws IOException if there is a problem reading a file (because it doesn't exist)
62+
* @throws InvalidBagitFileFormatException if one (or more) of the files does not match the formatting as specified in the specification
63+
* @throws MaliciousPathException if someone crafted the bag to specifically try and write outside the bag directory
64+
* @throws UnsupportedAlgorithmException if a manifest uses an algorithm that the computer doesn't know how to use
4465
*/
45-
public static void checkManifests(final Path bagitDir, final Charset encoding, final Set<BagitWarning> warnings,
46-
final Collection<BagitWarning> warningsToIgnore) throws IOException, InvalidBagitFileFormatException{
66+
//@SuppressWarnings("PMD.CyclomaticComplexity")
67+
public static void checkManifests(final Version version, final Path bagitDir, final Charset encoding, final Set<BagitWarning> warnings,
68+
final Collection<BagitWarning> warningsToIgnore) throws IOException, InvalidBagitFileFormatException, MaliciousPathException, UnsupportedAlgorithmException{
4769

4870
boolean missingTagManifest = true;
71+
final List<Path> payloadManifests = new ArrayList<>();
72+
final List<Path> tagManifests = new ArrayList<>();
4973
try(final DirectoryStream<Path> files = Files.newDirectoryStream(bagitDir)){
5074
for(final Path file : files){
51-
final String filename = PathUtils.getFilename(file);
52-
if(filename.contains("manifest-")){
53-
if(filename.startsWith("manifest-")){
54-
checkData(file, encoding, warnings, warningsToIgnore, true);
55-
}
56-
else{
57-
checkData(file, encoding, warnings, warningsToIgnore, false);
58-
missingTagManifest = false;
59-
}
60-
61-
final String algorithm = filename.split("[-\\.]")[1];
62-
checkAlgorthm(algorithm, warnings, warningsToIgnore);
63-
}
75+
missingTagManifest = missingTagManifest && checkManifest(file, payloadManifests, tagManifests, encoding, warnings, warningsToIgnore);
6476
}
6577
}
6678

79+
if(!warnings.contains(BagitWarning.MANIFEST_SETS_DIFFER)){
80+
checkManifestSets(version, tagManifests, payloadManifests, warnings, encoding);
81+
}
82+
6783
if(!warningsToIgnore.contains(BagitWarning.MISSING_TAG_MANIFEST) && missingTagManifest){
6884
logger.warn(messages.getString("bag_missing_tag_manifest_warning"), bagitDir);
6985
warnings.add(BagitWarning.MISSING_TAG_MANIFEST);
7086
}
7187
}
7288

89+
private static boolean checkManifest(final Path file, final List<Path> payloadManifests, final List<Path> tagManifests,
90+
final Charset encoding, final Set<BagitWarning> warnings,
91+
final Collection<BagitWarning> warningsToIgnore) throws IOException, InvalidBagitFileFormatException{
92+
boolean missingTagManifest = true;
93+
final String filename = PathUtils.getFilename(file);
94+
if(filename.contains("manifest-")){
95+
if(filename.startsWith("manifest-")){
96+
payloadManifests.add(file);
97+
checkManifestPayload(file, encoding, warnings, warningsToIgnore, true);
98+
}
99+
else{
100+
tagManifests.add(file);
101+
checkManifestPayload(file, encoding, warnings, warningsToIgnore, false);
102+
missingTagManifest = false;
103+
}
104+
105+
final String algorithm = filename.split("[-\\.]")[1];
106+
checkAlgorthm(algorithm, warnings, warningsToIgnore);
107+
}
108+
109+
return missingTagManifest;
110+
}
111+
73112
/*
74-
* Check for a "bag within a bag" and for relative paths in the manifests
113+
* Check for a "bag within a bag", relative paths, and OS specific files in the manifests
75114
*/
76-
private static void checkData(final Path manifestFile, final Charset encoding, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore, final boolean isPayloadManifest) throws IOException, InvalidBagitFileFormatException{
115+
private static void checkManifestPayload(final Path manifestFile, final Charset encoding, final Set<BagitWarning> warnings,
116+
final Collection<BagitWarning> warningsToIgnore, final boolean isPayloadManifest)
117+
throws IOException, InvalidBagitFileFormatException{
118+
77119
try(final BufferedReader reader = Files.newBufferedReader(manifestFile, encoding)){
78120
final Set<String> paths = new HashSet<>();
79121

@@ -82,28 +124,24 @@ private static void checkData(final Path manifestFile, final Charset encoding, f
82124
String path = parsePath(line);
83125

84126
path = checkForManifestCreatedWithMD5SumTools(path, warnings, warningsToIgnore);
85-
86-
if(!warningsToIgnore.contains(BagitWarning.DIFFERENT_CASE) && paths.contains(path.toLowerCase())){
87-
logger.warn(messages.getString("different_case_warning"), manifestFile, path);
88-
warnings.add(BagitWarning.DIFFERENT_CASE);
89-
}
90127
paths.add(path.toLowerCase());
91128

129+
checkForDifferentCase(path, paths, manifestFile, warnings, warningsToIgnore);
92130
if(encoding.name().startsWith("UTF")){
93131
checkNormalization(path, manifestFile.getParent(), warnings, warningsToIgnore);
94132
}
95-
96133
checkForBagWithinBag(line, warnings, warningsToIgnore, isPayloadManifest);
97-
98134
checkForRelativePaths(line, warnings, warningsToIgnore, manifestFile);
99-
100135
checkForOSSpecificFiles(line, warnings, warningsToIgnore, manifestFile);
101136

102137
line = reader.readLine();
103138
}
104139
}
105140
}
106141

142+
/*
143+
* Check to make sure it conforms to <hash> <path>
144+
*/
107145
static String parsePath(final String line) throws InvalidBagitFileFormatException{
108146
final String[] parts = line.split("\\s+", 2);
109147
if(parts.length < 2){
@@ -114,6 +152,9 @@ static String parsePath(final String line) throws InvalidBagitFileFormatExceptio
114152
return parts[1];
115153
}
116154

155+
/*
156+
* We allow for MD5sum tools for compatibility but it is not recommended
157+
*/
117158
private static String checkForManifestCreatedWithMD5SumTools(final String path, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore){
118159
String fixedPath = path;
119160
final boolean startsWithStar = path.charAt(0) == '*';
@@ -130,6 +171,17 @@ private static String checkForManifestCreatedWithMD5SumTools(final String path,
130171
return fixedPath;
131172
}
132173

174+
/*
175+
* Check that the same line doesn't already exist in the set of paths
176+
*/
177+
private static void checkForDifferentCase(final String path, final Set<String> paths, final Path manifestFile,
178+
final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore){
179+
if(!warningsToIgnore.contains(BagitWarning.DIFFERENT_CASE) && paths.contains(path.toLowerCase())){
180+
logger.warn(messages.getString("different_case_warning"), manifestFile, path);
181+
warnings.add(BagitWarning.DIFFERENT_CASE);
182+
}
183+
}
184+
133185
/*
134186
* Check that the file specified has not changed its normalization (i.e. have the bytes changed but it still looks the same?)
135187
*/
@@ -210,6 +262,47 @@ else if(!warningsToIgnore.contains(BagitWarning.NON_STANDARD_ALGORITHM) && !"SHA
210262
warnings.add(BagitWarning.NON_STANDARD_ALGORITHM);
211263
}
212264
}
265+
266+
static void checkManifestSets(final Version version, final List<Path> tagManifests, final List<Path> payloadManifests,
267+
final Set<BagitWarning> warnings, final Charset encoding)
268+
throws IOException, MaliciousPathException, UnsupportedAlgorithmException, InvalidBagitFileFormatException{
269+
//edge case, for version 1.0+ all tag manifests SHOULD list the same set of files
270+
if(tagManifests.size() > 1 && VERSION_1_0.isSameOrOlder(version)){
271+
checkManifestsListSameSetOfFiles(warnings, tagManifests, encoding);
272+
}
273+
274+
//edge case, for version 1.0+ all payload manifests SHOULD list the same set of files
275+
if(payloadManifests.size() > 1 && VERSION_1_0.isSameOrOlder(version)){
276+
checkManifestsListSameSetOfFiles(warnings, payloadManifests, encoding);
277+
}
278+
}
279+
280+
//starting with version 1.0 all manifest types (tag, payload) should list the same set of files
281+
@SuppressWarnings("PMD.EmptyCatchBlock")
282+
static void checkManifestsListSameSetOfFiles(final Set<BagitWarning> warnings, final List<Path> manifestPaths, final Charset charset) throws IOException, MaliciousPathException, UnsupportedAlgorithmException, InvalidBagitFileFormatException{
283+
final StandardBagitAlgorithmNameToSupportedAlgorithmMapping nameMapping = new StandardBagitAlgorithmNameToSupportedAlgorithmMapping();
284+
285+
Manifest compareToManifest = null;
286+
Path compareToManifestPath = null;
287+
for (final Path manifestPath : manifestPaths) {
288+
try {
289+
final Manifest manifest = ManifestReader.readManifest(nameMapping, manifestPath, manifestPath.getParent(), charset);
290+
if(compareToManifest == null) {
291+
compareToManifestPath = manifestPath;
292+
compareToManifest = manifest;
293+
continue;
294+
}
295+
296+
if(!compareToManifest.getFileToChecksumMap().keySet().equals(manifest.getFileToChecksumMap().keySet())) {
297+
logger.warn(messages.getString("manifest_fileset_differ"), compareToManifestPath, manifestPath);
298+
warnings.add(BagitWarning.MANIFEST_SETS_DIFFER);
299+
}
300+
}
301+
catch(UnsupportedAlgorithmException e) {
302+
//ignore an unsupported algorithm as it is caught in checkAlgorthm()
303+
}
304+
}
305+
}
213306

214307
//for unit test only
215308
static String getOsFilesRegex() {

src/main/java/gov/loc/repository/bagit/domain/Version.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ public Version(final int major, final int minor){
1717
this.cachedToString = major + "." + minor;
1818
}
1919

20-
public static Version LATEST_BAGIT_VERSION(){
20+
public static Version LATEST_BAGIT_VERSION() {
2121
return new Version(1, 0);
2222
}
2323

src/main/resources/MessageBundle.properties

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ os_specific_files=Files created by the operating system (OS) for its own use. Th
3131
payload_oxum_missing=It is recommended to always include the Payload-Oxum in the bag metadata since it allows for a 'quick verification' of the bag.
3232
tag_files_encoding=It is recommended to always use UTF-8.
3333
weak_checksum_algorithm=The checksum algorithm used is known to be weak. We recommend using SHA-512.
34+
manifest_file_sets_differ_between_algorithms=As of bagit version 1.0 it is recommended that all payload manifests contain the same set of files as other payload manifests. It is also recommended that all tag manifests contain the same set of files as other tag manifests.
3435

3536
#for BagLinter.java
3637
checking_encoding_problems=Checking encoding problems.
@@ -83,6 +84,7 @@ leading_dot_slash_warning=In manifest [{}] line [{}] is a non-normalized path.
8384
os_specific_files_warning=In manifest [{}] line [{}] contains an OS specific file.
8485
weak_algorithm_warning=Detected a known weak algorithm [{}]. With the great advances in computer hardware there is little penalty to using more bits to calculate the checksum.
8586
non_standard_algorithm_warning=Detected algorithm [{}] which is not included by default in Java. This will make it more difficult to read this bag on some systems. Consider changing it to SHA-512.
87+
manifest_fileset_differ=Manifest [{}] does not contain the same set of files as manifest [{}], it is recommended that they be the same.
8688

8789
#for MetadataChecker.java
8890
missing_payload_oxum_warning=The Payload-Oxum key was not found in the bag metadata. This will prevent a "quick verify".

src/test/java/gov/loc/repository/bagit/conformance/BagLinterTest.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ public void testClassIsWellDefined() throws NoSuchMethodException, InvocationTar
3232
public void testLintBag() throws Exception{
3333
Set<BagitWarning> expectedWarnings = new HashSet<>();
3434
expectedWarnings.addAll(Arrays.asList(BagitWarning.values()));
35+
expectedWarnings.remove(BagitWarning.MANIFEST_SETS_DIFFER); //only applies to version 1.0 but need older version for other warnings, so we test this separately
3536
Set<BagitWarning> warnings = BagLinter.lintBag(rootDir);
3637

3738
if(FileSystems.getDefault().getClass().getName() == "sun.nio.fs.MacOSXFileSystem"){

0 commit comments

Comments
 (0)