Skip to content

Commit 0cf17cd

Browse files
committed
CODEC-335: Add DigestUtils.gitBlob and DigestUtils.gitTree methods
This change adds two methods to `DigestUtils` that compute generalized Git object identifiers using an arbitrary `MessageDigest`, rather than being restricted to SHA-1: - `gitBlob(digest, input)`: computes a generalized [Git blob object identifier](https://git-scm.com/book/en/v2/Git-Internals-Git-Objects) for a given file or byte content. - `gitTree(digest, file)`: computes a generalized [Git tree object identifier](https://git-scm.com/book/en/v2/Git-Internals-Git-Objects) for a given directory. ### Motivation The standard Git object identifiers use SHA-1, which is [in the process of being replaced by SHA-256](https://git-scm.com/docs/hash-function-transition) in Git itself. These methods generalize the identifier computation to support any `MessageDigest`, enabling both forward compatibility and use with external standards. In particular, the `swh:1:cnt:` (content) and `swh:1:dir:` (directory) identifier types defined by [SWHID (ISO/IEC 18670)](https://www.swhid.org/specification/v1.2/5.Core_identifiers/) are currently compatible with Git blob and tree identifiers respectively (using SHA-1), and can be used to generate canonical, persistent identifiers for unpacked source and binary distributions.
1 parent f3b0eb5 commit 0cf17cd

File tree

8 files changed

+465
-0
lines changed

8 files changed

+465
-0
lines changed

src/changes/changes.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ The <action> type attribute can be add,update,fix,remove.
5151
<!-- ADD -->
5252
<action type="add" dev="ggregory" due-to="Inkeet, Gary Gregory, Wolff Bock von Wuelfingen" issue="CODEC-326">Add Base58 support.</action>
5353
<action type="add" dev="ggregory" due-to="Gary Gregory">Add BaseNCodecInputStream.AbstracBuilder.setByteArray(byte[]).</action>
54+
<action type="add" issue="CODEC-335" dev="pkarwasz" due-to="Piotr P. Karwasz">Add DigestUtils.gitBlob() and DigestUtils.gitTree() to compute Git blob and tree object identifiers.</action>
5455
<!-- UPDATE -->
5556
<action type="update" dev="ggregory" due-to="Gary Gregory">Bump org.apache.commons:commons-parent from 96 to 97.</action>
5657
</release>

src/main/java/org/apache/commons/codec/digest/DigestUtils.java

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,24 @@
1818
package org.apache.commons.codec.digest;
1919

2020
import java.io.BufferedInputStream;
21+
import java.io.ByteArrayOutputStream;
2122
import java.io.File;
2223
import java.io.IOException;
2324
import java.io.InputStream;
2425
import java.io.RandomAccessFile;
2526
import java.nio.ByteBuffer;
2627
import java.nio.channels.FileChannel;
28+
import java.nio.charset.StandardCharsets;
29+
import java.nio.file.DirectoryStream;
2730
import java.nio.file.Files;
2831
import java.nio.file.OpenOption;
2932
import java.nio.file.Path;
3033
import java.security.MessageDigest;
3134
import java.security.NoSuchAlgorithmException;
35+
import java.util.ArrayList;
36+
import java.util.Collection;
37+
import java.util.List;
38+
import java.util.TreeSet;
3239

3340
import org.apache.commons.codec.binary.Hex;
3441
import org.apache.commons.codec.binary.StringUtils;
@@ -139,6 +146,131 @@ public static byte[] digest(final MessageDigest messageDigest, final RandomAcces
139146
return updateDigest(messageDigest, data).digest();
140147
}
141148

149+
/**
150+
* Reads through a byte array and return a generalized Git blob identifier
151+
*
152+
* <p>The identifier is computed in the way described by the
153+
* <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash
154+
* algorithm.</p>
155+
*
156+
* <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
157+
*
158+
* @param messageDigest The MessageDigest to use (for example SHA-1).
159+
* @param data Data to digest.
160+
* @return A generalized Git blob identifier.
161+
* @since 1.22.0
162+
*/
163+
public static byte[] gitBlob(final MessageDigest messageDigest, final byte[] data) {
164+
updateDigest(messageDigest, gitBlobPrefix(data.length));
165+
return digest(messageDigest, data);
166+
}
167+
168+
/**
169+
* Reads through a byte array and return a generalized Git blob identifier
170+
*
171+
* <p>The identifier is computed in the way described by the
172+
* <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash
173+
* algorithm.</p>
174+
*
175+
* <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
176+
*
177+
* @param messageDigest The MessageDigest to use (for example SHA-1).
178+
* @param data Data to digest.
179+
* @param options Options how to open the file
180+
* @return A generalized Git blob identifier.
181+
* @throws IOException On error accessing the file
182+
* @since 1.22.0
183+
*/
184+
public static byte[] gitBlob(final MessageDigest messageDigest, final Path data, final OpenOption... options) throws IOException {
185+
updateDigest(messageDigest, gitBlobPrefix(Files.size(data)));
186+
return updateDigest(messageDigest, data, options).digest();
187+
}
188+
189+
private static byte[] gitBlobPrefix(final long dataSize) {
190+
return ("blob " + dataSize + "\0").getBytes(StandardCharsets.UTF_8);
191+
}
192+
193+
/**
194+
* Returns a generalized Git tree identifier
195+
*
196+
* <p>The identifier is computed in the way described by the
197+
* <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID directory identifier</a>, but it can use any hash
198+
* algorithm.</p>
199+
*
200+
* <p>When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.</p>
201+
*
202+
* @param messageDigest The MessageDigest to use (for example SHA-1)
203+
* @param entries The directory entries
204+
* @return A generalized Git tree identifier.
205+
*/
206+
static byte[] gitTree(final MessageDigest messageDigest, final Collection<GitDirectoryEntry> entries) {
207+
final TreeSet<GitDirectoryEntry> treeSet = new TreeSet<>(entries);
208+
final ByteArrayOutputStream baos = new ByteArrayOutputStream();
209+
for (final GitDirectoryEntry entry : treeSet) {
210+
final byte[] treeEntryBytes = entry.toTreeEntryBytes();
211+
baos.write(treeEntryBytes, 0, treeEntryBytes.length);
212+
}
213+
updateDigest(messageDigest, gitTreePrefix(baos.size()));
214+
return updateDigest(messageDigest, baos.toByteArray()).digest();
215+
}
216+
217+
/**
218+
* Reads through a byte array and return a generalized Git tree identifier
219+
*
220+
* <p>The identifier is computed in the way described by the
221+
* <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID directory identifier</a>, but it can use any hash
222+
* algorithm.</p>
223+
*
224+
* <p>When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.</p>
225+
*
226+
* @param messageDigest The MessageDigest to use (for example SHA-1).
227+
* @param data Data to digest.
228+
* @param options Options how to open the file
229+
* @return A generalized Git tree identifier.
230+
* @throws IOException On error accessing the file
231+
* @since 1.22.0
232+
*/
233+
public static byte[] gitTree(final MessageDigest messageDigest, final Path data, final OpenOption...options) throws IOException {
234+
final List<GitDirectoryEntry> entries = new ArrayList<>();
235+
try (DirectoryStream<Path> files = Files.newDirectoryStream(data)) {
236+
for (final Path path : files) {
237+
final GitDirectoryEntry.Type type = getGitDirectoryEntryType(path);
238+
final byte[] rawObjectId;
239+
if (type == GitDirectoryEntry.Type.DIRECTORY) {
240+
rawObjectId = gitTree(messageDigest, path, options);
241+
} else {
242+
rawObjectId = gitBlob(messageDigest, path, options);
243+
}
244+
entries.add(new GitDirectoryEntry(path, type, rawObjectId));
245+
}
246+
}
247+
return gitTree(messageDigest, entries);
248+
}
249+
250+
/**
251+
* Returns the {@link GitDirectoryEntry.Type} of a file.
252+
*
253+
* @param path The file to check.
254+
* @return A {@link GitDirectoryEntry.Type}
255+
*/
256+
private static GitDirectoryEntry.Type getGitDirectoryEntryType(final Path path) {
257+
// Symbolic links first
258+
if (Files.isSymbolicLink(path)) {
259+
return GitDirectoryEntry.Type.SYMBOLIC_LINK;
260+
}
261+
if (Files.isDirectory(path)) {
262+
return GitDirectoryEntry.Type.DIRECTORY;
263+
}
264+
if (Files.isExecutable(path)) {
265+
return GitDirectoryEntry.Type.EXECUTABLE;
266+
}
267+
return GitDirectoryEntry.Type.REGULAR;
268+
}
269+
270+
private static byte[] gitTreePrefix(final long dataSize) {
271+
return ("tree " + dataSize + "\0").getBytes(StandardCharsets.UTF_8);
272+
}
273+
142274
/**
143275
* Gets a {@code MessageDigest} for the given {@code algorithm}.
144276
*
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* https://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.commons.codec.digest;
19+
20+
import java.nio.charset.StandardCharsets;
21+
import java.nio.file.Path;
22+
23+
/**
 * Represents a single entry in a Git tree object.
 *
 * <p>A Git tree object encodes a directory snapshot. Each entry holds:</p>
 * <ul>
 * <li>a {@link Type} that determines the Unix file mode (e.g. {@code 100644} for a regular file),</li>
 * <li>the entry name (file or directory name, without a path separator),</li>
 * <li>the raw object id of the referenced blob or sub-tree.</li>
 * </ul>
 *
 * <p>Entries are ordered by {@link #compareTo} using Git's tree-sort rule: names are compared byte by byte (as unsigned bytes of their UTF-8 encoding), and
 * directory names are compared as if they ended with {@code '/'}, so that {@code foo/} sorts after {@code foo.txt} but before {@code foobar}.</p>
 *
 * <p>Note that {@link #equals(Object)} and {@link #hashCode()} consider the name only, while {@link #compareTo} also takes the directory suffix into
 * account.</p>
 *
 * <p>Call {@link #toTreeEntryBytes()} to obtain the binary encoding that Git feeds to its hash function when computing the tree object identifier.</p>
 *
 * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
 * @see <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID Directory Identifier</a>
 */
class GitDirectoryEntry implements Comparable<GitDirectoryEntry> {

    /**
     * The entry name (file or directory name, no path separator).
     */
    private final String name;

    /**
     * The key used for ordering entries within a tree object.
     *
     * <p>Git appends {@code '/'} to directory names before comparing.</p>
     */
    private final String sortKey;

    /**
     * The UTF-8 encoding of {@link #sortKey}, computed once so that sorting does not re-encode the key on every comparison.
     */
    private final byte[] sortKeyBytes;

    /**
     * The Git object type, which determines the Unix file-mode prefix.
     */
    private final Type type;

    /**
     * The raw object id of the referenced blob or sub-tree.
     */
    private final byte[] rawObjectId;

    private GitDirectoryEntry(final String name, final Type type, final byte[] rawObjectId) {
        this.name = name;
        this.type = type;
        this.sortKey = type == Type.DIRECTORY ? name + "/" : name;
        this.sortKeyBytes = sortKey.getBytes(StandardCharsets.UTF_8);
        this.rawObjectId = rawObjectId;
    }

    /**
     * Creates an entry named after the last element of a path.
     *
     * @param path The path whose file name becomes the entry name.
     * @param type The type of the entry.
     * @param rawObjectId The raw object id of the referenced blob or sub-tree; the array is stored without copying.
     */
    GitDirectoryEntry(final Path path, final Type type, final byte[] rawObjectId) {
        this(path.getFileName().toString(), type, rawObjectId);
    }

    /**
     * Returns the binary encoding of this entry as it appears inside a Git tree object.
     *
     * <p>The format follows the Git tree entry layout:</p>
     * <pre>
     *   &lt;mode&gt; SP &lt;name&gt; NUL &lt;raw-object-id&gt;
     * </pre>
     *
     * <p>The length of the raw object id depends on the hash algorithm that produced it (20 bytes for SHA-1).</p>
     *
     * @return the binary tree-entry encoding; never {@code null}
     */
    byte[] toTreeEntryBytes() {
        final byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8);
        // Two extra bytes: the space after the mode and the NUL after the name.
        final byte[] result = new byte[type.mode.length + nameBytes.length + rawObjectId.length + 2];
        System.arraycopy(type.mode, 0, result, 0, type.mode.length);
        result[type.mode.length] = ' ';
        System.arraycopy(nameBytes, 0, result, type.mode.length + 1, nameBytes.length);
        result[type.mode.length + nameBytes.length + 1] = '\0';
        System.arraycopy(rawObjectId, 0, result, type.mode.length + nameBytes.length + 2, rawObjectId.length);
        return result;
    }

    /**
     * Compares two entries using Git's tree-sort rule.
     *
     * <p>The sort keys are compared as unsigned bytes of their UTF-8 encoding. {@link String#compareTo(String)} is not suitable here because it orders by
     * UTF-16 code unit, which places supplementary characters before the characters in the range U+E000 to U+FFFF, the opposite of the byte order.</p>
     *
     * @param o The entry to compare with.
     * @return a negative integer, zero, or a positive integer as this entry sorts before, equal to, or after {@code o}
     */
    @Override
    public int compareTo(final GitDirectoryEntry o) {
        final byte[] left = sortKeyBytes;
        final byte[] right = o.sortKeyBytes;
        final int limit = Math.min(left.length, right.length);
        for (int i = 0; i < limit; i++) {
            final int cmp = Integer.compare(left[i] & 0xFF, right[i] & 0xFF);
            if (cmp != 0) {
                return cmp;
            }
        }
        if (left.length != right.length) {
            return Integer.compare(left.length, right.length);
        }
        // Identical bytes normally mean identical keys; the fallback only matters for keys containing characters
        // that UTF-8 cannot represent (unpaired surrogates), so that distinct keys never compare as equal.
        return sortKey.compareTo(o.sortKey);
    }

    @Override
    public int hashCode() {
        return name.hashCode();
    }

    @Override
    public boolean equals(final Object obj) {
        if (obj == this) {
            return true;
        }
        if (!(obj instanceof GitDirectoryEntry)) {
            return false;
        }
        final GitDirectoryEntry other = (GitDirectoryEntry) obj;
        return name.equals(other.name);
    }

    /**
     * The type of a Git tree entry, which maps to a Unix file-mode string.
     *
     * <p>Git encodes the file type and permission bits as an ASCII octal string that precedes the entry name in the binary tree format. The values defined here
     * cover the four entry types that Git itself produces.</p>
     *
     * <p>This enum is package-private. If it were made public, {@link #mode} would need to be wrapped in an immutable copy to prevent external mutation.</p>
     */
    enum Type {

        /**
         * A sub-directory (Git sub-tree)
         */
        DIRECTORY("40000"),

        /**
         * An executable file
         */
        EXECUTABLE("100755"),

        /**
         * A regular (non-executable) file
         */
        REGULAR("100644"),

        /**
         * A symbolic link
         */
        SYMBOLIC_LINK("120000");

        /**
         * The ASCII-encoded octal mode string as it appears in the binary tree entry.
         */
        private final byte[] mode;

        Type(final String mode) {
            this.mode = mode.getBytes(StandardCharsets.US_ASCII);
        }
    }
}

0 commit comments

Comments
 (0)