From 8e0405ecd40db7b0644eacce73b3071ebbe0d9fd Mon Sep 17 00:00:00 2001
From: Keshav Priyadarshi <git@keshav.space>
Date: Tue, 25 Feb 2025 12:16:10 +0530
Subject: [PATCH 1/4] Create fingerprint for stem code

Signed-off-by: Keshav Priyadarshi <git@keshav.space>
---
 src/matchcode_toolkit/fingerprinting.py | 38 +++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/src/matchcode_toolkit/fingerprinting.py b/src/matchcode_toolkit/fingerprinting.py
index 88aafcb..302a846 100644
--- a/src/matchcode_toolkit/fingerprinting.py
+++ b/src/matchcode_toolkit/fingerprinting.py
@@ -13,6 +13,8 @@
 from licensedcode.tokenize import query_lines
 from samecode.halohash import BitAverageHaloHash
 
+from matchcode_toolkit.stemming import get_stem_code
+
 # A collection of directory fingerprints that we want to avoid
 IGNORED_DIRECTORY_FINGERPRINTS = [
     # This is both the directory content and directory structure fingerprint for
@@ -231,6 +233,42 @@ def get_file_fingerprint_hashes(
     )
 
 
+def get_stem_file_fingerprint_hashes(
+    location,
+    ngram_length=5,
+    window_length=16,
+    include_ngrams=False,
+    **kwargs,
+):
+    """
+    Return a mapping of stem code fingerprint hashes for the file at `location`
+
+    The `halo1` hash is the hex digest of the fingerprint of the file.
+    `halo1` is empty if the file is empty.
+
+    - We start by getting the stemmed code of the file and breaking it into words (tokens)
+    - We compute ngrams over the list of tokens
+
+    Return an empty mapping if `location` is not a text file
+    """
+    from commoncode import filetype
+    from typecode.contenttype import get_type
+
+    # Do not process `location` if it's not a text file
+    ft = get_type(location)
+    if not (filetype.is_file(location) and ft.is_text):
+        return {}
+
+    stemmed_content = get_stem_code(location=location)
+
+    return create_file_fingerprints(
+        stemmed_content,
+        ngram_length=ngram_length,
+        window_length=window_length,
+        include_ngrams=include_ngrams,
+    )
+
+
 def create_file_fingerprints(
     content, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False
 ):

From fc37d161a1133111f83bf7194c6bd8b0510ce34f Mon Sep 17 00:00:00 2001
From: Keshav Priyadarshi <git@keshav.space>
Date: Tue, 25 Feb 2025 12:50:03 +0530
Subject: [PATCH 2/4] Add pipeline step to compute stem fingerprint

Signed-off-by: Keshav Priyadarshi <git@keshav.space>
---
 src/matchcode_toolkit/pipelines/fingerprint_codebase.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/matchcode_toolkit/pipelines/fingerprint_codebase.py b/src/matchcode_toolkit/pipelines/fingerprint_codebase.py
index f4d20a1..59dd634 100644
--- a/src/matchcode_toolkit/pipelines/fingerprint_codebase.py
+++ b/src/matchcode_toolkit/pipelines/fingerprint_codebase.py
@@ -34,7 +34,10 @@ class FingerprintCodebase(Pipeline):
 
     @classmethod
     def steps(cls):
-        return (cls.fingerprint_codebase,)
+        return (
+            cls.fingerprint_codebase,
+            cls.fingerprint_stem_codebase_resources,
+        )
 
     def fingerprint_codebase(self):
         """
@@ -42,3 +45,7 @@ def fingerprint_codebase(self):
         """
         matchcode.fingerprint_codebase_directories(self.project)
         matchcode.fingerprint_codebase_resources(self.project)
+
+    def fingerprint_stem_codebase_resources(self):
+        """Compute stem code fingerprint for resources"""
+        matchcode.fingerprint_stem_codebase_resources(self.project)

From 5201aa53ec222f3de25943fde026b16b19244e7d Mon Sep 17 00:00:00 2001
From: Jono Yang <jyang@nexb.com>
Date: Tue, 25 Feb 2025 10:06:09 -0800
Subject: [PATCH 3/4] Update key field values for stemmed fingerprints

Signed-off-by: Jono Yang <jyang@nexb.com>
---
 src/matchcode_toolkit/fingerprinting.py       | 37 ++++++++++++++-----
 .../pipelines/fingerprint_codebase.py         |  5 +--
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/src/matchcode_toolkit/fingerprinting.py b/src/matchcode_toolkit/fingerprinting.py
index 302a846..e2b7ec0 100644
--- a/src/matchcode_toolkit/fingerprinting.py
+++ b/src/matchcode_toolkit/fingerprinting.py
@@ -226,14 +226,14 @@ def get_file_fingerprint_hashes(
         content = f.read()
 
     return create_file_fingerprints(
-        content,
+        content=content,
         ngram_length=ngram_length,
         window_length=window_length,
         include_ngrams=include_ngrams,
     )
 
 
-def get_stem_file_fingerprint_hashes(
+def get_stemmed_file_fingerprint_hashes(
     location,
     ngram_length=5,
     window_length=16,
@@ -241,7 +241,7 @@ def get_stem_file_fingerprint_hashes(
     **kwargs,
 ):
     """
-    Return a mapping of stem code fingerprint hashes for the file at `location`
+    Return a mapping of stemmed code fingerprint hashes for the file at `location`
 
     The `halo1` hash is the hex digest of the fingerprint of the file.
     `halo1` is empty if the file is empty.
@@ -262,7 +262,7 @@ def get_stem_file_fingerprint_hashes(
     stemmed_content = get_stem_code(location=location)
 
     return create_file_fingerprints(
-        stemmed_content,
+        stemmed_content=stemmed_content,
         ngram_length=ngram_length,
         window_length=window_length,
         include_ngrams=include_ngrams,
@@ -270,20 +270,37 @@ def get_stem_file_fingerprint_hashes(
 
 
 def create_file_fingerprints(
-    content, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False
+    content=None,
+    stemmed_content=None,
+    ngram_length=5,
+    window_length=SNIPPET_WINDOW_LENGTH,
+    include_ngrams=False,
 ):
     """
-    Return a mapping of halo1 and snippet hashes from content string
+    Return a mapping of halo1 and snippet hashes from `content` or `stemmed_content`, not both.
     """
     from licensedcode.tokenize import ngrams
     from licensedcode.tokenize import select_ngrams
 
+    if content is not None and stemmed_content is not None:  # "" still counts as given
+        raise ValueError(
+            "create_file_fingerprints only accepts an input of `content` or `stemmed_content`, not both."
+        )
+
+    if stemmed_content is not None:  # an empty stemmed file must still get stemmed keys
+        halo1_key = "stemmed_halo1"
+        snippets_key = "stemmed_snippets"
+    else:
+        halo1_key = "halo1"
+        snippets_key = "snippets"
+
     fingerprints = {
-        "halo1": "",
-        "snippets": [],
+        halo1_key: "",
+        snippets_key: [],
     }
 
     # tokenize content into words
+    content = stemmed_content if stemmed_content is not None else content  # keep "" as ""
     words = list(tokenizer(content))
 
     # Create a file fingerprint from the number of elements in the content hash
@@ -297,7 +314,7 @@ def create_file_fingerprints(
         content_fingerprint = content_hash.hexdigest().decode("utf-8")
         ngs_count_hex_str = "%08x" % ngs_count
         file_fingerprint = ngs_count_hex_str + content_fingerprint
-        fingerprints["halo1"] = file_fingerprint
+        fingerprints[halo1_key] = file_fingerprint
 
     # Select windows from the content to compute snippet fingerprints
     windows = ngrams(words, window_length)
@@ -317,7 +334,7 @@ def create_file_fingerprints(
             s["ngrams"] = list(window)
         snippets.append(s)
     if snippets:
-        fingerprints["snippets"] = snippets
+        fingerprints[snippets_key] = snippets
 
     return fingerprints
 
diff --git a/src/matchcode_toolkit/pipelines/fingerprint_codebase.py b/src/matchcode_toolkit/pipelines/fingerprint_codebase.py
index 59dd634..6eed839 100644
--- a/src/matchcode_toolkit/pipelines/fingerprint_codebase.py
+++ b/src/matchcode_toolkit/pipelines/fingerprint_codebase.py
@@ -45,7 +45,4 @@ def fingerprint_codebase(self):
         """
         matchcode.fingerprint_codebase_directories(self.project)
         matchcode.fingerprint_codebase_resources(self.project)
-
-    def fingerprint_stem_codebase_resources(self):
-        """Compute stem code fingerprint for resources"""
-        matchcode.fingerprint_stem_codebase_resources(self.project)
+        matchcode.fingerprint_stemmed_codebase_resources(self.project)

From ae010c1e71b6088ede8ec865387b4bcc8783cf3c Mon Sep 17 00:00:00 2001
From: Jono Yang <jyang@nexb.com>
Date: Tue, 25 Feb 2025 12:44:04 -0800
Subject: [PATCH 4/4] Remove step from FingerprintCodebase

Signed-off-by: Jono Yang <jyang@nexb.com>
---
 src/matchcode_toolkit/pipelines/fingerprint_codebase.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/matchcode_toolkit/pipelines/fingerprint_codebase.py b/src/matchcode_toolkit/pipelines/fingerprint_codebase.py
index 6eed839..c74bb99 100644
--- a/src/matchcode_toolkit/pipelines/fingerprint_codebase.py
+++ b/src/matchcode_toolkit/pipelines/fingerprint_codebase.py
@@ -34,10 +34,7 @@ class FingerprintCodebase(Pipeline):
 
     @classmethod
     def steps(cls):
-        return (
-            cls.fingerprint_codebase,
-            cls.fingerprint_stem_codebase_resources,
-        )
+        return (cls.fingerprint_codebase,)
 
     def fingerprint_codebase(self):
         """