Merge pull request #21 from aboutcode-org/create-stemcode-fingerprint

JonoYang · web-flow · commit 5abe3f436c3e · 2025-02-25T12:53:44.000-08:00
Add pipeline step to compute stem fingerprint
diff --git a/src/matchcode_toolkit/fingerprinting.py b/src/matchcode_toolkit/fingerprinting.py
@@ -13,6 +13,8 @@
 from licensedcode.tokenize import query_lines
 from samecode.halohash import BitAverageHaloHash
 
+from matchcode_toolkit.stemming import get_stem_code
+
 # A collection of directory fingerprints that we want to avoid
 IGNORED_DIRECTORY_FINGERPRINTS = [
     # This is both the directory content and directory structure fingerprint for
@@ -224,28 +226,81 @@ def get_file_fingerprint_hashes(
         content = f.read()
 
     return create_file_fingerprints(
-        content,
+        content=content,
+        ngram_length=ngram_length,
+        window_length=window_length,
+        include_ngrams=include_ngrams,
+    )
+
+
+def get_stemmed_file_fingerprint_hashes(
+    location,
+    ngram_length=5,
+    window_length=16,
+    include_ngrams=False,
+    **kwargs,
+):
+    """
+    Return a mapping of stemmed code fingerprint hashes for the file at `location`
+
+    The `halo1` hash is the hex digest of the fingerprint of the file.
+    `halo1` is empty if the file is empty.
+
+    - We start by breaking the file into words (tokens)
+    - We compute ngrams over the list of tokens
+
+    Return an empty mapping if `location` is not a text file
+    """
+    from commoncode import filetype
+    from typecode.contenttype import get_type
+
+    # Do not process `location` if it's not a text file
+    ft = get_type(location)
+    if not (filetype.is_file(location) and ft.is_text):
+        return {}
+
+    stemmed_content = get_stem_code(location=location)
+
+    return create_file_fingerprints(
+        stemmed_content=stemmed_content,
         ngram_length=ngram_length,
         window_length=window_length,
         include_ngrams=include_ngrams,
     )
 
 
 def create_file_fingerprints(
-    content, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False
+    content=None,
+    stemmed_content=None,
+    ngram_length=5,
+    window_length=SNIPPET_WINDOW_LENGTH,
+    include_ngrams=False,
 ):
     """
-    Return a mapping of halo1 and snippet hashes from content string
+    Return a mapping of halo1 and snippet hashes from `content` or `stemmed_content`, not both.
     """
     from licensedcode.tokenize import ngrams
     from licensedcode.tokenize import select_ngrams
 
+    if content and stemmed_content:
+        raise Exception(
+            "create_file_fingerprints only accepts an input of `content` or `stemmed_content`, not both."
+        )
+
+    if stemmed_content:
+        halo1_key = "stemmed_halo1"
+        snippets_key = "stemmed_snippets"
+    else:
+        halo1_key = "halo1"
+        snippets_key = "snippets"
+
     fingerprints = {
-        "halo1": "",
-        "snippets": [],
+        halo1_key: "",
+        snippets_key: [],
     }
 
     # tokenize content into words
+    content = content or stemmed_content
     words = list(tokenizer(content))
 
     # Create a file fingerprint from the number of elements in the content hash
@@ -259,7 +314,7 @@ def create_file_fingerprints(
         content_fingerprint = content_hash.hexdigest().decode("utf-8")
         ngs_count_hex_str = "%08x" % ngs_count
         file_fingerprint = ngs_count_hex_str + content_fingerprint
-        fingerprints["halo1"] = file_fingerprint
+        fingerprints[halo1_key] = file_fingerprint
 
     # Select windows from the content to compute snippet fingerprints
     windows = ngrams(words, window_length)
@@ -279,7 +334,7 @@ def create_file_fingerprints(
             s["ngrams"] = list(window)
         snippets.append(s)
     if snippets:
-        fingerprints["snippets"] = snippets
+        fingerprints[snippets_key] = snippets
 
     return fingerprints
 
diff --git a/src/matchcode_toolkit/pipelines/fingerprint_codebase.py b/src/matchcode_toolkit/pipelines/fingerprint_codebase.py
@@ -42,3 +42,4 @@ def fingerprint_codebase(self):
         """
         matchcode.fingerprint_codebase_directories(self.project)
         matchcode.fingerprint_codebase_resources(self.project)
+        matchcode.fingerprint_stemmed_codebase_resources(self.project)