From 8e0405ecd40db7b0644eacce73b3071ebbe0d9fd Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Tue, 25 Feb 2025 12:16:10 +0530 Subject: [PATCH 1/4] Create fingerprint for stem code Signed-off-by: Keshav Priyadarshi --- src/matchcode_toolkit/fingerprinting.py | 38 +++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/matchcode_toolkit/fingerprinting.py b/src/matchcode_toolkit/fingerprinting.py index 88aafcb..302a846 100644 --- a/src/matchcode_toolkit/fingerprinting.py +++ b/src/matchcode_toolkit/fingerprinting.py @@ -13,6 +13,8 @@ from licensedcode.tokenize import query_lines from samecode.halohash import BitAverageHaloHash +from matchcode_toolkit.stemming import get_stem_code + # A collection of directory fingerprints that we want to avoid IGNORED_DIRECTORY_FINGERPRINTS = [ # This is both the directory content and directory structure fingerprint for @@ -231,6 +233,42 @@ def get_file_fingerprint_hashes( ) +def get_stem_file_fingerprint_hashes( + location, + ngram_length=5, + window_length=16, + include_ngrams=False, + **kwargs, +): + """ + Return a mapping of stem code fingerprint hashes for the file at `location` + + The `halo1` hash is the hex digest of the fingerprint of the file. + `halo1` is empty if the file is empty. + + - We start by breaking the file into words (tokens) + - We compute ngrams over the list of tokens + + Return an empty mapping if `location` is not a text file + """ + from commoncode import filetype + from typecode.contenttype import get_type + + # Do not process `location` if it's not a text file + ft = get_type(location) + if not (filetype.is_file(location) and ft.is_text): + return {} + + stemmed_content = get_stem_code(location=location) + + return create_file_fingerprints( + stemmed_content, + ngram_length=ngram_length, + window_length=window_length, + include_ngrams=include_ngrams, + ) + + def create_file_fingerprints( content, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False ): From fc37d161a1133111f83bf7194c6bd8b0510ce34f Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Tue, 25 Feb 2025 12:50:03 +0530 Subject: [PATCH 2/4] Add pipeline step to compute stem fingerprint Signed-off-by: Keshav Priyadarshi --- src/matchcode_toolkit/pipelines/fingerprint_codebase.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/matchcode_toolkit/pipelines/fingerprint_codebase.py b/src/matchcode_toolkit/pipelines/fingerprint_codebase.py index f4d20a1..59dd634 100644 --- a/src/matchcode_toolkit/pipelines/fingerprint_codebase.py +++ b/src/matchcode_toolkit/pipelines/fingerprint_codebase.py @@ -34,7 +34,10 @@ class FingerprintCodebase(Pipeline): @classmethod def steps(cls): - return (cls.fingerprint_codebase,) + return ( + cls.fingerprint_codebase, + cls.fingerprint_stem_codebase_resources, + ) def fingerprint_codebase(self): """ @@ -42,3 +45,7 @@ def fingerprint_codebase(self): """ matchcode.fingerprint_codebase_directories(self.project) matchcode.fingerprint_codebase_resources(self.project) + + def fingerprint_stem_codebase_resources(self): + """Compute stem code fingerprint for resources""" + matchcode.fingerprint_stem_codebase_resources(self.project) From 5201aa53ec222f3de25943fde026b16b19244e7d Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 25 Feb 2025 10:06:09 -0800 Subject: [PATCH 3/4] Update key field values for stemmed fingerprints Signed-off-by: Jono Yang --- src/matchcode_toolkit/fingerprinting.py | 37 ++++++++++++++----- .../pipelines/fingerprint_codebase.py | 5 +-- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/matchcode_toolkit/fingerprinting.py b/src/matchcode_toolkit/fingerprinting.py index 302a846..e2b7ec0 100644 --- a/src/matchcode_toolkit/fingerprinting.py +++ b/src/matchcode_toolkit/fingerprinting.py @@ -226,14 +226,14 @@ def get_file_fingerprint_hashes( content = f.read() return create_file_fingerprints( - content, + content=content, ngram_length=ngram_length, window_length=window_length, include_ngrams=include_ngrams, ) -def get_stem_file_fingerprint_hashes( +def get_stemmed_file_fingerprint_hashes( location, ngram_length=5, window_length=16, @@ -241,7 +241,7 @@ def get_stem_file_fingerprint_hashes( **kwargs, ): """ - Return a mapping of stem code fingerprint hashes for the file at `location` + Return a mapping of stemmed code fingerprint hashes for the file at `location` The `halo1` hash is the hex digest of the fingerprint of the file. `halo1` is empty if the file is empty. @@ -262,7 +262,7 @@ def get_stem_file_fingerprint_hashes( stemmed_content = get_stem_code(location=location) return create_file_fingerprints( - stemmed_content, + stemmed_content=stemmed_content, ngram_length=ngram_length, window_length=window_length, include_ngrams=include_ngrams, @@ -270,20 +270,37 @@ def get_stem_file_fingerprint_hashes( def create_file_fingerprints( - content, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False + content=None, + stemmed_content=None, + ngram_length=5, + window_length=SNIPPET_WINDOW_LENGTH, + include_ngrams=False, ): """ - Return a mapping of halo1 and snippet hashes from content string + Return a mapping of halo1 and snippet hashes from `content` or `stemmed_content`, not both. """ from licensedcode.tokenize import ngrams from licensedcode.tokenize import select_ngrams + if content and stemmed_content: + raise Exception( + "create_file_fingerprints only accepts an input of `content` or `stemmed_content`, not both." + ) + + if stemmed_content: + halo1_key = "stemmed_halo1" + snippets_key = "stemmed_snippets" + else: + halo1_key = "halo1" + snippets_key = "snippets" + fingerprints = { - "halo1": "", - "snippets": [], + halo1_key: "", + snippets_key: [], } # tokenize content into words + content = content or stemmed_content words = list(tokenizer(content)) # Create a file fingerprint from the number of elements in the content hash @@ -297,7 +314,7 @@ def create_file_fingerprints( content_fingerprint = content_hash.hexdigest().decode("utf-8") ngs_count_hex_str = "%08x" % ngs_count file_fingerprint = ngs_count_hex_str + content_fingerprint - fingerprints["halo1"] = file_fingerprint + fingerprints[halo1_key] = file_fingerprint # Select windows from the content to compute snippet fingerprints windows = ngrams(words, window_length) @@ -317,7 +334,7 @@ def create_file_fingerprints( s["ngrams"] = list(window) snippets.append(s) if snippets: - fingerprints["snippets"] = snippets + fingerprints[snippets_key] = snippets return fingerprints diff --git a/src/matchcode_toolkit/pipelines/fingerprint_codebase.py b/src/matchcode_toolkit/pipelines/fingerprint_codebase.py index 59dd634..6eed839 100644 --- a/src/matchcode_toolkit/pipelines/fingerprint_codebase.py +++ b/src/matchcode_toolkit/pipelines/fingerprint_codebase.py @@ -45,7 +45,4 @@ def fingerprint_codebase(self): """ matchcode.fingerprint_codebase_directories(self.project) matchcode.fingerprint_codebase_resources(self.project) - - def fingerprint_stem_codebase_resources(self): - """Compute stem code fingerprint for resources""" - matchcode.fingerprint_stem_codebase_resources(self.project) + matchcode.fingerprint_stemmed_codebase_resources(self.project) From ae010c1e71b6088ede8ec865387b4bcc8783cf3c Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 25 Feb 2025 12:44:04 -0800 Subject: [PATCH 4/4] Remove step from FingerprintCodebase Signed-off-by: Jono Yang --- src/matchcode_toolkit/pipelines/fingerprint_codebase.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/matchcode_toolkit/pipelines/fingerprint_codebase.py b/src/matchcode_toolkit/pipelines/fingerprint_codebase.py index 6eed839..c74bb99 100644 --- a/src/matchcode_toolkit/pipelines/fingerprint_codebase.py +++ b/src/matchcode_toolkit/pipelines/fingerprint_codebase.py @@ -34,10 +34,7 @@ class FingerprintCodebase(Pipeline): @classmethod def steps(cls): - return ( - cls.fingerprint_codebase, - cls.fingerprint_stem_codebase_resources, - ) + return (cls.fingerprint_codebase,) def fingerprint_codebase(self): """