Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix and enhance support for different bazel metadata versions #4194

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ The following organizations or individuals have contributed to ScanCode:
- Abhigyan Kumar Singh @Abhigyankrsingh
- Abhishek Kumar @Abhishek-Dev09
- Aditya Viki @adityaviki
- Adrian Braemer @abraemer
- Agni Bhattacharyya @PyAgni
- Akanksha Garg @akugarg
- Alex Blekhman @a-tinsmith
Expand Down
175 changes: 175 additions & 0 deletions src/licensedcode/juliacode.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
# Startup banner printed when this file is loaded (confirms the Julia side started).
let banner = repeat("*", 20)
    println(banner, "\nHello from Julia!\n", banner)
end

# Multiset of token ids: maps token id => occurrence count (see build_set_and_tids_mset).
const MSet = Dict{Int,Int}

"""
    build_set_and_tids_mset(token_ids)

Build the query-side token structures from a sequence of token ids: return a
`BitSet` of the distinct ids and an `MSet` mapping each id to its occurrence
count. Token ids equal to -1 are skipped.
"""
function build_set_and_tids_mset(token_ids)
    counts = MSet()
    for tid in token_ids
        tid == -1 && continue  # -1 flags an already matched token position
        counts[tid] = get(counts, tid, 0) + 1
    end
    return BitSet(keys(counts)), counts
end

# Total token count of a collection: a BitSet holds each token id once, while
# an MSet maps token id => occurrences, so its total is the sum of the values.
set_counter(tokens::BitSet) = length(tokens)
set_counter(tokens::MSet) = sum(values(tokens))

# Intersection of two token collections of the same flavor.
set_intersector(a::BitSet, b::BitSet) = intersect(a, b)

# Multiset intersection: for every token id present in both, keep the smaller
# occurrence count. Iterate over the smaller dict to do less lookup work.
function set_intersector(a::MSet, b::MSet)
    small, large = length(a) <= length(b) ? (a, b) : (b, a)
    common = MSet()
    for (tid, count) in small
        haskey(large, tid) || continue
        common[tid] = min(count, large[tid])
    end
    return common
end

# Keep only the "high" tokens, i.e. the ids at or below `cutoff`
# (callers pass len_legalese as the cutoff).
set_high_intersection_filter(tokens::BitSet, cutoff) = filter(tid -> tid <= cutoff, tokens)
set_high_intersection_filter(tokens::MSet, cutoff) = filter(entry -> first(entry) <= cutoff, tokens)

"""
    compare_token_sets(qset, iset, len_legalese, min_matched_length_high, min_matched_length;
                       minimum_containment=0, high_resemblance_threshold=0.8)

Score the overlap between a query token collection `qset` and a rule token
collection `iset` (both `BitSet`s, or both `MSet`s). Return
`(score_vectors, high_intersection)` where `score_vectors` is a pair of named
tuples — a rounded vector for coarse ranking and an exact vector for
refinement — or `(nothing, nothing)` as soon as any minimum threshold fails.

Token ids at or below `len_legalese` are treated as "high" (legalese) tokens.
"""
function compare_token_sets(qset, iset, len_legalese, min_matched_length_high, min_matched_length; minimum_containment=0, high_resemblance_threshold=0.8)
    intersection = set_intersector(qset, iset)
    isempty(intersection) && return nothing, nothing

    # require a minimum overlap on the "high" (legalese) tokens alone
    high_intersection = set_high_intersection_filter(intersection, len_legalese)
    isempty(high_intersection) && return nothing, nothing
    # BUG FIX: previously this wrapped the count in `length(...)`; `length`
    # of an `Int` is always 1, so the high-length minimum was never enforced.
    set_counter(high_intersection) < min_matched_length_high && return nothing, nothing

    rule_length = set_counter(iset)
    matched_length = set_counter(intersection)
    matched_length < min_matched_length && return nothing, nothing

    # resemblance is Jaccard-style; containment is how much of the rule matched
    union_len = set_counter(qset) + rule_length - matched_length
    resemblance = matched_length / union_len
    containment = matched_length / rule_length
    containment < minimum_containment && return nothing, nothing

    # squaring amplifies the differences between resemblance values
    amplified_resemblance = resemblance^2

    # coarse vector: rounded fields bucket near-equal candidates together so
    # ranking ties fall through to the next field; matched_length is bucketed by 20
    score_vec1 = (;
        is_highly_resemblant=round(resemblance; digits=1) >= high_resemblance_threshold,
        containment=round(containment; digits=1),
        resemblance=round(amplified_resemblance; digits=1),
        matched_length=round(Int, matched_length / 20))

    # exact vector: used to refine the ordering within a coarse bucket
    score_vec2 = (;
        is_highly_resemblant=resemblance >= high_resemblance_threshold,
        containment=containment,
        resemblance=amplified_resemblance,
        matched_length=matched_length
    )

    return (score_vec1, score_vec2), high_intersection
end

# Concrete NamedTuple type of the score vectors built by `compare_token_sets`.
# Field order matters: tuples of these sort lexicographically, ranking first by
# the high-resemblance flag, then containment, resemblance, matched length.
const ScoreVector = @NamedTuple{is_highly_resemblant::Bool, containment::Float64, resemblance::Float64, matched_length::Int64}

# Per-rule matching thresholds, converted from the Python-side rule objects
# (see `convert_rule_list`: values come from `get_min_matched_length(unique)`,
# `get_min_high_matched_length(unique)` and `_minimum_containment`).
struct RuleInfo
    min_matched_length_unique::Int       # from r.get_min_matched_length(true)
    min_matched_length::Int              # from r.get_min_matched_length(false)
    min_high_matched_length_unique::Int  # from r.get_min_high_matched_length(true)
    min_high_matched_length::Int         # from r.get_min_high_matched_length(false)
    minimum_containment::Float64         # from r._minimum_containment
end

# Convert the Python-side rule objects into plain `RuleInfo` structs so the
# ranking loops run on concrete Julia types. `pyconvert` is presumably
# provided by PythonCall on the embedding side — confirm with the loader.
function convert_rule_list(rules_by_rid)
    return map(rules_by_rid) do rule
        RuleInfo(
            pyconvert(Any, rule.get_min_matched_length(true)),
            pyconvert(Any, rule.get_min_matched_length(false)),
            pyconvert(Any, rule.get_min_high_matched_length(true)),
            pyconvert(Any, rule.get_min_high_matched_length(false)),
            pyconvert(Any, rule._minimum_containment))
    end
end

# Turn a list of optional token id iterables into optional `BitSet`s,
# preserving `nothing` entries as-is.
convert_set_list(sets) = map(s -> isnothing(s) ? nothing : BitSet(s), sets)

# Turn a list of optional (token id => count) collections into optional
# `MSet`s, preserving `nothing` entries as-is.
convert_mset_list(msets) = map(m -> isnothing(m) ? nothing : MSet(m), msets)

"""
    compute_candidates(token_ids, len_legalese, rules_by_rid, sets_by_rid, msets_by_rid,
                       matchable_rids, top=50, high_resemblance=false, high_resemblance_threshold=0.8)

Rank rules as match candidates for a query in two passes: a coarse pass on
token id `BitSet`s, then a refinement pass on token id multisets limited to
the `10 * top` best coarse candidates. Return at most `top` candidates as
`(score_vectors, rid, rule, high_set_intersection)` tuples, best first.
`rid`s are 0-based to stay compatible with the Python caller.
"""
function compute_candidates(token_ids, len_legalese, rules_by_rid, sets_by_rid, msets_by_rid,
        matchable_rids, top=50, high_resemblance=false, high_resemblance_threshold=0.8)
    # query-side token id set and multiset
    qset, qmset = build_set_and_tids_mset(token_ids)

    ############################################################################
    # step 1: coarse ranking on token id sets
    ############################################################################
    sortable_candidates = Tuple{Tuple{ScoreVector,ScoreVector}, Int, RuleInfo, BitSet}[]

    for (rid, rule) in enumerate(rules_by_rid)
        rid -= 1  # rids coming from Python are 0-based; Julia enumerates from 1
        rid in matchable_rids || continue

        scores_vectors, high_set_intersection = compare_token_sets(
            qset,
            sets_by_rid[rid+1],
            len_legalese,
            rule.min_high_matched_length_unique,
            rule.min_matched_length_unique;
            minimum_containment=rule.minimum_containment,
            high_resemblance_threshold)

        if !isnothing(scores_vectors)
            svr, svf = scores_vectors
            # when high_resemblance is requested, both the rounded and the
            # exact score vectors must flag the candidate as highly resemblant
            if !high_resemblance || (svr.is_highly_resemblant && svf.is_highly_resemblant)
                push!(sortable_candidates, (scores_vectors, rid, rule, high_set_intersection))
            end
        end
    end

    isempty(sortable_candidates) && return sortable_candidates

    sort!(sortable_candidates; rev=true)

    ############################################################################
    # step 2: refinement on token id multisets
    ############################################################################
    # keep only the 10 x top best candidates from step 1
    sortable_candidates_new = eltype(sortable_candidates)[]
    for (k, (_score_vectors, rid, rule, high_set_intersection)) in enumerate(sortable_candidates)
        # BUG FIX: was `k >= 10*top`, which kept only 10*top - 1 candidates
        k > 10*top && break
        scores_vectors, _intersection = compare_token_sets(
            qmset,
            msets_by_rid[rid+1],
            len_legalese,
            rule.min_high_matched_length,
            rule.min_matched_length;
            minimum_containment=rule.minimum_containment,
            high_resemblance_threshold)

        if !isnothing(scores_vectors)
            svr, svf = scores_vectors
            if !high_resemblance || (svr.is_highly_resemblant && svf.is_highly_resemblant)
                push!(sortable_candidates_new, (scores_vectors, rid, rule, high_set_intersection))
            end
        end
    end

    isempty(sortable_candidates_new) && return sortable_candidates_new

    # final ranking: best first, truncated to the requested number of candidates
    return sort!(sortable_candidates_new; rev=true)[1:min(top, length(sortable_candidates_new))]
end
71 changes: 23 additions & 48 deletions src/packagedcode/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from collections import defaultdict

from commoncode import fileutils
from packageurl import PackageURL

from licensedcode.cache import build_spdx_license_expression
from licensedcode.cache import get_cache
Expand Down Expand Up @@ -374,54 +375,28 @@ def parse(cls, location, package_only=True):
)
)

if (
'upstream_type'
and 'name'
and 'version'
and 'licenses'
and 'upstream_address'
in metadata_fields
):
# TODO: Create function that determines package type from download URL,
# then create a package of that package type from the metadata info
package_data = dict(
datasource_id=cls.datasource_id,
type=metadata_fields.get('upstream_type', cls.default_package_type),
name=metadata_fields.get('name'),
version=metadata_fields.get('version'),
extracted_license_statement=metadata_fields.get('licenses', []),
parties=parties,
homepage_url=metadata_fields.get('upstream_address', ''),
# TODO: Store 'upstream_hash` somewhere
)
yield models.PackageData.from_data(package_data, package_only=True)

if (
'package_type'
and 'name'
and 'version'
and 'license_expression'
and 'homepage_url'
and 'download_url'
and 'vcs_url'
and 'download_archive_sha1'
and 'vcs_commit_hash'
in metadata_fields
):
package_data = dict(
datasource_id=cls.datasource_id,
type=metadata_fields.get('package_type', cls.default_package_type),
name=metadata_fields.get('name'),
version=metadata_fields.get('version'),
extracted_license_statement=metadata_fields.get('license_expression', ''),
parties=parties,
homepage_url=metadata_fields.get('homepage_url', ''),
download_url=metadata_fields.get('download_url', ''),
vcs_url=metadata_fields.get('vcs_url', ''),
sha1=metadata_fields.get('download_archive_sha1', ''),
extra_data=dict(vcs_commit_hash=metadata_fields.get('vcs_commit_hash', ''))
)
yield models.PackageData.from_data(package_data, package_only=True)
# TODO: Create function that determines package type from download URL,
# then create a package of that package type from the metadata info
package_data = dict(
datasource_id=cls.datasource_id,
type=metadata_fields.get('upstream_type', metadata_fields.get('package_type', cls.default_package_type)),
name=metadata_fields.get('name'),
version=metadata_fields.get('version'),
extracted_license_statement=metadata_fields.get('licenses', metadata_fields.get('license_expression')),
parties=parties,
homepage_url=metadata_fields.get('upstream_address', metadata_fields.get('homepage_url')),
download_url=metadata_fields.get('download_url'),
vcs_url=metadata_fields.get('vcs_url'),
sha1=metadata_fields.get('download_archive_sha1'),
# TODO: Store `upstream_hash` somewhere
)
package_data['extra_data'] = {}
package_data['extra_data']['vcs_commit_hash'] = metadata_fields.get('vcs_commit_hash')
package_data['extra_data']['upstream_hash'] = metadata_fields.get('upstream_hash')
if 'package_url' in metadata_fields:
package_data.update(PackageURL.from_string(metadata_fields['package_url']).to_dict())
yield models.PackageData.from_data(package_data, package_only=True)


@classmethod
def assign_package_to_resources(cls, package, resource, codebase, package_adder):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
METADATA = {
"licenses": [
"BSD-3-Clause",
],
"maintainers": [
"oss_foundation",
],
"name": "androidx.compose.animation:animation",
"upstream_address": "https://developer.android.com/jetpack/androidx/releases/compose-animation#0.0.1",
"version": "0.0.1",
"package_url" : "pkg:maven/androidx.compose.animation/[email protected]"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you link to some examples of this type of manifest with `package_url` fields, and add one of those as a test? It is best to use real-world examples. If you derived this from a real example, you could also link that file here.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, I cannot link to one of our internal files here. However, my example file is very close to an actual file (only the version number differs).

}
22 changes: 22 additions & 0 deletions tests/packagedcode/test_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,28 @@ def test_MetadataBzl_parse(self):
)
expected_packages = [models.PackageData.from_data(package_data=package_data, package_only=True)]
compare_package_results(expected_packages, result_packages)

def test_MetadataBzl_parse_with_package_url(self):
    # Verify that a METADATA.bzl carrying a "package_url" field has that
    # purl decomposed into type/namespace/name/version on the package data
    # (here: pkg:maven/androidx.compose.animation/[email protected]).
    test_file = self.get_test_loc('metadatabzl/with-package-url/METADATA.bzl')
    result_packages = build.BuckMetadataBzlHandler.parse(test_file, package_only=True)
    package_data = dict(
        datasource_id=build.BuckMetadataBzlHandler.datasource_id,
        name='animation',
        namespace='androidx.compose.animation',
        type='maven',
        version='0.0.1',
        extracted_license_statement=['BSD-3-Clause'],
        parties=[
            models.Party(
                type=models.party_org,
                name='oss_foundation',
                role='maintainer'
            )
        ],
        homepage_url='https://developer.android.com/jetpack/androidx/releases/compose-animation#0.0.1',
    )
    expected_packages = [models.PackageData.from_data(package_data=package_data, package_only=True)]
    compare_package_results(expected_packages, result_packages)

def test_MetadataBzl_recognize_new_format(self):
test_file = self.get_test_loc('metadatabzl/new-format/METADATA.bzl')
Expand Down
Loading