Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add transitive extraction for Maven pom.xml #399

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
188 changes: 188 additions & 0 deletions extractor/filesystem/language/java/pomxmlnet/extractor.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
// Package pomxmlnet extracts Maven's pom.xml format with transitive dependency resolution.
cuixq marked this conversation as resolved.
Show resolved Hide resolved
cuixq marked this conversation as resolved.
Show resolved Hide resolved
package pomxmlnet

import (
	"context"
	"fmt"
	"path/filepath"
	"strings"

	"golang.org/x/exp/maps"

	"deps.dev/util/maven"
	"deps.dev/util/resolve"
	"deps.dev/util/resolve/dep"
	mavenresolve "deps.dev/util/resolve/maven"
	"github.com/google/osv-scalibr/extractor"
	"github.com/google/osv-scalibr/extractor/filesystem"
	"github.com/google/osv-scalibr/extractor/filesystem/osv"
	"github.com/google/osv-scalibr/internal/datasource"
	"github.com/google/osv-scalibr/internal/mavenutil"
	"github.com/google/osv-scalibr/internal/resolution/client"
	"github.com/google/osv-scalibr/plugin"
	"github.com/google/osv-scalibr/purl"
)

// Extractor extracts Maven packages with transitive dependency resolution.
// TODO: Use the virtual filesystem rather than the real filesystem.
cuixq marked this conversation as resolved.
Show resolved Hide resolved
type Extractor struct {
// DependencyClient is the client Extract wraps in an override client to
// resolve the dependency graph; registries are also added to it.
client.DependencyClient
// MavenRegistryAPIClient is the client Extract uses to register the
// project's repositories, merge parent data, and look up dependency management.
*datasource.MavenRegistryAPIClient
}

// Name returns the identifier of this extractor.
func (e Extractor) Name() string {
	const name = "java/pomxml"
	return name
}
cuixq marked this conversation as resolved.
Show resolved Hide resolved

// Version returns the version number of this extractor.
func (e Extractor) Version() int {
	const version = 0
	return version
}

// Requirements of the extractor.
func (e Extractor) Requirements() *plugin.Capabilities {
return &plugin.Capabilities{
Network: true,
cuixq marked this conversation as resolved.
Show resolved Hide resolved
}
}

// FileRequired reports whether the file should be handed to Extract, which is
// the case when its base name is exactly "pom.xml".
func (e Extractor) FileRequired(fapi filesystem.FileAPI) bool {
	base := filepath.Base(fapi.Path())
	return base == "pom.xml"
}

// Extract parses the pom.xml file passed through the scan input, resolves its
// dependency graph, and returns one inventory entry per resolved package name.
// The root project itself is not included in the result.
//
// An error is returned if the file cannot be decoded, or if merging profiles,
// adding registries, merging parents, or resolving the graph fails.
//
// TODO: Split this function into smaller helpers and look at reducing
// duplication with the Maven guided remediation logic.
func (e Extractor) Extract(ctx context.Context, input *filesystem.ScanInput) ([]*extractor.Inventory, error) {
var project maven.Project
if err := datasource.NewMavenDecoder(input.Reader).Decode(&project); err != nil {
return nil, fmt.Errorf("could not extract from %s: %w", input.Path, err)
}
// Empty JDK and ActivationOS indicates merging the default profiles.
if err := project.MergeProfiles("", maven.ActivationOS{}); err != nil {
return nil, fmt.Errorf("failed to merge profiles: %w", err)
}
// Register every repository declared in the project with the Maven registry
// client before parents are merged below.
for _, repo := range project.Repositories {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a lot of code in Extract, I wonder if we could move some of these into separate functions.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Resolving Maven pom.xml requires a lot of steps and some of these steps are already done in packages (e.g. deps.dev/util/maven).
Considering we are moving over the Maven guided remediation logic here, we may consider do some refactoring later to see if possible to reduce duplication.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair enough - in that case maybe we should add a TODO with a link to an issue about doing that refactoring later.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oops, I missed this comment - will add a TODO

if err := e.MavenRegistryAPIClient.AddRegistry(datasource.MavenRegistry{
URL: string(repo.URL),
ID: string(repo.ID),
ReleasesEnabled: repo.Releases.Enabled.Boolean(),
SnapshotsEnabled: repo.Snapshots.Enabled.Boolean(),
}); err != nil {
return nil, fmt.Errorf("failed to add registry %s: %w", repo.URL, err)
}
}
// Merging parents data by parsing local parent pom.xml or fetching from upstream.
if err := mavenutil.MergeParents(ctx, input, e.MavenRegistryAPIClient, &project, project.Parent, 1, true); err != nil {
return nil, fmt.Errorf("failed to merge parents: %w", err)
}
// Process the dependencies:
// - dedupe dependencies and dependency management
// - import dependency management
// - fill in missing dependency version requirement
project.ProcessDependencies(func(groupID, artifactID, version maven.String) (maven.DependencyManagement, error) {
return mavenutil.GetDependencyManagement(ctx, e.MavenRegistryAPIClient, groupID, artifactID, version)
})

// Pass the registries known to the Maven registry client on to the
// dependency client, copying them element by element into the
// []client.Registry that AddRegistries takes.
if registries := e.MavenRegistryAPIClient.GetRegistries(); len(registries) > 0 {
clientRegs := make([]client.Registry, len(registries))
for i, reg := range registries {
clientRegs[i] = reg
}
if err := e.DependencyClient.AddRegistries(clientRegs); err != nil {
return nil, err
}
}

// Wrap the dependency client so that the local project can be registered
// as a version (see AddVersion below) before resolution.
overrideClient := client.NewOverrideClient(e.DependencyClient)
resolver := mavenresolve.NewResolver(overrideClient)

// Resolve the dependencies.
root := resolve.Version{
VersionKey: resolve.VersionKey{
PackageKey: resolve.PackageKey{
System: resolve.Maven,
Name: project.ProjectKey.Name(),
},
VersionType: resolve.Concrete,
Version: string(project.Version),
}}
// The requirements are the direct dependencies followed by the dependency
// management entries, stored in that order in a single slice.
reqs := make([]resolve.RequirementVersion, len(project.Dependencies)+len(project.DependencyManagement.Dependencies))
for i, d := range project.Dependencies {
reqs[i] = resolve.RequirementVersion{
VersionKey: resolve.VersionKey{
PackageKey: resolve.PackageKey{
System: resolve.Maven,
Name: d.Name(),
},
VersionType: resolve.Requirement,
Version: string(d.Version),
},
Type: resolve.MavenDepType(d, ""),
}
}
for i, d := range project.DependencyManagement.Dependencies {
// Offset by the number of direct dependencies already written above.
reqs[len(project.Dependencies)+i] = resolve.RequirementVersion{
VersionKey: resolve.VersionKey{
PackageKey: resolve.PackageKey{
System: resolve.Maven,
Name: d.Name(),
},
VersionType: resolve.Requirement,
Version: string(d.Version),
},
// Dependency management entries are tagged with the management origin.
Type: resolve.MavenDepType(d, mavenutil.OriginManagement),
}
}
overrideClient.AddVersion(root, reqs)

g, err := resolver.Resolve(ctx, root.VersionKey)
if err != nil {
return nil, fmt.Errorf("failed resolving %v: %w", root, err)
}
// Reset the type of every edge to the zero value.
// NOTE(review): the loop variable e shadows the method receiver inside this loop.
for i, e := range g.Edges {
e.Type = dep.Type{}
g.Edges[i] = e
}

// Keyed by package name, so a later node with the same name replaces an
// earlier one.
details := map[string]*extractor.Inventory{}
for i := 1; i < len(g.Nodes); i++ {
// Ignore the first node which is the root.
node := g.Nodes[i]
depGroups := []string{}
inventory := extractor.Inventory{
Name: node.Version.Name,
Version: node.Version.Version,
// TODO(rexpan): Add merged paths in here as well
erikvarga marked this conversation as resolved.
Show resolved Hide resolved
Locations: []string{input.Path},
}
// We are only able to know dependency groups of direct dependencies but
// not transitive dependencies because the nodes in the resolve graph do
// not have the scope information.
// NOTE(review): the loop variable dep shadows the imported dep package
// inside this loop.
for _, dep := range project.Dependencies {
if dep.Name() != inventory.Name {
continue
}
// An empty or "compile" scope is not recorded as a dependency group.
if dep.Scope != "" && dep.Scope != "compile" {
depGroups = append(depGroups, string(dep.Scope))
}
}
inventory.Metadata = osv.DepGroupMetadata{
DepGroupVals: depGroups,
}
details[inventory.Name] = &inventory
}

// NOTE(review): the result is built from map values, so the order of the
// returned slice is presumably not stable between runs - confirm whether
// callers rely on ordering.
return maps.Values(details), nil
}

// ToPURL converts an inventory created by this extractor into a PURL.
//
// For the Maven PURL type the group ID belongs in Namespace and the artifact
// ID in Name. Inventory names are expected to have the form
// "groupId:artifactId" (TODO confirm against the resolver output), so the name
// is split at the first colon. A name without a colon is passed through
// unchanged as Name with an empty Namespace.
func (e Extractor) ToPURL(i *extractor.Inventory) *purl.PackageURL {
	group, artifact, found := strings.Cut(i.Name, ":")
	if !found {
		// Not in "groupId:artifactId" form; keep the name untouched.
		group, artifact = "", i.Name
	}
	return &purl.PackageURL{
		Type:      purl.TypeMaven,
		Namespace: group,
		Name:      artifact,
		Version:   i.Version,
	}
}

// Ecosystem returns the OSV ecosystem ('Maven') of the software extracted by this extractor.
func (e Extractor) Ecosystem(_ *extractor.Inventory) string {
	const ecosystem = "Maven"
	return ecosystem
}

// Compile-time check that Extractor satisfies the filesystem.Extractor interface.
var _ filesystem.Extractor = Extractor{}
Loading
Loading