Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cmd/katana/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,8 @@ pipelines offering both headless and non-headless crawling.`)
flagSet.BoolVarP(&options.XhrExtraction, "xhr-extraction", "xhr", false, "extract xhr request url,method in jsonl output"),
flagSet.IntVarP(&options.MaxFailureCount, "max-failure-count", "mfc", 10, "maximum number of consecutive action failures before stopping"),
flagSet.BoolVarP(&options.EnableDiagnostics, "enable-diagnostics", "ed", false, "enable diagnostics"),
flagSet.StringVarP(&options.PageLoadStrategy, "page-load-strategy", "pls", "heuristic", "page load strategy (heuristic, load, domcontentloaded, networkidle, none)"),
flagSet.IntVarP(&options.DOMWaitTime, "dom-wait-time", "dwt", 5, "time in seconds to wait after page load when using domcontentloaded strategy"),
)

flagSet.CreateGroup("scope", "Scope",
Expand Down
28 changes: 28 additions & 0 deletions internal/runner/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,24 @@ func validateOptions(options *types.Options) error {
return errkit.New("no inputs specified for crawler")
}

// Validate page load strategy
if options.PageLoadStrategy != "" {
validStrategies := []string{"heuristic", "load", "domcontentloaded", "networkidle", "none"}
valid := false
for _, s := range validStrategies {
if options.PageLoadStrategy == s {
valid = true
break
}
}
if !valid {
return errkit.New("invalid page-load-strategy: must be one of (heuristic, load, domcontentloaded, networkidle, none)")
}
} else {
// Default to heuristic
options.PageLoadStrategy = "heuristic"
}

// Disabling automatic form fill (-aff) for headless navigation due to incorrect implementation.
// Form filling should be handled via headless actions within the page context
if options.HeadlessHybrid && options.AutomaticFormFill {
Expand All @@ -36,6 +54,16 @@ func validateOptions(options *types.Options) error {
if options.Headless && options.HeadlessHybrid {
return errkit.New("flags -hl (headless) and -hh (hybrid) are mutually exclusive")
}

// Warn if -headless is combined with -cwu (Chrome WebSocket URL): ChromeWSUrl
// takes precedence, so the -headless flag is redundant. In either case, log
// the Chrome endpoint being connected to.
if options.Headless && options.ChromeWSUrl != "" {
gologger.Warning().Msgf("Using -cwu with existing browser session. The -headless flag is redundant.")
gologger.Info().Msgf("Connecting to Chrome at: %s", options.ChromeWSUrl)
} else if options.ChromeWSUrl != "" {
gologger.Info().Msgf("Connecting to Chrome at: %s (using pure headless engine)", options.ChromeWSUrl)
}

if (options.HeadlessOptionalArguments != nil || options.HeadlessNoSandbox || options.SystemChromePath != "") &&
!options.Headless && !options.HeadlessHybrid {
return errkit.New("headless (-hl) or hybrid (-hh) mode is required if -ho, -nos or -scp are set")
Expand Down
4 changes: 4 additions & 0 deletions internal/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ func New(options *types.Options) (*Runner, error) {
var crawler engine.Engine

switch {
case options.ChromeWSUrl != "":
// When connecting to an existing browser via a WebSocket URL,
// use the pure headless engine with advanced crawling features.
crawler, err = headless.New(crawlerOptions)
case options.Headless:
crawler, err = headless.New(crawlerOptions)
case options.HeadlessHybrid:
Expand Down
Loading