Skip to content

Commit 143ffbf

Browse files
committed
Admin/XMover: Add problematic translog detection and cancellation
The new command can find and cancel replica shards with large uncommitted translog sizes. It provides options to set size thresholds and optionally execute cancel commands after user confirmation.
1 parent 256ff3d commit 143ffbf

File tree

4 files changed

+422
-1
lines changed

4 files changed

+422
-1
lines changed

cratedb_toolkit/admin/xmover/analysis/shard.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -838,6 +838,106 @@ def plan_node_decommission(self, node_name: str, min_free_space_gb: float = 100.
838838
}
839839

840840

841+
class TranslogReporter:
842+
def __init__(self, client: CrateDBClient):
843+
self.client = client
844+
845+
def problematic_translogs(self, size_mb: int) -> List[str]:
846+
"""Find and optionally cancel shards with problematic translog sizes."""
847+
console.print(Panel.fit("[bold blue]Problematic Translog Analysis[/bold blue]"))
848+
console.print(f"[dim]Looking for replica shards with translog uncommitted size > {size_mb}MB[/dim]")
849+
console.print()
850+
851+
# Query to find problematic replica shards
852+
query = """
853+
SELECT sh.schema_name, \
854+
sh.table_name, \
855+
translate(p.values::text, ':{}', '=()') as partition_values, \
856+
sh.id AS shard_id, \
857+
node['name'] as node_name, \
858+
sh.translog_stats['uncommitted_size'] / 1024^2 AS translog_uncommitted_mb
859+
FROM
860+
sys.shards AS sh
861+
LEFT JOIN information_schema.table_partitions p
862+
ON sh.table_name = p.table_name
863+
AND sh.schema_name = p.table_schema
864+
AND sh.partition_ident = p.partition_ident
865+
WHERE
866+
sh.state = 'STARTED'
867+
AND sh.translog_stats['uncommitted_size'] \
868+
> ? * 1024^2
869+
AND primary = FALSE
870+
ORDER BY
871+
6 DESC \
872+
"""
873+
874+
try:
875+
result = self.client.execute_query(query, [size_mb])
876+
rows = result.get("rows", [])
877+
878+
if not rows:
879+
console.print(f"[green]✓ No replica shards found with translog uncommitted size > {size_mb}MB[/green]")
880+
return []
881+
882+
console.print(f"Found {len(rows)} shards with problematic translogs:")
883+
console.print()
884+
885+
# Display query results table
886+
results_table = Table(title=f"Problematic Replica Shards (translog > {size_mb}MB)", box=box.ROUNDED)
887+
results_table.add_column("Schema", style="cyan")
888+
results_table.add_column("Table", style="blue")
889+
results_table.add_column("Partition", style="magenta")
890+
results_table.add_column("Shard ID", justify="right", style="yellow")
891+
results_table.add_column("Node", style="green")
892+
results_table.add_column("Translog MB", justify="right", style="red")
893+
894+
for row in rows:
895+
schema_name, table_name, partition_values, shard_id, node_name, translog_mb = row
896+
partition_display = (
897+
partition_values if partition_values and partition_values != "NULL" else "[dim]none[/dim]"
898+
)
899+
results_table.add_row(
900+
schema_name, table_name, partition_display, str(shard_id), node_name, f"{translog_mb:.1f}"
901+
)
902+
903+
console.print(results_table)
904+
console.print()
905+
console.print("[bold]Generated ALTER Commands:[/bold]")
906+
console.print()
907+
908+
# Generate ALTER commands
909+
alter_commands = []
910+
for row in rows:
911+
schema_name, table_name, partition_values, shard_id, node_name, translog_mb = row
912+
913+
# Build the ALTER command based on whether it's partitioned
914+
if partition_values and partition_values != "NULL":
915+
# partition_values already formatted like ("sync_day"=1757376000000) from the translate function
916+
alter_cmd = (
917+
f'ALTER TABLE "{schema_name}"."{table_name}" partition {partition_values} '
918+
f"REROUTE CANCEL SHARD {shard_id} on '{node_name}' WITH (allow_primary=False);"
919+
)
920+
else:
921+
alter_cmd = (
922+
f'ALTER TABLE "{schema_name}"."{table_name}" '
923+
f"REROUTE CANCEL SHARD {shard_id} on '{node_name}' WITH (allow_primary=False);"
924+
)
925+
926+
alter_commands.append(alter_cmd)
927+
console.print(alter_cmd)
928+
929+
console.print()
930+
console.print(f"[bold]Total: {len(alter_commands)} ALTER commands generated[/bold]")
931+
return alter_commands
932+
933+
except Exception as e:
934+
console.print(f"[red]Error analyzing problematic translogs: {e}[/red]")
935+
import traceback
936+
937+
console.print(f"[dim]{traceback.format_exc()}[/dim]")
938+
return []
939+
940+
841941
class ShardReporter:
842942
def __init__(self, analyzer: ShardAnalyzer):
843943
self.analyzer = analyzer

cratedb_toolkit/admin/xmover/cli.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@
1212
from rich.console import Console
1313
from rich.panel import Panel
1414

15-
from cratedb_toolkit.admin.xmover.analysis.shard import ActiveShardMonitor, ShardAnalyzer, ShardReporter
15+
from cratedb_toolkit.admin.xmover.analysis.shard import (
16+
ActiveShardMonitor,
17+
ShardAnalyzer,
18+
ShardReporter,
19+
TranslogReporter,
20+
)
1621
from cratedb_toolkit.admin.xmover.analysis.table import DistributionAnalyzer
1722
from cratedb_toolkit.admin.xmover.analysis.zone import ZoneReport
1823
from cratedb_toolkit.admin.xmover.model import (
@@ -444,6 +449,69 @@ def run_single_analysis():
444449
console.print(f"[dim]{traceback.format_exc()}[/dim]")
445450

446451

452+
@main.command()
453+
@click.option("--size-mb", default=300, help="Minimum translog uncommitted size in MB (default: 300)")
454+
@click.option("--cancel", is_flag=True, help="Execute the cancel commands after confirmation")
455+
@click.pass_context
456+
def problematic_translogs(ctx, size_mb: int, cancel: bool):
457+
"""
458+
Find and optionally cancel shards with problematic translog sizes.
459+
460+
This command identifies replica shards with large uncommitted translog sizes
461+
that may indicate replication issues. By default, it shows the ALTER commands
462+
that would cancel these shards. With --cancel, it executes them after confirmation.
463+
"""
464+
client = ctx.obj["client"]
465+
report = TranslogReporter(client=client)
466+
alter_commands = report.problematic_translogs(size_mb=size_mb)
467+
468+
try:
469+
if cancel and alter_commands:
470+
console.print()
471+
console.print("[yellow]⚠️ WARNING: This will cancel the specified shards![/yellow]")
472+
console.print("[yellow]This may cause temporary data unavailability for these shards.[/yellow]")
473+
console.print()
474+
475+
if click.confirm("Are you sure you want to execute these ALTER commands?"):
476+
console.print()
477+
console.print("[bold blue]Executing ALTER commands...[/bold blue]")
478+
479+
executed = 0
480+
failed = 0
481+
482+
for i, cmd in enumerate(alter_commands, 1):
483+
if cmd.startswith("--"):
484+
console.print(f"[yellow]Skipping command {i} (parse error): {cmd}[/yellow]")
485+
continue
486+
487+
try:
488+
console.print(f"[dim]({i}/{len(alter_commands)}) Executing...[/dim]")
489+
client.execute_query(cmd)
490+
console.print(f"[green]✓ Command {i} executed successfully[/green]")
491+
executed += 1
492+
except Exception as e:
493+
console.print(f"[red]✗ Command {i} failed: {e}[/red]")
494+
failed += 1
495+
496+
# Small delay between commands to avoid overwhelming the cluster
497+
if i < len(alter_commands):
498+
time.sleep(1)
499+
500+
console.print()
501+
console.print("[bold]Execution Summary:[/bold]")
502+
console.print(f"[green]✓ Successful: {executed}[/green]")
503+
if failed > 0:
504+
console.print(f"[red]✗ Failed: {failed}[/red]")
505+
else:
506+
console.print("[yellow]Operation cancelled by user[/yellow]")
507+
508+
except Exception as e:
509+
console.print(f"[red]Error analyzing problematic translogs: {e}[/red]")
510+
import traceback
511+
512+
console.print(f"[dim]{traceback.format_exc()}[/dim]")
513+
514+
447515
@main.command()
448516
@click.option("--table", "-t", help="Analyze zones for specific table only")
449517
@click.option("--show-shards/--no-show-shards", default=False, help="Show individual shard details (default: False)")

doc/admin/xmover/handbook.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,47 @@ xmover monitor-recovery --watch --include-transitioning
245245
- **DISK**: Rebuilding shard from local data (after restart/disk issues)
246246

247247

248+
### `problematic-translogs`
249+
Find and optionally cancel replica shards with problematic translog sizes.
250+
251+
**Options:**
252+
- `--size-mb INTEGER`: Minimum translog uncommitted size in MB (default: 300)
253+
- `--cancel`: Execute the cancel commands after confirmation
254+
255+
**Description:**
256+
This command identifies replica shards with large uncommitted translog sizes that may indicate replication issues. By default, it shows the ALTER commands that would cancel these shards. With `--cancel`, it executes them after confirmation.
257+
258+
**Examples:**
259+
```bash
260+
# Show problematic shards with translog > 300MB (default)
261+
xmover problematic-translogs
262+
263+
# Show shards with translog > 500MB
264+
xmover problematic-translogs --size-mb 500
265+
266+
# Execute cancel commands for shards > 1GB after confirmation
267+
xmover problematic-translogs --size-mb 1000 --cancel
268+
```
269+
270+
**Sample Output:**
271+
```
272+
Found 3 shards with problematic translogs:
273+
Problematic Replica Shards (translog > 300MB)
274+
╭────────┬───────────────────────────────┬────────────────────────────┬──────────┬────────────┬─────────────╮
275+
│ Schema │ Table │ Partition │ Shard ID │ Node │ Translog MB │
276+
├────────┼───────────────────────────────┼────────────────────────────┼──────────┼────────────┼─────────────┤
277+
│ TURVO │ shipmentFormFieldData │ none │ 14 │ data-hot-6 │ 7040.9 │
278+
│ TURVO │ shipmentFormFieldData_events │ ("sync_day"=1757376000000) │ 3 │ data-hot-2 │ 481.2 │
279+
│ TURVO │ orderFormFieldData │ none │ 5 │ data-hot-1 │ 469.5 │
280+
╰────────┴───────────────────────────────┴────────────────────────────┴──────────┴────────────┴─────────────╯
281+
Generated ALTER Commands:
282+
ALTER TABLE "TURVO"."shipmentFormFieldData" REROUTE CANCEL SHARD 14 on 'data-hot-6' WITH (allow_primary=False);
283+
ALTER TABLE "TURVO"."shipmentFormFieldData_events" partition ("sync_day"=1757376000000) REROUTE CANCEL SHARD 3 on 'data-hot-2' WITH (allow_primary=False);
284+
ALTER TABLE "TURVO"."orderFormFieldData" REROUTE CANCEL SHARD 5 on 'data-hot-1' WITH (allow_primary=False);
285+
Total: 3 ALTER commands generated
286+
```
287+
288+
248289
### `active-shards`
249290
Monitor the most active shards by tracking checkpoint progression over time.
250291
This command helps identify which shards are receiving the most write activity

0 commit comments

Comments
 (0)