Skip to content

Commit eccd4c6

Browse files
Fix credentials being exposed in `ps` command output; add a flow-control feature to prevent OOMs on the destination
1 parent d84e7b2 commit eccd4c6

File tree

12 files changed

+540
-37
lines changed

12 files changed

+540
-37
lines changed

.DS_Store

6 KB
Binary file not shown.

cmd/.DS_Store

0 Bytes
Binary file not shown.

cmd/docStreamer/main.go

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"github.com/Percona-Lab/percona-docstreamer/internal/config"
3434
"github.com/Percona-Lab/percona-docstreamer/internal/dbops"
3535
"github.com/Percona-Lab/percona-docstreamer/internal/discover"
36+
"github.com/Percona-Lab/percona-docstreamer/internal/flow"
3637
"github.com/Percona-Lab/percona-docstreamer/internal/logging"
3738
"github.com/Percona-Lab/percona-docstreamer/internal/pid"
3839
"github.com/Percona-Lab/percona-docstreamer/internal/status"
@@ -221,12 +222,8 @@ func startAction(cmd *cobra.Command, args []string) {
221222
os.Exit(1)
222223
}
223224

224-
var hiddenargs = []string{"run",
225-
"--docdb-user", docdbUser,
226-
"--mongo-user", mongoUser,
227-
"--docdb-pass", docdbPass,
228-
"--mongo-pass", mongoPass,
229-
}
225+
var hiddenargs = []string{"run"}
226+
230227
if config.Cfg.Migration.Destroy {
231228
hiddenargs = append(hiddenargs, "--destroy")
232229
}
@@ -241,6 +238,15 @@ func startAction(cmd *cobra.Command, args []string) {
241238
runCmd.SysProcAttr = &syscall.SysProcAttr{
242239
Setsid: true, // Detach
243240
}
241+
242+
// Inherit the current environment
243+
runCmd.Env = os.Environ()
244+
// Append credentials explicitly
245+
runCmd.Env = append(runCmd.Env, fmt.Sprintf("DOCDB_USER=%s", docdbUser))
246+
runCmd.Env = append(runCmd.Env, fmt.Sprintf("MONGO_USER=%s", mongoUser))
247+
runCmd.Env = append(runCmd.Env, fmt.Sprintf("DOCDB_PASS=%s", docdbPass))
248+
runCmd.Env = append(runCmd.Env, fmt.Sprintf("MONGO_PASS=%s", mongoPass))
249+
244250
if err := runCmd.Start(); err != nil {
245251
logging.PrintError(fmt.Sprintf("Failed to launch background process: %v", err), 0)
246252
os.Exit(1)
@@ -539,9 +545,25 @@ func runMigrationProcess(cmd *cobra.Command, args []string) {
539545
}()
540546

541547
docdbUser, _ := cmd.Flags().GetString("docdb-user")
548+
if docdbUser == "" {
549+
docdbUser = os.Getenv("DOCDB_USER")
550+
}
551+
542552
mongoUser, _ := cmd.Flags().GetString("mongo-user")
553+
if mongoUser == "" {
554+
mongoUser = os.Getenv("MONGO_USER")
555+
}
556+
543557
docdbPass, _ := cmd.Flags().GetString("docdb-pass")
558+
if docdbPass == "" {
559+
docdbPass = os.Getenv("DOCDB_PASS")
560+
}
561+
544562
mongoPass, _ := cmd.Flags().GetString("mongo-pass")
563+
if mongoPass == "" {
564+
mongoPass = os.Getenv("MONGO_PASS")
565+
}
566+
545567
destroy, _ := cmd.Flags().GetBool("destroy")
546568

547569
docdbURI := config.Cfg.BuildDocDBURI(docdbUser, docdbPass)
@@ -578,6 +600,7 @@ func runMigrationProcess(cmd *cobra.Command, args []string) {
578600
logging.PrintError(err.Error(), 0)
579601
return
580602
}
603+
581604
// --- Disconnect with timeout ---
582605
defer func() {
583606
dCtx, dCancel := context.WithTimeout(context.Background(), 5*time.Second)
@@ -647,8 +670,15 @@ func runMigrationProcess(cmd *cobra.Command, args []string) {
647670
valStore := validator.NewStore(targetClient)
648671

649672
apiServer = api.NewServer(config.Cfg.Migration.StatusHTTPPort)
673+
// Initialize Status Manager BEFORE Flow Manager
650674
statusManager = status.NewManager(targetClient, false)
651675

676+
// --- FLOW CONTROL ---
677+
// Pass mongoUser and mongoPass so we can connect to shards if discovered
678+
flowManager := flow.NewManager(targetClient, statusManager, mongoUser, mongoPass)
679+
flowManager.Start()
680+
defer flowManager.Stop()
681+
652682
validationManager := validator.NewManager(sourceClient, targetClient, tracker, valStore, statusManager)
653683
defer validationManager.Close()
654684

@@ -730,7 +760,7 @@ func runMigrationProcess(cmd *cobra.Command, args []string) {
730760
logging.PrintPhase("4", "FULL DATA LOAD")
731761
statusManager.SetState("running", "Initial Sync (Full Load)")
732762

733-
_, err = launchFullLoadWorkers(ctx, sourceClient, targetClient, toRun, statusManager, checkpointManager)
763+
_, err = launchFullLoadWorkers(ctx, sourceClient, targetClient, toRun, statusManager, checkpointManager, flowManager)
734764
if err != nil {
735765
if err != context.Canceled {
736766
statusManager.SetError(err.Error())
@@ -785,6 +815,8 @@ func runMigrationProcess(cmd *cobra.Command, args []string) {
785815

786816
startAt = resumeAt
787817
statusManager.SetCloneCompleted()
818+
819+
statusManager.SetInitialSyncCompleted(0)
788820
}
789821

790822
if ctx.Err() != nil {
@@ -805,6 +837,7 @@ func runMigrationProcess(cmd *cobra.Command, args []string) {
805837
tracker,
806838
valStore,
807839
validationManager,
840+
flowManager,
808841
)
809842

810843
cdcManager.Start(ctx)
@@ -824,7 +857,7 @@ func extractDBNames(collections []discover.CollectionInfo) []string {
824857
return dbNames
825858
}
826859

827-
func launchFullLoadWorkers(ctx context.Context, source, target *mongo.Client, collections []discover.CollectionInfo, statusMgr *status.Manager, checkpointMgr *checkpoint.Manager) (bson.Timestamp, error) {
860+
func launchFullLoadWorkers(ctx context.Context, source, target *mongo.Client, collections []discover.CollectionInfo, statusMgr *status.Manager, checkpointMgr *checkpoint.Manager, flowMgr *flow.Manager) (bson.Timestamp, error) {
828861
jobs := make(chan discover.CollectionInfo, len(collections))
829862
for _, c := range collections {
830863
jobs <- c
@@ -844,7 +877,7 @@ func launchFullLoadWorkers(ctx context.Context, source, target *mongo.Client, co
844877
}
845878
ns := collInfo.Namespace
846879
logging.PrintStep(fmt.Sprintf("[Worker %d] Starting full load for %s", workerID, ns), 0)
847-
copier := cloner.NewCopyManager(source, target, collInfo, statusMgr, checkpointMgr, config.Cfg.Migration.CheckpointDocID)
880+
copier := cloner.NewCopyManager(source, target, collInfo, statusMgr, checkpointMgr, config.Cfg.Migration.CheckpointDocID, flowMgr)
848881
docCount, _, err := copier.Do(ctx)
849882
start := time.Now()
850883
logging.LogFullLoadOp(start, ns, docCount, err)

config.yaml

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,11 +143,15 @@ cloner:
143143
# Increase writers to achieve 1:8 ratio with readers, this typically provides an effective performance
144144
# however it is important to keep in mind this is per collection, so tune this wisely (see max_concurrent_workers)
145145
# e.g. If num_read_workers = 4 then set num_insert_workers to 32 (e.g. 4 * 8 = 32)
146+
# CRITICAL: Reduce insert workers to limit concurrent writes.
147+
# High values here can OOM a shard by flooding it with connections/data.
148+
# Recommendation: 4-8 per collection
146149
num_insert_workers: 8
147150

148151
# read_batch_size: Number of documents per read batch
149152
read_batch_size: 1000
150153

154+
# Reduce batch size to lower memory pressure per request on the cluster.
151155
# insert_batch_size: Number of documents per insert batch
152156
insert_batch_size: 1000
153157

@@ -205,4 +209,41 @@ cdc:
205209
# write_timeout_ms: The maximum time to wait for a BulkWrite operation to complete.
206210
# If the network hangs, this ensures the worker doesn't freeze forever.
207211
# Default: 30000 (30 seconds)
208-
write_timeout_ms: 30000
212+
write_timeout_ms: 30000
213+
214+
# -----------------------------------------------
215+
# Adaptive Flow Control (Throttling)
216+
# -----------------------------------------------
217+
flow_control:
218+
# Enabled: If true, docStreamer continuously monitors the target MongoDB's health.
219+
# It polls `db.serverStatus()` on the target (and all shards, if discovered) at the configured interval (default: every second).
220+
# If any node shows signs of overload (high queues or memory usage),
221+
# docStreamer will temporarily pause fetching new data from the source.
222+
enabled: true
223+
224+
# Check Interval: How often (in milliseconds) to poll the target database for its status.
225+
# Default: 1000 (1 second)
226+
check_interval_ms: 1000
227+
228+
# Target Max Queued Ops: The safety limit for the Target's Global Lock Queue.
229+
# Source Metric: db.serverStatus().globalLock.currentQueue.total
230+
# Checks BOTH Mongos and all backend Shards.
231+
# If ANY node has more than this many operations queued, docStreamer pauses.
232+
# Default: 50
233+
target_max_queued_ops: 50
234+
235+
# Target Max Resident MB: The safety limit for Target RAM usage.
236+
# Source Metric: db.serverStatus().mem.resident
237+
# If the target's Resident Memory exceeds this value (in Megabytes), docStreamer will pause.
238+
# This is critical for preventing OOM (Out of Memory) kills on the target host.
239+
# SHARDED CLUSTERS NOTE:
240+
# If the migration user exists on the backend shards, this setting PROTECTS THE SHARDS directly.
241+
# If the user only exists on Mongos, this setting only protects the Mongos router.
242+
# Set to 0 to disable memory-based throttling.
243+
# Default: 0 (Disabled)
244+
target_max_resident_mb: 0
245+
246+
# Pause Duration: How long (in milliseconds) to sleep when an overload is detected.
247+
# The application will re-check the status after this duration.
248+
# Default: 500
249+
pause_duration_ms: 500

faq.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,43 @@ Indexes created on the source **after** migration starts (during CDC) are **not*
212212

213213
---
214214

215+
### Adaptive Flow Control
216+
217+
**Q: Why do I need Flow Control? Can't I just increase the number of workers?**
218+
**A:** Increasing workers speeds up data *reading*, but it can easily overwhelm your target database. If you push data faster than MongoDB can write it to disk, you risk:
219+
1. **Lock Contention:** Operations pile up, causing latency spikes for other applications using the cluster.
220+
2. **Memory Saturation:** MongoDB may consume all available RAM, causing the operating system to kill the process (an OOM crash).
221+
3. **Replica Set Lag:** Secondary nodes may fall too far behind, jeopardizing data consistency or triggering elections.
222+
223+
Flow Control acts as an intelligent "brake," ensuring migration speed never exceeds the target's physical capacity.
224+
225+
**Q: What exactly happens when the destination is overloaded?**
226+
**A:** When `docStreamer` detects that the target is stressed (high queue depth or memory usage):
227+
1. **Status Change:** The application enters a `PAUSED` state and logs a warning (e.g., `[WARN] THROTTLING PAUSED`).
228+
2. **Source Throttle:** It stops fetching new documents from DocumentDB immediately.
229+
3. **Connection Keep-Alive:** Existing connections remain open, but no new write operations are sent.
230+
4. **Auto-Resume:** The background monitor keeps polling at the configured interval. As soon as the target's metrics drop below your configured thresholds, the migration automatically resumes exactly where it left off.
231+
232+
**Q: Does this work with Sharded Clusters?**
233+
**A:** **Yes.** This is a Cluster-Aware feature.
234+
* **Standard tools** often only monitor the `mongos` router, which usually reports "healthy" metrics (0 queues) even when backend shards are struggling.
235+
* **docStreamer** automatically discovers your cluster topology and opens direct monitoring connections to **every backend shard**.
236+
* **Protection:** If *any* single shard becomes overloaded, the entire migration pauses. This prevents a "hot shard" scenario where one specific shard causes a cluster-wide failure.
237+
238+
**Q: I see "targetQueuedOps: 0" in the status output. Is Flow Control working?**
239+
**A:** **Yes, this is normal.** MongoDB is highly optimized. A value of `0` means your database is handling the current write load instantly without any backlog.
240+
* **Healthy:** `0 - 10`
241+
* **Warning:** `10 - 50` (Micro-bursts)
242+
* **Critical:** `> 50` (Sustained saturation)
243+
Flow Control only steps in when it sees the critical values you defined in `config.yaml`.
244+
245+
**Q: Will Flow Control slow down my migration?**
246+
**A:** It might slightly extend the total duration, but it **prevents failure**.
247+
* *Without Flow Control:* You might migrate 20% faster, but risk crashing the production database or forcing a restart due to errors.
248+
* *With Flow Control:* You get the maximum *safe* speed your hardware can handle, with zero manual intervention required to prevent crashes.
249+
250+
---
251+
215252
## Q: What happens if the source environment is a Sharded Amazon DocumentDB cluster? Will docStreamer work?
216253
**A:** docStreamer is designed to work with sharded source environments, as long as the connection details point to the cluster's router/endpoint (equivalent to a mongos instance in a MongoDB sharded cluster).
217254

internal/.DS_Store

0 Bytes
Binary file not shown.

internal/cdc/cdc.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111

1212
"github.com/Percona-Lab/percona-docstreamer/internal/checkpoint"
1313
"github.com/Percona-Lab/percona-docstreamer/internal/config"
14+
"github.com/Percona-Lab/percona-docstreamer/internal/flow"
1415
"github.com/Percona-Lab/percona-docstreamer/internal/logging"
1516
"github.com/Percona-Lab/percona-docstreamer/internal/status"
1617
"github.com/Percona-Lab/percona-docstreamer/internal/validator"
@@ -39,6 +40,7 @@ type CDCManager struct {
3940
excludeDBs map[string]bool
4041
excludeColls map[string]bool
4142
fatalErrorChan chan error
43+
flowMgr *flow.Manager
4244
}
4345

4446
// shouldRetry checks if an error is a transient network or connection issue
@@ -57,7 +59,7 @@ func shouldRetry(err error) bool {
5759
strings.Contains(msg, "server selection error")
5860
}
5961

60-
func NewManager(source, target *mongo.Client, checkpointDocID string, startAt bson.Timestamp, checkpoint *checkpoint.Manager, statusMgr *status.Manager, tracker *validator.InFlightTracker, store *validator.Store, valMgr *validator.Manager) *CDCManager {
62+
func NewManager(source, target *mongo.Client, checkpointDocID string, startAt bson.Timestamp, checkpoint *checkpoint.Manager, statusMgr *status.Manager, tracker *validator.InFlightTracker, store *validator.Store, valMgr *validator.Manager, flowMgr *flow.Manager) *CDCManager {
6163
resumeTS, found := checkpoint.GetResumeTimestamp(context.Background(), checkpointDocID)
6264

6365
if !found {
@@ -114,7 +116,8 @@ func NewManager(source, target *mongo.Client, checkpointDocID string, startAt bs
114116
checkpointDocID: checkpointDocID,
115117
excludeDBs: excludeDBs,
116118
excludeColls: excludeColls,
117-
fatalErrorChan: make(chan error, workerCount+1), // Buffer slightly to prevent blocking
119+
fatalErrorChan: make(chan error, workerCount+1),
120+
flowMgr: flowMgr,
118121
}
119122
mgr.totalEventsApplied.Store(initialEvents)
120123
return mgr
@@ -368,6 +371,10 @@ func (m *CDCManager) processChanges(ctx context.Context) {
368371
defer retryTicker.Stop()
369372

370373
for {
374+
// CHECK THROTTLE
375+
if m.flowMgr != nil {
376+
m.flowMgr.Wait()
377+
}
371378
select {
372379
case <-ctx.Done():
373380
for i, writer := range m.bulkWriters {

internal/cloner/copy.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"github.com/Percona-Lab/percona-docstreamer/internal/checkpoint"
1313
"github.com/Percona-Lab/percona-docstreamer/internal/config"
1414
"github.com/Percona-Lab/percona-docstreamer/internal/discover"
15+
"github.com/Percona-Lab/percona-docstreamer/internal/flow"
1516
"github.com/Percona-Lab/percona-docstreamer/internal/indexer"
1617
"github.com/Percona-Lab/percona-docstreamer/internal/logging"
1718
"github.com/Percona-Lab/percona-docstreamer/internal/status"
@@ -196,16 +197,19 @@ type CopyManager struct {
196197
checkpointMgr *checkpoint.Manager
197198
checkpointDocID string
198199
initialMaxKey bson.RawValue
200+
flowMgr *flow.Manager
199201
}
200202

201-
func NewCopyManager(source, target *mongo.Client, collInfo discover.CollectionInfo, statusMgr *status.Manager, checkpointMgr *checkpoint.Manager, checkpointDocID string) *CopyManager {
203+
// Update: NewCopyManager now accepts flowMgr
204+
func NewCopyManager(source, target *mongo.Client, collInfo discover.CollectionInfo, statusMgr *status.Manager, checkpointMgr *checkpoint.Manager, checkpointDocID string, flowMgr *flow.Manager) *CopyManager {
202205
return &CopyManager{
203206
sourceClient: source,
204207
targetClient: target,
205208
collInfo: collInfo,
206209
statusMgr: statusMgr,
207210
checkpointMgr: checkpointMgr,
208211
checkpointDocID: checkpointDocID,
212+
flowMgr: flowMgr,
209213
}
210214
}
211215

@@ -486,6 +490,10 @@ func (cm *CopyManager) readWorker(
486490
logging.PrintStep(fmt.Sprintf("[%s] Read Worker %d started", ns, workerID), 4)
487491

488492
for segment := range segmentQueue {
493+
// CHECK THROTTLE BEFORE READING
494+
if cm.flowMgr != nil {
495+
cm.flowMgr.Wait()
496+
}
489497
var minVal, maxVal interface{}
490498
if err := segment.Min.Unmarshal(&minVal); err != nil {
491499
logging.PrintError(fmt.Sprintf("Min unmarshal failed: %v", err), 0)

internal/config/config.go

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -94,15 +94,25 @@ type ValidationConfig struct {
9494
QueueSize int `mapstructure:"queue_size"`
9595
}
9696

97+
// FlowControlConfig holds settings for adaptive throttling
98+
type FlowControlConfig struct {
99+
Enabled bool `mapstructure:"enabled"`
100+
CheckIntervalMS int `mapstructure:"check_interval_ms"`
101+
TargetMaxQueuedOps int `mapstructure:"target_max_queued_ops"`
102+
TargetMaxResidentMB int `mapstructure:"target_max_resident_mb"`
103+
PauseDurationMS int `mapstructure:"pause_duration_ms"`
104+
}
105+
97106
// Config holds all configuration for the application
98107
type Config struct {
99-
Logging LoggingConfig `mapstructure:"logging"`
100-
DocDB DocDBConfig `mapstructure:"docdb"`
101-
Mongo MongoConfig `mapstructure:"mongo"`
102-
Migration MigrationConfig `mapstructure:"migration"`
103-
Cloner ClonerConfig `mapstructure:"cloner"`
104-
CDC CDCConfig `mapstructure:"cdc"`
105-
Validation ValidationConfig `mapstructure:"validation"`
108+
Logging LoggingConfig `mapstructure:"logging"`
109+
DocDB DocDBConfig `mapstructure:"docdb"`
110+
Mongo MongoConfig `mapstructure:"mongo"`
111+
Migration MigrationConfig `mapstructure:"migration"`
112+
Cloner ClonerConfig `mapstructure:"cloner"`
113+
CDC CDCConfig `mapstructure:"cdc"`
114+
Validation ValidationConfig `mapstructure:"validation"`
115+
FlowControl FlowControlConfig `mapstructure:"flow_control"`
106116
}
107117

108118
// Cfg is the global config object
@@ -175,6 +185,13 @@ func LoadConfig() {
175185
viper.SetDefault("validation.max_retries", 3)
176186
viper.SetDefault("validation.queue_size", 2000)
177187

188+
// Flow control Defaults
189+
viper.SetDefault("flow_control.enabled", true)
190+
viper.SetDefault("flow_control.check_interval_ms", 1000)
191+
viper.SetDefault("flow_control.target_max_queued_ops", 50) // Conservative default
192+
viper.SetDefault("flow_control.target_max_resident_mb", 0) // 0 = disabled
193+
viper.SetDefault("flow_control.pause_duration_ms", 500)
194+
178195
// --- 2. Read config file ---
179196
viper.SetConfigName("config")
180197
viper.SetConfigType("yaml")

0 commit comments

Comments
 (0)