Clean up shuffle.py args and defaults - deliberate compatibility break

lightvector · lightvector · commit fbc6ee530f2d · 2026-06-09T21:01:54.000-04:00
diff --git a/python/selfplay/distributed/download_and_upload_and_shuffle_and_export_loop.sh b/python/selfplay/distributed/download_and_upload_and_shuffle_and_export_loop.sh
@@ -90,7 +90,7 @@ cp -r "$GITROOTDIR"/python/selfplay "$DATED_ARCHIVE"
             sleep 10
 
             echo "BEGINNING SHUFFLE------------------------------"
-            ./shuffle.sh "$basedir" "$tmpdir" "$NTHREADS" "$BATCHSIZE" -summary-file "$basedir"/selfplay.summary.json "$@"
+            ./shuffle.sh "$basedir" "$tmpdir" "$NTHREADS" -summary-file "$basedir"/selfplay.summary.json "$@"
             sleep "$SHUFFLEPERIOD"
         done
     fi
diff --git a/python/selfplay/shuffle.sh b/python/selfplay/shuffle.sh
@@ -4,14 +4,13 @@ set -o pipefail
 #Shuffles and copies selfplay training from selfplay/ to shuffleddata/current/
 #Should be run periodically.
 
-if [[ $# -lt 4 ]]
+if [[ $# -lt 3 ]]
 then
-    echo "Usage: $0 BASEDIR TMPDIR NTHREADS BATCHSIZE"
+    echo "Usage: $0 BASEDIR TMPDIR NTHREADS"
     echo "Currently expects to be run from within the 'python' directory of the KataGo repo, or otherwise in the same dir as shuffle.py."
     echo "BASEDIR containing selfplay data and models and related directories"
     echo "TMPDIR scratch space, ideally on fast local disk, unique to this loop"
     echo "NTHREADS number of parallel threads/processes to use in shuffle"
-    echo "BATCHSIZE number of samples to concat together per batch for training"
     exit 0
 fi
 BASEDIR="$1"
@@ -20,8 +19,6 @@ TMPDIR="$1"
 shift
 NTHREADS="$1"
 shift
-BATCHSIZE="$1"
-shift
 
 #------------------------------------------------------------------------------
 
@@ -50,10 +47,9 @@ then
            -out-tmp-dir "$TMPDIR"/train \
            -approx-rows-per-out-file 70000 \
            -num-processes "$NTHREADS" \
-           -batch-size "$BATCHSIZE" \
+           -keep-target-rows 20000000 \
            -only-include-md5-path-prop-lbound 0.00 \
            -only-include-md5-path-prop-ubound 1.00 \
-           -output-npz \
            "$@" \
            2>&1 | tee "$BASEDIR"/shuffleddata/"$OUTDIR".tmp/outtrain.txt &
 
@@ -70,10 +66,9 @@ else
            -out-tmp-dir "$TMPDIR"/val \
            -approx-rows-per-out-file 70000 \
            -num-processes "$NTHREADS" \
-           -batch-size "$BATCHSIZE" \
+           -keep-target-rows 20000000 \
            -only-include-md5-path-prop-lbound 0.95 \
            -only-include-md5-path-prop-ubound 1.00 \
-           -output-npz \
            "$@" \
            2>&1 | tee "$BASEDIR"/shuffleddata/"$OUTDIR".tmp/outval.txt &
 
@@ -88,10 +83,9 @@ else
            -out-tmp-dir "$TMPDIR"/train \
            -approx-rows-per-out-file 70000 \
            -num-processes "$NTHREADS" \
-           -batch-size "$BATCHSIZE" \
+           -keep-target-rows 20000000 \
            -only-include-md5-path-prop-lbound 0.00 \
            -only-include-md5-path-prop-ubound 0.95 \
-           -output-npz \
            "$@" \
            2>&1 | tee "$BASEDIR"/shuffleddata/"$OUTDIR".tmp/outtrain.txt &
 
diff --git a/python/selfplay/shuffle_and_export_loop.sh b/python/selfplay/shuffle_and_export_loop.sh
@@ -48,7 +48,7 @@ cp -r "$GITROOTDIR"/python/muon "$DATED_ARCHIVE"
     cd "$basedir"/scripts
     while true
     do
-        ./shuffle.sh "$basedir" "$tmpdir" "$NTHREADS" "$BATCHSIZE" "$@"
+        ./shuffle.sh "$basedir" "$tmpdir" "$NTHREADS" "$@"
         sleep 20
     done
 ) >> "$basedir"/logs/outshuffle.txt 2>&1 & disown
diff --git a/python/selfplay/shuffle_loop.sh b/python/selfplay/shuffle_loop.sh
@@ -60,7 +60,7 @@ cp -r "$GITROOTDIR"/python/muon "$DATED_ARCHIVE"
 
         for i in {1..10}
         do
-            ./shuffle.sh "$basedir" "$tmpdir" "$NTHREADS" "$BATCHSIZE" -summary-file "$basedir"/selfplay.summary.json "$@"
+            ./shuffle.sh "$basedir" "$tmpdir" "$NTHREADS" -summary-file "$basedir"/selfplay.summary.json "$@"
             sleep 600
         done
     done
diff --git a/python/selfplay/synchronous_loop.sh b/python/selfplay/synchronous_loop.sh
@@ -102,7 +102,7 @@ do
     (
         # Skip validate since peeling off 5% of data is actually a bit too chunky and discrete when running at a small scale, and validation data
         # doesn't actually add much to debugging a fast-changing RL training.
-        time SKIP_VALIDATE=1 ./shuffle.sh "$BASEDIR" "$SCRATCHDIR" "$NUM_THREADS_FOR_SHUFFLING" "$BATCHSIZE" -min-rows "$SHUFFLE_MINROWS" -keep-target-rows "$SHUFFLE_KEEPROWS" -taper-window-scale "$TAPER_WINDOW_SCALE" | tee -a "$BASEDIR"/logs/outshuffle.txt
+        time SKIP_VALIDATE=1 ./shuffle.sh "$BASEDIR" "$SCRATCHDIR" "$NUM_THREADS_FOR_SHUFFLING" -min-rows "$SHUFFLE_MINROWS" -keep-target-rows "$SHUFFLE_KEEPROWS" -taper-window-scale "$TAPER_WINDOW_SCALE" | tee -a "$BASEDIR"/logs/outshuffle.txt
     )
 
     echo "Train"
diff --git a/python/shuffle.py b/python/shuffle.py
@@ -264,45 +264,33 @@ def write_one_output_file(
     arrs: Sequence[np.ndarray | None],
     out_file_start: int,
     out_file_stop: int,
-    batch_size: int,
-    ensure_batch_multiple: int,
     include_meta: bool,
     include_qvalues: bool,
 ):
     """Write rows [out_file_start, out_file_stop) of arrs to one output npz + json metadata
-
-    Truncates to a whole multiple of (batch_size * ensure_batch_multiple) rows, dropping
-    the leftover partial batch at the end. Returns the number of rows actually written.
+    Returns the number of rows written.
     """
     num_rows = out_file_stop - out_file_start
 
-    # Just truncate and lose the batch at the end, it's fine
-    num_batches = (num_rows // (batch_size * ensure_batch_multiple)) * ensure_batch_multiple
-
-    start = out_file_start
-    stop = out_file_start + num_batches*batch_size
-
     save_output_npz(
         filename=filename,
         arrs=arrs,
         include_meta=include_meta,
         include_qvalues=include_qvalues,
-        start=start,
-        stop=stop,
+        start=out_file_start,
+        stop=out_file_stop,
     )
 
     jsonfilename = os.path.splitext(filename)[0] + ".json"
     with open(jsonfilename,"w") as f:
-        json.dump({"num_rows":num_rows,"num_batches":num_batches},f)
+        json.dump({"num_rows":num_rows},f)
 
-    return num_batches * batch_size
+    return num_rows
 
 def merge_bucket(
     out_filenames: list[str],
     num_shards_to_merge: int,
     out_tmp_dir: str,
-    batch_size: int,
-    ensure_batch_multiple: int,
     include_meta: bool,
     include_qvalues: bool
 ):
@@ -365,8 +353,6 @@ def merge_bucket(
             arrs=concatenated_arrs,
             out_file_start=out_file_start,
             out_file_stop=out_file_stop,
-            batch_size=batch_size,
-            ensure_batch_multiple=ensure_batch_multiple,
             include_meta=include_meta,
             include_qvalues=include_qvalues,
         )
@@ -582,8 +568,6 @@ def run_two_phase_shuffle(
     bucket_tmp_dir,
     worker_group_size,
     keep_prob,
-    batch_size,
-    ensure_batch_multiple,
     include_meta,
     include_qvalues,
     fill_in_qvalues,
@@ -636,7 +620,7 @@ def run_two_phase_shuffle(
                     os.path.join(out_dir, "%s%d_%d.npz" % (out_file_prefix, b, j))
                     for j in range(num_out_files_per_bucket)
                 ],
-                num_shards_to_merge, bucket_tmp_dirs[b], batch_size, ensure_batch_multiple,
+                num_shards_to_merge, bucket_tmp_dirs[b],
                 include_meta, include_qvalues
             )
             for b in range(num_buckets)
@@ -737,16 +721,13 @@ def __exit__(self, exception_type, exception_val, trace):
     If you want to control the "scale" of the power law differently than the min rows, you can specify -taper-window-scale as well.
     There is also a bit of a hack to cap the number of random rows (rows generated by random play without a neural net), since random row generation at the start of a run can be very fast due to not hitting the GPU, and overpopulate the run.
 
-    Additionally, NOT all of the shuffled window is output, only a random shuffled 20M rows will be kept. Adjust this using -keep-target-rows. The intention is that this script will be repeatedly run as new data comes in, such that well before train.py would need more than 20M rows, the data would have been shuffled again and a new random 20M rows chosen.
+    Additionally, NOT all of the shuffled window need be output: -keep-target-rows controls how many rows are randomly sampled and kept (pass 'all' to keep the whole window). For ongoing self-play training the intention is that this script is rerun as new data comes in, such that well before train.py would need more than -keep-target-rows rows, the data would have been reshuffled and a fresh random sample chosen.
 
-    If you are NOT doing ongoing self-play training, but simply want to shuffle an entire dataset (not just a window of it) and want to output all of it (not just 20M of it) then you can use arguments like:
-      -taper-window-exponent 1.0 \\
-      -expand-window-per-row 1.0 \\
-      -keep-target-rows SOME_VERY_LARGE_NUMBER
+    If you are NOT doing ongoing self-play training, but simply want to shuffle an entire dataset (not just a window of it) and output all of it, the default window args already select the whole dataset, so you just need:
+      -keep-target-rows all
 
     If you ARE doing ongoing self-play training, but want a fixed window size, then you can use arguments like:
       -min-rows YOUR_DESIRED_SIZE \\
-      -taper-window-exponent 1.0 \\
       -expand-window-per-row 0.0
 
     ==================================================================
@@ -766,7 +747,7 @@ def __exit__(self, exception_type, exception_val, trace):
     --dry-run-print-resource-cost NUM_DATASET_ROWS
     which assumes the dataset has NUM_DATASET_ROWS total rows and prints estimates instead of shuffling.
     """)
-    parser.add_argument('dirs', metavar='DIR', nargs='+', help='Directories of training data files')
+    parser.add_argument('dirs', metavar='DIR', nargs='*', help='Directories of training data files (not required in --dry-run-print-resource-cost mode)')
 
     required_args = parser.add_argument_group('required arguments')
     optional_args = parser.add_argument_group('optional arguments')
@@ -777,24 +758,21 @@ def __exit__(self, exception_type, exception_val, trace):
         default=argparse.SUPPRESS,
         help='show this help message and exit'
     )
-    optional_args.add_argument('-min-rows', type=int, required=False, help='Minimum training rows to use, default 250k')
-    optional_args.add_argument('-max-rows', type=int, required=False, help='Maximum training rows to use, default unbounded')
-    optional_args.add_argument('-keep-target-rows', type=int, required=False, help='Target number of rows to actually keep in the final data set, default 20M')
-    required_args.add_argument('-expand-window-per-row', type=float, required=True, help='Beyond min rows, initially expand the window by this much every post-random data row')
-    required_args.add_argument('-taper-window-exponent', type=float, required=True, help='Make the window size asymtotically grow as this power of the data rows')
+    optional_args.add_argument('-min-rows', type=int, required=False, help='Minimum size of the desired training window, default 250k')
+    optional_args.add_argument('-max-rows', type=int, required=False, help='Maximum size of the desired training window, default unbounded')
+    required_args.add_argument('-keep-target-rows', required=True, help="Target number of rows to actually sample and keep in the final output shuffle, or 'all' to keep the whole window")
+    optional_args.add_argument('-expand-window-per-row', type=float, required=False, default=1.0, help='Beyond min rows, initially expand the window by this much every post-random data row (default 1.0)')
+    optional_args.add_argument('-taper-window-exponent', type=float, required=False, default=1.0, help='Make the window size asymptotically grow as this power of the data rows (default 1.0)')
     optional_args.add_argument('-taper-window-scale', type=float, required=False, help='The scale at which the power law applies, defaults to -min-rows')
     optional_args.add_argument('-add-to-data-rows', type=float, required=False, help='Compute the window size as if the number of data rows were this much larger/smaller')
-    optional_args.add_argument('-add-to-window-size', type=float, required=False, help='DEPRECATED due to being misnamed name, use -add-to-data-rows')
     optional_args.add_argument('-summary-file', required=False, help='Summary json file for directory contents')
-    required_args.add_argument('-out-dir', required=True, help='Dir to output training files')
-    required_args.add_argument('-out-tmp-dir', required=True, help='Dir to use as scratch space')
+    optional_args.add_argument('-out-dir', required=False, help='Dir to output training files (not required in --dry-run-print-resource-cost mode)')
+    optional_args.add_argument('-out-tmp-dir', required=False, help='Dir to use as scratch space (not required in --dry-run-print-resource-cost mode)')
     optional_args.add_argument('-approx-rows-per-out-file', type=int, required=False, default=70000, help='Number of rows per output file, default 70k')
     optional_args.add_argument('-approx-rows-per-bucket', type=int, required=False, help='Each merge worker takes one whole bucket in RAM and splits it equally into output files. Bigger buckets means shard files. Must be a multiple of -approx-rows-per-out-file. Default: equal to -approx-rows-per-out-file.')
     optional_args.add_argument('-num-waves', type=int, required=False, default=1, help='If > 1, shuffle in this many waves to bound peak intermediate shard count and temp disk usage for very large (whole-dataset) shuffles. Default 1 (no waves).')
     optional_args.add_argument('--dry-run-print-resource-cost', type=int, required=False, metavar='NUM_DATASET_ROWS', help='Do not actually shuffle (or even scan the dataset). Assume the dataset has this many total rows, run the window-size / keep / md5-filter math, and print rough estimates of output files, peak intermediate shard count, peak temp disk usage, and peak memory. Assumes 19x19 data and typical measured per-row sizes.')
     required_args.add_argument('-num-processes', type=int, required=True, help='Number of multiprocessing processes for shuffling in parallel')
-    required_args.add_argument('-batch-size', type=int, required=True, help='Batch size to write training examples in')
-    optional_args.add_argument('-ensure-batch-multiple', type=int, required=False, help='Ensure each file is a multiple of this many batches')
     optional_args.add_argument('-worker-group-size', type=int, required=False, default=80000, help='Internally, target having many rows per parallel sharding worker (doesnt affect merge)')
     optional_args.add_argument('-exclude', required=False, help='Text file with npzs to ignore, one per line')
     optional_args.add_argument('-exclude-prefix', required=False, help='Prefix to concat to lines in exclude to produce the full file path')
@@ -803,24 +781,23 @@ def __exit__(self, exception_type, exception_val, trace):
     optional_args.add_argument('-only-include-md5-path-prop-ubound', type=float, required=False, help='Just before sharding, include only filepaths hashing to float < this')
     optional_args.add_argument('-skip-mtime-range-start', type=float, required=False, help='')
     optional_args.add_argument('-skip-mtime-range-end', type=float, required=False, help='')
-    optional_args.add_argument('-output-npz', action="store_true", required=False, help='Output results as npz files')
     optional_args.add_argument('-include-meta', action="store_true", required=False, help='Include sgf metadata inputs')
     optional_args.add_argument('-exclude-qvalues', action="store_true", required=False, help='Exclude Q-value targets (for backwards compatibility with pre-v1.16)')
 
     args = parser.parse_args()
     dirs = args.dirs
     min_rows = args.min_rows
     max_rows = args.max_rows
-    keep_target_rows = args.keep_target_rows
+    # -keep-target-rows is required, and accepts 'all' to mean "keep the whole window"
+    # (represented internally as None, i.e. no cap).
+    if str(args.keep_target_rows).lower() == "all":
+        keep_target_rows = None
+    else:
+        keep_target_rows = int(args.keep_target_rows)
     expand_window_per_row = args.expand_window_per_row
     taper_window_exponent = args.taper_window_exponent
     taper_window_scale = args.taper_window_scale
     add_to_data_rows = args.add_to_data_rows
-    if args.add_to_data_rows is not None and args.add_to_window_size is not None:
-        print("Cannot specify both -add-to-data-rows and -add-to-window-size. Please use only -add-to-data-rows, -add-to-window-size is deprecated")
-    if args.add_to_data_rows is None and args.add_to_window_size is not None:
-        print("WARNING: -add-to-window-size is deprecated due to being misnamed, use -add-to-data-rows")
-        add_to_data_rows = args.add_to_window_size
 
     summary_file = args.summary_file
     out_dir = args.out_dir
@@ -843,10 +820,6 @@ def __exit__(self, exception_type, exception_val, trace):
     if num_waves < 1:
         raise ValueError("-num-waves must be >= 1")
     num_processes = args.num_processes
-    batch_size = args.batch_size
-    ensure_batch_multiple = 1
-    if args.ensure_batch_multiple is not None:
-        ensure_batch_multiple = args.ensure_batch_multiple
     worker_group_size = args.worker_group_size
     exclude = args.exclude
     exclude_prefix = args.exclude_prefix
@@ -857,21 +830,22 @@ def __exit__(self, exception_type, exception_val, trace):
     only_include_md5_path_prop_ubound = args.only_include_md5_path_prop_ubound
     skip_mtime_range_start = args.skip_mtime_range_start
     skip_mtime_range_end = args.skip_mtime_range_end
-    output_npz = args.output_npz
     include_meta = args.include_meta
     include_qvalues = not args.exclude_qvalues
     dry_run_print_resource_cost = args.dry_run_print_resource_cost
 
-    if not output_npz and dry_run_print_resource_cost is None:
-        raise AssertionError("No longer supports outputting tensorflow data")
+    # dirs / out-dir / out-tmp-dir are only needed for a real run, not for the dry run.
+    if dry_run_print_resource_cost is None:
+        if len(dirs) <= 0:
+            raise ValueError("At least one input directory is required (except in --dry-run-print-resource-cost mode)")
+        if out_dir is None:
+            raise ValueError("-out-dir is required (except in --dry-run-print-resource-cost mode)")
+        if out_tmp_dir is None:
+            raise ValueError("-out-tmp-dir is required (except in --dry-run-print-resource-cost mode)")
 
     if min_rows is None:
         print("NOTE: -min-rows was not specified, defaulting to requiring 250K rows before shuffling.")
         min_rows = 250000
-    if keep_target_rows is None:
-        print("NOTE: -keep-target-rows was not specified, defaulting to sampling a random 20M rows out of the computed window.")
-        print("If you intended to shuffle the whole dataset instead, pass in -keep-target-rows <very large number>")
-        keep_target_rows = 20000000
     if add_to_data_rows is None:
         add_to_data_rows = 0
 
@@ -1234,8 +1208,6 @@ def num_usable_rows():
                 bucket_tmp_dir=bucket_tmp_dir,
                 worker_group_size=worker_group_size,
                 keep_prob=keep_prob,
-                batch_size=batch_size,
-                ensure_batch_multiple=ensure_batch_multiple,
                 include_meta=include_meta,
                 include_qvalues=include_qvalues,
                 fill_in_qvalues=True,
@@ -1316,8 +1288,6 @@ def num_usable_rows():
                     bucket_tmp_dir=bucket_tmp_dir,
                     worker_group_size=worker_group_size,
                     keep_prob=1.0,  # keep_prob already applied in phase 1
-                    batch_size=batch_size,
-                    ensure_batch_multiple=ensure_batch_multiple,
                     include_meta=include_meta,
                     include_qvalues=include_qvalues,
                     fill_in_qvalues=False,  # wave shards already contain qValueTargetsNCMove

Original file line number	Diff line number	Diff line change
`@@ -102,7 +102,7 @@ do`
`102`	`102`	`(`
`103`	`103`	`# Skip validate since peeling off 5% of data is actually a bit too chunky and discrete when running at a small scale, and validation data`
`104`	`104`	`# doesn't actually add much to debugging a fast-changing RL training.`
`105`		`- time SKIP_VALIDATE=1 ./shuffle.sh "$BASEDIR" "$SCRATCHDIR" "$NUM_THREADS_FOR_SHUFFLING" "$BATCHSIZE" -min-rows "$SHUFFLE_MINROWS" -keep-target-rows "$SHUFFLE_KEEPROWS" -taper-window-scale "$TAPER_WINDOW_SCALE" \| tee -a "$BASEDIR"/logs/outshuffle.txt`
	`105`	`+ time SKIP_VALIDATE=1 ./shuffle.sh "$BASEDIR" "$SCRATCHDIR" "$NUM_THREADS_FOR_SHUFFLING" -min-rows "$SHUFFLE_MINROWS" -keep-target-rows "$SHUFFLE_KEEPROWS" -taper-window-scale "$TAPER_WINDOW_SCALE" \| tee -a "$BASEDIR"/logs/outshuffle.txt`
`106`	`106`	`)`
`107`	`107`
`108`	`108`	`echo "Train"`