From 437017b80c0f16402590e5272e6d0749cd76f169 Mon Sep 17 00:00:00 2001 From: rajratamore-debug Date: Tue, 23 Dec 2025 11:52:57 +0300 Subject: [PATCH 1/9] windows compatible file --- generative_data_prep/__main__.py | 20 +++- generative_data_prep/data_prep/data_prep.py | 4 +- generative_data_prep/data_prep/pipeline.py | 104 +++++++++--------- .../utils/large_file_shuffle.py | 65 +++++++++-- generative_data_prep/utils/logger.py | 17 ++- generative_data_prep/utils/utils.py | 2 +- 6 files changed, 143 insertions(+), 69 deletions(-) diff --git a/generative_data_prep/__main__.py b/generative_data_prep/__main__.py index 85811f2e..d0645a1a 100644 --- a/generative_data_prep/__main__.py +++ b/generative_data_prep/__main__.py @@ -46,6 +46,24 @@ logger = logging.getLogger("generative_data_prep_logger") logging.config.fileConfig(get_config_file_path()) +# Fix Unicode encoding issues on Windows console +import sys + +# Configure stdout/stderr to handle Unicode encoding errors on Windows +if sys.platform == "win32": + # Try to reconfigure streams to use UTF-8 with error replacement + if hasattr(sys.stdout, 'reconfigure'): + try: + sys.stdout.reconfigure(encoding='utf-8', errors='replace') + except (AttributeError, ValueError): + pass + if hasattr(sys.stderr, 'reconfigure'): + try: + sys.stderr.reconfigure(encoding='utf-8', errors='replace') + except (AttributeError, ValueError): + pass + + def add_special_tokens_dict(tokenizer: PreTrainedTokenizerBase, special_tokens_dict: str): """Add the special tokens dictionary to tokenizer. @@ -302,4 +320,4 @@ def run_with_training_args( parser = get_arg_parser() data_prep_args = parser.parse_args() data_prep_args = check_deprecated_args(data_prep_args) - main(data_prep_args) + main(data_prep_args) diff --git a/generative_data_prep/data_prep/data_prep.py b/generative_data_prep/data_prep/data_prep.py index 7d970d6d..85c24a37 100644 --- a/generative_data_prep/data_prep/data_prep.py +++ b/generative_data_prep/data_prep/data_prep.py @@ -106,7 +106,7 @@ def data_prep_main( dump_categories = category_to_id is not None with Hdf5FileBuffer(output_file, max_seq_length, dump_categories) as hdf5_text_buffer: - with open(input_file, "r") as reader: + with open(input_file, "r", encoding="utf-8", errors="replace") as reader: for i, line in enumerate(reader): try: hdf5_text_buffer.write(article_tokenizer(line)) @@ -119,7 +119,7 @@ def data_prep_main( num_tokenized_articles.value += 100 except json.JSONDecodeError as exc: if ignore_input_format_error: - with open(error_log_path, "a") as f: + with open(error_log_path, "a", encoding="utf-8", errors="replace") as f: f.write(line) if num_tokenized_articles_lock is not None and num_skipped_articles is not None: with num_tokenized_articles_lock: diff --git a/generative_data_prep/data_prep/pipeline.py b/generative_data_prep/data_prep/pipeline.py index bd8d5739..976b2dee 100644 --- a/generative_data_prep/data_prep/pipeline.py +++ b/generative_data_prep/data_prep/pipeline.py @@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - + Data preparation pipeline for converting a jsonl file to tokenized hdf5 files consumable by SambaSuite. """ @@ -23,6 +23,7 @@ import os import random import shutil +import sys import time import uuid from pathlib import Path @@ -35,6 +36,13 @@ from alive_progress import alive_bar from transformers import PretrainedConfig, PreTrainedTokenizerBase +# Set multiprocessing start method for Windows compatibility +if sys.platform == "win32": + try: + multiprocessing.set_start_method("spawn", force=True) + except RuntimeError: + # Start method already set, ignore + pass from generative_data_prep.data_prep import data_prep_main from generative_data_prep.processors.metrics import Metrics from generative_data_prep.utils import ( @@ -91,11 +99,11 @@ def combine_input_dir_files(input_path: str) -> Tuple[str, List[Path]]: output_file = input_path_obj / f"combined_output_{uuid.uuid4().hex[:8]}{ext}" # Open the output file and concatenate all input files - with open(output_file, "w") as f_out: + with open(output_file, "w", encoding="utf-8", errors="replace") as f_out: for input_file in input_files: if "combined_output_" not in str(input_file): verify_input_file(str(input_file)) - with open(input_file, "r") as f_in: + with open(input_file, "r", encoding="utf-8", errors="replace") as f_in: if input_file.stat().st_size == 0: continue # Skip empty files @@ -104,17 +112,33 @@ def combine_input_dir_files(input_path: str) -> Tuple[str, List[Path]]: return str(output_file), input_files -def split_file_linux(num_splits: int, input_file_path: str, split_dir: str) -> None: - """Split the [input_file_path] into num_splits and places it in [split_dir]. +def split_file_round_robin(num_splits: int, input_file_path: str, split_dir: str) -> None: + """Split the [input_file_path] into num_splits and places it in [split_dir] using round-robin distribution. + + This is a cross-platform replacement for the Linux 'split -d -n r/' command. Args: num_splits (int): number of output file splits input_file_path (str): input jsonl file path split_dir (str): The directory to place all the outputted splits """ - split_command = f"split -d -n r/{num_splits} {input_file_path} {split_dir}/" - execute_and_return_stdout(split_command) - + # Create file handles for all split files + split_files = [] + num_digits = len(str(num_splits)) + for i in range(num_splits): + out_file_path = os.path.join(split_dir, str(i).zfill(max(2, num_digits))) + split_files.append(open(out_file_path, "w", encoding="utf-8", errors="replace")) + + try: + # Read input file and distribute lines in round-robin fashion + with open(input_file_path, "r", encoding="utf-8", errors="replace") as infile: + for line_num, line in enumerate(infile): + split_index = line_num % num_splits + split_files[split_index].write(line) + finally: + # Close all file handles + for f in split_files: + f.close() def check_RAM(input_file_size_in_bytes: int): """Check to make sure there is enough RAM on the system to fit [input_file_size_in_bytes]. @@ -200,7 +224,7 @@ def estimate_total_num_articles(files_to_tokenize, split_dir): Estimate of the total number of articles needed to tokenize """ lines_per_file = 0 - with open(os.path.join(split_dir, files_to_tokenize[0]), "r") as file: + with open(os.path.join(split_dir, files_to_tokenize[0]), "r", encoding="utf-8", errors="replace") as file: for _ in file: lines_per_file += 1 @@ -354,6 +378,10 @@ def multiprocess_data_prep( # noqa: C901 prev_num_tokenized_articles = 0 prev_num_skipped_articles = 0 # Submit multiprocessing workers + # On Windows, reduce workers to avoid pickling issues with large tokenizers + if sys.platform == "win32" and num_workers > 4: + LOGGER.warning(f"Reducing workers from {num_workers} to 4 on Windows to avoid multiprocessing issues.") + num_workers = 4 executor = concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) futures = [] for input_file_path, output_file_path in zip(sub_input_file_paths, sub_output_file_paths): @@ -580,7 +608,7 @@ def pipeline_main( # noqa: C901 ) num_splits_greater_lines = False - with open(input_file_path, "r") as input_file: + with open(input_file_path, "r", encoding="utf-8", errors="replace") as input_file: for i, line in enumerate(input_file): if i > num_splits: num_splits_greater_lines = True @@ -622,66 +650,32 @@ def pipeline_main( # noqa: C901 # ========================================================= # Case 1: large file shuffle specified. REQUIRES: linux OS if shuffle == "large_file": - err_msg = "You specified --shuffle=large_file, but this is only supported on linux operating systems, " - err_msg += f"your operating system is {platform}. Please change the flag to --shuffle=on_RAM or --shuffle=False" - if "linux" not in platform.lower(): - raise OSError(err_msg) split_dir = large_file_shuffle(input_file_path, output_dir, False, num_splits) - # Case 2: Shuffling on RAM with linux OS - elif shuffle == "on_RAM" and "linux" in platform.lower(): + # Case 2: Shuffling on RAM (cross-platform) + elif shuffle == "on_RAM": check_RAM(input_file_size_in_bytes) log_sep_str() LOGGER.info("Shuffling input file, please be patient.") - file_ext = os.path.splitext(input_file_path)[1] - shuffle_file_path = os.path.join(output_dir, f"tmp_shuf{file_ext}") - shuffle_command = f"shuf {input_file_path} > {shuffle_file_path}" - try: - out = execute_and_return_stdout(shuffle_command) - err_msg = f"Shuffle command killed, with print stdout:{out.stdout} stderr:{out.stderr}" - if "killed" in out.stdout or "killed" in out.stderr: - raise MemoryError(err_msg) - except Exception as e: - err_msg = f"Failed with exception {e}, shuffling on RAM is not possible," - err_msg += " try specifying argument --shuffle=large_file" - raise RuntimeError(err_msg) - split_file_linux(num_splits, shuffle_file_path, split_dir) - os.remove(shuffle_file_path) - - # Case 3: shuffle on RAM without linux OS - elif shuffle == "on_RAM" and "linux" not in platform.lower(): - check_RAM(input_file_size_in_bytes) - lines = open(input_file_path).readlines() + # Read all lines into memory + with open(input_file_path, "r", encoding="utf-8", errors="replace") as f: + lines = f.readlines() + # Shuffle the lines random.shuffle(lines) + # Split into chunks splits = np.array_split(lines, num_splits) num_digits = len(str(num_splits)) for i, split in enumerate(splits): out_file_path = os.path.join(split_dir, str(i).zfill(max(2, num_digits))) - with open(out_file_path, "w") as out_file: + with open(out_file_path, "w", encoding="utf-8", errors="replace") as out_file: out_file.writelines(split) - # Case 4: Do not shuffle, split file without linux OS - elif shuffle == "False" and "linux" not in platform.lower(): + # Case 3: Do not shuffle, split file (cross-platform) + elif shuffle == "False": log_sep_str() LOGGER.warning("WARNING: you did not specify the --shuffle flag, so no shuffling was done!") - out_files = [] - num_digits = len(str(num_splits)) - for i in range(num_splits): - out_file_path = os.path.join(split_dir, str(i).zfill(max(2, num_digits))) - out_files.append(out_file_path) - with open(out_file_path, "w") as _: - pass - - with open(input_file_path, "r") as input_file: - for i, line in enumerate(input_file): - with open(out_files[i % len(out_files)], "a") as out_f: - out_f.write(line) + split_file_round_robin(num_splits, input_file_path, split_dir) - # Case 5: Do not shuffle, split file with linux OS - elif shuffle == "False" and "linux" in platform.lower(): - log_sep_str() - LOGGER.warning("WARNING: you did not specify the --shuffle flag, so no shuffling was done!") - split_file_linux(num_splits, input_file_path, split_dir) # rename files to include the corresponding names of 'test', 'dev' and 'train' files_to_tokenize = rename_files( diff --git a/generative_data_prep/utils/large_file_shuffle.py b/generative_data_prep/utils/large_file_shuffle.py index ffb3a227..cd990d39 100644 --- a/generative_data_prep/utils/large_file_shuffle.py +++ b/generative_data_prep/utils/large_file_shuffle.py @@ -25,6 +25,54 @@ LOGGER = logging.getLogger("generative_data_prep_logger") +def _split_file_round_robin(input_file_path: str, split_dir: str, num_splits: int): + """Split a file into multiple files using round-robin distribution. + + This is a cross-platform replacement for the Linux 'split -d -n r/' command. + Each line from the input file is distributed to output files in round-robin fashion. + + Args: + input_file_path (str): Path to the input file to split + split_dir (str): Directory where split files will be created + num_splits (int): Number of split files to create + """ + # Create file handles for all split files + split_files = [] + for i in range(num_splits): + split_file_path = os.path.join(split_dir, f"x{i:02d}") + split_files.append(open(split_file_path, "w", encoding="utf-8", errors="replace")) + + try: + # Read input file and distribute lines in round-robin fashion + with open(input_file_path, "r", encoding="utf-8", errors="replace") as infile: + for line_num, line in enumerate(infile): + split_index = line_num % num_splits + split_files[split_index].write(line) + finally: + # Close all file handles + for f in split_files: + f.close() + + +def _shuffle_file(file_path: str): + """Shuffle the lines of a file in-place. + + This is a cross-platform replacement for the Linux 'shuf' command. + + Args: + file_path (str): Path to the file to shuffle + """ + # Read all lines + with open(file_path, "r", encoding="utf-8", errors="replace") as f: + lines = f.readlines() + + # Shuffle the lines + random.shuffle(lines) + + # Write back to file + with open(file_path, "w", encoding="utf-8", errors="replace") as f: + f.writelines(lines) + def large_file_shuffle( input_file_path: str, output_dir: str, @@ -86,8 +134,7 @@ def large_file_shuffle( prev_time = time.time() LOGGER.info("splitting file") - split_command = f"split -d -n r/{num_splits} {input_file_path} {split_dir}/" - os.system(split_command) # nosec + _split_file_round_robin(input_file_path, split_dir, num_splits) LOGGER.info(f"splitting took {time.time() - prev_time} seconds (used round robin splitting).") prev_time = time.time() @@ -95,19 +142,19 @@ def large_file_shuffle( file_list = list(os.listdir(split_dir)) for file in tqdm(file_list): curr_file_path = os.path.join(split_dir, file) - shuf_command = f"shuf {curr_file_path} --output={curr_file_path}" - os.system(shuf_command) # nosec + _shuffle_file(curr_file_path) if concat_splits: random_split_list = list(range(num_splits)) random.shuffle(random_split_list) prev_time = time.time() LOGGER.info("Concatenating shuffled splits.") - for rand_ind in tqdm(random_split_list): - curr_file_path = os.path.join(split_dir, file_list[rand_ind]) - concat_command = f"cat {curr_file_path} >> {output_path}" - os.system(concat_command) # nosec - os.remove(curr_file_path) + with open(output_path, "wb") as outfile: + for rand_ind in tqdm(random_split_list): + curr_file_path = os.path.join(split_dir, file_list[rand_ind]) + with open(curr_file_path, "rb") as infile: + shutil.copyfileobj(infile, outfile) + os.remove(curr_file_path) LOGGER.info(f"Finished concatenating files. Took {time.time() - prev_time} seconds.") shutil.rmtree(split_dir) diff --git a/generative_data_prep/utils/logger.py b/generative_data_prep/utils/logger.py index 716948c8..c7bdf61d 100644 --- a/generative_data_prep/utils/logger.py +++ b/generative_data_prep/utils/logger.py @@ -89,7 +89,22 @@ def log_input_args(args): def log_metrics(metrics): """Log the metrics table.""" if not metrics.is_empty: - LOGGER.info(f"{get_header('')}\n{metrics}\n{get_header('')}") + metrics_str = f"{get_header('')}\n{metrics}\n{get_header('')}" + # Replace Unicode box-drawing characters with ASCII equivalents for Windows compatibility + if sys.platform == "win32": + replacements = { + '╒': '+', '═': '=', '╤': '+', '╕': '+', + '├': '+', '─': '-', '┼': '+', '┤': '+', + '╘': '+', '╧': '+', '╛': '+', '│': '|' + } + for old, new in replacements.items(): + metrics_str = metrics_str.replace(old, new) + try: + LOGGER.info(metrics_str) + except UnicodeEncodeError: + # Fallback: encode as ASCII with replacement + safe_str = metrics_str.encode('ascii', errors='replace').decode('ascii') + LOGGER.info(safe_str) def get_header(header_name: str): diff --git a/generative_data_prep/utils/utils.py b/generative_data_prep/utils/utils.py index 7ce63290..58076b68 100644 --- a/generative_data_prep/utils/utils.py +++ b/generative_data_prep/utils/utils.py @@ -80,7 +80,7 @@ def validate_sha256(output_dir: str): """ files_to_hash = _get_walk_files_to_hash(output_dir, "sha256") sha_info_file = os.path.join(output_dir, "sha256", "files_metadata.json") - with open(sha_info_file, "r") as output_file: + with open(sha_info_file, "r", encoding="utf-8", errors="replace") as output_file: file_info_dict = json.load(output_file) for file, hash_file_name in files_to_hash: if "logs" not in hash_file_name: From 2ecad932d3734aa3acb012d3e7e382fd9e7979b8 Mon Sep 17 00:00:00 2001 From: Rajrantan More Date: Tue, 23 Dec 2025 17:29:05 +0530 Subject: [PATCH 2/9] updated the progress bar --- generative_data_prep/__main__.py | 2 +- generative_data_prep/data_prep/data_prep.py | 16 +- generative_data_prep/data_prep/pipeline.py | 182 ++++++++++++++++-- .../utils/add_metadata_to_dataset.py | 4 +- generative_data_prep/utils/utils.py | 2 +- 5 files changed, 181 insertions(+), 25 deletions(-) diff --git a/generative_data_prep/__main__.py b/generative_data_prep/__main__.py index d0645a1a..4157bc3b 100644 --- a/generative_data_prep/__main__.py +++ b/generative_data_prep/__main__.py @@ -147,7 +147,7 @@ def get_categories(categories_path: str): _, file_extension = os.path.splitext(categories_path) if file_extension != ".json": raise ValueError(f"Your --categories_path flag must point to a json file, you used {categories_path}") - with open(categories_path, "r") as categories_file: + with open(categories_path, "r", encoding="utf-8") as categories_file: categories_list = json.load(categories_file) if not isinstance(categories_list, list): err_msg = ( diff --git a/generative_data_prep/data_prep/data_prep.py b/generative_data_prep/data_prep/data_prep.py index 85c24a37..34a056d3 100644 --- a/generative_data_prep/data_prep/data_prep.py +++ b/generative_data_prep/data_prep/data_prep.py @@ -106,12 +106,16 @@ def data_prep_main( dump_categories = category_to_id is not None with Hdf5FileBuffer(output_file, max_seq_length, dump_categories) as hdf5_text_buffer: + total_processed = 0 with open(input_file, "r", encoding="utf-8", errors="replace") as reader: for i, line in enumerate(reader): try: hdf5_text_buffer.write(article_tokenizer(line)) + total_processed = i + 1 # Track total processed (i is 0-indexed) + # Update counter every 100 articles (including the first batch) + # When i+1 is a multiple of 100, we've processed exactly that many if ( - (i != 0 and i % 100 == 0) + total_processed % 100 == 0 and num_tokenized_articles_lock is not None and num_tokenized_articles is not None ): @@ -133,9 +137,15 @@ def data_prep_main( exc.doc, exc.pos, ) from exc - if num_tokenized_articles_lock is not None and num_tokenized_articles is not None: + # Add remaining articles that weren't counted in the batch updates + if num_tokenized_articles_lock is not None and num_tokenized_articles is not None and total_processed > 0: with num_tokenized_articles_lock: - num_tokenized_articles.value += i % 100 + # Calculate remaining articles: total processed minus what we already counted + # We count in batches of 100, so we need to add the remainder + already_counted = (total_processed // 100) * 100 # How many we've already counted + remaining = total_processed - already_counted + if remaining > 0: + num_tokenized_articles.value += remaining hdf5_text_buffer.write(article_tokenizer(None)) article_tokenizer.metrics.dataset_type = dataset_type return article_tokenizer.metrics diff --git a/generative_data_prep/data_prep/pipeline.py b/generative_data_prep/data_prep/pipeline.py index 976b2dee..4b2ff2b0 100644 --- a/generative_data_prep/data_prep/pipeline.py +++ b/generative_data_prep/data_prep/pipeline.py @@ -213,8 +213,40 @@ def rename_files( return files_to_tokenize +def count_exact_total_num_articles(files_to_tokenize, split_dir): + """Counts the exact total number of articles by counting all non-empty lines in all files. + + Args: + files_to_tokenize: List of files to tokenize. + split_dir: Directory where the split files are located. + + Returns: + Exact count of the total number of articles to tokenize + """ + if not files_to_tokenize: + return 0 + + total_lines = 0 + LOGGER.info(f"Counting articles in {len(files_to_tokenize)} files to get exact total...") + + for file_name in files_to_tokenize: + file_path = os.path.join(split_dir, file_name) + lines_in_file = 0 + with open(file_path, "r", encoding="utf-8", errors="replace") as file: + for line in file: + # Skip empty lines to match actual processing behavior + if line.strip(): + lines_in_file += 1 + total_lines += lines_in_file + + LOGGER.info(f"Exact total articles counted: {total_lines}") + return total_lines + + def estimate_total_num_articles(files_to_tokenize, split_dir): - """Estimates the total number of articles based on number of artiles in first split times number of splits. + """Estimates the total number of articles based on number of articles in sample files times number of splits. + + DEPRECATED: Use count_exact_total_num_articles for exact count instead. Args: files_to_tokenize: List of files to tokenize. @@ -223,12 +255,31 @@ def estimate_total_num_articles(files_to_tokenize, split_dir): Returns: Estimate of the total number of articles needed to tokenize """ - lines_per_file = 0 - with open(os.path.join(split_dir, files_to_tokenize[0]), "r", encoding="utf-8", errors="replace") as file: - for _ in file: - lines_per_file += 1 - - return lines_per_file * len(files_to_tokenize) + if not files_to_tokenize: + return 0 + + # Sample up to 5 files to get a better average estimate + sample_size = min(5, len(files_to_tokenize)) + total_lines = 0 + files_sampled = 0 + + for i in range(sample_size): + file_path = os.path.join(split_dir, files_to_tokenize[i]) + lines_in_file = 0 + with open(file_path, "r", encoding="utf-8", errors="replace") as file: + for line in file: + # Skip empty lines to match actual processing behavior + if line.strip(): + lines_in_file += 1 + total_lines += lines_in_file + files_sampled += 1 + + if files_sampled == 0: + return 0 + + # Calculate average lines per file and multiply by total files + avg_lines_per_file = total_lines / files_sampled + return int(avg_lines_per_file * len(files_to_tokenize)) def get_split_counts( @@ -369,7 +420,8 @@ def multiprocess_data_prep( # noqa: C901 ) train_hdf5_files = list(filter(lambda file_name: "train" in file_name, sub_output_file_paths)) dev_hdf5_files = list(filter(lambda file_name: "dev" in file_name, sub_output_file_paths)) - total_num_articles = estimate_total_num_articles(files_to_tokenize, split_dir) + # Count exact total to guarantee 100% accuracy + total_num_articles = count_exact_total_num_articles(files_to_tokenize, split_dir) # create manager for shared variables to keep track of tokenization progress manager = multiprocessing.Manager() num_tokenized_articles_lock = manager.Lock() @@ -377,6 +429,8 @@ def multiprocess_data_prep( # noqa: C901 num_skipped_articles = manager.Value(int, 0) prev_num_tokenized_articles = 0 prev_num_skipped_articles = 0 + # Track how much we've actually updated the progress bar to prevent exceeding total + bar_update_tracker = 0 # Submit multiprocessing workers # On Windows, reduce workers to avoid pickling issues with large tokenizers if sys.platform == "win32" and num_workers > 4: @@ -429,7 +483,8 @@ def multiprocess_data_prep( # noqa: C901 tokenization_start_time = time.time() finished_futures = set() # Loop while processes are running, update progress bar. - with alive_bar(total_num_articles) as bar: + # Use manual mode to have better control over the progress bar + with alive_bar(total_num_articles, manual=True, title="Tokenizing articles") as bar: while True: for i, future in enumerate(futures): if future.done() and future not in finished_futures: @@ -473,15 +528,50 @@ def multiprocess_data_prep( # noqa: C901 if all(future.done() for future in futures): if len(finished_futures) != len(futures): raise ValueError("All futures done, but finished futures set does not equal all futures list.") + # Final update to ensure progress bar reflects all processed articles + with num_tokenized_articles_lock: + num_new_tokenized_articles = num_tokenized_articles.value - prev_num_tokenized_articles + if num_new_tokenized_articles > 0: + # Use our tracker to ensure we never exceed total + remaining_until_total = max(0, total_num_articles - bar_update_tracker) + if remaining_until_total > 0: + # Cap update to not exceed total + max_update = min(num_new_tokenized_articles, remaining_until_total) + if max_update > 0: + bar_update_tracker += max_update + # Set bar to exact position (as fraction of total, capped at 1.0) + bar_position = min(1.0, bar_update_tracker / total_num_articles) if total_num_articles > 0 else 0.0 + bar(bar_position) + # Ensure progress bar reaches exactly 100% (1.0 in manual mode) + # Use tracker to set final position + bar_update_tracker = total_num_articles + bar(1.0) # Set to 100% completion break # Update the progress bar with how every many new articles were tokenized with num_tokenized_articles_lock: num_new_tokenized_articles = num_tokenized_articles.value - prev_num_tokenized_articles - bar(num_new_tokenized_articles) - perc_complete = round((bar.current / total_num_articles) * 100, 2) + if num_new_tokenized_articles > 0: + # Use our tracker to ensure we never exceed total + remaining_until_total = max(0, total_num_articles - bar_update_tracker) + # Only update if there's room and we have new articles + if remaining_until_total > 0: + # Cap update to not exceed total + max_update = min(num_new_tokenized_articles, remaining_until_total) + if max_update > 0: + bar_update_tracker += max_update + # Set bar to exact position (as fraction of total, capped at 1.0) + bar_position = min(1.0, bar_update_tracker / total_num_articles) if total_num_articles > 0 else 0.0 + bar(bar_position) + # Calculate percentage based on our tracker (more accurate than bar.current in manual mode) + if total_num_articles > 0: + # Use tracker to calculate accurate percentage + actual_current = min(bar_update_tracker, total_num_articles) + perc_complete = min(100.0, round((actual_current / total_num_articles) * 100, 2)) + else: + perc_complete = 0.0 elapsed_time_str = f"--- elapsed time: {time.time() - tokenization_start_time}" LOGGER.debug( - f"{total_num_articles}, {perc_complete}% complete => Time remaining: {bar.eta} {elapsed_time_str}" + f"Counter: {num_tokenized_articles.value}, Progress tracker: {bar_update_tracker}/{total_num_articles}, {perc_complete}% complete => Time remaining: {bar.eta} {elapsed_time_str}" ) prev_num_tokenized_articles = num_tokenized_articles.value @@ -492,9 +582,65 @@ def multiprocess_data_prep( # noqa: C901 prev_num_skipped_articles = num_skipped_articles.value time.sleep(5) + # Log final article count and validate 100% completion + log_sep_str() + total_actual_articles = train_metrics.articles + dev_metrics.articles + LOGGER.info(f"Total articles processed (from metrics): {total_actual_articles} (Train: {train_metrics.articles}, Dev: {dev_metrics.articles})") + LOGGER.info(f"Total articles counted in input files: {total_num_articles}") + if ignore_input_format_error: - LOGGER.info(f"Total processed lines: {num_tokenized_articles.value}") - LOGGER.info(f"Total skipped lines: {num_skipped_articles.value}") + LOGGER.info(f"Progress counter value: {num_tokenized_articles.value}") + LOGGER.info(f"Total skipped lines (format errors): {num_skipped_articles.value}") + + # Validate 100% completion + if total_num_articles > 0: + counter_articles = num_tokenized_articles.value + metrics_articles = total_actual_articles + skipped_articles = num_skipped_articles.value if ignore_input_format_error else 0 + + # Calculate expected articles (total - skipped due to format errors) + # Note: Articles dropped during processing (prompt-only, packing drops) are still counted in metrics.articles + # because metrics.articles is incremented before processing/dropping + expected_articles = total_num_articles - skipped_articles + + # Compare metrics with expected count + metrics_diff = abs(metrics_articles - expected_articles) + metrics_diff_percent = (metrics_diff / total_num_articles) * 100 if total_num_articles > 0 else 0 + + log_sep_str() + if metrics_diff == 0: + LOGGER.info(f"[SUCCESS] 100% DATA UTILIZATION: All {total_num_articles} articles from input files were processed!") + if skipped_articles > 0: + LOGGER.info(f" Note: {skipped_articles} articles were skipped due to JSON format errors (expected)") + LOGGER.info(f" All {metrics_articles} processed articles are included in the output dataset.") + elif metrics_diff_percent <= 0.1: # Less than 0.1% difference + LOGGER.warning( + f"Near-complete data utilization: {metrics_articles}/{expected_articles} articles processed " + f"({metrics_diff_percent:.3f}% difference). This is likely due to rounding or minor counting differences." + ) + LOGGER.info(f" {metrics_articles} articles are included in the output dataset.") + else: + LOGGER.error( + f"[WARNING] INCOMPLETE DATA UTILIZATION: Only {metrics_articles}/{expected_articles} articles processed " + f"({metrics_diff_percent:.2f}% difference, {expected_articles - metrics_articles} articles missing)." + ) + LOGGER.error( + f" This means {expected_articles - metrics_articles} articles from your input files were not processed. " + f"Please check for errors in processing or data format issues." + ) + + # Compare counter with metrics to identify counting issues + if abs(counter_articles - metrics_articles) > 10: + LOGGER.warning( + f"Counter discrepancy detected: Progress counter shows {counter_articles} articles, " + f"but metrics show {metrics_articles} articles were actually processed. " + f"Difference: {abs(metrics_articles - counter_articles)} articles. " + f"The metrics count ({metrics_articles}) is the accurate one." + ) + else: + LOGGER.info(f"[OK] Progress counter matches metrics: {counter_articles} articles counted, {metrics_articles} articles processed.") + + log_sep_str() if dataset_metadata_json is not None: dataset_metadata_json["max_batch_size_train"] = max_batch_size_train @@ -639,7 +785,7 @@ def pipeline_main( # noqa: C901 if category_to_id is not None: category_to_id_output_file_path = os.path.join(output_dir, "category_to_id.json") verify_output_file(category_to_id_output_file_path, overwrite_output_path) - with open(category_to_id_output_file_path, "w") as f: + with open(category_to_id_output_file_path, "w", encoding="utf-8") as f: json.dump(category_to_id, f) test_dir = os.path.join(output_dir, "test_files") @@ -736,9 +882,9 @@ def pipeline_main( # noqa: C901 for file_name in os.listdir(json_error_log_dir): file_names.append(os.path.join(json_error_log_dir, file_name)) if file_names: - with open(os.path.join(output_dir, "json_load_failed_lines.log"), "w") as outfile: + with open(os.path.join(output_dir, "json_load_failed_lines.log"), "w", encoding="utf-8") as outfile: for file_name in file_names: - with open(file_name) as reader: + with open(file_name, "r", encoding="utf-8") as reader: for line in reader: outfile.write(line) shutil.rmtree(json_error_log_dir) @@ -749,7 +895,7 @@ def pipeline_main( # noqa: C901 update_dataset_metadata(train_metrics, dataset_metadata_json) update_dataset_metadata(dev_metrics, dataset_metadata_json) metadata_file_path = os.path.join(output_dir, "metadata.yaml") - with open(metadata_file_path, "w") as file: + with open(metadata_file_path, "w", encoding="utf-8") as file: yaml.dump(dataset_metadata_json, file, default_flow_style=False) # Create sha256 of all the files within the directory diff --git a/generative_data_prep/utils/add_metadata_to_dataset.py b/generative_data_prep/utils/add_metadata_to_dataset.py index 4ecb445b..02f55717 100644 --- a/generative_data_prep/utils/add_metadata_to_dataset.py +++ b/generative_data_prep/utils/add_metadata_to_dataset.py @@ -59,7 +59,7 @@ def save_metadata(metadata_path, metadata): metadata_path (str): Path to the metadata YAML file. metadata (dict): Metadata dictionary to save. """ - with open(metadata_path, "w") as f: + with open(metadata_path, "w", encoding="utf-8") as f: yaml.safe_dump(metadata, f, default_flow_style=False) @@ -72,7 +72,7 @@ def add_seq_metadata_dataset(dataset_path): metadata_path = os.path.join(dataset_path, "metadata.yaml") metadata = {} if os.path.exists(metadata_path): - with open(metadata_path, "r") as f: + with open(metadata_path, "r", encoding="utf-8") as f: metadata = yaml.safe_load(f) or {} train_sequences = 0 diff --git a/generative_data_prep/utils/utils.py b/generative_data_prep/utils/utils.py index 58076b68..265cc572 100644 --- a/generative_data_prep/utils/utils.py +++ b/generative_data_prep/utils/utils.py @@ -128,7 +128,7 @@ def create_sha256(output_dir: str): "size": os.path.getsize(file), "modified_time": os.path.getmtime(file), } - with open(output_file_hash, "w") as output_file: + with open(output_file_hash, "w", encoding="utf-8") as output_file: json.dump(file_info_dict, output_file) From 5bc74d99d0bbc9ef7a1b7aa5f458a90c5e1ce64e Mon Sep 17 00:00:00 2001 From: Rajrantan More Date: Tue, 23 Dec 2025 18:21:33 +0530 Subject: [PATCH 3/9] fixed pre commit issues --- generative_data_prep/__main__.py | 4 +- generative_data_prep/data_prep/pipeline.py | 83 +++++++++++++++------- 2 files changed, 60 insertions(+), 27 deletions(-) diff --git a/generative_data_prep/__main__.py b/generative_data_prep/__main__.py index 4157bc3b..59a7d42a 100644 --- a/generative_data_prep/__main__.py +++ b/generative_data_prep/__main__.py @@ -17,7 +17,9 @@ """ import json import logging +import logging.config import os +import sys from typing import Optional from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase @@ -47,8 +49,6 @@ logging.config.fileConfig(get_config_file_path()) # Fix Unicode encoding issues on Windows console -import sys - # Configure stdout/stderr to handle Unicode encoding errors on Windows if sys.platform == "win32": # Try to reconfigure streams to use UTF-8 with error replacement diff --git a/generative_data_prep/data_prep/pipeline.py b/generative_data_prep/data_prep/pipeline.py index 4b2ff2b0..b9d25ee4 100644 --- a/generative_data_prep/data_prep/pipeline.py +++ b/generative_data_prep/data_prep/pipeline.py @@ -27,7 +27,6 @@ import time import uuid from pathlib import Path -from sys import platform from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import numpy as np @@ -50,7 +49,6 @@ PackingConfig, balance_hdf5_files, create_sha256, - execute_and_return_stdout, get_num_training_splits, large_file_shuffle, log_sep_str, @@ -576,21 +574,30 @@ def multiprocess_data_prep( # noqa: C901 prev_num_tokenized_articles = num_tokenized_articles.value if ignore_input_format_error: - num_new_skipped_articles = num_skipped_articles.value - prev_num_skipped_articles + num_new_skipped_articles = ( + num_skipped_articles.value - prev_num_skipped_articles + ) if num_new_skipped_articles > 0: - LOGGER.info(f"{num_skipped_articles.value} misformatted lines are skipped") + LOGGER.info( + f"{num_skipped_articles.value} misformatted lines are skipped" + ) prev_num_skipped_articles = num_skipped_articles.value time.sleep(5) # Log final article count and validate 100% completion log_sep_str() total_actual_articles = train_metrics.articles + dev_metrics.articles - LOGGER.info(f"Total articles processed (from metrics): {total_actual_articles} (Train: {train_metrics.articles}, Dev: {dev_metrics.articles})") + LOGGER.info( + f"Total articles processed (from metrics): {total_actual_articles} " + f"(Train: {train_metrics.articles}, Dev: {dev_metrics.articles})" + ) LOGGER.info(f"Total articles counted in input files: {total_num_articles}") - + if ignore_input_format_error: LOGGER.info(f"Progress counter value: {num_tokenized_articles.value}") - LOGGER.info(f"Total skipped lines (format errors): {num_skipped_articles.value}") + LOGGER.info( + f"Total skipped lines (format errors): {num_skipped_articles.value}" + ) # Validate 100% completion if total_num_articles > 0: @@ -599,46 +606,72 @@ def multiprocess_data_prep( # noqa: C901 skipped_articles = num_skipped_articles.value if ignore_input_format_error else 0 # Calculate expected articles (total - skipped due to format errors) - # Note: Articles dropped during processing (prompt-only, packing drops) are still counted in metrics.articles - # because metrics.articles is incremented before processing/dropping + # Note: Articles dropped during processing (prompt-only, packing drops) + # are still counted in metrics.articles because metrics.articles is + # incremented before processing/dropping expected_articles = total_num_articles - skipped_articles - + # Compare metrics with expected count metrics_diff = abs(metrics_articles - expected_articles) - metrics_diff_percent = (metrics_diff / total_num_articles) * 100 if total_num_articles > 0 else 0 - + metrics_diff_percent = ( + (metrics_diff / total_num_articles) * 100 + if total_num_articles > 0 + else 0 + ) + log_sep_str() if metrics_diff == 0: - LOGGER.info(f"[SUCCESS] 100% DATA UTILIZATION: All {total_num_articles} articles from input files were processed!") + LOGGER.info( + f"[SUCCESS] 100% DATA UTILIZATION: All {total_num_articles} " + f"articles from input files were processed!" + ) if skipped_articles > 0: - LOGGER.info(f" Note: {skipped_articles} articles were skipped due to JSON format errors (expected)") - LOGGER.info(f" All {metrics_articles} processed articles are included in the output dataset.") + LOGGER.info( + f" Note: {skipped_articles} articles were skipped due to " + f"JSON format errors (expected)" + ) + LOGGER.info( + f" All {metrics_articles} processed articles are included in " + f"the output dataset." + ) elif metrics_diff_percent <= 0.1: # Less than 0.1% difference LOGGER.warning( - f"Near-complete data utilization: {metrics_articles}/{expected_articles} articles processed " - f"({metrics_diff_percent:.3f}% difference). This is likely due to rounding or minor counting differences." + f"Near-complete data utilization: {metrics_articles}/" + f"{expected_articles} articles processed " + f"({metrics_diff_percent:.3f}% difference). This is likely due " + f"to rounding or minor counting differences." + ) + LOGGER.info( + f" {metrics_articles} articles are included in the output " + f"dataset." ) - LOGGER.info(f" {metrics_articles} articles are included in the output dataset.") else: LOGGER.error( - f"[WARNING] INCOMPLETE DATA UTILIZATION: Only {metrics_articles}/{expected_articles} articles processed " - f"({metrics_diff_percent:.2f}% difference, {expected_articles - metrics_articles} articles missing)." + f"[WARNING] INCOMPLETE DATA UTILIZATION: Only " + f"{metrics_articles}/{expected_articles} articles processed " + f"({metrics_diff_percent:.2f}% difference, " + f"{expected_articles - metrics_articles} articles missing)." ) LOGGER.error( - f" This means {expected_articles - metrics_articles} articles from your input files were not processed. " + f" This means {expected_articles - metrics_articles} articles " + f"from your input files were not processed. " f"Please check for errors in processing or data format issues." ) - + # Compare counter with metrics to identify counting issues if abs(counter_articles - metrics_articles) > 10: LOGGER.warning( - f"Counter discrepancy detected: Progress counter shows {counter_articles} articles, " - f"but metrics show {metrics_articles} articles were actually processed. " + f"Counter discrepancy detected: Progress counter shows " + f"{counter_articles} articles, but metrics show " + f"{metrics_articles} articles were actually processed. " f"Difference: {abs(metrics_articles - counter_articles)} articles. " f"The metrics count ({metrics_articles}) is the accurate one." ) else: - LOGGER.info(f"[OK] Progress counter matches metrics: {counter_articles} articles counted, {metrics_articles} articles processed.") + LOGGER.info( + f"[OK] Progress counter matches metrics: {counter_articles} " + f"articles counted, {metrics_articles} articles processed." + ) log_sep_str() From 4a2ade37fcf37f2badfc5f87caf34b7a5c6c8f53 Mon Sep 17 00:00:00 2001 From: Rajrantan More Date: Tue, 23 Dec 2025 18:28:38 +0530 Subject: [PATCH 4/9] flake8 errors are fixed. --- generative_data_prep/__main__.py | 3 +- generative_data_prep/data_prep/pipeline.py | 51 +++++++++++++--------- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/generative_data_prep/__main__.py b/generative_data_prep/__main__.py index 59a7d42a..035054cf 100644 --- a/generative_data_prep/__main__.py +++ b/generative_data_prep/__main__.py @@ -64,7 +64,6 @@ pass - def add_special_tokens_dict(tokenizer: PreTrainedTokenizerBase, special_tokens_dict: str): """Add the special tokens dictionary to tokenizer. @@ -320,4 +319,4 @@ def run_with_training_args( parser = get_arg_parser() data_prep_args = parser.parse_args() data_prep_args = check_deprecated_args(data_prep_args) - main(data_prep_args) + main(data_prep_args) diff --git a/generative_data_prep/data_prep/pipeline.py b/generative_data_prep/data_prep/pipeline.py index b9d25ee4..e5a55961 100644 --- a/generative_data_prep/data_prep/pipeline.py +++ b/generative_data_prep/data_prep/pipeline.py @@ -11,7 +11,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - Data preparation pipeline for converting a jsonl file to tokenized hdf5 files consumable by SambaSuite. """ @@ -126,7 +125,7 @@ def split_file_round_robin(num_splits: int, input_file_path: str, split_dir: str for i in range(num_splits): out_file_path = os.path.join(split_dir, str(i).zfill(max(2, num_digits))) split_files.append(open(out_file_path, "w", encoding="utf-8", errors="replace")) - + try: # Read input file and distribute lines in round-robin fashion with open(input_file_path, "r", encoding="utf-8", errors="replace") as infile: @@ -138,6 +137,7 @@ def split_file_round_robin(num_splits: int, input_file_path: str, split_dir: str for f in split_files: f.close() + def check_RAM(input_file_size_in_bytes: int): """Check to make sure there is enough RAM on the system to fit [input_file_size_in_bytes]. @@ -183,11 +183,11 @@ def rename_files( num_digits = len(str(num_splits)) for i in range(num_splits): if i < train_count: - new_name = f"train_{i+1}_of_{train_count}{file_ext}" + new_name = f"train_{i + 1}_of_{train_count}{file_ext}" elif i < train_count + test_count: - new_name = f"test_{i-train_count+1}_of_{test_count}{file_ext}" + new_name = f"test_{i - train_count + 1}_of_{test_count}{file_ext}" else: - new_name = f"dev_{i-train_count-test_count+1}_of_{dev_count}{file_ext}" + new_name = f"dev_{i - train_count - test_count + 1}_of_{dev_count}{file_ext}" new_file_path = os.path.join(split_dir, new_name) @@ -223,10 +223,10 @@ def count_exact_total_num_articles(files_to_tokenize, split_dir): """ if not files_to_tokenize: return 0 - + total_lines = 0 LOGGER.info(f"Counting articles in {len(files_to_tokenize)} files to get exact total...") - + for file_name in files_to_tokenize: file_path = os.path.join(split_dir, file_name) lines_in_file = 0 @@ -236,14 +236,14 @@ def count_exact_total_num_articles(files_to_tokenize, split_dir): if line.strip(): lines_in_file += 1 total_lines += lines_in_file - + LOGGER.info(f"Exact total articles counted: {total_lines}") return total_lines def estimate_total_num_articles(files_to_tokenize, split_dir): """Estimates the total number of articles based on number of articles in sample files times number of splits. - + DEPRECATED: Use count_exact_total_num_articles for exact count instead. Args: @@ -255,12 +255,12 @@ def estimate_total_num_articles(files_to_tokenize, split_dir): """ if not files_to_tokenize: return 0 - + # Sample up to 5 files to get a better average estimate sample_size = min(5, len(files_to_tokenize)) total_lines = 0 files_sampled = 0 - + for i in range(sample_size): file_path = os.path.join(split_dir, files_to_tokenize[i]) lines_in_file = 0 @@ -271,10 +271,10 @@ def estimate_total_num_articles(files_to_tokenize, split_dir): lines_in_file += 1 total_lines += lines_in_file files_sampled += 1 - + if files_sampled == 0: return 0 - + # Calculate average lines per file and multiply by total files avg_lines_per_file = total_lines / files_sampled return int(avg_lines_per_file * len(files_to_tokenize)) @@ -538,7 +538,11 @@ def multiprocess_data_prep( # noqa: C901 if max_update > 0: bar_update_tracker += max_update # Set bar to exact position (as fraction of total, capped at 1.0) - bar_position = min(1.0, bar_update_tracker / total_num_articles) if total_num_articles > 0 else 0.0 + bar_position = ( + min(1.0, bar_update_tracker / total_num_articles) + if total_num_articles > 0 + else 0.0 + ) bar(bar_position) # Ensure progress bar reaches exactly 100% (1.0 in manual mode) # Use tracker to set final position @@ -558,7 +562,11 @@ def multiprocess_data_prep( # noqa: C901 if max_update > 0: bar_update_tracker += max_update # Set bar to exact position (as fraction of total, capped at 1.0) - bar_position = min(1.0, bar_update_tracker / total_num_articles) if total_num_articles > 0 else 0.0 + bar_position = ( + min(1.0, bar_update_tracker / total_num_articles) + if total_num_articles > 0 + else 0.0 + ) bar(bar_position) # Calculate percentage based on our tracker (more accurate than bar.current in manual mode) if total_num_articles > 0: @@ -569,7 +577,9 @@ def multiprocess_data_prep( # noqa: C901 perc_complete = 0.0 elapsed_time_str = f"--- elapsed time: {time.time() - tokenization_start_time}" LOGGER.debug( - f"Counter: {num_tokenized_articles.value}, Progress tracker: {bar_update_tracker}/{total_num_articles}, {perc_complete}% complete => Time remaining: {bar.eta} {elapsed_time_str}" + f"Counter: {num_tokenized_articles.value}, Progress tracker: " + f"{bar_update_tracker}/{total_num_articles}, {perc_complete}% complete => " + f"Time remaining: {bar.eta} {elapsed_time_str}" ) prev_num_tokenized_articles = num_tokenized_articles.value @@ -598,13 +608,13 @@ def multiprocess_data_prep( # noqa: C901 LOGGER.info( f"Total skipped lines (format errors): {num_skipped_articles.value}" ) - + # Validate 100% completion if total_num_articles > 0: counter_articles = num_tokenized_articles.value metrics_articles = total_actual_articles skipped_articles = num_skipped_articles.value if ignore_input_format_error else 0 - + # Calculate expected articles (total - skipped due to format errors) # Note: Articles dropped during processing (prompt-only, packing drops) # are still counted in metrics.articles because metrics.articles is @@ -672,7 +682,7 @@ def multiprocess_data_prep( # noqa: C901 f"[OK] Progress counter matches metrics: {counter_articles} " f"articles counted, {metrics_articles} articles processed." ) - + log_sep_str() if dataset_metadata_json is not None: @@ -849,13 +859,12 @@ def pipeline_main( # noqa: C901 with open(out_file_path, "w", encoding="utf-8", errors="replace") as out_file: out_file.writelines(split) - # Case 3: Do not shuffle, split file (cross-platform) + # Case 3: Do not shuffle, split file (cross-platform) elif shuffle == "False": log_sep_str() LOGGER.warning("WARNING: you did not specify the --shuffle flag, so no shuffling was done!") split_file_round_robin(num_splits, input_file_path, split_dir) - # rename files to include the corresponding names of 'test', 'dev' and 'train' files_to_tokenize = rename_files( input_file_path, From f97e77b99b1e8b16f1627a674b03908d519d9338 Mon Sep 17 00:00:00 2001 From: Rajrantan More Date: Tue, 23 Dec 2025 20:05:42 +0530 Subject: [PATCH 5/9] pre-commit issues are fixed --- generative_data_prep/__main__.py | 9 +++-- generative_data_prep/data_prep/pipeline.py | 39 +++++-------------- .../utils/large_file_shuffle.py | 16 ++++---- generative_data_prep/utils/logger.py | 18 +++++++-- 4 files changed, 37 insertions(+), 45 deletions(-) diff --git a/generative_data_prep/__main__.py b/generative_data_prep/__main__.py index 035054cf..fabe79e2 100644 --- a/generative_data_prep/__main__.py +++ b/generative_data_prep/__main__.py @@ -15,6 +15,7 @@ Entry point to the Text Processing Pipeline. """ + import json import logging import logging.config @@ -52,14 +53,14 @@ # Configure stdout/stderr to handle Unicode encoding errors on Windows if sys.platform == "win32": # Try to reconfigure streams to use UTF-8 with error replacement - if hasattr(sys.stdout, 'reconfigure'): + if hasattr(sys.stdout, "reconfigure"): try: - sys.stdout.reconfigure(encoding='utf-8', errors='replace') + sys.stdout.reconfigure(encoding="utf-8", errors="replace") except (AttributeError, ValueError): pass - if hasattr(sys.stderr, 'reconfigure'): + if hasattr(sys.stderr, "reconfigure"): try: - sys.stderr.reconfigure(encoding='utf-8', errors='replace') + sys.stderr.reconfigure(encoding="utf-8", errors="replace") except (AttributeError, ValueError): pass diff --git a/generative_data_prep/data_prep/pipeline.py b/generative_data_prep/data_prep/pipeline.py index e5a55961..de7d7427 100644 --- a/generative_data_prep/data_prep/pipeline.py +++ b/generative_data_prep/data_prep/pipeline.py @@ -539,9 +539,7 @@ def multiprocess_data_prep( # noqa: C901 bar_update_tracker += max_update # Set bar to exact position (as fraction of total, capped at 1.0) bar_position = ( - min(1.0, bar_update_tracker / total_num_articles) - if total_num_articles > 0 - else 0.0 + min(1.0, bar_update_tracker / total_num_articles) if total_num_articles > 0 else 0.0 ) bar(bar_position) # Ensure progress bar reaches exactly 100% (1.0 in manual mode) @@ -563,9 +561,7 @@ def multiprocess_data_prep( # noqa: C901 bar_update_tracker += max_update # Set bar to exact position (as fraction of total, capped at 1.0) bar_position = ( - min(1.0, bar_update_tracker / total_num_articles) - if total_num_articles > 0 - else 0.0 + min(1.0, bar_update_tracker / total_num_articles) if total_num_articles > 0 else 0.0 ) bar(bar_position) # Calculate percentage based on our tracker (more accurate than bar.current in manual mode) @@ -584,13 +580,9 @@ def multiprocess_data_prep( # noqa: C901 prev_num_tokenized_articles = num_tokenized_articles.value if ignore_input_format_error: - num_new_skipped_articles = ( - num_skipped_articles.value - prev_num_skipped_articles - ) + num_new_skipped_articles = num_skipped_articles.value - prev_num_skipped_articles if num_new_skipped_articles > 0: - LOGGER.info( - f"{num_skipped_articles.value} misformatted lines are skipped" - ) + LOGGER.info(f"{num_skipped_articles.value} misformatted lines are skipped") prev_num_skipped_articles = num_skipped_articles.value time.sleep(5) @@ -605,9 +597,7 @@ def multiprocess_data_prep( # noqa: C901 if ignore_input_format_error: LOGGER.info(f"Progress counter value: {num_tokenized_articles.value}") - LOGGER.info( - f"Total skipped lines (format errors): {num_skipped_articles.value}" - ) + LOGGER.info(f"Total skipped lines (format errors): {num_skipped_articles.value}") # Validate 100% completion if total_num_articles > 0: @@ -623,11 +613,7 @@ def multiprocess_data_prep( # noqa: C901 # Compare metrics with expected count metrics_diff = abs(metrics_articles - expected_articles) - metrics_diff_percent = ( - (metrics_diff / total_num_articles) * 100 - if total_num_articles > 0 - else 0 - ) + metrics_diff_percent = (metrics_diff / total_num_articles) * 100 if total_num_articles > 0 else 0 log_sep_str() if metrics_diff == 0: @@ -637,13 +623,9 @@ def multiprocess_data_prep( # noqa: C901 ) if skipped_articles > 0: LOGGER.info( - f" Note: {skipped_articles} articles were skipped due to " - f"JSON format errors (expected)" + f" Note: {skipped_articles} articles were skipped due to " f"JSON format errors (expected)" ) - LOGGER.info( - f" All {metrics_articles} processed articles are included in " - f"the output dataset." - ) + LOGGER.info(f" All {metrics_articles} processed articles are included in " f"the output dataset.") elif metrics_diff_percent <= 0.1: # Less than 0.1% difference LOGGER.warning( f"Near-complete data utilization: {metrics_articles}/" @@ -651,10 +633,7 @@ def multiprocess_data_prep( # noqa: C901 f"({metrics_diff_percent:.3f}% difference). This is likely due " f"to rounding or minor counting differences." ) - LOGGER.info( - f" {metrics_articles} articles are included in the output " - f"dataset." - ) + LOGGER.info(f" {metrics_articles} articles are included in the output " f"dataset.") else: LOGGER.error( f"[WARNING] INCOMPLETE DATA UTILIZATION: Only " diff --git a/generative_data_prep/utils/large_file_shuffle.py b/generative_data_prep/utils/large_file_shuffle.py index cd990d39..ca5f27a8 100644 --- a/generative_data_prep/utils/large_file_shuffle.py +++ b/generative_data_prep/utils/large_file_shuffle.py @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. """ + import logging import os import random @@ -27,10 +28,10 @@ def _split_file_round_robin(input_file_path: str, split_dir: str, num_splits: int): """Split a file into multiple files using round-robin distribution. - + This is a cross-platform replacement for the Linux 'split -d -n r/' command. Each line from the input file is distributed to output files in round-robin fashion. - + Args: input_file_path (str): Path to the input file to split split_dir (str): Directory where split files will be created @@ -41,7 +42,7 @@ def _split_file_round_robin(input_file_path: str, split_dir: str, num_splits: in for i in range(num_splits): split_file_path = os.path.join(split_dir, f"x{i:02d}") split_files.append(open(split_file_path, "w", encoding="utf-8", errors="replace")) - + try: # Read input file and distribute lines in round-robin fashion with open(input_file_path, "r", encoding="utf-8", errors="replace") as infile: @@ -56,23 +57,24 @@ def _split_file_round_robin(input_file_path: str, split_dir: str, num_splits: in def _shuffle_file(file_path: str): """Shuffle the lines of a file in-place. - + This is a cross-platform replacement for the Linux 'shuf' command. - + Args: file_path (str): Path to the file to shuffle """ # Read all lines with open(file_path, "r", encoding="utf-8", errors="replace") as f: lines = f.readlines() - + # Shuffle the lines random.shuffle(lines) - + # Write back to file with open(file_path, "w", encoding="utf-8", errors="replace") as f: f.writelines(lines) + def large_file_shuffle( input_file_path: str, output_dir: str, diff --git a/generative_data_prep/utils/logger.py b/generative_data_prep/utils/logger.py index c7bdf61d..65971cf3 100644 --- a/generative_data_prep/utils/logger.py +++ b/generative_data_prep/utils/logger.py @@ -15,6 +15,7 @@ This class creates a common logger. """ + import argparse import datetime import importlib.metadata @@ -93,9 +94,18 @@ def log_metrics(metrics): # Replace Unicode box-drawing characters with ASCII equivalents for Windows compatibility if sys.platform == "win32": replacements = { - '╒': '+', '═': '=', '╤': '+', '╕': '+', - '├': '+', '─': '-', '┼': '+', '┤': '+', - '╘': '+', '╧': '+', '╛': '+', '│': '|' + "╒": "+", + "═": "=", + "╤": "+", + "╕": "+", + "├": "+", + "─": "-", + "┼": "+", + "┤": "+", + "╘": "+", + "╧": "+", + "╛": "+", + "│": "|", } for old, new in replacements.items(): metrics_str = metrics_str.replace(old, new) @@ -103,7 +113,7 @@ def log_metrics(metrics): LOGGER.info(metrics_str) except UnicodeEncodeError: # Fallback: encode as ASCII with replacement - safe_str = metrics_str.encode('ascii', errors='replace').decode('ascii') + safe_str = metrics_str.encode("ascii", errors="replace").decode("ascii") LOGGER.info(safe_str) From acdf1c76e691f502a855784ff6a5d99ab681b78b Mon Sep 17 00:00:00 2001 From: rajratamore-debug Date: Mon, 29 Dec 2025 13:50:33 +0300 Subject: [PATCH 6/9] changed the versions of the jinja2>=3.1.6 regex>=2025.2.10 requests>=2.32.4 torch>=2.6.0 transformers>=4.53.0 urllib3>=2.5.0 for the pre commit condition --- pyproject.toml | 2 +- requirements/all-requirements.txt | 12 ++++++------ requirements/build-requirements.txt | 4 ++-- requirements/docs-requirements.txt | 6 +++--- requirements/requirements.txt | 12 ++++++------ requirements/tests-requirements.txt | 2 +- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 28a084f9..1a9177c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "psutil", "torch>=2.3", "tqdm", - "transformers==4.43.1", + "transformers>=4.53.0", "tabulate", "gitpython", "types-tabulate", diff --git a/requirements/all-requirements.txt b/requirements/all-requirements.txt index 14892c6e..52498e79 100644 --- a/requirements/all-requirements.txt +++ b/requirements/all-requirements.txt @@ -13,7 +13,7 @@ grapheme==0.6.0 h5py==3.11.0; python_version >= '3.8' huggingface-hub==0.26.2; python_full_version >= '3.8.0' idna==3.10; python_version >= '3.6' -jinja2==3.1.4; python_version >= '3.7' +jinja2>=3.1.6; python_version >= '3.7' jsonlines==4.0.0; python_version >= '3.8' markupsafe==2.1.5; python_version >= '3.7' mpmath==1.3.0 @@ -36,8 +36,8 @@ psutil==6.1.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, pydantic==2.9.2; python_version >= '3.8' pydantic-core==2.23.4; python_version >= '3.8' pyyaml==6.0.2; python_version >= '3.8' -regex==2024.11.6; python_version >= '3.8' -requests==2.32.3; python_version >= '3.8' +regex>=2025.2.10; python_version >= '3.8' +requests>=2.32.4; python_version >= '3.8' safetensors==0.4.5; python_version >= '3.7' -e . sentencepiece==0.2.0 @@ -45,13 +45,13 @@ smmap==5.0.1; python_version >= '3.7' sympy==1.13.3; python_version >= '3.8' tabulate==0.9.0; python_version >= '3.7' tokenizers==0.19.1; python_version >= '3.7' -torch==2.4.1; python_full_version >= '3.8.0' +torch>=2.6.0; python_full_version >= '3.8.0' tqdm==4.67.0; python_version >= '3.7' -transformers==4.43.1; python_full_version >= '3.8.0' +transformers>=4.53.0; python_full_version >= '3.8.0' triton==3.0.0; python_version < '3.13' and platform_system == 'Linux' and platform_machine == 'x86_64' types-tabulate==0.9.0.20240106; python_version >= '3.8' typing-extensions==4.12.2; python_version >= '3.8' -urllib3==2.2.3; python_version >= '3.8' +urllib3>=2.5.0; python_version >= '3.8' anyio==4.5.2; python_version >= '3.8' backports.tarfile==1.2.0; python_version < '3.12' cffi==1.17.1; platform_python_implementation != 'PyPy' diff --git a/requirements/build-requirements.txt b/requirements/build-requirements.txt index 4f1545b5..3e00db06 100644 --- a/requirements/build-requirements.txt +++ b/requirements/build-requirements.txt @@ -39,7 +39,7 @@ ptyprocess==0.7.0 pycparser==2.22; python_version >= '3.8' pygments==2.18.0; python_version >= '3.8' readme-renderer==43.0; python_version >= '3.8' -requests==2.32.3; python_version >= '3.8' +requests>=2.32.4; python_version >= '3.8' requests-toolbelt==1.0.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' rfc3986==2.0.0; python_version >= '3.7' rich==13.9.4; python_full_version >= '3.8.0' @@ -54,7 +54,7 @@ tomlkit==0.13.2; python_version >= '3.8' trove-classifiers==2024.10.21.16 twine==5.1.1; python_version >= '3.8' typing-extensions==4.12.2; python_version >= '3.8' -urllib3==2.2.3; python_version >= '3.8' +urllib3>=2.5.0; python_version >= '3.8' userpath==1.9.2; python_version >= '3.7' uv==0.5.1; python_version >= '3.8' virtualenv==20.27.1; python_version >= '3.8' diff --git a/requirements/docs-requirements.txt b/requirements/docs-requirements.txt index 02c16927..0a0e146b 100644 --- a/requirements/docs-requirements.txt +++ b/requirements/docs-requirements.txt @@ -20,7 +20,7 @@ importlib-metadata==8.5.0; python_version < '3.10' importlib-resources==6.4.5; python_version < '3.9' ipython==8.12.3; python_version >= '3.8' jedi==0.19.2; python_version >= '3.6' -jinja2==3.1.4; python_version >= '3.7' +jinja2>=3.1.6; python_version >= '3.7' jsonschema==4.23.0; python_version >= '3.8' jsonschema-specifications==2023.12.1; python_version >= '3.8' jupyter-client==8.6.3; python_version >= '3.8' @@ -57,7 +57,7 @@ pytz==2024.2; python_version < '3.9' pyyaml==6.0.2; python_version >= '3.8' pyzmq==26.2.0; python_version >= '3.7' referencing==0.35.1; python_version >= '3.8' -requests==2.32.3; python_version >= '3.8' +requests>=2.32.4; python_version >= '3.8' rpds-py==0.20.1; python_version >= '3.8' setuptools==75.3.0; python_version >= '3.8' setuptools-scm==8.1.0; python_version >= '3.8' @@ -78,7 +78,7 @@ tomli==2.1.0; python_version < '3.11' tornado==6.4.1; python_version >= '3.8' traitlets==5.14.3; python_version >= '3.8' typing-extensions==4.12.2; python_version >= '3.8' -urllib3==2.2.3; python_version >= '3.8' +urllib3>=2.5.0; python_version >= '3.8' wcwidth==0.2.13 webencodings==0.5.1 zipp==3.20.2; python_version >= '3.8' diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 3806dea9..1e84e911 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -13,7 +13,7 @@ grapheme==0.6.0 h5py==3.11.0; python_version >= '3.8' huggingface-hub==0.26.2; python_full_version >= '3.8.0' idna==3.10; python_version >= '3.6' -jinja2==3.1.4; python_version >= '3.7' +jinja2>=3.1.6; python_version >= '3.7' jsonlines==4.0.0; python_version >= '3.8' markupsafe==2.1.5; python_version >= '3.7' mpmath==1.3.0 @@ -36,8 +36,8 @@ psutil==6.1.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, pydantic==2.9.2; python_version >= '3.8' pydantic-core==2.23.4; python_version >= '3.8' pyyaml==6.0.2; python_version >= '3.8' -regex==2024.11.6; python_version >= '3.8' -requests==2.32.3; python_version >= '3.8' +regex>=2025.2.10; python_version >= '3.8' +requests>=2.32.4; python_version >= '3.8' safetensors==0.4.5; python_version >= '3.7' -e . sentencepiece==0.2.0 @@ -45,10 +45,10 @@ smmap==5.0.1; python_version >= '3.7' sympy==1.13.3; python_version >= '3.8' tabulate==0.9.0; python_version >= '3.7' tokenizers==0.19.1; python_version >= '3.7' -torch==2.4.1; python_full_version >= '3.8.0' +torch>=2.6.0; python_full_version >= '3.8.0' tqdm==4.67.0; python_version >= '3.7' -transformers==4.43.1; python_full_version >= '3.8.0' +transformers>=4.53.0; python_full_version >= '3.8.0' triton==3.0.0; python_version < '3.13' and platform_system == 'Linux' and platform_machine == 'x86_64' types-tabulate==0.9.0.20240106; python_version >= '3.8' typing-extensions==4.12.2; python_version >= '3.8' -urllib3==2.2.3; python_version >= '3.8' +urllib3>=2.5.0; python_version >= '3.8' diff --git a/requirements/tests-requirements.txt b/requirements/tests-requirements.txt index 84592793..13ca9ec0 100644 --- a/requirements/tests-requirements.txt +++ b/requirements/tests-requirements.txt @@ -6,7 +6,7 @@ exceptiongroup==1.2.2; python_version < '3.11' filelock==3.16.1; python_version >= '3.8' identify==2.6.1; python_version >= '3.8' iniconfig==2.0.0; python_version >= '3.7' -jinja2==3.1.4; python_version >= '3.7' +jinja2>=3.1.6; python_version >= '3.7' markupsafe==2.1.5; python_version >= '3.7' nodeenv==1.9.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6' packaging==24.2; python_version >= '3.8' From c049314bccbeb06ab2079e11ed19b8b872c35069 Mon Sep 17 00:00:00 2001 From: rajratamore-debug Date: Mon, 29 Dec 2025 14:20:23 +0300 Subject: [PATCH 7/9] precommit --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1a9177c6..fe4d2efa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ dependencies = [ "jsonlines", "numpy", "psutil", - "torch>=2.3", + "torch>=2.8.0", "tqdm", "transformers>=4.53.0", "tabulate", From 09971b21349c94cd052408fd2d03eb375ac4a8f7 Mon Sep 17 00:00:00 2001 From: rajratamore-debug Date: Mon, 29 Dec 2025 17:37:56 +0300 Subject: [PATCH 8/9] removed changes made for pre commit --- pyproject.toml | 4 ++-- requirements/all-requirements.txt | 12 ++++++------ requirements/build-requirements.txt | 4 ++-- requirements/docs-requirements.txt | 6 +++--- requirements/requirements.txt | 12 ++++++------ requirements/tests-requirements.txt | 2 +- 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fe4d2efa..28a084f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,9 +16,9 @@ dependencies = [ "jsonlines", "numpy", "psutil", - "torch>=2.8.0", + "torch>=2.3", "tqdm", - "transformers>=4.53.0", + "transformers==4.43.1", "tabulate", "gitpython", "types-tabulate", diff --git a/requirements/all-requirements.txt b/requirements/all-requirements.txt index 52498e79..14892c6e 100644 --- a/requirements/all-requirements.txt +++ b/requirements/all-requirements.txt @@ -13,7 +13,7 @@ grapheme==0.6.0 h5py==3.11.0; python_version >= '3.8' huggingface-hub==0.26.2; python_full_version >= '3.8.0' idna==3.10; python_version >= '3.6' -jinja2>=3.1.6; python_version >= '3.7' +jinja2==3.1.4; python_version >= '3.7' jsonlines==4.0.0; python_version >= '3.8' markupsafe==2.1.5; python_version >= '3.7' mpmath==1.3.0 @@ -36,8 +36,8 @@ psutil==6.1.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, pydantic==2.9.2; python_version >= '3.8' pydantic-core==2.23.4; python_version >= '3.8' pyyaml==6.0.2; python_version >= '3.8' -regex>=2025.2.10; python_version >= '3.8' -requests>=2.32.4; python_version >= '3.8' +regex==2024.11.6; python_version >= '3.8' +requests==2.32.3; python_version >= '3.8' safetensors==0.4.5; python_version >= '3.7' -e . sentencepiece==0.2.0 @@ -45,13 +45,13 @@ smmap==5.0.1; python_version >= '3.7' sympy==1.13.3; python_version >= '3.8' tabulate==0.9.0; python_version >= '3.7' tokenizers==0.19.1; python_version >= '3.7' -torch>=2.6.0; python_full_version >= '3.8.0' +torch==2.4.1; python_full_version >= '3.8.0' tqdm==4.67.0; python_version >= '3.7' -transformers>=4.53.0; python_full_version >= '3.8.0' +transformers==4.43.1; python_full_version >= '3.8.0' triton==3.0.0; python_version < '3.13' and platform_system == 'Linux' and platform_machine == 'x86_64' types-tabulate==0.9.0.20240106; python_version >= '3.8' typing-extensions==4.12.2; python_version >= '3.8' -urllib3>=2.5.0; python_version >= '3.8' +urllib3==2.2.3; python_version >= '3.8' anyio==4.5.2; python_version >= '3.8' backports.tarfile==1.2.0; python_version < '3.12' cffi==1.17.1; platform_python_implementation != 'PyPy' diff --git a/requirements/build-requirements.txt b/requirements/build-requirements.txt index 3e00db06..4f1545b5 100644 --- a/requirements/build-requirements.txt +++ b/requirements/build-requirements.txt @@ -39,7 +39,7 @@ ptyprocess==0.7.0 pycparser==2.22; python_version >= '3.8' pygments==2.18.0; python_version >= '3.8' readme-renderer==43.0; python_version >= '3.8' -requests>=2.32.4; python_version >= '3.8' +requests==2.32.3; python_version >= '3.8' requests-toolbelt==1.0.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' rfc3986==2.0.0; python_version >= '3.7' rich==13.9.4; python_full_version >= '3.8.0' @@ -54,7 +54,7 @@ tomlkit==0.13.2; python_version >= '3.8' trove-classifiers==2024.10.21.16 twine==5.1.1; python_version >= '3.8' typing-extensions==4.12.2; python_version >= '3.8' -urllib3>=2.5.0; python_version >= '3.8' +urllib3==2.2.3; python_version >= '3.8' userpath==1.9.2; python_version >= '3.7' uv==0.5.1; python_version >= '3.8' virtualenv==20.27.1; python_version >= '3.8' diff --git a/requirements/docs-requirements.txt b/requirements/docs-requirements.txt index 0a0e146b..02c16927 100644 --- a/requirements/docs-requirements.txt +++ b/requirements/docs-requirements.txt @@ -20,7 +20,7 @@ importlib-metadata==8.5.0; python_version < '3.10' importlib-resources==6.4.5; python_version < '3.9' ipython==8.12.3; python_version >= '3.8' jedi==0.19.2; python_version >= '3.6' -jinja2>=3.1.6; python_version >= '3.7' +jinja2==3.1.4; python_version >= '3.7' jsonschema==4.23.0; python_version >= '3.8' jsonschema-specifications==2023.12.1; python_version >= '3.8' jupyter-client==8.6.3; python_version >= '3.8' @@ -57,7 +57,7 @@ pytz==2024.2; python_version < '3.9' pyyaml==6.0.2; python_version >= '3.8' pyzmq==26.2.0; python_version >= '3.7' referencing==0.35.1; python_version >= '3.8' -requests>=2.32.4; python_version >= '3.8' +requests==2.32.3; python_version >= '3.8' rpds-py==0.20.1; python_version >= '3.8' setuptools==75.3.0; python_version >= '3.8' setuptools-scm==8.1.0; python_version >= '3.8' @@ -78,7 +78,7 @@ tomli==2.1.0; python_version < '3.11' tornado==6.4.1; python_version >= '3.8' traitlets==5.14.3; python_version >= '3.8' typing-extensions==4.12.2; python_version >= '3.8' -urllib3>=2.5.0; python_version >= '3.8' +urllib3==2.2.3; python_version >= '3.8' wcwidth==0.2.13 webencodings==0.5.1 zipp==3.20.2; python_version >= '3.8' diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 1e84e911..3806dea9 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -13,7 +13,7 @@ grapheme==0.6.0 h5py==3.11.0; python_version >= '3.8' huggingface-hub==0.26.2; python_full_version >= '3.8.0' idna==3.10; python_version >= '3.6' -jinja2>=3.1.6; python_version >= '3.7' +jinja2==3.1.4; python_version >= '3.7' jsonlines==4.0.0; python_version >= '3.8' markupsafe==2.1.5; python_version >= '3.7' mpmath==1.3.0 @@ -36,8 +36,8 @@ psutil==6.1.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, pydantic==2.9.2; python_version >= '3.8' pydantic-core==2.23.4; python_version >= '3.8' pyyaml==6.0.2; python_version >= '3.8' -regex>=2025.2.10; python_version >= '3.8' -requests>=2.32.4; python_version >= '3.8' +regex==2024.11.6; python_version >= '3.8' +requests==2.32.3; python_version >= '3.8' safetensors==0.4.5; python_version >= '3.7' -e . sentencepiece==0.2.0 @@ -45,10 +45,10 @@ smmap==5.0.1; python_version >= '3.7' sympy==1.13.3; python_version >= '3.8' tabulate==0.9.0; python_version >= '3.7' tokenizers==0.19.1; python_version >= '3.7' -torch>=2.6.0; python_full_version >= '3.8.0' +torch==2.4.1; python_full_version >= '3.8.0' tqdm==4.67.0; python_version >= '3.7' -transformers>=4.53.0; python_full_version >= '3.8.0' +transformers==4.43.1; python_full_version >= '3.8.0' triton==3.0.0; python_version < '3.13' and platform_system == 'Linux' and platform_machine == 'x86_64' types-tabulate==0.9.0.20240106; python_version >= '3.8' typing-extensions==4.12.2; python_version >= '3.8' -urllib3>=2.5.0; python_version >= '3.8' +urllib3==2.2.3; python_version >= '3.8' diff --git a/requirements/tests-requirements.txt b/requirements/tests-requirements.txt index 13ca9ec0..84592793 100644 --- a/requirements/tests-requirements.txt +++ b/requirements/tests-requirements.txt @@ -6,7 +6,7 @@ exceptiongroup==1.2.2; python_version < '3.11' filelock==3.16.1; python_version >= '3.8' identify==2.6.1; python_version >= '3.8' iniconfig==2.0.0; python_version >= '3.7' -jinja2>=3.1.6; python_version >= '3.7' +jinja2==3.1.4; python_version >= '3.7' markupsafe==2.1.5; python_version >= '3.7' nodeenv==1.9.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6' packaging==24.2; python_version >= '3.8' From 1068b7a419f3c7b1e50e261c957d4bdb5629b7ee Mon Sep 17 00:00:00 2001 From: rajratanmore-debug Date: Thu, 15 Jan 2026 09:38:22 +0300 Subject: [PATCH 9/9] Edited README.md file --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 13c2a608..b705cbc6 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ If you are an advanced user looking to process data with pre-defined splits, int ## Requirements - Python version 3.8.10+ -- Support for Linux and Mac OS. Not tested on Windows +- Support for Linux, Mac OS and Windows.