diff --git a/.github/workflows/static-check.yml b/.github/workflows/static-check.yml index e1530b5..5bf6c12 100644 --- a/.github/workflows/static-check.yml +++ b/.github/workflows/static-check.yml @@ -15,42 +15,10 @@ jobs: uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} - - name: Check requirements.txt exists - id: check_req - run: | - if [ -f requirements.txt ]; then - echo "requirements_exists=true" >> $GITHUB_OUTPUT - else - echo "requirements_exists=false" >> $GITHUB_OUTPUT - fi - if [ -f pyproject.toml ]; then - echo "pyproject_exists=true" >> $GITHUB_OUTPUT - else - echo "pyproject_exists=false" >> $GITHUB_OUTPUT - fi - - name: Install dependencies by requirements.txt + - name: Install pylint id: install_deps_req - if: ${{ steps.check_req.outputs.requirements_exists == 'true' }} run: | python -m pip install --upgrade pylint - python -m pip install --upgrade isort - python -m pip install -r requirements.txt - echo "dependencies_installed=true" >> $GITHUB_OUTPUT - - name: Analysing the code with pylint - if: ${{ steps.check_req.outputs.requirements_exists == 'true' }} - run: | - isort $(git ls-files '*.py') --check-only --diff - pylint $(git ls-files '*.py') - - name: Install dependencies by uv - id: install_deps_uv - if: ${{ steps.check_req.outputs.pyproject_exists == 'true' }} - run: | - python -m pip install uv - uv sync - uv pip install pylint - uv pip install isort - name: Analysing the code with pylint - if: ${{ steps.check_req.outputs.pyproject_exists == 'true' }} run: | - uv run isort $(git ls-files '*.py') --check-only --diff - uv run pylint $(git ls-files '*.py') + grep -r -l "# LINT_ME" **/*.py|xargs pylint \ No newline at end of file diff --git a/.pylintrc b/.pylintrc index 7e930ac..01d1801 100644 --- a/.pylintrc +++ b/.pylintrc @@ -31,7 +31,7 @@ extension-pkg-allow-list= # be loaded. Extensions are loading into the active Python interpreter and may # run arbitrary code. 
(This is an alternative name to extension-pkg-allow-list # for backward compatibility.) -extension-pkg-whitelist=cv2 +extension-pkg-whitelist= # Return non-zero exit code if any of these messages/categories are detected, # even if score is above --fail-under value. Syntax same as enable. Messages @@ -59,15 +59,16 @@ ignore-paths= # Emacs file locks ignore-patterns=^\.# -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis). It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules=cv2 +# List of module names for which member attributes should not be checked and +# will not be imported (useful for modules/projects where namespaces are +# manipulated during runtime and thus existing member attributes cannot be +# deduced by static analysis). It supports qualified module names, as well as +# Unix pattern matching. +ignored-modules= # Python code to execute, usually for sys.path manipulation such as # pygtk.require(). -init-hook='import sys; sys.path.append(".")' +#init-hook= # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the # number of processors available to use, and will cap the count on Windows to @@ -86,9 +87,13 @@ load-plugins= # Pickle collected data for later comparisons. persistent=yes +# Resolve imports to .pyi stubs if available. May reduce no-member messages and +# increase not-an-iterable messages. +prefer-stubs=no + # Minimum Python version to use for version dependent checks. Will default to # the version used to run pylint. -py-version=3.10 +py-version=3.12 # Discover python modules and packages in the file system subtree. recursive=no @@ -99,10 +104,6 @@ recursive=no # source root. 
source-roots= -# When enabled, pylint would attempt to guess common misconfiguration and emit -# user-friendly hints instead of false-positive error messages. -suggestion-mode=yes - # Allow loading of arbitrary C extensions. Extensions are imported into the # active Python interpreter and may run arbitrary code. unsafe-load-any-extension=no @@ -229,6 +230,11 @@ name-group= # not require a docstring. no-docstring-rgx=^_ +# Regular expression matching correct parameter specification variable names. +# If left empty, parameter specification variable names will be checked with +# the set naming style. +#paramspec-rgx= + # List of decorators that produce properties, such as abc.abstractproperty. Add # to this list to register other decorators that produce valid properties. # These decorators are taken in consideration only for invalid-name. @@ -242,13 +248,17 @@ property-classes=abc.abstractproperty # variable names will be checked with the set naming style. #typevar-rgx= +# Regular expression matching correct type variable tuple names. If left empty, +# type variable tuple names will be checked with the set naming style. +#typevartuple-rgx= + # Naming style matching correct variable names. variable-naming-style=snake_case # Regular expression matching correct variable names. Overrides variable- # naming-style. If left empty, variable names will be checked with the set # naming style. -variable-rgx=(_?[a-z][A-Za-z0-9]{0,30})|([A-Z0-9]{1,30}) +#variable-rgx= [CLASSES] @@ -285,10 +295,10 @@ exclude-too-few-public-methods= ignored-parents= # Maximum number of arguments for function / method. -max-args=7 +max-args=5 # Maximum number of attributes for a class (see R0902). -max-attributes=20 +max-attributes=7 # Maximum number of boolean expressions in an if statement (see R0916). max-bool-expr=5 @@ -302,6 +312,9 @@ max-locals=15 # Maximum number of parents for a class (see R0901). max-parents=7 +# Maximum number of positional arguments for function / method. 
+max-positional-arguments=5 + # Maximum number of public methods for a class (see R0904). max-public-methods=20 @@ -309,10 +322,10 @@ max-public-methods=20 max-returns=6 # Maximum number of statements in function / method body. -max-statements=300 +max-statements=50 # Minimum number of public methods for a class (see R0903). -min-public-methods=1 +min-public-methods=2 [EXCEPTIONS] @@ -336,11 +349,13 @@ indent-after-paren=4 # tab). indent-string=' ' -# Maximum number of characters on a single line. -max-line-length=150 +# Maximum number of characters on a single line. Pylint's default of 100 is +# based on PEP 8's guidance that teams may choose line lengths up to 99 +# characters. +max-line-length=100 # Maximum number of lines in a module. -max-module-lines=2000 +max-module-lines=1000 # Allow the body of a class to be on the same line as the declaration if body # contains single statement. @@ -421,11 +436,16 @@ confidence=HIGH, # --enable=similarities". If you want to run only the classes checker, but have # no Warning level messages displayed, use "--disable=all --enable=classes # --disable=W". -disable=too-many-arguments, - too-many-locals, - too-many-branches, - protected-access - +disable=raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + use-implicit-booleaness-not-comparison-to-string, + use-implicit-booleaness-not-comparison-to-zero, # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option @@ -433,6 +453,13 @@ disable=too-many-arguments, # it should appear only once). See also the "--disable" option for examples. enable= +[MASTER] +# A comma-separated list of package or module names +# from where C extensions may#be loaded. Extensions +# are loading into the active Python interpreter and +# may run arbitrary code . 
+extension-pkg-allow-list=torch,transformers + [METHOD_ARGS] @@ -443,9 +470,13 @@ timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests. [MISCELLANEOUS] +# Whether or not to search for fixme's in docstrings. +check-fixme-in-docstring=no + # List of note tags to take in consideration, separated by a comma. notes=FIXME, - XXX + XXX, + TODO # Regular expression of note tags to take in consideration. notes-rgx= @@ -465,7 +496,7 @@ never-returning-functions=sys.exit,argparse.parse_error # Let 'consider-using-join' be raised when the separator to join on would be # non-empty (resulting in expected fixes of the type: ``"- " + " - # ".join(items)``) -# suggest-join-with-non-empty-separator=yes +suggest-join-with-non-empty-separator=yes [REPORTS] @@ -481,10 +512,10 @@ evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor # used to format the message information. See doc for all details. msg-template= -# Set the output format. Available formats are: text, parseable, colorized, -# json2 (improved json format), json (old json format) and msvs (visual -# studio). You can also give a reporter class, e.g. -# mypackage.mymodule.MyReporterClass. +# Set the output format. Available formats are: 'text', 'parseable', +# 'colorized', 'json2' (improved json format), 'json' (old json format), msvs +# (visual studio) and 'github' (GitHub actions). You can also give a reporter +# class, e.g. mypackage.mymodule.MyReporterClass. #output-format= # Tells whether to display a full report or only the messages. @@ -582,11 +613,14 @@ ignored-checks-for-mixins=no-member, # qualified names. ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace + +ignored-modules=torch,torch.distributed,transformers,transformers.hf_argparser + # Show a hint with possible names when a member name was not found. The aspect # of finding the hint is based on edit distance. 
missing-member-hint=yes -# The minimum edit distance a name should have in order to be considered a +# The maximum edit distance a name should have in order to be considered a # similar match for a missing member name. missing-member-hint-distance=1 @@ -630,4 +664,4 @@ init-import=no # List of qualified module names which can have objects that can redefine # builtins. -redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io \ No newline at end of file +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..8cc1b46 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10.15 diff --git a/README.md b/README.md index 0fcb5dd..4926ca2 100644 --- a/README.md +++ b/README.md @@ -1 +1,127 @@ -# Coming Soon... +

WAM-Diff: A Masked Diffusion VLA Framework with MoE and Online Reinforcement Learning for Autonomous Driving

+
+ Mingwang Xu1*  + Jiahao Cui1*  + Feipeng Cai2*  + Hanlin Shang1*  + Zhihao Zhu1  + Shan Luan1  +
+
+ Yifang Xu1  + Neng Zhang2  + Yaoyi Li2  + Jia Cai2  + Siyu Zhu1  +
+ +
+ 1Fudan University  2Yinwang Intelligent Technology Co., Ltd  +
+ + +## 📅️ Roadmap + +| Status | Milestone | ETA | +| :----: | :----------------------------------------------------------------------------------------------------: | :--------: | +| 🚀 | **[Releasing the inference source code](https://github.com/fudan-generative-vision/WAM-Diff)** | 2025.12.21 | +| 🚀 | **[Releasing the training scripts](https://github.com/fudan-generative-vision/WAM-Diff)** | 2025.12.21 | +| 🚀 | **[Pretrained models on Huggingface](https://huggingface.co/fudan-generative-ai/WAM-Diff)** | TBD | + + +## 🔧️ Framework +![framework](assets/main_arch.png) + +## 🏆 Qualitative Results on NAVSIM +### NAVSIM-v1 benchmark results +
+ navsim-v1 +
+ +### NAVSIM-v2 benchmark results +
+navsim-v2 +
+ + + +## Quick Inference Demo +The WAM-Diff will be available on Hugging Face Hub soon. To quickly test the model, follow these simple steps: + +1. **Clone the repository** + ```bash + git clone https://github.com/fudan-generative-vision/WAM-Diff + cd WAM-Diff + ``` +2. **Initialize the environment** + If you prefer conda, run the environment setup script to install necessary dependencies: + ```bash + bash init_env.sh + ``` + Or you can use uv to create the environment: + ```bash + uv venv && uv sync + ``` +3. **Prepare the Model** + Download the pretrained WAM-Diff model from Hugging Face (pending release) to the `./model/WAM-Diff` directory: + ``` + https://huggingface.co/fudan-generative-ai/WAM-Diff + ``` + Download the pretrained Siglip2 model from Hugging Face to the `./model/siglip2-so400m-patch14-384` directory: + ``` + https://huggingface.co/google/siglip2-so400m-patch14-384 + ``` + + +3. **Run the demo script** + Execute the demo script to test WAM-Diff on an example image: + ```bash + bash inf.sh + ``` + +## Training +To fine-tune WAM-Diff, please follow these steps: +1. **Set Up the Environment** + Follow the same environment setup steps as in the Quick Inference Demo section. +2. **Prepare the Data** +Prepare your training dataset in JSON format like + ```json + [ + { + "image": ["path/to/image1.png"], + "conversations": [ + { + "from": "human", + "value": "Here is front views of a driving vehicle:\n\nThe navigation information is: straight\nThe current position is (0.00,0.00)\nCurrent velocity is: (13.48,-0.29) and current accelerate is: (0.19,0.05)\nPredict the optimal driving action for the next 4 seconds with 8 new waypoints." + }, + { + "from": "gpt", + "value": "6.60,-0.01,13.12,-0.03,19.58,-0.04,25.95,-0.03,32.27,-0.03,38.56,-0.05,44.88,-0.06,51.16,-0.09" + } + ] + }, + ... + ] + ``` +3. 
**Run the Training Script** + Execute the training script with the following command: + ```bash + cd train + bash ./scripts/llada_v_finetune.sh + ``` + +## 📝 Citation + +If you find our work useful for your research, please consider citing the paper: + +``` +@article{xu2025wam, + title={WAM-Diff: A Masked Diffusion VLA Framework with MoE and Online Reinforcement Learning for Autonomous Driving}, + author={Xu, Mingwang and Cui, Jiahao and Cai, Feipeng and Shang, Hanlin and Zhu, Zhihao and Luan, Shan and Xu, Yifang and Zhang, Neng and Li, Yaoyi and Cai, Jia and others}, + journal={arXiv preprint arXiv:2512.11872}, + year={2025} +} +``` + +## 🤗 Acknowledgements +We gratefully acknowledge the contributors to the [LLaDA-V](https://github.com/ML-GSAI/LLaDA-V), repositories, whose commitment to open source has provided us with their excellent codebases and pretrained models. \ No newline at end of file diff --git a/assets/image.png b/assets/image.png new file mode 100644 index 0000000..3d1b68c Binary files /dev/null and b/assets/image.png differ diff --git a/assets/main_arch.png b/assets/main_arch.png new file mode 100644 index 0000000..2b3f8e4 Binary files /dev/null and b/assets/main_arch.png differ diff --git a/assets/navsim-v1.png b/assets/navsim-v1.png new file mode 100644 index 0000000..8ca33c4 Binary files /dev/null and b/assets/navsim-v1.png differ diff --git a/assets/navsim-v2.png b/assets/navsim-v2.png new file mode 100644 index 0000000..26f5b62 Binary files /dev/null and b/assets/navsim-v2.png differ diff --git a/envs.yml b/envs.yml new file mode 100644 index 0000000..d3215ed --- /dev/null +++ b/envs.yml @@ -0,0 +1,7 @@ +name: WAM-Diff +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - python=3.10 diff --git a/inf.sh b/inf.sh new file mode 100644 index 0000000..2afc538 --- /dev/null +++ b/inf.sh @@ -0,0 +1 @@ +torchrun --master_addr=127.0.0.1 --master_port=12346 --nproc-per-node=1 ./train/generate_demo_navsim_moe.py --pretrained_path 
path_to_WAM-Diff_ckpt \ No newline at end of file diff --git a/init_env.sh b/init_env.sh new file mode 100644 index 0000000..2bd26e4 --- /dev/null +++ b/init_env.sh @@ -0,0 +1,48 @@ +conda env create -f envs.yml +conda activate WAM-Diff + +#!/bin/bash + +# log file for failed installations +FAIL_LOG="failed_packages.log" + +# rm existing log file +> $FAIL_LOG + +while IFS= read -r line; do + # skip empty lines and comments + if [[ -z "$line" || "$line" =~ ^# ]]; then + continue + fi + + echo "=====================================" + echo "Installing: $line" + echo "=====================================" + + # Execute installation command + pip install $line $PIP_MIRROR + + # Check if installation was successful + if [ $? -ne 0 ]; then + echo "Installation failed: $line" >> $FAIL_LOG + echo "=====================================" + echo "⚠️ $line installation failed, logged to $FAIL_LOG" + echo "=====================================" + else + echo "=====================================" + echo "✅ $line installed successfully" + echo "=====================================" + fi + sleep 1 + +done < "requirements.txt" + +echo "=====================================" +echo "Installation complete!" +if [ -s $FAIL_LOG ]; then + echo "❌ The following packages failed to install:" + cat $FAIL_LOG +else + echo "✅ All packages installed successfully!" 
+fi +echo "=====================================" \ No newline at end of file diff --git a/model/.gitkeep b/model/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8715476 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,65 @@ +[project] +name = "wam-diff" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = "==3.10.15" +dependencies = [ + "accelerate==0.29.1", + "aioboto3>=15.5.0", + "aiofiles>=25.1.0", + "bokeh==2.4.3", + "casadi>=3.7.2", + "control==0.9.1", + "datasets==2.16.1", + "deepspeed==0.14.4", + "einops>=0.8.1", + "fiona>=1.10.1", + "geopandas>=0.12.1", + "guppy3==3.1.2", + "hydra-core==1.2.0", + "imageio>=2.37.2", + "joblib>=1.5.2", + "matplotlib>=3.10.8", + "nest-asyncio>=1.6.0", + "notebook>=7.5.0", + "numpy==1.23.4", + "nuplan-devkit", + "opencv-python==4.9.0.80", + "pandas>=2.3.3", + "pillow>=12.0.0", + "positional-encodings==6.0.1", + "protobuf==3.20.0", + "psutil>=7.1.3", + "pyarrow>=22.0.0", + "pyinstrument>=5.1.1", + "pylint>=4.0.4", + "pynvml==12.0.0", + "pyogrio>=0.12.1", + "pyquaternion>=0.9.5", + "pytest>=9.0.2", + "pytorch-lightning==2.2.1", + "rasterio>=1.3.11", + "retry>=0.9.2", + "rtree>=1.4.1", + "scikit-learn==1.2.2", + "scipy>=1.13.1", + "selenium>=4.39.0", + "setuptools==65.5.1", + "shapely>=2.0.0", + "sqlalchemy==1.4.27", + "sympy>=1.14.0", + "tensorboard==2.16.2", + "timm>=1.0.22", + "torch==2.8.0", + "torchvision==0.23.0", + "tornado>=6.5.3", + "tqdm>=4.67.1", + "transformers==4.43.0", + "tyro==0.9.28", + "ujson>=5.11.0", +] +[[tool.uv.index]] +url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" + + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0404460 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,203 @@ +accelerate==0.29.1 +aiohappyeyeballs==2.6.1 +aiohttp==3.12.15 +aiosignal==1.4.0 +anls==0.0.2 +annotated-types==0.7.0 +anyio==4.10.0 
+async-timeout==5.0.1 +attrs==25.3.0 +av==15.0.0 +bitsandbytes==0.45.3 +black==24.1.0 +blis==1.3.0 +braceexpand==0.1.7 +capture-metric==0.1.13 +catalogue==2.0.10 +certifi==2025.8.3 +cfgv==3.4.0 +chardet==5.2.0 +charset-normalizer==3.4.3 +click==8.2.2 +cloudpathlib==0.21.1 +colorama==0.4.6 +confection==0.1.5 +cymem==2.0.11 +dataproperty==1.1.0 +datasets==2.16.1 +debugpy==1.8.17 +decord==0.6.0 +deepspeed==0.14.4 +dill==0.3.7 +discosg==0.0.5 +distlib==0.4.0 +distro==1.9.0 +docstring-parser==0.17.0 +einops==0.6.1 +einops-exts==0.0.4 +et-xmlfile==2.0.0 +evaluate==0.4.5 +exceptiongroup==1.3.0 +factualscenegraph==0.7.3 +fastapi==0.116.1 +filelock==3.19.1 +frozenlist==1.7.0 +fsspec==2023.10.0 +ftfy==6.3.1 +gitdb==4.0.12 +gitpython==3.1.45 +gradio-client==0.2.9 +h11==0.14.0 +hf-transfer==0.1.9 +hf-xet==1.1.7 +hjson==3.1.0 +httpcore==0.16.3 +httpx==0.24.0 +huggingface-hub==0.34.4 +identify==2.6.13 +idna==3.10 +imageio==2.37.0 +imageio-ffmpeg==0.6.0 +isort==5.13.2 +jinja2==3.1.5 +jiter==0.10.0 +joblib==1.5.1 +jsonlines==4.0.0 +langcodes==3.5.0 +language-data==1.3.0 +lazy-import==0.2.2 +levenshtein==0.27.1 +loguru==0.7.3 +lxml==6.0.0 +marisa-trie==1.3.0 +markdown-it-py==4.0.0 +markupsafe==3.0.2 +mbstrdecoder==1.1.4 +mdurl==0.1.2 +mpmath==1.3.0 +multidict==6.6.4 +multiprocess==0.70.15 +murmurhash==1.0.13 +mypy-extensions==1.1.0 +networkx==3.4.2 +ninja==1.13.0 +nltk==3.9.1 +nodeenv==1.9.1 +numexpr==2.11.0 +nvidia-cublas-cu12 +nvidia-cuda-cupti-cu12 +nvidia-cuda-nvrtc-cu12 +nvidia-cuda-runtime-cu12 +nvidia-cudnn-cu12 +nvidia-cufft-cu12 +nvidia-cufile-cu12 +nvidia-curand-cu12 +nvidia-cusolver-cu12 +nvidia-cusparse-cu12 +nvidia-cusparselt-cu12 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +open-clip-torch==3.1.0 +openai==1.100.0 +openpyxl==3.1.5 +packaging==25.0 +pandas==2.3.1 +pathlib2==2.3.7.post1 +pathspec==0.12.1 +pathvalidate==3.3.1 +peft==0.9.0 +pillow==11.3.0 +platformdirs==4.3.8 +portalocker==3.2.0 
+pre-commit==4.3.0 +preshed==3.0.10 +propcache==0.3.2 +protobuf==3.20.0 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pyarrow==21.0.0 +pyarrow-hotfix==0.7 +pybind11==3.0.0 +pycocoevalcap==1.2 +pycocotools==2.0.10 +pycryptodome==3.23.0 +pydantic==2.9.2 +pydantic-core==2.33.2 +pygments==2.19.2 +pynvml==12.0.0 +pytablewriter==1.2.1 +python-dateutil==2.9.0.post0 +pytz==2025.2 +pywsd==1.2.5 +pyyaml==6.0.2 +rapidfuzz==3.13.0 +regex==2025.7.34 +requests==2.32.4 +rfc3986==1.5.0 +rich==14.1.0 +rouge==1.0.1 +sacrebleu==2.5.1 +safetensors==0.6.2 +scikit-learn==1.7.1 +scipy==1.15.3 +sentence-transformers==5.1.0 +sentencepiece==0.1.99 +sentry-sdk==2.35.0 +shellingham==1.5.4 +shortuuid==1.0.13 +shtab==1.7.2 +six==1.17.0 +smart-open==7.3.0.post1 +smmap==5.0.2 +sniffio==1.3.1 +spacy==3.8.7 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +sqlitedict==2.1.0 +srsly==2.5.1 +starlette==0.47.2 +sympy==1.13.1 +tabledata==1.3.4 +tabulate==0.9.0 +tcolorpy==0.1.7 +tenacity==8.3.0 +tensorboardx==2.6.4 +thinc==8.3.6 +threadpoolctl==3.6.0 +tiktoken==0.11.0 +timm==1.0.19 +tokenizers==0.19.1 +tomli==2.2.1 +torch==2.6.0 +torchvision==0.21.0 +tqdm==4.67.1 +transformers==4.43.0 +transformers-stream-generator==0.0.5 +triton==3.2.0 +typeguard==4.4.4 +typepy==1.3.4 +typer==0.16.1 +typing-extensions==4.14.1 +typing-inspection==0.4.1 +tyro==0.9.28 +tzdata==2025.2 +urllib3==2.0.0 +uvicorn==0.35.0 +virtualenv==20.34.0 +wandb==0.21.1 +wasabi==1.1.3 +wcwidth==0.2.13 +weasel==0.4.1 +webdataset==1.0.2 +websockets==15.0.1 +wn==0.0.23 +wrapt==1.17.3 +xxhash==3.5.0 +yarl==1.20.1 +yt-dlp==2025.8.11 +zss==1.2.0 +zstandard==0.24.0 +opencv-python==4.12.0.88 +opencv-python-headless==4.12.0.88 \ No newline at end of file diff --git a/train/LICENSE b/train/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/train/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/train/generate_demo_navsim_moe.py b/train/generate_demo_navsim_moe.py new file mode 100644 index 0000000..8f75dbf --- /dev/null +++ b/train/generate_demo_navsim_moe.py @@ -0,0 +1,352 @@ +# LINT_ME +"""Generate demo navigation predictions using NavSim-MOE model in MoE setting.""" + +from __future__ import annotations + +import copy +import json +import os +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import List, Tuple + +# Remove these if not needed in your environment. 
+import llava.s3_ops as st +import torch +import torch.distributed as dist + +# llava / model imports +from llava.cache import dLLMCache, dLLMCacheConfig +from llava.conversation import conv_templates +from llava.hooks import register_cache_LLaDA_V +from llava.hooks.fast_dllm_hook import register_fast_dllm_hook +from llava.mm_utils import IMAGE_TOKEN_INDEX, process_images, tokenizer_image_token +from llava.model.builder import load_pretrained_model +from transformers.hf_argparser import HfArgumentParser + + +# ========================= +# Dataclass Args (HF-style) +# ========================= +@dataclass +class DataArgs: + """ + Data-related arguments. + """ + navsim_root: str = field(default="None") + steps_fut: int = 8 + steps_hist: int = 4 + camera_order: List[str] = field( + default_factory=lambda: [ + "CAM_FRONT", + "CAM_FRONT_LEFT", + "CAM_FRONT_RIGHT", + "CAM_BACK", + "CAM_BACK_LEFT", + "CAM_BACK_RIGHT", + ] + ) + + +@dataclass +class ModelArgs: # pylint: disable=R0902 + """Model Configuration Arguments.""" + pretrained_path: str + conv_template: str = "llava_llada" + use_fast_dllm: bool = True + use_dllm_cache: bool = False + prompt_interval_steps: int = 25 + gen_interval_steps: int = 7 + transfer_ratio: float = 0.25 + token_gen_steps: int = 32 + token_gen_length: int = 32 + token_block_length: int = 32 + + +@dataclass +class RuntimeArgs: + """Runtime / Distributed Training Arguments.""" + save_jsons_folder: str = "output/jsons" + device_type: str = "cuda" # cuda|npu|cpu + backend: str = "nccl" # nccl|gloo + merge_after: bool = False + debug_enable: bool = False + + +# ========================= +# DDP helpers +# ========================= +def setup_distributed(device_type: str = "npu", backend: str = "hccl"): + """ + Initialize torch.distributed, pin local rank to the right device, + and return (rank, world_size, local_rank, device_str). 
+ """ + # Figure out ranks + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + rank = int(os.environ.get("RANK", 0)) + world_size = int(os.environ.get("WORLD_SIZE", 1)) + + # Pin device + device_type = device_type.lower() + if device_type == "cuda": + if not torch.cuda.is_available(): + raise RuntimeError("CUDA requested but not available.") + torch.cuda.set_device(local_rank) + device_str = f"cuda:{local_rank}" + elif device_type == "npu": + torch.npu.set_device(local_rank) + device_str = f"npu:{local_rank}" + else: + device_str = "cpu" + + # Init process group + if dist.is_available() and not dist.is_initialized(): + dist.init_process_group(backend=backend) + + return rank, world_size, local_rank, device_str + + +def get_rank_world() -> Tuple[int, int]: + """Get the current process rank and world size for distributed training.""" + if dist.is_available() and dist.is_initialized(): + return dist.get_rank(), dist.get_world_size() + return 0, 1 + + +def barrier(): + """DDP barrier.""" + if dist.is_available() and dist.is_initialized(): + dist.barrier() + + +def is_rank0() -> bool: + """Check if current process is rank 0.""" + r, _ = get_rank_world() + return r == 0 + + +def get_local_rank() -> int: + """Get local rank from env vars.""" + if "LOCAL_RANK" in os.environ: + return int(os.environ["LOCAL_RANK"]) + if "RANK" in os.environ: + num = torch.cuda.device_count() if torch.cuda.is_available() else 1 + return int(os.environ["RANK"]) % max(1, num) + return 0 + + +def pick_device(device_type: str, local_rank: int) -> str: + """Pick device string based on type and local rank.""" + device_type = device_type.lower() + if device_type == "cuda": + if not torch.cuda.is_available(): + raise RuntimeError("CUDA requested but not available.") + return f"cuda:{local_rank}" + if device_type == "npu": + # Adjust if using a different NPU stack + return f"npu:{local_rank}" + if device_type == "cpu": + return "cpu" + raise ValueError( + f"Unsupported --device_type '{device_type}'. 
Use one of: cuda|npu|cpu." + ) + + +# ========================= +# Arg parsing (HF) +# ========================= +def parse_args() -> tuple[DataArgs, ModelArgs, RuntimeArgs]: + """ + Supports: + - CLI flags + - Config file (JSON/TOML/YAML): eval_nuscenes_ddp.py cfg.json + - Mixed (file + overrides) + """ + parser = HfArgumentParser((DataArgs, ModelArgs, RuntimeArgs)) + data_args, model_args, runtime_args = parser.parse_args_into_dataclasses( + return_remaining_strings=False + ) + runtime_args.device_type = runtime_args.device_type.lower() + runtime_args.backend = runtime_args.backend.lower() + return data_args, model_args, runtime_args + + +def _int_stem(p: Path) -> int: + """Get integer from path stem.""" + try: + return int(p.stem) + except ValueError: # narrow exception (fixes W0718) + return 1_000_000_000 + + +def merge_rank_jsons(save_jsons_folder: str, world_size: int) -> None: + """Merge per-rank JSON files into a single JSONL file.""" + save_dir = Path(save_jsons_folder) + merged_path = save_dir / "merged.jsonl" + + numbered: list[tuple[int, Path]] = [] + for r in range(world_size): + rank_dir = save_dir / f"rank_{r}" + if not rank_dir.exists(): + continue + + for p in rank_dir.glob("*.json"): + idx = _int_stem(p) + if idx is None: + continue + numbered.append((idx, p)) + + numbered.sort(key=lambda t: t[0]) + + count = 0 + with st.open_file(str(merged_path), "w") as out_f: + for _, p in numbered: + with st.open_file(str(p), "r") as in_f: + out_f.write(in_f.read().rstrip("\n") + "\n") + count += 1 + + print(f"[Rank 0] merged {count} records -> {merged_path}") + +# ========================= +# Main +# ========================= +def main(): # pylint: disable=R0912, R0914, R0915 + """Main function.""" + data_args, model_args, runtime_args = parse_args() + # DDP init + rank, world_size, _, device_str = setup_distributed( + device_type=runtime_args.device_type, + backend=runtime_args.backend, + ) + + # Device & IO + device_map = device_str + + 
st.makedirs(runtime_args.save_jsons_folder) + rank_out_dir = Path(runtime_args.save_jsons_folder) / f"rank_{rank}" + st.makedirs(str(rank_out_dir)) + + if is_rank0(): + print( + f"[DDP] world_size={world_size} | saving to: {runtime_args.save_jsons_folder}" + ) + # Save resolved config + with st.open_file( + str(Path(runtime_args.save_jsons_folder) / "resolved_args.json"), "w" + ) as f: + json.dump( + { + "DataArgs": vars(data_args), + "ModelArgs": vars(model_args), + "RuntimeArgs": vars(runtime_args), + }, + f, + indent=2, + ) + + # Load model (per rank) + model_name = "llava_llada" + tokenizer, model, image_processor, _ = load_pretrained_model( + model_args.pretrained_path, + None, + model_name, + attn_implementation="sdpa", + device_map=device_map, + ) + model = model.to(device_str) + model.config.image_aspect_ratio = "anyres_max_2" + model.eval() + + # Optional speed-ups + if model_args.use_fast_dllm: + register_fast_dllm_hook(model) + if is_rank0(): + print("[Fast-dLLM] enabled") + elif model_args.use_dllm_cache: + dLLMCache.new_instance( + **asdict( + dLLMCacheConfig( + prompt_interval_steps=model_args.prompt_interval_steps, + gen_interval_steps=model_args.gen_interval_steps, + transfer_ratio=model_args.transfer_ratio, + ) + ) + ) + register_cache_LLaDA_V(model, "model.layers") + if is_rank0(): + print("[dLLM-Cache] enabled") + else: + if is_rank0(): + print("[Cache] disabled") + + torch.set_grad_enabled(False) + + surrounding_views = ["./assets/image.png"] + + images = [st.image_open(p) for p in surrounding_views] + + if len(images) > 1: + model.config.image_aspect_ratio = "pad" + + image_tensor = process_images(images, image_processor, model.config) + image_tensor = [ + _img.to(dtype=torch.float16, device=device_str) for _img in image_tensor + ] + image_sizes = [img.size for img in images] + + # Prompt + conv = copy.deepcopy(conv_templates[model_args.conv_template]) + question = """Here is front views of a driving vehicle: + \nThe navigation information is: 
straight + The current position is (0.00,0.00) + Current velocity is: (8.92,-0.15) and current acceleration is: (-1.25,-0.01) + Predict the optimal driving action for the next 4 seconds with 8 new waypoints. + """ + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], None) + prompt_question = conv.get_prompt() + + input_ids = ( + tokenizer_image_token( + prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" + ) + .unsqueeze(0) + .to(device_str) + ) + + # Generate + cont, track_x = model.generate( + input_ids, + images=image_tensor, + image_sizes=image_sizes, + steps=model_args.token_gen_steps, + gen_length=model_args.token_gen_length, + block_length=model_args.token_block_length, + tokenizer=tokenizer, + stopping_criteria=["<|eot_id|>"], + prefix_refresh_interval=32, + threshold=1, + ) + text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=False) + print(text_outputs) + + with open("output.txt", "w", encoding="utf-8") as f: + for x_step in track_x: + text_outputs = tokenizer.batch_decode(x_step, skip_special_tokens=False) + + # Write each decoded string on its own line + for text in text_outputs: + f.write(text + "\n") + + # Add separator after each step + f.write("--------\n") + + barrier() + + # Merge (rank 0) + if is_rank0() and runtime_args.merge_after: + merge_rank_jsons(runtime_args.save_jsons_folder, world_size) + + +if __name__ == "__main__": + main() diff --git a/train/init_env.sh b/train/init_env.sh new file mode 100644 index 0000000..0df7957 --- /dev/null +++ b/train/init_env.sh @@ -0,0 +1,2 @@ +pip install -e ".[train]" +pip install webdataset diff --git a/train/llava/__init__.py b/train/llava/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/train/llava/cache/Cache.py b/train/llava/cache/Cache.py new file mode 100644 index 0000000..282bd10 --- /dev/null +++ b/train/llava/cache/Cache.py @@ -0,0 +1,87 @@ +# pylint: disable=C0114,C0115,C0116,C0103 + +from collections import 
defaultdict + +import torch + + +class Singleton(type): + _instances = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) + return cls._instances[cls] + + +class dLLMCache(metaclass=Singleton): + gen_interval_steps: int + prompt_interval_steps: int + cfg_interval_steps: int + prompt_length: int + transfer_ratio: float + __cache: defaultdict + __step_counter: defaultdict + cache_type: str + + @classmethod + def new_instance( + cls, + prompt_interval_steps: int = 1, + gen_interval_steps: int = 1, + cfg_interval_steps: int = 1, + transfer_ratio: float = 0.0, + ) -> "dLLMCache": + ins = cls() + setattr(ins, "prompt_interval_steps", prompt_interval_steps) + setattr(ins, "gen_interval_steps", gen_interval_steps) + setattr(ins, "cfg_interval_steps", cfg_interval_steps) + setattr(ins, "transfer_ratio", transfer_ratio) + ins.init() + return ins + + def init(self) -> None: + self.__cache = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + ) + self.__step_counter = defaultdict(lambda: defaultdict(lambda: 0)) + + def reset_cache(self, prompt_length: int = 0) -> None: + self.init() + torch.cuda.empty_cache() + self.prompt_length = prompt_length + self.cache_type = "no_cfg" + + def set_cache( + self, layer_id: int, feature_name: str, features: torch.Tensor, cache_type: str + ) -> None: + self.__cache[self.cache_type][cache_type][layer_id][feature_name] = { + 0: features + } + + def get_cache( + self, layer_id: int, feature_name: str, cache_type: str + ) -> torch.Tensor: + output = self.__cache[self.cache_type][cache_type][layer_id][feature_name][0] + return output + + def update_step(self, layer_id: int) -> None: + self.__step_counter[self.cache_type][layer_id] += 1 + + def refresh_gen(self) -> bool: + return (self.current_step - 1) % self.gen_interval_steps == 0 + + def refresh_prompt(self) -> bool: + return (self.current_step - 1) % 
self.prompt_interval_steps == 0 + + def refresh_cfg(self) -> bool: + return ( + self.current_step - 1 + ) % self.cfg_interval_steps == 0 or self.current_step <= 5 + + @property + def current_step(self) -> int: + return max(list(self.__step_counter[self.cache_type].values()), default=1) + + def __repr__(self): + return "USE dLLMCache" diff --git a/train/llava/cache/Config.py b/train/llava/cache/Config.py new file mode 100644 index 0000000..d48e835 --- /dev/null +++ b/train/llava/cache/Config.py @@ -0,0 +1,11 @@ +# pylint: disable=C0114,C0115,C0116,C0103 + +from dataclasses import dataclass + + +@dataclass +class dLLMCacheConfig: + prompt_interval_steps: int = 1 + gen_interval_steps: int = 1 + transfer_ratio: float = 0.0 + cfg_interval_steps: int = 1 diff --git a/train/llava/cache/__init__.py b/train/llava/cache/__init__.py new file mode 100644 index 0000000..a8ef4bf --- /dev/null +++ b/train/llava/cache/__init__.py @@ -0,0 +1,6 @@ +# pylint: disable=C0114,C0115,C0116 + +from .Cache import dLLMCache +from .Config import dLLMCacheConfig + +__all__ = ["dLLMCache", "dLLMCacheConfig"] diff --git a/train/llava/constants.py b/train/llava/constants.py new file mode 100644 index 0000000..be8cf02 --- /dev/null +++ b/train/llava/constants.py @@ -0,0 +1,12 @@ +CONTROLLER_HEART_BEAT_EXPIRATION = 30 +WORKER_HEART_BEAT_INTERVAL = 15 + +LOGDIR = "." 
+ +# Model Constants +IGNORE_INDEX = -100 +IMAGE_TOKEN_INDEX = -200 +DEFAULT_IMAGE_TOKEN = "" +DEFAULT_IMAGE_PATCH_TOKEN = "" +DEFAULT_IM_START_TOKEN = "" +DEFAULT_IM_END_TOKEN = "" diff --git a/train/llava/conversation.py b/train/llava/conversation.py new file mode 100644 index 0000000..365bb84 --- /dev/null +++ b/train/llava/conversation.py @@ -0,0 +1,396 @@ +import dataclasses +from enum import auto, Enum +from typing import List, Any, Union +import re +import base64 +from io import BytesIO +from PIL import Image +from transformers import AutoTokenizer + + +class SeparatorStyle(Enum): + """Different separator style.""" + + SINGLE = auto() + TWO = auto() + MPT = auto() + PLAIN = auto() + CHATML = auto() + LLAMA_2 = auto() + LLAMA_3 = auto() + QWEN = auto() + GEMMA = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that keeps all conversation history.""" + + system: str + roles: List[str] + messages: List[List[str]] + offset: int + sep_style: SeparatorStyle = SeparatorStyle.SINGLE + sep: str = "###" + sep2: str = None + version: str = "Unknown" + + tokenizer_id: str = "" + tokenizer: Any = None + # Stop criteria (the default one is EOS token) + stop_str: Union[str, List[str]] = None + # Stops generation if meeting any token in this list + stop_token_ids: List[int] = None + + skip_next: bool = False + + def get_prompt(self): + messages = self.messages + if len(messages) > 0 and type(messages[0][1]) is tuple: + messages = self.messages.copy() + init_role, init_msg = messages[0].copy() + init_msg = init_msg[0] + if "mmtag" in self.version: + init_msg = init_msg.replace("", "").strip() + messages[0] = (init_role, init_msg) + messages.insert(0, (self.roles[0], "")) + messages.insert(1, (self.roles[1], "Received.")) + elif not init_msg.startswith(""): + init_msg = init_msg.replace("", "").strip() + messages[0] = (init_role, "\n" + init_msg) + else: + messages[0] = (init_role, init_msg) + + if self.sep_style == SeparatorStyle.SINGLE: + ret = 
self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + self.sep + else: + ret += role + ":" + + elif self.sep_style == SeparatorStyle.TWO: + seps = [self.sep, self.sep2] + ret = self.system + seps[0] + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + + elif self.sep_style == SeparatorStyle.CHATML: + ret = "" if self.system == "" else self.system + self.sep + "\n" + for role, message in messages: + if message: + if type(message) is tuple: + message, images, _ = message + message = "" * len(images) + message + ret += role + "\n" + message + self.sep + "\n" + else: + ret += role + "\n" + return ret + + elif self.sep_style == SeparatorStyle.LLAMA_3: + if self.tokenizer is None: + if self.version == "llama_v3": + self.tokenizer = AutoTokenizer.from_pretrained( + "meta-llama/Meta-Llama-3-8B-Instruct", trust_remote_code=True + ) + elif self.version == "llava_llada": + self.tokenizer = AutoTokenizer.from_pretrained( + "model/LLaDA-V", trust_remote_code=True + ) + else: + raise ValueError( + "The tokenizer is not available. Make sure you have the necessary permissions." 
+ ) + chat_template_messages = [{"role": "system", "content": self.system}] + for role, message in messages: + if message: + if type(message) is tuple: + message, images = message + message = "" * len(images) + message + chat_template_messages.append({"role": role, "content": message}) + + # print(chat_template_messages) + return self.tokenizer.apply_chat_template( + chat_template_messages, tokenize=False, add_generation_prompt=True + ) + + elif self.sep_style == SeparatorStyle.MPT: + ret = self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + message + self.sep + else: + ret += role + + elif self.sep_style == SeparatorStyle.GEMMA: + ret = "" + for i, (role, message) in enumerate(messages): + assert role == self.roles[i % 2], ( + "Conversation should alternate user/assistant/user/assistant/..." + ) + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + message + self.sep + else: + ret += role + + elif self.sep_style == SeparatorStyle.LLAMA_2: + wrap_sys = ( + lambda msg: f"<>\n{msg}\n<>\n\n" if len(msg) > 0 else msg + ) + wrap_inst = lambda msg: f"[INST] {msg} [/INST]" + ret = "" + + for i, (role, message) in enumerate(messages): + if i == 0: + assert message, "first message should not be none" + assert role == self.roles[0], "first message should come from user" + if message: + if type(message) is tuple: + message, _, _ = message + if i == 0: + message = wrap_sys(self.system) + message + if i % 2 == 0: + message = wrap_inst(message) + ret += self.sep + message + else: + ret += " " + message + " " + self.sep2 + else: + ret += "" + ret = ret.lstrip(self.sep) + + elif self.sep_style == SeparatorStyle.PLAIN: + seps = [self.sep, self.sep2] + ret = self.system + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += message + seps[i % 2] + else: + ret += "" + else: + raise 
ValueError(f"Invalid style: {self.sep_style}") + + return ret + + def append_message(self, role, message): + self.messages.append([role, message]) + + def process_image( + self, image, image_process_mode, return_pil=False, image_format="PNG" + ): + if image_process_mode == "Pad": + + def expand2square(pil_img, background_color=(122, 116, 104)): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + image = expand2square(image) + elif image_process_mode in ["Default", "Crop"]: + pass + elif image_process_mode == "Resize": + image = image.resize((336, 336)) + else: + raise ValueError(f"Invalid image_process_mode: {image_process_mode}") + + if type(image) is not Image.Image: + image = Image.open(image).convert("RGB") + + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + max_len, min_len = 672, 448 + shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + if return_pil: + return image + else: + buffered = BytesIO() + image.save(buffered, format=image_format) + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + return img_b64_str + + def get_images(self, return_pil=False, return_path=False): + images = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + if type(image) != list: + image = [image] + for img in image: + if not return_path and self.is_image_file(img): + img = self.process_image( + img, 
image_process_mode, return_pil=return_pil + ) + else: + images.append(img) + return images + + def is_image_file(self, filename): + image_extensions = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"] + return any(filename.lower().endswith(ext) for ext in image_extensions) + + def is_video_file(self, filename): + video_extensions = [ + ".mp4", + ".mov", + ".avi", + ".mkv", + ".wmv", + ".flv", + ".mpeg", + ".mpg", + ] + return any(filename.lower().endswith(ext) for ext in video_extensions) + + def to_gradio_chatbot(self): + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + if type(image) != list: + image = [image] + if len(image) == 1: + msg = "\n" + msg.replace("", "").strip() + else: + msg = re.sub(r"()\n(?=)", r"\1 ", msg) + + img_str_list = [] + for img in image: + if self.is_image_file(img): + img_b64_str = self.process_image( + img, "Default", return_pil=False, image_format="JPEG" + ) + img_str = f'' + img_str_list.append(img_str) + elif self.is_video_file(img): + ret.append(((img,), None)) + + msg = msg.strip() + img_place_holder = "" + for img_str in img_str_list: + img_place_holder += f"{img_str}\n\n" + + if len(img_str_list) > 0: + msg = f"{img_place_holder}\n\n{msg}" + + if len(msg) > 0: + ret.append([msg, None]) + else: + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def copy(self): + return Conversation( + system=self.system, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + version=self.version, + ) + + def dict(self): + if len(self.get_images()) > 0: + return { + "system": self.system, + "roles": self.roles, + "messages": [ + [x, y[0] if type(y) is tuple else y] for x, y in self.messages + ], + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + return { + "system": self.system, + "roles": 
self.roles, + "messages": self.messages, + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + + +def safe_load_tokenizer(tokenizer_id): + try: + return AutoTokenizer.from_pretrained(tokenizer_id) + except Exception: + return None + + +conv_llava_plain = Conversation( + system="", + roles=("", ""), + messages=[], + offset=0, + sep_style=SeparatorStyle.PLAIN, + sep="\n", +) + +conv_llada_plain = Conversation( + system="", + roles=("", ""), + messages=[], + version="llada_plain", + offset=0, + sep_style=SeparatorStyle.LLAMA_3, + sep="\n", +) + +conv_llava_llada = Conversation( + system="You are a helpful language and vision assistant. " + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.", + roles=("user", "assistant"), + version="llava_llada", + messages=[], + offset=0, + sep="<|eot_id|>", + sep_style=SeparatorStyle.LLAMA_3, +) + + +default_conversation = conv_llava_llada +conv_templates = { + "default": conv_llava_llada, + "plain": conv_llava_plain, + "llada_plain": conv_llada_plain, + "llava_llada": conv_llava_llada, + "v0_plain": conv_llava_plain, +} + + +if __name__ == "__main__": + print(default_conversation.get_prompt()) diff --git a/train/llava/hooks/__init__.py b/train/llava/hooks/__init__.py new file mode 100644 index 0000000..01540b5 --- /dev/null +++ b/train/llava/hooks/__init__.py @@ -0,0 +1,5 @@ +# pylint: disable=C0114,C0115,C0116,C0103 + +from .cache_hook_LLaDA_V import register_cache_LLaDA_V + +__all__ = ["register_cache_LLaDA_V"] diff --git a/train/llava/hooks/cache_hook_LLaDA_V.py b/train/llava/hooks/cache_hook_LLaDA_V.py new file mode 100644 index 0000000..d993338 --- /dev/null +++ b/train/llava/hooks/cache_hook_LLaDA_V.py @@ -0,0 +1,867 @@ +# pylint: disable=C0114,C0115,C0116,C0103,R0913,R0917,R0914,R0912,R0915 + +import math +import types +from typing import Optional, Tuple + +import torch +from torch import nn + +from llava.cache 
import dLLMCache + + +# Helper functions from the new LLADa model (need to be accessible) +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1): + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def register_cache_LLaDA_V(model: nn.Module, tf_block_module_key_name: str) -> None: + """ + Registers cache hooks for a LLaDA-like model. + tf_block_module_key_name is typically 'model.layers' for LLaMA-style models. + """ + target_module_path = tf_block_module_key_name.split(".") + current_module = model + for part in target_module_path: + current_module = getattr(current_module, part) + + target_module: Optional[nn.ModuleList] = current_module + if target_module is None or not isinstance(target_module, nn.ModuleList): + raise ValueError(f"Could not find nn.ModuleList at {tf_block_module_key_name}") + + for layer_index, tf_block in enumerate(target_module): + setattr(tf_block, "layer_idx", layer_index) + + setattr(tf_block, "_old_forward", tf_block.forward) + tf_block.forward = types.MethodType(llada_cache_hook_feature, tf_block) + + setattr(tf_block.self_attn, "_old_forward_main", tf_block.self_attn.forward) + tf_block.self_attn.attention_forward_for_cache = types.MethodType( + llada_attention_hook_for_cache, tf_block.self_attn + ) + + setattr( + tf_block.self_attn.rotary_emb, + "_old_forward", + tf_block.self_attn.rotary_emb.forward, + ) + tf_block.self_attn.rotary_emb.forward = types.MethodType( + llada_RoPe_forward_hook, tf_block.self_attn.rotary_emb + ) + + +def llada_attention_hook_for_cache( + self, # self is LLaDAAttention instance + q_in_proj: torch.Tensor, # Renamed from q to clarify it's post-projection from cache_hook + k_in_proj: 
torch.Tensor, # Renamed from k + v_in_proj: torch.Tensor, # Renamed from v + attention_bias: Optional[torch.Tensor] = None, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + use_cache: bool = False, + q_index: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: + _ = layer_past + _ = use_cache + # q_in_proj, k_in_proj, v_in_proj are (Batch, SeqLen_specific_to_them, HiddenDim_model) + B, q_len_current_q = ( + q_in_proj.shape[0], + q_in_proj.shape[1], + ) # q_len_current_q is the length of the Q for this specific call + + q_num_heads = self.num_heads + k_num_heads = self.num_key_value_heads + v_num_heads = self.num_key_value_heads + head_dim = self.head_dim + + # Reshape q, k, v to (Batch, NumHeads, SeqLen, HeadDim) + q = q_in_proj.view(B, q_len_current_q, q_num_heads, head_dim).transpose(1, 2) + + k_seq_len_current_k = k_in_proj.shape[ + 1 + ] # k_seq_len_current_k is the length of K for this call (e.g., full context) + v_seq_len_current_v = v_in_proj.shape[1] + + k = k_in_proj.view(B, k_seq_len_current_k, k_num_heads, head_dim).transpose(1, 2) + v = v_in_proj.view(B, v_seq_len_current_v, v_num_heads, head_dim).transpose(1, 2) + + if hasattr(self, "rotary_emb"): + # q_index passed here should be for q_in_proj + q, k = self.rotary_emb(q, k, q_index=q_index) + + present = None + + k_repeated = repeat_kv(k, self.num_key_value_groups) + v_repeated = repeat_kv(v, self.num_key_value_groups) + + # q is (B, q_num_heads, q_len_current_q, head_dim) + # k_repeated is (B, q_num_heads, k_seq_len_current_k, head_dim) + + attn_weights = torch.matmul(q, k_repeated.transpose(2, 3)) / math.sqrt( + self.head_dim + ) + + if attention_bias is not None: + bias_q_dim = attention_bias.shape[-2] + bias_k_dim = attention_bias.shape[-1] + + sliced_attention_bias = attention_bias + if ( + q_len_current_q < bias_q_dim + ): # Q is a segment, assume it's the latest tokens + # This assumes the q segment is the last part of + # 
the full query sequence represented by the bias + sliced_attention_bias = attention_bias[:, :, -q_len_current_q:, :] + + if ( + k_seq_len_current_k < bias_k_dim + ): # K is a segment (less likely for full KV cache but possible) + # This assumes K is the latest part of the full key sequence in the bias + sliced_attention_bias = sliced_attention_bias[ + :, :, :, -k_seq_len_current_k: + ] + + # Final check on dimensions before adding + if ( + attn_weights.shape[-2] == sliced_attention_bias.shape[-2] + and attn_weights.shape[-1] == sliced_attention_bias.shape[-1] + ): + attn_weights = attn_weights + sliced_attention_bias + else: + if ( + sliced_attention_bias.shape[1] == 1 + and attn_weights.shape[1] == self.num_heads + ): # Mask has 1 head dim + attn_weights = ( + attn_weights + sliced_attention_bias + ) # Broadcast over heads + elif ( + sliced_attention_bias.shape[1] == self.num_heads + ): # Mask has same num heads + attn_weights = attn_weights + sliced_attention_bias + else: + raise RuntimeError( + f"Attention bias shape {sliced_attention_bias.shape} incompatible with " + f"attn_weights shape {attn_weights.shape} after slicing." 
+ ) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to( + q.dtype + ) + attn_weights = nn.functional.dropout( + attn_weights, p=self.attention_dropout, training=self.training + ) + att_output_heads = torch.matmul(attn_weights, v_repeated) + + # Reshape to (B, q_len_current_q, ModelHiddenDim) + att_output_heads = ( + att_output_heads.transpose(1, 2) + .contiguous() + .view(B, q_len_current_q, q_num_heads * head_dim) + ) + + output = self.o_proj(att_output_heads) + + return output, present + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand( + batch, num_key_value_heads, n_rep, slen, head_dim + ) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def llada_RoPe_forward_hook( + self_rope, + q_in: torch.Tensor, + k_in: torch.Tensor, + q_index: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, torch.Tensor]: + q_, k_ = q_in.float(), k_in.float() + + _, _, query_len_current, _ = q_.shape + _, _, key_len_current, _ = k_.shape + + max_pos_needed = key_len_current + if q_index is not None: + max_pos_needed = max( + max_pos_needed, int(q_index.max().item()) + 1 if q_index.numel() > 0 else 0 + ) + + max_pos_needed = max(max_pos_needed, query_len_current) + + if max_pos_needed == 0: + return q_in, k_in + + inv_freq_to_use = self_rope.inv_freq.to(q_.device) + t = torch.arange(max_pos_needed, device=q_.device, dtype=torch.float32) + if hasattr(self_rope, "scaling_factor"): + t = t / self_rope.scaling_factor + + freqs = torch.outer(t, inv_freq_to_use.float()) + emb = torch.cat((freqs, freqs), dim=-1) + + pos_cos_table = emb.cos() + pos_sin_table = emb.sin() + + if q_index is not None: + actual_q_indices = q_index[:, :query_len_current] + cos_q = pos_cos_table[actual_q_indices] + sin_q = 
pos_sin_table[actual_q_indices] + q_rotated = (q_ * cos_q.unsqueeze(1)) + (rotate_half(q_) * sin_q.unsqueeze(1)) + else: + q_indices = torch.arange(query_len_current, device=q_.device) + cos_q = pos_cos_table[q_indices].unsqueeze(0) + sin_q = pos_sin_table[q_indices].unsqueeze(0) + q_rotated = (q_ * cos_q.unsqueeze(1)) + (rotate_half(q_) * sin_q.unsqueeze(1)) + + k_indices = torch.arange(key_len_current, device=q_.device) + cos_k = pos_cos_table[k_indices].unsqueeze(0) + sin_k = pos_sin_table[k_indices].unsqueeze(0) + k_rotated = (k_ * cos_k.unsqueeze(1)) + (rotate_half(k_) * sin_k.unsqueeze(1)) + + return q_rotated.type_as(q_in), k_rotated.type_as(k_in) + + +def refresh_index( + new_features: torch.Tensor, + cached_features: torch.Tensor = None, + transfer_ratio: float = 0.5, + layer_id: int = 0, +) -> torch.Tensor: + _ = layer_id + batch_size, gen_len, _ = new_features.shape + num_replace = int(gen_len * transfer_ratio) + if num_replace == 0 or gen_len == 0: + return torch.empty( + (batch_size, 0), dtype=torch.long, device=new_features.device + ) + if cached_features is None or cached_features.shape[1] == 0: + return torch.empty( + (batch_size, 0), dtype=torch.long, device=new_features.device + ) + # pylint: disable=E1102 + cos_sim = torch.nn.functional.cosine_similarity( + new_features, cached_features, dim=-1 + ) + k_actual = min(num_replace, cos_sim.shape[1]) + if k_actual == 0: + return torch.empty( + (batch_size, 0), dtype=torch.long, device=new_features.device + ) + + transfer_index = torch.topk(cos_sim, largest=False, k=k_actual).indices + return transfer_index + + +def llada_cache_hook_feature( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[ + torch.Tensor + ] = None, # This is the original mask for the full layer input + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + cache_position: 
Optional[torch.LongTensor] = None, +) -> Tuple[ + torch.Tensor, + Optional[Tuple[torch.Tensor, torch.Tensor]], + Optional[Tuple[torch.Tensor, ...]], +]: + current_layer_idx = self.layer_idx + feature_cache = dLLMCache() + feature_cache.update_step(current_layer_idx) + _ = past_key_value + _ = cache_position + + prompt_length = feature_cache.prompt_length + # x_prompt and x_gen are sub-segments of hidden_states + x_prompt = hidden_states[:, :prompt_length, :] + x_gen = hidden_states[:, prompt_length:, :] + + refresh_gen = feature_cache.refresh_gen() + refresh_prompt = feature_cache.refresh_prompt() + transfer_ratio = feature_cache.transfer_ratio + + bs, _, dim = ( + hidden_states.shape + ) # seq_len is the length of hidden_states input to this layer + + transfer = 0 < transfer_ratio <= 1 + + index_from_attn_transfer = None + index_expanded_from_attn_transfer = None + + def project( + x_input: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + x_normed = self.input_layernorm(x_input) + q = self.self_attn.q_proj(x_normed) + k = self.self_attn.k_proj(x_normed) + v = self.self_attn.v_proj(x_normed) + return q, k, v + + # This function needs to be smarter about the attention_bias it passes. + # q_tensor_origin_slice: A tuple (start_idx, length) indicating where q_tensor comes from + # relative to the original hidden_states/attention_mask. 
+ def call_attention_on_qkv( + q_tensor, + k_tensor, + v_tensor, + original_full_attention_bias, + q_tensor_start_idx: int, # Start index of q_tensor within the original sequence + q_index: Optional[torch.Tensor] = None, + ): + q_len_for_this_call = q_tensor.shape[1] + k_len_for_this_call = k_tensor.shape[ + 1 + ] # This K is already the full K from dLLM cache + + # Slice the original_full_attention_bias + # original_full_attention_bias is likely + # (B, 1 or H, S_full_layer_input, K_full_from_dllm_or_max) + # We need (B, 1 or H, q_len_for_this_call, k_len_for_this_call) + sliced_bias = original_full_attention_bias + if original_full_attention_bias is not None: + # Slice query dimension based on q_tensor_start_idx and its length + # Slice key dimension to match k_tensor's length (which is k_full from dLLM) + sliced_bias = original_full_attention_bias[ + :, + :, + q_tensor_start_idx : q_tensor_start_idx + q_len_for_this_call, + :k_len_for_this_call, + ] + # Ensure num_heads dim is compatible + # (1 for broadcast, or matches self.self_attn.num_heads) + if ( + sliced_bias.shape[1] != 1 + and sliced_bias.shape[1] != self.self_attn.num_heads + ): + # This might indicate an issue with mask preparation upstream + # if it's not 1 or num_heads + # For now, assume it's (B,1,q,k) and will broadcast + # if num_heads > 1 in attention_hook + pass + + att_output, _ = self.self_attn.attention_forward_for_cache( + q_tensor, + k_tensor, + v_tensor, + attention_bias=sliced_bias, + layer_past=None, + use_cache=False, + q_index=q_index, + ) + return att_output + + def compute_mlp(input_to_mlp: torch.Tensor) -> torch.Tensor: + if input_to_mlp.shape[1] == 0: + return torch.empty_like(input_to_mlp) + x_norm = self.post_attention_layernorm(input_to_mlp) + gate_proj_out = self.mlp.gate_proj(x_norm) + up_proj_out = self.mlp.up_proj(x_norm) + act_out = self.mlp.act_fn(gate_proj_out) + x = act_out * up_proj_out + return self.mlp.down_proj(x) + + residual_pre_attn = hidden_states + + if 
refresh_gen and refresh_prompt: + q_full, k_full, v_full = project(hidden_states) + feature_cache.set_cache( + layer_id=current_layer_idx, + feature_name="kv_cache", + features={ + "k": k_full[:, :prompt_length, :], + "v": v_full[:, :prompt_length, :], + }, + cache_type="prompt", + ) + if hidden_states.shape[1] > prompt_length: + feature_cache.set_cache( + layer_id=current_layer_idx, + feature_name="kv_cache", + features={ + "k": k_full[:, prompt_length:, :], + "v": v_full[:, prompt_length:, :], + }, + cache_type="gen", + ) + + # Q is all of hidden_states, K,V also from all of hidden_states + # q_start_idx is 0 because q_full corresponds to the start of hidden_states + att = call_attention_on_qkv( + q_full, + k_full, + v_full, + attention_mask, + q_tensor_start_idx=0, + q_index=position_ids, + ) + feature_cache.set_cache( + layer_id=current_layer_idx, + feature_name="attn", + features=att[:, :prompt_length, :], + cache_type="prompt", + ) + if hidden_states.shape[1] > prompt_length: + feature_cache.set_cache( + layer_id=current_layer_idx, + feature_name="attn", + features=att[:, prompt_length:, :], + cache_type="gen", + ) + + elif refresh_gen and not refresh_prompt: + att_gen_part = torch.empty((bs, 0, dim), device=hidden_states.device) + if x_gen.shape[1] > 0: + q_gen, k_gen, v_gen = project(x_gen) + feature_cache.set_cache( + layer_id=current_layer_idx, + feature_name="kv_cache", + features={"k": k_gen, "v": v_gen}, + cache_type="gen", + ) + kv_cache_prompt = feature_cache.get_cache( + layer_id=current_layer_idx, feature_name="kv_cache", cache_type="prompt" + ) + k_prompt_val = kv_cache_prompt.get( + "k", torch.empty(bs, 0, dim, device=hidden_states.device) + ) + v_prompt_val = kv_cache_prompt.get( + "v", torch.empty(bs, 0, dim, device=hidden_states.device) + ) + + k_full_ctx = torch.cat([k_prompt_val, k_gen], dim=1) + v_full_ctx = torch.cat([v_prompt_val, v_gen], dim=1) + + q_gen_pos_ids = ( + position_ids[:, prompt_length:] + if position_ids is not None and 
position_ids.shape[1] > prompt_length + else None + ) + + # q_gen starts at prompt_length in the original hidden_states sequence + att_gen_part = call_attention_on_qkv( + q_gen, + k_full_ctx, + v_full_ctx, + attention_mask, + q_tensor_start_idx=prompt_length, + q_index=q_gen_pos_ids, + ) + + feature_cache.set_cache( + layer_id=current_layer_idx, + feature_name="attn", + features=att_gen_part, + cache_type="gen", + ) + + att_prompt_cache = feature_cache.get_cache( + layer_id=current_layer_idx, feature_name="attn", cache_type="prompt" + ) + att = torch.cat([att_prompt_cache, att_gen_part], dim=1) + + elif not refresh_gen and refresh_prompt: + q_prompt, k_prompt, v_prompt = project(x_prompt) + feature_cache.set_cache( + layer_id=current_layer_idx, + feature_name="kv_cache", + features={"k": k_prompt, "v": v_prompt}, + cache_type="prompt", + ) + kv_cache_gen = feature_cache.get_cache( + layer_id=current_layer_idx, feature_name="kv_cache", cache_type="gen" + ) + att_gen_cache = feature_cache.get_cache( + layer_id=current_layer_idx, feature_name="attn", cache_type="gen" + ) + + k_gen_current = kv_cache_gen.get( + "k", torch.empty(bs, 0, dim, device=hidden_states.device) + ) + v_gen_current = kv_cache_gen.get( + "v", torch.empty(bs, 0, dim, device=hidden_states.device) + ) + + q_for_attn_segments = [q_prompt] + q_idx_for_attn_segments = ( + [position_ids[:, :prompt_length]] if position_ids is not None else [None] + ) + + if transfer and x_gen.shape[1] > 0 and k_gen_current.shape[1] > 0: + _, _, v_gen_for_transfer = project(x_gen) + index_from_attn_transfer = refresh_index( + v_gen_for_transfer, v_gen_current, transfer_ratio, current_layer_idx + ) + + if index_from_attn_transfer.numel() > 0: + index_expanded_from_attn_transfer = index_from_attn_transfer.unsqueeze( + -1 + ).expand(-1, -1, dim) + + x_gen_normed_selected = torch.gather( + self.input_layernorm(x_gen), + dim=1, + index=index_expanded_from_attn_transfer, + ) + q_gen_index = 
self.self_attn.q_proj(x_gen_normed_selected) + k_gen_index = self.self_attn.k_proj(x_gen_normed_selected) + v_gen_index_part = self.self_attn.v_proj(x_gen_normed_selected) + + k_gen_current = k_gen_current.scatter( + dim=1, index=index_expanded_from_attn_transfer, src=k_gen_index + ) + v_gen_current = v_gen_current.scatter( + dim=1, index=index_expanded_from_attn_transfer, src=v_gen_index_part + ) + + feature_cache.set_cache( + layer_id=current_layer_idx, + feature_name="kv_cache", + features={"k": k_gen_current, "v": v_gen_current}, + cache_type="gen", + ) + + q_for_attn_segments.append(q_gen_index) + if position_ids is not None and position_ids.shape[1] > prompt_length: + gen_abs_positions_all = position_ids[:, prompt_length:] + gen_abs_positions_selected = torch.gather( + gen_abs_positions_all, 1, index_from_attn_transfer + ) + q_idx_for_attn_segments.append(gen_abs_positions_selected) + else: + q_idx_for_attn_segments.append(None) + + q_combined_for_attn = torch.cat(q_for_attn_segments, dim=1) + q_idx_combined_for_rope = ( + torch.cat(q_idx_for_attn_segments, dim=1) + if all(s is not None for s in q_idx_for_attn_segments) + else None + ) + + k_full_ctx = torch.cat([k_prompt, k_gen_current], dim=1) + v_full_ctx = torch.cat([v_prompt, v_gen_current], dim=1) + + # q_combined_for_attn effectively starts at index 0 of a conceptual sequence. + # Its RoPE is handled by q_idx_combined_for_rope. 
+ att_for_q_combined = call_attention_on_qkv( + q_combined_for_attn, + k_full_ctx, + v_full_ctx, + attention_mask, + q_tensor_start_idx=0, # Since Q is combined and starts from effective 0 + q_index=q_idx_combined_for_rope, + ) + + att_prompt_new = att_for_q_combined[:, : q_prompt.shape[1], :] + if ( + transfer + and index_from_attn_transfer is not None + and index_from_attn_transfer.numel() > 0 + ): + att_gen_index_new = att_for_q_combined[ + :, q_prompt.shape[1] :, : + ] # Segment for transferred Qs + if att_gen_cache.shape[1] > 0: + att_gen_cache = att_gen_cache.scatter( + dim=1, + index=index_expanded_from_attn_transfer, + src=att_gen_index_new, + ) + feature_cache.set_cache( + layer_id=current_layer_idx, + feature_name="attn", + features=att_gen_cache, + cache_type="gen", + ) + + feature_cache.set_cache( + layer_id=current_layer_idx, + feature_name="attn", + features=att_prompt_new, + cache_type="prompt", + ) + att = torch.cat([att_prompt_new, att_gen_cache], dim=1) + + else: # Not refresh gen, not refresh prompt + att_prompt_cache = feature_cache.get_cache( + layer_id=current_layer_idx, feature_name="attn", cache_type="prompt" + ) + att_gen_cache = feature_cache.get_cache( + layer_id=current_layer_idx, feature_name="attn", cache_type="gen" + ) + kv_cache_gen = feature_cache.get_cache( + layer_id=current_layer_idx, feature_name="kv_cache", cache_type="gen" + ) + kv_cache_prompt = feature_cache.get_cache( + layer_id=current_layer_idx, feature_name="kv_cache", cache_type="prompt" + ) + + k_gen_current = kv_cache_gen.get( + "k", torch.empty(bs, 0, dim, device=hidden_states.device) + ) + v_gen_current = kv_cache_gen.get( + "v", torch.empty(bs, 0, dim, device=hidden_states.device) + ) + k_prompt_val = kv_cache_prompt.get( + "k", torch.empty(bs, 0, dim, device=hidden_states.device) + ) + v_prompt_val = kv_cache_prompt.get( + "v", torch.empty(bs, 0, dim, device=hidden_states.device) + ) + + if transfer and x_gen.shape[1] > 0 and k_gen_current.shape[1] > 0: + 
x_gen_normed = self.input_layernorm(x_gen) + v_gen_for_transfer = self.self_attn.v_proj(x_gen_normed) + index_from_attn_transfer = refresh_index( + v_gen_for_transfer, v_gen_current, transfer_ratio, current_layer_idx + ) + + if index_from_attn_transfer.numel() > 0: + index_expanded_from_attn_transfer = index_from_attn_transfer.unsqueeze( + -1 + ).expand(-1, -1, dim) + + x_gen_normed_selected = torch.gather( + x_gen_normed, dim=1, index=index_expanded_from_attn_transfer + ) + q_gen_index_only = self.self_attn.q_proj( + x_gen_normed_selected + ) # Q only for transferred items + k_gen_index = self.self_attn.k_proj(x_gen_normed_selected) + v_gen_index_part = self.self_attn.v_proj(x_gen_normed_selected) + + k_gen_current = k_gen_current.scatter( + dim=1, index=index_expanded_from_attn_transfer, src=k_gen_index + ) + v_gen_current = v_gen_current.scatter( + dim=1, index=index_expanded_from_attn_transfer, src=v_gen_index_part + ) + + feature_cache.set_cache( + layer_id=current_layer_idx, + feature_name="kv_cache", + features={"k": k_gen_current, "v": v_gen_current}, + cache_type="gen", + ) + + q_idx_for_transferred_rope = None + if position_ids is not None and position_ids.shape[1] > prompt_length: + gen_abs_positions_all = position_ids[:, prompt_length:] + q_idx_for_transferred_rope = torch.gather( + gen_abs_positions_all, 1, index_from_attn_transfer + ) + + k_full_ctx = torch.cat([k_prompt_val, k_gen_current], dim=1) + v_full_ctx = torch.cat([v_prompt_val, v_gen_current], dim=1) + + att_gen_index_new = call_attention_on_qkv( + q_gen_index_only, + k_full_ctx, + v_full_ctx, + attention_mask, + q_tensor_start_idx=prompt_length, # Approximate for mask slicing + q_index=q_idx_for_transferred_rope, + ) + + if att_gen_cache.shape[1] > 0: # Make sure att_gen_cache has a gen part + att_gen_cache = att_gen_cache.scatter( + dim=1, + index=index_expanded_from_attn_transfer, + src=att_gen_index_new, + ) + elif ( + x_gen.shape[1] > 0 + ): # If original att_gen_cache was for an empty 
gen part, but x_gen is not empty + # Initialize att_gen_cache to be of x_gen's length before scattering + att_gen_cache = torch.zeros( + (bs, x_gen.shape[1], dim), + device=hidden_states.device, + dtype=att_gen_index_new.dtype, + ) + att_gen_cache = att_gen_cache.scatter( + dim=1, + index=index_expanded_from_attn_transfer, + src=att_gen_index_new, + ) + # Else: if x_gen.shape[1] is 0, att_gen_cache remains empty, scatter won't happen. + + feature_cache.set_cache( + layer_id=current_layer_idx, + feature_name="attn", + features=att_gen_cache, + cache_type="gen", + ) + + att = torch.cat([att_prompt_cache, att_gen_cache], dim=1) + + # ... rest of the llada_cache_hook_feature (MLP part) remains the same ... + hidden_states_after_attn = residual_pre_attn + att + residual_pre_mlp = hidden_states_after_attn + + x_prompt_mlp = hidden_states_after_attn[:, :prompt_length, :] + x_gen_mlp = hidden_states_after_attn[:, prompt_length:, :] + + mlp_out_prompt_part = torch.empty( + (bs, prompt_length, dim), + device=hidden_states.device, + dtype=hidden_states_after_attn.dtype, + ) + mlp_out_gen_part = torch.empty( + (bs, x_gen_mlp.shape[1], dim), + device=hidden_states.device, + dtype=hidden_states_after_attn.dtype, + ) + + if refresh_gen and refresh_prompt: + mlp_out_full = compute_mlp(hidden_states_after_attn) + mlp_out_prompt_part = mlp_out_full[:, :prompt_length, :] + if x_gen_mlp.shape[1] > 0: + mlp_out_gen_part = mlp_out_full[:, prompt_length:, :] + + feature_cache.set_cache( + current_layer_idx, "mlp", mlp_out_prompt_part, cache_type="prompt" + ) + if x_gen_mlp.shape[1] > 0: + feature_cache.set_cache( + current_layer_idx, "mlp", mlp_out_gen_part, cache_type="gen" + ) + if mlp_out_gen_part.shape[1] > 0: + mlp_out = torch.cat([mlp_out_prompt_part, mlp_out_gen_part], dim=1) + else: + mlp_out = mlp_out_prompt_part + + elif refresh_gen and not refresh_prompt: + mlp_out_prompt_part = feature_cache.get_cache( + current_layer_idx, "mlp", cache_type="prompt" + ) + if 
x_gen_mlp.shape[1] > 0: + mlp_out_gen_part = compute_mlp(x_gen_mlp) + feature_cache.set_cache( + current_layer_idx, "mlp", mlp_out_gen_part, cache_type="gen" + ) + + if mlp_out_gen_part.shape[1] > 0: + mlp_out = torch.cat([mlp_out_prompt_part, mlp_out_gen_part], dim=1) + else: + mlp_out = mlp_out_prompt_part + + elif refresh_prompt and not refresh_gen: + mlp_gen_cache_data = feature_cache.get_cache( + current_layer_idx, "mlp", cache_type="gen" + ) + if x_gen_mlp.shape[1] > 0: + mlp_out_gen_part = mlp_gen_cache_data + + mlp_input_for_prompt_path = x_prompt_mlp + # Use index_expanded_from_attn_transfer which was set in the attention block + if ( + transfer + and index_expanded_from_attn_transfer is not None + and index_expanded_from_attn_transfer.numel() > 0 + and x_gen_mlp.shape[1] > 0 + ): + x_gen_mlp_selected = torch.gather( + x_gen_mlp, dim=1, index=index_expanded_from_attn_transfer + ) + mlp_input_for_prompt_path = torch.cat( + [x_prompt_mlp, x_gen_mlp_selected], dim=1 + ) + + mlp_out_prompt_path_processed = compute_mlp(mlp_input_for_prompt_path) + mlp_out_prompt_part = mlp_out_prompt_path_processed[ + :, : x_prompt_mlp.shape[1], : + ] + + if ( + transfer + and index_expanded_from_attn_transfer is not None + and index_expanded_from_attn_transfer.numel() > 0 + and x_gen_mlp.shape[1] > 0 + ): + mlp_gen_index_new = mlp_out_prompt_path_processed[ + :, x_prompt_mlp.shape[1] :, : + ] + if mlp_out_gen_part.shape[1] > 0: # If gen part exists + mlp_out_gen_part = mlp_out_gen_part.scatter( + dim=1, + index=index_expanded_from_attn_transfer, + src=mlp_gen_index_new, + ) + + feature_cache.set_cache( + current_layer_idx, "mlp", mlp_out_gen_part, cache_type="gen" + ) + + feature_cache.set_cache( + current_layer_idx, "mlp", mlp_out_prompt_part, cache_type="prompt" + ) + if mlp_out_gen_part.shape[1] > 0: + mlp_out = torch.cat([mlp_out_prompt_part, mlp_out_gen_part], dim=1) + else: + mlp_out = mlp_out_prompt_part + + else: + mlp_out_prompt_part = feature_cache.get_cache( + 
current_layer_idx, "mlp", cache_type="prompt" + ) + mlp_gen_cache_data = feature_cache.get_cache( + current_layer_idx, "mlp", cache_type="gen" + ) + if x_gen_mlp.shape[1] > 0: + mlp_out_gen_part = mlp_gen_cache_data + + # Use index_expanded_from_attn_transfer + if ( + transfer + and index_expanded_from_attn_transfer is not None + and index_expanded_from_attn_transfer.numel() > 0 + and x_gen_mlp.shape[1] > 0 + ): + x_gen_mlp_selected = torch.gather( + x_gen_mlp, dim=1, index=index_expanded_from_attn_transfer + ) + mlp_out_gen_index = compute_mlp(x_gen_mlp_selected) + if mlp_out_gen_part.shape[1] > 0: + mlp_out_gen_part = mlp_out_gen_part.scatter( + dim=1, + index=index_expanded_from_attn_transfer, + src=mlp_out_gen_index, + ) + + feature_cache.set_cache( + current_layer_idx, "mlp", mlp_out_gen_part, cache_type="gen" + ) + + if mlp_out_gen_part.shape[1] > 0: + mlp_out = torch.cat([mlp_out_prompt_part, mlp_out_gen_part], dim=1) + else: + mlp_out = mlp_out_prompt_part + + final_hidden_states = residual_pre_mlp + mlp_out + + returned_outputs = (final_hidden_states,) + if output_attentions: + returned_outputs += (None,) + if use_cache: + returned_outputs += (None,) + + return returned_outputs diff --git a/train/llava/hooks/fast_dllm_hook.py b/train/llava/hooks/fast_dllm_hook.py new file mode 100644 index 0000000..c4d9dbe --- /dev/null +++ b/train/llava/hooks/fast_dllm_hook.py @@ -0,0 +1,702 @@ +# pylint: disable=C0114,C0115,C0116,C0103,R0913,R0917,R0914,R0912,R0915,E1102 +""" +Fast dLLM Cache Hook for LLaDA Model + +This module contains hooks and utilities for implementing fast distributed LLM caching +in the LLaDA model architecture. +""" + +from typing import List, Optional, Sequence, Tuple + +import numpy as np +import torch +import torch.nn.functional as F + +from .cache_hook_LLaDA_V import repeat_kv + + +class FastDLLMGenerationHook: + """ + Hook class for implementing fast dLLM caching functionality in LLaDA model. 
+ This class handles both attention-level caching and generation-level optimizations. + """ + + def __init__(self, model): + self.model = model + self.original_methods = {} + self.is_registered = False + + def register_hooks(self): + """Register fast dLLM hooks to the model.""" + if self.is_registered: + return + + # Store original methods + # 新增:保存原始的generate方法 + self.original_methods["generate"] = self.model.generate + + # Store original attention forwards + for layer_idx, layer in enumerate(self.model.model.layers): + self.original_methods[f"attention_{layer_idx}"] = layer.self_attn.forward + # Replace attention forward with fast cache version + layer.self_attn.forward = self._create_fast_attention_forward( + layer.self_attn, layer_idx + ) + + self.model.generate = self._fast_generate # 新增:替换generate方法 + + self.is_registered = True + + def unregister_hooks(self): + """Unregister fast dLLM hooks from the model.""" + if not self.is_registered: + return + + # Restore original methods + # 新增:恢复原始的generate方法 + self.model.generate = self.original_methods["generate"] + + # Restore original attention forwards + for layer_idx, layer in enumerate(self.model.model.layers): + layer.self_attn.forward = self.original_methods[f"attention_{layer_idx}"] + + self.is_registered = False + + def _create_fast_attention_forward(self, attention_layer, layer_idx): + """Create fast attention forward method with caching support.""" + + def fast_attention_forward( + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + fast_dllm_cache: Optional[ + Sequence[Tuple[torch.Tensor, torch.Tensor]] + ] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # If not using fast cache or output_attentions is 
needed, use original method + if output_attentions: + return self.original_methods[f"attention_{layer_idx}"]( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = attention_layer.q_proj(hidden_states) + key_states = attention_layer.k_proj(hidden_states) + value_states = attention_layer.v_proj(hidden_states) + + query_states = query_states.view( + bsz, q_len, attention_layer.num_heads, attention_layer.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, + q_len, + attention_layer.num_key_value_heads, + attention_layer.head_dim, + ).transpose(1, 2) + value_states = value_states.view( + bsz, + q_len, + attention_layer.num_key_value_heads, + attention_layer.head_dim, + ).transpose(1, 2) + + # Apply rotary position embedding with fast cache consideration + cache_offset = 0 + if fast_dllm_cache and len(fast_dllm_cache) > layer_idx: + cache_offset = fast_dllm_cache[layer_idx][0].shape[-2] + + cos, sin = attention_layer.rotary_emb( + value_states, + position_ids + cache_offset if cache_offset > 0 else position_ids, + ) + query_states, key_states = self._apply_rotary_pos_emb( + query_states, key_states, cos, sin + ) + + # Handle past key values + past_key_value = getattr(attention_layer, "past_key_value", past_key_value) + if past_key_value is not None: + cache_kwargs = { + "sin": sin, + "cos": cos, + "cache_position": cache_position, + } + key_states, value_states = past_key_value.update( + key_states, value_states, layer_idx, cache_kwargs + ) + + # Fast dLLM cache logic + if fast_dllm_cache is not None: + if len(fast_dllm_cache) <= layer_idx: + fast_dllm_cache.append((key_states, value_states)) + else: + past_key, past_value = fast_dllm_cache[layer_idx] + key_states = torch.cat([past_key, key_states], dim=-2) + 
value_states = torch.cat([past_value, value_states], dim=-2) + + # Repeat key-value pairs for multi-head attention + key_states = repeat_kv(key_states, attention_layer.num_key_value_groups) + value_states = repeat_kv(value_states, attention_layer.num_key_value_groups) + + # Apply causal mask + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # Ensure contiguous tensors for CUDA + if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # Scaled dot-product attention + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=None, + is_causal=False, + dropout_p=attention_layer.attention_dropout + if attention_layer.training + else 0.0, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, attention_layer.hidden_size) + attn_output = attention_layer.o_proj(attn_output) + + return attn_output, None, past_key_value + + return fast_attention_forward + + @torch.no_grad() + def _fast_generate( + self, + inputs: Optional[torch.Tensor] = None, + images: Optional[torch.Tensor] = None, + image_sizes: Optional[torch.Tensor] = None, + modalities: Optional[List[str]] = ["image"], + **kwargs, + ): + modalities = ( + kwargs.pop("modalities", None) + if "modalities" in kwargs and modalities is None + else modalities + ) + position_ids = kwargs.pop("position_ids", None) + attention_mask = kwargs.pop("attention_mask", None) + if "inputs_embeds" in kwargs: + raise NotImplementedError("`inputs_embeds` is not supported") + + if images is not None: + (inputs, position_ids, attention_mask, _, inputs_embeds, _) = ( + self.model.prepare_inputs_labels_for_multimodal( + inputs, + position_ids, + attention_mask, + None, + None, + images, + modalities, + 
image_sizes=image_sizes, + ) + ) + else: + inputs_embeds = self.model.get_model().embed_tokens(inputs) + output = self._fast_generate_with_embeds(inputs_embeds=inputs_embeds, **kwargs) + return output + + def reverse_causal_x0_p_comma(self, x0, logits, block_start, block_end): + batch_size, seq_len = x0.shape + p = F.softmax(logits.to(torch.float64), dim=-1) + x0_p = torch.squeeze(torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1) + if block_start > block_end: + block_start = 0 + assert seq_len == block_end + # 计算 block 内的长度 + block_len = block_end - block_start + assert block_len == 24, "block len is not 24! is that true?" + even_indices = torch.tensor( + [block_start + i for i in range(0, block_len, 2)], device=x0.device + ).repeat(batch_size, 1) + num_even = even_indices.shape[1] + decreasing_values = torch.linspace( + 0.1, 0.99, steps=num_even, device=x0.device, dtype=x0_p.dtype + ).repeat(batch_size, 1) + x0_p = x0_p.scatter_(1, even_indices, decreasing_values) + return x0_p + + @torch.no_grad() + def _fast_generate_with_embeds( + self, + inputs_embeds, + steps=128, + gen_length=128, + block_length=128, + temperature=0.0, + cfg_scale=0.0, + remasking="low_confidence", + mask_id=126336, + tokenizer=None, + stopping_criteria=None, + generation_suffix=None, + threshold=None, + prefix_refresh_interval=32, + **kwargs, + ): + """ + Fast generation with embeddings using dLLM cache optimization. + This method incorporates all fast dLLM related optimizations. 
+ """ + _ = kwargs # Unused + # Use mixed precision for faster computation + with torch.autocast(enabled=True, device_type="cuda", dtype=torch.bfloat16): + # Handle generation suffix + suffix_embeds = None + suffix_token_ids = None + suffix_len = 0 + if ( + generation_suffix is not None + and tokenizer is not None + and len(generation_suffix) > 0 + ): + suffix_token_ids = tokenizer.encode( + generation_suffix, add_special_tokens=False + ) + suffix_token_ids = torch.tensor( + suffix_token_ids, dtype=torch.long, device=inputs_embeds.device + ).unsqueeze(0) + suffix_embeds = self.model.model.embed_tokens(suffix_token_ids) + suffix_len = suffix_embeds.shape[1] + + # Create input in embedding space + total_length = inputs_embeds.shape[1] + gen_length + suffix_len + masked_embed = self.model.model.embed_tokens( + torch.tensor([mask_id]).to(inputs_embeds.device) + ) + x_embeds = masked_embed.repeat(1, total_length, 1).to(inputs_embeds.device) + x_embeds[:, : inputs_embeds.shape[1]] = inputs_embeds.clone() + if suffix_embeds is not None: + x_embeds[:, -suffix_len:] = suffix_embeds + + # Create tracking tensor for token IDs + x = torch.full( + (1, total_length), + mask_id, + dtype=torch.long, + device=inputs_embeds.device, + ) + track_x = [] # record all step of x + if suffix_token_ids is not None: + x[:, -suffix_len:] = suffix_token_ids + + # Prompt index tracking + prompt_index = torch.zeros( + (1, total_length), dtype=torch.bool, device=inputs_embeds.device + ) + prompt_index[:, : inputs_embeds.shape[1]] = 1 + + assert gen_length % block_length == 0 + num_blocks = gen_length // block_length + assert steps % num_blocks == 0 + steps = steps // num_blocks + + # Initialize stop tracking + stop_position = inputs_embeds.shape[1] + gen_length + found_stop_seq = False + stop_tokens = [] + + if stopping_criteria is not None: + assert tokenizer is not None, ( + "tokenizer is required when stopping_criteria is not None" + ) + for stop_str in stopping_criteria: + tokens = 
tokenizer.encode(stop_str, add_special_tokens=False) + stop_tokens.append(tokens) + + # Process each block + for num_block in range(num_blocks): + block_start = inputs_embeds.shape[1] + num_block * block_length + block_end = inputs_embeds.shape[1] + (num_block + 1) * block_length + + # Skip if stop found before current block + if found_stop_seq and stop_position <= block_start: + break + + block_embeds = x_embeds[:, block_start:block_end] + block_mask_index = torch.all( + torch.abs(block_embeds - masked_embed) < 1e-5, dim=2 + ) + num_transfer_tokens = self._get_num_transfer_tokens( + block_mask_index, steps + ) + + i = 0 + + while True: + if threshold is None and i >= steps: + break + + # Check mask state + mask_index = torch.all( + torch.abs(x_embeds - masked_embed) < 1e-5, dim=2 + ) + + if found_stop_seq: + pre_stop_masks = mask_index[ + 0, inputs_embeds.shape[1] : stop_position + ] + if not pre_stop_masks.any(): + break + + current_block_masks = mask_index[0, block_start:block_end] + if not current_block_masks.any(): + break + + # Handle CFG + if cfg_scale > 0.0: + un_embeds = x_embeds.clone() + un_mask = prompt_index.unsqueeze(-1).expand_as(x_embeds) + un_embeds[un_mask] = masked_embed.repeat( + x_embeds.shape[0], x_embeds.shape[1], 1 + )[un_mask] + combined_embeds = torch.cat([x_embeds, un_embeds], dim=0) + + if i % prefix_refresh_interval == 0: + fast_dllm_cache = [] + outputs = self.model.model( + inputs_embeds=combined_embeds, + fast_dllm_cache=fast_dllm_cache, + ) + fast_dllm_cache = self._create_cache_slice( + fast_dllm_cache, block_start + ) + else: + outputs = self.model.model( + inputs_embeds=combined_embeds[:, block_start:], + fast_dllm_cache=fast_dllm_cache, + ) + + logits = self.model.lm_head(outputs[0]).float() + logits, un_logits = torch.chunk(logits, 2, dim=0) + logits = un_logits + (cfg_scale + 1) * (logits - un_logits) + else: + if i % prefix_refresh_interval == 0: + fast_dllm_cache = [] + outputs = self.model.model( + inputs_embeds=x_embeds, 
fast_dllm_cache=fast_dllm_cache + ) + # Slice cache to block start + fast_dllm_cache = self._create_cache_slice( + fast_dllm_cache, block_start + ) + else: + # Incremental forward pass + outputs = self.model.model( + inputs_embeds=x_embeds[:, block_start:], + fast_dllm_cache=fast_dllm_cache, + ) + logits = self.model.lm_head(outputs[0]).float() + + # Filter forbidden tokens + forbidden_tokens = [126081, 126080, 126346, 126347] + if i % prefix_refresh_interval == 0: + for token_id in forbidden_tokens: + logits[:, :, token_id] = torch.where( + mask_index, -float("inf"), logits[:, :, token_id] + ) + else: + for token_id in forbidden_tokens: + logits[:, :, token_id] = torch.where( + mask_index[:, block_start:], + -float("inf"), + logits[:, :, token_id], + ) + + # Get transfer indices and update + if i % prefix_refresh_interval == 0: + x0, transfer_index = self._get_transfer_index( + logits, + temperature, + remasking, + mask_index, + x, + num_transfer_tokens[:, i] if threshold is None else None, + found_stop_seq, + stop_position, + block_end, + suffix_len, + block_start, + threshold, + ) + x0_embeds = self.model.model.embed_tokens(x0) + x0_embeds = torch.where( + mask_index.unsqueeze(-1).expand_as(x_embeds), + x0_embeds, + x_embeds, + ) + x_embeds[transfer_index] = x0_embeds[transfer_index] + x[transfer_index] = x0[transfer_index] + else: + x0, transfer_index = self._get_transfer_index( + logits, + temperature, + remasking, + mask_index[:, block_start:], + x[:, block_start:], + num_transfer_tokens[:, i] if threshold is None else None, + found_stop_seq, + stop_position - block_start, + block_end - block_start, + suffix_len, + block_start, + threshold, + ) + x0_embeds = self.model.model.embed_tokens(x0) + x0_embeds = torch.where( + mask_index[:, block_start:] + .unsqueeze(-1) + .expand_as(x_embeds[:, block_start:]), + x0_embeds, + x_embeds[:, block_start:], + ) + x_embeds[:, block_start:][transfer_index] = x0_embeds[ + transfer_index + ] + x[:, 
block_start:][transfer_index] = x0[transfer_index] + + track_x.append( + x.clone()[ + :, + inputs_embeds.shape[1] : inputs_embeds.shape[1] + + gen_length, + ] + ) + + # Check for stop words + if stopping_criteria is not None: + generated_part = x[ + 0, + inputs_embeds.shape[1] : inputs_embeds.shape[1] + + gen_length, + ] + for stop_seq in stop_tokens: + if not isinstance(stop_seq, list): + stop_seq = [stop_seq] + for start_idx in range( + generated_part.size(0) - len(stop_seq) + 1 + ): + if torch.all( + generated_part[ + start_idx : start_idx + len(stop_seq) + ] + == torch.tensor(stop_seq, device=x.device) + ): + current_position = ( + inputs_embeds.shape[1] + start_idx + ) + if ( + not found_stop_seq + or current_position < stop_position + ): + stop_position = current_position + found_stop_seq = True + break + if found_stop_seq: + break + i += 1 + + if threshold is not None: + print(f"Number of steps: {i}") + + # Return results + if found_stop_seq: + if suffix_len > 0: + return torch.cat( + [ + x[:, inputs_embeds.shape[1] : stop_position], + x[:, -suffix_len:], + ], + dim=1, + ) + return x[:, inputs_embeds.shape[1] : stop_position], track_x + if suffix_len > 0: + return torch.cat( + [ + x[ + :, + inputs_embeds.shape[1] : inputs_embeds.shape[1] + + gen_length, + ], + x[:, -suffix_len:], + ], + dim=1, + ) + return x[ + :, inputs_embeds.shape[1] : inputs_embeds.shape[1] + gen_length + ], track_x + + @staticmethod + def _apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Apply rotary position embedding to query and key tensors.""" + _ = position_ids # Unused in this implementation + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (FastDLLMGenerationHook._rotate_half(q) * sin) + k_embed = (k * cos) + (FastDLLMGenerationHook._rotate_half(k) * sin) + return q_embed, k_embed + + @staticmethod + def _rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 
2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def _get_transfer_index( + self, + logits, + temperature, + remasking, + mask_index, + x, + num_transfer_tokens, + found_stop_seq, + stop_position, + block_end, + suffix_len, + block_start, + threshold=None, + ): + """Get transfer indices for token updates during generation.""" + logits_with_noise = self._add_gumbel_noise(logits, temperature=temperature) + x0 = torch.argmax(logits_with_noise, dim=-1) + + if remasking == "low_confidence": + p = F.softmax(logits.to(torch.float64), dim=-1) + x0_p = torch.squeeze( + torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1 + ) + elif remasking == "reverse_causal_comma": + x0_p = self.reverse_causal_x0_p_comma(x0, logits, block_start, block_end) + else: + raise NotImplementedError(remasking) + + # Handle stop sequences and block boundaries + if found_stop_seq: + x0_p[:, stop_position:] = -np.inf + else: + x0_p[:, block_end:] = -np.inf + + # Prevent overwriting suffix + if suffix_len > 0: + x0_p[:, -suffix_len:] = -np.inf + + x0 = torch.where(mask_index, x0, x) + confidence = torch.where(mask_index, x0_p, -np.inf) + + transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device) + + if threshold is not None: + num_transfer_tokens = mask_index.sum(dim=1, keepdim=True) + + for j in range(confidence.shape[0]): + if threshold is None: + top_i = num_transfer_tokens[j] + else: + ns = list(range(1, num_transfer_tokens[j] + 1)) + es = [threshold / (n + 1) for n in ns] + threshs = [1 - e for e in es] + threshs[0] = -1 # at least one token is transferred + + sorted_confidence = torch.sort( + confidence[j][mask_index[j]], dim=-1, descending=True + )[0] + assert len(sorted_confidence) == len(threshs) + + top_i = 0 + for top_i, _ in enumerate(threshs): + if sorted_confidence[top_i] < threshs[top_i]: + break + + if top_i in (0, len(threshs) - 1): + top_i += 1 + + _, select_index = torch.topk(confidence[j], k=top_i) + transfer_index[j, 
select_index] = True + + return x0, transfer_index + + @staticmethod + def _add_gumbel_noise(logits, temperature): + """Add Gumbel noise for categorical sampling.""" + if temperature == 0: + return logits + + logits = logits.to(torch.float64) + noise = torch.rand_like(logits, dtype=torch.float64) + gumbel_noise = (-torch.log(noise)) ** temperature + return logits.exp() / gumbel_noise + + @staticmethod + def _get_num_transfer_tokens(mask_index, steps): + """Precompute the number of tokens to transition at each step.""" + mask_num = mask_index.sum(dim=1, keepdim=True) + base = mask_num // steps + remainder = mask_num % steps + + num_transfer_tokens = base.expand(-1, steps).clone() + + if remainder.sum() > 0: + indices = torch.arange(steps, device=mask_index.device) + mask = indices.unsqueeze(0) < remainder + num_transfer_tokens[mask] += 1 + + return num_transfer_tokens.to(torch.int64) + + def _create_cache_slice(self, fast_dllm_cache, block_start): + """Create a sliced version of fast_dllm_cache for block processing.""" + new_past_key_values = [] + for i, cache_i in enumerate(fast_dllm_cache): + new_past_key_values.append([]) + for _, cache_j in enumerate(cache_i): + new_past_key_values[i].append(cache_j[:, :, :block_start]) + return new_past_key_values + + +def register_fast_dllm_hook(model): + """ + Register fast dLLM cache hooks to the model. + + Args: + model: The LLaDA model to register hooks to + + Returns: + FastDLLMGenerationHook: The hook instance for management + """ + hook = FastDLLMGenerationHook(model) + hook.register_hooks() + return hook + + +def unregister_fast_dllm_hook(hook): + """ + Unregister fast dLLM cache hooks from the model. 
+ + Args: + hook: The FastDLLMGenerationHook instance to unregister + """ + hook.unregister_hooks() diff --git a/train/llava/mm_utils.py b/train/llava/mm_utils.py new file mode 100644 index 0000000..52b4503 --- /dev/null +++ b/train/llava/mm_utils.py @@ -0,0 +1,396 @@ +from PIL import Image +from io import BytesIO +import base64 +import math +import ast +import re +import torch +from transformers import StoppingCriteria +from llava.constants import IMAGE_TOKEN_INDEX +from llava.utils import rank0_print, process_video_with_pyav, process_video_with_decord + + +def resize_and_center_crop(image, shortest_edge_length): + # Calculate new dimensions and resize + aspect_ratio = float(image.width) / float(image.height) + if aspect_ratio > 1: + new_width = int(shortest_edge_length * aspect_ratio) + new_height = shortest_edge_length + else: + new_width = shortest_edge_length + new_height = int(shortest_edge_length / aspect_ratio) + resized_image = image.resize((new_width, new_height), Image.ANTIALIAS) + + # Calculate the position and perform the center crop + left = (new_width - shortest_edge_length) / 2 + top = (new_height - shortest_edge_length) / 2 + right = (new_width + shortest_edge_length) / 2 + bottom = (new_height + shortest_edge_length) / 2 + cropped_image = resized_image.crop((left, top, right, bottom)) + + return cropped_image + + +def auto_pad_images(image, grid_params): + assert isinstance(image, Image.Image), "Input should be a Pillow Image" + assert len(grid_params) > 0, "Grid parameters should not be empty" + + # Step 1: Calculate and find the closest aspect ratio + input_width, input_height = image.size + input_aspect_ratio = input_width / input_height + candidate_resolutions = [(w / h, w, h) for w in grid_params for h in grid_params] + closest_aspect_ratio = min(candidate_resolutions, key=lambda x: abs(input_aspect_ratio - x[0])) + + candidate_resolutions = [(x[1], x[2]) for x in candidate_resolutions if abs(x[0] - closest_aspect_ratio[0]) < 1e-3] + + 
target_resolution = min(candidate_resolutions, key=lambda res: abs(max(input_width, input_height) / max(res) - 1)) + + resize_width, resize_height = target_resolution + if input_width > input_height: + resize_height = int(resize_width / input_aspect_ratio) + else: + resize_width = int(resize_height * input_aspect_ratio) + resized_image = image.resize((resize_width, resize_height), Image.ANTIALIAS) + + # Step 5: Pad the resized image if necessary to match the target resolution + pad_width = target_resolution[0] - resize_width + pad_height = target_resolution[1] - resize_height + padded_image = Image.new("RGB", target_resolution, color=(0, 0, 0)) + padded_image.paste(resized_image, (pad_width // 2, pad_height // 2)) + + return padded_image + + +def extract_patches(image, patch_size, overlap_ratio): + assert isinstance(image, Image.Image), "Input should be a Pillow Image" + assert patch_size > 0, "Patch size should be greater than 0" + assert 0 <= overlap_ratio < 1, "Overlap ratio should be between 0 and 1" + + W, H = image.size + patches = [] + + stride = int(patch_size * (1 - overlap_ratio)) + + num_patches_y = (H - patch_size) // stride + 1 + num_patches_x = (W - patch_size) // stride + 1 + + y_start = (H - (num_patches_y - 1) * stride - patch_size) // 2 + x_start = (W - (num_patches_x - 1) * stride - patch_size) // 2 + + for y in range(y_start, y_start + num_patches_y * stride, stride): + for x in range(x_start, x_start + num_patches_x * stride, stride): + patch = image.crop((x, y, x + patch_size, y + patch_size)) + patches.append(patch) + + return patches + + +def process_highres_image_crop_split(image, data_args, processor=None): + crop_resolution = data_args.image_crop_resolution + split_resolution = data_args.image_split_resolution + if processor is None: + processor = data_args.image_processor + image_crop = resize_and_center_crop(image, crop_resolution) + image_patches = extract_patches(image_crop, patch_size=split_resolution, overlap_ratio=0) + 
image_patches = [processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0] for image_patch in image_patches] + return torch.stack(image_patches, dim=0) + + +def process_highres_image(image, processor, grid_pinpoints): + grid_params = [int(x) for x in grid_pinpoints.split(",")] + width_height = max(image.size) + fit_grid_params = [x for x in grid_params if x >= width_height] + if len(fit_grid_params) == 0: + select_size = max(grid_params) + else: + select_size = min(fit_grid_params) + # FIXME: always select the 448 + select_size = max(grid_params) + image_padded = expand2square(image, tuple(int(x * 255) for x in processor.image_mean)) + + # FIXME: this seems to be a bug that it always resizes instead of padding + image_original_resize = image.resize((processor.size["shortest_edge"], processor.size["shortest_edge"])) + image_padded = image_padded.resize((select_size, select_size)) + image_patches = extract_patches(image_padded, patch_size=processor.size["shortest_edge"], overlap_ratio=0) + image_patches = [image_original_resize] + image_patches + image_patches = [processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0] for image_patch in image_patches] + return torch.stack(image_patches, dim=0) + + +def select_best_resolution(original_size, possible_resolutions): + """ + Selects the best resolution from a list of possible resolutions based on the original size. + + Args: + original_size (tuple): The original size of the image in the format (width, height). + possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + + Returns: + tuple: The best fit resolution in the format (width, height). 
+ """ + original_width, original_height = original_size + best_fit = None + max_effective_resolution = 0 + min_wasted_resolution = float("inf") + + for width, height in possible_resolutions: + # Calculate the downscaled size to keep the aspect ratio + scale = min(width / original_width, height / original_height) + downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale) + + # Calculate effective and wasted resolutions + effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height) + wasted_resolution = (width * height) - effective_resolution + + if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution): + max_effective_resolution = effective_resolution + min_wasted_resolution = wasted_resolution + best_fit = (width, height) + + return best_fit + + +def resize_and_pad_image(image, target_resolution): + """ + Resize and pad an image to a target resolution while maintaining aspect ratio. + + Args: + image (PIL.Image.Image): The input image. + target_resolution (tuple): The target resolution (width, height) of the image. + + Returns: + PIL.Image.Image: The resized and padded image. 
+ """ + original_width, original_height = image.size + target_width, target_height = target_resolution + + # Determine which dimension (width or height) to fill + scale_w = target_width / original_width + scale_h = target_height / original_height + + if scale_w < scale_h: + # Width will be filled completely + new_width = target_width + new_height = min(math.ceil(original_height * scale_w), target_height) + else: + # Height will be filled completely + new_height = target_height + new_width = min(math.ceil(original_width * scale_h), target_width) + + # Resize the image + resized_image = image.resize((new_width, new_height)) + + # Create a new image with the target size and paste the resized image onto it + new_image = Image.new("RGB", (target_width, target_height), (0, 0, 0)) + paste_x = (target_width - new_width) // 2 + paste_y = (target_height - new_height) // 2 + new_image.paste(resized_image, (paste_x, paste_y)) + + return new_image + + +def divide_to_patches(image, patch_size): + """ + Divides an image into patches of a specified size. + + Args: + image (PIL.Image.Image): The input image. + patch_size (int): The size of each patch. + + Returns: + list: A list of PIL.Image.Image objects representing the patches. + """ + patches = [] + width, height = image.size + for i in range(0, height, patch_size): + for j in range(0, width, patch_size): + box = (j, i, j + patch_size, i + patch_size) + patch = image.crop(box) + patches.append(patch) + + return patches + + +def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): + """ + Calculate the shape of the image patch grid after the preprocessing for images of any resolution. + + Args: + image_size (tuple): The size of the input image in the format (width, height). + grid_pinpoints (str): A string representation of a list of possible resolutions. + patch_size (int): The size of each image patch. + + Returns: + tuple: The shape of the image patch grid in the format (width, height). 
+ """ + if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints: + assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]" + # Use regex to extract the range from the input string + matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints) + range_start = tuple(map(int, matches[0])) + range_end = tuple(map(int, matches[-1])) + # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1]) + grid_pinpoints = [(i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)] + # Multiply all elements by patch_size + grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints] + if type(grid_pinpoints) is list: + possible_resolutions = grid_pinpoints + else: + possible_resolutions = ast.literal_eval(grid_pinpoints) + width, height = select_best_resolution(image_size, possible_resolutions) + return width // patch_size, height // patch_size + + +def process_anyres_image(image, processor, grid_pinpoints): + """ + Process an image with variable resolutions. + + Args: + image (PIL.Image.Image): The input image to be processed. + processor: The image processor object. + grid_pinpoints (str): A string representation of a list of possible resolutions. + + Returns: + torch.Tensor: A tensor containing the processed image patches. 
+ """ + # Convert grid_pinpoints from string to list + if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints: + try: + patch_size = processor.size[0] + except Exception as e: + patch_size = processor.size["shortest_edge"] + assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]" + # Use regex to extract the range from the input string + matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints) + range_start = tuple(map(int, matches[0])) + range_end = tuple(map(int, matches[-1])) + # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1]) + grid_pinpoints = [(i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)] + # Multiply all elements by patch_size + grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints] + + if type(grid_pinpoints) is list: + possible_resolutions = grid_pinpoints + else: + possible_resolutions = ast.literal_eval(grid_pinpoints) + best_resolution = select_best_resolution(image.size, possible_resolutions) + image_padded = resize_and_pad_image(image, best_resolution) + + patches = divide_to_patches(image_padded, processor.crop_size["height"]) + + # FIXME: this seems to be a bug that it resizes instead of pad. 
+ # but to keep it consistent with previous, i will keep it as it is + # TODO: uncomment below to ablate with the padding + if isinstance(processor.size, dict): + shortest_edge = processor.size["shortest_edge"] + else: + shortest_edge = min(processor.size) + image_original_resize = image.resize((shortest_edge, shortest_edge)) + # image_padded_square = expand2square(image, tuple(int(x*255) for x in processor.image_mean)) + # image_original_resize = image_padded_square.resize((processor.size['shortest_edge'], processor.size['shortest_edge'])) + + image_patches = [image_original_resize] + patches + image_patches = [processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0] for image_patch in image_patches] + return torch.stack(image_patches, dim=0) + + +def load_image_from_base64(image): + return Image.open(BytesIO(base64.b64decode(image))) + + +def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + +def process_images(images, image_processor, model_cfg): + image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) + new_images = [] + if image_aspect_ratio == "highres": + for image in images: + image = process_highres_image(image, image_processor, model_cfg.image_grid_pinpoints) + new_images.append(image) + elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio: + for image in images: + image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints) + new_images.append(image) + elif image_aspect_ratio == "crop_split": + for image in images: + image = process_highres_image_crop_split(image, model_cfg, image_processor) + 
new_images.append(image) + elif image_aspect_ratio == "pad": + for image in images: + image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean)) + image = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0] + new_images.append(image) + else: + return image_processor.preprocess(images, return_tensors="pt")["pixel_values"] + if all(x.shape == new_images[0].shape for x in new_images): + new_images = torch.stack(new_images, dim=0) + return new_images + + +def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): + prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("")] + + def insert_separator(X, sep): + return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1] + + input_ids = [] + offset = 0 + if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: + offset = 1 + input_ids.append(prompt_chunks[0][0]) + + for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): + input_ids.extend(x[offset:]) + + if return_tensors is not None: + if return_tensors == "pt": + return torch.tensor(input_ids, dtype=torch.long) + raise ValueError(f"Unsupported tensor type: {return_tensors}") + return input_ids + + +def get_model_name_from_path(model_path): + model_path = model_path.strip("/") + model_paths = model_path.split("/") + if model_paths[-1].startswith("checkpoint-"): + return model_paths[-2] + "_" + model_paths[-1] + else: + return model_paths[-1] + + +class KeywordsStoppingCriteria(StoppingCriteria): + def __init__(self, keywords, tokenizer, input_ids): + self.keywords = keywords + self.keyword_ids = [] + for keyword in keywords: + cur_keyword_ids = tokenizer(keyword).input_ids + if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: + cur_keyword_ids = cur_keyword_ids[1:] + self.keyword_ids.append(torch.tensor(cur_keyword_ids)) + self.tokenizer = tokenizer + 
self.start_len = input_ids.shape[1] + + def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: + assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" # TODO + offset = min(output_ids.shape[1] - self.start_len, 3) + self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] + for keyword_id in self.keyword_ids: + if output_ids[0, -keyword_id.shape[0] :] == keyword_id: + return True + outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] + for keyword in self.keywords: + if keyword in outputs: + return True + return False diff --git a/train/llava/model/__init__.py b/train/llava/model/__init__.py new file mode 100644 index 0000000..0a9ae48 --- /dev/null +++ b/train/llava/model/__init__.py @@ -0,0 +1,8 @@ +import os + +AVAILABLE_MODELS = { + "llava_llada": "LlavaLLaDAModelLM, LlavaLLaDAConfig", +} + +for model_name, model_classes in AVAILABLE_MODELS.items(): + exec(f"from .language_model.{model_name} import {model_classes}") \ No newline at end of file diff --git a/train/llava/model/apply_delta.py b/train/llava/model/apply_delta.py new file mode 100644 index 0000000..c183ba1 --- /dev/null +++ b/train/llava/model/apply_delta.py @@ -0,0 +1,47 @@ +""" +Usage: +python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta +""" + +import argparse + +import torch +from tqdm import tqdm +from transformers import AutoTokenizer, AutoModelForCausalLM +from llava import LlavaLlamaForCausalLM + + +def apply_delta(base_model_path, target_model_path, delta_path): + print("Loading base model") + base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + + print("Loading delta") + delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + delta_tokenizer = 
AutoTokenizer.from_pretrained(delta_path) + + print("Applying delta") + for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): + if name not in base.state_dict(): + assert name in ["model.mm_projector.weight", "model.mm_projector.bias"], f"{name} not in base model" + continue + if param.data.shape == base.state_dict()[name].shape: + param.data += base.state_dict()[name] + else: + assert name in ["model.embed_tokens.weight", "lm_head.weight"], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}" + bparam = base.state_dict()[name] + param.data[: bparam.shape[0], : bparam.shape[1]] += bparam + + print("Saving target model") + delta.save_pretrained(target_model_path) + delta_tokenizer.save_pretrained(target_model_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--base-model-path", type=str, required=True) + parser.add_argument("--target-model-path", type=str, required=True) + parser.add_argument("--delta-path", type=str, required=True) + + args = parser.parse_args() + + apply_delta(args.base_model_path, args.target_model_path, args.delta_path) diff --git a/train/llava/model/builder.py b/train/llava/model/builder.py new file mode 100644 index 0000000..264534c --- /dev/null +++ b/train/llava/model/builder.py @@ -0,0 +1,145 @@ +# Copyright 2023 Haotian Liu +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import os +import warnings + +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig +import torch +from llava.model import * +from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from llava.utils import rank0_print +from trl.import_utils import is_npu_available + + +def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", torch_dtype="float16",attn_implementation="flash_attention_2", customized_config=None, overwrite_config=None, **kwargs): + kwargs["device_map"] = device_map + + if load_8bit: + kwargs["load_in_8bit"] = True + elif load_4bit: + kwargs["load_in_4bit"] = True + kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4") + elif torch_dtype == "float16": + kwargs["torch_dtype"] = torch.float16 + elif torch_dtype == "bfloat16": + kwargs["torch_dtype"] = torch.bfloat16 + else: + import pdb;pdb.set_trace() + + if customized_config is not None: + kwargs["config"] = customized_config + + if "multimodal" in kwargs: + if kwargs["multimodal"] is True: + is_multimodal = True + kwargs.pop("multimodal") + else: + is_multimodal = False + + if "llava" in model_name.lower() or is_multimodal: + # Load LLaVA model + rank0_print(f"Loaded LLaVA model: {model_path}") + if "llada" in model_name.lower() or "exp" in model_name.lower(): + from llava.model.language_model.llava_llada import LlavaLLaDAConfig + + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) + if customized_config is None: + llada_cfg = LlavaLLaDAConfig.from_pretrained(model_path) + else: + llada_cfg = customized_config + + if overwrite_config is not None: + rank0_print(f"Overwriting config with {overwrite_config}") + for k, v in overwrite_config.items(): + setattr(llada_cfg, k, v) + + model = 
LlavaLLaDAModelLM.from_pretrained(model_path, low_cpu_mem_usage=True, attn_implementation=attn_implementation, config=llada_cfg, **kwargs) + + else: + try: + from llava.model.language_model.llava_llama import LlavaConfig + + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) + if customized_config is None: + llava_cfg = LlavaConfig.from_pretrained(model_path) + if "v1.5" in model_path.lower(): + llava_cfg.delay_load = True # a workaround for correctly loading v1.5 models + else: + llava_cfg = customized_config + + if overwrite_config is not None: + rank0_print(f"Overwriting config with {overwrite_config}") + for k, v in overwrite_config.items(): + setattr(llava_cfg, k, v) + model = LlavaLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, attn_implementation=attn_implementation, config=llava_cfg, **kwargs) + except: + raise ValueError(f"Model {model_name} not supported") + + else: + # Load language model + if model_base is not None: + # PEFT model + from peft import PeftModel + + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(model_base, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto") + print(f"Loading LoRA weights from {model_path}") + model = PeftModel.from_pretrained(model, model_path) + print(f"Merging weights") + model = model.merge_and_unload() + print("Convert to FP16...") + model.to(torch.float16) + else: + use_fast = False + if "mpt" in model_name.lower().replace("prompt", ""): + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) + model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs) + else: + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) + + rank0_print(f"Model Class: {model.__class__.__name__}") + image_processor = None + + if "llava" 
in model_name.lower() or is_multimodal: + mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) + mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) + if mm_use_im_patch_token: + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) + if mm_use_im_start_end: + tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) + model.resize_token_embeddings(len(tokenizer)) + + vision_tower = model.get_vision_tower() + if not vision_tower.is_loaded: + vision_tower.load_model(device_map=device_map) + if device_map != "auto": + if is_npu_available(): + vision_tower.to(device="npu", dtype=torch.float16) + else: + vision_tower.to(device="cuda", dtype=torch.bfloat16) + image_processor = vision_tower.image_processor + + if hasattr(model.config, "max_sequence_length"): + context_len = model.config.max_sequence_length + elif hasattr(model.config, "max_position_embeddings"): + context_len = model.config.max_position_embeddings + elif hasattr(model.config, "tokenizer_model_max_length"): + context_len = model.config.tokenizer_model_max_length + else: + context_len = 2048 + + return tokenizer, model, image_processor, context_len diff --git a/train/llava/model/consolidate.py b/train/llava/model/consolidate.py new file mode 100644 index 0000000..f02e575 --- /dev/null +++ b/train/llava/model/consolidate.py @@ -0,0 +1,30 @@ +""" +Usage: +python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate +""" + +import argparse + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +from llava.model import * +from llava.model.utils import auto_upgrade + + +def consolidate_ckpt(src_path, dst_path): + print("Loading model") + auto_upgrade(src_path) + src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) + 
src_model.save_pretrained(dst_path) + src_tokenizer.save_pretrained(dst_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--src", type=str, required=True) + parser.add_argument("--dst", type=str, required=True) + + args = parser.parse_args() + + consolidate_ckpt(args.src, args.dst) diff --git a/train/llava/model/language_model/configuration_llada.py b/train/llava/model/language_model/configuration_llada.py new file mode 100644 index 0000000..7fc5604 --- /dev/null +++ b/train/llava/model/language_model/configuration_llada.py @@ -0,0 +1,197 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" LLaDA model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +LLaDA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +class LLaDAConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`LLaDAModel`]. 
It is used to instantiate an LLaDA + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the LLaDA-8B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the LLaDA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`LLaDAModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. 
+ max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to understand more about it. This value is + necessary to ensure exact reproducibility of the pretraining results. Please refer to [this + issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. 
+ attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + """ + + model_type = "llada" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + num_experts_per_tok=-1, + num_experts=-1, + moe_choice='expert', + capacity_factor=0.1, + moe_router_enable_expert_bias=None, + moe_router_score_function=None, + moe_lora_rank=32, + moe_lora_alpha=32, + moe_lora_in_features=4096, + moe_lora_out_features=4096, + moe_lora_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.num_experts_per_tok = 
num_experts_per_tok + self.num_experts = num_experts + self.moe_choice = moe_choice + self.capacity_factor = capacity_factor + self.moe_router_enable_expert_bias = moe_router_enable_expert_bias + self.moe_router_score_function = moe_router_score_function + self.moe_lora_rank=moe_lora_rank + self.moe_lora_alpha=moe_lora_alpha + self.moe_lora_in_features=moe_lora_in_features + self.moe_lora_out_features=moe_lora_out_features + self.moe_lora_dropout=moe_lora_dropout + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") diff --git a/train/llava/model/language_model/llava_llada.py b/train/llava/model/language_model/llava_llada.py new file mode 100644 index 0000000..22b1e10 --- /dev/null +++ b/train/llava/model/language_model/llava_llada.py @@ -0,0 +1,211 @@ +# Copyright 2023 Zebin You +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from llava.model.language_model.configuration_llada import LLaDAConfig +from llava.model.language_model.modeling_llada import LLaDAModel, LLaDAModelLM +from llava.model.llava_arch import LlavaMetaForCausalLM, LlavaMetaModel +from transformers import ( + AutoConfig, + AutoModelForCausalLM, +) +from transformers.generation.utils import GenerateOutput +from transformers.modeling_outputs import CausalLMOutputWithPast + + +class LlavaLLaDAConfig(LLaDAConfig): + model_type = "llava_llada" + temperature: float = 0.0 # reset to 0.0, previously 0.9 for Vicuna + max_new_tokens: int = 1024 + do_sample: bool = False + top_p: Optional[float] = None + # rope_scaling: Optional[dict] = {} + + +class LlavaLLaDAModel(LlavaMetaModel, LLaDAModel): + config_class = LlavaLLaDAConfig + + def __init__(self, config: LLaDAConfig): + super(LlavaLLaDAModel, self).__init__(config) + + +class LlavaLLaDAModelLM(LLaDAModelLM, LlavaMetaForCausalLM): + config_class = LlavaLLaDAConfig + + def __init__(self, config): + LLaDAModelLM.__init__(self, config) + + # configure default generation settings + config.model_type = "llava_llada" + # config.rope_scaling = None + + self.model = LlavaLLaDAModel(config) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + # Initialize weights and apply final processing + self.post_init() + + def get_model(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, 
+ position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + image_sizes: Optional[List[List[int]]] = None, + return_dict: Optional[bool] = None, + modalities: Optional[List[str]] = ["image"], + dpo_forward: Optional[bool] = None, + cache_position=None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + if inputs_embeds is None and attention_mask is not None: + # donate multi-dialogue + ( + input_ids, + position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels, + conversation_ids, + ) = self.prepare_inputs_labels_for_multimodal( + input_ids, + position_ids, + attention_mask, + past_key_values, + labels, + images, + modalities, + image_sizes, + is_llada=True, + ) + elif inputs_embeds is None: + ( + input_ids, + position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels, + ) = self.prepare_inputs_labels_for_multimodal( + input_ids, + position_ids, + attention_mask, + past_key_values, + labels, + images, + modalities, + image_sizes, + ) + conversation_ids = None + if dpo_forward: + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + return logits, labels + + else: + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + 
output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + conversation_ids=conversation_ids, + ) + + @torch.no_grad() + def generate( + self, + inputs: Optional[torch.Tensor] = None, + images: Optional[torch.Tensor] = None, + image_sizes: Optional[torch.Tensor] = None, + modalities: Optional[List[str]] = ["image"], + **kwargs, + ) -> Union[GenerateOutput, torch.LongTensor]: + modalities = ( + kwargs.pop("modalities", None) + if "modalities" in kwargs and modalities is None + else modalities + ) + position_ids = kwargs.pop("position_ids", None) + attention_mask = kwargs.pop("attention_mask", None) + if "inputs_embeds" in kwargs: + raise NotImplementedError("`inputs_embeds` is not supported") + + if images is not None: + (inputs, position_ids, attention_mask, _, inputs_embeds, _) = ( + self.prepare_inputs_labels_for_multimodal( + inputs, + position_ids, + attention_mask, + None, + None, + images, + modalities, + image_sizes=image_sizes, + ) + ) + else: + inputs_embeds = self.get_model().embed_tokens(inputs) + + return super().generate_with_embeds(inputs_embeds=inputs_embeds, **kwargs) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs + ): + images = kwargs.pop("images", None) + image_sizes = kwargs.pop("image_sizes", None) + inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + **kwargs, + ) + if images is not None: + inputs["images"] = images + if image_sizes is not None: + inputs["image_sizes"] = image_sizes + return inputs + + +AutoConfig.register("llava_llada", LlavaLLaDAConfig) +AutoModelForCausalLM.register(LlavaLLaDAConfig, LlavaLLaDAModelLM) diff --git a/train/llava/model/language_model/modeling_llada.py b/train/llava/model/language_model/modeling_llada.py new file mode 100644 index 0000000..997bad5 --- /dev/null +++ b/train/llava/model/language_model/modeling_llada.py @@ 
-0,0 +1,2330 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch LLaDA model.""" + +import math +import warnings +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from llava.cache import dLLMCache +from torch import nn +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache, StaticCache +from transformers.modeling_attn_mask_utils import AttentionMaskConverter +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) + +from .configuration_llada import LLaDAConfig + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding 
import index_first_axis, pad_input, unpad_input # noqa + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "LLaDAConfig" + + +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +class LLaDARMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + LLaDARMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +ALL_LAYERNORM_LAYERS.append(LLaDARMSNorm) + + +class LLaDARotaryEmbedding(nn.Module): + def __init__( + self, + dim, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + ): + super().__init__() + self.scaling_factor = scaling_factor + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / ( + self.base + ** ( + torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) + / self.dim + ) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + # For BC we register cos and sin cached + self.max_seq_len_cached = max_position_embeddings + t = torch.arange( + self.max_seq_len_cached, device=device, dtype=torch.int64 + ).type_as(self.inv_freq) + t = t / self.scaling_factor + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same 
calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer( + "_cos_cached", emb.cos().to(torch.get_default_dtype()), persistent=False + ) + self.register_buffer( + "_sin_cached", emb.sin().to(torch.get_default_dtype()), persistent=False + ) + + @property + def sin_cached(self): + logger.warning_once( + "The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use " + "the forward method of RoPE from now on instead. It is not used in the `LLaDAAttention` class" + ) + return self._sin_cached + + @property + def cos_cached(self): + logger.warning_once( + "The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use " + "the forward method of RoPE from now on instead. It is not used in the `LLaDAAttention` class" + ) + return self._cos_cached + + @torch.no_grad() + def forward(self, x, position_ids): + # x: [bs, num_attention_heads, seq_len, head_size] + inv_freq_expanded = ( + self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + ) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = ( + device_type + if isinstance(device_type, str) and device_type != "mps" + else "cpu" + ) + with torch.autocast(device_type=device_type, enabled=False): + freqs = ( + inv_freq_expanded.float() @ position_ids_expanded.float() + ).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class LLaDALinearScalingRotaryEmbedding(LLaDARotaryEmbedding): + """LLaDARotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" + + def forward(self, x, position_ids): + # difference to the original RoPE: a scaling factor is aplied to the position ids + position_ids = position_ids.float() / self.scaling_factor + cos, sin = super().forward(x, position_ids) + return cos, sin + + +class LLaDADynamicNTKScalingRotaryEmbedding(LLaDARotaryEmbedding): + """LLaDARotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def forward(self, x, position_ids): + # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) + - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / ( + base + ** ( + torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(x.device) + / self.dim + ) + ) + self.register_buffer( + "inv_freq", inv_freq, persistent=False + ) # TODO joao: this may break with compilation + + cos, sin = super().forward(x, position_ids) + return cos, sin + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. 
+ unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class LLaDAMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + if self.config.pretraining_tp > 1: + slice = self.intermediate_size // self.config.pretraining_tp + gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) + up_proj_slices = self.up_proj.weight.split(slice, dim=0) + down_proj_slices = self.down_proj.weight.split(slice, dim=1) + + gate_proj = torch.cat( + [ + F.linear(x, gate_proj_slices[i]) + for i in range(self.config.pretraining_tp) + ], + dim=-1, + ) + up_proj = torch.cat( + [ + F.linear(x, up_proj_slices[i]) + for i in 
range(self.config.pretraining_tp) + ], + dim=-1, + ) + + intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) + down_proj = [ + F.linear(intermediate_states[i], down_proj_slices[i]) + for i in range(self.config.pretraining_tp) + ] + down_proj = sum(down_proj) + else: + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + return down_proj + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand( + batch, num_key_value_heads, n_rep, slen, head_dim + ) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class LLaDAAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: LLaDAConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + # self.is_causal = True + # Modify: MDM set causal to False. + self.is_causal = False + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.q_proj = nn.Linear( + self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=config.attention_bias, + ) + self.v_proj = nn.Linear( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=config.attention_bias, + ) + self.o_proj = nn.Linear( + self.hidden_size, self.hidden_size, bias=config.attention_bias + ) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = LLaDARotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = LLaDALinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = LLaDADynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise 
ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + if self.config.pretraining_tp > 1: + key_value_slicing = ( + self.num_key_value_heads * self.head_dim + ) // self.config.pretraining_tp + query_slices = self.q_proj.weight.split( + (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 + ) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [ + F.linear(hidden_states, query_slices[i]) + for i in range(self.config.pretraining_tp) + ] + query_states = torch.cat(query_states, dim=-1) + + key_states = [ + F.linear(hidden_states, key_slices[i]) + for i in range(self.config.pretraining_tp) + ] + key_states = torch.cat(key_states, dim=-1) + + value_states = [ + F.linear(hidden_states, value_slices[i]) + for i in range(self.config.pretraining_tp) + ] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + past_key_value = getattr(self, "past_key_value", past_key_value) + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = 
apply_rotary_pos_emb( + query_states, key_states, cos, sin + ) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul( + query_states, key_states.transpose(2, 3) + ) / math.sqrt(self.head_dim) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(query_states.dtype) + attn_weights = nn.functional.dropout( + attn_weights, p=self.attention_dropout, training=self.training + ) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if self.config.pretraining_tp > 1: + attn_output = attn_output.split( + self.hidden_size // self.config.pretraining_tp, dim=2 + ) + o_proj_slices = self.o_proj.weight.split( + self.hidden_size // self.config.pretraining_tp, dim=1 + ) + attn_output = sum( + [ + F.linear(attn_output[i], o_proj_slices[i]) + for i in range(self.config.pretraining_tp) + ] + ) + else: + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class 
LLaDAFlashAttention2(LLaDAAttention): + """ + LLaDA flash attention module. This module inherits from `LLaDAAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 
2) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin + ) + + past_key_value = getattr(self, "past_key_value", past_key_value) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. 
(LLaDARMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LLaDAFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + assert causal is False # Modify: MDM + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + ( + query_states, + key_states, + value_states, + indices_q, + cu_seq_lens, + max_seq_lens, + ) = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input( + attn_output_unpad, indices_q, batch_size, query_length + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + return attn_output + + def _upad_input( + 
self, query_layer, key_layer, value_layer, attention_mask, query_length + ): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), + indices_k, + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), + indices_k, + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), + indices_k, + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( + query_layer, attention_mask + ) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +class LLaDASdpaAttention(LLaDAAttention): + """ + LLaDA attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `LLaDAAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. 
+ """ + + # Adapted from LLaDAAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "LLaDAModel is using LLaDASdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin + ) + + # In case static cache is used, it is an instance attribute. 
+ past_key_value = getattr(self, "past_key_value", past_key_value) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + # if attention_mask is not None and cache_position is not None: + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + is_causal=False, # Modify: MDM + dropout_p=self.attention_dropout if self.training else 0.0, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +LLaDA_ATTENTION_CLASSES = { + "eager": LLaDAAttention, + "flash_attention_2": LLaDAFlashAttention2, + "sdpa": LLaDASdpaAttention, +} + + +class LoRALayer(nn.Module): + def __init__(self, config): + super().__init__() + self.r = config.moe_lora_rank + self.alpha = config.moe_lora_alpha + self.scaling = self.alpha / self.r + + in_features = config.moe_lora_in_features + out_features = config.moe_lora_out_features + + if self.r > 0: + self.lora_A = 
nn.Parameter(torch.randn(self.r, in_features)) + self.lora_B = nn.Parameter(torch.zeros(out_features, self.r)) + self.dropout = nn.Dropout(config.moe_lora_dropout) + else: + self.lora_A = None + self.lora_B = None + self.dropout = nn.Identity() + + def forward(self, x): + return self.dropout(x) @ self.lora_A.T @ self.lora_B.T * self.scaling + + +class LLaDAMoESparseMoeBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.num_experts = config.num_experts + self.top_k = config.num_experts_per_tok + self.capacity_factor = config.capacity_factor + self.norm_topk_prob = config.moe_choice == "token" + + self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False) + self.experts = nn.ModuleList( + [LoRALayer(config) for _ in range(self.num_experts)] + ) + # self.score_func = config.moe_router_score_function + if ( + hasattr(config, "moe_router_enable_expert_bias") + and config.moe_router_enable_expert_bias + ): + self.register_buffer("expert_bias", torch.zeros(self.num_experts)) + else: + self.expert_bias = None + if config.moe_choice == "expert": + self.forward = self.forward_expert_choice + else: + self.forward = self.forward_token_choice + + # expert choice + def forward_expert_choice(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, sequence_length, hidden_dim = hidden_states.shape + num_tokens = batch_size * sequence_length + hidden_states = hidden_states.view(-1, hidden_dim) # flatten operation + # router_logits: (batch * sequence_length, n_experts) + router_logits = F.softmax( + self.gate(hidden_states), dim=1 + ) # importance of each token and experts + + router_logits_T = router_logits.t() # expert_num, sequence_length + + capacity = max( + 1, int(round(self.capacity_factor * num_tokens / self.num_experts)) + ) # find out the capacity works? 
+ capacity = min(capacity, num_tokens) # clamp to valid range + # import pdb; pdb.set_trace() + expert_weights, expert_indices = torch.topk( + router_logits_T, + k=capacity, + dim=1, + sorted=False, # no need to sort for efficiency + ) + + # Initialize output and token processing counter + output = torch.zeros_like(hidden_states) + # import pdb; pdb.set_trace() + + # Process each expert in parallel (if possible) or sequentially + for expert_idx in range(self.num_experts): + # Get tokens selected by this expert + token_indices = expert_indices[expert_idx] # [capacity] + weights = expert_weights[expert_idx] # [capacity] + + # Get hidden states for selected tokens + selected_hidden_states = hidden_states[ + token_indices + ] # [capacity, hidden_size] + + # Expert forward pass + expert_output = self.experts[expert_idx]( + selected_hidden_states + ) # [capacity, hidden_size] -> + + # Weighted accumulation to output + # Each token may be processed by multiple experts, so we accumulate + weighted_output = ( + weights.unsqueeze(-1) * expert_output + ) # [1, capacity] * [capacity, hidden_size] + + # Scatter-add to output tensor + output.index_add_(0, token_indices, weighted_output) + # break + # Reshape back to original shape + output = output.view(batch_size, sequence_length, hidden_dim) + + # print(f"in mlp output_router_logits: {output_router_logits}") + # import pdb; pdb.set_trace() + # print("router logits: ", router_logits) + + return output + + # token choice + def forward_token_choice(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Forward pass with improved routing and load balancing. 
+ + Args: + hidden_states: [batch_size, sequence_length, hidden_dim] + + Returns: + output: [batch_size, sequence_length, hidden_dim] + """ + batch_size, sequence_length, hidden_dim = hidden_states.shape + num_tokens = batch_size * sequence_length + + # Reshape to [num_tokens, hidden_dim] + hidden_states = hidden_states.view(-1, hidden_dim) + + # Compute routing weights + router_logits_raw, routing_probs = self._compute_routing_weights(hidden_states) + + # Select top-k experts per token + # topk_weights: [num_tokens, top_k] + # selected_experts: [num_tokens, top_k] + topk_weights, selected_experts = torch.topk(routing_probs, self.top_k, dim=-1) + + # Normalize top-k probabilities (Fix 1) + if self.norm_topk_prob: + topk_weights = topk_weights / topk_weights.sum( + dim=-1, keepdim=True + ).clamp_min(1e-9) + + # Cast to input dtype + topk_weights = topk_weights.to(hidden_states.dtype) + + # Compute auxiliary loss (Fix 3) + if self.training: + self.aux_loss = self._compute_aux_loss( + routing_probs, selected_experts, num_tokens + ) + else: + self.aux_loss = torch.tensor( + 0.0, device=hidden_states.device, dtype=hidden_states.dtype + ) + + # Initialize output + final_hidden_states = torch.zeros( + (num_tokens, hidden_dim), + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + + # Efficient per-expert dispatch (Fix 2 - avoid one_hot) + # Process each expert + for expert_idx in range(self.num_experts): + # Find which tokens route to this expert + # expert_mask: [num_tokens, top_k] - boolean mask + expert_mask = selected_experts == expert_idx + + # Get token indices that route to this expert (at any of their top-k positions) + # token_indices: [num_routed_tokens] + token_indices = expert_mask.any(dim=-1).nonzero(as_tuple=True)[0] + + if token_indices.numel() == 0: + # No tokens routed to this expert, skip + continue + + # For each routed token, find which k position corresponds to this expert + # k_positions: [num_routed_tokens] + k_positions = 
expert_mask[token_indices].float().argmax(dim=-1) + + # Get the routing weights for this expert + # weights: [num_routed_tokens, 1] + weights = topk_weights[token_indices, k_positions].unsqueeze(-1) + + # Get the hidden states for routed tokens + # current_states: [num_routed_tokens, hidden_dim] + current_states = hidden_states.index_select(0, token_indices) + + # Compute expert output + # expert_output: [num_routed_tokens, hidden_dim] + expert_output = self.experts[expert_idx](current_states) + + # Weight the expert output + weighted_output = expert_output * weights + + # Accumulate to final output + final_hidden_states.index_add_( + 0, token_indices, weighted_output.to(hidden_states.dtype) + ) + + # Reshape back to [batch_size, sequence_length, hidden_dim] + final_hidden_states = final_hidden_states.reshape( + batch_size, sequence_length, hidden_dim + ) + + return final_hidden_states + + +class LLaDADecoderLayer(nn.Module): + def __init__(self, config: LLaDAConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.mlp_type = "moe" + + self.self_attn = LLaDA_ATTENTION_CLASSES[config._attn_implementation]( + config=config, layer_idx=layer_idx + ) + + self.mlp = LLaDAMLP(config) + self.moe = LLaDAMoESparseMoeBlock(config) if config.num_experts > 0 else None + self.input_layernorm = LLaDARMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = LLaDARMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[ + torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] + ]: + """ + Args: + hidden_states (`torch.FloatTensor`): 
input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states_mlp = self.mlp(hidden_states) + if self.moe: + hidden_states_moe = self.moe(hidden_states) + hidden_states = residual + hidden_states_mlp + hidden_states_moe + else: + hidden_states = residual + hidden_states_mlp + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +LLaDA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`LLaDAConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare LLaDA Model outputting raw hidden-states without any specific head on top.", + LLaDA_START_DOCSTRING, +) +class LLaDAPreTrainedModel(PreTrainedModel): + config_class = LLaDAConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["LLaDADecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _setup_cache( + self, cache_cls, max_batch_size, max_cache_len: Optional[int] = None + ): + if ( + self.config._attn_implementation == "flash_attention_2" + and cache_cls == StaticCache + ): + raise ValueError( + "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` " + "make sure to use `sdpa` in the mean time, and open an issue at 
https://github.com/huggingface/transformers" + ) + + for layer in self.model.layers: + device = layer.input_layernorm.weight.device + if hasattr(self.config, "_pre_quantization_dtype"): + dtype = self.config._pre_quantization_dtype + else: + dtype = layer.self_attn.o_proj.weight.dtype + layer.self_attn.past_key_value = cache_cls( + self.config, max_batch_size, max_cache_len, device=device, dtype=dtype + ) + + def _reset_cache(self): + for layer in self.model.layers: + layer.self_attn.past_key_value = None + + +LLaDA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +@add_start_docstrings( + "The bare LLaDA Model outputting raw hidden-states without any specific head on top.", + LLaDA_START_DOCSTRING, +) +class LLaDAModel(LLaDAPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`LLaDADecoderLayer`] + + Args: + config: LLaDAConfig + """ + + def __init__(self, config: LLaDAConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding( + config.vocab_size, config.hidden_size, self.padding_idx + ) + self.layers = nn.ModuleList( + [ + LLaDADecoderLayer(config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.norm = LLaDARMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(LLaDA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Union[Tuple, BaseModelOutputWithPast]: + # Add Basic MDM Model config check + assert past_key_values is None and not use_cache, ( + "The kvcache is not suppotred for MDM." 
+ ) + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + past_seen_tokens = 0 + if use_cache: # kept for BC (cache positions) + if not isinstance(past_key_values, StaticCache): + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_seen_tokens = past_key_values.get_seq_length() + + if cache_position is None: + if isinstance(past_key_values, StaticCache): + raise ValueError( + "cache_position is a required argument when using StaticCache." 
+ ) + cache_position = torch.arange( + past_seen_tokens, + past_seen_tokens + inputs_embeds.shape[1], + device=inputs_embeds.device, + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, is_causal=False + ) # Modify: MDM + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + **kwargs, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = ( + next_decoder_cache.to_legacy_cache() + if isinstance(next_decoder_cache, Cache) + else next_decoder_cache + ) + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + 
attentions=all_self_attns, + ) + + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 + def _update_causal_mask( + self, attention_mask, input_tensor, cache_position, is_causal=True + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + if hasattr(self.layers[0].self_attn, "past_key_value"): # static cache + target_length = self.config.max_position_embeddings + else: # dynamic cache + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else cache_position[-1] + 1 + ) + + causal_mask = torch.full( + (sequence_length, target_length), + fill_value=min_dtype, + dtype=dtype, + device=device, + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + + if is_causal == False: + causal_mask = torch.zeros( + (sequence_length, target_length), dtype=dtype, device=device + ) + + causal_mask *= torch.arange( + target_length, device=device + ) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand( + input_tensor.shape[0], 1, -1, -1 + ) + if attention_mask is not None: + causal_mask = ( + causal_mask.clone() + ) # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + # The position with 1 in attention_mask represents the place to be attended to, so here we need 
to mask the place where attention_mask is 0 + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[ + :, None, None, : + ].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[ + ..., :mask_length + ].masked_fill(padding_mask, min_dtype) + elif attention_mask.dim() == 4: + # The position with 1 in attention_mask represents the place to be attended to, so here we need to mask the place where attention_mask is 0 + # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with + # cache. In that case, the 4D attention mask attends to the newest tokens only. + if attention_mask.shape[-2] < cache_position[0] + sequence_length: + offset = cache_position[0] + else: + offset = 0 + mask_shape = attention_mask.shape + mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype + causal_mask[ + : mask_shape[0], + : mask_shape[1], + offset : mask_shape[2] + offset, + : mask_shape[3], + ] = mask_slice + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + ): + # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400). + is_tracing = ( + torch.jit.is_tracing() + or isinstance(input_tensor, torch.fx.Proxy) + or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling()) + ) + if not is_tracing and torch.any(attention_mask != 1): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
+ # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended( + causal_mask, min_dtype + ) + + return causal_mask + + +class LLaDAModelLM(LLaDAPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = LLaDAModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + def _build_conversation_mask_optimized(self, conversation_ids): + # Reshape conversation_ids for broadcasting + ids_i = conversation_ids.unsqueeze(-1) # [batch_size, seq_len, 1] + ids_j = conversation_ids.unsqueeze(-2) # [batch_size, 1, seq_len] + + # Use broadcasting to compare all pairs of conversation IDs + conv_mask = ids_j <= ids_i # [batch_size, seq_len, seq_len] + + # Add the attention head dimension + return conv_mask.unsqueeze(1) # [batch_size, 1, seq_len, seq_len] + + @staticmethod + def add_gumbel_noise(logits, temperature): + """ + The Gumbel max is a method for sampling categorical distributions. + According to arXiv:2409.02908, for MDM, low-precision Gumbel Max improves perplexity score but reduces generation quality. + Thus, we use float64. + """ + if temperature == 0: + # When temperature=0, we can directly return the original logits. 
+ # without any noise or transformation + return logits + + # use float64 for more stable computation + logits = logits.to(torch.float64) + noise = torch.rand_like(logits, dtype=torch.float64) + gumbel_noise = (-torch.log(noise)) ** temperature + return logits.exp() / gumbel_noise + + @staticmethod + def get_num_transfer_tokens(mask_index, steps): + """ + Precompute the number of tokens to transition at each step. + Optimized to be more efficient. + """ + mask_num = mask_index.sum(dim=1, keepdim=True) + base = mask_num // steps + remainder = mask_num % steps + + # Create tensor once and modify in-place (via clone) + num_transfer_tokens = base.expand(-1, steps).clone() + + # Handle remainder more efficiently + if remainder.sum() > 0: # Optimization: only proceed if there are remainders + indices = torch.arange(steps, device=mask_index.device) + # Create mask using broadcasting + # indices shape: [steps] -> [1, steps] + # remainder shape: [batch_size, 1] + # mask shape: [batch_size, steps] + mask = indices.unsqueeze(0) < remainder + num_transfer_tokens[mask] += 1 + + return num_transfer_tokens.to(torch.int64) + + @staticmethod + def get_masked_indices_from_embeds(noisy_embeds, masked_embed): + # Get shape information + b, l, d = noisy_embeds.shape + # Expand masked_embed to the same shape as noisy_embeds [b, l, d] + masked_embed_expanded = masked_embed.expand(b, l, d) + # Calculate absolute difference + abs_diff = torch.abs(noisy_embeds - masked_embed_expanded) + # Calculate tolerance boundary (atol + rtol * abs(masked_embed)) + tolerance = 1e-5 + 1e-5 * torch.abs(masked_embed_expanded) + # Check if all dimensions at each position are within tolerance + # all(dim=-1) ensures all dimensions of each embedding meet the condition + masked_indices = (abs_diff <= tolerance).all(dim=-1) + + return masked_indices + + @torch.no_grad() + def generate( + self, + prompt, + steps=128, + gen_length=128, + block_length=128, + temperature=0.0, + cfg_scale=0.0, + 
remasking="low_confidence", + mask_id=126336, + ): + """ + Args: + prompt: A tensor of shape (1, l). + steps: Sampling steps, less than or equal to gen_length. + gen_length: Generated answer length. + block_length: Block length, less than or equal to gen_length. If less than gen_length, it means using semi_autoregressive remasking. + temperature: Categorical distribution sampling temperature. + cfg_scale: Unsupervised classifier-free guidance scale. + remasking: Remasking strategy. 'low_confidence' or 'random'. + mask_id: The toke id of [MASK] is 126336. + """ + x = torch.full((1, prompt.shape[1] + gen_length), mask_id, dtype=torch.long).to( + prompt.device + ) + x[:, : prompt.shape[1]] = prompt.clone() + + prompt_index = x != mask_id + + assert gen_length % block_length == 0 + num_blocks = gen_length // block_length + + assert steps % num_blocks == 0 + steps = steps // num_blocks + + for num_block in range(num_blocks): + block_mask_index = ( + x[ + :, + prompt.shape[1] + num_block * block_length : prompt.shape[1] + + (num_block + 1) * block_length :, + ] + == mask_id + ) + num_transfer_tokens = self.get_num_transfer_tokens(block_mask_index, steps) + for i in range(steps): + mask_index = x == mask_id + if cfg_scale > 0.0: + un_x = x.clone() + un_x[prompt_index] = mask_id + x_ = torch.cat([x, un_x], dim=0) + logits = self.model(x_).logits + logits, un_logits = torch.chunk(logits, 2, dim=0) + logits = un_logits + (cfg_scale + 1) * (logits - un_logits) + else: + logits = self.model(x).logits + + logits_with_noise = self.add_gumbel_noise( + logits, temperature=temperature + ) + x0 = torch.argmax(logits_with_noise, dim=-1) # b, l + + if remasking == "low_confidence": + p = F.softmax(logits.to(torch.float64), dim=-1) + x0_p = torch.squeeze( + torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1 + ) # b, l + elif remasking == "random": + x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device) + else: + raise NotImplementedError(remasking) + + x0_p[:, 
prompt.shape[1] + (num_block + 1) * block_length :] = -np.inf + + x0 = torch.where(mask_index, x0, x) + confidence = torch.where(mask_index, x0_p, -np.inf) + + transfer_index = torch.zeros_like( + x0, dtype=torch.bool, device=x0.device + ) + for j in range(confidence.shape[0]): + _, select_index = torch.topk( + confidence[j], k=num_transfer_tokens[j, i] + ) + transfer_index[j, select_index] = True + x[transfer_index] = x0[transfer_index] + + return x + + @torch.no_grad() + def generate_with_embeds( + self, + inputs_embeds, + steps=128, + gen_length=128, + block_length=128, + temperature=0.0, + cfg_scale=0.0, + remasking="low_confidence", + mask_id=126336, + tokenizer=None, + stopping_criteria=None, + generation_suffix=None, + **kwargs, + ): + """ + Args: + inputs_embeds: A tensor of shape (1, l, d). + steps: Sampling steps, less than or equal to gen_length. + gen_length: Generated answer length. + block_length: Block length, less than or equal to gen_length. If less than gen_length, it means using semi_autoregressive remasking. + temperature: Categorical distribution sampling temperature. + cfg_scale: Unsupervised classifier-free guidance scale. + remasking: Remasking strategy. 'low_confidence' or 'random'. + mask_id: The toke id of [MASK] is 126336. 
+ generation_suffix: (str or None) Generation suffix, such as "The answer is xxx", will be appended to the end + """ + # Use mixed precision for faster computation + with torch.cuda.amp.autocast(enabled=True): + # Handle generation suffix + suffix_embeds = None + suffix_token_ids = None + suffix_len = 0 + if ( + generation_suffix is not None + and tokenizer is not None + and len(generation_suffix) > 0 + ): + # Encode as token id + suffix_token_ids = tokenizer.encode( + generation_suffix, add_special_tokens=False + ) + suffix_token_ids = torch.tensor( + suffix_token_ids, dtype=torch.long, device=inputs_embeds.device + ).unsqueeze(0) # (1, s) + # Convert to embedding + suffix_embeds = self.model.embed_tokens(suffix_token_ids) # (1, s, d) + suffix_len = suffix_embeds.shape[1] + else: + suffix_len = 0 + + # Create input in embedding space + total_length = inputs_embeds.shape[1] + gen_length + suffix_len + masked_embed = self.model.embed_tokens( + torch.tensor([mask_id]).to(inputs_embeds.device) + ) # shape (1, d) + x_embeds = masked_embed.repeat(1, total_length, 1).to( + inputs_embeds.device + ) # shape (1, l + gen_length + suffix_len, d) + x_embeds[:, : inputs_embeds.shape[1]] = inputs_embeds.clone() + if suffix_embeds is not None: + x_embeds[:, -suffix_len:] = suffix_embeds + + # Create a tracking tensor for token IDs for final output + x = torch.full( + (1, total_length), + mask_id, + dtype=torch.long, + device=inputs_embeds.device, + ) + if suffix_token_ids is not None: + x[:, -suffix_len:] = suffix_token_ids + + # prompt_index: A tensor of shape (1, l + gen_length + suffix_len) where the first l elements are 1 (representing the prompt) + # and the remaining gen_length+suffix_len elements are 0 (representing the generated part) + prompt_index = torch.zeros( + (1, total_length), dtype=torch.bool, device=inputs_embeds.device + ) + prompt_index[:, : inputs_embeds.shape[1]] = ( + 1 # shape (1, l + gen_length + suffix_len) + ) + + assert gen_length % block_length == 0 + 
num_blocks = gen_length // block_length + + assert steps % num_blocks == 0 + steps = steps // num_blocks + + # New: Initialize stop position variable (default to maximum length) + stop_position = inputs_embeds.shape[1] + gen_length + found_stop_seq = False + + stop_tokens = [] + if stopping_criteria is not None: + assert tokenizer is not None, ( + "tokenizer is required when stopping_criteria is not None" + ) + for stop_str in stopping_criteria: + # Use tokenizer to convert stop words to token IDs + tokens = tokenizer.encode(stop_str, add_special_tokens=False) + stop_tokens.append(tokens) + + feature_cache = dLLMCache() + feature_cache.reset_cache(inputs_embeds.shape[1]) + for num_block in range(num_blocks): + # Create mask index for the current block + block_start = inputs_embeds.shape[1] + num_block * block_length + block_end = inputs_embeds.shape[1] + (num_block + 1) * block_length + + # If a stop word is found and the stop word position is before the current block, do not process the current block + if found_stop_seq and stop_position <= block_start: + break + + block_embeds = x_embeds[:, block_start:block_end] + block_mask_index = torch.all( + torch.abs(block_embeds - masked_embed) < 1e-5, dim=2 + ) + + num_transfer_tokens = self.get_num_transfer_tokens( + block_mask_index, steps + ) + + for i in range(steps): + # Determine which positions are mask embeddings + mask_index = torch.all( + torch.abs(x_embeds - masked_embed) < 1e-5, dim=2 + ) + + # If a stop word has been found, check if the masks before the stop word are all filled + if found_stop_seq: + # Get the mask state before the stop word + pre_stop_masks = mask_index[ + 0, inputs_embeds.shape[1] : stop_position + ] + # If the masks before the stop word are all filled, exit generation + if not pre_stop_masks.any(): + break + + # Check if there are any masks left to fill in the current block + current_block_masks = mask_index[0, block_start:block_end] + if not current_block_masks.any(): + break + + # Handle 
CFG + if cfg_scale > 0.0: + un_embeds = ( + x_embeds.clone() + ) # shape (1, l + gen_length + suffix_len, d) + un_mask = prompt_index.unsqueeze(-1).expand_as( + x_embeds + ) # shape (1, l + gen_length + suffix_len, d) + un_embeds[un_mask] = masked_embed.repeat( + x_embeds.shape[0], x_embeds.shape[1], 1 + )[un_mask] # Use repeat to avoid the complexity of expand_as + combined_embeds = torch.cat([x_embeds, un_embeds], dim=0) + + # Forward pass + outputs = self.model(inputs_embeds=combined_embeds) + logits = self.lm_head(outputs[0]).float() + + # Split and apply CFG + logits, un_logits = torch.chunk(logits, 2, dim=0) + logits = un_logits + (cfg_scale + 1) * (logits - un_logits) + else: + # Forward pass + outputs = self.model(inputs_embeds=x_embeds) + logits = self.lm_head(outputs[0]).float() + + for token_id in [126081, 126080, 126346, 126347]: + logits[:, :, token_id] = torch.where( + mask_index, -float("inf"), logits[:, :, token_id] + ) + + # Add noise and get the most likely token + logits_with_noise = self.add_gumbel_noise( + logits, temperature=temperature + ) # shape (1, l + gen_length + suffix_len, vocab_size) + x0 = torch.argmax( + logits_with_noise, dim=-1 + ) # 1, l + gen_length + suffix_len + + # Get confidence scores + if remasking == "low_confidence": + p = F.softmax( + logits.to(torch.float64), dim=-1 + ) # shape (1, l + gen_length + suffix_len, vocab_size) + x0_p = torch.squeeze( + torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1 + ) # 1, l + gen_length + suffix_len represents the confidence of each x0 + elif remasking == "random": + x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device) + else: + raise NotImplementedError(remasking) + + # If a stop word is found, only process positions before the stop word + if found_stop_seq: + x0_p[:, stop_position:] = -np.inf + else: + # Prevent processing future blocks + x0_p[:, block_end:] = -np.inf + + # Do not allow the generated suffix part to be overwritten + if suffix_len > 0: + x0_p[:, 
-suffix_len:] = -np.inf + + # Update predictions only at mask positions + x0_embeds = self.model.embed_tokens( + x0 + ) # shape (1, l + gen_length + suffix_len, d) + x0_embeds = torch.where( + mask_index.unsqueeze(-1).expand_as(x_embeds), + x0_embeds, + x_embeds, + ) + x0 = torch.where( + mask_index, x0, x + ) # shape (1, l + gen_length + suffix_len) + + # Calculate confidence and determine transfer index + confidence = torch.where(mask_index, x0_p, -np.inf) + + transfer_index = torch.zeros_like( + x0, dtype=torch.bool, device=x0.device + ) + for j in range(confidence.shape[0]): + _, select_index = torch.topk( + confidence[j], k=num_transfer_tokens[j, i] + ) + transfer_index[j, select_index] = True + + # Update embeddings and token IDs + x_embeds[transfer_index] = x0_embeds[transfer_index] + x[transfer_index] = x0[transfer_index] + + # New: Check for stop words after each update + if stopping_criteria is not None: + # Only check the generated part (excluding the suffix) + generated_part = x[ + 0, + inputs_embeds.shape[1] : inputs_embeds.shape[1] + + gen_length, + ] + current_stop_position = None + + for stop_seq in stop_tokens: + if not isinstance(stop_seq, list): + stop_seq = [stop_seq] + # Check if the generated sequence contains stop words + for start_idx in range( + generated_part.size(0) - len(stop_seq) + 1 + ): + if torch.all( + generated_part[ + start_idx : start_idx + len(stop_seq) + ] + == torch.tensor(stop_seq, device=x.device) + ): + # Calculate the position of the currently found stop word + current_position = ( + inputs_embeds.shape[1] + start_idx + ) + # If it is the first time a stop word is found, or this stop word is earlier than the previously found one + if ( + not found_stop_seq + or current_position < stop_position + ): + stop_position = current_position + found_stop_seq = True + break + if found_stop_seq and current_stop_position is None: + break + + # Return the generated result, up to stop_position, and append the suffix + if found_stop_seq: 
+ if suffix_len > 0: + return torch.cat( + [ + x[:, inputs_embeds.shape[1] : stop_position], + x[:, -suffix_len:], + ], + dim=1, + ) + else: + return x[:, inputs_embeds.shape[1] : stop_position] + else: + if suffix_len > 0: + return torch.cat( + [ + x[ + :, + inputs_embeds.shape[1] : inputs_embeds.shape[1] + + gen_length, + ], + x[:, -suffix_len:], + ], + dim=1, + ) + else: + return x[ + :, inputs_embeds.shape[1] : inputs_embeds.shape[1] + gen_length + ] + + @add_start_docstrings_to_model_forward(LLaDA_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC + ) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + conversation_ids: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + def forward_process_embeds(input_embeds, labels, eps=1e-3): + b, l, d = input_embeds.shape + t = torch.rand(b, device=input_embeds.device) + p_mask = (1 - eps) * t + eps + p_mask = p_mask[:, None].repeat(1, l) + + masked_indices = torch.rand((b, l), device=input_embeds.device) < p_mask + # Add label condition filtering + valid_mask = labels != -100 # Create valid encoding + masked_indices = ( + masked_indices & valid_mask + ) # Combine random encoding and valid encoding + # Magic number 126336 stands for the tokenizer special token, + # Magic embeddings, which is used for [MASK] token here, + masked_embed = self.model.embed_tokens( + torch.tensor([126336]).to(input_embeds.device) + ) + noisy_embeds = torch.where( + masked_indices.unsqueeze(-1), masked_embed, input_embeds + ) + + return noisy_embeds, p_mask, masked_embed + + noisy_embeds, p_mask, masked_embed = forward_process_embeds( + inputs_embeds, labels + ) + + masked_indices = self.get_masked_indices_from_embeds( + noisy_embeds, masked_embed + ) # shape (b, l) + prompt_index = (labels == -100).to(torch.int64) # shape (b, l) + + noisy_data_length = torch.sum( + (1 - prompt_index), dim=-1, keepdim=True + ) # shape (b, 1) + noisy_data_length = noisy_data_length.repeat( + 1, noisy_embeds.shape[1] + ) # shape (b, l) + + if conversation_ids is not None: + conversation_mask = self._build_conversation_mask_optimized( + conversation_ids + ) + if attention_mask is not None: + # 1. 
Dimension expansion + attention_mask = attention_mask.unsqueeze(1).unsqueeze( + 2 + ) # (batch, length) -> (batch, 1, 1, length) + attention_mask = attention_mask.expand_as( + conversation_mask + ) # (batch, 1, 1, length) -> (batch, 1, length, length) + # 2. Mask combination (element-wise multiplication) + combined_mask = conversation_mask * attention_mask + else: + # If attention_mask is None, directly use conversation_mask + combined_mask = conversation_mask + attention_mask = combined_mask + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=noisy_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + if self.config.pretraining_tp > 1: + lm_head_slices = self.lm_head.weight.split( + self.vocab_size // self.config.pretraining_tp, dim=0 + ) + logits = [ + F.linear(hidden_states, lm_head_slices[i]) + for i in range(self.config.pretraining_tp) + ] + logits = torch.cat(logits, dim=-1) + else: + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Change for MDM + token_loss = ( + F.cross_entropy( + logits[masked_indices], + labels[masked_indices], + ignore_index=-100, + reduction="none", + ) + / p_mask[masked_indices] + ) + loss = ( + torch.sum(token_loss / noisy_data_length[masked_indices]) + / labels.shape[0] + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + 
past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + **kwargs, + ): + # With static cache, the `past_key_values` is None + # TODO joao: standardize interface for the different Cache classes and remove of this if + has_static_cache = False + if past_key_values is None: + past_key_values = getattr( + self.model.layers[0].self_attn, "past_key_value", None + ) + has_static_cache = past_key_values is not None + + past_length = 0 + if past_key_values is not None: + if isinstance(past_key_values, Cache): + past_length = ( + cache_position[0] + if cache_position is not None + else past_key_values.get_seq_length() + ) + max_cache_length = ( + torch.tensor( + past_key_values.get_max_length(), device=input_ids.device + ) + if past_key_values.get_max_length() is not None + else None + ) + cache_length = ( + past_length + if max_cache_length is None + else torch.min(max_cache_length, past_length) + ) + # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if ( + attention_mask is not None + and attention_mask.shape[1] > input_ids.shape[1] + ): + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise + # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114 + # TODO: use `next_tokens` directly instead. 
+ model_inputs = {"input_ids": input_ids.contiguous()} + + input_length = ( + position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1] + ) + if cache_position is None: + cache_position = torch.arange( + past_length, past_length + input_length, device=input_ids.device + ) + else: + cache_position = cache_position[-input_length:] + + if has_static_cache: + past_key_values = None + + model_inputs.update( + { + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx.to(past_state.device)) + for past_state in layer_past + ), + ) + return reordered_past diff --git a/train/llava/model/llava_arch.py b/train/llava/model/llava_arch.py new file mode 100644 index 0000000..2467aa7 --- /dev/null +++ b/train/llava/model/llava_arch.py @@ -0,0 +1,701 @@ +# Copyright 2023 Haotian Liu +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from abc import ABC, abstractmethod + +import math +import re +import time +import torch +import torch.nn as nn +from .multimodal_encoder.builder import build_vision_tower +from .multimodal_resampler.builder import build_vision_resampler +from .multimodal_projector.builder import build_vision_projector + +from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + +from llava.mm_utils import get_anyres_image_grid_shape +from llava.utils import rank0_print, rank_print +import random + + +class LlavaMetaModel: + + def __init__(self, config): + super(LlavaMetaModel, self).__init__(config) + + if hasattr(config, "mm_vision_tower"): + delay_load = getattr(config, "delay_load", False) + self.vision_tower = build_vision_tower(config, delay_load=delay_load) + self.vision_resampler = build_vision_resampler(config, vision_tower=self.vision_tower) + self.mm_projector = build_vision_projector(config, vision_cfg=self.vision_tower.config) + + if "unpad" in getattr(config, "mm_patch_merge_type", ""): + self.image_newline = nn.Parameter(torch.empty(config.hidden_size, dtype=self.dtype)) + + def get_vision_tower(self): + vision_tower = getattr(self, "vision_tower", None) + if type(vision_tower) is list: + vision_tower = vision_tower[0] + return vision_tower + + def initialize_vision_modules(self, model_args, fsdp=None): + vision_tower = model_args.vision_tower + mm_vision_select_layer = model_args.mm_vision_select_layer + mm_vision_select_feature = model_args.mm_vision_select_feature + pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter + mm_patch_merge_type = model_args.mm_patch_merge_type + + self.config.mm_vision_tower = vision_tower + self.config.vision_tower_pretrained = getattr(model_args, "vision_tower_pretrained", "") + + if self.get_vision_tower() is None: + vision_tower = build_vision_tower(model_args) + vision_resampler = build_vision_resampler(model_args, vision_tower=vision_tower) 
+ for k, v in vision_resampler.config.items(): + setattr(self.config, k, v) + + if fsdp is not None and len(fsdp) > 0: + self.vision_tower = [vision_tower] + self.vision_resampler = [vision_resampler] + else: + self.vision_tower = vision_tower + self.vision_resampler = vision_resampler + else: + if fsdp is not None and len(fsdp) > 0: + vision_resampler = self.vision_resampler[0] + vision_tower = self.vision_tower[0] + else: + vision_resampler = self.vision_resampler + vision_tower = self.vision_tower + vision_tower.load_model() + + # In case it is frozen by LoRA + for p in self.vision_resampler.parameters(): + p.requires_grad = True + + self.config.use_mm_proj = True + self.config.mm_projector_type = getattr(model_args, "mm_projector_type", "linear") + self.config.mm_hidden_size = getattr(vision_resampler, "hidden_size", vision_tower.hidden_size) + self.config.mm_vision_select_layer = mm_vision_select_layer + self.config.mm_vision_select_feature = mm_vision_select_feature + self.config.mm_patch_merge_type = mm_patch_merge_type + + + if not hasattr(self.config, 'add_faster_video'): + if model_args.add_faster_video: + embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype)) + self.faster_token = nn.Parameter( + torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std + ) + + if getattr(self, "mm_projector", None) is None: + self.mm_projector = build_vision_projector(self.config, vision_cfg=vision_tower.config) + + if "unpad" in mm_patch_merge_type: + embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype)) + self.image_newline = nn.Parameter(torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std) + else: + # In case it is frozen by LoRA + for p in self.mm_projector.parameters(): + p.requires_grad = True + + if pretrain_mm_mlp_adapter is not None: + mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location="cpu") + + def get_w(weights, keyword): + return {k.split(keyword + 
".")[1]: v for k, v in weights.items() if keyword in k} + + incompatible_keys = self.mm_projector.load_state_dict(get_w(mm_projector_weights, "mm_projector")) + rank0_print(f"Loaded mm projector weights from {pretrain_mm_mlp_adapter}. Incompatible keys: {incompatible_keys}") + incompatible_keys = self.vision_resampler.load_state_dict(get_w(mm_projector_weights, "vision_resampler"), strict=False) + rank0_print(f"Loaded vision resampler weights from {pretrain_mm_mlp_adapter}. Incompatible keys: {incompatible_keys}") + + +def unpad_image(tensor, original_size): + """ + Unpads a PyTorch tensor of a padded and resized image. + + Args: + tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format. + original_size (tuple): The original size of the image (height, width). + + Returns: + torch.Tensor: The unpadded image tensor. + """ + original_width, original_height = original_size + current_height, current_width = tensor.shape[1:] + + # Compute aspect ratios + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + + # Determine padding size and direction + if original_aspect_ratio > current_aspect_ratio: + # Padding was added to the height + scale_factor = current_width / original_width + new_height = int(original_height * scale_factor) + padding = (current_height - new_height) // 2 + unpadded_tensor = tensor[:, padding : current_height - padding, :] + else: + # Padding was added to the width + scale_factor = current_height / original_height + new_width = int(original_width * scale_factor) + padding = (current_width - new_width) // 2 + unpadded_tensor = tensor[:, :, padding : current_width - padding] + + return unpadded_tensor + + +class LlavaMetaForCausalLM(ABC): + + @abstractmethod + def get_model(self): + pass + + def get_vision_tower(self): + return self.get_model().get_vision_tower() + + def get_2dPool(self, image_feature, stride=2): + height = width = self.get_vision_tower().num_patches_per_side + 
num_frames, num_tokens, num_dim = image_feature.shape + image_feature = image_feature.view(num_frames, height, width, -1) + image_feature = image_feature.permute(0, 3, 1, 2).contiguous() + # image_feature = nn.functional.max_pool2d(image_feature, self.config.mm_spatial_pool_stride) + if self.config.mm_spatial_pool_mode == "average": + image_feature = nn.functional.avg_pool2d(image_feature, stride) + elif self.config.mm_spatial_pool_mode == "max": + image_feature = nn.functional.max_pool2d(image_feature, stride) + elif self.config.mm_spatial_pool_mode == "bilinear": + height, width = image_feature.shape[2:] + scaled_shape = [math.ceil(height / stride), math.ceil(width / stride)] + image_feature = nn.functional.interpolate(image_feature, size=scaled_shape, mode='bilinear') + + else: + raise ValueError(f"Unexpected mm_spatial_pool_mode: {self.config.mm_spatial_pool_mode}") + image_feature = image_feature.permute(0, 2, 3, 1) + image_feature = image_feature.view(num_frames, -1, num_dim) + return image_feature + + def encode_images(self, images): + image_features = self.get_model().get_vision_tower()(images) + # image_features = self.get_model().vision_resampler(image_features, images=images) + image_features = self.get_model().mm_projector(image_features) + return image_features + + def encode_multimodals(self, videos_or_images, video_idx_in_batch, split_sizes=None): + videos_or_images_features = self.get_model().get_vision_tower()(videos_or_images) + per_videos_or_images_features = torch.split(videos_or_images_features, split_sizes, dim=0) # tuple, (dim_1, 576, 4096) + all_videos_or_images_features = [] + all_faster_video_features = [] + cur_mm_spatial_pool_stride = self.config.mm_spatial_pool_stride + + for idx, feat in enumerate(per_videos_or_images_features): + + feat = self.get_model().mm_projector(feat) + faster_video_feature = 0 + slower_img_feat = 0 + if idx in video_idx_in_batch and cur_mm_spatial_pool_stride > 1: + slower_img_feat = 
self.get_2dPool(feat,cur_mm_spatial_pool_stride) + if self.config.add_faster_video: + cur_mm_spatial_pool_stride = cur_mm_spatial_pool_stride * 2 + faster_video_feature = self.get_2dPool(feat,cur_mm_spatial_pool_stride) + if slower_img_feat != 0: + all_videos_or_images_features.append(slower_img_feat) + else: + all_videos_or_images_features.append(feat) + all_faster_video_features.append(faster_video_feature) + return all_videos_or_images_features,all_faster_video_features + + def add_token_per_grid(self, image_feature): + resize_h = int(math.sqrt(image_feature.shape[1])) + num_frames = image_feature.shape[0] + feature_dim = image_feature.shape[-1] + + image_feature = image_feature.view(num_frames, 1, resize_h, resize_h, -1) + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = torch.cat((image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)), dim=-1) + if getattr(self.config, "add_faster_video", False): + # import pdb; pdb.set_trace() + # (3584, 832, 14) -> (3584, 64, 13, 14) + image_feature = image_feature.view(feature_dim, num_frames,resize_h, -1) + # (3584, 64, 13, 14) -> (64, 13, 14, 3584) + image_feature = image_feature.permute(1, 2, 3, 0).contiguous() + # (64, 13, 14, 3584) -> (64, 13*14, 3584) + image_feature = image_feature.flatten(1, 2) + # import pdb; pdb.set_trace() + return image_feature + # import pdb; pdb.set_trace() + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + return image_feature + + def add_token_per_frame(self, image_feature): + image_feature = image_feature.permute(2, 0, 1).contiguous() + image_feature = torch.cat((image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)), dim=-1) + image_feature = image_feature.permute(1, 2, 0).contiguous() + return image_feature + + def generate_conversation_ids(self, labels): + 
""" + Args: + labels: Label tensor, can be one-dimensional or two-dimensional + Returns: + Conversation ID tensor with the same shape as labels + """ + # Process input dimensions + original_shape = labels.shape + if labels.ndim == 1: + labels = labels.unsqueeze(0) + + batch_size, seq_len = labels.shape + device = labels.device + conversation_ids = torch.zeros_like(labels) + + # Special token IDs + start_header_id = 126346 + eot_id = 126348 + assistant_role_id1 = 598 + + # Process all sequences in batch + for b in range(batch_size): + # Pre-search all special token positions to reduce repeated searches + start_positions = (labels[b] == start_header_id).nonzero(as_tuple=True)[0] + end_positions = (labels[b] == eot_id).nonzero(as_tuple=True)[0] + + # If no boundaries are found, continue to the next sequence + if len(start_positions) == 0 or len(end_positions) == 0: + continue + + # Pair all message start and end positions + message_boundaries = [] + for start_pos in start_positions: + # Find the nearest end position + end_indices = (end_positions >= start_pos).nonzero(as_tuple=True)[0] + if len(end_indices) == 0: + continue + + end_pos = end_positions[end_indices[0]] + + # Quickly check if it's an assistant message + start_idx = start_pos.item() + is_assistant = (start_idx + 1 < seq_len and + labels[b, start_idx + 1] == assistant_role_id1) + + message_boundaries.append((start_idx, end_pos.item(), is_assistant)) + + # Sort by start position + message_boundaries.sort(key=lambda x: x[0]) + + # Determine if there is a system message + has_system = len(message_boundaries) > 0 and not message_boundaries[0][2] + + # Assign conversation turn IDs + current_turn = 0 + prev_was_assistant = False + + # Efficiently handle BOS token (usually at the start of the sequence) + if labels[b, 0] == 126080: # BOS ID + conversation_ids[b, 0] = 0 + + # Assign IDs to all messages at once + for i, (start_pos, end_pos, is_assistant) in enumerate(message_boundaries): + # The first non-assistant 
message is a system message + is_system = i == 0 and not is_assistant and has_system + + if is_system: + # System message belongs to the first conversation turn + conversation_ids[b, start_pos:end_pos+1] = 0 + else: + # If it's a user message and the previous one was an assistant message, increase the turn + if not is_assistant and prev_was_assistant: + current_turn += 1 + + # Assign ID to the entire message block at once + conversation_ids[b, start_pos:end_pos+1] = current_turn + + prev_was_assistant = is_assistant + + # Fill gaps between messages - use cumulative max method + # This is much faster than looping element by element + for i in range(1, seq_len): + if conversation_ids[b, i] == 0 and conversation_ids[b, i-1] > 0: + conversation_ids[b, i] = conversation_ids[b, i-1] + + # New: Handle end padding + non_zero_mask = (conversation_ids[b] != 0) + if non_zero_mask.any(): + last_non_zero_idx = torch.nonzero(non_zero_mask, as_tuple=True)[0][-1] + last_turn = conversation_ids[b, last_non_zero_idx] + conversation_ids[b, last_non_zero_idx+1:] = last_turn + + # Return a tensor with the same dimensions as the input + if len(original_shape) == 1: + return conversation_ids.squeeze(0) + + return conversation_ids + + def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities=["image"], image_sizes=None, is_llada=False): + vision_tower = self.get_vision_tower() + # rank_print(modalities) + if vision_tower is None or images is None or input_ids.shape[1] == 1: + return input_ids, position_ids, attention_mask, past_key_values, None, labels + + if isinstance(modalities, str): + modalities = [modalities] + + # import pdb; pdb.set_trace() + if type(images) is list or images.ndim == 5: + if type(images) is list: + images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images] + + video_idx_in_batch = [] + for _ in range(len(modalities)): + if modalities[_] == "video": + video_idx_in_batch.append(_) + + 
images_list = [] + for image in images: + if image.ndim == 4: + images_list.append(image) + else: + images_list.append(image.unsqueeze(0)) + + concat_images = torch.cat([image for image in images_list], dim=0) + split_sizes = [image.shape[0] for image in images_list] + encoded_image_features = self.encode_images(concat_images) + # image_features,all_faster_video_features = self.encode_multimodals(concat_images, video_idx_in_batch, split_sizes) + + # This is a list, each element is [num_images, patch * patch, dim] + # rank_print(f"Concat images : {concat_images.shape}") + encoded_image_features = torch.split(encoded_image_features, split_sizes) + image_features = [] + for idx, image_feat in enumerate(encoded_image_features): + if idx in video_idx_in_batch: + image_features.append(self.get_2dPool(image_feat)) + else: + image_features.append(image_feat) + # image_features = self.encode_multimodals(concat_images, video_idx_in_batch, split_sizes) + # rank_print(f"Encoded image feats : {[x.shape for x in image_features]}") + # image_features = torch.split(image_features, split_sizes, dim=0) + mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat") + image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square") + mm_newline_position = getattr(self.config, "mm_newline_position", "one_token") + + if mm_patch_merge_type == "flat": + image_features = [x.flatten(0, 1) for x in image_features] + + elif mm_patch_merge_type.startswith("spatial"): + new_image_features = [] + for image_idx, image_feature in enumerate(image_features): + # FIXME: now assume the image is square, and split to 2x2 patches + # num_patches = h * w, where h = w = sqrt(num_patches) + # currently image_feature is a tensor of shape (4, num_patches, hidden_size) + # we want to first unflatten it to (2, 2, h, w, hidden_size) + # rank0_print("At least we are reaching here") + # import pdb; pdb.set_trace() + if image_idx in video_idx_in_batch: # video operations + # 
rank0_print("Video") + if mm_newline_position == "grid": + # Grid-wise + image_feature = self.add_token_per_grid(image_feature) + if getattr(self.config, "add_faster_video", False): + faster_video_feature = self.add_token_per_grid(all_faster_video_features[image_idx]) + # Add a token for each frame + concat_slow_fater_token = [] + # import pdb; pdb.set_trace() + for _ in range(image_feature.shape[0]): + if _ % self.config.faster_token_stride == 0: + concat_slow_fater_token.append(torch.cat((image_feature[_], self.model.faster_token[None].to(image_feature.device)), dim=0)) + else: + concat_slow_fater_token.append(torch.cat((faster_video_feature[_], self.model.faster_token[None].to(image_feature.device)), dim=0)) + # import pdb; pdb.set_trace() + image_feature = torch.cat(concat_slow_fater_token) + + # print("!!!!!!!!!!!!") + + new_image_features.append(image_feature) + elif mm_newline_position == "frame": + # Frame-wise + image_feature = self.add_token_per_frame(image_feature) + + new_image_features.append(image_feature.flatten(0, 1)) + + elif mm_newline_position == "one_token": + # one-token + image_feature = image_feature.flatten(0, 1) + if 'unpad' in mm_patch_merge_type: + image_feature = torch.cat(( + image_feature, + self.model.image_newline[None].to(image_feature.device) + ), dim=0) + new_image_features.append(image_feature) + elif mm_newline_position == "no_token": + new_image_features.append(image_feature.flatten(0, 1)) + else: + raise ValueError(f"Unexpected mm_newline_position: {mm_newline_position}") + elif image_feature.shape[0] > 1: # multi patches and multi images operations + # rank0_print("Single-images") + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + height = width = self.get_vision_tower().num_patches_per_side + assert height * width == base_image_feature.shape[0] + + if "anyres_max" in image_aspect_ratio: + matched_anyres_max_num_patches = re.match(r"anyres_max_(\d+)", image_aspect_ratio) + if 
matched_anyres_max_num_patches: + max_num_patches = int(matched_anyres_max_num_patches.group(1)) + + if image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio: + if hasattr(self.get_vision_tower(), "image_size"): + vision_tower_image_size = self.get_vision_tower().image_size + else: + raise ValueError("vision_tower_image_size is not found in the vision tower.") + try: + num_patch_width, num_patch_height = get_anyres_image_grid_shape(image_sizes[image_idx], self.config.image_grid_pinpoints, vision_tower_image_size) + except Exception as e: + rank0_print(f"Error: {e}") + num_patch_width, num_patch_height = 2, 2 + image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) + else: + image_feature = image_feature.view(2, 2, height, width, -1) + + if "maxpool2x2" in mm_patch_merge_type: + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = nn.functional.max_pool2d(image_feature, 2) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + elif "unpad" in mm_patch_merge_type and "anyres_max" in image_aspect_ratio and matched_anyres_max_num_patches: + unit = image_feature.shape[2] + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, image_sizes[image_idx]) + c, h, w = image_feature.shape + times = math.sqrt(h * w / (max_num_patches * unit**2)) + if times > 1.1: + image_feature = image_feature[None] + image_feature = nn.functional.interpolate(image_feature, [int(h // times), int(w // times)], mode="bilinear")[0] + image_feature = torch.cat((image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)), dim=-1) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + elif "unpad" in mm_patch_merge_type: + image_feature = image_feature.permute(4, 0, 2, 1, 
3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, image_sizes[image_idx]) + image_feature = torch.cat((image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)), dim=-1) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + else: + image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() + image_feature = image_feature.flatten(0, 3) + if "nobase" in mm_patch_merge_type: + pass + else: + image_feature = torch.cat((base_image_feature, image_feature), dim=0) + new_image_features.append(image_feature) + else: # single image operations + image_feature = image_feature[0] + if "unpad" in mm_patch_merge_type: + image_feature = torch.cat((image_feature, self.model.image_newline[None]), dim=0) + + new_image_features.append(image_feature) + image_features = new_image_features + else: + raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}") + else: + image_features = self.encode_images(images) + + # TODO: image start / end is not implemented here to support pretraining. + if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(self.config, "mm_use_im_start_end", False): + raise NotImplementedError + # rank_print(f"Total images : {len(image_features)}") + + # Let's just add dummy tensors if they do not exist, + # it is a headache to deal with None all the time. + # But it is not ideal, and if you have a better idea, + # please open an issue / submit a PR, thanks. 
+ _labels = labels + _position_ids = position_ids + _attention_mask = attention_mask + if attention_mask is None: + attention_mask = torch.ones_like(input_ids, dtype=torch.bool) + else: + attention_mask = attention_mask.bool() + if position_ids is None: + position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device) + if labels is None: + labels = torch.full_like(input_ids, IGNORE_INDEX) + + # remove the padding using attention_mask -- FIXME + _input_ids = input_ids + input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)] + labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)] + + new_input_embeds = [] + new_labels = [] + cur_image_idx = 0 + # rank_print("Inserting Images embedding") + for batch_idx, cur_input_ids in enumerate(input_ids): + num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum() + # rank0_print(num_images) + if num_images == 0: + cur_image_features = image_features[cur_image_idx] + cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids) + cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0) + new_input_embeds.append(cur_input_embeds) + new_labels.append(labels[batch_idx]) + cur_image_idx += 1 + continue + + image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]] + cur_input_ids_noim = [] + cur_labels = labels[batch_idx] + cur_labels_noim = [] + for i in range(len(image_token_indices) - 1): + cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]]) + cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]]) + split_sizes = [x.shape[0] for x in cur_labels_noim] + cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim)) + cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0) + 
cur_new_input_embeds = [] + cur_new_labels = [] + + for i in range(num_images + 1): + cur_new_input_embeds.append(cur_input_embeds_no_im[i]) + cur_new_labels.append(cur_labels_noim[i]) + if i < num_images: + try: + cur_image_features = image_features[cur_image_idx] + except IndexError: + cur_image_features = image_features[cur_image_idx - 1] + cur_image_idx += 1 + cur_new_input_embeds.append(cur_image_features) + cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype)) + + cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds] + + # import pdb; pdb.set_trace() + cur_new_input_embeds = torch.cat(cur_new_input_embeds) + cur_new_labels = torch.cat(cur_new_labels) + + new_input_embeds.append(cur_new_input_embeds) + new_labels.append(cur_new_labels) + + # Truncate sequences to max length as image embeddings can make the sequence longer + tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None) + # rank_print("Finishing Inserting") + + new_input_embeds = [x[:tokenizer_model_max_length] for x, modality in zip(new_input_embeds, modalities)] + new_labels = [x[:tokenizer_model_max_length] for x, modality in zip(new_labels, modalities)] + # TODO: Hard code for control loss spike + # if tokenizer_model_max_length is not None: + # new_input_embeds = [x[:4096] if modality != "video" else x[:tokenizer_model_max_length] for x, modality in zip(new_input_embeds, modalities)] + # new_labels = [x[:4096] if modality != "video" else x[:tokenizer_model_max_length] for x, modality in zip(new_labels, modalities)] + + # Combine them + max_len = max(x.shape[0] for x in new_input_embeds) + batch_size = len(new_input_embeds) + + new_input_embeds_padded = [] + new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device) + attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, 
device=attention_mask.device) + position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device) + # rank0_print("Prepare pos id") + + for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)): + cur_len = cur_new_embed.shape[0] + if getattr(self.config, "tokenizer_padding_side", "right") == "left": + new_input_embeds_padded.append(torch.cat((torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device), cur_new_embed), dim=0)) + if cur_len > 0: + new_labels_padded[i, -cur_len:] = cur_new_labels + attention_mask[i, -cur_len:] = True + position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device) + else: + new_input_embeds_padded.append(torch.cat((cur_new_embed, torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0)) + if cur_len > 0: + new_labels_padded[i, :cur_len] = cur_new_labels + attention_mask[i, :cur_len] = True + position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device) + + new_input_embeds = torch.stack(new_input_embeds_padded, dim=0) + # rank0_print("tokenizer padding") + + if _labels is None: + new_labels = None + else: + new_labels = new_labels_padded + + if _attention_mask is None: + attention_mask = None + else: + attention_mask = attention_mask.to(dtype=_attention_mask.dtype) + + if _position_ids is None: + position_ids = None + if getattr(self.config, "use_pos_skipping", False) and self.training: + position_ids = torch.arange(new_input_embeds.size(1), device=new_input_embeds.device).unsqueeze(0).to(new_input_embeds.device) + split_position = random.randint(0, new_input_embeds.size(1)) + left_add = random.randint(0, self.config.pos_skipping_range) + right_add = random.randint(left_add, self.config.pos_skipping_range) + position_ids[:, :split_position] += left_add + 
position_ids[:, split_position:] += right_add + + # add conversation_ids + if is_llada and attention_mask is not None: + conversation_ids = self.generate_conversation_ids(new_labels) + return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels, conversation_ids + # import pdb; pdb.set_trace() + # rank0_print("Finish preparing") + return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels + + def initialize_vision_tokenizer(self, model_args, tokenizer): + if model_args.mm_use_im_patch_token: + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) + self.resize_token_embeddings(len(tokenizer)) + + if model_args.mm_use_im_start_end: + num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) + self.resize_token_embeddings(len(tokenizer)) + + if num_new_tokens > 0: + input_embeddings = self.get_input_embeddings().weight.data + output_embeddings = self.get_output_embeddings().weight.data + + input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True) + output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True) + + input_embeddings[-num_new_tokens:] = input_embeddings_avg + output_embeddings[-num_new_tokens:] = output_embeddings_avg + + if model_args.tune_mm_mlp_adapter: + for p in self.get_input_embeddings().parameters(): + p.requires_grad = True + for p in self.get_output_embeddings().parameters(): + p.requires_grad = False + + if model_args.pretrain_mm_mlp_adapter: + mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location="cpu") + embed_tokens_weight = mm_projector_weights["model.embed_tokens.weight"] + assert num_new_tokens == 2 + if input_embeddings.shape == embed_tokens_weight.shape: + input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:] + elif embed_tokens_weight.shape[0] == num_new_tokens: + input_embeddings[-num_new_tokens:] = 
embed_tokens_weight + else: + raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.") + elif model_args.mm_use_im_patch_token: + if model_args.tune_mm_mlp_adapter: + for p in self.get_input_embeddings().parameters(): + p.requires_grad = False + for p in self.get_output_embeddings().parameters(): + p.requires_grad = False diff --git a/train/llava/model/make_delta.py b/train/llava/model/make_delta.py new file mode 100644 index 0000000..7b3fbab --- /dev/null +++ b/train/llava/model/make_delta.py @@ -0,0 +1,52 @@ +""" +Usage: +python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta +""" + +import argparse + +import torch +from tqdm import tqdm +from transformers import AutoTokenizer, AutoModelForCausalLM +from llava.model.utils import auto_upgrade + + +def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): + print("Loading base model") + base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + + print("Loading target model") + auto_upgrade(target_model_path) + target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + + print("Calculating delta") + for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): + if name not in base.state_dict(): + assert name in ["model.mm_projector.weight", "model.mm_projector.bias"], f"{name} not in base model" + continue + if param.data.shape == base.state_dict()[name].shape: + param.data -= base.state_dict()[name] + else: + assert name in ["model.embed_tokens.weight", "lm_head.weight"], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}" + bparam = base.state_dict()[name] + param.data[: bparam.shape[0], : 
bparam.shape[1]] -= bparam + + print("Saving delta") + if hub_repo_id: + kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} + else: + kwargs = {} + target.save_pretrained(delta_path, **kwargs) + target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) + target_tokenizer.save_pretrained(delta_path, **kwargs) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--base-model-path", type=str, required=True) + parser.add_argument("--target-model-path", type=str, required=True) + parser.add_argument("--delta-path", type=str, required=True) + parser.add_argument("--hub-repo-id", type=str, default=None) + args = parser.parse_args() + + make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) diff --git a/train/llava/model/multimodal_encoder/builder.py b/train/llava/model/multimodal_encoder/builder.py new file mode 100644 index 0000000..d1cab60 --- /dev/null +++ b/train/llava/model/multimodal_encoder/builder.py @@ -0,0 +1,36 @@ +import os +from .clip_encoder import CLIPVisionTower +from .imagebind import ImageBindWrapper +from .open_clip_encoder import OpenCLIPVisionTower +from .hf_vision import HFVisionTower +from .siglip_encoder import SigLipVisionTower +from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 + +# from .eva_clip.eva_clip_encoder import EvaClipVisionTower +# from .dev_eva_clip.eva_vit import EvaViTWrapper + + +def build_vision_tower(vision_tower_cfg, **kwargs): + vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None)) + is_absolute_path_exists = os.path.exists(vision_tower) + use_s2 = getattr(vision_tower_cfg, "s2", False) + if vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: # is_absolute_path_exists or + if use_s2: + return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) + else: + return CLIPVisionTower(vision_tower, args=vision_tower_cfg, 
**kwargs) + elif "siglip" in vision_tower: + vision_tower = "../model/siglip2-so400m-patch14-384" + return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs) + elif vision_tower.startswith("hf:"): + return HFVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) + elif vision_tower in ["imagebind_huge"]: + return ImageBindWrapper(vision_tower, args=vision_tower_cfg, **kwargs) + elif vision_tower.startswith("open_clip_hub"): + return OpenCLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) + # elif "internal-eva" in vision_tower.lower() or "eva02" in vision_tower.lower(): + # return EvaClipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) + # elif vision_tower in ["EVA-CLIP-8B", "EVA-CLIP-8B-plus"]: + # return EvaViTWrapper(vision_tower, args=vision_tower_cfg, **kwargs) + + raise ValueError(f"Unknown vision tower: {vision_tower}") diff --git a/train/llava/model/multimodal_encoder/clip_encoder.py b/train/llava/model/multimodal_encoder/clip_encoder.py new file mode 100644 index 0000000..212b262 --- /dev/null +++ b/train/llava/model/multimodal_encoder/clip_encoder.py @@ -0,0 +1,173 @@ +import torch +import torch.nn as nn +from llava.utils import rank0_print +from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig + +try: + from s2wrapper import forward as multiscale_forward +except: + pass + + +class CLIPVisionTower(nn.Module): + def __init__(self, vision_tower, args, delay_load=False): + super().__init__() + + self.is_loaded = False + + self.vision_tower_name = vision_tower + self.select_layer = args.mm_vision_select_layer + self.select_feature = getattr(args, "mm_vision_select_feature", "patch") + + if not delay_load: + rank0_print(f"Loading vision tower: {vision_tower}") + self.load_model() + elif getattr(args, "unfreeze_mm_vision_tower", False): + # TODO: better detector is needed. 
+ rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") + self.load_model() + elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: + rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") + self.load_model() + else: + self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) + + def load_model(self, device_map=None): + if self.is_loaded: + rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) + return + + self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) + self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) + self.vision_tower.requires_grad_(False) + + self.is_loaded = True + + def feature_select(self, image_forward_outs): + select_feature_type = self.select_feature + + if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: + select_every_k_layer = len(image_forward_outs.hidden_states) // 4 + image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1) + select_feature_type = select_feature_type.replace("slicefour_", "") + elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]: + select_layers = [-2, -5, -8, -11, 6] + image_features = torch.cat([image_forward_outs.hidden_states[i] for i in select_layers], dim=-1) + select_feature_type = select_feature_type.replace("slice_m25811_f6_", "") + else: + image_features = image_forward_outs.hidden_states[self.select_layer] + + if select_feature_type == "patch": + image_features = image_features[:, 1:] + elif select_feature_type == "cls_patch": + image_features = image_features + else: + raise ValueError(f"Unexpected select feature: {select_feature_type}") + 
return image_features + + def forward(self, images): + if type(images) is list: + image_features = [] + for image in images: + image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) + image_feature = self.feature_select(image_forward_out).to(image.dtype) + image_features.append(image_feature) + else: + image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) + image_features = self.feature_select(image_forward_outs).to(images.dtype) + + return image_features + + @property + def dummy_feature(self): + return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) + + @property + def dtype(self): + return self.vision_tower.dtype + + @property + def device(self): + return self.vision_tower.device + + @property + def config(self): + if self.is_loaded: + return self.vision_tower.config + else: + return self.cfg_only + + @property + def hidden_size(self): + _hidden_size = self.config.hidden_size + if "slicefour" in self.select_feature: + _hidden_size *= 4 + if "slice_m25811_f6" in self.select_feature: + _hidden_size *= 5 + return _hidden_size + + @property + def num_patches_per_side(self): + return self.config.image_size // self.config.patch_size + + @property + def num_patches(self): + _num_patches = (self.config.image_size // self.config.patch_size) ** 2 + if "cls_patch" in self.select_feature: + _num_patches += 1 + return _num_patches + + @property + def image_size(self): + return self.config.image_size + + +class CLIPVisionTowerS2(CLIPVisionTower): + def __init__(self, vision_tower, args, delay_load=False): + + self.s2_scales = getattr(args, "s2_scales", "336,672,1008") + self.s2_scales = list(map(int, self.s2_scales.split(","))) + self.s2_scales.sort() + self.s2_split_size = self.s2_scales[0] + self.s2_image_size = self.s2_scales[-1] + + super().__init__(vision_tower, args, delay_load) + + # change resize/crop size in 
preprocessing to the largest image size in s2_scale + if not delay_load or getattr(args, "unfreeze_mm_vision_tower", False): + self.image_processor.size["shortest_edge"] = self.s2_image_size + self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size + + def load_model(self, device_map=None): + if self.is_loaded: + rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) + return + + self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) + self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) + self.vision_tower.requires_grad_(False) + + self.image_processor.size["shortest_edge"] = self.s2_image_size + self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size + + self.is_loaded = True + + def forward_feature(self, images): + image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) + image_features = self.feature_select(image_forward_outs).to(images.dtype) + return image_features + + def forward(self, images): + if type(images) is list: + image_features = [] + for image in images: + image_feature = multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) + image_features.append(image_feature) + else: + image_features = multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) + + return image_features + + @property + def hidden_size(self): + return self.config.hidden_size * len(self.s2_scales) diff --git a/train/llava/model/multimodal_encoder/hf_vision.py b/train/llava/model/multimodal_encoder/hf_vision.py new file mode 100644 index 0000000..a413208 --- /dev/null +++ b/train/llava/model/multimodal_encoder/hf_vision.py @@ -0,0 +1,111 @@ +import 
torch +import torch.nn as nn + +from transformers import AutoModel, AutoImageProcessor, AutoConfig, CLIPImageProcessor +from llava.utils import rank0_print + + +class HFVisionTower(nn.Module): + def __init__(self, vision_tower, args, delay_load=False): + super().__init__() + + self.is_loaded = False + + self.vision_tower_name = vision_tower.replace("hf:", "", 1) + self.select_layer = args.mm_vision_select_layer + self.select_feature = getattr(args, "mm_vision_select_feature", "patch") + + if not delay_load: + self.load_model() + else: + self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name) + + def load_model(self): + try: + self.image_processor = AutoImageProcessor.from_pretrained(self.vision_tower_name) + except Exception as e: + if "448" in self.vision_tower_name: + image_size = 448 + # use image processor with conig + self.image_processor = CLIPImageProcessor(size={"shortest_edge": image_size}, do_center_crop=True, crop_size=image_size) + else: + self.image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") + rank0_print(f"Loaded image processor: {self.image_processor}") + self.vision_tower = AutoModel.from_pretrained(self.vision_tower_name, torch_dtype=torch.bfloat16, trust_remote_code=True).to("cuda") + self.device = self.vision_tower.device + self.dtype = self.vision_tower.dtype + self.config = self.vision_tower.config + + if hasattr(self.vision_tower, "vision_model"): + self.vision_tower = self.vision_tower.vision_model + self.vision_tower.requires_grad_(False) + # self.vision_tower.eval() + self.is_loaded = True + + def feature_select(self, image_forward_outs): + select_feature_type = self.select_feature + + if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: + select_every_k_layer = len(image_forward_outs.hidden_states) // 4 + image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), 
select_every_k_layer)], dim=-1) + select_feature_type = select_feature_type.replace("slicefour_", "") + else: + image_features = image_forward_outs.hidden_states[self.select_layer] + + if select_feature_type == "patch": + image_features = image_features[:, 1:] + elif select_feature_type == "cls_patch": + image_features = image_features + else: + raise ValueError(f"Unexpected select feature: {select_feature_type}") + return image_features + + def forward(self, images): + if type(images) is list: + image_features = [] + for image in images: + image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) + image_feature = self.feature_select(image_forward_out).to(image.dtype) + image_features.append(image_feature) + else: + image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) + image_features = self.feature_select(image_forward_outs).to(images.dtype) + + return image_features + + @property + def dummy_feature(self): + return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) + + # @property + # def dtype(self): + # return self.vision_tower.dtype + + # @property + # def device(self): + # return self.vision_tower.device + + @property + def hidden_size(self): + try: + _hidden_size = self.config.hidden_size + except: + _hidden_size = self.config.vision_config.hidden_size + if "slicefour" in self.select_feature: + _hidden_size *= 4 + return _hidden_size + + @property + def num_patches(self): + _num_patches = (self.config.image_size // self.config.patch_size) ** 2 + if "cls_patch" in self.select_feature: + _num_patches += 1 + return _num_patches + + @property + def num_patches_per_side(self): + return self.config.image_size // self.config.patch_size + + @property + def image_size(self): + return self.config.image_size diff --git a/train/llava/model/multimodal_encoder/imagebind.py b/train/llava/model/multimodal_encoder/imagebind.py 
new file mode 100644 index 0000000..8bbe71c --- /dev/null +++ b/train/llava/model/multimodal_encoder/imagebind.py @@ -0,0 +1,73 @@ +import torch +import torch.nn as nn + +from transformers import CLIPImageProcessor + +try: + from imagebind.models import imagebind_model + from imagebind.models.imagebind_model import ModalityType + from imagebind.data import load_and_transform_audio_data +except ImportError: + pass + + +class ImageBindWrapper(nn.Module): + def __init__(self, vision_tower, select_layer, select_feature="patch", delay_load=False): + super().__init__() + + self.is_loaded = False + + self.vision_tower_name = vision_tower + self.select_layer = select_layer + self.select_feature = select_feature + + if not delay_load: + self.load_model() + + def load_model(self): + self.image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") + self.vision_tower = imagebind_model.imagebind_huge(pretrained=True) + for p in self.vision_tower.parameters(): + p.requires_grad = False + self.vision_tower.eval() + self.is_loaded = True + + def train(self, mode=True): + self.training = mode + + if self.is_loaded: + self.vision_tower.eval() + + @torch.no_grad() + def forward(self, x): + if type(x) == dict: + if x["audios"] is not None: + inputs = {ModalityType.AUDIO: load_and_transform_audio_data(x["audios"], device=self.device).half()} + embeddings = self.vision_tower(inputs) + audio_embedding = embeddings[ModalityType.AUDIO] + return audio_embedding.unsqueeze(1) + else: + inputs = {ModalityType.VISION: x.to(dtype=self.dtype)} + embeddings = self.vision_tower(inputs) + vision_embedding = embeddings[ModalityType.VISION] + if vision_embedding.ndim == 2: + return vision_embedding.unsqueeze(1) + if vision_embedding.shape[1] == 257: + return vision_embedding[:, 1:] + raise ValueError(f"Unexpected shape: {vision_embedding.shape}") + + @property + def dummy_feature(self): + return torch.zeros(1, 1024, device=self.device, dtype=self.dtype) + + @property + def 
dtype(self): + return self.vision_tower.modality_preprocessors.vision.cls_token.dtype + + @property + def device(self): + return self.vision_tower.modality_preprocessors.vision.cls_token.device + + @property + def hidden_size(self): + return 1024 diff --git a/train/llava/model/multimodal_encoder/open_clip_encoder.py b/train/llava/model/multimodal_encoder/open_clip_encoder.py new file mode 100644 index 0000000..17a3277 --- /dev/null +++ b/train/llava/model/multimodal_encoder/open_clip_encoder.py @@ -0,0 +1,163 @@ +import torch +import torch.nn as nn +from transformers import CLIPImageProcessor +from llava.utils import rank0_print + +try: + import open_clip + import torchvision + from open_clip.transformer import _expand_token +except ImportError: + print("OpenCLIP not installed") + open_clip = None + +HIDDEN_SIZE_DICT = { + "ViT-H-14-378-quickgelu": 1280, +} + + +class OpenCLIPVisionTower(nn.Module): + def __init__(self, vision_tower, args, delay_load=False): + super().__init__() + + self.is_loaded = False + self.model_name = vision_tower.replace("open_clip_hub:", "") + self.pretrained = args.vision_tower_pretrained + self.select_layer = args.mm_vision_select_layer + self.select_feature = getattr(args, "mm_vision_select_feature", "patch") + + if not delay_load: + rank0_print(f"Loading vision tower: {vision_tower}") + self.load_model() + elif getattr(args, "unfreeze_mm_vision_tower", False): + # TODO: better detector is needed. 
+ rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") + self.load_model() + elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: + rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") + self.load_model() + + def load_model(self, device_map="auto"): + rank0_print(f"Loading OpenCLIP model: {self.model_name}") + rank0_print(f"Pretrained: {self.pretrained}") + vision_tower, _, image_processor = open_clip.create_model_and_transforms(model_name=self.model_name, pretrained=self.pretrained, precision="fp32", device="cuda") + + resize_transform = [t for t in image_processor.transforms if isinstance(t, torchvision.transforms.Resize)][0] + normalize_transform = [t for t in image_processor.transforms if isinstance(t, torchvision.transforms.Normalize)][0] + self.resize_transform_size = resize_transform.size # 224 or 384 + self.patch_size = vision_tower.visual.conv1.kernel_size[0] # 14 or 16 + + self.image_processor = CLIPImageProcessor.from_pretrained( + "openai/clip-vit-large-patch14", + crop_size=resize_transform.size, + size={"shortest_edge": resize_transform.size}, + image_mean=list(normalize_transform.mean), + image_std=list(normalize_transform.std), + ) + rank0_print(f"Loaded image processor: {self.image_processor}") + self.vision_tower = vision_tower.visual + self.vision_tower.requires_grad_(False) + + self.is_loaded = True + + def feature_select(self, image_forward_outs): + image_features = image_forward_outs[self.select_layer] + if self.select_feature == "patch": + image_features = image_features[:, 1:] + elif self.select_feature == "cls_patch": + image_features = image_features + elif self.select_feature == "conv_flatten": + image_features = image_features.flatten(2).transpose(1, 2) + else: + raise ValueError(f"Unexpected select feature: {self.select_feature}") + return image_features + + def forward_visual(self, x, 
output_hidden_states=False): + if hasattr(self.vision_tower, "trunk") and hasattr(self.vision_tower.trunk, "_intermediate_layers"): + return self.vision_tower.trunk._intermediate_layers(x, abs(self.select_layer)) + else: + + def forward_openclip(self, x: torch.Tensor): + features = [] + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + + # class embeddings and positional embeddings + x = torch.cat( + [_expand_token(self.class_embedding, x.shape[0]).to(x.dtype), x], + dim=1, + ) + # shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + + x = self.patch_dropout(x) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + for r in self.transformer.resblocks: + x = r(x, attn_mask=None) + features.append(x) + return features + + return forward_openclip(self.vision_tower, x) + + def forward(self, images): + if type(images) is list: + image_features = [] + for image in images: + image_forward_out = self.forward_visual(image.to(self.dtype).unsqueeze(0), output_hidden_states=True) + image_feature = self.feature_select(image_forward_out).to(image.dtype) + image_features.append(image_feature) + else: + image_forward_outs = self.forward_visual(images.to(self.dtype), output_hidden_states=True) + image_features = self.feature_select(image_forward_outs).to(images.dtype) + + return image_features + + @property + def dummy_feature(self): + return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) + + @property + def dtype(self): + if hasattr(self.vision_tower, "conv1"): + return self.vision_tower.conv1.weight.dtype + if hasattr(self.vision_tower, "trunk"): + return self.vision_tower.trunk.patch_embed.proj.weight.dtype + raise NotImplementedError + + @property + def device(self): + if hasattr(self.vision_tower, "conv1"): + return self.vision_tower.conv1.weight.device + if 
hasattr(self.vision_tower, "trunk"): + return self.vision_tower.trunk.patch_embed.proj.weight.device + raise NotImplementedError + + @property + def config(self): + return None + + @property + def hidden_size(self): + if self.model_name in HIDDEN_SIZE_DICT: + return HIDDEN_SIZE_DICT[self.model_name] + else: + raise NotImplementedError + + @property + def num_patches(self): + image_size = self.resize_transform_size if isinstance(self.resize_transform_size, int) else self.resize_transform_size[0] + _num_patches = (image_size // self.patch_size) ** 2 + if "cls_patch" in self.select_feature: + _num_patches += 1 + return _num_patches + + @property + def image_size(self): + return self.resize_transform_size + + @property + def num_patches_per_side(self): + return self.resize_transform_size // self.patch_size diff --git a/train/llava/model/multimodal_encoder/siglip_encoder.py b/train/llava/model/multimodal_encoder/siglip_encoder.py new file mode 100644 index 0000000..f1e101a --- /dev/null +++ b/train/llava/model/multimodal_encoder/siglip_encoder.py @@ -0,0 +1,620 @@ +""" +# Adapted from https://huggingface.co/MILVLG/imp-v1-3b/blob/main/vision_encoder.py +""" + +from typing import Optional, Tuple, Union, Dict +from dataclasses import dataclass +from functools import partial, reduce +from PIL import Image +import torch +import torch.utils.checkpoint +from torch import nn +import os +from transformers.image_processing_utils import BatchFeature, get_size_dict +from transformers.image_transforms import ( + convert_to_rgb, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from transformers.image_utils import ( + ChannelDimension, + PILImageResampling, + to_numpy_array, +) +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from transformers.modeling_utils import PreTrainedModel +from transformers import PretrainedConfig +from transformers.utils import ModelOutput +from 
llava.utils import rank0_print + + +class SigLipImageProcessor: + def __init__(self, image_mean=(0.5, 0.5, 0.5), image_std=(0.5, 0.5, 0.5), size=(384, 384), crop_size: Dict[str, int] = None, resample=PILImageResampling.BICUBIC, rescale_factor=1 / 255, data_format=ChannelDimension.FIRST): + crop_size = crop_size if crop_size is not None else {"height": 384, "width": 384} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.image_mean = image_mean + self.image_std = image_std + self.size = size + self.resample = resample + self.rescale_factor = rescale_factor + self.data_format = data_format + self.crop_size = crop_size + + def preprocess(self, images, return_tensors): + if isinstance(images, Image.Image): + images = [images] + else: + # to adapt video data + images = [to_numpy_array(image) for image in images] + assert isinstance(images, list) + + transforms = [ + convert_to_rgb, + to_numpy_array, + partial(resize, size=self.size, resample=self.resample, data_format=self.data_format), + partial(rescale, scale=self.rescale_factor, data_format=self.data_format), + partial(normalize, mean=self.image_mean, std=self.image_std, data_format=self.data_format), + partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format), + ] + + images = reduce(lambda x, f: [*map(f, x)], transforms, images) + data = {"pixel_values": images} + + return BatchFeature(data=data, tensor_type=return_tensors) + + +class SigLipVisionConfig(PretrainedConfig): + model_type = "siglip_vision_model" + + def __init__( + self, + hidden_size=1152, + image_mean=(0.5, 0.5, 0.5), + intermediate_size=4304, + num_hidden_layers=27, + num_attention_heads=16, + num_channels=3, + image_size=384, + patch_size=14, + hidden_act="gelu_pytorch_tanh", + layer_norm_eps=1e-6, + attention_dropout=0.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + 
self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.image_mean = image_mean + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from SigLipConfig + if config_dict.get("model_type") == "siglip": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + print(f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors.") + + return cls.from_dict(config_dict, **kwargs) + + +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->SigLip +class SigLipVisionModelOutput(ModelOutput): + """ + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. + + Args: + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class SigLipVisionEmbeddings(nn.Module): + def __init__(self, config: SigLipVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + embeddings = 
patch_embeds.flatten(2).transpose(1, 2) + + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class SigLipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__ + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError(f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" f" {self.num_heads}).") + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + k_v_seq_len = key_states.shape[-2] + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale + + if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len): 
+ raise ValueError(f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" f" {attn_weights.size()}") + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): + raise ValueError(f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}") + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim): + raise ValueError(f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is" f" {attn_output.size()}") + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->SigLip +class SigLipMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->SigLip +class SigLipEncoderLayer(nn.Module): + def __init__(self, config: SigLipVisionConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = 
SigLipAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = SigLipMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + # Ignore copy + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(batch, seq_len, embed_dim)`. + attention_mask (`torch.FloatTensor`): + Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class SigLipPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = SigLipVisionConfig + base_model_prefix = "siglip" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + pass + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->SigLip +class SigLipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`SigLipEncoderLayer`]. + + Args: + config: SigLipVisionConfig + """ + + def __init__(self, config: SigLipVisionConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([SigLipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + # Ignore copy + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. 
See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions) + + +class SigLipVisionTransformer(nn.Module): + def __init__(self, config: SigLipVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = SigLipVisionEmbeddings(config) + self.encoder = SigLipEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.head = SigLipMultiheadAttentionPoolingHead(config) + + def forward( + self, + pixel_values, + output_attentions: Optional[bool] = None, + 
output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = self.head(last_hidden_state) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class SigLipMultiheadAttentionPoolingHead(nn.Module): + """Multihead Attention Pooling.""" + + def __init__(self, config: SigLipVisionConfig): + super().__init__() + + self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True) + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.mlp = SigLipMLP(config) + + def forward(self, hidden_state): + batch_size = hidden_state.shape[0] + probe = self.probe.repeat(batch_size, 1, 1) + + hidden_state = self.attention(probe, hidden_state, hidden_state)[0] + + residual = hidden_state + hidden_state = self.layernorm(hidden_state) + hidden_state = residual + self.mlp(hidden_state) + + return hidden_state[:, 0] + + +class 
SigLipVisionModel(SigLipPreTrainedModel): + config_class = SigLipVisionConfig + main_input_name = "pixel_values" + _no_split_modules = ["SigLipEncoderLayer"] + + def __init__(self, config: SigLipVisionConfig): + super().__init__(config) + + self.vision_model = SigLipVisionTransformer(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, SigLipVisionModel + + >>> model = SigLipVisionModel.from_pretrained("google/siglip-base-patch16-224") + >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled features + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class SigLipVisionTower(nn.Module): + def __init__(self, vision_tower, vision_tower_cfg, delay_load=False): + super().__init__() + + self.is_loaded = False + + self.config = SigLipVisionConfig() + + self.vision_tower_name = vision_tower + + self.image_processor = SigLipImageProcessor() + + if not delay_load: + rank0_print(f"Loading vision tower: {vision_tower}") + 
self.load_model()
+        elif getattr(vision_tower_cfg, "unfreeze_mm_vision_tower", False):
+            # TODO: better detector is needed.
+            rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.")
+            self.load_model()
+        elif hasattr(vision_tower_cfg, "mm_tunable_parts") and "mm_vision_tower" in vision_tower_cfg.mm_tunable_parts:
+            rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.")
+            self.load_model()
+        else:
+            self.cfg_only = self.config
+
+    def load_model(self, device_map=None):
+        if self.is_loaded:
+            rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name))
+            return
+
+        self.vision_tower = SigLipVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
+
+        del self.vision_tower.vision_model.encoder.layers[-1:]
+        self.vision_tower.vision_model.head = nn.Identity()
+        self.vision_tower.requires_grad_(False)
+
+        self.is_loaded = True
+
+    def forward(self, images):
+        if type(images) is list:
+            image_features = []
+            for image in images:
+                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+                image_feature = image_forward_out.hidden_states[-1].to(image.dtype)
+                assert image_feature.shape[-2] == 729
+                image_features.append(image_feature)
+        else:
+            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+            image_features = image_forward_outs.hidden_states[-1].to(images.dtype)
+            assert image_features.shape[-2] == 729
+
+        return image_features
+
+    @property
+    def dummy_feature(self):
+        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+    @property
+    def dtype(self):
+        for p in self.vision_tower.parameters():
+            return p.dtype
+
+    @property
+    def device(self):
+        for p in self.vision_tower.parameters():
+            return p.device
+
+    @property
+    def 
hidden_size(self): + return self.config.hidden_size + + @property + def num_patches(self): + return (self.config.image_size // self.config.patch_size) ** 2 + + @property + def num_patches_per_side(self): + return self.config.image_size // self.config.patch_size + # return self.model_config["vision_cfg"]["image_size"] // self.model_config["vision_cfg"]["patch_size"] + + @property + def image_size(self): + return self.config.image_size diff --git a/train/llava/model/multimodal_projector/builder.py b/train/llava/model/multimodal_projector/builder.py new file mode 100644 index 0000000..3122a0c --- /dev/null +++ b/train/llava/model/multimodal_projector/builder.py @@ -0,0 +1,65 @@ +import torch +import torch.nn as nn +import re + +from .pooler_projector import PoolerProjector + + +class IdentityMap(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, *args, **kwargs): + return x + + @property + def config(self): + return {"mm_projector_type": "identity"} + + +class SimpleResBlock(nn.Module): + def __init__(self, channels): + super().__init__() + self.pre_norm = nn.LayerNorm(channels) + + self.proj = nn.Sequential(nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels)) + + def forward(self, x): + x = self.pre_norm(x) + return x + self.proj(x) + + +def build_vision_projector(config, delay_load=False, **kwargs): + projector_type = getattr(config, "mm_projector_type", "linear") + + if projector_type == "linear": + return nn.Linear(config.mm_hidden_size, config.hidden_size) + + if projector_type == "pooler": + return PoolerProjector(config, kwargs["vision_cfg"]) + + mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type) + if mlp_gelu_match: + mlp_depth = int(mlp_gelu_match.group(1)) + modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] + for _ in range(1, mlp_depth): + modules.append(nn.GELU()) + modules.append(nn.Linear(config.hidden_size, config.hidden_size)) + return nn.Sequential(*modules) + + 
mlp_gelu_resnet_match = re.match(r"^mlp(\d+)x_res(\d+)x_gelu$", projector_type) + if mlp_gelu_resnet_match: + mlp_depth = int(mlp_gelu_resnet_match.group(1)) + res_depth = int(mlp_gelu_resnet_match.group(2)) + modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] + for _ in range(1, mlp_depth): + modules.append(nn.GELU()) + modules.append(nn.Linear(config.hidden_size, config.hidden_size)) + for _ in range(res_depth): + modules.append(SimpleResBlock(config.hidden_size)) + return nn.Sequential(*modules) + + if projector_type == "identity": + return IdentityMap() + + raise ValueError(f"Unknown projector type: {projector_type}") diff --git a/train/llava/model/multimodal_projector/pooler_projector.py b/train/llava/model/multimodal_projector/pooler_projector.py new file mode 100644 index 0000000..ce5a2e0 --- /dev/null +++ b/train/llava/model/multimodal_projector/pooler_projector.py @@ -0,0 +1,33 @@ +import torch +import torch.nn as nn + +import math + +from transformers.models.clip.modeling_clip import CLIPVisionModel + + +class PoolerProjector(nn.Module): + def __init__(self, config, vision_cfg): + super().__init__() + self._config = config + self.hw = vision_cfg.image_size // vision_cfg.patch_size + + self.conv_pool = nn.Conv2d(config.mm_hidden_size, config.hidden_size, kernel_size=2, stride=2) + + self.proj = nn.Sequential( + nn.GELU(), + nn.Linear(config.hidden_size, config.hidden_size), + ) + + def forward(self, x, *args, **kwargs): + height = width = self.hw + assert height * width == x.shape[1] + x = x.view(x.shape[0], height, width, -1).permute(0, 3, 1, 2) + x = self.conv_pool(x) + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + return x + + @property + def config(self): + return {"mm_projector_type": "pooler"} diff --git a/train/llava/model/multimodal_resampler/builder.py b/train/llava/model/multimodal_resampler/builder.py new file mode 100644 index 0000000..7a4b207 --- /dev/null +++ b/train/llava/model/multimodal_resampler/builder.py @@ -0,0 
+1,34 @@ +import torch + +from .masked_drop import MaskedDrop +from .spatial_pool import SpatialPool +from .perceiver import PerceiverResampler +from .qformer import Qformer + + +class IdentityMap(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, *args, **kwargs): + return x + + @property + def config(self): + return {"mm_resampler_type": None} + + +def build_vision_resampler(model_args, delay_load=False, **kwargs): + resampler_type = getattr(model_args, "mm_resampler_type", None) + if resampler_type == "masked_drop": + return MaskedDrop(model_args) + elif resampler_type == "spatial_pool": + return SpatialPool(model_args, **kwargs) + elif resampler_type == "perceiver": + return PerceiverResampler(model_args, **kwargs) + elif resampler_type == "qformer": + return Qformer(model_args, **kwargs) + elif resampler_type is None: + return IdentityMap() + + raise ValueError(f"Unknown resampler type: {resampler_type}") diff --git a/train/llava/model/multimodal_resampler/masked_drop.py b/train/llava/model/multimodal_resampler/masked_drop.py new file mode 100644 index 0000000..03f0bf0 --- /dev/null +++ b/train/llava/model/multimodal_resampler/masked_drop.py @@ -0,0 +1,80 @@ +import torch +import torch.nn as nn + +import random + + +class MaskedDrop(nn.Module): + def __init__(self, model_args): + super().__init__() + + self.mode = model_args.mm_mask_drop_mode + self.skip_percentage = model_args.mm_mask_drop_skip_percentage + self.ratio = model_args.mm_mask_drop_ratio + self.ratio_upper = model_args.mm_mask_drop_ratio_upper + self.ratio_lower = model_args.mm_mask_drop_ratio_lower + + def forward(self, image_features, *args, **kwargs): + + if not self.training: + return image_features + + if self.skip_percentage > random.random(): + return image_features + + masked_features = [] + + for image_feature in image_features: + num_tokens = image_feature.shape[0] + if self.mode == "fixed": + num_keep = int(num_tokens * self.ratio) + 
masked_features.append(self.random_masking(image_feature.unsqueeze(0), num_keep)[0][0]) + elif self.mode == "range": + num_keep = int(num_tokens * random.uniform(self.ratio_lower, self.ratio_upper)) + masked_features.append(self.random_masking(image_feature.unsqueeze(0), num_keep)[0]) + elif self.mode == "cls_only": + masked_features.append(image_feature[0:1]) + else: + raise ValueError(f"Unexpected masked drop mode: {self.mode}") + + if self.mode not in ["range"] and (type(image_features) is not list or self.mode in ["cls_only"]): + masked_features = torch.stack(masked_features, dim=0) + + return masked_features + + @property + def config(self): + return { + "mm_resampler_type": "masked_drop", + "mm_mask_drop_mode": self.mode, + "mm_mask_drop_skip_percentage": self.skip_percentage, + "mm_mask_drop_ratio": self.ratio, + "mm_mask_drop_ratio_upper": self.ratio_upper, + "mm_mask_drop_ratio_lower": self.ratio_lower, + } + + def random_masking(self, x, len_keep): + """ + Perform per-sample random masking by per-sample shuffling. + Per-sample shuffling is done by argsort random noise. 
+        x: [N, L, D], sequence
+        """
+        N, L, D = x.shape  # batch, length, dim
+
+        noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]
+
+        # sort noise for each sample
+        ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
+        ids_restore = torch.argsort(ids_shuffle, dim=1)
+
+        # keep the first subset
+        ids_keep = ids_shuffle[:, :len_keep]
+        x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))
+
+        # generate the binary mask: 0 is keep, 1 is remove
+        mask = torch.ones([N, L], device=x.device)
+        mask[:, :len_keep] = 0
+        # unshuffle to get the binary mask
+        mask = torch.gather(mask, dim=1, index=ids_restore)
+
+        return x_masked, mask, ids_restore
diff --git a/train/llava/model/multimodal_resampler/perceiver.py b/train/llava/model/multimodal_resampler/perceiver.py
new file mode 100644
index 0000000..d6b17a5
--- /dev/null
+++ b/train/llava/model/multimodal_resampler/perceiver.py
@@ -0,0 +1,155 @@
+"""
+Taken from https://github.com/lucidrains/flamingo-pytorch
+"""
+
+import torch
+from einops import rearrange, repeat
+
+try:
+    from einops_exts import rearrange_many
+except ImportError:
+    pass
+
+from torch import einsum, nn
+
+
+def exists(val):
+    return val is not None
+
+
+def FeedForward(dim, mult=4):
+    inner_dim = int(dim * mult)
+    return nn.Sequential(
+        nn.LayerNorm(dim),
+        nn.Linear(dim, inner_dim, bias=False),
+        nn.GELU(),
+        nn.Linear(inner_dim, dim, bias=False),
+    )
+
+
+class PerceiverAttention(nn.Module):
+    def __init__(self, *, dim, dim_head=64, heads=8):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        self.norm_media = nn.LayerNorm(dim)
+        self.norm_latents = nn.LayerNorm(dim)
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+    def forward(self, x, latents):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, T, n1, D)
+            latent 
(torch.Tensor): latent features + shape (b, T, n2, D) + """ + x = self.norm_media(x) + latents = self.norm_latents(latents) + + h = self.heads + + q = self.to_q(latents) + kv_input = torch.cat((x, latents), dim=-2) + k, v = self.to_kv(kv_input).chunk(2, dim=-1) + q, k, v = rearrange_many((q, k, v), "b t n (h d) -> b h t n d", h=h) + q = q * self.scale + + # attention + sim = einsum("... i d, ... j d -> ... i j", q, k) + sim = sim - sim.amax(dim=-1, keepdim=True).detach() + attn = sim.softmax(dim=-1) + + out = einsum("... i j, ... j d -> ... i d", attn, v) + out = rearrange(out, "b h t n d -> b t n (h d)", h=h) + return self.to_out(out) + + +class PerceiverResamplerModule(nn.Module): + def __init__( + self, + *, + dim, + depth=6, + dim_head=64, + heads=8, + num_latents=64, + max_num_media=None, + max_num_frames=None, + ff_mult=4, + ): + super().__init__() + self.latents = nn.Parameter(torch.randn(num_latents, dim)) + self.frame_embs = nn.Parameter(torch.randn(max_num_frames, dim)) if exists(max_num_frames) else None + self.media_time_embs = nn.Parameter(torch.randn(max_num_media, 1, dim)) if exists(max_num_media) else None + + self.layers = nn.ModuleList([]) + for _ in range(depth): + self.layers.append( + nn.ModuleList( + [ + PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), + FeedForward(dim=dim, mult=ff_mult) if ff_mult > 0 else nn.Identity(), + ] + ) + ) + + self.norm = nn.LayerNorm(dim) + + def forward(self, x): + """ + Args: + x (torch.Tensor): image features + shape (b, T, F, v, D) + Returns: + shape (b, T, n, D) where n is self.num_latents + """ + b, T, F, v = x.shape[:4] + + # frame and media time embeddings + if exists(self.frame_embs): + frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v) + x = x + frame_embs + x = rearrange(x, "b T F v d -> b T (F v) d") # flatten the frame and spatial dimensions + if exists(self.media_time_embs): + x = x + self.media_time_embs[:T] + + # blocks + latents = repeat(self.latents, "n d -> b 
T n d", b=b, T=T) + for attn, ff in self.layers: + latents = attn(x, latents) + latents + latents = ff(latents) + latents + return self.norm(latents) + + +class PerceiverResampler(nn.Module): + def __init__(self, model_args, vision_tower): + super().__init__() + + self.depth = model_args.mm_perceiver_depth + self.num_latents = model_args.mm_perceiver_latents + self.ff_mult = model_args.mm_perceiver_ff_mult + self.pretrained = model_args.mm_perceiver_pretrained + + self.perceiver = PerceiverResamplerModule(dim=vision_tower.hidden_size, depth=self.depth, num_latents=self.num_latents, ff_mult=self.ff_mult) + + if self.pretrained is not None: + self.load_state_dict(torch.load(self.pretrained)) + + def forward(self, image_features, *args, **kwargs): + return self.perceiver(image_features[:, None, None]).squeeze(1) + + @property + def config(self): + return { + "mm_resampler_type": "perceiver", + "mm_perceiver_depth": self.depth, + "mm_perceiver_latents": self.num_latents, + "mm_perceiver_ff_mult": self.ff_mult, + "mm_perceiver_pretrained": self.pretrained, + } diff --git a/train/llava/model/multimodal_resampler/qformer.py b/train/llava/model/multimodal_resampler/qformer.py new file mode 100644 index 0000000..b86754c --- /dev/null +++ b/train/llava/model/multimodal_resampler/qformer.py @@ -0,0 +1,1160 @@ +""" + * Copyright (c) 2023, salesforce.com, inc. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause + * By Junnan Li + * Based on huggingface code base + * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert +""" + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple, Dict, Any + +import torch +from torch import Tensor, device, dtype, nn +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss +import torch.nn.functional as F + +from transformers.activations import ACT2FN +from transformers.file_utils import ( + ModelOutput, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from transformers.utils import logging +from transformers.models.bert.configuration_bert import BertConfig + +logger = logging.get_logger(__name__) + + +def disabled_train(self, mode=True): + """Overwrite model.train with this function to make sure train/eval mode + does not change anymore.""" + return self + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any 
TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def forward( + self, + input_ids=None, + position_ids=None, + query_embeds=None, + past_key_values_length=0, + ): + if input_ids is not None: + seq_length = input_ids.size()[1] + else: + seq_length = 0 + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone() + + if input_ids is not None: + embeddings = self.word_embeddings(input_ids) + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + + if query_embeds is not None: + embeddings = torch.cat((query_embeds, embeddings), dim=1) + else: + embeddings = query_embeds + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config, is_cross_attention): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError("The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = 
nn.Linear(config.encoder_width, self.all_head_size) + self.value = nn.Linear(config.encoder_width, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads, + self.attention_head_size, + ) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, 
positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + 
return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.self = BertSelfAttention(config, is_cross_attention) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, + self.self.num_attention_heads, + self.self.attention_head_size, + self.pruned_heads, + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = 
self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config, layer_num): + super().__init__() + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.layer_num = layer_num + if self.config.add_cross_attention and layer_num % self.config.cross_attention_freq == 0: + self.crossattention = BertAttention(config, is_cross_attention=self.config.add_cross_attention) + self.has_cross_attention = True + else: + self.has_cross_attention = False + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + self.intermediate_query = BertIntermediate(config) + self.output_query = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = 
self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers" + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BertLayer(config, i) for i in range(config.num_hidden_layers)]) + + def forward( + self, + hidden_states, + 
attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + + if use_cache: + logger.warn("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...") + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions, query_length) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + 
return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BertConfig + base_model_prefix = "bert" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class BertModel(BertPreTrainedModel): + """ + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. 
+ argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=False): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: Tensor, + input_shape: Tuple[int], + device: device, + is_decoder: bool, + has_query: bool = False, + ) -> Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. + + Returns: + :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = torch.arange(seq_length, device=device) + causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] + + # add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + if has_query: # UniLM style attention mask + causal_mask = torch.cat( + [ + torch.zeros( + (batch_size, prefix_seq_len, seq_length), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=1, + ) + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, causal_mask.shape[1], prefix_seq_len), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError("Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(input_shape, attention_mask.shape)) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
+ extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. 
+ use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # use_cache = use_cache if use_cache is not None else self.config.use_cache + + if input_ids is None: + assert query_embeds is not None, "You have to specify query_embeds when input_ids is None" + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + query_embeds=query_embeds, + past_key_values_length=past_key_values_length, + ) + + input_shape = embedding_output.size()[:-1] + batch_size, seq_length = input_shape + device = embedding_output.device + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ if is_decoder: + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, + input_ids.shape, + device, + is_decoder, + has_query=(query_embeds is not None), + ) + else: + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device, is_decoder) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + 
output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=True, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=True, + reduction="mean", + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. 
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ Returns: + Example:: + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + if past_key_values is not None: + query_embeds = None + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + sequence_output = outputs[0] + if query_embeds is not None: + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] + + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1), + ) + if reduction == "none": + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else 
output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs): + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + query_mask = input_ids.new_ones(query_embeds.shape[:-1]) + attention_mask = torch.cat([query_mask, attention_mask], dim=-1) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + "input_ids": input_ids, + "query_embeds": query_embeds, + "attention_mask": attention_mask, + "past_key_values": past, + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), + "is_decoder": True, + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +class BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + 
encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=False, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + if query_embeds is not None: + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class Qformer(nn.Module): + def __init__(self, model_args, vision_tower): + super().__init__() + + self.depth = model_args.mm_qformer_depth + self.num_latents = model_args.mm_qformer_latents + self.pretrained = 
model_args.mm_qformer_pretrained + + self.Qformer, self.query_tokens, self.ln_vision = self.build_Qformer(vision_tower.hidden_size, self.depth, self.num_latents) + + if self.pretrained is not None: + pretrained_dict = torch.load(self.pretrained, map_location="cpu")["model"] + pretrained_dict = {k: v for k, v in pretrained_dict.items() if not k.startswith("t5_proj")} + self.load_state_dict(pretrained_dict) + + def build_Qformer(self, vision_width, cross_attention_freq, num_query_token): + encoder_config = BertConfig.from_pretrained("bert-base-uncased") + encoder_config.encoder_width = vision_width + # insert cross-attention layer every other block + encoder_config.add_cross_attention = True + encoder_config.cross_attention_freq = cross_attention_freq + encoder_config.query_length = num_query_token + Qformer = BertLMHeadModel(config=encoder_config) + query_tokens = nn.Parameter(torch.zeros(1, num_query_token, encoder_config.hidden_size)) + query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range) + Qformer.cls = None + Qformer.bert.embeddings.word_embeddings = None + Qformer.bert.embeddings.position_embeddings = None + for layer in Qformer.bert.encoder.layer: + layer.output = None + layer.intermediate = None + return Qformer, query_tokens, nn.LayerNorm(vision_width) + + def forward(self, image_features, *args, **kwargs): + x = self.ln_vision(image_features) + image_atts = torch.ones(x.size()[:-1], dtype=torch.long).to(x.device) + + query_tokens = self.query_tokens.expand(x.shape[0], -1, -1) + query_output = self.Qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=x, + encoder_attention_mask=image_atts, + return_dict=True, + ) + + return query_output.last_hidden_state + + @property + def hidden_size(self): + return 768 + + @property + def config(self): + return { + "mm_resampler_type": "qformer", + "mm_qformer_depth": self.depth, + "mm_qformer_latents": self.num_latents, + "mm_qformer_pretrained": self.pretrained, + } diff --git 
a/train/llava/model/multimodal_resampler/spatial_pool.py b/train/llava/model/multimodal_resampler/spatial_pool.py new file mode 100644 index 0000000..4bdbe3a --- /dev/null +++ b/train/llava/model/multimodal_resampler/spatial_pool.py @@ -0,0 +1,45 @@ +import torch +import torch.nn as nn +import math + + +class SpatialPool(nn.Module): + def __init__(self, model_args, vision_tower): + super().__init__() + + self.mode = model_args.mm_spatial_pool_mode + self.stride = model_args.mm_spatial_pool_stride + self.out_channels = getattr(model_args, "mm_spatial_pool_out_channels", vision_tower.hidden_size) + + if self.mode == "average": + self.pool = nn.AvgPool2d(kernel_size=self.stride, stride=self.stride) + elif self.mode == "max": + self.pool = nn.MaxPool2d(kernel_size=self.stride, stride=self.stride) + elif self.mode == "conv": + self.pool = nn.Conv2d(in_channels=vision_tower.hidden_size, out_channels=self.out_channels, kernel_size=self.stride, stride=self.stride) + else: + raise ValueError(f"Unknown pooling mode: {self.pool}.") + + def forward(self, image_features, images, *args, **kwargs): + ori_W = int(math.sqrt(image_features.shape[1] * images.shape[3] // images.shape[2])) + ori_H = int(ori_W * images.shape[2] // images.shape[3]) + + B, _, F = image_features.shape + + image_features_spatial = image_features.view(B, ori_H, ori_H, F).permute(0, 3, 1, 2) + image_features_spatial_pool = self.pool(image_features_spatial) + + return image_features_spatial_pool.flatten(2).transpose(1, 2).contiguous() + + @property + def config(self): + return { + "mm_resampler_type": "spatial_pool", + "mm_spatial_pool_stride": self.stride, + "mm_spatial_pool_mode": self.mode, + "mm_spatial_pool_out_channels": self.out_channels, + } + + @property + def hidden_size(self): + return self.out_channels diff --git a/train/llava/model/simple_heading_mlp.py b/train/llava/model/simple_heading_mlp.py new file mode 100644 index 0000000..d977db6 --- /dev/null +++ b/train/llava/model/simple_heading_mlp.py 
@@ -0,0 +1,37 @@ +import torch +import torch.nn as nn + + +class TrajectoryHeadingSimpleMLP(nn.Module): + def __init__(self, hidden_layer_dim=512, num_poses=8): + super().__init__() + self.num_poses = num_poses + self.mlp = nn.Sequential( + nn.Linear(8 * 2, hidden_layer_dim), + nn.ReLU(), + nn.Linear(hidden_layer_dim, hidden_layer_dim), + nn.ReLU(), + nn.Linear(hidden_layer_dim, hidden_layer_dim), + nn.ReLU(), + nn.Linear(hidden_layer_dim, num_poses * 3), + ) + + def forward(self, traj_xy): + x = traj_xy.reshape(traj_xy.shape[0], -1) + out = self.mlp(x) + out = out.view(-1, self.num_poses, 3) + heading_pred = torch.tanh(out[:, :, 2:]) * 3.14159 + return heading_pred + + +def load_heading_model( + ckpt_path: str, device: str = "cpu", hidden_layer_dim=512, num_poses=8 +) -> nn.Module: + model = TrajectoryHeadingSimpleMLP( + hidden_layer_dim=hidden_layer_dim, num_poses=num_poses + ) + state_dict = torch.load(ckpt_path, map_location=device) + model.load_state_dict(state_dict) + model.to(device) + model.eval() + return model diff --git a/train/llava/model/utils.py b/train/llava/model/utils.py new file mode 100644 index 0000000..10652a5 --- /dev/null +++ b/train/llava/model/utils.py @@ -0,0 +1,20 @@ +from transformers import AutoConfig + + +def auto_upgrade(config): + cfg = AutoConfig.from_pretrained(config) + if "llava" in config and "llava" not in cfg.model_type: + assert cfg.model_type == "llama" + print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") + print("You must upgrade the checkpoint to the new code base (this can be done automatically).") + confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") + if confirm.lower() in ["y", "yes"]: + print("Upgrading checkpoint...") + assert len(cfg.architectures) == 1 + setattr(cfg.__class__, "model_type", "llava") + cfg.architectures[0] = "LlavaLlamaForCausalLM" + cfg.save_pretrained(config) + print("Checkpoint upgraded.") + else: + print("Checkpoint upgrade aborted.") + exit(1) diff --git a/train/llava/s3_ops.py b/train/llava/s3_ops.py new file mode 100644 index 0000000..8569aed --- /dev/null +++ b/train/llava/s3_ops.py @@ -0,0 +1,217 @@ +# LINT_ME +"""Filesystem operations supporting local and remote (obs://, s3://, gs://) paths.""" +from __future__ import annotations + +import io +import json +import posixpath +from pathlib import Path +from typing import IO, List, Union +from urllib.parse import urlsplit, urlunsplit + +import pyarrow as pa +import pyarrow.parquet as pq +from PIL import Image + +try: + import moxing as mox + _HAS_MOX = True +except ImportError: + mox = None # type: ignore + _HAS_MOX = False + +PathLike = Union[str, Path] + +# ---------- helpers ---------- + + +def _is_remote(p: PathLike) -> bool: + """ + Check if path is a remote storage URL (obs://, s3://, gs://). + """ + s = str(p) + if s.startswith(("obs://", "s3://", "gs://")): + return True + return False + + +def join(base: PathLike, *parts: PathLike) -> str: + """ + Join path components for local or remote paths. + """ + b = str(base) + if _is_remote(b): + scheme, netloc, path, query, frag = urlsplit(b) + segs = [path] + [str(p) for p in parts] + new_path = posixpath.join(*[s.lstrip("/") for s in segs if s]) + if not new_path.startswith("/"): + new_path = "/" + new_path + return urlunsplit((scheme, netloc, new_path, query, frag)) + return str(Path(b).joinpath(*map(str, parts))) + +def exists(p: PathLike) -> bool: + """ + Check if path exists (local or remote). 
+ """ + if _is_remote(p): + if not _HAS_MOX: + raise RuntimeError("moxing not installed.") + return bool(mox.file.exists(str(p))) + return Path(p).exists() + + +def isdir(p: PathLike) -> bool: + """ + Check if path is a directory (local or remote). + """ + if _is_remote(p): + if not _HAS_MOX: + raise RuntimeError("moxing not installed.") + return bool(mox.file.is_directory(str(p))) + return Path(p).is_dir() + + +def listdir(p: PathLike) -> List[str]: + """ + List directory contents for local or obs:// paths. + Remote returns full paths, local returns names (like os.listdir). + """ + if _is_remote(p): + if not _HAS_MOX: + raise RuntimeError("moxing not installed.") + return mox.file.list_directory(str(p)) + return [x.name for x in Path(p).iterdir()] + + +def basename(p: PathLike) -> str: + """ + Return the final component of a path, works for local and obs:// URLs. + """ + s = str(p) + if _is_remote(s): + # Strip trailing slash if present, then take the last segment + return s.rstrip("/").rsplit("/",maxsplit=1)[-1] + return Path(s).name + + +def split(p: PathLike) -> tuple[str, str]: + """ + Split path into (head, tail). + For remote (obs://...), head keeps scheme+netloc+parent, tail is the last component. + For local, same as os.path.split. + """ + s = str(p) + if _is_remote(s): + scheme, netloc, path, query, frag = urlsplit(s) + parts = path.rstrip("/").rsplit("/",maxsplit=1) + tail = parts[-1] if parts else "" + head_path = "/".join(parts[:-1]) + if head_path and not head_path.startswith("/"): + head_path = "/" + head_path + head = urlunsplit((scheme, netloc, head_path, query, frag)) + return head, tail + return str(Path(s).parent), Path(s).name + + +def splitext(p: PathLike) -> tuple[str, str]: + """ + Split path into (root, ext). + Works like os.path.splitext, for both local and remote. 
+ """ + s = str(p) + if _is_remote(s): + base = basename(s) # last part only + root, ext = posixpath.splitext(base) + # Reconstruct full root path (without extension) + head, _ = split(s) + return join(head, root), ext + root, ext = Path(s).with_suffix("").as_posix(), Path(s).suffix + return root, ext + + +def open_file(p: PathLike, mode: str = "r", **kwargs) -> IO: + """ + Open a file for local or remote (obs://, s3://, gs://) paths. + """ + if _is_remote(p): + if not _HAS_MOX: + raise RuntimeError("moxing not installed.") + return mox.file.File(str(p), mode, **kwargs) # type: ignore + return open(p, mode, **kwargs) # pylint: disable=W1514 + + +def json_load(p: PathLike, **json_kwargs): + """ + Load JSON or JSONL files automatically. + Returns: + - list[dict]: for JSONL or list-based JSON + - dict: for JSON object + """ + encoding = json_kwargs.pop("encoding", "utf-8") + + with open_file(p, "r", encoding=encoding) as f: + f.seek(0) + + # JSONL: detect line-delimited JSON + if p.endswith(".jsonl"): + f.seek(0) + return [json.loads(line) for line in f if line.strip()] + f.seek(0) + return json.load(f, **json_kwargs) + + +def image_open(p: PathLike): + """ + Open an image from local or remote path. 
+ """ + + if str(p).endswith(".parquet"): + with open_file(p, "rb") as fp: + file_data = fp.read() + table = pq.read_table(pa.py_buffer(file_data)).to_pandas() + images = {} + for cam_id in table.columns: + data = table[cam_id].iloc[0] # 取第一行的 bytes 数据 + try: + img = Image.open(io.BytesIO(data)) + img.load() # 强制加载,释放文件句柄 + images[cam_id] = img + except Exception as e: + raise RuntimeError( + f"Failed to decode image from camera '{cam_id}' in {p}: {e}" + ) from e + return images["cam_1"] + + f = open_file(p, "rb") + try: + img = Image.open(f) + # Force load into memory so we can close the file immediately + img.load() + f.close() + return img + except Exception: + f.close() + raise + + +def makedirs(p: PathLike, exist_ok: bool = True) -> None: + """ + Create directories (local or remote). + - Local: same as Path(...).mkdir(parents=True, exist_ok=...) + - Remote: uses mox.file.make_dirs + """ + s = str(p) + if _is_remote(s): + if not _HAS_MOX: + raise RuntimeError("moxing not installed.") + # moxing's make_dirs creates all necessary parent dirs automatically + if exist_ok: + # No harm if it already exists + if not mox.file.exists(s): + mox.file.make_dirs(s) + else: + if mox.file.exists(s): + raise FileExistsError(f"Remote path already exists: {s}") + mox.file.make_dirs(s) + else: + Path(s).mkdir(parents=True, exist_ok=exist_ok) diff --git a/train/llava/train/llama_flash_attn_monkey_patch.py b/train/llava/train/llama_flash_attn_monkey_patch.py new file mode 100644 index 0000000..c88fe34 --- /dev/null +++ b/train/llava/train/llama_flash_attn_monkey_patch.py @@ -0,0 +1,87 @@ +from typing import Optional, Tuple +import warnings + +import torch + +import transformers +from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv + +try: + from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func +except ImportError: + from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as 
flash_attn_unpadded_qkvpacked_func +from flash_attn.bert_padding import unpad_input, pad_input + + +def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + padding_mask: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + warnings.warn("Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.") + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) # shape: (b, num_heads, s, head_dim) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # Transform the data into the format required by flash attention + qkv = torch.stack([query_states, key_states, value_states], dim=2) + qkv = qkv.transpose(1, 3) # shape: [b, s, 3, num_heads, head_dim] + key_padding_mask = 
attention_mask + + if key_padding_mask is None: + qkv = qkv.reshape(-1, 3, self.num_heads, self.head_dim) + cu_q_lens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device) + max_s = q_len + output = flash_attn_unpadded_qkvpacked_func(qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True) + output = output.view(bsz, q_len, -1) + else: + qkv = qkv.reshape(bsz, q_len, -1) + qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask) + qkv = qkv.view(-1, 3, self.num_heads, self.head_dim) + output_unpad = flash_attn_unpadded_qkvpacked_func(qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True) + output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim) + output = pad_input(output_unpad, indices, bsz, q_len) + + return self.o_proj(output), None, past_key_value + + +# Disable the transformation of the attention mask in LlamaModel as the flash attention +# requires the attention mask to be the same as the key_padding_mask +def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # [bsz, seq_len] + return attention_mask + + +def replace_llama_attn_with_flash_attn(): + cuda_major, cuda_minor = torch.cuda.get_device_capability() + if cuda_major < 8: + warnings.warn("Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward." 
"ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593") + transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask + transformers.models.llama.modeling_llama.LlamaAttention.forward = forward diff --git a/train/llava/train/llava_trainer.py b/train/llava/train/llava_trainer.py new file mode 100644 index 0000000..e989297 --- /dev/null +++ b/train/llava/train/llava_trainer.py @@ -0,0 +1,591 @@ +import os +import torch +import torch.nn as nn +import datetime +import torch.distributed as dist + +from accelerate import Accelerator +from accelerate.utils import InitProcessGroupKwargs, GradientAccumulationPlugin +from torch.utils.data import Dataset, Sampler, DataLoader + +from trl.trainer import DPOTrainer +from trl.trainer.utils import DPODataCollatorWithPadding + +from transformers import Trainer +from transformers.trainer import is_sagemaker_mp_enabled, get_parameter_names, has_length, ALL_LAYERNORM_LAYERS, GradientAccumulationPlugin, logger, is_accelerate_available, is_datasets_available +from transformers.trainer_utils import seed_worker +from transformers.trainer_pt_utils import get_length_grouped_indices as get_length_grouped_indices_hf +from transformers.trainer_pt_utils import AcceleratorConfig +from typing import List, Optional +from datetime import timedelta + +if is_accelerate_available(): + from accelerate import Accelerator, skip_first_batches, InitProcessGroupKwargs + +if is_datasets_available(): + import datasets + +from llava.utils import rank0_print + + +def maybe_zero_3(param, ignore_status=False, name=None): + from deepspeed import zero + from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus + + if hasattr(param, "ds_id"): + if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: + if not ignore_status: + print(name, "no ignore status") + with zero.GatheredParameters([param]): + param = param.data.detach().cpu().clone() + else: + param = 
param.detach().cpu().clone() + return param + + +def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match): + to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)} + to_return = {k: maybe_zero_3(v, ignore_status=True, name=k).cpu() for k, v in to_return.items()} + return to_return + + +def split_to_even_chunks(indices, lengths, num_chunks): + """ + Split a list of indices into `chunks` chunks of roughly equal lengths. + """ + + if len(indices) % num_chunks != 0: + return [indices[i::num_chunks] for i in range(num_chunks)] + + num_indices_per_chunk = len(indices) // num_chunks + + chunks = [[] for _ in range(num_chunks)] + chunks_lengths = [0 for _ in range(num_chunks)] + for index in indices: + shortest_chunk = chunks_lengths.index(min(chunks_lengths)) + chunks[shortest_chunk].append(index) + chunks_lengths[shortest_chunk] += lengths[index] + if len(chunks[shortest_chunk]) == num_indices_per_chunk: + chunks_lengths[shortest_chunk] = float("inf") + + return chunks + + +def get_variable_length_grouped_indices(lengths, batch_size, world_size, megabatch_mult=8, generator=None): + # We need to use torch for the random part as a distributed sampler will set the random seed for torch. 
+ indices = torch.randperm(len(lengths), generator=generator) + sorted_indices = sorted(range(len(lengths)), key=lambda i: lengths[i], reverse=True) + megabatch_size = world_size * batch_size * megabatch_mult + megabatches = [sorted_indices[i : i + megabatch_size] for i in range(0, len(lengths), megabatch_size)] + megabatches = [sorted(megabatch, key=lambda i: indices[i], reverse=True) for megabatch in megabatches] + shuffled_indices = [i for megabatch in megabatches for i in megabatch] + world_batch_size = world_size * batch_size + batches = [shuffled_indices[i : i + world_batch_size] for i in range(0, len(lengths), world_batch_size)] + batch_indices = torch.randperm(len(batches), generator=generator) + batches = [batches[i] for i in batch_indices] + + return [i for batch in batches for i in batch] + + +def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None): + """ + Return a list of indices so that each slice of `batch_size` consecutive indices correspond to elements of similar + lengths. To do this, the indices are: + + - randomly permuted + - grouped in mega-batches of size `mega_batch_mult * batch_size` + - reorder by length in each mega-batch + + The result is the concatenation of all mega-batches, with the batch of `batch_size` containing the element of + maximum length placed first, so that an OOM happens sooner rather than later. + """ + + # We need to use torch for the random part as a distributed sampler will set the random seed for torch. + assert all(l != 0 for l in lengths), "Should not have zero length." 
+ if all(l > 0 for l in lengths) or all(l < 0 for l in lengths): + # all samples are in the same modality + return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator) + mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0]) + lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0]) + + mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)] + lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)] + megabatch_size = world_size * batch_size + mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)] + lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)] + + last_mm = mm_megabatches[-1] + last_lang = lang_megabatches[-1] + additional_batch = last_mm + last_lang + megabatches = mm_megabatches[:-1] + lang_megabatches[:-1] + megabatch_indices = torch.randperm(len(megabatches), generator=generator) + megabatches = [megabatches[i] for i in megabatch_indices] + + if len(additional_batch) > 0: + megabatches.append(sorted(additional_batch)) + + return [i for megabatch in megabatches for i in megabatch] + + +def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True): + """ + Return a list of indices so that each slice of `batch_size` consecutive indices correspond to elements of similar + lengths. To do this, the indices are: + + - randomly permuted + - grouped in mega-batches of size `mega_batch_mult * batch_size` + - reorder by length in each mega-batch + + The result is the concatenation of all mega-batches, with the batch of `batch_size` containing the element of + maximum length placed first, so that an OOM happens sooner rather than later. 
+ """ + + # We need to use torch for the random part as a distributed sampler will set the random seed for torch. + indices = torch.randperm(len(lengths), generator=generator) + megabatch_size = world_size * batch_size + megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)] + megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches] + megabatches = [split_to_even_chunks(megabatch, lengths, world_size) for megabatch in megabatches] + + return [i for megabatch in megabatches for batch in megabatch for i in batch] + + +def get_length_grouped_indices_auto_single(lengths, batch_size, world_size, generator=None): + indices = get_length_grouped_indices_hf(lengths, batch_size * world_size, generator=generator) + + megabatch_size = world_size * batch_size + megabatches = [indices[i : i + megabatch_size] for i in range(0, len(lengths), megabatch_size)] + megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches] + megabatches = [split_to_even_chunks(megabatch, lengths, world_size) for megabatch in megabatches] + + # We need to use torch for the random part as a distributed sampler will set the random seed for torch. + batch_indices = torch.randperm(len(megabatches), generator=generator) + megabatches = [megabatches[i] for i in batch_indices] + + return [i for megabatch in megabatches for batch in megabatch for i in batch] + + +def get_modality_length_grouped_indices_auto(lengths, batch_size, world_size, generator=None): + # We need to use torch for the random part as a distributed sampler will set the random seed for torch. + assert all(l != 0 for l in lengths), "Should not have zero length." 
+ if all(l > 0 for l in lengths) or all(l < 0 for l in lengths): + # all samples are in the same modality + return get_length_grouped_indices_auto_single(lengths, batch_size, world_size, generator=generator) + mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0]) + lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0]) + + mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices_auto_single(mm_lengths, batch_size, world_size, generator=None)] + lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices_auto_single(lang_lengths, batch_size, world_size, generator=None)] + megabatch_size = world_size * batch_size + mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)] + lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)] + + last_mm = mm_megabatches[-1] + last_lang = lang_megabatches[-1] + additional_batch = last_mm + last_lang + megabatches = mm_megabatches[:-1] + lang_megabatches[:-1] + megabatch_indices = torch.randperm(len(megabatches), generator=generator) + megabatches = [megabatches[i] for i in megabatch_indices] + + # FIXME: Hard code to avoid last batch mixed with different modalities + # if len(additional_batch) > 0: + # megabatches.append(sorted(additional_batch)) + + return [i for megabatch in megabatches for i in megabatch] + + +class LengthGroupedSampler(Sampler): + r""" + Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while + keeping a bit of randomness. 
+ """ + + def __init__( + self, + batch_size: int, + world_size: int, + lengths: Optional[List[int]] = None, + generator=None, + variable_length: bool = False, + group_by_modality: bool = False, + group_by_modality_auto: bool = False, + ): + if lengths is None: + raise ValueError("Lengths must be provided.") + + self.batch_size = batch_size + self.world_size = world_size + self.lengths = lengths + self.generator = generator + self.variable_length = variable_length + self.group_by_modality = group_by_modality + self.group_by_modality_auto = group_by_modality_auto + + def __len__(self): + return len(self.lengths) + + def __iter__(self): + if self.variable_length: + assert not self.group_by_modality, "Variable length grouping is not supported with modality grouping." + indices = get_variable_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator) + else: + if self.group_by_modality: + indices = get_modality_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator) + elif self.group_by_modality_auto: + indices = get_modality_length_grouped_indices_auto(self.lengths, self.batch_size, self.world_size, generator=self.generator) + else: + indices = get_length_grouped_indices_auto_single(self.lengths, self.batch_size, self.world_size, generator=self.generator) + return iter(indices) + + +class LLaVATrainer(Trainer): + def __init__(self, obs_upload_path=None, **kwargs) -> None: + super().__init__(**kwargs) + self.obs_upload_path = obs_upload_path + + def create_accelerator_and_postprocess(self): + grad_acc_kwargs = {"num_steps": self.args.gradient_accumulation_steps} + grad_acc_kwargs["sync_with_dataloader"] = False + gradient_accumulation_plugin = GradientAccumulationPlugin(**grad_acc_kwargs) + + accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) + rank0_print("Setting NCCL timeout to INF to avoid running errors.") + + # create accelerator object + self.accelerator = 
Accelerator( + split_batches=self.args.split_batches, deepspeed_plugin=self.args.deepspeed_plugin, gradient_accumulation_plugin=gradient_accumulation_plugin, kwargs_handlers=[accelerator_kwargs] + ) + # some Trainer classes need to use `gather` instead of `gather_for_metrics`, thus we store a flag + self.gather_function = self.accelerator.gather_for_metrics + + # deepspeed and accelerate flags covering both trainer args and accelerate launcher + self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None + self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None + + # post accelerator creation setup + if self.is_fsdp_enabled: + fsdp_plugin = self.accelerator.state.fsdp_plugin + fsdp_plugin.limit_all_gathers = self.args.fsdp_config.get("limit_all_gathers", fsdp_plugin.limit_all_gathers) + if is_accelerate_available("0.23.0"): + fsdp_plugin.activation_checkpointing = self.args.fsdp_config.get("activation_checkpointing", fsdp_plugin.activation_checkpointing) + if fsdp_plugin.activation_checkpointing and self.args.gradient_checkpointing: + raise ValueError("The activation_checkpointing in FSDP config and the gradient_checkpointing in training arg " "can't be set to True simultaneously. 
Please use FSDP's activation_checkpointing logic " "when using FSDP.") + + if self.is_deepspeed_enabled and getattr(self.args, "hf_deepspeed_config", None) is None: + self.propagate_args_to_deepspeed() + + def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: + if self.train_dataset is None or not has_length(self.train_dataset): + return None + + if self.args.group_by_length: + lengths = self.train_dataset.lengths + return LengthGroupedSampler( + # self.args.train_batch_size * self.args.gradient_accumulation_steps, # TODO: seems that we should not have gradient_accumulation_steps + self.args.train_batch_size, + # world_size=self.args.world_size, + world_size=self.args.world_size * self.args.gradient_accumulation_steps, # TODO: seems that this may work? + lengths=lengths, + ) + elif self.args.group_by_modality_length: + lengths = self.train_dataset.modality_lengths + return LengthGroupedSampler( + # self.args.train_batch_size * self.args.gradient_accumulation_steps, # TODO: seems that we should not have gradient_accumulation_steps + self.args.train_batch_size, + # world_size=self.args.world_size, + world_size=self.args.world_size * self.args.gradient_accumulation_steps, # TODO: seems that this may work? + lengths=lengths, + group_by_modality=True, + ) + elif self.args.group_by_modality_length_auto: + lengths = self.train_dataset.modality_lengths + return LengthGroupedSampler( + # self.args.train_batch_size * self.args.gradient_accumulation_steps, # TODO: seems that we should not have gradient_accumulation_steps + self.args.train_batch_size, + # world_size=self.args.world_size, + world_size=self.args.world_size * self.args.gradient_accumulation_steps, # TODO: seems that this may work? 
+ lengths=lengths, + group_by_modality_auto=True, + ) + elif self.args.group_by_varlen: + lengths = self.train_dataset.lengths + return LengthGroupedSampler( + self.args.train_batch_size * self.args.gradient_accumulation_steps, + # self.args.train_batch_size, # TODO: seems that we should have gradient_accumulation_steps + # world_size=self.args.world_size, + world_size=self.args.world_size * self.args.gradient_accumulation_steps, # TODO: seems that this may work? + lengths=lengths, + variable_length=True, + ) + else: + return super()._get_train_sampler() + + def get_train_dataloader(self) -> DataLoader: + """ + Returns the training [`~torch.utils.data.DataLoader`]. + + Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed + training if necessary) otherwise. + + Subclass and override this method if you want to inject some custom behavior. + """ + if self.train_dataset is None: + raise ValueError("Trainer: training requires a train_dataset.") + + train_dataset = self.train_dataset + data_collator = self.data_collator + if is_datasets_available() and isinstance(train_dataset, datasets.Dataset): + train_dataset = self._remove_unused_columns(train_dataset, description="training") + else: + data_collator = self._get_collator_with_removed_columns(data_collator, description="training") + + dataloader_params = { + "batch_size": self._train_batch_size, + "collate_fn": data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "persistent_workers": self.args.dataloader_persistent_workers, + } + + if not isinstance(train_dataset, torch.utils.data.IterableDataset): + dataloader_params["sampler"] = self._get_train_sampler() + dataloader_params["drop_last"] = self.args.dataloader_drop_last + dataloader_params["worker_init_fn"] = seed_worker + dataloader_params["prefetch_factor"] = self.args.dataloader_num_workers * 2 if self.args.dataloader_num_workers != 0 else None 
+ + if hasattr(self.args, "use_webdataset") and self.args.use_webdataset: + dataloader = DataLoader(train_dataset, **dataloader_params) + else: + dataloader = self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params)) + + return dataloader + + def create_optimizer(self): + """ + Setup the optimizer. + + We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the + Trainer's init through `optimizers`, or subclass and override this method in a subclass. + """ + if is_sagemaker_mp_enabled(): + return super().create_optimizer() + + opt_model = self.model + + if self.optimizer is None: + decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS) + decay_parameters = [name for name in decay_parameters if "bias" not in name] + lr_mapper = {} + if self.args.mm_projector_lr is not None: + lr_mapper["mm_projector"] = self.args.mm_projector_lr + if self.args.mm_vision_tower_lr is not None: + lr_mapper["vision_tower"] = self.args.mm_vision_tower_lr + if len(lr_mapper) > 0: + special_lr_parameters = [name for name, _ in opt_model.named_parameters() if any(module_keyword in name for module_keyword in lr_mapper)] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in special_lr_parameters and p.requires_grad)], + "weight_decay": self.args.weight_decay, + }, + { + "params": [p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in special_lr_parameters and p.requires_grad)], + "weight_decay": 0.0, + }, + ] + for module_keyword, lr in lr_mapper.items(): + module_parameters = [name for name, _ in opt_model.named_parameters() if module_keyword in name] + optimizer_grouped_parameters.extend( + [ + { + "params": [p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in module_parameters and p.requires_grad)], + "weight_decay": self.args.weight_decay, + "lr": lr, + }, + { + 
"params": [p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in module_parameters and p.requires_grad)], + "weight_decay": 0.0, + "lr": lr, + }, + ] + ) + else: + optimizer_grouped_parameters = [ + { + "params": [p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad)], + "weight_decay": self.args.weight_decay, + }, + { + "params": [p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad)], + "weight_decay": 0.0, + }, + ] + + optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) + + self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) + if optimizer_cls.__name__ == "Adam8bit": + import bitsandbytes + + manager = bitsandbytes.optim.GlobalOptimManager.get_instance() + + skipped = 0 + for module in opt_model.modules(): + if isinstance(module, nn.Embedding): + skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values()) + logger.info(f"skipped {module}: {skipped/2**20}M params") + manager.register_module_override(module, "weight", {"optim_bits": 32}) + logger.debug(f"bitsandbytes: will optimize {module} in fp32") + logger.info(f"skipped: {skipped/2**20}M params") + + return self.optimizer + + def _save_checkpoint(self, model, trial, metrics=None): + if getattr(self.args, "tune_mm_mlp_adapter", False) or ( + hasattr(self.args, "mm_tunable_parts") and (len(self.args.mm_tunable_parts.split(",")) == 1 and ("mm_mlp_adapter" in self.args.mm_tunable_parts or "mm_vision_resampler" in self.args.mm_tunable_parts)) + ): + from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR + + checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" + + run_dir = self._get_output_dir(trial=trial) + output_dir = os.path.join(run_dir, checkpoint_folder) + + # Only save Adapter + keys_to_match = ["mm_projector", "vision_resampler"] + if getattr(self.args, "use_im_start_end", False): + 
keys_to_match.extend(["embed_tokens", "embed_in"]) + + weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match) + + if self.args.local_rank == 0 or self.args.local_rank == -1: + self.model.config.save_pretrained(output_dir) + torch.save(weight_to_save, os.path.join(output_dir, f"mm_projector.bin")) + + else: + #super(LLaVATrainer, self)._save_checkpoint(model, trial, metrics) + from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR + checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" + + if self.hp_search_backend is None and trial is None: + self.store_flos() + + + run_dir = self._get_output_dir(trial=trial) + output_dir = os.path.join(run_dir, checkpoint_folder) + + self.save_model(output_dir, _internal_call=True) + + if not self.args.save_only_model: + # Save optimizer and scheduler + self._save_optimizer_and_scheduler(output_dir) + # Save RNG state + self._save_rng_state(output_dir) + + # Determine the new best metric / best model checkpoint + if metrics is not None and self.args.metric_for_best_model is not None: + import numpy as np + metric_to_check = self.args.metric_for_best_model + if not metric_to_check.startswith("eval_"): + metric_to_check = f"eval_{metric_to_check}" + metric_value = metrics[metric_to_check] + + operator = np.greater if self.args.greater_is_better else np.less + if ( + self.state.best_metric is None + or self.state.best_model_checkpoint is None + or operator(metric_value, self.state.best_metric) + ): + self.state.best_metric = metric_value + self.state.best_model_checkpoint = output_dir + + # Save the Trainer state + if self.args.should_save: + TRAINER_STATE_NAME = "trainer_state.json" + self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME)) + + if self.args.push_to_hub: + self._push_from_checkpoint(output_dir) + + if self.obs_upload_path: + if dist.get_rank()==0: + log_dir = os.path.join(os.path.dirname(output_dir),"runs") + mox.file.set_auth(ak="",sk="") 
+ mox.file.copy_parallel(log_dir, self.obs_upload_path+log_dir) + mox.file.copy_parallel(output_dir, self.obs_upload_path+output_dir) + + # Maybe delete some older checkpoints. + if self.args.should_save: + # Solely rely on numerical checkpoint id for rotation. + # mtime is not reliable especially on some fuse fs in cloud environments. + self._rotate_checkpoints(use_mtime=False, output_dir=run_dir) + + + def _save(self, output_dir: Optional[str] = None, state_dict=None): + if getattr(self.args, "tune_mm_mlp_adapter", False): + pass + else: + super(LLaVATrainer, self)._save(output_dir, state_dict) + + +class LLaVADPOTrainer(DPOTrainer): + def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: + if self.train_dataset is None or not has_length(self.train_dataset): + return None + + if self.args.group_by_modality_length: + lengths = self.train_dataset.modality_lengths + return LengthGroupedSampler( + # self.args.train_batch_size * self.args.gradient_accumulation_steps, # TODO: seems that we should not have gradient_accumulation_steps + self.args.train_batch_size, + world_size=self.args.world_size, + lengths=lengths, + group_by_modality=True, + ) + else: + return super()._get_train_sampler() + + def _save_checkpoint(self, model, trial, metrics=None): + if getattr(self.args, "tune_mm_mlp_adapter", False) or ( + hasattr(self.args, "mm_tunable_parts") and (len(self.args.mm_tunable_parts.split(",")) == 1 and ("mm_mlp_adapter" in self.args.mm_tunable_parts or "mm_vision_resampler" in self.args.mm_tunable_parts)) + ): + from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR + + checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" + + run_dir = self._get_output_dir(trial=trial) + output_dir = os.path.join(run_dir, checkpoint_folder) + + # Only save Adapter + keys_to_match = ["mm_projector", "vision_resampler"] + if getattr(self.args, "use_im_start_end", False): + keys_to_match.extend(["embed_tokens", "embed_in"]) + + weight_to_save = 
get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match) + + if self.args.local_rank == 0 or self.args.local_rank == -1: + self.model.config.save_pretrained(output_dir) + torch.save(weight_to_save, os.path.join(output_dir, f"mm_projector.bin")) + else: + # super(LLaVADPOTrainer, self)._save_checkpoint(model, trial, metrics) + # print(type(model)) + # from transformers.modeling_utils import unwrap_model + # print(type(unwrap_model(model))) + # print(unwrap_model(model).config) + if self.args.lora_enable: + from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR + + checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" + run_dir = self._get_output_dir(trial=trial) + output_dir = os.path.join(run_dir, checkpoint_folder) + from transformers.modeling_utils import unwrap_model + + unwrapped_model = unwrap_model(model) + self.save_my_lora_ckpt(output_dir, self.args, unwrapped_model) + else: + super(LLaVADPOTrainer, self)._save_checkpoint(model, trial, metrics) + + def _save(self, output_dir: Optional[str] = None, state_dict=None): + if getattr(self.args, "tune_mm_mlp_adapter", False): + pass + else: + super(LLaVADPOTrainer, self)._save(output_dir, state_dict) diff --git a/train/llava/train/llava_trainer_eval.py b/train/llava/train/llava_trainer_eval.py new file mode 100644 index 0000000..e822258 --- /dev/null +++ b/train/llava/train/llava_trainer_eval.py @@ -0,0 +1,76 @@ +import json +import subprocess + +from llava.train.llava_trainer import LLaVATrainer + + +class LLaVAEvalTrainer(LLaVATrainer): + def evaluate(self, evaluate_args): + cmd = f"accelerate launch --num_processes {evaluate_args.eval_num_processes} -m lmms_eval \ + --model {evaluate_args.model} \ + --model_args {evaluate_args.model_args} \ + --tasks {evaluate_args.task_names} \ + --batch_size {evaluate_args.batch_size} \ + --log_samples_suffix {evaluate_args.log_samples_suffix} \ + --output_path {evaluate_args.output_path}" + if evaluate_args.limit: + cmd += f" 
--limit {evaluate_args.limit}" + if evaluate_args.num_fewshot: + cmd += f" --num_fewshot {evaluate_args.num_fewshot}" + if evaluate_args.gen_kwargs != "": + cmd += f" --gen_kwargs {evaluate_args.gen_kwargs}" + if evaluate_args.log_samples: + cmd += f" --log_samples" + else: + assert False, "Please log samples so that the result can be parsed" + results = subprocess.run([cmd], shell=True, capture_output=True, text=True) + try: + result_file_index_start = results.stdout.index("Saved samples to ") + result_file_index_end = results.stdout.index(f".json") + result_file_index_start += len("Saved samples to ") + file = results.stdout[result_file_index_start:result_file_index_end] + except: + result_file_index_start = results.stderr.index("Saved samples to ") + result_file_index_end = results.stderr.index(f".json") + result_file_index_start += len("Saved samples to ") + file = results.stderr[result_file_index_start:result_file_index_end] + file = file.split("/")[:-1] + file = "/".join(file) + "/results.json" + with open(file, "r") as f: + lmms_eval_results = json.load(f) + result_dict = {} + tasks_list = evaluate_args.task_names.split(",") + for task in tasks_list: + task_results = lmms_eval_results["results"][task] + for k, v in task_results.items(): + if k != "alias" and "stderr" not in k: + metric = k.split(",")[0] + result_dict[f"{task}_{metric}"] = v + return result_dict + + """def evaluate(self, evaluate_args): + initialize_tasks() + tasks_list = evaluate_args.task_names.split(",") + result_dict = {} + results = evaluator.simple_evaluate( + model=evaluate_args.model, + model_args=evaluate_args.model_args, + tasks=tasks_list, + num_fewshot=evaluate_args.num_fewshot, + batch_size=evaluate_args.batch_size, + device=evaluate_args.device, + limit=evaluate_args.limit, + check_integrity=evaluate_args.check_integrity, + show_task_to_terminal=evaluate_args.show_task_to_terminal, + log_samples=evaluate_args.log_samples, + gen_kwargs=evaluate_args.gen_kwargs, + 
cli_args=evaluate_args, + ) + for task in tasks_list: + task_results = results["results"][task] + for k,v in task_results.items(): + if k != "alias" and "stderr" not in k: + metric = k.split(",")[0] + result_dict[f"{task}_{metric}"] = v + + return result_dict""" diff --git a/train/llava/train/train.py b/train/llava/train/train.py new file mode 100644 index 0000000..bda1869 --- /dev/null +++ b/train/llava/train/train.py @@ -0,0 +1,1773 @@ +# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: +# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: +# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import ast +import copy +import logging +import os +import pathlib +import random +import re +import time +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Sequence + +import numpy as np +import tokenizers +import torch +import torch.distributed as dist +import transformers +from llava import conversation as conversation_lib +from llava.constants import ( + DEFAULT_IM_END_TOKEN, + DEFAULT_IM_START_TOKEN, + DEFAULT_IMAGE_TOKEN, + IGNORE_INDEX, + IMAGE_TOKEN_INDEX, +) +from llava.mm_utils import ( + process_anyres_image, + process_highres_image, + process_highres_image_crop_split, + tokenizer_image_token, +) +from llava.model import * +from llava.train.llava_trainer import LLaVATrainer +from llava.utils import ( + rank0_print, + read_frames_gif, + read_frames_pyav, +) +from packaging import version +from PIL import Image, ImageFile +from torch.utils.data import Dataset +from transformers import AutoConfig + +import train.llava.s3_ops as st + +torch.multiprocessing.set_sharing_strategy("file_system") + +ImageFile.LOAD_TRUNCATED_IMAGES = True +local_rank = None + +IS_TOKENIZER_GREATER_THAN_0_14 = version.parse(tokenizers.__version__) >= version.parse( + "0.14" +) + +import warnings + +warnings.filterwarnings("ignore") + + +@dataclass +class ModelArguments: + model_name_or_path: Optional[str] = field(default="facebook/opt-125m") + model_class_name: Optional[str] = field( + default=None, + metadata={ + "help": "Used to init model class, format is XXXXForCausalLM. e.g. 
currently XXXX is chosen from LlavaLlama, LlavaMixtral, LlavaMistral, Llama" + }, + ) + + mm_tunable_parts: Optional[str] = field( + default=None, + metadata={ + "help": 'Could be "mm_mlp_adapter", "mm_vision_resampler", "mm_vision_tower,mm_mlp_adapter,mm_language_model", "mm_vision_tower,mm_mlp_adapter,mm_language_model", "mm_mlp_adapter,mm_language_model"' + }, + ) + # deciding which part of the multimodal model to tune, will overwrite other previous settings + + version: Optional[str] = field(default="v0") + freeze_backbone: bool = field(default=False) + tune_mm_mlp_adapter: bool = field(default=False) + tune_mm_vision_resampler: bool = field(default=False) + vision_tower: Optional[str] = field(default=None) + vision_tower_pretrained: Optional[str] = field( + default=None + ) # default to the last layer + + num_token: Optional[bool] = field(default=False) + + unfreeze_mm_vision_tower: bool = field(default=False) + unfreeze_language_model: bool = field(default=False) + mm_vision_select_layer: Optional[int] = field( + default=-1 + ) # default to the last layer + pretrain_mm_mlp_adapter: Optional[str] = field(default=None) + mm_projector_type: Optional[str] = field(default="linear") + mm_use_im_start_end: bool = field(default=False) + mm_use_im_patch_token: bool = field(default=True) + mm_patch_merge_type: Optional[str] = field(default="flat") + mm_vision_select_feature: Optional[str] = field(default="patch") + mm_resampler_type: Optional[str] = field(default=None) + mm_mask_drop_mode: str = field(default="fixed") + mm_mask_drop_skip_percentage: float = field(default=0.0) + mm_mask_drop_ratio: float = field(default=0.25) + mm_mask_drop_ratio_upper: Optional[float] = field(default=None) + mm_mask_drop_ratio_lower: Optional[float] = field(default=None) + mm_spatial_pool_stride: Optional[int] = field(default=None) + mm_spatial_pool_mode: str = field(default="bilinear") + mm_spatial_pool_out_channels: Optional[int] = field(default=None) + mm_perceiver_depth: 
Optional[int] = field(default=3) + mm_perceiver_latents: Optional[int] = field(default=32) + mm_perceiver_ff_mult: Optional[float] = field(default=4) + mm_perceiver_pretrained: Optional[str] = field(default=None) + mm_qformer_depth: Optional[int] = field(default=3) + mm_qformer_latents: Optional[int] = field(default=32) + mm_qformer_pretrained: Optional[str] = field(default=None) + + rope_scaling_factor: Optional[float] = field(default=None) + rope_scaling_type: Optional[str] = field(default=None) + + s2: Optional[bool] = field(default=False) + s2_scales: Optional[str] = field(default="336,672,1008") + + use_pos_skipping: Optional[bool] = field(default=False) + pos_skipping_range: Optional[int] = field(default=4096) + + mm_newline_position: Optional[str] = field(default="grid") + delay_load: Optional[bool] = field(default=True) + add_faster_video: Optional[bool] = field(default=False) + faster_token_stride: Optional[int] = field(default=10) + + num_experts_per_tok: Optional[int] = field(default=-1) + num_experts: Optional[int] = field(default=-1) + moe_choice: Optional[str] = field(default="expert") + moe_router_score_function: Optional[str] = field(default=None) + moe_lora_rank: Optional[int] = field(default=32) + moe_lora_alpha: Optional[int] = field(default=32) + moe_lora_in_features: Optional[int] = field(default=4096) + moe_lora_out_features: Optional[int] = field(default=4096) + moe_lora_dropout: Optional[float] = field(default=0.0) + capacity_factor: Optional[float] = field(default=0.1) + + +@dataclass +class DataArguments: + data_path: str = field( + default=None, + metadata={ + "help": "Path to the training data, in llava's instruction.json format. 
Supporting multiple json files via /path/to/{a,b,c}.json" + }, + ) + + lazy_preprocess: bool = False + is_multimodal: bool = False + early_mix_text: bool = False + use_webdataset: bool = False + total_samples: Optional[int] = field(default=None) + + image_folder: Optional[str] = field(default=None) + image_folder_2: Optional[str] = field( + default=None, + metadata={"help": "Path to the second image folder, used as a fallback."}, + ) + image_aspect_ratio: str = "square" + image_grid_pinpoints: Optional[str] = field(default=None) + image_crop_resolution: Optional[int] = field(default=None) + image_split_resolution: Optional[int] = field(default=None) + + video_folder: Optional[str] = field(default=None) + video_fps: Optional[int] = field(default=1) + frames_upbound: Optional[int] = field(default=0) + add_time_instruction: Optional[bool] = field(default=False) + force_sample: Optional[bool] = field(default=False) + + +@dataclass +class TrainingArguments(transformers.TrainingArguments): + cache_dir: Optional[str] = field(default=None) + optim: str = field(default="adamw_torch") + remove_unused_columns: bool = field(default=False) + freeze_mm_mlp_adapter: bool = field(default=False) + freeze_mm_vision_resampler: bool = field(default=False) + mpt_attn_impl: Optional[str] = field(default="triton") + model_max_length: int = field( + default=4096, + metadata={ + "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)." + }, + ) + double_quant: bool = field( + default=True, + metadata={ + "help": "Compress the quantization statistics through double quantization." + }, + ) + quant_type: str = field( + default="nf4", + metadata={ + "help": "Quantization data type to use. Should be one of `fp4` or `nf4`." 
+ }, + ) + bits: int = field(default=16, metadata={"help": "How many bits to use."}) + lora_enable: bool = False + lora_r: int = 64 + lora_alpha: int = 16 + lora_dropout: float = 0.05 + lora_weight_path: str = "" + lora_bias: str = "none" + mm_projector_lr: Optional[float] = None + mm_vision_tower_lr: Optional[float] = None + group_by_varlen: bool = field(default=False) + group_by_modality_length: bool = field(default=False) + group_by_modality_length_auto: bool = field(default=False) + auto_find_batch_size: bool = field(default=False) + gradient_checkpointing: bool = field(default=True) + verbose_logging: bool = field(default=False) + attn_implementation: str = field( + default="flash_attention_2", + metadata={"help": "Use transformers attention implementation."}, + ) + use_conversation_mask: bool = field(default=True) + split_batches: bool = field(default=False) + torch_empty_cache_steps: int = field(default=1) + fp16: bool = field(default=False) + debug_mode: bool = field(default=False) + obs_upload_path: str = field( + default="", metadata={"help": "Path to the obs upload path."} + ) + output_dir: str = field( + default="exp", metadata={"help": "Path to the output directory."} + ) + save_only_model: bool = True + + +# @dataclass +# class EvaluationArguments: +# eval_num_processes: int = field(default=1) +# task_names: str = field(default=None) +# model: str = field(default="llava") +# model_args: Optional[str] = field(default=None) +# num_fewshot: Optional[int] = field(default=None) +# batch_size: int = field(default=1) +# device: Optional[str] = field(default=None) +# limit: Optional[int] = field(default=None) +# check_integrity: Optional[bool] = field(default=False) +# show_task_to_terminal: Optional[bool] = field(default=False) +# log_samples: Optional[bool] = field(default=True) +# gen_kwargs: Optional[str] = field(default="") +# log_samples_suffix: Optional[str] = field(default="") +# output_path: Optional[str] = field(default="./logs/") + + +def 
maybe_zero_3(param, ignore_status=False, name=None):
+    from deepspeed import zero
+    from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
+
+    if hasattr(param, "ds_id"):
+        if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
+            if not ignore_status:
+                logging.warning(
+                    f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}"
+                )
+        with zero.GatheredParameters([param]):
+            param = param.data.detach().cpu().clone()
+    else:
+        param = param.detach().cpu().clone()
+    return param
+
+
+# Borrowed from peft.utils.get_peft_model_state_dict
+def get_peft_state_maybe_zero_3(named_params, bias):
+    if bias == "none":
+        to_return = {k: t for k, t in named_params if "lora_" in k}
+    elif bias == "all":
+        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
+    elif bias == "lora_only":
+        to_return = {}
+        maybe_lora_bias = {}
+        lora_bias_names = set()
+        for k, t in named_params:
+            if "lora_" in k:
+                to_return[k] = t
+                bias_name = k.split("lora_")[0] + "bias"
+                lora_bias_names.add(bias_name)
+            elif "bias" in k:
+                maybe_lora_bias[k] = t
+        # iterate the collected bias tensors (.items(), not the dict itself) and
+        # key on each candidate name k — not the stale bias_name from the loop above
+        for k, t in maybe_lora_bias.items():
+            if k in lora_bias_names:
+                to_return[k] = t
+    else:
+        raise NotImplementedError
+    to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
+    return to_return
+
+
+def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
+    to_return = {k: t for k, t in named_params if "lora_" not in k}
+    if require_grad_only:
+        to_return = {k: t for k, t in to_return.items() if t.requires_grad}
+    to_return = {
+        k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()
+    }
+    return to_return
+
+
+def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
+    to_return = {
+        k: t
+        for k, t in named_params
+        if any(key_match in k for key_match in keys_to_match)
+    }
+    to_return = {
+        k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()
+    }
+    return to_return
+
+
+def 
find_all_linear_names(model): + cls = torch.nn.Linear + lora_module_names = set() + multimodal_keywords = ["mm_projector", "vision_tower", "vision_resampler"] + for name, module in model.named_modules(): + if any(mm_keyword in name for mm_keyword in multimodal_keywords): + continue + if isinstance(module, cls): + names = name.split(".") + lora_module_names.add(names[0] if len(names) == 1 else names[-1]) + + if "lm_head" in lora_module_names: # needed for 16-bit + lora_module_names.remove("lm_head") + return list(lora_module_names) + + +def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str): + """Collects the state dict and dump to disk.""" + if ( + hasattr(trainer.args, "tune_mm_mlp_adapter") + and trainer.args.tune_mm_mlp_adapter + ): + check_only_save_mm_adapter_tunnable = True + # only has mm_mlp_adapter and mm_vision_resampler in the tuneable parts + elif hasattr(trainer.args, "mm_tunable_parts") and ( + len(trainer.args.mm_tunable_parts.split(",")) == 1 + and ( + "mm_mlp_adapter" in trainer.args.mm_tunable_parts + or "mm_vision_resampler" in trainer.args.mm_tunable_parts + ) + ): + check_only_save_mm_adapter_tunnable = True + else: + check_only_save_mm_adapter_tunnable = False + + trainer.accelerator.wait_for_everyone() + if torch.cuda.is_available(): + torch.cuda.synchronize() + elif hasattr(torch, "npu") and torch.npu.is_available(): + torch.npu.synchronize() + # torch.cuda.synchronize() + + rank0_print(f"Only save projectors: {check_only_save_mm_adapter_tunnable}") + if check_only_save_mm_adapter_tunnable: + # Only save Adapter + keys_to_match = ["mm_projector", "vision_resampler"] + if getattr(trainer.args, "use_im_start_end", False): + keys_to_match.extend(["embed_tokens", "embed_in"]) + + weight_to_save = get_mm_adapter_state_maybe_zero_3( + trainer.model.named_parameters(), keys_to_match + ) + trainer.model.config.save_pretrained(output_dir) + + current_folder = output_dir.split("/")[-1] + parent_folder = 
os.path.dirname(output_dir) + if trainer.args.local_rank == 0 or trainer.args.local_rank == -1: + if current_folder.startswith("checkpoint-"): + mm_projector_folder = os.path.join(parent_folder, "mm_projector") + os.makedirs(mm_projector_folder, exist_ok=True) + torch.save( + weight_to_save, + os.path.join(mm_projector_folder, f"{current_folder}.bin"), + ) + else: + torch.save( + weight_to_save, os.path.join(output_dir, f"mm_projector.bin") + ) + return + + if trainer.deepspeed: + trainer.save_model(output_dir) + return + + state_dict = trainer.model.state_dict() + if trainer.args.should_save: + cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()} + del state_dict + trainer._save(output_dir, state_dict=cpu_state_dict) # noqa + + +def smart_tokenizer_and_embedding_resize( + special_tokens_dict: Dict, + tokenizer: transformers.PreTrainedTokenizer, + model: transformers.PreTrainedModel, +): + """Resize tokenizer and embedding. + + Note: This is the unoptimized version that may make your embedding size not be divisible by 64. 
+ """ + num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) + model.resize_token_embeddings(len(tokenizer)) + + if num_new_tokens > 0: + input_embeddings = model.get_input_embeddings().weight.data + output_embeddings = model.get_output_embeddings().weight.data + + input_embeddings_avg = input_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True + ) + output_embeddings_avg = output_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True + ) + + input_embeddings[-num_new_tokens:] = input_embeddings_avg + output_embeddings[-num_new_tokens:] = output_embeddings_avg + + +def _tokenize_fn( + strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer +) -> Dict: + """Tokenize a list of strings.""" + tokenized_list = [ + tokenizer( + text, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ) + for text in strings + ] + input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list] + input_ids_lens = labels_lens = [ + tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() + for tokenized in tokenized_list + ] + return dict( + input_ids=input_ids, + labels=labels, + input_ids_lens=input_ids_lens, + labels_lens=labels_lens, + ) + + +def _mask_targets(target, tokenized_lens, speakers): + # cur_idx = 0 + cur_idx = tokenized_lens[0] + tokenized_lens = tokenized_lens[1:] + target[:cur_idx] = IGNORE_INDEX + for tokenized_len, speaker in zip(tokenized_lens, speakers): + if speaker == "human": + target[cur_idx + 2 : cur_idx + tokenized_len] = IGNORE_INDEX + cur_idx += tokenized_len + + +def _add_speaker_and_signal(header, source, get_conversation=True): + """Add speaker and start/end signal on each round.""" + BEGIN_SIGNAL = "### " + END_SIGNAL = "\n" + conversation = header + for sentence in source: + from_str = sentence["from"] + if from_str.lower() == "human": + from_str = conversation_lib.default_conversation.roles[0] + elif from_str.lower() == "gpt": + from_str = 
conversation_lib.default_conversation.roles[1] + else: + from_str = "unknown" + sentence["value"] = ( + BEGIN_SIGNAL + from_str + ": " + sentence["value"] + END_SIGNAL + ) + if get_conversation: + conversation += sentence["value"] + conversation += BEGIN_SIGNAL + return conversation + + +def preprocess_multimodal(sources: Sequence[str], data_args: DataArguments) -> Dict: + is_multimodal = data_args.is_multimodal + if not is_multimodal: + return sources + + for source in sources: + for sentence in source: + # TODO maybe this should be changed for interleaved data? + # if DEFAULT_IMAGE_TOKEN in sentence["value"] and not sentence["value"].startswith(DEFAULT_IMAGE_TOKEN): + # only check for num_im=1 + num_im = len(re.findall(DEFAULT_IMAGE_TOKEN, sentence["value"])) + if ( + num_im == 1 + and DEFAULT_IMAGE_TOKEN in sentence["value"] + and not sentence["value"].startswith(DEFAULT_IMAGE_TOKEN) + ): + sentence["value"] = ( + sentence["value"].replace(DEFAULT_IMAGE_TOKEN, "").strip() + ) + sentence["value"] = DEFAULT_IMAGE_TOKEN + "\n" + sentence["value"] + sentence["value"] = sentence["value"].strip() + if "mmtag" in conversation_lib.default_conversation.version: + sentence["value"] = sentence["value"].replace( + DEFAULT_IMAGE_TOKEN, + "" + DEFAULT_IMAGE_TOKEN + "", + ) + replace_token = DEFAULT_IMAGE_TOKEN + if data_args.mm_use_im_start_end: + replace_token = ( + DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN + ) + sentence["value"] = sentence["value"].replace( + DEFAULT_IMAGE_TOKEN, replace_token + ) + + # For videoInstruct-100k noisy_data. TODO: Ask Yuanhan to clean the data instead of leaving the noise code here. + sentence["value"] = sentence["value"].replace( + "QA_GT_caption_based_noisy", "" + ) + + return sources + + +def preprocess_llada( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False, + max_len=2048, + system_message: str = "You are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.", +) -> Dict: + # roles = {"human": "<|start_header_id|>user<|end_header_id|>", "gpt": "<|start_header_id|>assistant<|end_header_id|>"} + roles = {"human": "user", "gpt": "assistant", "system": "system"} + + # Add image tokens to tokenizer as a special tokens + # Use a deepcopy of tokenizer so that we don't modify on the tokenizer + # tokenizer = copy.deepcopy(tokenizer) + # # When there is actually an image, we add the image tokens as a special token + # if has_image: + # tokenizer.add_tokens([""], special_tokens=True) + image_token_index = tokenizer.convert_tokens_to_ids("") + bos_token_id = tokenizer.convert_tokens_to_ids("<|startoftext|>") + start_header_id = tokenizer.convert_tokens_to_ids("<|start_header_id|>") + end_header_id = tokenizer.convert_tokens_to_ids("<|end_header_id|>") + eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>") + + unmask_tokens = [ + "<|startoftext|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|eot_id|>", + "\n\n", + ] + unmask_tokens_idx = [tokenizer.convert_tokens_to_ids(tok) for tok in unmask_tokens] + # Reset LLaDA chat templates so that it won't include assistant message every time we apply + chat_template = "{% for message in messages %}{{'<|startoftext|>' + '<|start_header_id|>' + message['role'] + '<|end_header_id|>' + '\n\n' + message['content'] + '<|eot_id|>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + tokenizer.chat_template = chat_template + + # After update, calling tokenizer of llama3 will + # auto add bos id for the tokens. 
ヽ(`⌒´)ノ + def safe_tokenizer_llama3(text): + input_ids = tokenizer(text).input_ids + if input_ids[0] == bos_token_id: + input_ids = input_ids[1:] + return input_ids + + nl_tokens = tokenizer.convert_tokens_to_ids("\n\n") + # Apply prompt templates + input_ids, targets = [], [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] == roles["system"]: + try: + system_message = source[0]["content"] + except: + system_message = source[0]["value"] + source = source[1:] + + input_id, target = [], [] + + # New version, use apply chat template + # Build system message for each sentence + input_id += tokenizer.apply_chat_template( + [{"role": "system", "content": system_message}] + ) + target += [IGNORE_INDEX] * len(input_id) + + for conv in source: + # Make sure llava data can load + try: + role = conv["role"] + content = conv["content"] + except: + role = conv["from"] + content = conv["value"] + + role = roles.get(role, role) + + if "video" in content: + content = content.replace("