diff --git a/.github/workflows/fast_tests.yml b/.github/workflows/fast_tests.yml
index e2d58fbe25..cdd7d1dbf5 100644
--- a/.github/workflows/fast_tests.yml
+++ b/.github/workflows/fast_tests.yml
@@ -21,7 +21,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -36,7 +36,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash tests/ci/fast_tests.sh
diffusers:
name: Run tests for optimum.habana.diffusers
@@ -46,7 +46,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -61,5 +61,5 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash tests/ci/fast_tests_diffusers.sh
diff --git a/.github/workflows/slow_tests.yml b/.github/workflows/slow_tests.yml
index b969273a3c..d0fcb85051 100644
--- a/.github/workflows/slow_tests.yml
+++ b/.github/workflows/slow_tests.yml
@@ -19,7 +19,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -31,7 +31,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash tests/ci/example_diff_tests.sh
stable-diffusion:
name: Test Stable Diffusion
@@ -45,7 +45,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -57,7 +57,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash tests/ci/slow_tests_diffusers.sh
deepspeed:
name: Test DeepSpeed models
@@ -72,7 +72,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -84,7 +84,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash tests/ci/slow_tests_deepspeed.sh
multi-card:
name: Test multi-card models
@@ -99,7 +99,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -111,7 +111,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash tests/ci/slow_tests_8x.sh
single-card:
name: Test single-card models
@@ -127,7 +127,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -139,7 +139,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash tests/ci/slow_tests_1x.sh
albert-xxl-single-card:
name: Test single-card ALBERT XXL
@@ -158,7 +158,7 @@ jobs:
- name: Pull image
if: github.event.schedule == '0 21 * * 6'
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run test
if: github.event.schedule == '0 21 * * 6'
run: |
@@ -171,7 +171,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash tests/ci/albert_xxl_1x.sh
- name: Warning
if: github.event.schedule != '0 21 * * 6'
@@ -192,7 +192,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -204,7 +204,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
make slow_tests_text_generation_example TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }}
trl:
name: Test TRL integration
@@ -223,7 +223,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -235,7 +235,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash tests/ci/slow_tests_trl.sh
sentence-transformers:
name: Test Sentence Transformers integration
@@ -263,7 +263,7 @@ jobs:
path: sentence-transformers
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -275,5 +275,5 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash optimum-habana/tests/ci/sentence_transformers.sh
diff --git a/.github/workflows/slow_tests_gaudi2.yml b/.github/workflows/slow_tests_gaudi2.yml
index 2e561a2765..88a37aa1b2 100644
--- a/.github/workflows/slow_tests_gaudi2.yml
+++ b/.github/workflows/slow_tests_gaudi2.yml
@@ -17,7 +17,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -30,7 +30,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash tests/ci/example_diff_tests.sh
stable-diffusion:
name: Test Stable Diffusion
@@ -43,7 +43,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -59,7 +59,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash tests/ci/slow_tests_diffusers.sh
deepspeed:
name: Test DeepSpeed models
@@ -72,7 +72,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -88,7 +88,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash tests/ci/slow_tests_deepspeed.sh ${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }}
fsdp:
name: Test FSDP models
@@ -101,7 +101,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -117,7 +117,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
make slow_tests_fsdp TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }}
multi-card:
name: Test multi-card models
@@ -130,7 +130,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -146,7 +146,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash tests/ci/slow_tests_8x.sh ${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }}
single-card:
name: Test single-card models
@@ -160,7 +160,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -177,7 +177,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash tests/ci/slow_tests_1x.sh
text-generation:
name: Test text-generation example
@@ -192,7 +192,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -208,7 +208,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
make slow_tests_text_generation_example TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }}
trl:
name: Test TRL integration
@@ -221,7 +221,7 @@ jobs:
uses: actions/checkout@v2
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -237,7 +237,7 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash tests/ci/slow_tests_trl.sh
sentence-transformers:
name: Test Sentence Transformers integration
@@ -258,7 +258,7 @@ jobs:
path: sentence-transformers
- name: Pull image
run: |
- docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
- name: Run tests
run: |
docker run \
@@ -274,5 +274,5 @@ jobs:
--cap-add=sys_nice \
--net=host \
--ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \
+ vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
/bin/bash optimum-habana/tests/ci/sentence_transformers.sh
diff --git a/Makefile b/Makefile
index e6989aa1b0..854197d214 100644
--- a/Makefile
+++ b/Makefile
@@ -93,7 +93,7 @@ slow_tests_8x: test_installs
# Run DeepSpeed non-regression tests
slow_tests_deepspeed: test_installs
- python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+ python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
python -m pytest tests/test_examples.py -v -s -k "deepspeed"
slow_tests_diffusers: test_installs
@@ -108,7 +108,7 @@ slow_tests_diffusers: test_installs
# Run text-generation non-regression tests
slow_tests_text_generation_example: test_installs
BUILD_CUDA_EXT=0 python -m pip install -vvv --no-build-isolation git+https://github.com/HabanaAI/AutoGPTQ.git
- python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+ python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
python -m pytest tests/test_text_generation_example.py tests/test_encoder_decoder.py -v -s --token $(TOKEN)
# Run image-to-text non-regression tests
diff --git a/README.md b/README.md
index 429caebffd..e44ca5430c 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ Please refer to the Intel Gaudi AI Accelerator official [installation guide](htt
> Tests should be run in a Docker container based on Intel Gaudi Docker images.
>
-> The current version has been validated for SynapseAI 1.18.
+> The current version has been validated for SynapseAI 1.19.
## Install the library and get example scripts
@@ -59,9 +59,9 @@ The `--upgrade-strategy eager` option is needed to ensure `optimum-habana` is up
To use the example associated with the latest stable release, run:
> ```
> git clone https://github.com/huggingface/optimum-habana
-> cd optimum-habana && git checkout v1.14.0
+> cd optimum-habana && git checkout v1.15.0
> ```
-> with `v1.14.0` the version number of this release.
+> where `v1.15.0` is the version number of this release.
### Option 2: Use the latest main branch under development
@@ -88,7 +88,7 @@ git clone -b transformers_future https://github.com/huggingface/optimum-habana
To use DeepSpeed on HPUs, you also need to run the following command:
>```bash
->pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+>pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
>```
To install the requirements for every example:
@@ -175,6 +175,22 @@ outputs = generator(
```
+## Important Note on PyTorch 2.5 Performance Degradation
+
+With the upgrade to PyTorch 2.5, users may experience some performance degradation due to a change in how the SDPA math backend handles FP16/BF16 inputs. The PyTorch 2.5 release note states:
+
+"A naive SDPA math backend, when using FP16/BF16 inputs, can accumulate significant numerical errors due to the usage of low-precision intermediate buffers. To mitigate this issue, the default behavior now involves upcasting FP16/BF16 inputs to FP32. Computations are performed in FP32/TF32, and the final FP32 results are then downcasted back to FP16/BF16. This will improve numerical accuracy of the final output for the math backend with FP16/BF16 inputs, but increases memory usages and may cause the performance regressions in the math backend as computations shift from FP16/BF16 BMM to FP32/TF32 BMM/Matmul."
+
+For scenarios where reduced-precision reductions are preferred for speed, they can be enabled with the following setting:
+```python
+torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
+```
+Additionally, the next release of Optimum for Intel Gaudi will include a Gaudi-specific `safe_softmax` implementation, which will also improve performance.
+
+More info:
+- https://pytorch.org/docs/stable/notes/numerical_accuracy.html
+
+
### Documentation
Check out [the documentation of Optimum for Intel Gaudi](https://huggingface.co/docs/optimum/habana/index) for more advanced usage.
diff --git a/docs/Dockerfile b/docs/Dockerfile
index 6dd8d3a29f..060b7413dc 100644
--- a/docs/Dockerfile
+++ b/docs/Dockerfile
@@ -1,4 +1,4 @@
-FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
ARG commit_sha
ARG clone_url
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 2b6e8a0a5c..fa54c4446e 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -24,7 +24,7 @@ python -m pip install --upgrade-strategy eager optimum[habana]
To use Microsoft® DeepSpeed with Intel Gaudi devices, you also need to run the following command:
```bash
-python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
```
To ensure that you are installing the correct Intel Gaudi Software, please run the `hl-smi` command to confirm the software version
diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx
index ec79ac05f9..c882de2629 100644
--- a/docs/source/quickstart.mdx
+++ b/docs/source/quickstart.mdx
@@ -32,12 +32,12 @@ platform for deep learning and follow the steps to start and connect to the node
## Docker Setup
Now that you have access to the node, you will use the latest Intel Gaudi AI Accelerator docker image by executing the docker run command which will
-automatically download and run the docker. At the time of writing this guide, latest Gaudi docker version was 1.18.0:
+automatically download and run the Docker image. At the time of writing this guide, the latest Gaudi Docker image version was 1.19.0:
```bash
-release=1.18.0
+release=1.19.0
os=ubuntu22.04
-torch=2.4.0
+torch=2.5.1
docker_image=vault.habana.ai/gaudi-docker/$release/$os/habanalabs/pytorch-installer-$torch:latest
```
@@ -65,11 +65,11 @@ docker run -itd \
## Optimum for Intel Gaudi Setup
Check latest release of Optimum for Intel Gaudi [here](https://github.com/huggingface/optimum-habana/releases).
-At the time of writing this guide, latest Optimum for Intel Gaudi release version was v1.14.0, which is paired with Intel Gaudi Software release
-version 1.18.0. Install Optimum for Intel Gaudi as follows:
+At the time of writing this guide, the latest Optimum for Intel Gaudi release was v1.15.0, which is paired with Intel Gaudi Software release
+version 1.19.0. Install Optimum for Intel Gaudi as follows:
```bash
-git clone -b v1.14.0 https://github.com/huggingface/optimum-habana
+git clone -b v1.15.0 https://github.com/huggingface/optimum-habana
pip install ./optimum-habana
```
@@ -115,7 +115,7 @@ Microsoft® DeepSpeed. Gaudi-specific fork of the library is maintained by Intel
To install the library compatible with the same Gaudi software release stack, use:
```bash
-pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
```
With DeepSpeed successfully installed we can now run a distributed GPT-2 inference on an 8 HPU system as follows:
@@ -135,7 +135,7 @@ run_generation.py \
🤗 Optimum for Intel Gaudi contains a number of examples demonstrating single and multi Gaudi device training/fine-tuning.
-For example, a number of language models can be trained with the scripts provided
+For example, a number of language models can be trained with the scripts provided in the
[language modeling examples section](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling).
As an illustration, let us run GPT-2 single and multi card training examples on Gaudi.
@@ -239,7 +239,7 @@ outputs = pipeline(
)
```
-In addition, sample scripts for fine-tuning diffusion models are given in
+In addition, sample scripts for fine-tuning diffusion models are given in the
[Stable Diffusion training section](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training).
A more comprehensive list of examples in Optimum for Intel Gaudi is given next.
diff --git a/docs/source/usage_guides/deepspeed.mdx b/docs/source/usage_guides/deepspeed.mdx
index 833358d9c4..f6617e92ce 100644
--- a/docs/source/usage_guides/deepspeed.mdx
+++ b/docs/source/usage_guides/deepspeed.mdx
@@ -32,7 +32,7 @@ You can find more information about DeepSpeed Gaudi integration [here](https://d
To use DeepSpeed on Gaudi, you need to install Optimum for Intel Gaudi and [DeepSpeed fork for Intel Gaudi](https://github.com/HabanaAI/DeepSpeed) with:
```bash
pip install optimum[habana]
-pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
```
@@ -79,7 +79,7 @@ It is strongly advised to read [this section](https://huggingface.co/docs/transf
-Other examples of configurations for HPUs are proposed [here](https://github.com/HabanaAI/Model-References/tree/1.18.0/PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts) by Intel.
+Other examples of configurations for HPUs are proposed [here](https://github.com/HabanaAI/Model-References/tree/1.19.0/PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts) by Intel.
The [Transformers documentation](https://huggingface.co/docs/transformers/main_classes/deepspeed#configuration) explains how to write a configuration from scratch very well.
A more complete description of all configuration possibilities is available [here](https://www.deepspeed.ai/docs/config-json/).
diff --git a/examples/audio-classification/README.md b/examples/audio-classification/README.md
index 6840cb580c..2fe6d5abd9 100644
--- a/examples/audio-classification/README.md
+++ b/examples/audio-classification/README.md
@@ -56,6 +56,7 @@ python run_audio_classification.py \
--use_hpu_graphs_for_inference \
--gaudi_config_name Habana/wav2vec2 \
--throughput_warmup_steps 3 \
+ --sdp_on_bf16 \
--bf16 \
--trust_remote_code True
```
@@ -93,6 +94,7 @@ PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \
--use_lazy_mode False\
--gaudi_config_name Habana/wav2vec2 \
--throughput_warmup_steps 3 \
+ --sdp_on_bf16 \
--bf16 \
--trust_remote_code True \
--torch_compile \
@@ -110,7 +112,7 @@ On 8 HPUs, this script should run in ~12 minutes and yield an accuracy of **80.4
> You need to install DeepSpeed with:
> ```bash
-> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
> ```
DeepSpeed can be used with almost the same command as for a multi-card run:
diff --git a/examples/audio-classification/run_audio_classification.py b/examples/audio-classification/run_audio_classification.py
index 9a23428866..6defd566d3 100644
--- a/examples/audio-classification/run_audio_classification.py
+++ b/examples/audio-classification/run_audio_classification.py
@@ -47,7 +47,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")
diff --git a/examples/contrastive-image-text/README.md b/examples/contrastive-image-text/README.md
index d61eacb451..c0aa57ac41 100644
--- a/examples/contrastive-image-text/README.md
+++ b/examples/contrastive-image-text/README.md
@@ -115,6 +115,7 @@ PT_HPU_LAZY_MODE=0 python run_clip.py \
--gaudi_config_name Habana/clip \
--throughput_warmup_steps 3 \
--dataloader_num_workers 16 \
+ --sdp_on_bf16 \
--bf16 \
--trust_remote_code \
--torch_compile_backend=hpu_backend \
@@ -274,6 +275,7 @@ python run_clip.py \
--use_hpu_graphs_for_inference \
--gaudi_config_name Habana/clip \
--bf16 \
+ --sdp_on_bf16 \
--mediapipe_dataloader \
--trust_remote_code
```
diff --git a/examples/contrastive-image-text/run_bridgetower.py b/examples/contrastive-image-text/run_bridgetower.py
index b54ca8e7c0..5964b2cdcc 100644
--- a/examples/contrastive-image-text/run_bridgetower.py
+++ b/examples/contrastive-image-text/run_bridgetower.py
@@ -59,7 +59,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")
diff --git a/examples/contrastive-image-text/run_clip.py b/examples/contrastive-image-text/run_clip.py
index 6a8ca235e1..fc3bb4886e 100644
--- a/examples/contrastive-image-text/run_clip.py
+++ b/examples/contrastive-image-text/run_clip.py
@@ -62,7 +62,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")
diff --git a/examples/gaudi_spawn.py b/examples/gaudi_spawn.py
index 0f76dcd379..f282809a31 100644
--- a/examples/gaudi_spawn.py
+++ b/examples/gaudi_spawn.py
@@ -84,7 +84,7 @@ def main():
if not is_deepspeed_available():
raise ImportError(
"--use_deepspeed requires deepspeed: `pip install"
- " git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0`."
+ " git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0`."
)
# Patch sys.argv
diff --git a/examples/image-classification/README.md b/examples/image-classification/README.md
index 08c4d67123..01b19b25ba 100644
--- a/examples/image-classification/README.md
+++ b/examples/image-classification/README.md
@@ -57,6 +57,7 @@ PT_HPU_LAZY_MODE=0 python run_image_classification.py \
--gaudi_config_name Habana/vit \
--throughput_warmup_steps 6 \
--dataloader_num_workers 1 \
+ --sdp_on_bf16 \
--bf16
```
@@ -107,6 +108,7 @@ PT_HPU_LAZY_MODE=0 python run_image_classification.py \
--gaudi_config_name Habana/vit \
--throughput_warmup_steps 3 \
--dataloader_num_workers 1 \
+ --sdp_on_bf16 \
--bf16
```
@@ -211,6 +213,7 @@ PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \
--gaudi_config_name Habana/vit \
--throughput_warmup_steps 8 \
--dataloader_num_workers 1 \
+ --sdp_on_bf16 \
--bf16
```
@@ -298,6 +301,7 @@ python run_image_classification.py \
--use_hpu_graphs_for_inference \
--gaudi_config_name Habana/vit \
--dataloader_num_workers 1 \
+ --sdp_on_bf16 \
--bf16
```
diff --git a/examples/image-classification/run_image_classification.py b/examples/image-classification/run_image_classification.py
index b2694665a3..bc45087f9e 100644
--- a/examples/image-classification/run_image_classification.py
+++ b/examples/image-classification/run_image_classification.py
@@ -64,7 +64,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")
diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md
index 51f4a5dda2..e4dbb05472 100644
--- a/examples/image-to-text/README.md
+++ b/examples/image-to-text/README.md
@@ -44,7 +44,8 @@ python3 run_pipeline.py \
--model_name_or_path Salesforce/blip-image-captioning-large \
--image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \
--use_hpu_graphs \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
To run Llava-1.5-7b inference, use the following command:
@@ -52,7 +53,8 @@ To run Llava-1.5-7b inference, use the following command:
python3 run_pipeline.py \
--model_name_or_path llava-hf/llava-1.5-7b-hf \
--use_hpu_graphs \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
To run Llava-1.5-13b inference, use the following command:
@@ -60,7 +62,8 @@ To run Llava-1.5-13b inference, use the following command:
python3 run_pipeline.py \
--model_name_or_path llava-hf/llava-1.5-13b-hf \
--use_hpu_graphs \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
To run Llava-v1.6-mistral-7b inference, use the following command:
@@ -68,7 +71,8 @@ To run Llava-v1.6-mistral-7b inference, use the following command:
python3 run_pipeline.py \
--model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \
--use_hpu_graphs \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
To run Llava-v1.6-vicuna-13b inference, use the following command:
@@ -76,7 +80,8 @@ To run Llava-v1.6-vicuna-13b inference, use the following command:
python3 run_pipeline.py \
--model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \
--use_hpu_graphs \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
To run Llava-hf/llava-v1.6-34b-hf inference, use the following command:
@@ -84,7 +89,8 @@ To run Llava-hf/llava-v1.6-34b-hf inference, use the following command:
python3 run_pipeline.py \
--model_name_or_path llava-hf/llava-v1.6-34b-hf \
--use_hpu_graphs \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
To run google/paligemma-3b-mix-224 inference, use the following command:
@@ -92,7 +98,8 @@ To run google/paligemma-3b-mix-224 inference, use the following command:
python3 run_pipeline.py \
--model_name_or_path google/paligemma-3b-mix-224 \
--use_hpu_graphs \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
To run Llava-hf/llama3-llava-next-8b-hf inference, use the following command:
@@ -100,7 +107,8 @@ To run Llava-hf/llama3-llava-next-8b-hf inference, use the following command:
python3 run_pipeline.py \
--model_name_or_path llava-hf/llama3-llava-next-8b-hf \
--use_hpu_graphs \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
To run idefics2 inference, use the following command:
@@ -109,7 +117,8 @@ To run idefics2 inference, use the following command:
python3 run_pipeline.py \
--model_name_or_path HuggingFaceM4/idefics2-8b \
--use_hpu_graphs \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
To run mllama inference using reduced precision in the SDPA, use the following command:
@@ -134,7 +143,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \
--model_name_or_path llava-hf/llava-1.5-7b-hf \
--image_path "https://llava-vl.github.io/static/images/view.jpg" \
--use_hpu_graphs \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
Here is an example to quantize the model based on previous measurements for Llava-1.5-7b:
@@ -143,7 +153,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r
--model_name_or_path llava-hf/llava-1.5-7b-hf \
--image_path "https://llava-vl.github.io/static/images/view.jpg" \
--use_hpu_graphs \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
@@ -153,7 +164,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \
--model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \
--image_path "https://llava-vl.github.io/static/images/view.jpg" \
--use_hpu_graphs \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
Here is an example to quantize the model based on previous measurements for Llava-v1.6-mistral-7b:
@@ -162,7 +174,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r
--model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \
--image_path "https://llava-vl.github.io/static/images/view.jpg" \
--use_hpu_graphs \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
Here is an example to measure the tensor quantization statistics on Llava-v1.6-vicuna-13b:
@@ -171,7 +184,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \
--model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \
--image_path "https://llava-vl.github.io/static/images/view.jpg" \
--use_hpu_graphs \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
Here is an example to quantize the model based on previous measurements for Llava-v1.6-vicuna-13b:
@@ -180,7 +194,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r
--model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \
--image_path "https://llava-vl.github.io/static/images/view.jpg" \
--use_hpu_graphs \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
### Inference with FusedSDPA
diff --git a/examples/kubernetes/Dockerfile b/examples/kubernetes/Dockerfile
index 08f2937fca..7ebfd93894 100644
--- a/examples/kubernetes/Dockerfile
+++ b/examples/kubernetes/Dockerfile
@@ -1,7 +1,7 @@
-ARG GAUDI_SW_VER=1.18.0
+ARG GAUDI_SW_VER=1.19.0
ARG OS=ubuntu22.04
-ARG TORCH_VER=2.4.0
-ARG OPTIMUM_HABANA_VER=1.14.0
+ARG TORCH_VER=2.5.1
+ARG OPTIMUM_HABANA_VER=1.15.0
FROM vault.habana.ai/gaudi-docker/${GAUDI_SW_VER}/${OS}/habanalabs/pytorch-installer-${TORCH_VER}:latest AS optimum-habana
diff --git a/examples/kubernetes/README.md b/examples/kubernetes/README.md
index 2ba6b017f1..06f4f01d09 100644
--- a/examples/kubernetes/README.md
+++ b/examples/kubernetes/README.md
@@ -43,12 +43,12 @@ Use the the following commands to build the containers:
```bash
# Specify the Gaudi SW version, OS, and PyTorch version which will be used for the base container
-export GAUDI_SW_VER=1.18.0
+export GAUDI_SW_VER=1.19.0
export OS=ubuntu22.04
-export TORCH_VER=2.4.0
+export TORCH_VER=2.5.1
# Specify the version of optimum-habana to install in the container
-export OPTIMUM_HABANA_VER=1.14.0
+export OPTIMUM_HABANA_VER=1.15.0
git clone https://github.com/huggingface/optimum-habana.git
diff --git a/examples/kubernetes/README.md.gotmpl b/examples/kubernetes/README.md.gotmpl
index 52a2c4fbab..431f8ad611 100644
--- a/examples/kubernetes/README.md.gotmpl
+++ b/examples/kubernetes/README.md.gotmpl
@@ -43,12 +43,12 @@ Use the the following commands to build the containers:
```bash
# Specify the Gaudi SW version, OS, and PyTorch version which will be used for the base container
-export GAUDI_SW_VER=1.18.0
+export GAUDI_SW_VER=1.19.0
export OS=ubuntu22.04
-export TORCH_VER=2.4.0
+export TORCH_VER=2.5.1
# Specify the version of optimum-habana to install in the container
-export OPTIMUM_HABANA_VER=1.14.0
+export OPTIMUM_HABANA_VER=1.15.0
git clone https://github.com/huggingface/optimum-habana.git
diff --git a/examples/kubernetes/docker-compose.yaml b/examples/kubernetes/docker-compose.yaml
index 214707eccb..6bdea75bbd 100644
--- a/examples/kubernetes/docker-compose.yaml
+++ b/examples/kubernetes/docker-compose.yaml
@@ -5,30 +5,30 @@ services:
http_proxy: ${http_proxy:-""}
https_proxy: ${https_proxy:-""}
no_proxy: ${no_proxy:-""}
- GAUDI_SW_VER: ${GAUDI_SW_VER:-1.18.0}
+ GAUDI_SW_VER: ${GAUDI_SW_VER:-1.19.0}
OS: ${OS:-ubuntu22.04}
- OPTIMUM_HABANA_VER: ${OPTIMUM_HABANA_VER:-1.14.0}
- TORCH_VER: ${TORCH_VER:-2.4.0}
+ OPTIMUM_HABANA_VER: ${OPTIMUM_HABANA_VER:-1.15.0}
+ TORCH_VER: ${TORCH_VER:-2.5.1}
REGISTRY: ${REGISTRY}
REPO: ${REPO}
context: .
labels:
- org.opencontainers.base.name: "vault.habana.ai/gaudi-docker/${GAUDI_SW_VER:-1.18.0}/${OS:-ubuntu22.04}/habanalabs/pytorch-installer-${TORCH_VER:-2.3.1}:latest"
+ org.opencontainers.base.name: "vault.habana.ai/gaudi-docker/${GAUDI_SW_VER:-1.19.0}/${OS:-ubuntu22.04}/habanalabs/pytorch-installer-${TORCH_VER:-2.5.1}:latest"
org.opencontainers.image.title: "Optimum for Intel® Gaudi® Accelerators"
- org.opencontainers.image.version: gaudi-${GAUDI_SW_VER:-1.18.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.13.0}
+ org.opencontainers.image.version: gaudi-${GAUDI_SW_VER:-1.19.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.15.0}
command: >
sh -c "python -c 'from optimum import habana; print(\"optimum-habana:\", habana.__version__)'"
- image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-${GAUDI_SW_VER:-1.18.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.13.0}
+ image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-${GAUDI_SW_VER:-1.19.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.15.0}
pull_policy: always
optimum-habana-examples:
build:
labels:
- org.opencontainers.base.name: "${REGISTRY}/${REPO}:gaudi-${GAUDI_SW_VER:-1.18.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.13.0}"
+ org.opencontainers.base.name: "${REGISTRY}/${REPO}:gaudi-${GAUDI_SW_VER:-1.19.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.15.0}"
org.opencontainers.image.title: "Optimum for Intel® Gaudi® Accelerators Examples"
- org.opencontainers.image.version: gaudi-${GAUDI_SW_VER:-1.18.0}-optimum-habana-examples-${OPTIMUM_HABANA_VER:-1.13.0}
+ org.opencontainers.image.version: gaudi-${GAUDI_SW_VER:-1.19.0}-optimum-habana-examples-${OPTIMUM_HABANA_VER:-1.15.0}
target: optimum-habana-examples
command: >
sh -c "python -c 'from optimum import habana; print(\"optimum-habana:\", habana.__version__)'"
extends: optimum-habana
- image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-${GAUDI_SW_VER:-1.18.0}-optimum-habana-examples-${OPTIMUM_HABANA_VER:-1.13.0}
+ image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-${GAUDI_SW_VER:-1.19.0}-optimum-habana-examples-${OPTIMUM_HABANA_VER:-1.15.0}
diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py
index b97b634941..feac065364 100644
--- a/examples/language-modeling/run_clm.py
+++ b/examples/language-modeling/run_clm.py
@@ -63,7 +63,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
diff --git a/examples/language-modeling/run_lora_clm.py b/examples/language-modeling/run_lora_clm.py
index 4782ed58ae..3ff7fbfd3a 100644
--- a/examples/language-modeling/run_lora_clm.py
+++ b/examples/language-modeling/run_lora_clm.py
@@ -70,7 +70,7 @@ def check_optimum_habana_min_version(*a, **b):
logger = logging.getLogger(__name__)
# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks.
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
@dataclass
diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py
index 30315bfc84..2de43c910b 100644
--- a/examples/language-modeling/run_mlm.py
+++ b/examples/language-modeling/run_mlm.py
@@ -62,7 +62,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
diff --git a/examples/language-modeling/run_multitask_prompt_tuning.py b/examples/language-modeling/run_multitask_prompt_tuning.py
index 1d81bcc496..9f955db44e 100644
--- a/examples/language-modeling/run_multitask_prompt_tuning.py
+++ b/examples/language-modeling/run_multitask_prompt_tuning.py
@@ -61,7 +61,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risk.
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
diff --git a/examples/language-modeling/run_prompt_tuning_clm.py b/examples/language-modeling/run_prompt_tuning_clm.py
index e263c0c1b6..44ea542d14 100644
--- a/examples/language-modeling/run_prompt_tuning_clm.py
+++ b/examples/language-modeling/run_prompt_tuning_clm.py
@@ -63,7 +63,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
diff --git a/examples/multi-node-training/EFA/Dockerfile b/examples/multi-node-training/EFA/Dockerfile
index a527f99603..bc6f827164 100644
--- a/examples/multi-node-training/EFA/Dockerfile
+++ b/examples/multi-node-training/EFA/Dockerfile
@@ -1,4 +1,4 @@
-FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
# Installs pdsh and upgrade pip
RUN apt-get update && apt-get install -y pdsh && \
@@ -14,11 +14,12 @@ RUN git clone "https://github.com/HabanaAI/hccl_ofi_wrapper.git" "${OFI_WRAPPER_
RUN sed -i 's/#Port 22/Port 3022/g' /etc/ssh/sshd_config && \
sed -i 's/# Port 22/ Port 3022/g' /etc/ssh/ssh_config && \
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
+ /usr/bin/ssh-keygen -A && \
service ssh restart
# Installs Optimum Habana and Habana's fork of DeepSpeed
RUN pip install optimum[habana] && \
- pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+ pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
CMD ssh-keygen -t rsa -b 4096 -N '' -f ~/.ssh/id_rsa && \
chmod 600 ~/.ssh/id_rsa && \
diff --git a/examples/multi-node-training/GaudiNIC/Dockerfile b/examples/multi-node-training/GaudiNIC/Dockerfile
index b3763c4277..5375a6fcc7 100644
--- a/examples/multi-node-training/GaudiNIC/Dockerfile
+++ b/examples/multi-node-training/GaudiNIC/Dockerfile
@@ -1,4 +1,4 @@
-FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
# Installs pdsh and upgrade pip
RUN apt-get update && apt-get install -y pdsh && \
@@ -8,11 +8,12 @@ RUN apt-get update && apt-get install -y pdsh && \
RUN sed -i 's/#Port 22/Port 3022/g' /etc/ssh/sshd_config && \
sed -i 's/# Port 22/ Port 3022/g' /etc/ssh/ssh_config && \
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
+ /usr/bin/ssh-keygen -A && \
service ssh restart
# Installs Optimum Habana and Habana's fork of DeepSpeed
RUN pip install optimum[habana] && \
- pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+ pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
CMD ssh-keygen -t rsa -b 4096 -N '' -f ~/.ssh/id_rsa && \
chmod 600 ~/.ssh/id_rsa && \
diff --git a/examples/protein-folding/run_esmfold.py b/examples/protein-folding/run_esmfold.py
index 6941e6e5c1..230d1c61e8 100644
--- a/examples/protein-folding/run_esmfold.py
+++ b/examples/protein-folding/run_esmfold.py
@@ -40,7 +40,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks.
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
def convert_outputs_to_pdb(outputs):
diff --git a/examples/protein-folding/run_sequence_classification.py b/examples/protein-folding/run_sequence_classification.py
index dde75a2564..fa35d8b803 100644
--- a/examples/protein-folding/run_sequence_classification.py
+++ b/examples/protein-folding/run_sequence_classification.py
@@ -41,7 +41,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks.
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
diff --git a/examples/protein-folding/run_zero_shot_eval.py b/examples/protein-folding/run_zero_shot_eval.py
index 3b475883e8..7da135f080 100644
--- a/examples/protein-folding/run_zero_shot_eval.py
+++ b/examples/protein-folding/run_zero_shot_eval.py
@@ -36,7 +36,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks.
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
logging.basicConfig(
diff --git a/examples/pytorch-image-models/README.md b/examples/pytorch-image-models/README.md
index 8567a77fe6..f1dc21ddf4 100644
--- a/examples/pytorch-image-models/README.md
+++ b/examples/pytorch-image-models/README.md
@@ -51,7 +51,7 @@ Here we show how to fine-tune the [imagenette2-320 dataset](https://huggingface.
```bash
python train_hpu_lazy.py \
--data-dir ./ \
- --dataset hfds/johnowhitaker/imagenette2-320 \
+ --dataset hfds/johnowhitaker/imagenette2-320 \
--device 'hpu' \
--model resnet50.a1_in1k \
--train-split train \
@@ -65,7 +65,7 @@ python train_hpu_lazy.py --data-dir='./' --dataset hfds/johnowhitaker/imagenette
```bash
python train_hpu_graph.py \
--data-dir ./ \
- --dataset hfds/johnowhitaker/imagenette2-320 \
+ --dataset hfds/johnowhitaker/imagenette2-320 \
--device 'hpu' \
--model resnet50.a1_in1k \
--train-split train \
@@ -98,7 +98,7 @@ Here we show how to fine-tune the [imagenette2-320 dataset](https://huggingface.
torchrun --nnodes 1 --nproc_per_node 2 \
train_hpu_lazy.py \
--data-dir ./ \
- --dataset hfds/johnowhitaker/imagenette2-320 \
+ --dataset hfds/johnowhitaker/imagenette2-320 \
--device 'hpu' \
--model resnet50.a1_in1k \
--train-split train \
@@ -111,7 +111,7 @@ torchrun --nnodes 1 --nproc_per_node 2 \
torchrun --nnodes 1 --nproc_per_node 2 \
train_hpu_graph.py \
--data-dir ./ \
- --dataset hfds/johnowhitaker/imagenette2-320 \
+ --dataset hfds/johnowhitaker/imagenette2-320 \
--device 'hpu' \
--model resnet50.a1_in1k \
--train-split train \
@@ -142,7 +142,7 @@ Here we show how to fine-tune the [imagenette2-320 dataset](https://huggingface.
```bash
python inference.py \
--data-dir='./' \
- --dataset hfds/johnowhitaker/imagenette2-320 \
+ --dataset hfds/johnowhitaker/imagenette2-320 \
--device='hpu' \
--model resnet50.a1_in1k \
--split train \
@@ -153,7 +153,7 @@ python inference.py \
```bash
python inference.py \
--data-dir='./' \
- --dataset hfds/johnowhitaker/imagenette2-320 \
+ --dataset hfds/johnowhitaker/imagenette2-320 \
--device='hpu' \
--model resnet50.a1_in1k \
--split train
diff --git a/examples/question-answering/README.md b/examples/question-answering/README.md
index bf6cd04aec..c7414c777d 100755
--- a/examples/question-answering/README.md
+++ b/examples/question-answering/README.md
@@ -190,14 +190,6 @@ Here is a DeepSpeed configuration you can use to train your models on Gaudi:
}
```
-
-### Training in torch.compile mode
-
-Albert XXL model training in [torch.compile](pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) mode is enabled by applying the following changes to your command, \
-a) Set the following environment variables `PT_HPU_LAZY_MODE=0` and `PT_ENABLE_INT64_SUPPORT=1`. \
-b) Run the above commands with `--model_name_or_path albert-xxlarge-v1`, `--use_lazy_mode False` and add `--torch_compile`, `--torch_compile_backend hpu_backend` and remove `--use_hpu_graphs_for_inference` flags.
-
-
## Fine-tuning Llama on SQuAD1.1
> [!NOTE]
@@ -301,6 +293,7 @@ python run_seq2seq_qa.py \
--pad_to_max_length \
--save_strategy epoch \
--throughput_warmup_steps 3 \
+ --sdp_on_bf16 \
--bf16
```
diff --git a/examples/question-answering/run_qa.py b/examples/question-answering/run_qa.py
index d22949c076..5ad77be381 100644
--- a/examples/question-answering/run_qa.py
+++ b/examples/question-answering/run_qa.py
@@ -61,7 +61,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
diff --git a/examples/question-answering/run_seq2seq_qa.py b/examples/question-answering/run_seq2seq_qa.py
index 1f045552bd..aaadbee417 100644
--- a/examples/question-answering/run_seq2seq_qa.py
+++ b/examples/question-answering/run_seq2seq_qa.py
@@ -57,7 +57,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
diff --git a/examples/speech-recognition/README.md b/examples/speech-recognition/README.md
index 4d5eb69b91..fe80cf775f 100644
--- a/examples/speech-recognition/README.md
+++ b/examples/speech-recognition/README.md
@@ -85,6 +85,7 @@ python run_speech_recognition_ctc.py \
--use_lazy_mode \
--gaudi_config_name="Habana/wav2vec2" \
--throughput_warmup_steps="3" \
+ --sdp_on_bf16 \
--bf16 \
--use_hpu_graphs_for_training \
--use_hpu_graphs_for_inference \
@@ -128,6 +129,7 @@ python ../gaudi_spawn.py \
--gaudi_config_name Habana/wav2vec2 \
--throughput_warmup_steps 3 \
--bf16 \
+ --sdp_on_bf16 \
--use_hpu_graphs_for_training \
--use_hpu_graphs_for_inference \
--sdp_on_bf16
@@ -143,7 +145,7 @@ On 8 HPUs, this script should run in *ca.* 49 minutes and yield a CTC loss of **
> You need to install DeepSpeed with:
> ```bash
-> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
> ```
DeepSpeed can be used with almost the same command as for a multi-card run:
@@ -210,6 +212,7 @@ python run_speech_recognition_ctc.py \
--use_habana \
--use_lazy_mode \
--gaudi_config_name="Habana/wav2vec2" \
+ --sdp_on_bf16 \
--bf16 \
--use_hpu_graphs_for_inference \
--sdp_on_bf16
@@ -250,6 +253,7 @@ python run_speech_recognition_seq2seq.py \
--max_duration_in_seconds="30" \
--text_column_name="sentence" \
--freeze_feature_encoder="False" \
+ --sdp_on_bf16 \
--bf16 \
--overwrite_output_dir \
--do_train \
@@ -259,7 +263,8 @@ python run_speech_recognition_seq2seq.py \
--use_hpu_graphs_for_inference \
--label_features_max_length 128 \
--dataloader_num_workers 8 \
- --throughput_warmup_steps 3
+ --throughput_warmup_steps 3 \
+ --sdp_on_bf16
```
If training on a different language, you should be sure to change the `language` argument. The `language` and `task` arguments should be omitted for English speech recognition.
@@ -289,6 +294,7 @@ python ../gaudi_spawn.py \
--max_duration_in_seconds="30" \
--text_column_name="sentence" \
--freeze_feature_encoder="False" \
+ --sdp_on_bf16 \
--bf16 \
--overwrite_output_dir \
--do_train \
@@ -322,6 +328,7 @@ python run_speech_recognition_seq2seq.py \
--max_duration_in_seconds="30" \
--text_column_name="sentence" \
--freeze_feature_encoder="False" \
+ --sdp_on_bf16 \
--bf16 \
--overwrite_output_dir \
--do_eval \
@@ -329,5 +336,6 @@ python run_speech_recognition_seq2seq.py \
--use_habana \
--use_hpu_graphs_for_inference \
--label_features_max_length 128 \
- --dataloader_num_workers 8
+ --dataloader_num_workers 8 \
+ --sdp_on_bf16
```
diff --git a/examples/speech-recognition/run_speech_recognition_ctc.py b/examples/speech-recognition/run_speech_recognition_ctc.py
index 83865556d1..9d53e58519 100644
--- a/examples/speech-recognition/run_speech_recognition_ctc.py
+++ b/examples/speech-recognition/run_speech_recognition_ctc.py
@@ -60,7 +60,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
diff --git a/examples/speech-recognition/run_speech_recognition_seq2seq.py b/examples/speech-recognition/run_speech_recognition_seq2seq.py
index ff9702e80c..db25b852eb 100755
--- a/examples/speech-recognition/run_speech_recognition_seq2seq.py
+++ b/examples/speech-recognition/run_speech_recognition_seq2seq.py
@@ -56,7 +56,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
diff --git a/examples/stable-diffusion/README.md b/examples/stable-diffusion/README.md
index f4df474f09..00759e9bf1 100644
--- a/examples/stable-diffusion/README.md
+++ b/examples/stable-diffusion/README.md
@@ -44,6 +44,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -65,6 +66,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -83,6 +85,7 @@ python ../gaudi_spawn.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16 \
--distributed
```
@@ -107,6 +110,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion-2 \
+ --sdp_on_bf16 \
--bf16
```
@@ -134,8 +138,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion-2 \
- --ldm3d \
- --bf16
+ --ldm3d
```
Here is how to generate images and depth maps with two prompts on two HPUs:
@@ -180,6 +183,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -200,6 +204,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -221,6 +226,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -241,6 +247,7 @@ python ../gaudi_spawn.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16 \
--distributed
```
@@ -257,6 +264,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16 \
--optimize
```
@@ -273,6 +281,7 @@ QUANT_CONFIG=./quantization/quant_config.json python text_to_image_generation.py
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16 \
--optimize
```
@@ -298,6 +307,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16 \
--num_inference_steps 1 \
--guidance_scale 1.000001 \
@@ -339,6 +349,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -363,6 +374,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -389,6 +401,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -409,6 +422,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16 \
--quant_mode measure
```
@@ -428,6 +442,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16 \
--quant_mode quantize
```
@@ -451,6 +466,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -468,6 +484,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -486,6 +503,7 @@ python ../gaudi_spawn.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16 \
--distributed
```
@@ -505,6 +523,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -524,6 +543,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion-2 \
+ --sdp_on_bf16 \
--bf16
```
@@ -547,6 +567,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -566,6 +587,7 @@ python text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -590,6 +612,7 @@ python image_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -615,6 +638,7 @@ python image_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -639,6 +663,7 @@ python image_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -660,6 +685,7 @@ python image_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -678,6 +704,7 @@ python image_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -693,6 +720,7 @@ python depth_to_image_generation.py \
--image_save_dir /tmp/stable_diffusion_images \
--use_habana \
--use_hpu_graphs \
+ --sdp_on_bf16 \
--bf16
```
@@ -709,6 +737,7 @@ python unconditional_image_generation.py \
--use_habana \
--use_gaudi_ddim_scheduler \
--use_hpu_graphs \
+ --sdp_on_bf16 \
--bf16 \
--save_outputs \
--output_dir "/tmp/"
@@ -725,7 +754,7 @@ Here is how to run the example of controlling brightness. For more details,
please refer to [Hugging Face Diffusers doc](https://huggingface.co/docs/diffusers/main/en/using-diffusers/control_brightness).
```bash
-python text_to_image_generation.py \
+PT_HPU_MAX_COMPOUND_OP_SIZE=1 python text_to_image_generation.py \
--model_name_or_path ptx0/pseudo-journey-v2 \
--prompts "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k" \
--num_images_per_prompt 1 \
@@ -753,6 +782,7 @@ python text_to_image_generation.py \
--use_habana --use_hpu_graphs \
--image_save_dir /tmp/stable_diffusion_images_compel \
--seed 33 \
+ --sdp_on_bf16 \
--bf16 \
--num_inference_steps 20 \
--use_compel
@@ -773,6 +803,7 @@ python text_to_image_generation.py \
--image_save_dir /tmp/stable_diffusion_images_freeu \
--seed 33 \
--use_freeu \
+ --sdp_on_bf16 \
--bf16
```
# Stable Video Diffusion Examples
@@ -799,6 +830,7 @@ python image_to_video_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -823,6 +855,7 @@ python image_to_video_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -858,8 +891,14 @@ python image_to_video_generation.py \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
--bf16 \
+ --sdp_on_bf16 \
--num_frames 14 \
--motion_bucket_id=14 \
--width=512 \
--height=512
```
+
+> [!NOTE]
+> For Gaudi3 only:
+> 1. Due to a known issue, batch sizes for these models need to be reduced. This will be fixed in a future release.
+> 2. The image-to-video ControlNet command is not enabled on Gaudi3.
diff --git a/examples/stable-diffusion/depth_to_image_generation.py b/examples/stable-diffusion/depth_to_image_generation.py
index 570a39b2c3..c32d61a05b 100755
--- a/examples/stable-diffusion/depth_to_image_generation.py
+++ b/examples/stable-diffusion/depth_to_image_generation.py
@@ -41,7 +41,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks.
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
logger = logging.getLogger(__name__)
@@ -172,6 +172,12 @@ def main():
),
)
parser.add_argument("--bf16", action="store_true", help="Whether to perform generation in bf16 precision.")
+ parser.add_argument(
+ "--sdp_on_bf16",
+ action="store_true",
+ default=False,
+ help="Allow pyTorch to use reduced precision in the SDPA math backend",
+ )
parser.add_argument(
"--throughput_warmup_steps",
type=int,
@@ -223,6 +229,7 @@ def main():
"use_habana": args.use_habana,
"use_hpu_graphs": args.use_hpu_graphs,
"gaudi_config": args.gaudi_config_name,
+ "sdp_on_bf16": args.sdp_on_bf16,
}
if args.bf16:
diff --git a/examples/stable-diffusion/image_to_image_generation.py b/examples/stable-diffusion/image_to_image_generation.py
index a9f2f81930..c76d3c0f5a 100755
--- a/examples/stable-diffusion/image_to_image_generation.py
+++ b/examples/stable-diffusion/image_to_image_generation.py
@@ -41,7 +41,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks.
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
logger = logging.getLogger(__name__)
@@ -193,6 +193,12 @@ def main():
),
)
parser.add_argument("--bf16", action="store_true", help="Whether to perform generation in bf16 precision.")
+ parser.add_argument(
+ "--sdp_on_bf16",
+ action="store_true",
+ default=False,
+ help="Allow pyTorch to use reduced precision in the SDPA math backend",
+ )
parser.add_argument(
"--ldm3d", action="store_true", help="Use LDM3D to generate an image and a depth map from a given text prompt."
)
@@ -318,6 +324,7 @@ def main():
output_type=args.output_type,
profiling_warmup_steps=args.profiling_warmup_steps,
profiling_steps=args.profiling_steps,
+ sdp_on_bf16=args.sdp_on_bf16,
**res,
)
elif flux:
diff --git a/examples/stable-diffusion/image_to_video_generation.py b/examples/stable-diffusion/image_to_video_generation.py
index 4112a1b39c..bd704a301b 100755
--- a/examples/stable-diffusion/image_to_video_generation.py
+++ b/examples/stable-diffusion/image_to_video_generation.py
@@ -34,7 +34,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks.
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
logger = logging.getLogger(__name__)
@@ -177,6 +177,12 @@ def main():
),
)
parser.add_argument("--bf16", action="store_true", help="Whether to perform generation in bf16 precision.")
+ parser.add_argument(
+ "--sdp_on_bf16",
+ action="store_true",
+ default=False,
+ help="Allow pyTorch to use reduced precision in the SDPA math backend",
+ )
parser.add_argument("--num_frames", type=int, default=25, help="The number of video frames to generate.")
args = parser.parse_args()
@@ -218,6 +224,7 @@ def main():
"use_habana": args.use_habana,
"use_hpu_graphs": args.use_hpu_graphs,
"gaudi_config": args.gaudi_config_name,
+ "sdp_on_bf16": args.sdp_on_bf16,
}
set_seed(args.seed)
diff --git a/examples/stable-diffusion/text_to_image_generation.py b/examples/stable-diffusion/text_to_image_generation.py
index 4303e7a038..8fd48c99a8 100755
--- a/examples/stable-diffusion/text_to_image_generation.py
+++ b/examples/stable-diffusion/text_to_image_generation.py
@@ -42,7 +42,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks.
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
logger = logging.getLogger(__name__)
diff --git a/examples/stable-diffusion/training/README.md b/examples/stable-diffusion/training/README.md
index a10c194066..afa4a0a61f 100644
--- a/examples/stable-diffusion/training/README.md
+++ b/examples/stable-diffusion/training/README.md
@@ -198,6 +198,7 @@ python train_controlnet.py \
--train_batch_size=4 \
--throughput_warmup_steps=3 \
--use_hpu_graphs \
+ --sdp_on_bf16 \
--bf16 \
--trust_remote_code
```
@@ -217,6 +218,7 @@ python ../../gaudi_spawn.py --use_mpi --world_size 8 train_controlnet.py \
--train_batch_size=4 \
--throughput_warmup_steps 3 \
--use_hpu_graphs \
+ --sdp_on_bf16 \
--bf16 \
--trust_remote_code
```
@@ -295,6 +297,7 @@ python train_text_to_image_sdxl.py \
--gaudi_config_name Habana/stable-diffusion \
--throughput_warmup_steps 3 \
--dataloader_num_workers 8 \
+ --sdp_on_bf16 \
--bf16 \
--use_hpu_graphs_for_training \
--use_hpu_graphs_for_inference \
@@ -330,6 +333,7 @@ python ../../gaudi_spawn.py --world_size 8 --use_mpi train_text_to_image_sdxl.py
--gaudi_config_name Habana/stable-diffusion \
--throughput_warmup_steps 3 \
--dataloader_num_workers 8 \
+ --sdp_on_bf16 \
--bf16 \
--use_hpu_graphs_for_training \
--use_hpu_graphs_for_inference \
@@ -365,6 +369,7 @@ python train_text_to_image_sdxl.py \
--use_hpu_graphs_for_training \
--use_hpu_graphs_for_inference \
--checkpointing_steps 3000 \
+ --sdp_on_bf16 \
--bf16
```
@@ -498,6 +503,7 @@ python ../text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
@@ -695,5 +701,6 @@ python ../text_to_image_generation.py \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
+ --sdp_on_bf16 \
--bf16
```
diff --git a/examples/stable-diffusion/training/train_controlnet.py b/examples/stable-diffusion/training/train_controlnet.py
index 5648952413..004cee5af5 100755
--- a/examples/stable-diffusion/training/train_controlnet.py
+++ b/examples/stable-diffusion/training/train_controlnet.py
@@ -68,7 +68,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks.
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
if is_wandb_available():
import wandb
diff --git a/examples/stable-diffusion/training/train_text_to_image_sdxl.py b/examples/stable-diffusion/training/train_text_to_image_sdxl.py
index c9d84ae1b9..b78c84bbe1 100755
--- a/examples/stable-diffusion/training/train_text_to_image_sdxl.py
+++ b/examples/stable-diffusion/training/train_text_to_image_sdxl.py
@@ -491,6 +491,12 @@ def parse_args(input_args=None):
default=False,
help=("Whether to use bf16 mixed precision."),
)
+ parser.add_argument(
+ "--sdp_on_bf16",
+ action="store_true",
+ default=False,
+ help="Allow pyTorch to use reduced precision in the SDPA math backend",
+ )
parser.add_argument(
"--local_rank",
type=int,
@@ -1421,6 +1427,7 @@ def compute_time_ids(original_size, crops_coords_top_left):
use_habana=True,
use_hpu_graphs=args.use_hpu_graphs_for_inference,
gaudi_config=args.gaudi_config_name,
+ sdp_on_bf16=args.sdp_on_bf16,
)
else:
# vae and text encoders are frozen, only need to update unet
diff --git a/examples/stable-diffusion/unconditional_image_generation.py b/examples/stable-diffusion/unconditional_image_generation.py
index 36e35ff90f..bd70d0e4d6 100755
--- a/examples/stable-diffusion/unconditional_image_generation.py
+++ b/examples/stable-diffusion/unconditional_image_generation.py
@@ -20,7 +20,7 @@ def check_optimum_habana_min_version(*a, **b):
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
# Setup logging
logging.basicConfig(
@@ -68,6 +68,12 @@ def main():
action="store_true",
help="Whether to use bf16 precision for classification.",
)
+ parser.add_argument(
+ "--sdp_on_bf16",
+ action="store_true",
+ default=False,
+ help="Allow pyTorch to use reduced precision in the SDPA math backend",
+ )
parser.add_argument(
"--save_outputs",
action="store_true",
@@ -104,6 +110,7 @@ def main():
"use_habana": args.use_habana,
"use_hpu_graphs": args.use_hpu_graphs,
"gaudi_config": gaudi_config,
+ "sdp_on_bf16": args.sdp_on_bf16,
}
kwargs_call = {"throughput_warmup_steps": args.throughput_warmup_steps}
diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py
index 28498fc0a2..65755d24a2 100755
--- a/examples/summarization/run_summarization.py
+++ b/examples/summarization/run_summarization.py
@@ -66,7 +66,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py
index 57bf7cbb05..68f5e9a2aa 100755
--- a/examples/text-classification/run_glue.py
+++ b/examples/text-classification/run_glue.py
@@ -58,7 +58,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
diff --git a/examples/text-feature-extraction/README.md b/examples/text-feature-extraction/README.md
index 9c34ede54a..2b0d5354ef 100644
--- a/examples/text-feature-extraction/README.md
+++ b/examples/text-feature-extraction/README.md
@@ -28,6 +28,7 @@ python run_feature_extraction.py \
"BERT is a common machine learning architecture for text-based applications." \
"Alexander Hamilton is one of the founding fathers of the United States." \
--use_hpu_graphs \
+ --sdp_on_bf16 \
--bf16
```
diff --git a/examples/text-feature-extraction/run_feature_extraction.py b/examples/text-feature-extraction/run_feature_extraction.py
index 47320b1979..159f36b488 100644
--- a/examples/text-feature-extraction/run_feature_extraction.py
+++ b/examples/text-feature-extraction/run_feature_extraction.py
@@ -83,6 +83,9 @@ def parse_args():
action="store_true",
help="Whether to perform generation in bf16 precision.",
)
+ parser.add_argument(
+ "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend"
+ )
parser.add_argument(
"--warmup",
type=int,
@@ -100,6 +103,8 @@ def parse_args():
def main():
args = parse_args()
+ if args.sdp_on_bf16:
+ torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = AutoModel.from_pretrained(args.model_name_or_path).to("hpu")
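For reference, the `--sdp_on_bf16` flag added throughout these examples boils down to the single PyTorch switch visible above. A minimal sketch of the intended wiring (the helper name is illustrative, not part of the patch):

```python
import torch


def enable_sdp_on_bf16(enabled: bool) -> None:
    # Let the SDPA math backend accumulate in reduced precision (bf16/fp16)
    # instead of fp32, which can trade a little accuracy for speed.
    if enabled:
        torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)


# In the example scripts this is gated on the CLI flag, e.g.:
# enable_sdp_on_bf16(args.sdp_on_bf16)
```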
diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md
index 6da9bc8470..7767443c6e 100755
--- a/examples/text-generation/README.md
+++ b/examples/text-generation/README.md
@@ -33,7 +33,7 @@ pip install -r requirements_lm_eval.txt
Then, if you plan to use [DeepSpeed-inference](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Inference_Using_DeepSpeed.html) (e.g. to use BLOOM/BLOOMZ), you should install DeepSpeed as follows:
```bash
-pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
```
@@ -79,7 +79,8 @@ python run_generation.py \
--use_kv_cache \
--max_new_tokens 100 \
--do_sample \
---prompt "Here is my prompt"
+--prompt "Here is my prompt" \
+--sdp_on_bf16
```
If you want to provide several prompts as inputs, here is how to do it:
@@ -91,7 +92,8 @@ python run_generation.py \
--max_new_tokens 100 \
--do_sample \
--batch_size 2 \
---prompt "Hello world" "How are you?"
+--prompt "Hello world" "How are you?" \
+--sdp_on_bf16
```
> The batch size should be larger than or equal to the number of prompts. Otherwise, only the first N prompts are kept with N being equal to the batch size.
@@ -110,7 +112,8 @@ python run_generation.py \
--use_kv_cache \
--num_return_sequences 1 \
--temperature 0 \
---prompt "Alice and Bob"
+--prompt "Alice and Bob" \
+--sdp_on_bf16
```
### Benchmark
@@ -137,7 +140,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
--batch_size 1 \
--use_hpu_graphs \
--use_kv_cache \
---max_new_tokens 100
+--max_new_tokens 100 \
+--sdp_on_bf16
```
You can also run Llama2-70B on Gaudi2 with all optimizations enabled using the following command:
@@ -152,7 +156,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
--attn_softmax_bf16 \
--limit_hpu_graphs \
--reuse_cache \
---trim_logits
+--trim_logits \
+--sdp_on_bf16
```
To run Falcon-7B inference, use the following command:
@@ -164,7 +169,8 @@ python run_generation.py \
--use_kv_cache \
--batch_size 1 \
--max_new_tokens 128 \
- --do_sample
+ --do_sample \
+ --sdp_on_bf16
```
To run Falcon-40B inference on 8 Gaudi2 cards, use the following command:
@@ -181,6 +187,20 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
--flash_attention_causal_mask
```
+To run Llama3-405B inference on 8 Gaudi3 cards, use the following command:
+```bash
+python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
+--model_name_or_path meta-llama/Llama-3.1-405B-Instruct \
+--max_new_tokens 2048 \
+--bf16 \
+--use_hpu_graphs \
+--use_kv_cache \
+--batch_size 1 \
+--do_sample \
+--use_flash_attention \
+--flash_attention_causal_mask
+```
+
> To be able to run gated models like [StarCoder](https://huggingface.co/bigcode/starcoder), you should:
> - have a HF account
> - agree to the terms of use of the model in its model card on the HF Hub
@@ -195,7 +215,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
> --use_hpu_graphs \
> --use_kv_cache \
> --max_new_tokens 100 \
-> --bf16
+> --bf16 \
+> --sdp_on_bf16
> ```
### Use any dataset from the Hugging Face Hub
@@ -214,7 +235,8 @@ python run_generation.py \
--use_kv_cache \
--dataset_name JulesBelveze/tldr_news \
--column_name content \
---bf16
+--bf16 \
+--sdp_on_bf16
```
> The prompt length is limited to 16 tokens. Prompts longer than this will be truncated.
@@ -233,7 +255,8 @@ python run_generation.py \
--bf16 \
--max_new_tokens 100 \
--prompt "Here is my prompt" \
---peft_model yard1/llama-2-7b-sql-lora-test
+--peft_model yard1/llama-2-7b-sql-lora-test \
+--sdp_on_bf16
```
### Using growing bucket optimization
@@ -329,7 +352,7 @@ PT_ENABLE_INT64_SUPPORT=1 PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py --world_s
### Running with FP8
-Llama2-70b, Llama2-7b, Llama3-70b, Llama3-8b, Mixtral-8x7B, Falcon-7B, Falcon-40B, Falcon-180B and phi-2 in FP8 are enabled using the [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. From synapse 1.17 / optimum-habana 1.13 release, INC is used by default for measuring and quantization. Habana Quantization Toolkit (HQT), which was used earlier, will be removed in future releases. To use HQT, disable INC by setting the following environment variable: `USE_INC=0`.
+Llama2-70b, Llama2-7b, Llama3-70b, Llama3-8b, Mixtral-8x7B, Falcon-7B, Falcon-40B, Falcon-180B, phi-2 and Llama3-405B in FP8 are enabled using the [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. Since the Synapse 1.17 / Optimum Habana 1.13 release, INC is used by default for measuring and quantization. The Habana Quantization Toolkit (HQT), which was used earlier, will be removed in future releases. To use HQT, disable INC by setting the following environment variable: `USE_INC=0`.
More information on enabling fp8 in SynapseAI is available here:
https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html
@@ -452,6 +475,44 @@ QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \
--flash_attention_causal_mask
```
+Here is an example to measure the tensor quantization statistics on Llama3-405B with 8 cards:
+> Please note that Llama3-405B requires a minimum of 16 Gaudi2 cards or 8 Gaudi3 cards.
+```bash
+QUANT_CONFIG=./quantization_config/maxabs_measure_include_outputs.json python ../gaudi_spawn.py \
+--use_deepspeed --world_size 8 run_lm_eval.py \
+-o acc_llama3_405b_bs1_quant.txt \
+--model_name_or_path meta-llama/Llama-3.1-405B-Instruct \
+--use_hpu_graphs \
+--use_kv_cache \
+--trim_logits \
+--batch_size 1 \
+--bf16 \
+--reuse_cache \
+--use_flash_attention \
+--flash_attention_recompute \
+--flash_attention_causal_mask
+```
+
+Here is an example to quantize the model based on previous measurements for Llama3-405B with 8 cards:
+> Please note that Llama3-405B requires a minimum of 16 Gaudi2 cards or 8 Gaudi3 cards.
+```bash
+QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \
+--use_deepspeed --world_size 8 run_generation.py \
+--model_name_or_path meta-llama/Llama-3.1-405B-Instruct \
+--use_hpu_graphs \
+--use_kv_cache \
+--limit_hpu_graphs \
+--max_input_tokens 2048 \
+--max_new_tokens 2048 \
+--batch_size 2 \
+--bf16 \
+--reuse_cache \
+--trim_logits \
+--use_flash_attention \
+--flash_attention_recompute \
+--flash_attention_causal_mask
+```
+
Here is an example to measure the tensor quantization statistics on phi-2 with 1 card:
```bash
@@ -490,7 +551,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_generation.py
--max_new_tokens 100 \
--batch_size 1 \
--reuse_cache \
---bf16
+--bf16 \
+--sdp_on_bf16
```
Here is an example to quantize the model based on previous measurements for gemma with 1 card:
@@ -502,7 +564,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_gemma.json python run_generation
--max_new_tokens 100 \
--batch_size 1 \
--reuse_cache \
---bf16
+--bf16 \
+--sdp_on_bf16
```
@@ -512,14 +575,13 @@ Some bf16 models don't fit on one card due to hpu memory limitation, but in fp8
As measurement is being calculated in bf16 precision, to be able to run fp8 model on single card you should use `unify_measurements` script.
Here are the steps:
1. Measure the model on a number of cards that are enough for the model to fit in BF16.
-2. Quantize the model on the same amount of cards for scales to be saved.
-3. Run unify_measurements.py script using the measurement files created after running steps 1 and 2. A unified measurement is then calculated.
+2. Run the unify_measurements.py script using the measurement files created in step 1. A unified measurement is then calculated.
```bash
python quantization_tools/unify_measurements.py -g 01234567 -m *path_to_8x_measurements* -o *path_to_output_1x_measurement*
```
In the above example, the measurements of cards 0-7 will be unified to a single measurement. For example, if you specify `-g 0123 4567`,
cards 0-3 and cards 4-7 will be unified in two different measurement files. All different group combinations are supported.
-4. Run quantization using the unified measurement file/s.
+3. Run quantization using the unified measurement file(s).
More information on usage of the unifier script can be found in fp8 Habana docs: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html
@@ -567,10 +629,8 @@ https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_UI
Below is an example to load a model with 4bit checkpoints from Hugging Face.
Please note that model name is denoted as ``.
-Additionally, the below env vars are used for performance optimizations, and are planned to be removed in future version:
-`SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1`
+
```bash
-SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1 \
python run_lm_eval.py \
-o acc_load_uint4_model.txt \
--model_name_or_path \
@@ -593,12 +653,10 @@ Currently, only uint4 checkpoints and single-device configurations are supported
More information on enabling 4-bit inference in SynapseAI is available here:
https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_UINT4.html?highlight=inference%20using%20int4#enabling-and-running-uint4-in-pytorch-models.
-Below is an example of loading a llama7b model with a 4bit checkpoint quantized in INC.
+Below is an example of loading a Llama2-7b model with a 4-bit checkpoint quantized with INC.
Please note that the model checkpoint name is denoted as ``.
-Additionally, the following environment variables are used for performance optimizations and are planned to be removed in future versions:
-`SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1`
+
```bash
-SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1 \
python run_lm_eval.py \
-o acc_load_uint4_model.txt \
--model_name_or_path meta-llama/Llama-2-7b-hf \
@@ -643,26 +701,18 @@ For more details see [documentation](https://docs.habana.ai/en/latest/PyTorch/Mo
### Running with UINT4 weight quantization using AutoGPTQ
-
-Llama2-7b in UINT4 weight only quantization is enabled using [AutoGPTQ Fork](https://github.com/HabanaAI/AutoGPTQ), which provides quantization capabilities in PyTorch.
+Llama2-7b in UINT4 weight-only quantization is enabled using [AutoGPTQ](https://github.com/AutoGPTQ/AutoGPTQ), which provides quantization capabilities in PyTorch.
Currently, the support is for UINT4 inference of pre-quantized models only.
```bash
-BUILD_CUDA_EXT=0 python -m pip install -vvv --no-build-isolation git+https://github.com/HabanaAI/AutoGPTQ.git
+BUILD_CUDA_EXT=0 python -m pip install -vvv --no-build-isolation git+https://github.com/AutoGPTQ/AutoGPTQ
```
-You can run a *UINT4 weight quantized* model using AutoGPTQ by setting the following environment variables:
-`SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=true` before running the command,
-and by adding the argument `--load_quantized_model_with_autogptq`.
-
-***Note:***
-Setting the above environment variables improves performance. These variables will be removed in future releases.
-
+You can run a *UINT4 weight quantized* model using AutoGPTQ by adding the argument `--load_quantized_model_with_autogptq`.
Here is an example to run a quantized model :
```bash
-SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false \
-ENABLE_EXPERIMENTAL_FLAGS=true python run_generation.py \
+python run_generation.py \
--attn_softmax_bf16 \
--model_name_or_path \
--use_hpu_graphs \
diff --git a/examples/text-generation/quantization_tools/unify_measurements.py b/examples/text-generation/quantization_tools/unify_measurements.py
index 4282e4ac49..de2b086c2a 100644
--- a/examples/text-generation/quantization_tools/unify_measurements.py
+++ b/examples/text-generation/quantization_tools/unify_measurements.py
@@ -6,49 +6,45 @@
import numpy as np
-def find_measurement_path(measurement, measurements_dir_path, scales, group_size):
+def find_measurement_path(measurement, measurements_dir_path, group_size):
measurment_card = measurement + "_" + str(group_size)
for measurment_file in os.listdir(measurements_dir_path):
filename = os.fsdecode(measurment_file)
if not filename.endswith(".json") or "_mod_list" in filename or measurment_card not in filename:
continue
- if scales:
- if "MAXABS" in filename:
- return os.path.join(measurements_dir_path, measurment_file)
- else:
- if "MAXABS" not in filename:
- return os.path.join(measurements_dir_path, measurment_file)
+ if "MAXABS" not in filename:
+ return os.path.join(measurements_dir_path, measurment_file)
-def unify_measurements(
- measurement_group, measurements_dir_path, output_path, groups_size, groups_num, group_index, scales=False
-):
+
+def unify_measurements(measurement_group, measurements_dir_path, output_path, groups_size, groups_num, group_index):
measurements_paths = []
group_name = ""
# save all the jsons paths in the given measurement group
for measurement in measurement_group:
- measurement_path = find_measurement_path(measurement, measurements_dir_path, scales, groups_size)
- measurements_paths.append(measurement_path)
+ measurement_path = find_measurement_path(measurement, measurements_dir_path, groups_size)
+ if measurement_path is not None:
+ measurements_paths.append(measurement_path)
group_name += measurement
-
# save all the jsons content in the given measurement group
measurements_jsons = []
for measurement_path in measurements_paths:
- with open(measurement_path, "r") as f:
- js = json.load(f)
- measurements_jsons.append(js["Nodes"])
+ if measurement_path is not None:
+ with open(measurement_path, "r") as f:
+ js = json.load(f)
+ measurements_jsons.append(js["Nodes"])
# create a name for the unified json that will be created for this measurement group
if groups_num == 1:
unified_json_name = (
- find_measurement_path(measurement_group[0], measurements_dir_path, scales, groups_size)
+ find_measurement_path(measurement_group[0], measurements_dir_path, groups_size)
.split("/")[-1]
.replace("_" + measurement_group[0] + "_" + str(groups_size), "")
)
else:
unified_json_name = (
- find_measurement_path(measurement_group[0], measurements_dir_path, scales, groups_size)
+ find_measurement_path(measurement_group[0], measurements_dir_path, groups_size)
.split("/")[-1]
.replace(
"_" + measurement_group[0] + "_" + str(groups_size), "_" + str(group_index) + "_" + str(groups_num)
@@ -74,70 +70,27 @@ def unify_measurements(
max_weight = node_values["params"]["weight"]
# iterate over all the measurment group and take the maximum for each tensor and its channel
- if scales:
- for measurement_json in measurements_jsons:
- for i in range(0, len(max_inputs)):
- max_inputs[i] = max(measurement_json[node_name]["inputs"][i], max_inputs[i])
- if max_outputs is not None:
- if isinstance(max_outputs[0], list):
- for i in range(0, len(max_outputs)):
- for j in range(0, len(max_outputs[i])):
- max_outputs[i][j] = max(
- measurement_json[node_name]["outputs"][i][j], max_outputs[i][j]
- )
- else:
- for i in range(0, len(max_outputs)):
- max_outputs[i] = max(measurement_json[node_name]["outputs"][i], max_outputs[i])
- if max_weight is not None:
- if isinstance(max_weight, dict):
- for key, values in max_weight.items():
- for i in range(0, len(values)):
- max_weight[key][i] = max(
- measurement_json[node_name]["params"]["weight"][key][i], max_weight[key][i]
- )
- else:
- max_weight = max(measurement_json[node_name]["params"]["weight"], max_weight)
- else:
- for measurement_json in measurements_jsons:
- for i in range(0, len(max_inputs)):
- for j in range(0, len(max_inputs[i])):
- max_inputs[i][j][0] = max(measurement_json[node_name]["inputs"][i][j][0], max_inputs[i][j][0])
- if max_outputs is not None:
- for i in range(0, len(max_outputs)):
- max_outputs[i][0] = max(measurement_json[node_name]["outputs"][i][0], max_outputs[i][0])
- if max_weight is not None:
- for i in range(0, len(max_weight)):
- max_weight[i][0] = max(measurement_json[node_name]["params"]["weight"][i][0], max_weight[i][0])
-
- # update the maximum in the unified json
- if scales:
- for i in range(0, len(max_inputs)):
- unified_json["Nodes"][node_name]["inputs"][i] = max_inputs[i]
- if max_outputs is not None:
- if isinstance(max_outputs[0], list):
- for i in range(0, len(max_outputs)):
- for j in range(0, len(max_outputs[i])):
- unified_json["Nodes"][node_name]["outputs"][i][j] = max_outputs[i][j]
- else:
- for i in range(0, len(max_outputs)):
- unified_json["Nodes"][node_name]["outputs"][i] = max_outputs[i]
- if max_weight is not None:
- if isinstance(max_weight, dict):
- for key, values in max_weight.items():
- for i in range(0, len(values)):
- unified_json["Nodes"][node_name]["params"]["weight"][key][i] = max_weight[key][i]
- else:
- unified_json["Nodes"][node_name]["params"]["weight"] = max_weight
- else:
+ for measurement_json in measurements_jsons:
for i in range(0, len(max_inputs)):
for j in range(0, len(max_inputs[i])):
- unified_json["Nodes"][node_name]["inputs"][i][j][0] = max_inputs[i][j][0]
+ max_inputs[i][j][0] = max(measurement_json[node_name]["inputs"][i][j][0], max_inputs[i][j][0])
if max_outputs is not None:
for i in range(0, len(max_outputs)):
- unified_json["Nodes"][node_name]["outputs"][i][0] = max_outputs[i][0]
+ max_outputs[i][0] = max(measurement_json[node_name]["outputs"][i][0], max_outputs[i][0])
if max_weight is not None:
for i in range(0, len(max_weight)):
- unified_json["Nodes"][node_name]["params"]["weight"][i][0] = max_weight[i][0]
+ max_weight[i][0] = max(measurement_json[node_name]["params"]["weight"][i][0], max_weight[i][0])
+
+ # update the maximum in the unified json
+ for i in range(0, len(max_inputs)):
+ for j in range(0, len(max_inputs[i])):
+ unified_json["Nodes"][node_name]["inputs"][i][j][0] = max_inputs[i][j][0]
+ if max_outputs is not None:
+ for i in range(0, len(max_outputs)):
+ unified_json["Nodes"][node_name]["outputs"][i][0] = max_outputs[i][0]
+ if max_weight is not None:
+ for i in range(0, len(max_weight)):
+ unified_json["Nodes"][node_name]["params"]["weight"][i][0] = max_weight[i][0]
global_rank = None
local_rank = group_index if groups_num != 1 else -1
mode = ""
@@ -153,10 +106,10 @@ def unify_measurements(
layers[layer] = {}
layers[layer]["inputs"] = [np.array(x) for x in dlayer["inputs"]]
if dlayer.get("outputs") is not None:
- layers[layer]["outputs"] = np.array(dlayer["outputs"])
+ layers[layer]["outputs"] = [np.array(x) for x in dlayer["outputs"]]
if dlayer.get("params") is not None and dlayer["params"].get("weight") is not None:
layers[layer]["params"] = {}
- layers[layer]["params"]["weight"] = np.array(dlayer["params"]["weight"])
+ layers[layer]["params"]["weight"] = [np.array(x) for x in dlayer["params"]["weight"]]
df = {"GlobalRank": global_rank, "LocalRank": local_rank, "Mode": mode, "Nodes": layers}
with open(unified_npz_path, "w"):
np.savez(unified_npz_path, df)
@@ -196,26 +149,14 @@ def main(args):
groups = args.groups
num_jsons_drange = 0
- num_jsons_scales = 0
for path in os.listdir(measurements_path):
- if path.endswith(".json"):
- if "MAXABS" in path:
- num_jsons_scales += 1
- elif "mod_list" not in path:
- num_jsons_drange += 1
- assert (
- os.path.isdir(measurements_path)
- and (num_jsons_drange % len(groups)) == 0
- and (num_jsons_scales % len(groups)) == 0
- )
+ if path.endswith(".json") and "MAXABS" not in path and "mod_list" not in path:
+ num_jsons_drange += 1
+
+ assert os.path.isdir(measurements_path) and (num_jsons_drange % len(groups)) == 0
for group_index, group in enumerate(groups):
- unify_measurements(
- group, measurements_path, output_path, num_jsons_drange, len(groups), group_index, scales=False
- )
- unify_measurements(
- group, measurements_path, output_path, num_jsons_scales, len(groups), group_index, scales=True
- )
+ unify_measurements(group, measurements_path, output_path, num_jsons_drange, len(groups), group_index)
print("finished measurement unifier script")
diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py
index 5355ceb1b6..ef2252a989 100755
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -320,6 +320,9 @@ def setup_parser(parser):
action="store_true",
help="Run the inference with dataset for specified --n_iterations(default:5)",
)
+ parser.add_argument(
+ "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend"
+ )
quant_parser_group = parser.add_mutually_exclusive_group()
quant_parser_group.add_argument(
@@ -389,6 +392,9 @@ def main():
import habana_frameworks.torch.hpu as torch_hpu
+ if args.sdp_on_bf16:
+ torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
+
if args.dataset_name is None:
# Benchmark over the prompts below
if args.prompt:
diff --git a/examples/text-generation/run_lm_eval.py b/examples/text-generation/run_lm_eval.py
index 689860fc7c..152d83ec93 100644
--- a/examples/text-generation/run_lm_eval.py
+++ b/examples/text-generation/run_lm_eval.py
@@ -226,6 +226,7 @@ def main():
for k, v in mem.items():
print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v))
json.dump(results, open(args.output_file, "w"), indent=2)
+ print(json.dumps(results, indent=2))
if args.quant_config:
finalize_quantization(model)
diff --git a/examples/text-generation/text-generation-pipeline/README.md b/examples/text-generation/text-generation-pipeline/README.md
index a10792be2a..2aa036ec3a 100644
--- a/examples/text-generation/text-generation-pipeline/README.md
+++ b/examples/text-generation/text-generation-pipeline/README.md
@@ -22,7 +22,7 @@ The text-generation pipeline can be used to perform text-generation by providing
If you plan to use [DeepSpeed-inference](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Inference_Using_DeepSpeed.html), you should install DeepSpeed as follows:
```bash
-pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
```
If you would like to use the pipeline with LangChain classes, you can install LangChain as follows:
diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py
index 4fe6567f64..6a576799e2 100644
--- a/examples/text-generation/utils.py
+++ b/examples/text-generation/utils.py
@@ -253,12 +253,6 @@ def setup_model(args, model_dtype, model_kwargs, logger):
args.model_name_or_path, torch_dtype=model_dtype, quantization_config=quantization_config, **model_kwargs
)
elif args.load_quantized_model_with_inc:
- # TODO: This will be removed in v1.19 Synapse release
- # Override neural_compressor _load_remaining_pretrained_weight for the Transformer 4.45 release.
- import neural_compressor.torch.algorithms.weight_only.save_load as nc_sl
-
- nc_sl.WOQModelLoader._load_remaining_pretrained_weight = local_load_remaining_pretrained_weight
-
from neural_compressor.torch.quantization import load
model = load(model_name_or_path=args.model_name_or_path, format="huggingface", device="hpu", **model_kwargs)
@@ -277,9 +271,6 @@ def setup_model(args, model_dtype, model_kwargs, logger):
original_model=org_model,
**model_kwargs,
)
- # TODO: This will be removed in v1.19 Synapse release
- # the loaded model should have the same dtype as original_model
- model = model.to(model_kwargs["torch_dtype"])
else:
if args.assistant_model is not None:
assistant_model = AutoModelForCausalLM.from_pretrained(
@@ -739,47 +730,3 @@ def initialize_model(args, logger):
logger.info(f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}")
logger.info(f"Model initialization took {(init_end - init_start):.3f}s")
return model, assistant_model, tokenizer, generation_config
-
-
-# TODO:This will be removed from Synapse v1.19 release.
-# This is to override _load_remaining_pretrained_weight for Transformer 4.45 release.
-def local_load_remaining_pretrained_weight(self, model):
- from transformers.modeling_utils import _load_state_dict_into_meta_model, load_state_dict
-
- resolved_archive_file = self.kwargs.pop("resolved_archive_file", None)
- torch_dtype = self.kwargs.pop("torch_dtype", torch.float32)
- dtype_orig = self.kwargs.pop("dtype_orig", None)
- offload_folder = self.kwargs.pop("offload_folder", None)
- offload_state_dict = self.kwargs.pop("offload_state_dict", False)
-
- # restore default dtype
- if dtype_orig is not None:
- torch.set_default_dtype(dtype_orig)
-
- if not isinstance(resolved_archive_file, list):
- resolved_archive_file = [resolved_archive_file]
- for shard_file in resolved_archive_file:
- state_dict = load_state_dict(shard_file)
-
- params_dict = {
- "model": model,
- "state_dict": state_dict,
- "start_prefix": "",
- "expected_keys": self.loaded_state_dict_keys,
- "device_map": {"": self.device},
- "offload_folder": offload_folder,
- "state_dict_folder": tempfile.mkdtemp() if offload_state_dict else None,
- "state_dict_index": {} if offload_state_dict else None,
- "dtype": torch_dtype,
- "keep_in_fp32_modules": [],
- }
-
- _load_state_dict_into_meta_model(**params_dict)
-
- # make sure token embedding weights are still tied if needed
- model.tie_weights()
-
- # Set model in evaluation mode to deactivate DropOut modules by default
- model.eval()
-
- return model
diff --git a/examples/text-to-video/text_to_video_generation.py b/examples/text-to-video/text_to_video_generation.py
index 4a91359617..8813e321cf 100755
--- a/examples/text-to-video/text_to_video_generation.py
+++ b/examples/text-to-video/text_to_video_generation.py
@@ -37,7 +37,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks.
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
logger = logging.getLogger(__name__)
diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py
index c2def132a7..6f55ae1350 100644
--- a/examples/translation/run_translation.py
+++ b/examples/translation/run_translation.py
@@ -63,7 +63,7 @@ def check_optimum_habana_min_version(*a, **b):
# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
check_min_version("4.45.0")
-check_optimum_habana_min_version("1.14.0.dev0")
+check_optimum_habana_min_version("1.16.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
diff --git a/examples/trl/README.md b/examples/trl/README.md
index b42919c48d..05464309d5 100644
--- a/examples/trl/README.md
+++ b/examples/trl/README.md
@@ -362,6 +362,9 @@ python ddpo.py \
--hf_hub_model_id="ddpo-finetuned-stable-diffusion" \
--push_to_hub False
```
+> [!NOTE]
+> Due to a known issue on Gaudi3, sample_batch_sizes should be changed to 3. The issue will be fixed in a future release.
+
2. Inference using the fine-tuned LoRA weights as shown in the example below:
```python
diff --git a/examples/video-classification/run_example.py b/examples/video-classification/run_example.py
index b593fb5955..2f78883742 100644
--- a/examples/video-classification/run_example.py
+++ b/examples/video-classification/run_example.py
@@ -80,7 +80,10 @@ def run(
warm_up_epcohs: int,
use_hpu_graphs: bool,
cast_bf16: bool,
+ sdp_on_bf16: bool,
):
+ if sdp_on_bf16:
+ torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
processor = VideoMAEImageProcessor.from_pretrained(model_name)
device = torch.device("hpu")
model = VideoMAEForVideoClassification.from_pretrained(model_name)
@@ -152,6 +155,9 @@ def main():
action="store_true",
help="Whether to perform in bf16 precision.",
)
+ parser.add_argument(
+ "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend"
+ )
parser.add_argument(
"--log_level",
default=None,
@@ -176,6 +182,7 @@ def main():
args.warm_up_epochs,
args.use_hpu_graphs,
args.bf16,
+ args.sdp_on_bf16,
)
diff --git a/notebooks/AI_HW_Summit_2022.ipynb b/notebooks/AI_HW_Summit_2022.ipynb
index 2b9bf711b8..4ebb252cf3 100644
--- a/notebooks/AI_HW_Summit_2022.ipynb
+++ b/notebooks/AI_HW_Summit_2022.ipynb
@@ -262,7 +262,7 @@
"metadata": {},
"outputs": [],
"source": [
- "!pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0"
+ "!pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0"
]
},
{
diff --git a/optimum/habana/accelerate/accelerator.py b/optimum/habana/accelerate/accelerator.py
index 8926866cc3..b2d93730a4 100644
--- a/optimum/habana/accelerate/accelerator.py
+++ b/optimum/habana/accelerate/accelerator.py
@@ -156,7 +156,7 @@ def __init__(
if deepspeed_plugin:
if not is_deepspeed_available():
raise ImportError(
- "DeepSpeed is not installed => run `pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0`."
+ "DeepSpeed is not installed => run `pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0`."
)
mixed_precision = (
diff --git a/optimum/habana/accelerate/state.py b/optimum/habana/accelerate/state.py
index 6e507acc2c..ccaa522ac7 100644
--- a/optimum/habana/accelerate/state.py
+++ b/optimum/habana/accelerate/state.py
@@ -56,7 +56,7 @@ def __init__(self, cpu: bool = False, **kwargs):
if not is_deepspeed_available():
raise ImportError(
"DeepSpeed is not available, install it with: `pip install"
- " git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0`."
+ " git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0`."
)
self.distributed_type = GaudiDistributedType.DEEPSPEED
import deepspeed
diff --git a/optimum/habana/diffusers/models/unet_2d_condition.py b/optimum/habana/diffusers/models/unet_2d_condition.py
index 7bb641bbf1..1218c0fc65 100644
--- a/optimum/habana/diffusers/models/unet_2d_condition.py
+++ b/optimum/habana/diffusers/models/unet_2d_condition.py
@@ -1,3 +1,4 @@
+import os
from typing import Any, Dict, Optional, Tuple, Union
import torch
@@ -5,7 +6,12 @@
from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput
from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, torch_utils, unscale_lora_layers
-from optimum.habana.diffusers.utils.torch_utils import gaudi_fourier_filter
+from ..utils.torch_utils import gaudi_fourier_filter
+from .attention_processor import (
+ AttentionProcessor,
+ AttnProcessor2_0,
+ ScaledDotProductAttention,
+)
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -357,3 +363,50 @@ def gaudi_unet_2d_condition_model_forward(
return (sample,)
return UNet2DConditionOutput(sample=sample)
+
+
+def set_attn_processor_hpu(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+ """
+ Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+ Adds an env var PATCH_SDPA check for HPU-specific handling using ScaledDotProductAttention.
+ Sets the attention processor to use to compute attention.
+ Parameters:
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
+ for **all** `Attention` layers.
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+ processor. This is strongly recommended when setting trainable attention processors.
+ """
+ count = len(self.attn_processors.keys())
+ if isinstance(processor, dict) and len(processor) != count:
+ raise ValueError(
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+ )
+
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+ if hasattr(module, "set_processor"):
+ if os.environ.get("PATCH_SDPA") is not None:
+ setattr(module, "attention_module", ScaledDotProductAttention())
+ module.set_processor(processor(module.attention_module))
+ else:
+ if isinstance(processor, dict):
+ attention_processor = processor.pop(f"{name}.processor", None)
+ if attention_processor is not None:
+ module.set_processor(attention_processor)
+ else:
+ module.set_processor(processor)
+ for sub_name, child in module.named_children():
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+ for name, module in self.named_children():
+ fn_recursive_attn_processor(name, module, processor)
+
+
+def set_default_attn_processor_hpu(self):
+ """
+ Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+ Disables custom attention processors and sets the default attention implementation from HPU.
+ """
+ processor = AttnProcessor2_0()
+ set_attn_processor_hpu(self, processor)
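The two helpers moved here only diverge from stock diffusers when the `PATCH_SDPA` environment variable is set: in that case each `Attention` module gets a `ScaledDotProductAttention` instance and the processor is built around it. A simplified sketch of the per-module branch (the function name is illustrative):

```python
import os

import torch
from optimum.habana.diffusers.models.attention_processor import ScaledDotProductAttention


def assign_processor(module: torch.nn.Module, processor) -> None:
    # Condensed version of the branch inside set_attn_processor_hpu:
    # with PATCH_SDPA set, wrap an HPU ScaledDotProductAttention module into the processor;
    # otherwise attach the processor exactly as stock diffusers does.
    if os.environ.get("PATCH_SDPA") is not None:
        module.attention_module = ScaledDotProductAttention()
        module.set_processor(processor(module.attention_module))
    else:
        module.set_processor(processor)
```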
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 630bc9c18b..ff9a139839 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
-import os
import time
from dataclasses import dataclass
from math import ceil
@@ -30,15 +29,11 @@
from diffusers.utils import BaseOutput, deprecate
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
-from optimum.habana.diffusers.models.attention_processor import (
- AttentionProcessor,
- AttnProcessor2_0,
- ScaledDotProductAttention,
-)
from optimum.utils import logging
from ....transformers.gaudi_configuration import GaudiConfig
from ....utils import HabanaProfile, speed_metrics, warmup_inference_steps_time_adjustment
+from ...models.unet_2d_condition import set_default_attn_processor_hpu
from ..pipeline_utils import GaudiDiffusionPipeline
@@ -101,59 +96,6 @@ def retrieve_timesteps(
return timesteps, num_inference_steps
-def set_attn_processor_hpu(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
- """
- Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
- Added env PATCH_SDPA for HPU specific handle to use ScaledDotProductAttention.
- Sets the attention processor to use to compute attention.
- Parameters:
- processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
- The instantiated processor class or a dictionary of processor classes that will be set as the processor
- for **all** `Attention` layers.
-
- If `processor` is a dict, the key needs to define the path to the corresponding cross attention
- processor. This is strongly recommended when setting trainable attention processors.
-
- """
-
- count = len(self.attn_processors.keys())
-
- if isinstance(processor, dict) and len(processor) != count:
- raise ValueError(
- f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
- f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
- )
-
- def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
- if hasattr(module, "set_processor"):
- if os.environ.get("PATCH_SDPA") is not None:
- setattr(module, "attention_module", ScaledDotProductAttention())
- module.set_processor(processor(module.attention_module))
- else:
- if isinstance(processor, dict):
- attention_processor = processor.pop(f"{name}.processor", None)
- if attention_processor is not None:
- module.set_processor(attention_processor)
- else:
- module.set_processor(processor)
-
- for sub_name, child in module.named_children():
- fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
- for name, module in self.named_children():
- fn_recursive_attn_processor(name, module, processor)
-
-
-def set_default_attn_processor_hpu(self):
- """
- Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
- Disables custom attention processors and sets the default attention implementation from HPU.
- """
-
- processor = AttnProcessor2_0()
- set_attn_processor_hpu(self, processor)
-
-
class GaudiStableDiffusionPipeline(GaudiDiffusionPipeline, StableDiffusionPipeline):
"""
Adapted from: https://github.com/huggingface/diffusers/blob/v0.23.1/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L73
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
index f4a0dbd244..6a1b74d129 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
import time
from dataclasses import dataclass
from math import ceil
@@ -38,6 +37,7 @@
from ....transformers.gaudi_configuration import GaudiConfig
from ....utils import HabanaProfile, speed_metrics, warmup_inference_steps_time_adjustment
+from ...models.unet_2d_condition import set_default_attn_processor_hpu
from ..pipeline_utils import GaudiDiffusionPipeline
from ..stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps
@@ -138,6 +138,8 @@ def __init__(
force_zeros_for_empty_prompt,
)
+ self.unet.set_default_attn_processor = set_default_attn_processor_hpu
+
self.to(self._device)
def prepare_latents(self, num_images, num_channels_latents, height, width, dtype, device, generator, latents=None):
diff --git a/optimum/habana/transformers/models/llama/modeling_llama.py b/optimum/habana/transformers/models/llama/modeling_llama.py
index 6cd3de0a72..67f07437a1 100755
--- a/optimum/habana/transformers/models/llama/modeling_llama.py
+++ b/optimum/habana/transformers/models/llama/modeling_llama.py
@@ -635,7 +635,9 @@ def pre_attn_forward(
)
position_ids = position_ids.unsqueeze(0)
- query_states, key_states = apply_customized_rope(query_states, key_states, cos, sin, position_ids)
+ query_states, key_states = apply_customized_rope(
+ query_states, key_states, cos, sin, position_ids, self.training
+ )
if use_cache:
# reuse k, v, self_attention
diff --git a/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py b/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py
index 0e1378ee57..4608a56d3f 100644
--- a/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py
+++ b/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -15,6 +15,7 @@
# limitations under the License.
import os
+import random
from typing import Optional, Tuple, Union
import torch
@@ -68,7 +69,7 @@ def _gaudi_wav2vec2_compute_mask_indices(
)
# epsilon is used for probabilistic rounding
- epsilon = torch.rand([], device="hpu")
+ epsilon = torch.rand(1).item()
def compute_num_masked_span(input_length):
"""Given input length, compute how many spans should be masked"""
@@ -106,19 +107,9 @@ def compute_num_masked_span(input_length):
num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask
- """
- Original code:
- spec_aug_mask_idx = np.random.choice(
- np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
+ spec_aug_mask_idx = torch.tensor(
+ random.sample(range(input_length - (mask_length - 1)), num_masked_span), dtype=torch.int32
)
- When (input_length - (mask_length - 1) < 0), then num_masked_span=0
- and we get: spec_aug_mask_idx=array([], dtype=int64)
- However torch rewrite fails, because torch.randperm expects positive number
- This causes a unit test to fail:
- RUN_SLOW=true GAUDI2_CI=1 python -m pytest tests/transformers/tests/models/wav2vec2/test_modeling_wav2vec2.py -v -s -k test_compute_mask_indices_short_audio
- """
- spec_aug_mask_idx = torch.randperm(input_length - (mask_length - 1), device="hpu")[:num_masked_span]
-
# pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
@@ -133,13 +124,12 @@ def compute_num_masked_span(input_length):
spec_aug_mask_idx = torch.cat(
[
spec_aug_mask_idx,
- torch.ones(max_num_masked_span - num_masked_span, dtype=torch.int32, device="hpu") * dummy_mask_idx,
+ torch.ones(max_num_masked_span - num_masked_span, dtype=torch.int32) * dummy_mask_idx,
]
)
spec_aug_mask_idxs.append(spec_aug_mask_idx.to(dtype=torch.long))
- spec_aug_mask_idxs = torch.vstack(spec_aug_mask_idxs)
-
+ spec_aug_mask_idxs = torch.vstack(spec_aug_mask_idxs).to("hpu")
# expand masked indices to masked spans
spec_aug_mask_idxs = torch.broadcast_to(
spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
@@ -248,7 +238,7 @@ def gaudi_wav2vec2_encoder_forward(
all_hidden_states = all_hidden_states + (hidden_states,)
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
- dropout_probability = torch.rand([], device="hpu")
+ dropout_probability = torch.rand([])
skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
if not skip_the_layer or deepspeed_zero3_is_enabled:
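The wav2vec2 hunks above move SpecAugment index sampling off the device: start indices are drawn on CPU with random.sample, each row is padded with a dummy index, and only the final stacked tensor is transferred to HPU. A condensed, hypothetical helper showing the same pattern:

import random
import torch

def sample_mask_starts(spans, mask_length, max_num_masked_span, device="cpu"):
    # `spans` is a list of (input_length, num_masked_span) pairs; pass device="hpu" on Gaudi
    rows = []
    for input_length, num_masked_span in spans:
        if num_masked_span == 0:
            rows.append(torch.zeros(max_num_masked_span, dtype=torch.long))
            continue
        starts = torch.tensor(
            random.sample(range(input_length - (mask_length - 1)), num_masked_span),
            dtype=torch.int32,
        )
        dummy_mask_idx = starts[0]  # pad with the first sampled index
        pad = torch.ones(max_num_masked_span - num_masked_span, dtype=torch.int32) * dummy_mask_idx
        rows.append(torch.cat([starts, pad]).to(torch.long))
    # one transfer at the end instead of per-row device-side ops
    return torch.vstack(rows).to(device)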
diff --git a/optimum/habana/utils.py b/optimum/habana/utils.py
index a3aa728e81..528be1b407 100755
--- a/optimum/habana/utils.py
+++ b/optimum/habana/utils.py
@@ -31,7 +31,7 @@
logger = logging.get_logger(__name__)
-CURRENTLY_VALIDATED_SYNAPSE_VERSION = version.parse("1.18.0")
+CURRENTLY_VALIDATED_SYNAPSE_VERSION = version.parse("1.19.0")
def to_device_dtype(my_input: Any, target_device: torch.device = None, target_dtype: torch.dtype = None):
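The validated SynapseAI version moves to 1.19.0. A small illustration (hypothetical function name) of how such a constant is typically consumed, warning when the installed stack differs from the validated release:

from packaging import version

CURRENTLY_VALIDATED_SYNAPSE_VERSION = version.parse("1.19.0")

def warn_if_unvalidated(installed_version: str):
    if version.parse(installed_version) != CURRENTLY_VALIDATED_SYNAPSE_VERSION:
        print(
            f"optimum-habana was validated against SynapseAI {CURRENTLY_VALIDATED_SYNAPSE_VERSION}, "
            f"but version {installed_version} was found; behavior may differ."
        )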
diff --git a/optimum/habana/version.py b/optimum/habana/version.py
index 56cc966b00..2610736245 100644
--- a/optimum/habana/version.py
+++ b/optimum/habana/version.py
@@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-__version__ = "1.15.0.dev0"
+__version__ = "1.16.0.dev0"
diff --git a/tests/baselines/Llama_3_2_11B_Vision_Instruct.json b/tests/baselines/Llama_3_2_11B_Vision_Instruct.json
index 3789c63fa9..c2a58cc25c 100644
--- a/tests/baselines/Llama_3_2_11B_Vision_Instruct.json
+++ b/tests/baselines/Llama_3_2_11B_Vision_Instruct.json
@@ -8,7 +8,7 @@
"learning_rate": 5e-5,
"train_batch_size": 2,
"train_runtime": 470,
- "train_samples_per_second": 22,
+ "train_samples_per_second": 20.48,
"eval_accuracy": 0.6,
"extra_arguments": [
"--bf16",
diff --git a/tests/baselines/bert_large_uncased_whole_word_masking.json b/tests/baselines/bert_large_uncased_whole_word_masking.json
index 37948b9746..605e719faf 100755
--- a/tests/baselines/bert_large_uncased_whole_word_masking.json
+++ b/tests/baselines/bert_large_uncased_whole_word_masking.json
@@ -77,7 +77,7 @@
"learning_rate": 3e-5,
"train_batch_size": 32,
"eval_f1": 91.71,
- "train_runtime": 77.307,
+ "train_runtime": 80.307,
"train_samples_per_second": 2150.333,
"extra_arguments": [
"--max_seq_length 384",
@@ -95,7 +95,7 @@
"train_batch_size": 256,
"eval_f1": 0.867,
"train_runtime": 33.2909,
- "train_samples_per_second": 1151.598,
+ "train_samples_per_second": 1100.598,
"extra_arguments": [
"--max_seq_length 128",
"--use_hpu_graphs_for_inference"
@@ -115,4 +115,4 @@
}
}
}
-}
\ No newline at end of file
+}
diff --git a/tests/baselines/roberta_large.json b/tests/baselines/roberta_large.json
index 8b9037b32b..4d7233e089 100755
--- a/tests/baselines/roberta_large.json
+++ b/tests/baselines/roberta_large.json
@@ -67,7 +67,7 @@
"learning_rate": 7e-5,
"train_batch_size": 32,
"eval_f1": 94.09,
- "train_runtime": 77.333,
+ "train_runtime": 79.333,
"train_samples_per_second": 2138.366,
"extra_arguments": [
"--max_seq_length 384",
@@ -95,4 +95,4 @@
}
}
}
-}
\ No newline at end of file
+}
diff --git a/tests/baselines/whisper_small.json b/tests/baselines/whisper_small.json
index d1a563c9ff..055d321152 100644
--- a/tests/baselines/whisper_small.json
+++ b/tests/baselines/whisper_small.json
@@ -41,10 +41,10 @@
"multi_card": {
"learning_rate": 8e-5,
"train_batch_size": 32,
- "eval_wer": 0.3806988352745424,
- "train_runtime": 312.5894,
- "train_samples_per_second": 280.111,
- "eval_samples_per_second": 19.073,
+ "eval_wer": 0.4693843594009983,
+ "train_runtime": 380.00,
+ "train_samples_per_second": 218.0,
+ "eval_samples_per_second": 31.0,
"extra_arguments": [
"--dataset_config_name hi",
"--language hindi",
diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py
index 97bbb7632d..03663b7fc8 100755
--- a/tests/test_diffusers.py
+++ b/tests/test_diffusers.py
@@ -1348,6 +1348,7 @@ def test_stable_diffusion_xl_inference_script(self):
--image_save_dir {run_dir}
--use_habana
--gaudi_config Habana/stable-diffusion
+ --sdp_on_bf16
--bf16
""".split()
cmd_line.append("--prompts")
@@ -1390,6 +1391,7 @@ def _sdxl_generation(self, scheduler: str, batch_size: int, num_images_per_promp
"stabilityai/stable-diffusion-xl-base-1.0",
**kwargs,
)
+ pipeline.unet.set_default_attn_processor(pipeline.unet)
num_images_per_prompt = num_images_per_prompt
res = {}
outputs = pipeline(
@@ -2466,6 +2468,7 @@ def test_train_text_to_image_sdxl(self):
--dataloader_num_workers 8
--use_hpu_graphs_for_training
--use_hpu_graphs_for_inference
+ --sdp_on_bf16
--bf16
--adjust_throughput
--center_crop
@@ -2474,7 +2477,7 @@ def test_train_text_to_image_sdxl(self):
--output_dir {tmpdir}
""".split()
- # Run train_text_to_image_sdxl.y
+ # Run train_text_to_image_sdxl.py
p = subprocess.Popen(cmd_line)
return_code = p.wait()
@@ -2548,6 +2551,7 @@ def test_train_controlnet(self):
--checkpointing_steps 1000
--throughput_warmup_steps 3
--use_hpu_graphs
+ --sdp_on_bf16
--bf16
--max_train_steps 10
--output_dir {tmpdir}
@@ -3718,6 +3722,7 @@ def test_deterministic_image_generation(self):
--use_habana
--use_hpu_graphs
--gaudi_config Habana/stable-diffusion
+ --sdp_on_bf16
--bf16
--use_cpu_rng
""".split()
diff --git a/tests/test_encoder_decoder.py b/tests/test_encoder_decoder.py
index 27dd1b75c2..20d808b69f 100644
--- a/tests/test_encoder_decoder.py
+++ b/tests/test_encoder_decoder.py
@@ -206,6 +206,9 @@ def _test_text_translation(
if "opus-mt-zh-en" in model_name:
command_args.append("--max_source_length 512")
+ if "Babelscape/mrebel-large" in model_name or "nllb-200-distilled-600M" in model_name:
+ command_args.append("--sdp_on_bf16")
+
command = self._build_command(
task=task,
deepspeed=deepspeed,
diff --git a/tests/test_examples.py b/tests/test_examples.py
index 20f26f9012..a1271d4da1 100644
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -528,6 +528,34 @@ def test(self):
env_variables["PT_HPU_LAZY_MODE"] = "0"
env_variables["PT_ENABLE_INT64_SUPPORT"] = "1"
+ if self.EXAMPLE_NAME == "run_audio_classification":
+ extra_command_line_arguments.append("--sdp_on_bf16")
+
+ if self.EXAMPLE_NAME == "run_image_classification":
+ extra_command_line_arguments.append("--sdp_on_bf16")
+
+ if self.EXAMPLE_NAME == "run_glue":
+ if model_name == "bert-large-uncased-whole-word-masking":
+ extra_command_line_arguments.append("--sdp_on_bf16")
+
+ if self.EXAMPLE_NAME == "run_qa":
+ if model_name == "bert-large-uncased-whole-word-masking" or model_name == "albert-large-v2":
+ extra_command_line_arguments.append("--sdp_on_bf16")
+
+ if self.EXAMPLE_NAME == "run_bridgetower":
+ if model_name == "BridgeTower/bridgetower-large-itm-mlm-itc":
+ extra_command_line_arguments.append("--sdp_on_bf16")
+
+ if self.EXAMPLE_NAME == "run_speech_recognition_seq2seq":
+ if model_name == "openai/whisper-small":
+ extra_command_line_arguments.append("--sdp_on_bf16")
+
+ if self.EXAMPLE_NAME == "run_clip":
+ extra_command_line_arguments.append("--sdp_on_bf16")
+
+ if self.EXAMPLE_NAME == "run_image2text_lora_finetune":
+ extra_command_line_arguments.append("--sdp_on_bf16")
+
with TemporaryDirectory() as tmp_dir:
cmd_line = self._create_command_line(
multi_card,
diff --git a/tests/test_fsdp_examples.py b/tests/test_fsdp_examples.py
index 6335f28ebf..180a2bb3f9 100644
--- a/tests/test_fsdp_examples.py
+++ b/tests/test_fsdp_examples.py
@@ -97,6 +97,7 @@ def _test_fsdp(
f"--gaudi_config_name {gaudi_config}",
"--throughput_warmup_steps 100",
"--do_eval",
+ "--sdp_on_bf16",
]
else:
command += [
diff --git a/tests/test_image_to_text_example.py b/tests/test_image_to_text_example.py
index c73d4d0565..36143e8f91 100644
--- a/tests/test_image_to_text_example.py
+++ b/tests/test_image_to_text_example.py
@@ -67,6 +67,11 @@ def _test_image_to_text(
"--use_hpu_graphs",
]
+ if "meta-llama/Llama-3.2-11B-Vision-Instruct" in model_name or "tiiuae/falcon-11B-vlm" in model_name:
+ command += [
+ "--sdp_on_bf16",
+ ]
+
command.append("--bf16")
command.append("--sdp_on_bf16")
diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py
index 10ac6b7adb..ec1cc67475 100644
--- a/tests/test_text_generation_example.py
+++ b/tests/test_text_generation_example.py
@@ -44,7 +44,7 @@
("google/gemma-7b", 1, False, 109.70751574382221, True),
("google/gemma-2-9b", 1, False, 92.302359446567, True),
("state-spaces/mamba-130m-hf", 1536, False, 5385.511100161605, False),
- ("Deci/DeciLM-7B", 1, False, 120, False),
+ ("Deci/DeciLM-7B", 1, False, 115, False),
("Qwen/Qwen2-7B", 256, False, 8870.945160540245, True),
("Qwen/Qwen1.5-MoE-A2.7B", 1, True, 44.25834541569395, False),
("EleutherAI/gpt-neo-2.7B", 1, False, 257.2476416844122, False),
@@ -218,6 +218,12 @@ def _test_text_generation(
if "gemma" in model_name.lower():
command += ["--use_flash_attention"]
+ if "decilm" in model_name.lower():
+ command += ["--sdp_on_bf16"]
+
+ if "mamba-130m-hf" in model_name.lower():
+ command += ["--sdp_on_bf16"]
+
if (reuse_cache or torch_compile) and not parallel_strategy == "tp" and not is_starcoder_first_gen_model:
command += ["--reuse_cache"]