diff --git a/setup.py b/setup.py
index f79c9d3d2eb40..3b58e2a517d3a 100644
--- a/setup.py
+++ b/setup.py
@@ -54,6 +54,7 @@ def parse_arg_remove_string(argv, arg_name_equal):
 wheel_name_suffix = parse_arg_remove_string(sys.argv, "--wheel_name_suffix=")
 
 cuda_version = None
+is_cuda_version_12 = False  # set to True only for a "gpu" wheel built with a CUDA 12 --cuda_version
 rocm_version = None
 is_migraphx = False
 is_rocm = False
@@ -63,6 +64,8 @@ def parse_arg_remove_string(argv, arg_name_equal):
 if wheel_name_suffix == "gpu":
     # TODO: how to support multiple CUDA versions?
     cuda_version = parse_arg_remove_string(sys.argv, "--cuda_version=")
+    if cuda_version:
+        is_cuda_version_12 = cuda_version.split(".")[0] == "12"  # compare the major part so a bare "12" counts too
 elif parse_arg_remove_boolean(sys.argv, "--use_rocm"):
     is_rocm = True
     rocm_version = parse_arg_remove_string(sys.argv, "--rocm_version=")
@@ -721,7 +724,6 @@ def reformat_run_count(count_str):
 with open(requirements_path) as f:
     install_requires = f.read().splitlines()
 
-
 if enable_training:
 
     def save_build_and_package_info(package_name, version_number, cuda_version, rocm_version):
@@ -754,6 +756,20 @@ def save_build_and_package_info(package_name, version_number, cuda_version, rocm
 
     save_build_and_package_info(package_name, version_number, cuda_version, rocm_version)
 
+# Optional pip extras for the CUDA 12 build of onnxruntime-gpu, e.g.
+#   pip install "onnxruntime-gpu[cuda,cudnn]"
+# Every other package name / CUDA version combination declares no extras.
+extras_require = {}
+if is_cuda_version_12 and package_name == "onnxruntime-gpu":
+    extras_require["cuda"] = [
+        "nvidia-cuda-nvrtc-cu12~=12.0",
+        "nvidia-cuda-runtime-cu12~=12.0",
+        "nvidia-cufft-cu12~=11.0",
+        "nvidia-curand-cu12~=10.0",
+    ]
+    extras_require["cudnn"] = ["nvidia-cudnn-cu12~=9.0"]
+
+
 # Setup
 setup(
     name=package_name,
@@ -771,6 +787,7 @@ def save_build_and_package_info(package_name, version_number, cuda_version, rocm
     download_url="https://github.com/microsoft/onnxruntime/tags",
     data_files=data_files,
     install_requires=install_requires,
+    extras_require=extras_require,
     python_requires=">=3.10",
     keywords="onnx machine learning",
     entry_points={
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index c234a69a73ed8..b2b44e587cb09 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -2366,11 +2366,32 @@ def run_nodejs_tests(nodejs_binding_dir):
     run_subprocess(args, cwd=nodejs_binding_dir)
 
 
+def parse_cuda_version_from_json(cuda_home):
+    """Return "<major>.<minor>" read from <cuda_home>/version.json, or "" if it cannot be determined."""
+    version_file_path = os.path.join(cuda_home, "version.json")
+    try:
+        with open(version_file_path, encoding="utf-8") as version_file:
+            version_data = json.load(version_file)
+    except FileNotFoundError:
+        print(f"version.json not found in {cuda_home}.")
+        return ""
+    except ValueError:  # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses
+        print(f"Error decoding JSON from version.json in {cuda_home}.")
+        return ""
+    # Best effort: an unexpected JSON shape yields "" instead of crashing the wheel build.
+    cuda_info = version_data.get("cuda") if isinstance(version_data, dict) else None
+    version = cuda_info.get("version") if isinstance(cuda_info, dict) else None
+    if not isinstance(version, str):
+        return ""
+    return ".".join(version.split(".")[:2])
+
+
 def build_python_wheel(
     source_dir,
     build_dir,
     configs,
     use_cuda,
+    cuda_home,
     cuda_version,
     use_rocm,
     use_migraphx,
@@ -2418,6 +2439,7 @@ def build_python_wheel(
         if use_cuda:
             # The following line assumes no other EP is enabled
             args.append("--wheel_name_suffix=gpu")
+            cuda_version = cuda_version or parse_cuda_version_from_json(cuda_home)
             if cuda_version:
                 args.append(f"--cuda_version={cuda_version}")
         elif use_rocm:
@@ -3075,6 +3097,7 @@ def main():
                 build_dir,
                 configs,
                 args.use_cuda,
+                cuda_home,
                 args.cuda_version,
                 args.use_rocm,
                 args.use_migraphx,
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
index f48573abd3dba..265174665e840 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
@@ -56,7 +56,7 @@ stages:
           PYTHON_VERSION: ${{ python_version }}
           EP_NAME: gpu
           CudaVersion: ${{ parameters.cuda_version }}
-          EP_BUILD_FLAGS: --enable_lto --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75;80;90"
+          EP_BUILD_FLAGS: --enable_lto --use_cuda --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75;80;90"
           use_tensorrt: True
 
   - ${{ if eq(parameters.enable_linux_cuda, true) }}:
@@ -80,4 +80,4 @@ stages:
           PYTHON_VERSION: ${{ python_version }}
           EP_BUILD_FLAGS: --use_dml --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 --enable_wcos
           EP_NAME: directml
-          cmake_build_type: ${{ parameters.cmake_build_type }}
\ No newline at end of file
+          cmake_build_type: ${{ parameters.cmake_build_type }}