diff --git a/tabrepo/benchmark/models/ag/__init__.py b/tabrepo/benchmark/models/ag/__init__.py index 4cfded7c..5f914646 100644 --- a/tabrepo/benchmark/models/ag/__init__.py +++ b/tabrepo/benchmark/models/ag/__init__.py @@ -1,16 +1,20 @@ from __future__ import annotations from tabrepo.benchmark.models.ag.ebm.ebm_model import ExplainableBoostingMachineModel +from tabrepo.benchmark.models.ag.limix.limix_model import LimiXModel from tabrepo.benchmark.models.ag.modernnca.modernnca_model import ModernNCAModel from tabrepo.benchmark.models.ag.realmlp.realmlp_model import RealMLPModel from tabrepo.benchmark.models.ag.tabdpt.tabdpt_model import TabDPTModel from tabrepo.benchmark.models.ag.tabicl.tabicl_model import TabICLModel from tabrepo.benchmark.models.ag.tabm.tabm_model import TabMModel -from tabrepo.benchmark.models.ag.tabpfnv2.tabpfnv2_client_model import TabPFNV2ClientModel +from tabrepo.benchmark.models.ag.tabpfnv2.tabpfnv2_client_model import ( + TabPFNV2ClientModel, +) from tabrepo.benchmark.models.ag.tabpfnv2.tabpfnv2_model import TabPFNV2Model __all__ = [ "ExplainableBoostingMachineModel", + "LimiXModel", "ModernNCAModel", "RealMLPModel", "TabDPTModel", diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/LICENSE.txt b/tabrepo/benchmark/models/ag/limix/LimiX/LICENSE.txt new file mode 100644 index 00000000..ac4aee55 --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright Zhengxiao Du + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/config/cls_default_noretrieval.json b/tabrepo/benchmark/models/ag/limix/LimiX/config/cls_default_noretrieval.json new file mode 100644 index 00000000..a51a30de --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/config/cls_default_noretrieval.json @@ -0,0 +1,102 @@ +[ + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_10" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_10" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "numeric" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "numeric" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + } +] \ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/config/cls_default_retrieval.json b/tabrepo/benchmark/models/ag/limix/LimiX/config/cls_default_retrieval.json new file mode 100644 index 00000000..1290c2ca --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/config/cls_default_retrieval.json @@ -0,0 +1,102 @@ +[ + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "logNormal" + ], + "discrete_flag": false, + 
"original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "numeric" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "numeric" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + } +] \ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_noretrieval.json b/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_noretrieval.json new file mode 100644 index 00000000..79480c0e --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_noretrieval.json @@ -0,0 +1,201 @@ +[ + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + },{ + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + 
"use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "power" + ], + "discrete_flag": false, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "power" + ], + "discrete_flag": false, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "power" + ], + "discrete_flag": false, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "power" + ], + "discrete_flag": false, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + } +] \ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_noretrieval_MVI.json b/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_noretrieval_MVI.json new file mode 100644 index 00000000..74bc3c7e --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_noretrieval_MVI.json @@ -0,0 +1,201 @@ +[ + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": 
false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + },{ + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + 
"encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + } +] \ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_retrieval.json b/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_retrieval.json new file mode 100644 index 00000000..1290c2ca --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_retrieval.json @@ -0,0 +1,102 @@ +[ + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "logNormal" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "numeric" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "numeric" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + } +] \ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/inference/inference_method.py b/tabrepo/benchmark/models/ag/limix/LimiX/inference/inference_method.py new file mode 100644 index 00000000..484178ac --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/inference/inference_method.py @@ -0,0 +1,274 @@ +import gc +from typing import Literal, Tuple + +import numpy as np +import torch +from torch import nn +from torch.utils.data import DataLoader, DistributedSampler + +from tabrepo.benchmark.models.ag.limix.LimiX.utils.data_utils import TabularInferenceDataset +from tabrepo.benchmark.models.ag.limix.LimiX.utils.inference_utils import 
NonPaddingDistributedSampler, swap_rows_back +from tabrepo.benchmark.models.ag.limix.LimiX.utils.loading import load_model + +from tabrepo.benchmark.models.ag.limix.LimiX.utils.retrieval_utils import RelabelRetrievalY +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP +import os, socket, contextlib + +def _pick_free_port(): + with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.bind(("", 0)) + return s.getsockname()[1] + +def setup(): + if dist.is_initialized(): + return dist.get_rank(), dist.get_world_size() + + # Support for single GPU usage in a normal python script + os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0") + + os.environ.setdefault("MASTER_ADDR", "127.0.0.1") + os.environ.setdefault("MASTER_PORT", str(_pick_free_port())) + + os.environ["RANK"] = "0" + os.environ["WORLD_SIZE"] = "1" + os.environ["LOCAL_RANK"] = "0" + + dist.init_process_group(backend="nccl", init_method="env://", rank=0, world_size=1) + + rank = dist.get_rank() + world_size = dist.get_world_size() + torch.cuda.set_device(rank) + return rank, world_size + + +def cleanup(): + if not dist.is_initialized(): + print("Distributed environment is not initialized, nothing to clean up.") + return + + print("Cleaning up distributed environment...") + dist.destroy_process_group() + print("Distributed environment cleaned up.") + +class InferenceResultWithRetrieval: + def __init__(self, + model: torch.nn.Module, + sample_selection_type: Literal["AM", "DDP"] = "AM", + ): + self.model=model + self.sample_selection_type = sample_selection_type + self.dataset = None + + def _prepare_data(self, + X_train: torch.Tensor, + y_train: torch.Tensor, + X_test: torch.Tensor, + attention_score: np.ndarray = None, + retrieval_len: int = 2000 + ) -> TabularInferenceDataset: + if self.sample_selection_type == "AM": + use_retrieval = True + else: + use_retrieval = False + dataset = TabularInferenceDataset( + X_train=X_train, + y_train=y_train, + X_test=X_test, + attention_score=attention_score, + retrieval_len=retrieval_len, + use_retrieval=use_retrieval + ) + return dataset + + def inference(self, + X_train: torch.Tensor = None, + y_train: torch.Tensor = None, + X_test: torch.Tensor = None, + dataset: TabularInferenceDataset = None, + attention_score: np.ndarray | torch.Tensor = None, + retrieval_len: int = 2000, + dynamic_ratio:float=None, + task_type: Literal["reg", "cls"] = "reg"): + self.rank,self.world_size = setup() + model = self.model.cuda() # self.rank + model = DDP(model, device_ids=[self.rank],find_unused_parameters=False) + if isinstance(retrieval_len,str): + if retrieval_len == "dynamic": + if dynamic_ratio is not None: + retrieval_len =int(dynamic_ratio*X_train.shape[0]/len(torch.unique(y_train))) + else: + retrieval_len = int(X_train.shape[0]/len(torch.unique(y_train))) + if isinstance(retrieval_len, float): + self.retrieval_len = int(retrieval_len * X_train.shape[0]) + else: + self.retrieval_len = retrieval_len + if dataset is None: + dataset = self._prepare_data(X_train, y_train, X_test, attention_score, self.retrieval_len) + sampler = NonPaddingDistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, shuffle=False) + outputs = [] + dataloader = DataLoader(dataset, + batch_size=16, + shuffle=False, + drop_last=False, + sampler=sampler + ) + indice = [] + for data in dataloader: + with ( + torch.autocast(torch.device(model.device).type, enabled=True), + torch.inference_mode(), + ): + if self.sample_selection_type == "DDP": + 
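+                    # "DDP" mode: skip the retrieval subset and pair every test row with
+                    # the full training set, replicated across the batch dimension below.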
indice.append(data["idx"]) + X_test = data["X_test"].unsqueeze(1) + X_train_item = torch.cat([X_train.unsqueeze(0) for _ in range(X_test.shape[0])], dim=0) + Y_train_item = torch.cat([y_train.unsqueeze(0).unsqueeze(-1) for _ in range(X_test.shape[0])], dim=0) + x_ = torch.cat([X_train_item, X_test], dim=1) + output = model(x=x_, y=Y_train_item.squeeze(-1), eval_pos=Y_train_item.shape[1], task_type=task_type) + else: + indice.append(data["idx"]) + X_train = data["X_train"] + X_test = data["X_test"].unsqueeze(1) + y_ = data["y_train"] + + x_ = torch.cat([X_train, X_test], dim=1) + if task_type == "cls": + relabel = RelabelRetrievalY(y_) + y_ = relabel.transform_y() + + output=model(x=x_, y=y_.squeeze(-1), eval_pos=y_.shape[1], task_type=task_type) + if len(output.shape) == 3: + output = output.view(-1, output.shape[-1]) + if task_type == "cls": + output = output.cpu().numpy() + output = relabel.inverse_transform_y(output) + output = torch.tensor(output, dtype=torch.float32, device=model.device) + + outputs.append(output.cpu()) + del output + gc.collect() + torch.cuda.empty_cache() + del model + outputs = torch.cat(outputs, dim=0) + local_result_cpu = outputs.cpu() + indice = torch.cat(indice, dim=0) + local_indice_cpu = indice.cpu() + outputs = [None for _ in range(self.world_size)] + gathered_indice = [None for _ in range(self.world_size)] + dist.all_gather_object(gathered_indice, local_indice_cpu) + dist.all_gather_object(outputs, local_result_cpu) + del local_result_cpu + outputs = torch.cat(outputs, dim=0).to(torch.float32) + gathered_indice = torch.cat(gathered_indice, dim=0) + outputs = swap_rows_back(outputs, gathered_indice) + gc.collect() + torch.cuda.empty_cache() + return outputs.squeeze(0) + + +class InferenceAttentionMap: + def __init__(self, + model_path: str, + calculate_feature_attention: bool = False, + calculate_sample_attention: bool = False, + ): + self.calculate_feature_attention = calculate_feature_attention + self.calculate_sample_attention = calculate_sample_attention + self.model = load_model(model_path, calculate_feature_attention=calculate_feature_attention, + calculate_sample_attention=calculate_sample_attention) + + self.dataset = None + + def _prepare_data(self, + X_train: torch.Tensor, + y_train: torch.Tensor, + X_test: torch.Tensor, + ) -> TabularInferenceDataset: + dataset = TabularInferenceDataset( + X_train=X_train, + y_train=y_train, + X_test=X_test, + use_retrieval=False + ) + return dataset + + def inference(self, + X_train: torch.Tensor | np.ndarray, + y_train: torch.Tensor | np.ndarray, + X_test: torch.Tensor | np.ndarray, + task_type: Literal["reg", "cls"] = "reg") -> tuple[torch.Tensor | None, torch.Tensor | None]: + self.rank, self.world_size = setup() + # device = torch.device(f"cuda:{self.rank}") + model = self.model.cuda() + model = DDP(model, device_ids=[0]) + model.eval() + if isinstance(X_train, np.ndarray): + X_train = torch.from_numpy(X_train).float() + if isinstance(y_train, np.ndarray): + y_train = torch.from_numpy(y_train).float() + if isinstance(X_test, np.ndarray): + X_test = torch.from_numpy(X_test).float() + dataset = self._prepare_data(X_train, y_train, X_test) + + sampler = NonPaddingDistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, shuffle=False) + dataloader = DataLoader(dataset, + batch_size=16, + shuffle=False, + drop_last=False, + sampler=sampler + ) + local_feature_attention = [] + local_sample_attention = [] + feature_attention=None + sample_attention=None + indice=[] + for batch_idx, data in 
enumerate(dataloader):
+                X_test = data["X_test"]
+                idx = data["idx"]
+                indice.append(idx)
+                x_ = torch.cat([X_train, X_test], dim=0).unsqueeze(dim=0)
+
+                y_ = y_train.unsqueeze(0)
+                with torch.autocast(device_type='cuda', enabled=True), torch.inference_mode():
+                    output, feature_attention, sample_attention = model(x=x_, y=y_, eval_pos=y_.shape[1], task_type=task_type)
+
+                if self.calculate_sample_attention:
+                    local_sample_attention.append(sample_attention.permute(1, 0, 2))
+                if self.calculate_feature_attention:
+                    local_feature_attention.append(feature_attention[y_.shape[1]:, :, :])
+                del output, sample_attention, feature_attention, X_test
+                gc.collect()
+                torch.cuda.empty_cache()
+        indice = torch.cat(indice, dim=0)
+        # The per-batch attention tensors were freed inside the loop; re-initialize them
+        # so the return below is well-defined when one of the flags is disabled.
+        feature_attention = None
+        sample_attention = None
+        if self.calculate_feature_attention:
+            # shape: [len_Dtest, feature_num//feature_per_group, feature_num//feature_per_group]
+            feature_attentions = torch.cat(local_feature_attention, dim=0)
+            local_result_cpu = feature_attentions.cpu()
+            local_indice_cpu = indice.cpu()
+            gathered_feature = [None for _ in range(self.world_size)]
+            gathered_indice = [None for _ in range(self.world_size)]
+            dist.all_gather_object(gathered_feature, local_result_cpu)
+            dist.all_gather_object(gathered_indice, local_indice_cpu)
+            feature_attention = torch.cat(gathered_feature, dim=0)
+            gathered_indice = torch.cat(gathered_indice, dim=0)
+            feature_attention = swap_rows_back(feature_attention, gathered_indice)
+            del gathered_feature
+        if self.calculate_sample_attention:
+            sample_attentions = torch.cat(local_sample_attention, dim=0)
+            local_indice_cpu = indice.cpu()
+            local_result_cpu = sample_attentions.cpu()
+            gathered_sample = [None for _ in range(self.world_size)]
+            gathered_indice = [None for _ in range(self.world_size)]
+            dist.all_gather_object(gathered_sample, local_result_cpu)
+            dist.all_gather_object(gathered_indice, local_indice_cpu)
+            sample_attention = torch.cat(gathered_sample, dim=0)
+            gathered_indice = torch.cat(gathered_indice, dim=0)
+            sample_attention = swap_rows_back(sample_attention, gathered_indice)
+            del gathered_sample
+
+        dist.barrier()
+        del model
+        gc.collect()
+        torch.cuda.empty_cache()
+        if sample_attention is not None:
+            sample_attention = sample_attention.permute(1, 0, 2)
+        return feature_attention, sample_attention
diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/inference/predictor.py b/tabrepo/benchmark/models/ag/limix/LimiX/inference/predictor.py
new file mode 100644
index 00000000..d9ff540e
--- /dev/null
+++ b/tabrepo/benchmark/models/ag/limix/LimiX/inference/predictor.py
@@ -0,0 +1,596 @@
+from tabrepo.benchmark.models.ag.limix.LimiX.inference.inference_method import InferenceAttentionMap, InferenceResultWithRetrieval
+from tabrepo.benchmark.models.ag.limix.LimiX.inference.preprocess import (
+    FeatureShuffler,
+    FilterValidFeatures,
+    CategoricalFeatureEncoder,
+    RebalanceFeatureDistribution,
+    SubSampleData)
+from tabrepo.benchmark.models.ag.limix.LimiX.utils.loading import load_model
+import torch
+from typing import List, Literal
+import random
+from sklearn.utils.validation import check_X_y, check_array
+from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
+from sklearn.compose import ColumnTransformer, make_column_selector
+from sklearn.preprocessing import FunctionTransformer
+import numpy as np
+from itertools import chain, repeat
+import pandas as pd
+import einops
+import json
+import os
+
+
+NA_PLACEHOLDER = "__MISSING__"
+
+class LimiXPredictor:
+    """LimiX model predictor, supporting tasks such as classification, regression, and missing value prediction."""
+    def __init__(self,
+                 X_train,
+                 y_train,
+                 device:torch.device,
+                 model_path:str,
+                 inference_config: list|str,
+                 mix_precision:bool=True,
+                 outlier_remove_std: float=12,
+                 softmax_temperature:float=0.9,
+                 task_type: Literal['Classification', 'Regression']='Classification',
+                 mask_prediction:bool=False,
+                 categorical_features_indices:List[int]|None=None,
+                 inference_with_DDP: bool = False,
+                 seed:int=0):
+        """
+        Initialize a LimiXPredictor.
+
+        Args:
+            X_train: Training features, kept as the in-context examples for every prediction
+            y_train: Training labels
+            device: The device for performing inference; GPU is recommended
+            model_path: Path to the LimiX model checkpoint
+            mix_precision: Whether to use mixed precision inference
+            outlier_remove_std: Standard deviation threshold used for removing outliers
+            softmax_temperature: Softmax temperature coefficient
+            task_type: Task type, either 'Classification' or 'Regression'
+            mask_prediction: Whether to enable missing value prediction
+            categorical_features_indices: Index numbers of categorical features, currently not in use
+            inference_config: Inference configuration, either a list of pipeline configs or a path to a JSON config file
+            inference_with_DDP: Whether to run inference with DistributedDataParallel
+            seed: Random seed
+        """
+        self.X_train = X_train
+        self.y_train = y_train
+        if isinstance(inference_config, str):
+            if os.path.isfile(inference_config):
+                with open(inference_config, 'r') as f:
+                    inference_config = json.load(f)
+            else:
+                raise ValueError(f"inference_config is not a config file path: {inference_config}")
+        self.model_path = model_path
+        self.device = device
+        self.mix_precision = mix_precision
+        self.categorical_features_indices = categorical_features_indices
+        self.seed = seed
+        self.inference_config = inference_config
+        n_estimators = len(inference_config)
+        assert n_estimators > 0, "Invalid configuration file: the number of pipelines is 0!"
+        self.n_estimators = n_estimators
+        self.model = None
+        self.outlier_remove_std = outlier_remove_std
+        self.class_shuffle_factor = 3
+        self.min_seq_len_for_category_infer = 100
+        self.max_unique_num_for_category_infer = 30
+        self.min_unique_num_for_numerical_infer = 4
+        self.preprocess_num = 4
+        self.softmax_temperature = softmax_temperature
+        self.task_type = task_type
+        self.mask_prediction = mask_prediction
+        self.inference_with_DDP = inference_with_DDP
+        self.model = load_model(model_path=model_path, mask_prediction=mask_prediction)
+
+        self.preprocess_pipelines = []
+        self.preprocess_configs = []
+
+        random.seed(seed)
+        rand_gen = np.random.default_rng(seed)
+        self.seeds = [random.randint(0, 10000) for _ in range(n_estimators*self.preprocess_num)]
+        start_idx = rand_gen.integers(0, 1000)
+        all_shifts = list(range(start_idx, start_idx + n_estimators))
+        self.all_shifts = rand_gen.choice(all_shifts, size=n_estimators, replace=False)
+
+        if self.mask_prediction:
+            for inference_config_item in inference_config:
+                if len(inference_config_item['RebalanceFeatureDistribution']['worker_tags']) > 0:
+                    for i, v in enumerate(inference_config_item['RebalanceFeatureDistribution']['worker_tags']):
+                        if v == 'power':
+                            print("WARNING: Missing value imputation does not currently support the preprocessing method of power! 
Using the default worker_tags method") + inference_config_item['RebalanceFeatureDistribution']['worker_tags'].pop(i) + inference_config_item['RebalanceFeatureDistribution']['worker_tags'].append(None) + inference_config_item['RebalanceFeatureDistribution']['discrete_flag'] = True + + for idx in range(n_estimators): + pipeline = [] + inference_config_item = inference_config[idx] + retrieval_config = inference_config_item["retrieval_config"] + if retrieval_config["use_retrieval"] and retrieval_config["retrieval_before_preprocessing"]: + if retrieval_config["subsample_type"] == "sample": + assert retrieval_config[ + "calculate_sample_attention"], "Retrieval on sample level must calculate sample attention score before." + if retrieval_config["use_type"] == "mixed": + assert retrieval_config[ + "calculate_feature_attention"], "Retrieval on mixed type must calculate sample and feature attention score before." + if retrieval_config["subsample_type"] == "feature": + assert retrieval_config[ + "calculate_feature_attention"], "Retrieval on sample level must calculate feature attention score before." + pipeline.append( + InferenceAttentionMap(model_path, retrieval_config["calculate_feature_attention"], + retrieval_config["calculate_sample_attention"])) + pipeline.append(SubSampleData(retrieval_config["subsample_type"], retrieval_config["use_type"])) + pipeline.append(FilterValidFeatures()) + pipeline.append(RebalanceFeatureDistribution(**inference_config_item['RebalanceFeatureDistribution'])) + pipeline.append(CategoricalFeatureEncoder(**inference_config_item['CategoricalFeatureEncoder'])) + shuffler = FeatureShuffler(**inference_config_item['FeatureShuffler']) + shuffler.shift = all_shifts[idx] + pipeline.append(shuffler) + if retrieval_config["use_retrieval"] and not retrieval_config["retrieval_before_preprocessing"]: + if retrieval_config["subsample_type"] == "sample": + assert retrieval_config[ + "calculate_sample_attention"], "Retrieval on sample level must calculate sample attention score before." + if retrieval_config["use_type"] == "mixed": + assert retrieval_config[ + "calculate_feature_attention"], "Retrieval on mixed type must calculate sample and feature attention score before." + if retrieval_config["subsample_type"] == "feature": + assert retrieval_config[ + "calculate_feature_attention"], "Retrieval on sample level must calculate feature attention score before." + pipeline.append( + InferenceAttentionMap(model_path, retrieval_config["calculate_feature_attention"], + retrieval_config["calculate_sample_attention"])) + pipeline.append(SubSampleData(retrieval_config["subsample_type"], retrieval_config["use_type"])) + self.preprocess_pipelines.append(pipeline) + + + def _check_n_features(self, X, reset): + """Check whether the number of features matches the previous evaluation""" + n_features = X.shape[1] + if reset: + self.n_features_in_ = n_features + else: + if self.n_features_in_ != n_features: + raise ValueError( + f"X has {n_features} features, " + f"but this estimator is expecting {self.n_features_in_} features." 
+            )
+
+    def validate_data(self, x=None, y=None, reset=True, validate_separately=False, **check_params):
+        """Validate x (and optionally y) with sklearn checks.
+
+        Typical check_params: {'accept_sparse': False, 'dtype': None, 'ensure_all_finite': 'allow-nan'}
+        """
+        # Validate both x and y simultaneously
+        if y is not None:
+            x, y = check_X_y(x, y, **check_params)
+            self._check_n_features(x, reset=reset)
+            return x, y
+
+        # Validate x only
+        if x is not None:
+            x = check_array(x, **check_params)
+            self._check_n_features(x, reset=reset)
+            return x
+
+        return None
+
+    def convert_x_dtypes(self, x:np.ndarray, dtypes:Literal["float32", "float64"] = "float64"):
+        NUMERIC_DTYPE_KINDS = "?bBiufm"
+        OBJECT_DTYPE_KINDS = "OV"
+        STRING_DTYPE_KINDS = "SaU"
+
+        if x.dtype.kind in NUMERIC_DTYPE_KINDS:
+            x = pd.DataFrame(x, copy=False, dtype=dtypes)
+        elif x.dtype.kind in OBJECT_DTYPE_KINDS:
+            x = pd.DataFrame(x, copy=True)
+            x = x.convert_dtypes()
+        else:
+            raise ValueError(f"Unsupported dtype kind: {x.dtype}. String arrays are not supported.")
+
+        integer_columns = x.select_dtypes(include=["number"]).columns
+        if len(integer_columns) > 0:
+            x[integer_columns] = x[integer_columns].astype(dtypes)
+        return x
+
+    def convert_category2num(self, x, dtype:np.floating=np.float64, placeholder: str = NA_PLACEHOLDER,):
+        ordinal_encoder = OrdinalEncoder(categories="auto",
+                                         dtype=dtype,
+                                         handle_unknown="use_encoded_value",
+                                         unknown_value=-1,
+                                         encoded_missing_value=np.nan)
+        col_encoder = ColumnTransformer(transformers=[("encoder", ordinal_encoder, make_column_selector(dtype_include=["category", "string"]))],
+                                        remainder=FunctionTransformer(),
+                                        sparse_threshold=0.0,
+                                        verbose_feature_names_out=False,
+                                        )
+
+        string_cols = x.select_dtypes(include=["string", "object"]).columns
+        if len(string_cols) > 0:
+            x[string_cols] = x[string_cols].fillna(placeholder)
+
+        X_encoded = col_encoder.fit_transform(x)
+
+        # Restore NaN for the rows that originally held missing values in string columns.
+        string_cols_ix = [x.columns.get_loc(col) for col in string_cols]
+        placeholder_mask = x[string_cols] == placeholder
+        string_cols_ix_2 = list(range(len(string_cols_ix)))
+        X_encoded[:, string_cols_ix_2] = np.where(
+            placeholder_mask,
+            np.nan,
+            X_encoded[:, string_cols_ix_2],
+        )
+
+        return X_encoded
+
+
+    def get_categorical_features_indices(self, x:np.ndarray):
+        if x.shape[0] < self.min_seq_len_for_category_infer:
+            return []
+        categorical_idx = []
+        for idx, col in enumerate(x.T):
+            if len(np.unique(col)) < self.min_unique_num_for_numerical_infer:
+                categorical_idx.append(idx)
+        return categorical_idx
+
+    def predict(self, X_test):
+        # TODO: incorrectly assumes label-encoded input data; bad practice, fix later
+        pred = self.predict_proba(X_test)
+        if self.task_type == "Classification":
+            return np.argmax(pred, axis=1)
+        return pred
+
+    def predict_proba(self, X_test):
+
+        predict_batch_size = 5000
+
+        def get_batch_intervals(n, bs):
+            return [(i, min(i + bs, n)) for i in range(0, n, bs)]
+
+        if len(X_test) <= predict_batch_size:
+            return self._predict(x_train=self.X_train, y_train=self.y_train, x_test=X_test)
+
+        # Predict in chunks of predict_batch_size rows to bound peak memory, then stitch
+        # the per-chunk outputs back together along the sample axis.
+        return np.concatenate(
+            [
+                self._predict(x_train=self.X_train, y_train=self.y_train, x_test=X_test[s:e])
+                for s, e in get_batch_intervals(X_test.shape[0], predict_batch_size)
+            ],
+            axis=0,
+        )
+
+    def _predict(self, x_train:np.ndarray, y_train:np.ndarray, x_test:np.ndarray) -> np.ndarray:
+        """
+        Perform inference using the LimiX model.
+
+        Args:
+            x_train: Training data x
+            y_train: Training data y
+            x_test: Testing data x
+        """
+        if self.task_type == "Classification":
+            return self._predict_cls(x_train, y_train, 
x_test)
+        elif self.task_type == "Regression":
+            return self._predict_reg(x_train, y_train, x_test)
+        else:
+            raise ValueError("Unsupported task type; supported tasks are Classification and Regression.")
+
+    def _predict_cls(self, x_train:np.ndarray, y_train:np.ndarray, x_test:np.ndarray) -> np.ndarray:
+        np_rng = np.random.default_rng(self.seed)
+
+        x_train, y_train = self.validate_data(x_train, y_train, reset=True, validate_separately=False, accept_sparse=False, dtype=None, force_all_finite=False)
+        x_test = self.validate_data(x_test, reset=True, validate_separately=False, accept_sparse=False, dtype=None, force_all_finite=False)
+
+        # Concatenate x_train and x_test so the preprocessing logic is applied consistently to both.
+        x = np.concatenate([x_train, x_test], axis=0)
+
+        # Encode y_train
+        self.label_encoder = LabelEncoder()
+        y = self.label_encoder.fit_transform(y_train)
+        self.classes = self.label_encoder.classes_
+        self.n_classes = len(self.classes)
+
+        # Build one class permutation per estimator so ensemble members see different label orderings
+        noise = np_rng.random((self.n_estimators * self.class_shuffle_factor, self.n_classes))
+        shufflings = np.argsort(noise, axis=1)
+        uniqs = np.unique(shufflings, axis=0)
+        balance_count = self.n_estimators // len(uniqs)
+        self.class_permutations = list(chain.from_iterable(repeat(elem, balance_count) for elem in uniqs))
+        count = self.n_estimators % len(uniqs)
+        if count > 0:
+            self.class_permutations += [uniqs[i] for i in np_rng.choice(len(uniqs), size=count)]
+
+        # Preprocess x
+        x = self.convert_x_dtypes(x)
+        x = self.convert_category2num(x)
+        categorical_idx = self.get_categorical_features_indices(x)
+        outputs = []
+        mask_predictions = []
+        for id_pipe, pipe in enumerate(self.preprocess_pipelines):
+            x_ = x.copy()
+            y_ = self.class_permutations[id_pipe][y.copy()]
+            categorical_idx_ = categorical_idx.copy()
+            for id_step, step in enumerate(pipe):
+                if isinstance(step, RebalanceFeatureDistribution):
+                    x_train_ = x_[:len(y_train)]
+                    x_test_ = x_[len(y_train):]
+                    if x_train_.shape[1] != x_test_.shape[1]:
+                        x_test_ = x_test_[:, :x_train_.shape[1]]
+                    x_train_, categorical_idx_ = step.fit_transform(x_train_, categorical_idx_, self.seeds[id_pipe*self.preprocess_num+id_step])
+                    x_test_, categorical_idx_ = step.transform(x_test_)
+                    x_ = np.concatenate([x_train_, x_test_], axis=0)
+                elif isinstance(step, InferenceAttentionMap):
+                    feature_attention_score, sample_attention_score = step.inference(X_train=x_[:len(y_train)],
+                                                                                     y_train=y_train,
+                                                                                     X_test=x_[len(y_train):],
+                                                                                     task_type="cls")
+
+                elif isinstance(step, SubSampleData):
+                    step.fit(torch.from_numpy(x_[:len(y_train)]), torch.from_numpy(y_train),
+                             feature_attention_score=feature_attention_score,
+                             sample_attention_score=sample_attention_score,
+                             subsample_ratio=self.inference_config[id_pipe]["retrieval_config"]["subsample_ratio"])
+                    if self.inference_config[id_pipe]["retrieval_config"]["subsample_type"] == "feature":
+                        x_ = step.transform(torch.from_numpy(x_[len(y_train):]).float())
+                        categorical_idx_ = self.get_categorical_features_indices(x_)
+                    else:
+                        attention_score = step.transform(torch.from_numpy(x_[len(y_train):]).float())
+                else:
+                    x_, categorical_idx_ = step.fit_transform(x_, categorical_idx_, self.seeds[id_pipe*self.preprocess_num+id_step])
+
+            x_ = torch.from_numpy(x_[:, :]).float().to(self.device)
+            y_ = torch.from_numpy(y_).float().to(self.device)
+            torch.manual_seed(self.seed)
+            torch.cuda.manual_seed_all(self.seed)
+            if 
self.inference_config[id_pipe]["retrieval_config"]["use_retrieval"] and \ + self.inference_config[id_pipe]["retrieval_config"]["subsample_type"] == "sample": + inference = InferenceResultWithRetrieval(model=self.model, + sample_selection_type="AM") + # Remove .squeeze() here as it broke the pipeline if the dataset has only one feature + output = inference.inference(x_[:len(y_train)], y_, + x_[len(y_train):], + attention_score=attention_score, + retrieval_len=self.inference_config[id_pipe]["retrieval_config"][ + "subsample_ratio"], + dynamic_ratio=self.inference_config[id_pipe]["retrieval_config"][ + "dynamic_ratio"] if "dynamic_ratio" in self.inference_config[id_pipe][ + "retrieval_config"] else None, + task_type="cls") + if self.softmax_temperature != 1: + output = (output[:, :self.n_classes].float() / self.softmax_temperature) + + output = output[..., self.class_permutations[id_pipe]] + outputs.append(output) + elif self.inference_with_DDP: + inference = InferenceResultWithRetrieval(model=self.model, + sample_selection_type="DDP") + output = inference.inference(x_[:len(y_train)].squeeze(1), y_, x_[len(y_train):].squeeze(1), + task_type="cls") + if self.softmax_temperature != 1: + output = (output[:, :self.n_classes].float() / self.softmax_temperature) + + output = output[..., self.class_permutations[id_pipe]] + outputs.append(output) + else: + self.model.to(self.device) + with(torch.autocast(device_type='cuda', enabled=self.mix_precision), torch.inference_mode()): + x_=x_.unsqueeze(0) + y_ = y_.unsqueeze(0) + output=self.model(x=x_, y=y_, eval_pos=y_.shape[1], task_type='cls') + + if self.mask_prediction: + process_config = output['process_config'] + output_feature_pred = self.PostProcessInModel(output['feature_pred'], process_config) + output_feature_pred = self.PostProcess(output_feature_pred, pipe, process_config) + mask_predictions.append(output_feature_pred) + output = output['cls_output'] + + output = output if isinstance(output, dict) else output.squeeze(0) + + if self.softmax_temperature != 1: + output = (output[:, :self.n_classes].float() / self.softmax_temperature) + + output = output[..., self.class_permutations[id_pipe]] + outputs.append(output) + + outputs = [torch.nn.functional.softmax(o, dim=1) for o in outputs] + output = torch.stack(outputs).mean(dim=0) + mask_prediction = np.stack(mask_predictions).mean(axis=0) if mask_predictions != [] and self.mask_prediction else None + output = output.float().cpu().numpy() + + if self.mask_prediction: + return output / output.sum(axis=1, keepdims=True), mask_prediction + else: + return output / output.sum(axis=1, keepdims=True) + + def PostProcessInModel(self, feature_pred:torch.tensor, config: dict) -> torch.tensor: + # Revert preprocess in model forward + feature_pred = feature_pred / torch.sqrt(config['features_per_group'] / config['num_used_features'].to(self.device)) + feature_pred = feature_pred*config['std_for_normalization'] + config['mean_for_normalization'] + feature_pred = einops.rearrange(feature_pred, "b s f n -> s b (f n)").squeeze(1).float().cpu().numpy() + if config['n_x_padding'] > 0: + feature_pred = feature_pred[:,:-config['n_x_padding']] + return feature_pred + + def PostProcess(self, feature_pred:np.ndarray, pipeline:List, config: dict, gt=False) -> np.ndarray: + # Revert preprocess in the Classifier + for id_step, step in enumerate(reversed(pipeline)): + if isinstance(step, FeatureShuffler): + if step.mode == "shuffle": + inv_p = np.argsort(step.feature_indices) + feature_pred = feature_pred[:, inv_p] + else: + 
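+                    # Only the "shuffle" mode records feature_indices, so only it can be inverted here.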
raise NotImplementedError + elif isinstance(step, CategoricalFeatureEncoder): + if step.encoding_strategy != 'onehot': + if step.category_mappings is not None: + categorical_indices = list(step.category_mappings.keys()) + feature_pred[:, categorical_indices] = np.round(feature_pred[:, categorical_indices]) + if step.transformer is not None: + for idx, p in step.category_mappings.items(): + feature_pred[:, idx] = np.clip(feature_pred[:, idx], a_min=0, a_max=max(p)) + inv_p = np.argsort(p) + feature_pred[:, idx] = inv_p[feature_pred[:, idx].astype(int)].astype(feature_pred.dtype) + inv_col = np.argsort(step.feature_indices) + feature_pred = feature_pred[:, inv_col] + else: + if len(step.categorical_features) == 0 or step.transformer is None: + continue + cont_features_indices = [idx for idx in range(feature_pred.shape[1]) if idx not in step.categorical_features] + + assert np.array_equal(step.categorical_features, np.arange(len(step.categorical_features))) + start_idx = 0 + for idx, out_category in enumerate(step.transformer.named_transformers_['one_hot_encoder'].categories_): + assert len(out_category) >= 2 + if not np.any(np.isnan(out_category)): + if len(out_category) == 2: # e.g. [3, 5.5] + feature_pred[:,start_idx] = np.round(np.clip(feature_pred[:,start_idx], a_min=0, a_max=1)) + start_idx += 1 + else: + arr = feature_pred[:, start_idx:start_idx+len(out_category)] + feature_pred[:, start_idx:start_idx+len(out_category)] = (arr == arr.max(axis=1, keepdims=True)).astype(float) + start_idx += len(out_category) + else: + if len(out_category) == 2: # e.g. [0, nan] + feature_pred[:,start_idx] = 0 + start_idx += 1 + else: + arr = feature_pred[:, start_idx:start_idx+len(out_category)-1] + feature_pred[:, start_idx:start_idx+len(out_category)-1] = (arr == arr.max(axis=1, keepdims=True)).astype(float) + feature_pred[:, start_idx+len(out_category)-1] = 0 + start_idx += len(out_category) + feature_pred = np.column_stack([step.transformer.named_transformers_['one_hot_encoder'].inverse_transform(feature_pred[:, step.categorical_features]), feature_pred[:, cont_features_indices]]) + + elif isinstance(step, RebalanceFeatureDistribution): + if step.svd_tag == 'svd' and step.svd_n_comp > 0: + feature_pred = feature_pred[:, :-step.svd_n_comp] + if step.worker_tags[0] in ["quantile_uniform_10", "quantile_uniform_5", "quantile_uniform_all_data"] and step.n_quantile_features > 0: + feature_pred = feature_pred[:, :-step.n_quantile_features] + elif step.worker_tags[0] == "power": + raise ValueError(f"Missing value imputation does not currently support the preprocessing method of power!") + cont_features_indices = [idx for idx in range(feature_pred.shape[1]) if idx not in step.dis_ix] + feature_pred[:, cont_features_indices] = step.worker.named_transformers_['feat_transform'].inverse_transform(feature_pred[:, cont_features_indices]) + # reverse feature order + if step.feature_indices is not None: + inv_p = np.argsort(step.feature_indices) + feature_pred = feature_pred[:, inv_p] + + + elif isinstance(step, FilterValidFeatures): + deleted_indices = np.where(step.invalid_indices)[0] + if len(deleted_indices) > 0: + original_cols = len(deleted_indices) + feature_pred.shape[1] + restored = np.zeros((feature_pred.shape[0], original_cols)) + all_indices = set(range(original_cols)) + kept_indices = list(all_indices - set(deleted_indices)) + for i, idx in enumerate(kept_indices): + restored[:, idx] = feature_pred[:, i] + for i, idx in enumerate(deleted_indices): + restored[:, idx] = step.invalid_features[:, i] + 
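+                # Re-insert the constant columns dropped by FilterValidFeatures at their original positions before returning.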
feature_pred = restored.copy() + return feature_pred + + def _predict_reg(self, x_train:np.ndarray, y_train:np.ndarray, x_test:np.ndarray) -> np.ndarray: + + # For some reason, they scale the data outside of the model, we do it here. + y_mean = y_train.mean() + y_std = y_train.std() + y_train = (y_train - y_mean) / y_std + + np_rng = np.random.default_rng(self.seed) + + x_train, y_train = self.validate_data(x_train, y_train, reset=True, validate_separately=False, accept_sparse=False, dtype=None, force_all_finite=False) + x_test = self.validate_data(x_test, reset=True, validate_separately=False, accept_sparse=False, dtype=None, force_all_finite=False) + + # "Concatenate x_train and x_test to ensure the preprocessing logic is completely consistent. + x = np.concatenate([x_train, x_test], axis=0) + + # preprocess x + x = self.convert_x_dtypes(x) + x = self.convert_category2num(x) + x = x.astype(float) + categorical_idx = self.get_categorical_features_indices(x) + + outputs = [] + mask_predictions = [] + for id_pipe, pipe in enumerate(self.preprocess_pipelines): + x_ = x.copy() + y_ = y_train.copy() + categorical_idx_ = categorical_idx.copy() + for id_step, step in enumerate(pipe): + if isinstance(step, RebalanceFeatureDistribution): + x_train_ = x_[:len(y_train)] + x_test_ = x_[len(y_train):] + if x_train_.shape[1] != x_test_.shape[1]: + x_test_ = x_test_[:, :x_train_.shape[1]] + x_train_, categorical_idx_ = step.fit_transform(x_train_, categorical_idx_, self.seeds[id_pipe*self.preprocess_num+id_step]) + x_test_, categorical_idx_ = step.transform(x_test_) + x_ = np.concatenate([x_train_, x_test_], axis=0) + elif isinstance(step, InferenceAttentionMap): + + feature_attention_score, sample_attention_score = step.inference(X_train=x_[:len(y_train)], + y_train=y_train, + X_test=x_[len(y_train):], + task_type="reg") + + elif isinstance(step, SubSampleData): + step.fit(torch.from_numpy(x_[:len(y_train)]), torch.from_numpy(y_train), + feature_attention_score=feature_attention_score, + sample_attention_score=sample_attention_score, + subsample_ratio=self.inference_config[id_pipe]["retrieval_config"]["subsample_ratio"]) + if self.inference_config[id_pipe]["retrieval_config"]["subsample_type"] == "feature": + x_ = step.transform(torch.from_numpy(x_[len(y_train):]).float()) + categorical_idx_ = self.get_categorical_features_indices(x_) + else: + attention_score = step.transform(torch.from_numpy(x_[len(y_train):]).float()) + else: + x_, categorical_idx_ = step.fit_transform(x_, categorical_idx_, self.seeds[id_pipe*self.preprocess_num+id_step]) + + x_ = torch.from_numpy(x_[:, :]).float().to(self.device) + y_ = torch.from_numpy(y_).float().to(self.device) + torch.manual_seed(self.seed) + torch.cuda.manual_seed_all(self.seed) + if self.inference_config[id_pipe]["retrieval_config"]["use_retrieval"] and \ + self.inference_config[id_pipe]["retrieval_config"]["subsample_type"] == "sample": + inference = InferenceResultWithRetrieval(model=self.model, + sample_selection_type="AM") + output = inference.inference(x_[:len(y_train)], y_, + x_[len(y_train):], + attention_score=attention_score, + retrieval_len=self.inference_config[id_pipe]["retrieval_config"][ + "subsample_ratio"], task_type="reg") + outputs.append(output) + elif self.inference_with_DDP: + inference = InferenceResultWithRetrieval(model=self.model, + sample_selection_type="DDP") + output = inference.inference(x_[:len(y_train)].squeeze(1), y_, x_[len(y_train):].squeeze(1), + task_type="reg") + outputs.append(output) + else: + self.model.to(self.device) 
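+                # Default path (no retrieval, no DDP): one forward pass over the concatenated train/test rows, optionally with mixed precision.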
+                with torch.autocast(device_type='cuda', enabled=self.mix_precision), torch.inference_mode():
+                    x_ = x_.unsqueeze(0)
+                    y_ = y_.unsqueeze(0)
+
+                    output = self.model(x=x_, y=y_, eval_pos=y_.shape[1], task_type='reg')
+
+                    if self.mask_prediction:
+                        process_config = output['process_config']
+                        output_feature_pred = self.PostProcessInModel(output['feature_pred'], process_config)
+                        output_feature_pred = self.PostProcess(output_feature_pred, pipe, process_config)
+                        mask_predictions.append(output_feature_pred)
+                        output = output['reg_output']
+
+                    output = output if isinstance(output, dict) else output.squeeze(0)
+                    outputs.append(output)
+
+        output = torch.stack(outputs).squeeze(2).mean(dim=0)
+        mask_prediction = np.stack(mask_predictions).mean(axis=0) if mask_predictions != [] else None
+        output = (output * y_std) + y_mean
+        output = output.cpu().numpy()
+
+        if self.mask_prediction:
+            return output, mask_prediction
+        else:
+            return output
diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/inference/preprocess.py b/tabrepo/benchmark/models/ag/limix/LimiX/inference/preprocess.py
new file mode 100644
index 00000000..f60ec146
--- /dev/null
+++ b/tabrepo/benchmark/models/ag/limix/LimiX/inference/preprocess.py
@@ -0,0 +1,589 @@
+import numpy as np
+from torch.utils.data import DataLoader, Dataset
+import torch
+import warnings
+import scipy
+from typing_extensions import override
+from typing import Literal, Any
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import (
+    OneHotEncoder,
+    OrdinalEncoder,
+    FunctionTransformer,
+    PowerTransformer,
+    StandardScaler,
+    QuantileTransformer,
+    MinMaxScaler,
+)
+from sklearn.pipeline import FeatureUnion, Pipeline
+from sklearn.impute import SimpleImputer
+from sklearn.decomposition import TruncatedSVD
+from sklearn.utils.validation import check_is_fitted
+from tabrepo.benchmark.models.ag.limix.LimiX.utils.data_utils import TabularInferenceDataset
+from functools import partial
+
+MAXINT_RANDOM_SEED = int(np.iinfo(np.int32).max)
+
+class SelectiveInversePipeline(Pipeline):
+    def __init__(self, steps, skip_inverse=None):
+        super().__init__(steps)
+        self.skip_inverse = skip_inverse or []
+
+    def inverse_transform(self, X):
+        """Apply inverse_transform in reverse step order, skipping the steps named in ``skip_inverse``."""
+        if X.shape[1] == 0:
+            return X
+        for step_idx in range(len(self.steps) - 1, -1, -1):
+            name, transformer = self.steps[step_idx]
+            try:
+                check_is_fitted(transformer)
+            except Exception:
+                continue
+
+            if name in self.skip_inverse:
+                continue
+
+            if hasattr(transformer, 'inverse_transform'):
+                X = transformer.inverse_transform(X)
+                if np.any(np.isnan(X)):
+                    print(f"After reversing step {name} of RebalanceFeatureDistribution, NaN values are present")
+        return X
+
+class RobustPowerTransformer(PowerTransformer):
+    """PowerTransformer with automatic feature reversion when variance or value constraints fail."""
+
+    def __init__(self, var_tolerance: float = 1e-3,
+                 max_abs_value: float = 100,
+                 **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+        self.var_tolerance = var_tolerance
+        self.max_abs_value = max_abs_value
+        self.restore_indices_: np.ndarray | None = None
+
+    def fit(self, X, y=None):
+        fitted = super().fit(X, y)
+        self.restore_indices_ = np.array([], dtype=int)
+        return fitted
+
+    def fit_transform(self, X, y=None):
+        Z = super().fit_transform(X, y)
+        self.restore_indices_ = self._should_revert(Z)
+        return Z
+
+    def _should_revert(self, Z: np.ndarray) -> np.ndarray:
+        """Determine which columns to revert to their original values."""
+        variances = np.nanvar(Z, axis=0)
+        bad_var = 
np.flatnonzero(np.abs(variances - 1.0) > self.var_tolerance) + + bad_large = np.flatnonzero(np.any(Z > self.max_abs_value, axis=0)) + + return np.unique(np.concatenate([bad_var, bad_large])) + + def _apply_reversion(self, Z: np.ndarray, X: np.ndarray) -> np.ndarray: + if self.restore_indices_.size > 0: + Z[:, self.restore_indices_] = X[:, self.restore_indices_] + return Z + + def transform(self, X): + Z = super().transform(X) + # self.restore_indices_ = self._should_revert(Z) + return self._apply_reversion(Z, X) + + def _yeo_johnson_optimize(self, x: np.ndarray) -> float: + "Overload_yeo_johnson_optimize to avoid crashes caused by values such as NaN and Inf." + try: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", + message=r"overflow encountered", + category=RuntimeWarning) + return super()._yeo_johnson_optimize(x) # type: ignore + except scipy.optimize._optimize.BracketError: + return np.nan + + def _yeo_johnson_transform(self, x: np.ndarray, lmbda: float) -> np.ndarray: + "_yeo_johnson_transform to avoid crashes caused by NaN" + if np.isnan(lmbda): + return x + return super()._yeo_johnson_transform(x, lmbda) # type: ignore + +class BasePreprocess: + """Abstract base class for preprocessing class""" + + def fit(self, x:np.ndarray, categorical_features:list[int], seed:int)->list[int]: + """Fit the preprocessing model to the data""" + raise NotImplementedError + + def transform(self, x:np.ndarray)->tuple[np.ndarray, list[int]]: + """Transform the data using the fitted preprocessing model""" + raise NotImplementedError + + def fit_transform(self, x:np.ndarray, categorical_features:list[int], seed:int)->tuple[np.ndarray, list[int]]: + """Fit the preprocessing model to the data and transform the data""" + self.fit(x, categorical_features, seed) + return self.transform(x) + +def infer_random_state( + random_state: int | np.random.RandomState | np.random.Generator | None, +) -> tuple[int, np.random.Generator]: + """Infer the random state and return the seed and generator""" + if random_state is None: + np_rng = np.random.default_rng() + return int(np_rng.integers(0, MAXINT_RANDOM_SEED)), np_rng + + if isinstance(random_state, (int, np.integer)): + return int(random_state), np.random.default_rng(random_state) + + if isinstance(random_state, np.random.RandomState): + seed = int(random_state.randint(0, MAXINT_RANDOM_SEED)) + return seed, np.random.default_rng(seed) + + if isinstance(random_state, np.random.Generator): + return int(random_state.integers(0, MAXINT_RANDOM_SEED)), random_state + + raise ValueError(f"Invalid random_state {random_state}") + +class FilterValidFeatures(BasePreprocess): + def __init__(self): + self.valid_features: list[bool] | None = None + self.categorical_idx: list[int] | None = None + self.invalid_indices: list[int] | None = None + self.invalid_features: list[int] | None = None + + @override + def fit(self,x: np.ndarray, categorical_idx: list[int], seed:int) -> list[int]: + self.categorical_idx = categorical_idx + self.valid_features = ((x[0:1, :] == x).mean(axis=0) < 1.0).tolist() + self.invalid_indices = ((x[0:1, :] == x).mean(axis=0) == 1.0).tolist() + if not any(self.valid_features): + raise ValueError("All features are constant! 
Please check your data.")
+
+        self.categorical_idx = [
+            index
+            for index, idx in enumerate(np.where(self.valid_features)[0])
+            if idx in categorical_idx
+        ]
+
+        return self.categorical_idx
+
+    @override
+    def transform(self, x: np.ndarray) -> tuple[np.ndarray, list[int]]:
+        assert self.valid_features is not None, "You must call fit first to get effective_features"
+        self.invalid_features = x[:, self.invalid_indices]
+        return x[:, self.valid_features], self.categorical_idx
+
+class FeatureShuffler(BasePreprocess):
+    """Feature column reordering preprocessor."""
+
+    def __init__(
+        self,
+        mode: Literal['rotate', 'shuffle'] | None = "shuffle",
+        offset: int = 0,
+    ):
+        super().__init__()
+        self.mode = mode
+        self.offset = offset
+        self.random_seed = None
+        self.feature_indices = None
+        self.categorical_indices = None
+
+    @override
+    def fit(self, data: np.ndarray, categorical_cols: list[int], seed: int) -> list[int]:
+        n_features = data.shape[1]
+        self.random_seed = seed
+
+        indices = np.arange(n_features)
+
+        if self.mode == "rotate":
+            self.feature_indices = np.roll(indices, self.offset)
+        elif self.mode == "shuffle":
+            _, rng = infer_random_state(self.random_seed)
+            self.feature_indices = rng.permutation(indices)
+        elif self.mode is None:
+            self.feature_indices = np.arange(n_features)
+        else:
+            raise ValueError(f"Unsupported reordering mode: {self.mode}")
+
+        is_categorical = np.isin(np.arange(n_features), categorical_cols)
+        self.categorical_indices = np.where(is_categorical[self.feature_indices])[0].tolist()
+
+        return self.categorical_indices
+
+    @override
+    def transform(self, data: np.ndarray, *, is_test: bool = False) -> tuple[np.ndarray, list[int]]:
+        if self.feature_indices is None:
+            raise RuntimeError("Please call the fit method first to initialize")
+        if len(self.feature_indices) != data.shape[1]:
+            raise ValueError("The number of features in the input data does not match the training data")
+
+        return data[:, self.feature_indices], self.categorical_indices or []
+
+class CategoricalFeatureEncoder(BasePreprocess):
+    """Categorical feature encoder."""
+
+    def __init__(
+        self,
+        encoding_strategy: Literal['ordinal', 'ordinal_strict_feature_shuffled', 'ordinal_shuffled', 'onehot', 'numeric'] | None = "ordinal",
+    ):
+        super().__init__()
+        self.encoding_strategy = encoding_strategy
+        self.random_seed = None
+        self.transformer = None
+        self.category_mappings = None
+        self.categorical_features = None
+
+    @override
+    def fit(self, data: np.ndarray, feature_indices: list[int], seed: int) -> list[int]:
+        self.random_seed = seed
+        self.transformer, self.categorical_features = self._create_transformer(data, feature_indices)
+
+        if self.transformer is not None:
+            self.transformer.fit(data)
+
+        if self.encoding_strategy == "ordinal_shuffled":
+            _, rng = infer_random_state(self.random_seed)
+            categories = self.transformer.named_transformers_["ordinal_encoder"].categories_
+            self.category_mappings = {
+                idx: rng.permutation(len(cat))
+                for idx, cat in enumerate(categories)
+            }
+
+        return self.categorical_features
+
+    @override
+    def transform(self, data: np.ndarray, *, is_test: bool = False) -> tuple[np.ndarray, list[int]]:
+        if self.transformer is None:
+            return data, self.categorical_features or []
+        # TODO: not taking effect?
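+        # The fitted ColumnTransformer runs first; shuffled category mappings (if present) are then re-applied column by column.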
+ transformed = self.transformer.transform(data) + + if self.category_mappings is not None: + for col_idx, mapping in self.category_mappings.items(): + col_data = transformed[:, col_idx] + valid_mask = ~np.isnan(col_data) + col_data[valid_mask] = mapping[col_data[valid_mask].astype(int)] + + return transformed, self.categorical_features + + @override + def fit_transform(self, data: np.ndarray, categorical_columns: list[int], seed:int) -> tuple[np.ndarray, list[int]]: + self.random_seed = seed + return self._fit_transform(data, categorical_columns) + + def _fit_transform( + self, + X: np.ndarray, + categorical_features: list[int], + ) -> tuple[np.ndarray, list[int]]: + ct, categorical_features = self._create_transformer(X, categorical_features) + if ct is None: + self.transformer = None + return X, categorical_features + + _, rng = infer_random_state(self.random_seed) + + if self.encoding_strategy.startswith("ordinal"): + Xt = ct.fit_transform(X) + categorical_features = list(range(len(categorical_features))) + + if self.encoding_strategy.endswith("_shuffled"): + self.category_mappings = {} + for col_ix in categorical_features: + col_cats = len( + ct.named_transformers_["ordinal_encoder"].categories_[col_ix], + ) + perm = rng.permutation(col_cats) + self.category_mappings[col_ix] = perm + + col_data = Xt[:, col_ix] + valid_mask = ~np.isnan(col_data) + col_data[valid_mask] = perm[col_data[valid_mask].astype(int)].astype(col_data.dtype) + + elif self.encoding_strategy == "onehot": + Xt = ct.fit_transform(X) + if Xt.size >= 1_000_000: + ct = None + Xt = X + else: + categorical_features = list(range(Xt.shape[1]))[ + ct.output_indices_["one_hot_encoder"] + ] + else: + raise ValueError( + f"Unknown categorical transform {self.encoding_strategy}", + ) + + self.transformer = ct + self.categorical_features = categorical_features + return Xt, categorical_features + + @staticmethod + def get_least_common_category_count(column: np.ndarray) -> int: + """Retrieve the smallest count value among categorical features""" + if len(column) == 0: + return 0 + return int(np.unique(column, return_counts=True)[1].min()) + + def _create_transformer(self, data: np.ndarray, categorical_columns: list[int]) -> tuple[ColumnTransformer | None, list[int]]: + """Create an appropriate column transformer""" + if self.encoding_strategy.startswith("ordinal"): + suffix = self.encoding_strategy[len("ordinal"):] + + if "feature_shuffled" in suffix: + categorical_columns = [ + idx for idx in categorical_columns + if self._is_valid_common_category(data[:, idx], suffix) + ] + remainder_columns = [idx for idx in range(data.shape[1]) if idx not in categorical_columns] + self.feature_indices = categorical_columns + remainder_columns + + return ColumnTransformer( + [("ordinal_encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan), categorical_columns)], + remainder="passthrough" + ), categorical_columns + + elif self.encoding_strategy == "onehot": + return ColumnTransformer( + [("one_hot_encoder", OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="ignore"), categorical_columns)], + remainder="passthrough" + ), categorical_columns + + elif self.encoding_strategy in ("numeric", "none"): + return None, categorical_columns + + raise ValueError(f"Unsupported encoding strategy: {self.encoding_strategy}") + + def _is_valid_common_category(self, column: np.ndarray, suffix: str) -> bool: + """Check whether the input data meets the common category conditions""" + min_count = 
self.get_least_common_category_count(column) + unique_count = len(np.unique(column)) + + if "strict_feature_shuffled" in suffix: + return min_count >= 10 and unique_count < (len(column) // 10) + return min_count >= 10 + +# Avoid lambda to support pickle... +def identity_function(x): + return x +def feature_shift(x): + return x + np.abs(np.nanmin(x)) +def add_epsilon(x): + return x + 1e-10 + +class RebalanceFeatureDistribution(BasePreprocess): + def __init__( + self, + *, + worker_tags: list[Literal['quantile', 'logNormal', 'quantile_uniform_10', 'quantile_uniform_5']] | None = ["quantile"], + discrete_flag: bool = False, + original_flag: bool = False, + svd_tag: Literal['svd'] | None = None, + joined_svd_feature: bool = True, + joined_log_normal: bool = True, + ): + super().__init__() + self.worker_tags = worker_tags + self.discrete_flag = discrete_flag + self.original_flag = original_flag + self.random_state = None + self.svd_tag = svd_tag + self.worker: Pipeline | ColumnTransformer | None = None + self.joined_svd_feature = joined_svd_feature + self.joined_log_normal = joined_log_normal + self.feature_indices = None + + def fit(self, X: np.ndarray, categorical_features: list[int], seed:int) -> list[int]: + self.random_state = seed + n_samples, n_features = X.shape + worker, self.dis_ix = self._set(n_samples,n_features,categorical_features) + worker.fit(X) + self.worker = worker + return self.dis_ix + + def transform(self, X: np.ndarray) -> np.ndarray: + assert self.worker is not None + return self.worker.transform(X), self.dis_ix # type: ignore + + + def _set(self,n_samples: int, + n_features: int, + categorical_features: list[int], + ): + static_seed, rng = infer_random_state(self.random_state) + all_ix = list(range(n_features)) + workers = [] + cont_ix = [i for i in all_ix if i not in categorical_features] + if self.original_flag: + trans_ixs = categorical_features + cont_ix if self.discrete_flag else cont_ix + workers.append(("original", "passthrough", all_ix)) + dis_ix = categorical_features + elif self.discrete_flag: + # trans_ixs = all_ix + # dis_ix = categorical_features + trans_ixs = categorical_features + cont_ix + self.feature_indices = categorical_features + cont_ix + dis_ix = [] + else: + workers.append(("discrete", "passthrough", categorical_features)) + trans_ixs, dis_ix = cont_ix, list(range(len(categorical_features))) + for worker_tag in self.worker_tags: + if worker_tag== "quantile": + sworker = QuantileTransformer( + output_distribution="uniform", + n_quantiles=max(n_samples // 10, 2), + random_state=static_seed, + ) + elif worker_tag == "logNormal": + sworker = Pipeline(steps=[ + ("save_standard", Pipeline(steps=[ + ("i2n_pre", + FunctionTransformer( + func=partial(np.nan_to_num, nan=np.nan, neginf=np.nan, posinf=np.nan), + inverse_func=identity_function, check_inverse=False)), + ("fill_missing_pre", + SimpleImputer(missing_values=np.nan, strategy="mean", + keep_empty_features=True)), + ("feature_shift", + FunctionTransformer(func=feature_shift)), + ("add_epsilon", FunctionTransformer(func=add_epsilon)), + ("logNormal", FunctionTransformer(np.log, validate=False)), + ("i2n_post", + FunctionTransformer( + func=partial(np.nan_to_num, nan=np.nan, neginf=np.nan, + posinf=np.nan), + inverse_func=identity_function, check_inverse=False)), + ("fill_missing_post", + SimpleImputer(missing_values=np.nan, strategy="mean", + keep_empty_features=True))])), + ]) + + + trans_ixs = cont_ix + elif worker_tag == "quantile_uniform_10": + sworker = QuantileTransformer( + 
output_distribution="uniform", + n_quantiles=max(n_samples // 10, 2), + random_state=static_seed, + ) + elif worker_tag == "quantile_uniform_5": + sworker = QuantileTransformer( + output_distribution="uniform", + n_quantiles=max(n_samples // 5, 2), + random_state=static_seed, + ) + elif worker_tag == "quantile_uniform_all_data": + sworker = QuantileTransformer( + output_distribution="uniform", + n_quantiles=max(n_samples // 5, 2), + random_state=static_seed, + subsample=n_samples, + ) + elif worker_tag == 'power': + self.feature_indices = categorical_features+cont_ix + self.dis_ix = dis_ix + nan_to_mean_transformer = SimpleImputer( + missing_values=np.nan, + strategy="mean", + keep_empty_features=True, + ) + + sworker = SelectiveInversePipeline( + steps=[ + ("power_transformer", RobustPowerTransformer(standardize=False)), + ("inf_to_nan_1", FunctionTransformer( + func=partial(np.nan_to_num, nan=np.nan, neginf=np.nan, posinf=np.nan), + inverse_func=identity_function, + check_inverse=False, + )), + ("nan_to_mean_1", nan_to_mean_transformer), + ("scaler", StandardScaler()), + ("inf_to_nan_2", FunctionTransformer( + func=partial(np.nan_to_num, nan=np.nan, neginf=np.nan, posinf=np.nan), + inverse_func=identity_function, + check_inverse=False, + )), + ("nan_to_mean_2", nan_to_mean_transformer), + ], + skip_inverse=['nan_to_mean_1', 'nan_to_mean_2'] + ) + else: + sworker = FunctionTransformer(identity_function) + if worker_tag in ["quantile_uniform_10", "quantile_uniform_5", "quantile_uniform_all_data"]: + self.n_quantile_features = len(trans_ixs) + workers.append(("feat_transform", sworker, trans_ixs)) + + CT_worker = ColumnTransformer(workers,remainder="drop",sparse_threshold=0.0) + if self.svd_tag == "svd" and n_features >= 2: + svd_worker = FeatureUnion([ + ("default", FunctionTransformer(func=identity_function)), + ("svd",Pipeline(steps=[ + ("save_standard",Pipeline(steps=[ + ("i2n_pre", FunctionTransformer(func=partial(np.nan_to_num, nan=np.nan, neginf=np.nan, posinf=np.nan),inverse_func=identity_function, check_inverse=False)), + ("fill_missing_pre", SimpleImputer(missing_values=np.nan, strategy="mean", keep_empty_features=True)), + ("standard", StandardScaler(with_mean=False)) , + ("i2n_post", FunctionTransformer(func=partial(np.nan_to_num, nan=np.nan, neginf=np.nan, posinf=np.nan),inverse_func=identity_function, check_inverse=False)), + ("fill_missing_post", SimpleImputer(missing_values=np.nan, strategy="mean", keep_empty_features=True))])), + ("svd",TruncatedSVD(algorithm="arpack",n_components=max(1,min(n_samples // 10 + 1,n_features // 2)),random_state=static_seed))])) + ]) + self.svd_n_comp = max(1,min(n_samples // 10 + 1,n_features // 2)) + worker = Pipeline([("worker", CT_worker), ("svd_worker", svd_worker)]) + else: + self.svd_n_comp = 0 + worker = CT_worker + + self.worker = worker + return worker, dis_ix + + +class SubSampleData(): + def __init__( + self, + subsample_type: Literal["feature", "sample"] = "sample", + use_type: Literal["mixed", "only_sample"] = "mixed", + ): + super().__init__() + self.subsample_type = subsample_type + self.use_type = use_type + + def fit(self, + x: torch.Tensor=None, + y: torch.Tensor = None, + feature_attention_score: torch.Tensor = None, + sample_attention_score: torch.Tensor = None, + subsample_ratio: float | int = 200, + subsample_idx:list[int] | np.ndarray[int] = None, + ): + if isinstance(subsample_ratio, float): + if self.subsample_type == "sample": + self.subsample_num = int(subsample_ratio * x.shape[0]) + else: + self.subsample_num = 
int(subsample_ratio * x.shape[1]) + else: + self.subsample_num = subsample_ratio + if self.subsample_type == "sample": + if self.use_type == "mixed": + y_feature_attention_score = feature_attention_score[:, -1, :].squeeze().permute(1, 0).unsqueeze( + 2).repeat(1, 1, + sample_attention_score.shape[2]) # shape [features,test_sample_lens,train_sample_lens] + + self.attention_score = torch.mean(sample_attention_score * y_feature_attention_score, + dim=0) # shape [test_sample_lens,train_sample_lens] + else: + self.attention_score = sample_attention_score[-1, :, :] + self.X_train = x + self.y_train = y + else: + y_feature_attention_score = torch.mean(feature_attention_score[:, -1, :].squeeze(),dim=0) # shape [test_sample_lens,features] + if subsample_idx is None: + self.subsample_idx = np.argsort(y_feature_attention_score)[-min(self.subsample_num, x.shape[0]):] + else: + self.subsample_idx = subsample_idx + self.X_train = x + + def transform(self, x: torch.Tensor=None) -> np.ndarray |torch.Tensor | TabularInferenceDataset: + if self.subsample_type == "feature": + return torch.cat([self.X_train, x], dim=0)[:, self.subsample_idx].numpy() + else: + return self.attention_score diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/model/encoders.py b/tabrepo/benchmark/models/ag/limix/LimiX/model/encoders.py new file mode 100644 index 00000000..6c7c6937 --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/model/encoders.py @@ -0,0 +1,555 @@ +import torch +import torch.nn as nn +from tabrepo.benchmark.models.ag.limix.LimiX.model.layer import EncoderBaseLayer, MLP +from typing import Any,Literal +from torch.nn.init import orthogonal_ +import numpy as np + +def calc_mean(x:torch.Tensor, dim:int): + num = torch.sum(~torch.isnan(x), dim=dim).clip(min=1.0) + return torch.nansum(x, dim=dim) / num, num + +def calc_std(x:torch.Tensor, dim:int, mean_v:torch.Tensor|None = None, value_num:torch.Tensor|None=None ): + if mean_v is None or value_num is None: + mean_v, value_num = calc_mean(x, dim) + mean_broadcast = torch.repeat_interleave(mean_v.unsqueeze(dim), x.shape[dim], dim=dim,) + return torch.sqrt(torch.nansum(torch.square(mean_broadcast - x), dim=dim) / (value_num - 1)) + +def drop_outliers( + x:torch.Tensor, + std_sigma:float=4, + eval_pos:int=-1, + lower:torch.Tensor|None = None, + upper:torch.Tensor|None = None, + dim:int=1 + ): + assert len(x.shape)==3, "x.shape must be B,S,F" + + if lower is None: + data = x[:,:eval_pos].clone() + data_mean, value_num = calc_mean(data, dim=dim) + data_std = calc_std(data, dim=dim, mean_v=data_mean, value_num=value_num) + cut_off = data_std * std_sigma + lower, upper = data_mean - cut_off, data_mean + cut_off + + data[torch.logical_or(data > upper, data < lower)] = np.nan + data_mean, value_num = calc_mean(data, dim=dim) + data_std = calc_std(data, dim=dim, mean_v=data_mean, value_num=value_num) + cut_off = data_std * std_sigma + lower, upper = data_mean - cut_off, data_mean + cut_off + + x = torch.maximum(-torch.log(1 + torch.abs(x)) + lower, x) + x = torch.minimum(torch.log(1 + torch.abs(x)) + upper, x) + + return x, lower, upper + +def normalize_mean0_std1( + x:torch.Tensor, + eval_pos:int=-1, + clip:bool=True, + dim:int=1, + mean: torch.Tensor | None = None, + std: torch.Tensor | None = None + ): + if mean is None: + mean, value_num = calc_mean(x[:,:eval_pos], dim=dim) + std = calc_std(x[:,:eval_pos], dim=dim, mean_v=mean, value_num=value_num) + 1e-20 + + if x.shape[1] == 1 or eval_pos == 1: + std[:] = 1.0 + x = (x - mean.unsqueeze(1).expand_as(x)) / 
std.unsqueeze(1).expand_as(x) + if clip: + x = torch.clip(x, min=-100, max=100) + return x, mean, std + + +class LinearEncoder(nn.Module): + """linear input encoder""" + def __init__( + self, + num_features: int, + emsize: int, + nan_to_zero: bool = False, + bias: bool = True, + in_keys:list[str]=['data'], + out_key:str='data', + ): + """Initialize the LinearEncoder. + + Args: + num_features: The number of input features. + emsize: The embedding size, i.e. the number of output features. + nan_to_zero: Whether to replace NaN values in the input by zero. Defaults to False. + bias: Whether to use a bias term in the linear layer. Defaults to True. + """ + super().__init__() + self.layer = nn.Linear(num_features, emsize, bias=bias) + self.nan_to_zero = nan_to_zero + self.in_keys = in_keys + self.out_key = out_key + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + assert 'data' in input and 'nan_encoding' in input + x = [input[key] for key in self.in_keys] + x = torch.cat(x, dim=-1) # type: ignore + if self.nan_to_zero: + x = torch.nan_to_num(x, nan=0.0) + + input[self.out_key] = self.layer(x) + return input + +class MLPEncoder(nn.Module): + """MLP input encoder""" + def __init__( + self, + num_features: int, + emsize: int, + nan_to_zero: bool = False, + bias: bool = True, + in_keys: list[str] = ['data'], + out_key: str = 'data', + ): + """Initialize the MLPEncoder. + + Args: + num_features: The number of input features. + emsize: The embedding size, i.e. the number of output features. + nan_to_zero: Whether to replace NaN values in the input by zero. Defaults to False. + bias: Whether to use a bias term in the linear layer. Defaults to True. + """ + super().__init__() + self.layer = nn.Sequential( + nn.Linear(num_features, emsize * 2, bias=bias), + nn.LayerNorm(emsize * 2), + nn.GELU(), + nn.Linear(emsize * 2, emsize, bias=bias), + nn.LayerNorm(emsize), + nn.GELU() + ) + self.nan_to_zero = nan_to_zero + self.in_keys = in_keys + self.out_key = out_key + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + assert 'data' in input and 'nan_encoding' in input + x = [input[key] for key in self.in_keys] + x = torch.cat(x, dim=-1) # type: ignore + if self.nan_to_zero: + x = torch.nan_to_num(x, nan=0.0) + input[self.out_key] = x + return input + +class MaskEmbEncoder(nn.Module): + """ + For masked features, use the mask vector to obtain their representations; + for numerical features, use a nonlinear network to obtain their representations + """ + def __init__( + self, + num_features: int, + emsize: int, + mask_embedding_size: int, + nan_to_zero: bool = False, + bias: bool = True, + in_keys: list[str] = ['data'], + out_key: str = 'data', + ): + """Initialize the MaskEmbEncoder. + + Args: + num_features: The number of input features. + emsize: The embedding size, i.e. the number of output features. + nan_to_zero: Whether to replace NaN values in the input by zero. Defaults to False. + bias: Whether to use a bias term in the linear layer. Defaults to True. 
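+            mask_embedding_size: Size of the shared learnable embedding vector used for masked (NaN) positions.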
+ """ + super().__init__() + self.embedding_dim = emsize + self.mask_embedding_size = mask_embedding_size + self.in_keys = in_keys + self.out_key = out_key + + # All masked positions use the same vector + self.mask_embedding = nn.Parameter(torch.randn(self.mask_embedding_size)) + + # MLP for numerical features: input is 1, output is embedding_dim + self.numeric_mlp = nn.Sequential( + nn.Linear(1, self.embedding_dim // 2, bias=bias), + nn.LayerNorm(self.embedding_dim // 2), + nn.ReLU(), + nn.Linear(self.embedding_dim // 2, self.embedding_dim, bias=bias), + nn.LayerNorm(self.embedding_dim), + nn.ReLU() + ) + + # Merging layer: maps the concatenated feature vectors back to embedding_dim. + self.fusion_network = nn.Sequential( + nn.Linear(num_features * self.embedding_dim, self.embedding_dim, bias=bias), + nn.LayerNorm(self.embedding_dim), + nn.ReLU(), + nn.Linear(self.embedding_dim, self.embedding_dim, bias=bias), + nn.LayerNorm(self.embedding_dim) + ) + self.nan_to_zero = nan_to_zero + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + assert 'data' in input and 'nan_encoding' in input + x = [input[key] for key in self.in_keys] + x = torch.cat(x, dim=-1) # type: ignore + batch_size, seq_len, group, feature_num = x.shape + x_flat = x.view(-1, feature_num) + is_mask = torch.isnan(x_flat) + feature_embeddings = [] + for i in range(feature_num): + feat_vals = x_flat[:, i].unsqueeze(-1) + feat_is_mask = is_mask[:, i].unsqueeze(-1) + + # Processing numerical features + numeric_input = torch.where(~feat_is_mask, feat_vals, torch.zeros_like(feat_vals)) + numeric_emb = self.numeric_mlp(numeric_input) + + # Construct mask embedding + mask_emb = self.mask_embedding.expand(numeric_emb.shape[0], -1) + + # Merge the embedding results of masked features and numerical features + combined_emb = torch.where(feat_is_mask.expand_as(numeric_emb), mask_emb, numeric_emb) + feature_embeddings.append(combined_emb) + concat_vector = torch.cat(feature_embeddings, dim=-1) + + sample_representation = self.fusion_network(concat_vector) + output = sample_representation.view(batch_size, seq_len, group, -1) + + + input[self.out_key] = output + return input + +class NanEncoder(nn.Module): + """Encoder stage that deals with NaN and infinite values in the input""" + def __init__( + self, + nan_value: float = -2.0, + inf_value: float = 2.0, + neg_info_value: float = 4.0, + in_keys:list[str]=['data'], + out_key:str='nan_encoding' + ): + """Initialize the NanEncoder. + + Args: + keep_nans: Flag to maintain NaN values as individual indicators. 
+ """ + super().__init__() + self.nan_value = nan_value + self.inf_value = inf_value + self.neg_info_value = neg_info_value + self.in_keys = in_keys + self.out_key = out_key + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + x:torch.Tensor = input[self.in_keys[0]] # type: ignore + eval_pos = input['eval_pos'] + + mean_value, _ = calc_mean(x[:,:eval_pos,:], dim=1) + + nans_indicator = torch.zeros_like(x, dtype=x.dtype) + nans_indicator[torch.isnan(x)] = self.nan_value + pos_inf_mask = torch.isinf(x) & (torch.sign(x) == 1) + nans_indicator[pos_inf_mask] = self.inf_value + neg_inf_mask = torch.isinf(x) & (torch.sign(x) == -1) + nans_indicator[neg_inf_mask] = self.neg_info_value + nan_mask = torch.logical_or(torch.isnan(x), torch.isinf(x)) + # avoid inplace operations + x = x.clone() + x[nan_mask] = mean_value.unsqueeze(1).expand_as(x)[nan_mask] + + input[self.in_keys[0]] = x + input[self.out_key ] = nans_indicator + return input + + +class ValidFeatureEncoder(nn.Module): + """Valid feature encoder""" + def __init__( + self, + num_features: int, + nan_normalize: bool=True, + sqrt_normalize: bool=True, + in_keys:list[str]=['data'], + out_key:str='data' + ): + """Initialize the ValidFeatureEncoder. + + Args: + num_features: The target number of features to transform the input into. + nan_normalize: Indicates whether to normalize based on the number of features actually used. + sqrt_normalize: Legacy option to normalize using the square root rather than the count of used features. + """ + super().__init__() + self.num_features = num_features + self.nan_normalize = nan_normalize + self.sqrt_normalize = sqrt_normalize + self.in_keys = in_keys + self.out_key = out_key + self.valid_feature_num = None + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + x:torch.Tensor = input[self.in_keys[0]] # type: ignore + valid_feature = ~torch.all(x == x[:, 0:1, :], dim=1) + self.valid_feature_num = torch.clip(valid_feature.sum(-1).unsqueeze(-1),min=1) + + if self.nan_normalize: + if self.sqrt_normalize: + x = x * torch.sqrt(self.num_features / self.valid_feature_num).unsqueeze(1).expand_as(x) + else: + x = x * (self.num_features / self.valid_feature_num) + + zeros = torch.zeros( + *x.shape[:-1], + self.num_features - x.shape[-1], + device=x.device, + dtype=x.dtype, + ) + x = torch.cat([x, zeros], -1) + + input[self.out_key] = x + return input + + +class EmbYEncoderStep(nn.Module): + """A simple linear input encoder step.""" + + def __init__( + self, + *, + emsize: int, + n_classes: int = 10, + in_keys: list[str] = ['data'], + out_key: str = 'data', + ): + """Initialize the EmbYEncoderStep. + + Args: + emsize: The embedding size, i.e. the number of output features. + n_classes: Number of classes + """ + super().__init__() + + # Ensure the embedding dimension is large enough to support orthogonal initialization. + assert emsize > n_classes + 1, (f"emsize ({emsize}) must be >= n_classes+1 ({n_classes+1}) for orthogonal initialization") + + # Generate an orthogonal matrix of size (n_classes + 1) × emsize + ortho_matrix = torch.empty(n_classes + 1, emsize) + orthogonal_(ortho_matrix) # Initialize in-place as an orthogonal matrix + + # Decompose the matrix: the first n_classes rows are used for y_embedding, and the last row is used for y_mask. 
+ y_embed_weights = ortho_matrix[:n_classes, :] # Shape (n_classes, emsize) + y_mask_weight = ortho_matrix[n_classes:n_classes+1, :] # Shape (1, emsize) + + self.y_embedding = nn.Embedding(n_classes, emsize) + self.y_embedding.weight.data = y_embed_weights.clone() + + self.y_mask = nn.Embedding(1, emsize) + self.y_mask.weight.data = y_mask_weight.clone() + self.in_keys = in_keys + self.out_key = out_key + if len(self.in_keys) > 1: + print("Warning: The EmbYEncoderStepl function is only for processing Y, and in_keys must contain exactly one key.") + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + y = input[self.in_keys[0]] + eval_pos = input['eval_pos'] + y = y.int() # type: ignore + y_train = y[:,:eval_pos] + y_test = torch.zeros_like(y[:, eval_pos:], dtype=torch.int) + y_train_emb = self.y_embedding(y_train).to(torch.float16) + y_test_emb = self.y_mask(y_test).to(torch.float16) + y_emb = torch.cat([y_train_emb, y_test_emb], dim=1) + + input[self.out_key] = y_emb + return input + +class MulticlassTargetEncoder(nn.Module): + """Use the target's index as the class value, with each class corresponding to an index""" + def __init__( + self, + in_keys:list[str]=['data'], + out_key:str='data' + ): + """Initialize the ValidFeatureEncoder. + + Args: + in_keys: the keys of the input parameter + out_key: the key of the output result. + """ + super().__init__() + self.in_keys = in_keys + self.out_key = out_key + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + x:torch.Tensor = input[self.in_keys[0]] # type: ignore + eval_pos = input['eval_pos'] + unique_xs = [ + torch.unique(x[b, :eval_pos]) for b in range(x.shape[0]) + ] + x_ = x.clone() + for b in range(x.shape[0]): + x_[b, :, :] = (x[b, :, :].unsqueeze(-1) > unique_xs[b]).sum(dim=-1) + + input[self.out_key] = x_ + return input + +class NormalizationEncoder(nn.Module): + """normalize encoder""" + def __init__( + self, + train_only:bool, + normalize_x:bool, + remove_outliers:bool, + std_sigma:float=4.0, + in_keys:list[str]=['data'], + out_key:str='data' + + ): + super().__init__() + self.train_only = train_only + self.normalize_x = normalize_x + self.remove_outliers = remove_outliers + self.std_sigma = std_sigma + self.in_keys = in_keys + self.out_key = out_key + self.mean = None + self.std = None + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + x = input[self.in_keys[0]] + eval_pos = input['eval_pos'] + pos = eval_pos if self.train_only else -1 + if self.remove_outliers: + x, lower, upper = drop_outliers(x, eval_pos=pos, std_sigma=self.std_sigma) + if self.normalize_x: + x, self.mean, self.std = normalize_mean0_std1(x, eval_pos=pos ) + + input[self.out_key] = x + return input + + + +def get_x_encoder( + *, + num_features: int, + embedding_size: int, + mask_embedding_size: int, + encoder_use_bias: bool, + in_keys: list = ['data'] +): + inputs_to_merge = {} + for in_key in in_keys: + inputs_to_merge[in_key] = {'dim': num_features} + + encoder_steps = [ + MaskEmbEncoder( + num_features=sum([i["dim"] for i in inputs_to_merge.values()]), + emsize=embedding_size, + mask_embedding_size=mask_embedding_size, + bias=encoder_use_bias, + ), + ] + + return nn.Sequential(*encoder_steps,) + + +def get_cls_y_encoder( + *, + num_inputs: int, + embedding_size: int, + nan_handling_y_encoder: bool, + max_num_classes: int +) -> nn.Module: + steps = [] + inputs_to_merge = [{"name": "data", "dim": num_inputs}] + if nan_handling_y_encoder: + steps += 
[NanEncoder(in_keys=['data'], out_key='nan_encoding')] + inputs_to_merge += [{"name": "nan_indicators", "dim": num_inputs}] + + if max_num_classes >= 2: + steps += [MulticlassTargetEncoder()] + + steps += [ + EmbYEncoderStep( + emsize=embedding_size, + n_classes=max_num_classes + ) + ] + return nn.Sequential(*steps) + +def get_reg_y_encoder( + *, + num_inputs: int, + embedding_size: int, + nan_handling_y_encoder: bool, + max_num_classes: int +) -> nn.Module: + steps = [] + inputs_to_merge = [{"name": "data", "dim": num_inputs}] + if nan_handling_y_encoder: + steps += [NanEncoder(in_keys=['data'], out_key='nan_encoding')] + inputs_to_merge += [{"name": "nan_indicators", "dim": num_inputs}] + + steps += [ + LinearEncoder( + num_features=sum([i["dim"] for i in inputs_to_merge]), # type: ignore + emsize=embedding_size, + in_keys=['data', 'nan_encoding'], + out_key='data' + ), + ] + return nn.Sequential(*steps) + + +def preprocesss_4_x( + *, + num_features: int, + nan_handling_enabled: bool, + normalize_on_train_only: bool, + normalize_x: bool, + remove_outliers: bool, + normalize_by_used_features: bool, + ): + """feature preprocess""" + inputs_to_merge = {"data": {"dim": num_features}} + + preprocess_steps = [] + + # Obtain the positions of features with NaN and Inf values, and replace these features with the mean of the corresponding feature + preprocess_steps += [NanEncoder(in_keys=['data'], out_key='nan_encoding')] + + if nan_handling_enabled: + inputs_to_merge["nan_encoding"] = {"dim": num_features} + preprocess_steps += [ + # Zero values are added to convert the input into a fixed number of features, without normalization (variance is not constant). + # This transformation is applied to the nan_indicators set, which shares the same shape as x. + # However, since x has been imputed prior to this step, this operation is theoretically redundant. + ValidFeatureEncoder( + num_features=num_features, + nan_normalize=False, + in_keys=["nan_encoding"], + out_key="nan_encoding" + ), + ] + + preprocess_steps += [ + NormalizationEncoder( + train_only=normalize_on_train_only, + normalize_x=normalize_x, + remove_outliers=remove_outliers, + ), + ] + + preprocess_steps += [ + # Convert the input into a fixed number of features by adding zero values, with normalization applied (variance is constant). + ValidFeatureEncoder( + num_features=num_features, + nan_normalize=normalize_by_used_features, + ), + ] + + return nn.Sequential(*preprocess_steps) \ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/model/layer.py b/tabrepo/benchmark/models/ag/limix/LimiX/model/layer.py new file mode 100644 index 00000000..85ba11d2 --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/model/layer.py @@ -0,0 +1,461 @@ +from typing import Callable, Literal, Optional +import functools + +import torch +import torch.nn as nn +from torch.utils.checkpoint import checkpoint + +try: + from flash_attn.flash_attn_interface import flash_attn_varlen_kvpacked_func, flash_attn_varlen_qkvpacked_func + + HAVE_FLASH_ATTN = True +except (ModuleNotFoundError, ImportError): + HAVE_FLASH_ATTN = False + +from functools import partial +from typing_extensions import override + +Activation = Literal['gelu'] + +ACTIVATION_FN: dict[str, Callable[[torch.Tensor], torch.Tensor]] = { + 'gelu': nn.GELU(), + 'relu': nn.ReLU(), +} + +class LayerNormMixedPrecision(nn.LayerNorm): + """ + When the embedding dimension is below 512, use half precision for computation to improve performance. 
+ If the embedding dimension exceeds 512, it may cause training instability. + """ + def forward(self, input: torch.Tensor): + if input.dtype == torch.float16 and sum(self.normalized_shape) < 512: + with torch.amp.autocast("cuda" if input.is_cuda else "cpu", enabled=False): + return super().forward(input) + else: + return super().forward(input) + +class MLP(torch.nn.Module): + """Multi-Layer Perceptron""" + def __init__(self, + in_features: int, + hidden_size:int, + out_features: int, + has_bias:bool, + device: torch.device | None, + dtype: torch.dtype | None, + activation: Activation = 'gelu', + depth:int=2): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.activation = activation + self.layers = [] + + if depth == 1: + self.layers.append(nn.Linear(in_features, out_features, bias=has_bias, device=device, dtype=dtype)) + else: + # input layer + self.layers.append(nn.Linear(in_features, hidden_size, bias=has_bias, device=device, dtype=dtype)) + self.layers.append(ACTIVATION_FN[self.activation]) + # hidden layers + for i in range(depth - 2): + self.layers.append(nn.Linear(hidden_size, hidden_size, bias=has_bias, device=device, dtype=dtype)) + self.layers.append(ACTIVATION_FN[self.activation]) + # output layer + self.layers.append(nn.Linear(hidden_size, out_features, bias=has_bias, device=device, dtype=dtype)) + torch.nn.init.normal_(self.layers[-1].weight) + self.mlp = nn.Sequential(*self.layers) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.mlp(x) + +class MultiheadAttention(torch.nn.Module): + def __init__( + self, + embed_dim: int, + num_heads: int, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + qkv_combined: bool = True, + dropout:float=0, + recompute:bool=False + ): + super().__init__() + assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads" + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.qkv_combined = qkv_combined + self.dropout = dropout + self.recompute = recompute + self.device = device + self.dtype = dtype + + self.out_proj_weight = torch.nn.Parameter(torch.empty(self.num_heads, self.head_dim, self.embed_dim, device=self.device, dtype=self.dtype)) + self.qkv_proj_weight = torch.nn.Parameter(torch.empty(3, self.num_heads, self.head_dim, self.embed_dim, device=device, dtype=dtype)) + + torch.nn.init.normal_(self.out_proj_weight) + nn.init.xavier_uniform_(self.qkv_proj_weight) + + self.q_proj_weight = None + self.kv_proj_weight = None + + if recompute: + self.forward = partial(checkpoint, self.forward, use_reentrant=False) # type: ignore + + def get_cu_seqlens(self, batch_size: int, seqlen: int, device: torch.device) -> torch.Tensor: + return torch.arange( + 0, + (batch_size + 1) * seqlen, + step=seqlen, + dtype=torch.int32, + device=device, + ) + + def compute_attention_by_torch(self, qkv:torch.Tensor|None, q:torch.Tensor|None, kv:torch.Tensor|None, attn_mask:torch.Tensor|None) -> torch.Tensor: + '''Since flash attention does not support attn_mask, use scaled_dot_product_attention to compute attention when attn_mask is not None''' + if qkv is not None: + q, k, v = qkv.unbind(dim=-3) + elif kv is not None and q is not None: + k,v = kv.unbind(dim=-3) + else: + raise ValueError("When qkv is None, q and kv cannot both be None at the same time") + assert q is not None and k is not None and v is not None, "q, k, and v must not be None" + + attention_outputs = 
torch.nn.functional.scaled_dot_product_attention( + q.transpose(1, 2), + k.transpose(1, 2), + v.transpose(1, 2), + attn_mask=attn_mask, + dropout_p=self.dropout, + ) + attention_outputs = attention_outputs.transpose(1, 2) + return attention_outputs + + def compute_attention_by_flashattn(self, qkv:torch.Tensor|None, q:torch.Tensor|None, kv:torch.Tensor|None) -> torch.Tensor: + "Compute attention using flash attention" + assert HAVE_FLASH_ATTN, "Flash attention is not supported. Please install/reinstall flash attention." + if self.qkv_combined and qkv is not None: + B,S = qkv.shape[:2] + atten_out = flash_attn_varlen_qkvpacked_func( # type: ignore + qkv.reshape(B * S, 3, self.num_heads, self.head_dim), + self.get_cu_seqlens(B, S, qkv.device), + S, + dropout_p=self.dropout, + softmax_scale=None, + causal=False, + return_attn_probs=False, + deterministic=False, + ) + elif not self.qkv_combined and q is not None and kv is not None: + B,S = q.shape[:2] + kv_shape = kv.shape + atten_out = flash_attn_varlen_kvpacked_func( # type: ignore + q.reshape(B * S, self.num_heads, self.head_dim), + kv.reshape(B * kv_shape[1], 2, self.num_heads, self.head_dim), + self.get_cu_seqlens(B, S, q.device), + self.get_cu_seqlens(B, kv_shape[1], kv.device), + S, + kv_shape[1], + dropout_p=self.dropout, + causal=False, + return_attn_probs=False, + deterministic=False, + ) + return atten_out # type: ignore + + @override + def forward(self, + x: torch.Tensor, + x_kv: Optional[torch.Tensor] = None, + copy_first_head_kv: bool = False, + attn_mask: torch.Tensor | None = None, + calculate_sample_attention:bool=False, + calculate_feature_attention:bool=False) -> tuple[torch.Tensor,torch.Tensor | None,torch.Tensor | None]: + """ + x: [batch_size, seq_len, feature, embed_dim] + kv: Optional[batch_size, seq_len_kv, feature, embed_dim] — only needed if qkv_combined=False + copy_first_head: Reuse the results from the first attention head + """ + # feature attention: [B S F E] + # item attention: [B F S E] + # B, T, C = x.shape + B, S, _, _ = x.shape + assert x.shape[-1] == self.embed_dim + + x = x.reshape(-1, *x.shape[-2:]) + BS, F, E = x.shape + + qkv = None + q = None + kv = None + feature_attention=None + sample_attention=None + # batch_size = None + # seqlen = None + if self.qkv_combined: + qkv = torch.einsum("... s, j h d s -> ... j h d", x, self.qkv_proj_weight) + else: + self.q_proj_weight = self.qkv_proj_weight[0] + self.kv_proj_weight = self.qkv_proj_weight[1:] + assert x_kv is not None, "kv combined attention requires kv input" + x_kv = x_kv.reshape(-1, *x_kv.shape[-2:]) + q = torch.einsum("... s, h d s -> ... h d", x, self.q_proj_weight) + if copy_first_head_kv: + kv_weights = self.kv_proj_weight[:,:1] + kv = torch.einsum("... s, j h d s -> ... j h d", x_kv, kv_weights) + expand_shape = [-1 for _ in kv.shape] + expand_shape[-2] = self.num_heads + kv = kv.expand(*expand_shape) + else: + kv = torch.einsum("... s, j h d s -> ... 
j h d", x_kv, self.kv_proj_weight) + + if attn_mask is None and HAVE_FLASH_ATTN: + atten_out = self.compute_attention_by_flashattn(qkv, q, kv) + else: + atten_out = self.compute_attention_by_torch(qkv, q, kv, attn_mask) + + atten_out = atten_out.reshape(BS, F, self.num_heads, self.head_dim) + + if qkv is not None: + q, k, v = qkv.unbind(dim=2) + else: + k,v=kv.unbind(dim=2) + if calculate_feature_attention: + logits = torch.einsum("b q h d, b k h d -> b q k h", q, k) + logits *= ( + torch.sqrt(torch.tensor(1.0 / (q.shape[-1]*q.shape[-2]))).to(k.device) + ) + ps = torch.softmax(logits, dim=2).to(torch.float16) + del logits + feature_attention = torch.mean(ps, dim=-1) + del ps + if calculate_sample_attention: + logits = torch.einsum("b q h d, b k h d -> b q k h", q, k) + logits *= ( + torch.sqrt(torch.tensor(1.0 / (q.shape[-1] * q.shape[-2]))).to(k.device) + ) + ps = torch.softmax(logits, dim=2).to(torch.float16) + del logits + sample_attention = torch.mean(ps, dim=-1) + del ps + out = torch.einsum( + "... h d, h d s -> ... s", + atten_out, + self.out_proj_weight, + ) + + return out.reshape(B, S, *out.shape[1:]),feature_attention,sample_attention + +class EncoderBaseLayer(nn.Module): + "Base encoder layer of the Transformer model" + def __init__(self, + nhead: int, + embed_dim: int, + hid_dim:int, + dropout: float=0, + activation: str='gelu', + layer_norm_eps: float=1e-5, + device: torch.device|None=None, + dtype: torch.dtype|None=None, + recompute_attn: bool=False, + calculate_sample_attention: bool = False, + calculate_feature_attention: bool = False, + ): + super().__init__() + self.nhead = nhead + self.embed_dim = embed_dim + self.hid_dim = hid_dim + self.dropout = dropout + self.activation = activation + self.layer_norm_eps = layer_norm_eps + self.device = device + self.dtype = dtype + self.head_dim = self.embed_dim // self.nhead + self.recompute_attn = recompute_attn + + self.feature_attentions = [] + self.sequence_attentions = [] + self.mlp = [] + self.feature_attn_num = 1 # feature attention number + self.items_attn_num = 1 # items attention number + self.mlp_num = 1 # mlp number + self.calculate_sample_attention = calculate_sample_attention + self.calculate_feature_attention = calculate_feature_attention + self.feature_attn_num = 2 + self.mlp_num = 3 + + # attention+MLP + self.feature_attentions = nn.ModuleList( + [ + MultiheadAttention( + embed_dim=self.embed_dim, + num_heads=self.nhead, + device=self.device, + dtype=self.dtype, + qkv_combined=True, + dropout=self.dropout, + recompute=self.recompute_attn, + ) + for _ in range(self.feature_attn_num) + ] + ) + self.sequence_attentions = nn.ModuleList( + [ + MultiheadAttention( + embed_dim=self.embed_dim, + num_heads=self.nhead, + device=self.device, + dtype=self.dtype, + qkv_combined=False, + dropout=self.dropout, + recompute=self.recompute_attn, + ) + for _ in range(self.items_attn_num) + ] + ) + self.mlp = nn.ModuleList( + [ + MLP( + in_features=self.embed_dim, + hidden_size=self.hid_dim, + out_features=self.embed_dim, + has_bias=False, + device=self.device, + dtype=self.dtype, + activation=self.activation, + depth=2, + ) + for _ in range(self.mlp_num) + ] + ) + + self.layer_steps = [ + partial( + self.call_features_attention, + index=0 + ), + self.mlp[0], + partial( + self.call_features_attention, + index=1 + ), + self.mlp[1], + partial( + self.call_sequence_attention, + index=0 + ), + self.mlp[2] + ] + + self.layer_norms = nn.ModuleList( + [ + LayerNormMixedPrecision(normalized_shape=self.embed_dim, eps=self.layer_norm_eps, + 
+                                        elementwise_affine=False, device=self.device, dtype=self.dtype)
+                for _ in range(len(self.layer_steps))
+            ]
+        )
+
+    def create_attn_mask(self, q_mask: torch.Tensor, k_mask: torch.Tensor) -> torch.Tensor:
+        """
+        Create a boolean attention mask (True marks pairs that must NOT attend)
+
+        Args:
+            q_mask (torch.Tensor): Query sequence mask, with shape [batch_size, head_count, q_seq_len]
+            k_mask (torch.Tensor): Key sequence mask, with shape [batch_size, head_count, k_seq_len]
+
+        Returns:
+            torch.Tensor: attention mask, with shape [batch_size, head_count, q_seq_len, k_seq_len]
+        """
+        _, _, q_seq_len = q_mask.shape
+        _, _, k_seq_len = k_mask.shape
+
+        q_mask_bool = q_mask.bool()  # [batch_size, head_count, q_seq_len]
+        k_mask_bool = k_mask.bool()  # [batch_size, head_count, k_seq_len]
+
+        q_expanded = q_mask_bool.unsqueeze(-1)
+        k_expanded = k_mask_bool.unsqueeze(-2)
+
+        # A query/key pair is valid only if both positions are unmasked; the
+        # returned mask is True where attention must be suppressed.
+        valid_attn = q_expanded & k_expanded
+        attn_mask = ~valid_attn
+        _, _, q_seq_len, k_seq_len = attn_mask.shape
+        attn_mask = attn_mask.reshape(-1, q_seq_len, k_seq_len)
+        attn_mask = attn_mask.unsqueeze(1).expand(-1, 6, -1, -1)  # NOTE: head count is hard-coded to 6 upstream
+
+        return attn_mask
+
+    def call_features_attention(self, x: torch.Tensor, feature_atten_mask: torch.Tensor | None, eval_pos: int,
+                                index: int = 0, calculate_feature_attention: bool = False):
+        assert len(self.feature_attentions) > index
+        attn_mask = None
+        if feature_atten_mask is not None:
+            attn_mask = self.create_attn_mask(feature_atten_mask, feature_atten_mask)
+        return self.feature_attentions[index](
+            x,
+            x_kv=None,
+            attn_mask=attn_mask,
+            calculate_feature_attention=calculate_feature_attention
+        )
+
+    def call_sequence_attention(self, x: torch.Tensor, feature_atten_mask: torch.Tensor | None, eval_pos: int,
+                                index: int = 0, calculate_sample_attention: bool = False):
+        assert len(self.sequence_attentions) > index
+        sample_attention = None
+        if eval_pos < x.shape[1]:
+            # Test rows attend to the training prefix only.
+            x_test, _, sample_attention = self.sequence_attentions[index](
+                x=x[:, eval_pos:].transpose(1, 2),
+                x_kv=x[:, :eval_pos].transpose(1, 2),
+                copy_first_head_kv=True,
+                calculate_sample_attention=calculate_sample_attention
+            )
+            x_test = x_test.transpose(1, 2)
+        else:
+            x_test = None
+            print("Warning: eval_pos >= x.shape[1]; there are no test rows to attend over.")
+        # Training rows attend among themselves (full self-attention on the prefix).
+        x_train = self.sequence_attentions[index](
+            x=x[:, :eval_pos].transpose(1, 2),
+            x_kv=x[:, :eval_pos].transpose(1, 2)
+        )[0].transpose(1, 2)
+
+        if x_test is not None:
+            return torch.cat([x_train, x_test], dim=1), None, sample_attention
+        # Return a consistent 3-tuple so callers can always unpack the result.
+        return x_train, None, sample_attention
+
+    def forward(self, x: torch.Tensor, feature_atten_mask: torch.Tensor, eval_pos: int, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+        feature_attention = None
+        sample_attention = None
+        for idx, (sublayer, layer_norm) in enumerate(zip(self.layer_steps, self.layer_norms)):
+            residual = x
+            x = layer_norm(x)
+            # Attention maps are only materialized for the final (12th) layer.
+            if idx == 2 and self.calculate_feature_attention and layer_idx == 11:
+                x, feature_attention, _ = sublayer(x, feature_atten_mask, eval_pos, calculate_feature_attention=True)
+            elif idx == 4 and self.calculate_sample_attention and layer_idx == 11:
+                x, _, sample_attention = sublayer(x, feature_atten_mask, eval_pos, calculate_sample_attention=True)
+            else:
+                if isinstance(sublayer, functools.partial):
+                    x = sublayer(x, feature_atten_mask, eval_pos)
+                else:
+                    x = sublayer(x)
+                if isinstance(x, tuple):
+                    x = x[0]
+            x = x + residual
+        return x, feature_attention, sample_attention
+
+class LayerStack(nn.Module):
+    """
+    A flexible container module, similar to ``nn.Sequential``, that passes
+    keyword arguments through to each layer.
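+
+    A minimal usage sketch (the ``EncoderBaseLayer`` arguments and tensor
+    shapes below are illustrative assumptions, not values from a released
+    LimiX configuration)::
+
+        layers = [EncoderBaseLayer(nhead=4, embed_dim=64, hid_dim=128) for _ in range(2)]
+        stack = LayerStack(layers)
+        x = torch.randn(8, 32, 5, 64)  # [batch, seq, feature_groups, embed_dim]
+        out, feat_attn, samp_attn = stack(x, feature_atten_mask=None, eval_pos=24)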
+ """ + def __init__(self, layers: list[nn.Module]): + super().__init__() + self.layers = nn.ModuleList(layers) + + def forward(self, x, **kwargs): + for idx,layer in enumerate(self.layers): + kwargs["layer_idx"] = idx + x,feature_attention,sample_attention = layer(x,**kwargs) + return x,feature_attention,sample_attention diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/model/transformer.py b/tabrepo/benchmark/models/ag/limix/LimiX/model/transformer.py new file mode 100644 index 00000000..75ea811d --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/model/transformer.py @@ -0,0 +1,285 @@ +import torch +import torch.nn as nn +from tabrepo.benchmark.models.ag.limix.LimiX.model.layer import EncoderBaseLayer, MLP, LayerStack +from typing import Any, Literal +from tabrepo.benchmark.models.ag.limix.LimiX.model.encoders import get_x_encoder, get_cls_y_encoder, get_reg_y_encoder, preprocesss_4_x + + + + +class FeaturesTransformer(nn.Module): + def __init__( + self, + *, + preprocess_config_x:dict[str, Any], + encoder_config_x:dict[str, Any], + encoder_config_y:dict[str, Any], + decoder_config:dict[str, Any], + nlayers:int, + nhead: int, + embed_dim: int, + hid_dim:int, + mask_prediction: bool = False, + features_per_group:int = 2, + dropout: float=0, + activation: str='gelu', + layer_norm_eps: float=1e-5, + device: torch.device|None=None, + dtype: torch.dtype|None=None, + recompute_attn: bool=False, + calculate_sample_attention: bool = False, + calculate_feature_attention: bool = False + ): + super().__init__() + + self.preprocess_config_x = preprocess_config_x + self.encoder_config_x = encoder_config_x + self.encoder_config_y = encoder_config_y + self.decoder_config = decoder_config + self.nlayers = nlayers + self.nhead = nhead + self.embed_dim = embed_dim + self.hid_dim = hid_dim + self.mask_prediction = mask_prediction + self.features_per_group = features_per_group + self.dropout = dropout + self.activation = activation + self.layer_norm_eps = layer_norm_eps + self.device = device + self.dtype = dtype + self.recompute_attn = recompute_attn + + layer_creator = lambda: EncoderBaseLayer( + embed_dim=self.embed_dim, + hid_dim=self.hid_dim, + nhead=self.nhead, + dropout=self.dropout, + activation=self.activation, # type: ignore + layer_norm_eps=self.layer_norm_eps, + device=self.device, + dtype=self.dtype, + recompute_attn=self.recompute_attn, + calculate_sample_attention=calculate_sample_attention, + calculate_feature_attention=calculate_feature_attention + ) + + self.encoder_x = get_x_encoder( **encoder_config_x) + self.cls_y_encoder = get_cls_y_encoder(**encoder_config_y) + self.reg_y_encoder = get_reg_y_encoder(**encoder_config_y) + + self.transformer_encoder = LayerStack([layer_creator() for _ in range(self.nlayers)]) + self.encoder_out_norm = nn.LayerNorm(self.embed_dim, eps=1e-5, elementwise_affine=False) + + self.cls_y_decoder = nn.Sequential( + nn.Linear(self.embed_dim, self.hid_dim), + nn.GELU(), + nn.Linear(self.hid_dim, decoder_config['num_classes']), + ) + + self.reg_y_decoder = nn.Sequential( + nn.Linear(self.embed_dim, self.hid_dim), + nn.LayerNorm(self.hid_dim), + nn.GELU(), + nn.Linear(self.hid_dim, 1), + ) + self.feature_decoder = nn.Sequential( + nn.Linear(self.embed_dim, self.hid_dim), + nn.LayerNorm(self.hid_dim), + nn.GELU(), + nn.Linear(self.hid_dim, self.features_per_group), + ) + + self.feature_positional_embedding = nn.Linear(self.embed_dim // 4, self.embed_dim) + + self.x_preprocess = preprocesss_4_x(**preprocess_config_x) + self.calculate_sample_attention = 
calculate_sample_attention
+        self.calculate_feature_attention = calculate_feature_attention
+
+    def forward(self, x: torch.Tensor,
+                y: torch.Tensor,
+                eval_pos: int,
+                task_type: Literal['reg', 'cls'] = 'cls') -> torch.Tensor | dict[str, torch.Tensor] | tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+        '''
+        x: The input x, which includes both train x and test x. Shape: [batch, sequence, feature]
+        y: The input y, which includes both train y and test y. Shape: [batch, label]
+        eval_pos: Split point between train and test along the sequence dimension
+        task_type: Type of task, options: cls (classification), reg (regression)
+        '''
+        assert x is not None and y is not None, "x and y must not be None"
+        assert eval_pos > 0, "eval_pos must be a positive number"
+        assert len(x.shape) == 3, "x must be [Batch, seq, Feature] but is {}".format(x.shape)
+        assert len(y.shape) == 2, "y must be [Batch, label]"
+        assert eval_pos < x.shape[1] and eval_pos <= y.shape[1], "The train/test split point must be less than the sequence length of x, and at most the label length of y"
+
+        batch_size, seq_len, num_feature = x.shape
+        x = {'data': x, 'mask': torch.isnan(x).to(torch.int32).to(x.device)}
+        y = {'data': y}
+
+        feature_to_add = num_feature % self.features_per_group
+        if feature_to_add > 0:
+            # Zero-pad the feature dimension of x up to a multiple of features_per_group
+            for k in x:
+                x[k] = torch.cat(
+                    (
+                        x[k],
+                        torch.zeros(
+                            batch_size,
+                            seq_len,
+                            feature_to_add,
+                            device=x[k].device,
+                            dtype=x[k].dtype
+                        )
+                    ),
+                    dim=-1
+                )
+        for k in x:
+            x[k] = x[k].reshape(batch_size, seq_len, x[k].shape[2] // self.features_per_group, self.features_per_group)
+        x['eval_pos'] = eval_pos
+        preprocessed_x = self.x_preprocess(x)
+        preprocessed_x = self.process_4_x(preprocessed_x)
+        x_encoder_result = self.encoder_x(preprocessed_x)
+        x_emb_result = x_encoder_result['data']
+
+        for k in y:
+            # Extend the label dimension of y when it is shorter than the sequence
+            y[k] = y[k].unsqueeze(-1)
+            if y[k].shape[1] < x['data'].shape[1]:
+                y[k] = torch.cat(
+                    (
+                        y[k],
+                        torch.nan
+                        * torch.zeros(
+                            y[k].shape[0],
+                            x["data"].shape[1] - y[k].shape[1],
+                            y[k].shape[2],
+                            device=y[k].device,
+                            dtype=y[k].dtype,
+                        ),
+                    ),
+                    dim=1
+                )
+        # Mask the test y
+        y["data"][eval_pos:] = torch.nan
+
+        if task_type == 'cls':
+            y_type = torch.zeros_like(y['data'], device=y['data'].device)
+        else:
+            y_type = torch.ones_like(y['data'], device=y['data'].device)
+
+        embedded_y = self.mixed_y_embedding(y, y_type=y_type, eval_pos=eval_pos)
+
+        if torch.isnan(embedded_y).any():
+            raise ValueError("embedded_y contains NaN values; please add a NanEncoder in the encoder")
+
+        embedded_x = self.add_embeddings(x_emb_result)
+        embedded_all = torch.cat((embedded_x, embedded_y.unsqueeze(2)), dim=2)
+        if torch.isnan(embedded_all).any():
+            raise ValueError("embedded_all contains NaN values; please add a NanEncoder in the encoder")
+        if self.calculate_sample_attention or self.calculate_feature_attention:
+            # Return the raw (output, feature_attention, sample_attention) tuple.
+            return self.transformer_encoder(embedded_all, feature_atten_mask=None, eval_pos=eval_pos)
+        encoder_out = self.transformer_encoder(embedded_all, feature_atten_mask=None, eval_pos=eval_pos)[0]
+        encoder_out = self.encoder_out_norm(encoder_out)
+
+        test_encoder_out = encoder_out[:, eval_pos:, -1]
+        test_y_type = y_type[:, eval_pos:]
+        encoder_out_4_feature = encoder_out[:, :, :-1, :]
+        if self.mask_prediction:
+            cls_output, reg_output = self.y_decoder(test_encoder_out, test_y_type)
+            feature_pred = self.feature_decoder(encoder_out_4_feature)
+            output_decoded = {
"cls_output": cls_output, + "reg_output": reg_output, + "feature_pred": feature_pred, + "process_config": { + "n_x_padding": feature_to_add, + "features_per_group": self.x_preprocess[3].num_features, + "num_used_features": self.x_preprocess[3].valid_feature_num, + "mean_for_normalization": self.x_preprocess[2].mean, + "std_for_normalization": self.x_preprocess[2].std + } + } + else: + cls_output, reg_output = self.y_decoder(test_encoder_out, test_y_type) + if task_type=="cls": + output_decoded = cls_output + else: + output_decoded = reg_output + + return output_decoded + + + def mixed_y_embedding(self, y:dict, y_type:torch.Tensor, eval_pos:int): + y = y['data'] + seq_len, batch_size, y_num = y.shape + y_flat = y.reshape(-1) + y_type_flat = y_type.reshape(-1) + + idx = torch.arange(len(y_flat), device=y.device) + idx_cls = idx[y_type_flat == 0] + idx_reg = idx[y_type_flat == 1] + y_cls = y_flat[idx_cls] + y_reg = y_flat[idx_reg] + + y_cls = y_cls.reshape(seq_len, -1, y_num) + y_reg = y_reg.reshape(seq_len, -1, y_num) + y_cls = {'data': y_cls, 'eval_pos':eval_pos} + y_reg = {'data': y_reg, 'eval_pos':eval_pos} + + cls_y_emb = self.cls_y_encoder(y_cls) if len(idx_cls) > 0 else None + reg_y_emb = self.reg_y_encoder(y_reg) if len(idx_reg) > 0 else None + cls_y_emb = cls_y_emb['data'] if cls_y_emb is not None else None + reg_y_emb = reg_y_emb['data'] if reg_y_emb is not None else None + + emb_size = self.embed_dim + out = torch.empty(len(y_flat), emb_size, dtype=torch.float16, device=y_flat.device) + if cls_y_emb is not None: + cls_y_emb_flat = cls_y_emb.reshape(-1, emb_size) + out.index_put_((idx_cls,), cls_y_emb_flat) + + if reg_y_emb is not None: + reg_y_emb_flat = reg_y_emb.reshape(-1, emb_size).to(torch.float16) + out.index_put_((idx_reg,), reg_y_emb_flat) + + output = out.reshape(seq_len, batch_size, emb_size) + return output + + def process_4_x(self, data:dict): + x_input = data['data'] + mask = data['mask'].to(torch.bool) + x_input = torch.where(mask, float('nan'), x_input) + data['data'] = x_input + return data + + def add_embeddings(self, x:torch.Tensor): + with torch.cuda.amp.autocast(enabled=False): + embs = torch.randn( + (x.shape[2], x.shape[3] // 4), + device=x.device, + dtype=torch.float32, + ) + torch.nn.init.orthogonal_(embs) + embs =self.feature_positional_embedding(embs.to(x.dtype)) + x += embs[None, None] + return x + + def y_decoder(self, test_encoder_out, test_y_type): + seq_len, _, emb_size = test_encoder_out.shape + flat_test_encoder_out = test_encoder_out.reshape(-1, emb_size) + flat_test_y_type = test_y_type.reshape(-1) + + idx = torch.arange(len(flat_test_encoder_out), device=flat_test_encoder_out.device) + idx_cls = idx[flat_test_y_type == 0] + idx_reg = idx[flat_test_y_type == 1] + + cls_y_encoder_out = flat_test_encoder_out[idx_cls] + reg_y_encoder_out = flat_test_encoder_out[idx_reg] + cls_y_encoder_out = cls_y_encoder_out.reshape(seq_len, -1, emb_size) + reg_y_encoder_out = reg_y_encoder_out.reshape(seq_len, -1, emb_size) + + cls_y = self.cls_y_decoder(cls_y_encoder_out) + reg_y = self.reg_y_decoder(reg_y_encoder_out) + + return cls_y, reg_y + \ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/utils/__init__.py b/tabrepo/benchmark/models/ag/limix/LimiX/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/utils/data_utils.py b/tabrepo/benchmark/models/ag/limix/LimiX/utils/data_utils.py new file mode 100644 index 00000000..ceb48b08 --- /dev/null +++ 
b/tabrepo/benchmark/models/ag/limix/LimiX/utils/data_utils.py @@ -0,0 +1,261 @@ +import os + +import numpy as np +import pandas as pd +import torch +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import MinMaxScaler, LabelEncoder +from torch.utils.data import DataLoader, Dataset +from tqdm import tqdm + +from tabrepo.benchmark.models.ag.limix.LimiX.utils.inference_utils import shuffle_data_along_dim + + +class TabularFinetuneDataset(Dataset): + """ + A custom PyTorch Dataset for fine-tuning, supporting data shuffling and retrieval-based selection. + + This dataset prepares training and testing splits for each item. It can either shuffle the + training data randomly or select training examples based on pre-computed attention scores + (retrieval). For each 'step', it provides a unique training set and a corresponding test set. + """ + + def __init__(self, + X_train: torch.Tensor, + y_train: torch.Tensor, + attention_score: np.ndarray = None, + retrieval_len: int = 2000, + use_retrieval: bool = True, + split_ratio: float = 0.8, + ): + + """ + Initializes the FinetuneDataset. + Args: + X_train (torch.Tensor): The full set of input training data. + y_train (torch.Tensor): The full set of corresponding training labels. + attention_score (np.ndarray, optional): Pre-computed attention scores for retrieval. + Shape: (num_samples_in_X_train,num_samples_in_original_X_test). + Required if use_retrieval is True. + retrieval_len (int, optional): The number of top samples to select based on attention scores. + Used only if use_retrieval is True. + Note: The parameter in init_dataset is named 'train_len'. + use_retrieval (bool, optional): Flag to determine data selection strategy. + If True, uses attention scores for selection. + If False, uses random shuffling. + split_ratio (float, optional): Split ratio for selection strategy. + """ + self.init_dataset(X_train, y_train, attention_score, retrieval_len, use_retrieval, split_ratio) + + def __len__(self): + """ + Returns the number of steps/items in the dataset. + + Returns: + int: The number of steps, which corresponds to the size of the first dimension + of the generated X_test tensor. + """ + return self.max_steps + + def __getitem__(self, idx: int) -> dict[str, list]: + """ + Retrieves a single item (a training/test split configuration) by index. + + Args: + idx (int): The index of the item to retrieve. + + Returns: + dict[str, list]: A dictionary containing the tensors for the training and testing splits + for this specific step/index. + Keys: 'X_train', 'y_train', 'X_test', 'y_test'. 
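+
+        Example (a hedged sketch; tensor shapes and the DataLoader settings
+        are illustrative assumptions)::
+
+            ds = TabularFinetuneDataset(X_train, y_train, use_retrieval=False)
+            loader = DataLoader(ds, batch_size=1)
+            for batch in loader:
+                X_tr, y_tr = batch["X_train"], batch["y_train"]
+                X_te, y_te = batch["X_test"], batch["y_test"]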
+ """ + return dict( + X_train=self.X_train[idx], # Training features for this step + y_train=self.y_train[idx], # Training labels for this step + X_test=self.X_test[idx], # Testing features for this step + y_test=self.y_test[idx], # Testing labels for this step + ) + + def init_dataset(self, + X_train: torch.Tensor, + y_train: torch.Tensor, + attention_score: np.ndarray = None, + train_len: int = 2000, + use_retrieval: bool = False, + split_ratio: float = 0.8, + ): + + if not use_retrieval: + X_train = shuffle_data_along_dim(X_train, 0)[:min(train_len, X_train.shape[0])] + y_train = shuffle_data_along_dim(y_train, 0)[:min(train_len, X_train.shape[0])] + self.X_train = torch.cat([X_train.unsqueeze(0) for _ in range(self.max_steps)], dim=0) + self.y_train = torch.cat([y_train.unsqueeze(0) for _ in range(self.max_steps)], dim=0) + X = self.X_train + y = self.y_train + + # adapt train_test_split mode + split = int(X.shape[1] * split_ratio) + self.X_train = X[:, split:] + self.y_train = y[:, split:] + self.X_test = X[:, :split] + self.y_test = y[:, :split] + self.max_steps = self.X_test.shape[0] + else: + top_k_indices = np.argsort(attention_score)[:, -min(train_len, X_train.shape[0]):] + self.X_train = torch.cat([X_train[x_iter].unsqueeze(0) for x_iter in top_k_indices], dim=0) + self.y_train = torch.cat([y_train[x_iter].unsqueeze(0) for x_iter in top_k_indices], dim=0) + X = shuffle_data_along_dim(self.X_train, 1) + y = shuffle_data_along_dim(self.y_train, 1) + + # adapt train_test_split mode + split = int(X.shape[1] * split_ratio) + self.X_train = X[:, split:] + self.y_train = y[:, split:] + self.X_test = X[:, :split] + self.y_test = y[:, :split] + self.max_steps = self.X_train.shape[0] + + +class TabularInferenceDataset(Dataset): + """ + A PyTorch Dataset for tabular data inference scenarios. + + This dataset is designed to provide data for inference tasks where + you might have a fixed training set and varying test samples, optionally + selecting the training set based on relevance (retrieval) for each test sample. + When retrieval is used, each test sample (or step) is paired with a specific, + potentially unique, subset of the training data. When retrieval is not used, + it's assumed a single, fixed training set is used for all test samples. + """ + + def __init__(self, + X_train: torch.Tensor, + y_train: torch.Tensor, + X_test: torch.Tensor, + attention_score: np.ndarray|torch.Tensor = None, + retrieval_len: int = 2000, + use_retrieval: bool = True, + ): + """ + Initializes the TabularInferenceDataset. + + Args: + X_train (torch.Tensor): The full set of input training features. + Shape: (num_train_samples, ...). + y_train (torch.Tensor): The full set of corresponding training labels. + Shape: (num_train_samples, ...). + X_test (torch.Tensor): The set of input features for inference/test samples. + Shape: (num_test_samples, ...). + attention_score (np.ndarray, optional): Pre-computed attention scores + for retrieval logic. Shape depends + on implementation, e.g., Shape: (num_samples_in_X_train,num_samples_in_X_test). + Required if use_retrieval is True. + retrieval_len (int, optional): The number of top training samples to select + based on attention scores for each test sample. + Used only if use_retrieval is True. + use_retrieval (bool, optional): Flag to determine data preparation strategy. + If True, uses attention scores to select relevant training data + for each test sample. + If False, assumes a fixed training set is used for all. 
+ """ + self.init_dataset(X_train, y_train, X_test, attention_score, retrieval_len, use_retrieval) + # The number of inference steps equals the number of test samples + self.max_steps = self.X_test.shape[0] + self.use_retrieval = use_retrieval + + def __len__(self): + """ + Returns the number of steps/items in the dataset. + Returns: + int: The number of steps, which corresponds to the size of the first dimension + of the generated X_test tensor. + """ + return self.max_steps + + def __getitem__(self, idx: int) -> dict[str, list]: + """ + Retrieves a single item (data for one inference step) by index. + + Args: + idx (int): The index of the test sample/step to retrieve. + + Returns: + dict[str, torch.Tensor]: A dictionary containing the data needed for this inference step. + If `use_retrieval` is True, it includes the specific + `X_train`, `y_train`, and `X_test` for this step. + If `use_retrieval` is False, it only includes `X_test`, + as a fixed training set is assumed. + """ + if self.use_retrieval: + # Return the specific training data selected for this test sample + return dict( + idx=int(idx), + X_train=self.X_train[idx], # Training features for this step (retrieved) + X_test=self.X_test[idx], # Training labels for this step (retrieved) + y_train=self.y_train[idx], # The test sample features + ) + else: + # Return only the test data; training data is assumed to be fixed and + # provided. + return dict( + idx=int(idx), + X_test=self.X_test[idx], + ) + + def init_dataset(self, + X_train: torch.Tensor, + y_train: torch.Tensor, + X_test: torch.Tensor, + attention_score: np.ndarray = None, + train_len: int = 2000, + use_retrieval: bool = False, + ): + if use_retrieval: + print(X_train.shape) + top_k_indices = np.argsort(attention_score)[:, -min(train_len, X_train.shape[0]):] + self.X_train = torch.cat([X_train[x_iter].unsqueeze(0) for x_iter in top_k_indices], dim=0) + self.y_train = torch.cat([y_train[y_iter].unsqueeze(0) for y_iter in top_k_indices], dim=0).unsqueeze(-1) + self.X_test = X_test + else: + self.X_test = X_test + + + + +def load_data(data_root,folder): + le = LabelEncoder() + train_path = os.path.join(data_root,folder, folder + '_train.csv') + test_path = os.path.join(data_root,folder, folder + '_test.csv') + if os.path.exists(train_path): + train_df = pd.read_csv(train_path) + if os.path.exists(test_path): + test_df = pd.read_csv(test_path) + else: + train_df, test_df = train_test_split(train_df, test_size=0.5, random_state=42) + X_train = train_df.iloc[:, :-1] + y_train = train_df.iloc[:, -1] + X_test = test_df.iloc[:, :-1] + y_test = test_df.iloc[:, -1] + for col in X_train.columns: + if X_train[col].dtype == 'object': + try: + le = LabelEncoder() + X_train[col] = le.fit_transform(X_train[col]) + X_test[col] = le.transform(X_test[col]) + except Exception as e: + X_train = X_train.drop(columns=[col]) + X_test = X_test.drop(columns=[col]) + y_train = le.fit_transform(y_train) + y_test = le.transform(y_test) + trainX, trainy = X_train, y_train + trainX = np.asarray(trainX, dtype=np.float32) + trainy = np.asarray(trainy, dtype=np.int64) + + + testX, testy = X_test, y_test + testX = np.asarray(testX, dtype=np.float32) + testy = np.asarray(testy, dtype=np.int64) + return trainX, trainy, testX, testy +if __name__ == '__main__': + pass + diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/utils/inference_utils.py b/tabrepo/benchmark/models/ag/limix/LimiX/utils/inference_utils.py new file mode 100644 index 00000000..70cdcb80 --- /dev/null +++ 
b/tabrepo/benchmark/models/ag/limix/LimiX/utils/inference_utils.py @@ -0,0 +1,190 @@ +import argparse +import json +import logging +import os +from datetime import datetime + +import numpy as np +import torch +from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss +from torch.utils.data import DistributedSampler + + +def shuffle_data_along_dim(X: torch.Tensor | np.ndarray, dim: int = 0) -> torch.Tensor | np.ndarray: + """ + Shuffles data (torch.Tensor or np.ndarray) along a specified axis. + + Args: + X (torch.Tensor | np.ndarray): The input multidimensional tensor or array. + dim (int): The dimension along which to shuffle elements. + + Returns: + X_(torch.Tensor | np.ndarray): A new tensor or array with elements shuffled along the specified dimension. + """ + if isinstance(X, np.ndarray): + shuffled_indices = np.random.permutation(X.shape[dim]) + reshaped_indices = shuffled_indices.reshape( + tuple(1 if i != dim else -1 for i in range(X.ndim)) + ) + shuffled_array = np.take_along_axis(X, reshaped_indices, axis=dim) + return shuffled_array + elif isinstance(X, torch.Tensor): + dim_size = X.size(dim) + shuffled_indices = torch.randperm(dim_size, device=X.device) + index_shape = [1] * X.dim() + index_shape[dim] = dim_size + expanded_indices = shuffled_indices.view(index_shape) + broadcasted_indices = expanded_indices.expand_as(X) + shuffled_tensor = torch.gather(X, dim, broadcasted_indices) + return shuffled_tensor + else: + raise TypeError("Data must be a torch.Tensor or np.ndarray") + + +def auc_metric(target, pred, multi_class='ovo', numpy=False): + lib = np if numpy else torch + try: + if not numpy: + target = torch.tensor(target) if not torch.is_tensor(target) else target + pred = torch.tensor(pred) if not torch.is_tensor(pred) else pred + if len(lib.unique(target)) > 2: + if not numpy: + return torch.tensor(roc_auc_score(target, pred, multi_class=multi_class)) + return roc_auc_score(target, pred, multi_class=multi_class) + else: + if len(pred.shape) == 2: + pred = pred[:, 1] + if not numpy: + return torch.tensor(roc_auc_score(target, pred)) + return roc_auc_score(target, pred) + except ValueError as e: + print(e) + return np.nan if numpy else torch.tensor(np.nan) + + +def calculate_result(y_test_encoded, y_pred_proba): + y_pred_label = np.argmax(y_pred_proba, axis=1) + if len(np.unique(y_test_encoded)) == 2: + final_auc = roc_auc_score(y_test_encoded, y_pred_proba[:, 1]) + else: + final_auc = roc_auc_score(y_test_encoded, y_pred_proba, multi_class="ovo") + print(f"✅ AUC = {final_auc:.4f}") + + # --- Accuracy --- + acc = accuracy_score(y_test_encoded, y_pred_label) + print(f"✅ Accuracy = {acc:.4f}") + + # --- F1 Score --- + f1 = f1_score(y_test_encoded, y_pred_label, average='macro' if len(np.unique(y_test_encoded)) > 2 else 'binary') + print(f"✅ F1 Score = {f1:.4f}") + + # --- Cross Entropy / LogLoss --- + ce = log_loss(y_test_encoded, y_pred_proba) + print(f"✅ LogLoss (Cross Entropy) = {ce:.4f}") + + # --- ECE (Expected Calibration Error) --- + def compute_ece(y_true, y_prob, n_bins=10): + """Expected Calibration Error (ECE) implementation""" + bin_boundaries = np.linspace(0.0, 1.0, n_bins + 1) + ece = 0.0 + y_true = np.array(y_true) + y_prob = np.array(y_prob) + + if y_prob.ndim == 2 and y_prob.shape[1] > 1: + confidences = np.max(y_prob, axis=1) + predictions = np.argmax(y_prob, axis=1) + else: + confidences = y_prob if y_prob.ndim == 1 else y_prob[:, 1] + predictions = (confidences >= 0.5).astype(int) + + accuracies = (predictions == y_true) + + for i in 
range(n_bins):
+            bin_lower = bin_boundaries[i]
+            bin_upper = bin_boundaries[i + 1]
+            in_bin = (confidences > bin_lower) & (confidences <= bin_upper)
+            prop_in_bin = np.mean(in_bin)
+            if prop_in_bin > 0:
+                acc_in_bin = np.mean(accuracies[in_bin])
+                avg_conf_in_bin = np.mean(confidences[in_bin])
+                ece += np.abs(acc_in_bin - avg_conf_in_bin) * prop_in_bin
+        return ece
+
+    ece = compute_ece(y_test_encoded, y_pred_proba, n_bins=10)
+    print(f"✅ ECE (Expected Calibration Error, 10 bins) = {ece:.4f}")
+
+    return acc, final_auc, f1, ce, ece
+
+
+def init_args():
+    # ``init_args`` was referenced in the __main__ block below but never
+    # defined; this minimal parser is an assumed reconstruction.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--inference_config_path", type=str, required=True)
+    return parser.parse_args()
+
+
+def generate_inference_config(args):
+    retrieval_config = dict(
+        use_retrieval=False,
+        retrieval_before_preprocessing=False,
+        calculate_feature_attention=False,
+        calculate_sample_attention=False,
+        subsample_ratio=1,
+        subsample_type=None,
+        use_type=None,
+    )
+
+    config_list = [
+        dict(RebalanceFeatureDistribution=dict(worker_tags=["quantile"], discrete_flag=False, original_flag=True,
+                                               svd_tag="svd"),
+             CategoricalFeatureEncoder=dict(encoding_strategy="ordinal_strict_feature_shuffled"),
+             FeatureShuffler=dict(mode="shuffle"),
+             retrieval_config=retrieval_config,
+             ),
+        dict(RebalanceFeatureDistribution=dict(worker_tags=["quantile"], discrete_flag=False, original_flag=True,
+                                               svd_tag="svd"),
+             CategoricalFeatureEncoder=dict(encoding_strategy="ordinal_strict_feature_shuffled"),
+             FeatureShuffler=dict(mode="shuffle"),
+             retrieval_config=retrieval_config,
+             ),
+        dict(RebalanceFeatureDistribution=dict(worker_tags=[None], discrete_flag=True, original_flag=False,
+                                               svd_tag=None),
+             CategoricalFeatureEncoder=dict(encoding_strategy="numeric"),
+             FeatureShuffler=dict(mode="shuffle"),
+             retrieval_config=retrieval_config,
+             ),
+        dict(RebalanceFeatureDistribution=dict(worker_tags=[None], discrete_flag=True, original_flag=False,
+                                               svd_tag=None),
+             CategoricalFeatureEncoder=dict(encoding_strategy="numeric"),
+             FeatureShuffler=dict(mode="shuffle"),
+             retrieval_config=retrieval_config)
+    ]
+
+    with open(args.inference_config_path, 'w') as f:
+        json.dump(config_list, f)
+
+
+class NonPaddingDistributedSampler(DistributedSampler):
+    def __init__(self, dataset, num_replicas=None, rank=None, shuffle=False):
+        super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
+        self.num_samples = len(range(rank, len(dataset), num_replicas))
+        self.total_size = len(dataset)
+
+    def __iter__(self):
+        indices = list(range(len(self.dataset)))
+        indices = indices[self.rank:self.total_size:self.num_replicas]
+        return iter(indices)
+
+def swap_rows_back(tensor, indices):
+    """
+    Restore the original row order of ``tensor`` after it was permuted by ``indices``.
+
+    Args:
+        tensor (torch.Tensor): The permuted tensor.
+        indices (list|torch.Tensor): The permutation that produced ``tensor``.
+
+    Returns:
+        torch.Tensor: The tensor with its rows back in their original order.
+    """
+    inverse_indices = [0] * len(indices)
+    for i, idx in enumerate(indices):
+        inverse_indices[idx] = i
+    return tensor[inverse_indices]
+
+if __name__ == "__main__":
+    args = init_args()
+    generate_inference_config(args)
diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/utils/loading.py b/tabrepo/benchmark/models/ag/limix/LimiX/utils/loading.py
new file mode 100644
index 00000000..3ea3ec58
--- /dev/null
+++ b/tabrepo/benchmark/models/ag/limix/LimiX/utils/loading.py
@@ -0,0 +1,31 @@
+import torch
+
+from tabrepo.benchmark.models.ag.limix.LimiX.model.transformer import FeaturesTransformer
+
+
+def load_model(model_path, calculate_sample_attention: bool = False, calculate_feature_attention: bool = False, mask_prediction: bool = False):
+    state_dict = torch.load(model_path, map_location="cpu", weights_only=False)
+    config = state_dict['config']
+    model = FeaturesTransformer(
+        preprocess_config_x=config['preprocess_config_x'],
+        encoder_config_x=config['encoder_config_x'],
+        encoder_config_y=config['encoder_config_y'],
+        decoder_config=config['decoder_config'],
+        nlayers=config['nlayers'],
+        nhead=config['nhead'],
+        embed_dim=config['embed_dim'],
+        hid_dim=config['hid_dim'],
+        mask_prediction=mask_prediction,
+        features_per_group=config['features_per_group'],
+        dropout=config['dropout'],
+        layer_norm_eps=config.get('layer_norm_eps', 1e-5),
+        device=None,
+        dtype=None,
+        recompute_attn=config['recompute_attn'],
+        calculate_sample_attention=calculate_sample_attention,
+        calculate_feature_attention=calculate_feature_attention
+    )
+    model.load_state_dict(state_dict['state_dict'])
+
+    model.eval()
+    return model
\ No newline at end of file
diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/utils/retrieval_utils.py b/tabrepo/benchmark/models/ag/limix/LimiX/utils/retrieval_utils.py
new file mode 100644
index 00000000..cceb97c3
--- /dev/null
+++ b/tabrepo/benchmark/models/ag/limix/LimiX/utils/retrieval_utils.py
@@ -0,0 +1,35 @@
+import numpy as np
+import torch
+from sklearn.preprocessing import LabelEncoder
+
+
+class RelabelRetrievalY:
+    def __init__(self, y_train: torch.Tensor):
+        self.y_train = y_train.cpu().numpy()
+        self.label_encoders = [LabelEncoder() for _ in range(y_train.shape[0])]
+
+    def transform_y(self):
+        for i in range(self.y_train.shape[0]):
+            self.y_train[i] = np.expand_dims(self.label_encoders[i].fit_transform(self.y_train[i].ravel()), axis=1)
+        self.label_y = self.y_train.copy().astype(np.int32)
+        self.y_train = torch.tensor(self.y_train, dtype=torch.float32, device=torch.device('cuda'))
+        return self.y_train
+
+    def inverse_transform_y(self, X: np.ndarray) -> np.ndarray:
+        for i in range(X.shape[0]):
+            batch_label = np.unique(self.label_y[i])
+            reverse_perm = self.label_encoders[i].inverse_transform(batch_label).astype(np.int32)
+            reverse_output = np.full_like(X[i], fill_value=-np.inf)
+            reverse_output[reverse_perm] = X[i, batch_label]
+            X[i] = reverse_output
+        return X
+
+
+if __name__ == '__main__':
+    y_train = torch.tensor([[[7], [7], [8], [5]], [[4], [3], [3], [6]]])
+    output = np.array([[0.2, 2, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
+                       [0.2, 2, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]], dtype=np.float32)
+
+    relabel = RelabelRetrievalY(y_train)
+    # ``transform_y`` returns a single tensor, not a (y, label) pair.
+    y_train = relabel.transform_y()
+    output = relabel.inverse_transform_y(output)
diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/utils/utils.py b/tabrepo/benchmark/models/ag/limix/LimiX/utils/utils.py
new file mode 100644
index 00000000..8698e911
--- /dev/null
+++ b/tabrepo/benchmark/models/ag/limix/LimiX/utils/utils.py
@@ -0,0 +1,30 @@
+import os
+from huggingface_hub import snapshot_download, hf_hub_download
+
+def download_datset(repo_id: str, revision: str, repo_type: str = 'dataset', save_dir: str = "./my_cache"):
+    print(f"Downloading {repo_id} ...")
+    snapshot_download(
+        repo_id=repo_id,
+        revision=revision,
+        repo_type=repo_type,
+        local_dir=save_dir,
+        ignore_patterns=None,
+        force_download=False
+    )
+
+def list_folders_to_csv(path: str, output_csv: str):
+    import csv
+    folders = [f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))]
+    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(['dataset name'])
+        for folder in folders:
+            writer.writerow([folder])
+
+def download_model(repo_id: str, filename: str, save_path: str = '.') -> str:
+    file_path = hf_hub_download(
+        repo_id=repo_id,
+        filename=filename,
+        local_dir=save_path
+    )
+    return file_path
\ No newline at end of file
diff --git a/tabrepo/benchmark/models/ag/limix/__init__.py b/tabrepo/benchmark/models/ag/limix/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tabrepo/benchmark/models/ag/limix/limix_model.py b/tabrepo/benchmark/models/ag/limix/limix_model.py
new file mode 100644
index 00000000..92dc7aaa
--- /dev/null
+++ b/tabrepo/benchmark/models/ag/limix/limix_model.py
@@ -0,0 +1,235 @@
+from __future__ import annotations
+
+import logging
+import os
+import sys
+import warnings
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from autogluon.common.utils.resource_utils import ResourceManager
+from autogluon.core.models import AbstractModel
+from autogluon.features.generators import LabelEncoderFeatureGenerator
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+
+class LimiXModel(AbstractModel):
+    """Ref: https://github.com/limix-ldm/LimiX."""
+
+    ag_key = "LIMIX"
+    ag_name = "LimiX"
+    _DEFAULT_CHECKPOINT_PATH = "LimiX-16M.ckpt"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._feature_generator = None
+        self._cat_features = None
+        self._cat_indices = None
+
+    def _preprocess(self, X: pd.DataFrame, is_train=False, **kwargs) -> pd.DataFrame:
+        X = super()._preprocess(X, **kwargs)
+
+        if is_train:
+            self._cat_indices = []
+
+            # X will be the training data.
+            self._feature_generator = LabelEncoderFeatureGenerator(verbosity=0)
+            self._feature_generator.fit(X=X)
+
+        # This converts categorical features to numeric via stateful label encoding.
+        if self._feature_generator.features_in:
+            X = X.copy()
+            X[self._feature_generator.features_in] = self._feature_generator.transform(
+                X=X
+            )
+
+        if is_train:
+            # Detect/set cat features and indices
+            if self._cat_features is None:
+                self._cat_features = self._feature_generator.features_in[:]
+            self._cat_indices = [
+                X.columns.get_loc(col) for col in self._cat_features
+            ]
+
+        return X
+
+    def _fit(
+        self,
+        X: pd.DataFrame,
+        y: pd.Series,
+        num_cpus: int = 1,
+        num_gpus: int = 0,
+        verbosity: int = 2,
+        **kwargs,
+    ):
+        import torch
+
+        from tabrepo.benchmark.models.ag.limix.LimiX.inference.predictor import (
+            LimiXPredictor,
+        )
+
+        is_classification = self.problem_type in ["binary", "multiclass"]
+        device = "cuda" if num_gpus != 0 else "cpu"
+        if (device == "cuda") and (not torch.cuda.is_available()):
+            # FIXME: warn instead and switch to CPU.
+            raise AssertionError(
+                "Fit specified to use GPU, but CUDA is not available on this machine. 
" + "Please switch to CPU usage instead.", + ) + + X = self.preprocess(X, is_train=True) + + cls_config_default_config = ( + Path(__file__).parent / "LimiX" / "config" / "cls_default_noretrieval.json" + ) + reg_config_default_config = ( + Path(__file__).parent / "LimiX" / "config" / "reg_default_noretrieval.json" + ) + inference_config = ( + cls_config_default_config + if is_classification + else reg_config_default_config + ) + + hps = self._get_model_params() + hps["device"] = device + + self.model = LimiXPredictor( + X_train=X, + y_train=y, + seed=self.random_seed, + model_path=self.download_model(), + categorical_features_indices=self._cat_indices, + inference_config=str(inference_config.resolve()), + task_type="Classification" if is_classification else "Regression", + **hps, + ) + + def _get_default_resources(self) -> tuple[int, int]: + # Use only physical cores for better performance based on benchmarks + num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True) + + num_gpus = min(1, ResourceManager.get_gpu_count_torch(cuda_only=True)) + + return num_cpus, num_gpus + + def get_minimum_resources( + self, is_gpu_available: bool = False + ) -> dict[str, int | float]: + return { + "num_cpus": 1, + "num_gpus": 1 if is_gpu_available else 0, + } + + @staticmethod + def download_model(): + from huggingface_hub import hf_hub_download + + model_dir = _user_cache_dir(platform=sys.platform, appname="limix") + model_dir.mkdir(exist_ok=True, parents=True) + + final_model_path = model_dir / LimiXModel._DEFAULT_CHECKPOINT_PATH + + if not final_model_path.exists(): + model_file = hf_hub_download( + repo_id="stableai-org/LimiX-16M", + filename=LimiXModel._DEFAULT_CHECKPOINT_PATH, + local_dir=str(model_dir), + ) + assert str(final_model_path) == model_file + return str(final_model_path) + + def _set_default_params(self): + default_params = {} + for param, val in default_params.items(): + self._set_default_param_value(param, val) + + def _get_random_seed_from_hyperparameters( + self, hyperparameters: dict + ) -> int | None | str: + return hyperparameters.get("seed", "N/A") + + @classmethod + def supported_problem_types(cls) -> list[str] | None: + return ["binary", "multiclass", "regression"] + + def _get_default_auxiliary_params(self) -> dict: + default_auxiliary_params = super()._get_default_auxiliary_params() + default_auxiliary_params.update( + { + "max_classes": 10, + } + ) + return default_auxiliary_params + + @classmethod + def _get_default_ag_args_ensemble(cls, **kwargs) -> dict: + """Set fold_fitting_strategy to sequential_local, + as parallel folding crashes if model weights aren't pre-downloaded. 
+ """ + default_ag_args_ensemble = super()._get_default_ag_args_ensemble(**kwargs) + extra_ag_args_ensemble = { + "fold_fitting_strategy": "sequential_local", + "refit_folds": True, + } + default_ag_args_ensemble.update(extra_ag_args_ensemble) + return default_ag_args_ensemble + + @classmethod + def _class_tags(cls): + return {"can_estimate_memory_usage_static": False} + + def _more_tags(self) -> dict: + return {"can_refit_full": True} + + +def _user_cache_dir(platform: str, appname: str = "tabpfn") -> Path: + use_instead_path = (Path.cwd() / ".tabpfn_models").resolve() + + # https://docs.python.org/3/library/sys.html#sys.platform + if platform == "win32": + # Honestly, I don't want to do what `platformdirs` does: + # https://github.com/tox-dev/platformdirs/blob/b769439b2a3b70769a93905944a71b3e63ef4823/src/platformdirs/windows.py#L252-L265 + APPDATA_PATH = os.environ.get("APPDATA", "") + if APPDATA_PATH.strip() != "": + return Path(APPDATA_PATH) / appname + + warnings.warn( + "Could not find APPDATA environment variable to get user cache dir," + " but detected platform 'win32'." + f" Defaulting to a path '{use_instead_path}'." + " If you would prefer, please specify a directory when creating" + " the model.", + UserWarning, + stacklevel=2, + ) + return use_instead_path + + if platform == "darwin": + return Path.home() / "Library" / "Caches" / appname + + # TODO: Not entirely sure here, Python doesn't explicitly list + # all of these and defaults to the underlying operating system + # if not sure. + linux_likes = ("freebsd", "linux", "netbsd", "openbsd") + if any(platform.startswith(linux) for linux in linux_likes): + # The reason to use "" as default is that the env var could exist but be empty. + # We catch all this with the `.strip() != ""` below + XDG_CACHE_HOME = os.environ.get("XDG_CACHE_HOME", "") + if XDG_CACHE_HOME.strip() != "": + return Path(XDG_CACHE_HOME) / appname + return Path.home() / ".cache" / appname + + warnings.warn( + f"Unknown platform '{platform}' to get user cache dir." + f" Defaulting to a path at the execution site '{use_instead_path}'." 
+ " If you would prefer, please specify a directory when creating" + " the model.", + UserWarning, + stacklevel=2, + ) + return use_instead_path diff --git a/tabrepo/benchmark/models/model_register.py b/tabrepo/benchmark/models/model_register.py index 9066e788..3a4eeca4 100644 --- a/tabrepo/benchmark/models/model_register.py +++ b/tabrepo/benchmark/models/model_register.py @@ -6,6 +6,7 @@ from tabrepo.benchmark.models.ag import ( ExplainableBoostingMachineModel, + LimiXModel, ModernNCAModel, RealMLPModel, TabDPTModel, @@ -26,6 +27,7 @@ TabDPTModel, TabMModel, ModernNCAModel, + LimiXModel, ] for _model_cls in _models_to_add: @@ -43,7 +45,10 @@ def infer_model_cls(model_cls: str, model_register: ModelRegistry = None): if real_model_cls.ag_name == model_cls: model_cls = real_model_cls break - elif model_cls in [str(real_model_cls.__name__) for real_model_cls in model_register.model_cls_list]: + elif model_cls in [ + str(real_model_cls.__name__) + for real_model_cls in model_register.model_cls_list + ]: for real_model_cls in model_register.model_cls_list: if model_cls == str(real_model_cls.__name__): model_cls = real_model_cls diff --git a/tabrepo/models/limix/__init__.py b/tabrepo/models/limix/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tabrepo/models/limix/generate.py b/tabrepo/models/limix/generate.py new file mode 100644 index 00000000..0a2079be --- /dev/null +++ b/tabrepo/models/limix/generate.py @@ -0,0 +1,6 @@ +from __future__ import annotations + +from tabrepo.benchmark.models.ag.limix.limix_model import LimiXModel +from tabrepo.utils.config_utils import ConfigGenerator + +gen_limix = ConfigGenerator(model_cls=LimiXModel, manual_configs=[{}], search_space={}) diff --git a/tabrepo/models/utils.py b/tabrepo/models/utils.py index 32e17039..7c5c2a0a 100644 --- a/tabrepo/models/utils.py +++ b/tabrepo/models/utils.py @@ -47,6 +47,7 @@ def get_configs_generator_from_name(model_name: str): "TabPFNv2": lambda: importlib.import_module("tabrepo.models.tabpfnv2.generate").gen_tabpfnv2, "XGBoost": lambda: importlib.import_module("tabrepo.models.xgboost.generate").gen_xgboost, "Mitra": lambda: importlib.import_module("tabrepo.models.mitra.generate").gen_mitra, + "LimiX": lambda: importlib.import_module("tabrepo.models.limix.generate").gen_limix, } if model_name not in name_to_import_map: diff --git a/tst/benchmark/models/test_limix.py b/tst/benchmark/models/test_limix.py new file mode 100644 index 00000000..b96726ab --- /dev/null +++ b/tst/benchmark/models/test_limix.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +import pytest + + +def test_limix(): + try: + from autogluon.tabular.testing import FitHelper + from tabrepo.benchmark.models.ag.limix.limix_model import LimiXModel + + FitHelper.verify_model(model_cls=LimiXModel, model_hyperparameters={}) + except ImportError as err: + pytest.skip( + f"Import Error, skipping test... " + f"Ensure you have the proper dependencies installed to run this test:\n" + f"{err}" + )