diff --git a/tabrepo/benchmark/models/ag/__init__.py b/tabrepo/benchmark/models/ag/__init__.py index 4cfded7c..5f914646 100644 --- a/tabrepo/benchmark/models/ag/__init__.py +++ b/tabrepo/benchmark/models/ag/__init__.py @@ -1,16 +1,20 @@ from __future__ import annotations from tabrepo.benchmark.models.ag.ebm.ebm_model import ExplainableBoostingMachineModel +from tabrepo.benchmark.models.ag.limix.limix_model import LimiXModel from tabrepo.benchmark.models.ag.modernnca.modernnca_model import ModernNCAModel from tabrepo.benchmark.models.ag.realmlp.realmlp_model import RealMLPModel from tabrepo.benchmark.models.ag.tabdpt.tabdpt_model import TabDPTModel from tabrepo.benchmark.models.ag.tabicl.tabicl_model import TabICLModel from tabrepo.benchmark.models.ag.tabm.tabm_model import TabMModel -from tabrepo.benchmark.models.ag.tabpfnv2.tabpfnv2_client_model import TabPFNV2ClientModel +from tabrepo.benchmark.models.ag.tabpfnv2.tabpfnv2_client_model import ( + TabPFNV2ClientModel, +) from tabrepo.benchmark.models.ag.tabpfnv2.tabpfnv2_model import TabPFNV2Model __all__ = [ "ExplainableBoostingMachineModel", + "LimiXModel", "ModernNCAModel", "RealMLPModel", "TabDPTModel", diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/LICENSE.txt b/tabrepo/benchmark/models/ag/limix/LimiX/LICENSE.txt new file mode 100644 index 00000000..ac4aee55 --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright Zhengxiao Du + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/config/cls_default_noretrieval.json b/tabrepo/benchmark/models/ag/limix/LimiX/config/cls_default_noretrieval.json new file mode 100644 index 00000000..a51a30de --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/config/cls_default_noretrieval.json @@ -0,0 +1,102 @@ +[ + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_10" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_10" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "numeric" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "numeric" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + } +] \ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/config/cls_default_retrieval.json b/tabrepo/benchmark/models/ag/limix/LimiX/config/cls_default_retrieval.json new file mode 100644 index 00000000..1290c2ca --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/config/cls_default_retrieval.json @@ -0,0 +1,102 @@ +[ + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "logNormal" + ], + "discrete_flag": false, + 
"original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "numeric" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "numeric" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + } +] \ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_noretrieval.json b/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_noretrieval.json new file mode 100644 index 00000000..79480c0e --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_noretrieval.json @@ -0,0 +1,201 @@ +[ + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + },{ + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + 
"use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "power" + ], + "discrete_flag": false, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "power" + ], + "discrete_flag": false, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "power" + ], + "discrete_flag": false, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "power" + ], + "discrete_flag": false, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + } +] \ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_noretrieval_MVI.json b/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_noretrieval_MVI.json new file mode 100644 index 00000000..74bc3c7e --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_noretrieval_MVI.json @@ -0,0 +1,201 @@ +[ + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": 
false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + },{ + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile_uniform_all_data" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + 
"encoding_strategy": "onehot" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": false, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": false, + "calculate_sample_attention": false, + "subsample_ratio": 0.7, + "subsample_type": "sample", + "use_type": "mixed" + } + } +] \ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_retrieval.json b/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_retrieval.json new file mode 100644 index 00000000..1290c2ca --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/config/reg_default_retrieval.json @@ -0,0 +1,102 @@ +[ + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "quantile" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + "logNormal" + ], + "discrete_flag": false, + "original_flag": true, + "svd_tag": "svd" + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "ordinal_strict_feature_shuffled" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "numeric" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + }, + { + "RebalanceFeatureDistribution": { + "worker_tags": [ + null + ], + "discrete_flag": true, + "original_flag": false, + "svd_tag": null + }, + "CategoricalFeatureEncoder": { + "encoding_strategy": "numeric" + }, + "FeatureShuffler": { + "mode": "shuffle" + }, + "retrieval_config": { + "use_retrieval": true, + "retrieval_before_preprocessing": false, + "calculate_feature_attention": true, + "calculate_sample_attention": true, + "subsample_ratio": "dynamic", + "subsample_type": "sample", + "use_type": "mixed" + } + } +] \ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/inference/inference_method.py b/tabrepo/benchmark/models/ag/limix/LimiX/inference/inference_method.py new file mode 100644 index 00000000..484178ac --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/inference/inference_method.py @@ -0,0 +1,274 @@ +import gc +from typing import Literal, Tuple + +import numpy as np +import torch +from torch import nn +from torch.utils.data import DataLoader, DistributedSampler + +from tabrepo.benchmark.models.ag.limix.LimiX.utils.data_utils import TabularInferenceDataset +from tabrepo.benchmark.models.ag.limix.LimiX.utils.inference_utils import 
NonPaddingDistributedSampler, swap_rows_back +from tabrepo.benchmark.models.ag.limix.LimiX.utils.loading import load_model + +from tabrepo.benchmark.models.ag.limix.LimiX.utils.retrieval_utils import RelabelRetrievalY +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP +import os, socket, contextlib + +def _pick_free_port(): + with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.bind(("", 0)) + return s.getsockname()[1] + +def setup(): + if dist.is_initialized(): + return dist.get_rank(), dist.get_world_size() + + # Support for single GPU usage in a normal python script + os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0") + + os.environ.setdefault("MASTER_ADDR", "127.0.0.1") + os.environ.setdefault("MASTER_PORT", str(_pick_free_port())) + + os.environ["RANK"] = "0" + os.environ["WORLD_SIZE"] = "1" + os.environ["LOCAL_RANK"] = "0" + + dist.init_process_group(backend="nccl", init_method="env://", rank=0, world_size=1) + + rank = dist.get_rank() + world_size = dist.get_world_size() + torch.cuda.set_device(rank) + return rank, world_size + + +def cleanup(): + if not dist.is_initialized(): + print("Distributed environment is not initialized, nothing to clean up.") + return + + print("Cleaning up distributed environment...") + dist.destroy_process_group() + print("Distributed environment cleaned up.") + +class InferenceResultWithRetrieval: + def __init__(self, + model: torch.nn.Module, + sample_selection_type: Literal["AM", "DDP"] = "AM", + ): + self.model=model + self.sample_selection_type = sample_selection_type + self.dataset = None + + def _prepare_data(self, + X_train: torch.Tensor, + y_train: torch.Tensor, + X_test: torch.Tensor, + attention_score: np.ndarray = None, + retrieval_len: int = 2000 + ) -> TabularInferenceDataset: + if self.sample_selection_type == "AM": + use_retrieval = True + else: + use_retrieval = False + dataset = TabularInferenceDataset( + X_train=X_train, + y_train=y_train, + X_test=X_test, + attention_score=attention_score, + retrieval_len=retrieval_len, + use_retrieval=use_retrieval + ) + return dataset + + def inference(self, + X_train: torch.Tensor = None, + y_train: torch.Tensor = None, + X_test: torch.Tensor = None, + dataset: TabularInferenceDataset = None, + attention_score: np.ndarray | torch.Tensor = None, + retrieval_len: int = 2000, + dynamic_ratio:float=None, + task_type: Literal["reg", "cls"] = "reg"): + self.rank,self.world_size = setup() + model = self.model.cuda() # self.rank + model = DDP(model, device_ids=[self.rank],find_unused_parameters=False) + if isinstance(retrieval_len,str): + if retrieval_len == "dynamic": + if dynamic_ratio is not None: + retrieval_len =int(dynamic_ratio*X_train.shape[0]/len(torch.unique(y_train))) + else: + retrieval_len = int(X_train.shape[0]/len(torch.unique(y_train))) + if isinstance(retrieval_len, float): + self.retrieval_len = int(retrieval_len * X_train.shape[0]) + else: + self.retrieval_len = retrieval_len + if dataset is None: + dataset = self._prepare_data(X_train, y_train, X_test, attention_score, self.retrieval_len) + sampler = NonPaddingDistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, shuffle=False) + outputs = [] + dataloader = DataLoader(dataset, + batch_size=16, + shuffle=False, + drop_last=False, + sampler=sampler + ) + indice = [] + for data in dataloader: + with ( + torch.autocast(torch.device(model.device).type, enabled=True), + torch.inference_mode(), + ): + if self.sample_selection_type == "DDP": + 
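+                    # "DDP" mode: skip the retrieval subset and pair every test row with
+                    # the full training set, replicated across the batch dimension below.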
indice.append(data["idx"]) + X_test = data["X_test"].unsqueeze(1) + X_train_item = torch.cat([X_train.unsqueeze(0) for _ in range(X_test.shape[0])], dim=0) + Y_train_item = torch.cat([y_train.unsqueeze(0).unsqueeze(-1) for _ in range(X_test.shape[0])], dim=0) + x_ = torch.cat([X_train_item, X_test], dim=1) + output = model(x=x_, y=Y_train_item.squeeze(-1), eval_pos=Y_train_item.shape[1], task_type=task_type) + else: + indice.append(data["idx"]) + X_train = data["X_train"] + X_test = data["X_test"].unsqueeze(1) + y_ = data["y_train"] + + x_ = torch.cat([X_train, X_test], dim=1) + if task_type == "cls": + relabel = RelabelRetrievalY(y_) + y_ = relabel.transform_y() + + output=model(x=x_, y=y_.squeeze(-1), eval_pos=y_.shape[1], task_type=task_type) + if len(output.shape) == 3: + output = output.view(-1, output.shape[-1]) + if task_type == "cls": + output = output.cpu().numpy() + output = relabel.inverse_transform_y(output) + output = torch.tensor(output, dtype=torch.float32, device=model.device) + + outputs.append(output.cpu()) + del output + gc.collect() + torch.cuda.empty_cache() + del model + outputs = torch.cat(outputs, dim=0) + local_result_cpu = outputs.cpu() + indice = torch.cat(indice, dim=0) + local_indice_cpu = indice.cpu() + outputs = [None for _ in range(self.world_size)] + gathered_indice = [None for _ in range(self.world_size)] + dist.all_gather_object(gathered_indice, local_indice_cpu) + dist.all_gather_object(outputs, local_result_cpu) + del local_result_cpu + outputs = torch.cat(outputs, dim=0).to(torch.float32) + gathered_indice = torch.cat(gathered_indice, dim=0) + outputs = swap_rows_back(outputs, gathered_indice) + gc.collect() + torch.cuda.empty_cache() + return outputs.squeeze(0) + + +class InferenceAttentionMap: + def __init__(self, + model_path: str, + calculate_feature_attention: bool = False, + calculate_sample_attention: bool = False, + ): + self.calculate_feature_attention = calculate_feature_attention + self.calculate_sample_attention = calculate_sample_attention + self.model = load_model(model_path, calculate_feature_attention=calculate_feature_attention, + calculate_sample_attention=calculate_sample_attention) + + self.dataset = None + + def _prepare_data(self, + X_train: torch.Tensor, + y_train: torch.Tensor, + X_test: torch.Tensor, + ) -> TabularInferenceDataset: + dataset = TabularInferenceDataset( + X_train=X_train, + y_train=y_train, + X_test=X_test, + use_retrieval=False + ) + return dataset + + def inference(self, + X_train: torch.Tensor | np.ndarray, + y_train: torch.Tensor | np.ndarray, + X_test: torch.Tensor | np.ndarray, + task_type: Literal["reg", "cls"] = "reg") -> tuple[torch.Tensor | None, torch.Tensor | None]: + self.rank, self.world_size = setup() + # device = torch.device(f"cuda:{self.rank}") + model = self.model.cuda() + model = DDP(model, device_ids=[0]) + model.eval() + if isinstance(X_train, np.ndarray): + X_train = torch.from_numpy(X_train).float() + if isinstance(y_train, np.ndarray): + y_train = torch.from_numpy(y_train).float() + if isinstance(X_test, np.ndarray): + X_test = torch.from_numpy(X_test).float() + dataset = self._prepare_data(X_train, y_train, X_test) + + sampler = NonPaddingDistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, shuffle=False) + dataloader = DataLoader(dataset, + batch_size=16, + shuffle=False, + drop_last=False, + sampler=sampler + ) + local_feature_attention = [] + local_sample_attention = [] + feature_attention=None + sample_attention=None + indice=[] + for batch_idx, data in 
enumerate(dataloader):
+                X_test = data["X_test"]
+                idx = data["idx"]
+                indice.append(idx)
+                x_ = torch.cat([X_train, X_test], dim=0).unsqueeze(dim=0)
+
+                y_ = y_train.unsqueeze(0)
+                with torch.autocast(device_type='cuda', enabled=True), torch.inference_mode():
+                    output, feature_attention, sample_attention = model(x=x_, y=y_, eval_pos=y_.shape[1], task_type=task_type)
+
+                if self.calculate_sample_attention:
+                    local_sample_attention.append(sample_attention.permute(1, 0, 2))
+                if self.calculate_feature_attention:
+                    local_feature_attention.append(feature_attention[y_.shape[1]:, :, :])
+                del output, sample_attention, feature_attention, X_test
+                gc.collect()
+                torch.cuda.empty_cache()
+        indice = torch.cat(indice, dim=0)
+        # The per-batch attention tensors were freed inside the loop; re-initialize them
+        # so the return below is well-defined when one of the flags is disabled.
+        feature_attention = None
+        sample_attention = None
+        if self.calculate_feature_attention:
+            # shape: [len_Dtest, feature_num//feature_per_group, feature_num//feature_per_group]
+            feature_attentions = torch.cat(local_feature_attention, dim=0)
+            local_result_cpu = feature_attentions.cpu()
+            local_indice_cpu = indice.cpu()
+            gathered_feature = [None for _ in range(self.world_size)]
+            gathered_indice = [None for _ in range(self.world_size)]
+            dist.all_gather_object(gathered_feature, local_result_cpu)
+            dist.all_gather_object(gathered_indice, local_indice_cpu)
+            feature_attention = torch.cat(gathered_feature, dim=0)
+            gathered_indice = torch.cat(gathered_indice, dim=0)
+            feature_attention = swap_rows_back(feature_attention, gathered_indice)
+            del gathered_feature
+        if self.calculate_sample_attention:
+            sample_attentions = torch.cat(local_sample_attention, dim=0)
+            local_indice_cpu = indice.cpu()
+            local_result_cpu = sample_attentions.cpu()
+            gathered_sample = [None for _ in range(self.world_size)]
+            gathered_indice = [None for _ in range(self.world_size)]
+            dist.all_gather_object(gathered_sample, local_result_cpu)
+            dist.all_gather_object(gathered_indice, local_indice_cpu)
+            sample_attention = torch.cat(gathered_sample, dim=0)
+            gathered_indice = torch.cat(gathered_indice, dim=0)
+            sample_attention = swap_rows_back(sample_attention, gathered_indice)
+            del gathered_sample
+
+        dist.barrier()
+        del model
+        gc.collect()
+        torch.cuda.empty_cache()
+        if sample_attention is not None:
+            sample_attention = sample_attention.permute(1, 0, 2)
+        return feature_attention, sample_attention
diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/inference/predictor.py b/tabrepo/benchmark/models/ag/limix/LimiX/inference/predictor.py
new file mode 100644
index 00000000..d9ff540e
--- /dev/null
+++ b/tabrepo/benchmark/models/ag/limix/LimiX/inference/predictor.py
@@ -0,0 +1,596 @@
+from tabrepo.benchmark.models.ag.limix.LimiX.inference.inference_method import InferenceAttentionMap, InferenceResultWithRetrieval
+from tabrepo.benchmark.models.ag.limix.LimiX.inference.preprocess import (
+    FeatureShuffler,
+    FilterValidFeatures,
+    CategoricalFeatureEncoder,
+    RebalanceFeatureDistribution,
+    SubSampleData)
+from tabrepo.benchmark.models.ag.limix.LimiX.utils.loading import load_model
+import torch
+from typing import List, Literal
+import random
+from sklearn.utils.validation import check_X_y, check_array
+from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
+from sklearn.compose import ColumnTransformer, make_column_selector
+from sklearn.preprocessing import FunctionTransformer
+import numpy as np
+from itertools import chain, repeat
+import pandas as pd
+import einops
+import json
+import os
+
+
+NA_PLACEHOLDER = "__MISSING__"
+
+class LimiXPredictor:
+    """LimiX model predictor, supporting tasks such as classification, regression, and missing value prediction."""
+    def __init__(self,
+                 X_train,
+                 y_train,
+                 device:torch.device,
+                 model_path:str,
+                 inference_config: list|str,
+                 mix_precision:bool=True,
+                 outlier_remove_std: float=12,
+                 softmax_temperature:float=0.9,
+                 task_type: Literal['Classification', 'Regression']='Classification',
+                 mask_prediction:bool=False,
+                 categorical_features_indices:List[int]|None=None,
+                 inference_with_DDP: bool = False,
+                 seed:int=0):
+        """
+        Initialize a LimiXPredictor.
+
+        Args:
+            X_train: Training features, kept as the in-context examples for every prediction
+            y_train: Training labels
+            device: The device for performing inference; GPU is recommended
+            model_path: Path to the LimiX model checkpoint
+            mix_precision: Whether to use mixed precision inference
+            outlier_remove_std: Standard deviation threshold used for removing outliers
+            softmax_temperature: Softmax temperature coefficient
+            task_type: Task type, either 'Classification' or 'Regression'
+            mask_prediction: Whether to enable missing value prediction
+            categorical_features_indices: Index numbers of categorical features, currently not in use
+            inference_config: Inference configuration, either a list of pipeline configs or a path to a JSON config file
+            inference_with_DDP: Whether to run inference with DistributedDataParallel
+            seed: Random seed
+        """
+        self.X_train = X_train
+        self.y_train = y_train
+        if isinstance(inference_config, str):
+            if os.path.isfile(inference_config):
+                with open(inference_config, 'r') as f:
+                    inference_config = json.load(f)
+            else:
+                raise ValueError(f"inference_config is not a config file path: {inference_config}")
+        self.model_path = model_path
+        self.device = device
+        self.mix_precision = mix_precision
+        self.categorical_features_indices = categorical_features_indices
+        self.seed = seed
+        self.inference_config = inference_config
+        n_estimators = len(inference_config)
+        assert n_estimators > 0, "Invalid configuration file: the number of pipelines is 0!"
+        self.n_estimators = n_estimators
+        self.model = None
+        self.outlier_remove_std = outlier_remove_std
+        self.class_shuffle_factor = 3
+        self.min_seq_len_for_category_infer = 100
+        self.max_unique_num_for_category_infer = 30
+        self.min_unique_num_for_numerical_infer = 4
+        self.preprocess_num = 4
+        self.softmax_temperature = softmax_temperature
+        self.task_type = task_type
+        self.mask_prediction = mask_prediction
+        self.inference_with_DDP = inference_with_DDP
+        self.model = load_model(model_path=model_path, mask_prediction=mask_prediction)
+
+        self.preprocess_pipelines = []
+        self.preprocess_configs = []
+
+        random.seed(seed)
+        rand_gen = np.random.default_rng(seed)
+        self.seeds = [random.randint(0, 10000) for _ in range(n_estimators*self.preprocess_num)]
+        start_idx = rand_gen.integers(0, 1000)
+        all_shifts = list(range(start_idx, start_idx + n_estimators))
+        self.all_shifts = rand_gen.choice(all_shifts, size=n_estimators, replace=False)
+
+        if self.mask_prediction:
+            for inference_config_item in inference_config:
+                if len(inference_config_item['RebalanceFeatureDistribution']['worker_tags']) > 0:
+                    for i, v in enumerate(inference_config_item['RebalanceFeatureDistribution']['worker_tags']):
+                        if v == 'power':
+                            print("WARNING: Missing value imputation does not currently support the preprocessing method of power! 
Using the default worker_tags method") + inference_config_item['RebalanceFeatureDistribution']['worker_tags'].pop(i) + inference_config_item['RebalanceFeatureDistribution']['worker_tags'].append(None) + inference_config_item['RebalanceFeatureDistribution']['discrete_flag'] = True + + for idx in range(n_estimators): + pipeline = [] + inference_config_item = inference_config[idx] + retrieval_config = inference_config_item["retrieval_config"] + if retrieval_config["use_retrieval"] and retrieval_config["retrieval_before_preprocessing"]: + if retrieval_config["subsample_type"] == "sample": + assert retrieval_config[ + "calculate_sample_attention"], "Retrieval on sample level must calculate sample attention score before." + if retrieval_config["use_type"] == "mixed": + assert retrieval_config[ + "calculate_feature_attention"], "Retrieval on mixed type must calculate sample and feature attention score before." + if retrieval_config["subsample_type"] == "feature": + assert retrieval_config[ + "calculate_feature_attention"], "Retrieval on sample level must calculate feature attention score before." + pipeline.append( + InferenceAttentionMap(model_path, retrieval_config["calculate_feature_attention"], + retrieval_config["calculate_sample_attention"])) + pipeline.append(SubSampleData(retrieval_config["subsample_type"], retrieval_config["use_type"])) + pipeline.append(FilterValidFeatures()) + pipeline.append(RebalanceFeatureDistribution(**inference_config_item['RebalanceFeatureDistribution'])) + pipeline.append(CategoricalFeatureEncoder(**inference_config_item['CategoricalFeatureEncoder'])) + shuffler = FeatureShuffler(**inference_config_item['FeatureShuffler']) + shuffler.shift = all_shifts[idx] + pipeline.append(shuffler) + if retrieval_config["use_retrieval"] and not retrieval_config["retrieval_before_preprocessing"]: + if retrieval_config["subsample_type"] == "sample": + assert retrieval_config[ + "calculate_sample_attention"], "Retrieval on sample level must calculate sample attention score before." + if retrieval_config["use_type"] == "mixed": + assert retrieval_config[ + "calculate_feature_attention"], "Retrieval on mixed type must calculate sample and feature attention score before." + if retrieval_config["subsample_type"] == "feature": + assert retrieval_config[ + "calculate_feature_attention"], "Retrieval on sample level must calculate feature attention score before." + pipeline.append( + InferenceAttentionMap(model_path, retrieval_config["calculate_feature_attention"], + retrieval_config["calculate_sample_attention"])) + pipeline.append(SubSampleData(retrieval_config["subsample_type"], retrieval_config["use_type"])) + self.preprocess_pipelines.append(pipeline) + + + def _check_n_features(self, X, reset): + """Check whether the number of features matches the previous evaluation""" + n_features = X.shape[1] + if reset: + self.n_features_in_ = n_features + else: + if self.n_features_in_ != n_features: + raise ValueError( + f"X has {n_features} features, " + f"but this estimator is expecting {self.n_features_in_} features." 
+            )
+
+    def validate_data(self, x=None, y=None, reset=True, validate_separately=False, **check_params):
+        """Validate x (and optionally y) with sklearn checks.
+
+        Typical check_params: {'accept_sparse': False, 'dtype': None, 'ensure_all_finite': 'allow-nan'}
+        """
+        # Validate both x and y simultaneously
+        if y is not None:
+            x, y = check_X_y(x, y, **check_params)
+            self._check_n_features(x, reset=reset)
+            return x, y
+
+        # Validate x only
+        if x is not None:
+            x = check_array(x, **check_params)
+            self._check_n_features(x, reset=reset)
+            return x
+
+        return None
+
+    def convert_x_dtypes(self, x:np.ndarray, dtypes:Literal["float32", "float64"] = "float64"):
+        NUMERIC_DTYPE_KINDS = "?bBiufm"
+        OBJECT_DTYPE_KINDS = "OV"
+        STRING_DTYPE_KINDS = "SaU"
+
+        if x.dtype.kind in NUMERIC_DTYPE_KINDS:
+            x = pd.DataFrame(x, copy=False, dtype=dtypes)
+        elif x.dtype.kind in OBJECT_DTYPE_KINDS:
+            x = pd.DataFrame(x, copy=True)
+            x = x.convert_dtypes()
+        else:
+            raise ValueError(f"Unsupported dtype kind: {x.dtype}. String arrays are not supported.")
+
+        integer_columns = x.select_dtypes(include=["number"]).columns
+        if len(integer_columns) > 0:
+            x[integer_columns] = x[integer_columns].astype(dtypes)
+        return x
+
+    def convert_category2num(self, x, dtype:np.floating=np.float64, placeholder: str = NA_PLACEHOLDER,):
+        ordinal_encoder = OrdinalEncoder(categories="auto",
+                                         dtype=dtype,
+                                         handle_unknown="use_encoded_value",
+                                         unknown_value=-1,
+                                         encoded_missing_value=np.nan)
+        col_encoder = ColumnTransformer(transformers=[("encoder", ordinal_encoder, make_column_selector(dtype_include=["category", "string"]))],
+                                        remainder=FunctionTransformer(),
+                                        sparse_threshold=0.0,
+                                        verbose_feature_names_out=False,
+                                        )
+
+        string_cols = x.select_dtypes(include=["string", "object"]).columns
+        if len(string_cols) > 0:
+            x[string_cols] = x[string_cols].fillna(placeholder)
+
+        X_encoded = col_encoder.fit_transform(x)
+
+        # Restore NaN for the rows that originally held missing values in string columns.
+        string_cols_ix = [x.columns.get_loc(col) for col in string_cols]
+        placeholder_mask = x[string_cols] == placeholder
+        string_cols_ix_2 = list(range(len(string_cols_ix)))
+        X_encoded[:, string_cols_ix_2] = np.where(
+            placeholder_mask,
+            np.nan,
+            X_encoded[:, string_cols_ix_2],
+        )
+
+        return X_encoded
+
+
+    def get_categorical_features_indices(self, x:np.ndarray):
+        if x.shape[0] < self.min_seq_len_for_category_infer:
+            return []
+        categorical_idx = []
+        for idx, col in enumerate(x.T):
+            if len(np.unique(col)) < self.min_unique_num_for_numerical_infer:
+                categorical_idx.append(idx)
+        return categorical_idx
+
+    def predict(self, X_test):
+        # TODO: incorrectly assumes label-encoded input data; bad practice, fix later
+        pred = self.predict_proba(X_test)
+        if self.task_type == "Classification":
+            return np.argmax(pred, axis=1)
+        return pred
+
+    def predict_proba(self, X_test):
+
+        predict_batch_size = 5000
+
+        def get_batch_intervals(n, bs):
+            return [(i, min(i + bs, n)) for i in range(0, n, bs)]
+
+        if len(X_test) <= predict_batch_size:
+            return self._predict(x_train=self.X_train, y_train=self.y_train, x_test=X_test)
+
+        # Predict in chunks of predict_batch_size rows to bound peak memory, then stitch
+        # the per-chunk outputs back together along the sample axis.
+        return np.concatenate(
+            [
+                self._predict(x_train=self.X_train, y_train=self.y_train, x_test=X_test[s:e])
+                for s, e in get_batch_intervals(X_test.shape[0], predict_batch_size)
+            ],
+            axis=0,
+        )
+
+    def _predict(self, x_train:np.ndarray, y_train:np.ndarray, x_test:np.ndarray) -> np.ndarray:
+        """
+        Perform inference using the LimiX model.
+
+        Args:
+            x_train: Training data x
+            y_train: Training data y
+            x_test: Testing data x
+        """
+        if self.task_type == "Classification":
+            return self._predict_cls(x_train, y_train, 
x_test)
+        elif self.task_type == "Regression":
+            return self._predict_reg(x_train, y_train, x_test)
+        else:
+            raise ValueError("Unsupported task type; supported tasks are Classification and Regression.")
+
+    def _predict_cls(self, x_train:np.ndarray, y_train:np.ndarray, x_test:np.ndarray) -> np.ndarray:
+        np_rng = np.random.default_rng(self.seed)
+
+        x_train, y_train = self.validate_data(x_train, y_train, reset=True, validate_separately=False, accept_sparse=False, dtype=None, force_all_finite=False)
+        x_test = self.validate_data(x_test, reset=True, validate_separately=False, accept_sparse=False, dtype=None, force_all_finite=False)
+
+        # Concatenate x_train and x_test so the preprocessing logic is applied consistently to both.
+        x = np.concatenate([x_train, x_test], axis=0)
+
+        # Encode y_train
+        self.label_encoder = LabelEncoder()
+        y = self.label_encoder.fit_transform(y_train)
+        self.classes = self.label_encoder.classes_
+        self.n_classes = len(self.classes)
+
+        # Build one class permutation per estimator so ensemble members see different label orderings
+        noise = np_rng.random((self.n_estimators * self.class_shuffle_factor, self.n_classes))
+        shufflings = np.argsort(noise, axis=1)
+        uniqs = np.unique(shufflings, axis=0)
+        balance_count = self.n_estimators // len(uniqs)
+        self.class_permutations = list(chain.from_iterable(repeat(elem, balance_count) for elem in uniqs))
+        count = self.n_estimators % len(uniqs)
+        if count > 0:
+            self.class_permutations += [uniqs[i] for i in np_rng.choice(len(uniqs), size=count)]
+
+        # Preprocess x
+        x = self.convert_x_dtypes(x)
+        x = self.convert_category2num(x)
+        categorical_idx = self.get_categorical_features_indices(x)
+        outputs = []
+        mask_predictions = []
+        for id_pipe, pipe in enumerate(self.preprocess_pipelines):
+            x_ = x.copy()
+            y_ = self.class_permutations[id_pipe][y.copy()]
+            categorical_idx_ = categorical_idx.copy()
+            for id_step, step in enumerate(pipe):
+                if isinstance(step, RebalanceFeatureDistribution):
+                    x_train_ = x_[:len(y_train)]
+                    x_test_ = x_[len(y_train):]
+                    if x_train_.shape[1] != x_test_.shape[1]:
+                        x_test_ = x_test_[:, :x_train_.shape[1]]
+                    x_train_, categorical_idx_ = step.fit_transform(x_train_, categorical_idx_, self.seeds[id_pipe*self.preprocess_num+id_step])
+                    x_test_, categorical_idx_ = step.transform(x_test_)
+                    x_ = np.concatenate([x_train_, x_test_], axis=0)
+                elif isinstance(step, InferenceAttentionMap):
+                    feature_attention_score, sample_attention_score = step.inference(X_train=x_[:len(y_train)],
+                                                                                     y_train=y_train,
+                                                                                     X_test=x_[len(y_train):],
+                                                                                     task_type="cls")
+
+                elif isinstance(step, SubSampleData):
+                    step.fit(torch.from_numpy(x_[:len(y_train)]), torch.from_numpy(y_train),
+                             feature_attention_score=feature_attention_score,
+                             sample_attention_score=sample_attention_score,
+                             subsample_ratio=self.inference_config[id_pipe]["retrieval_config"]["subsample_ratio"])
+                    if self.inference_config[id_pipe]["retrieval_config"]["subsample_type"] == "feature":
+                        x_ = step.transform(torch.from_numpy(x_[len(y_train):]).float())
+                        categorical_idx_ = self.get_categorical_features_indices(x_)
+                    else:
+                        attention_score = step.transform(torch.from_numpy(x_[len(y_train):]).float())
+                else:
+                    x_, categorical_idx_ = step.fit_transform(x_, categorical_idx_, self.seeds[id_pipe*self.preprocess_num+id_step])
+
+            x_ = torch.from_numpy(x_[:, :]).float().to(self.device)
+            y_ = torch.from_numpy(y_).float().to(self.device)
+            torch.manual_seed(self.seed)
+            torch.cuda.manual_seed_all(self.seed)
+            if 
self.inference_config[id_pipe]["retrieval_config"]["use_retrieval"] and \ + self.inference_config[id_pipe]["retrieval_config"]["subsample_type"] == "sample": + inference = InferenceResultWithRetrieval(model=self.model, + sample_selection_type="AM") + # Remove .squeeze() here as it broke the pipeline if the dataset has only one feature + output = inference.inference(x_[:len(y_train)], y_, + x_[len(y_train):], + attention_score=attention_score, + retrieval_len=self.inference_config[id_pipe]["retrieval_config"][ + "subsample_ratio"], + dynamic_ratio=self.inference_config[id_pipe]["retrieval_config"][ + "dynamic_ratio"] if "dynamic_ratio" in self.inference_config[id_pipe][ + "retrieval_config"] else None, + task_type="cls") + if self.softmax_temperature != 1: + output = (output[:, :self.n_classes].float() / self.softmax_temperature) + + output = output[..., self.class_permutations[id_pipe]] + outputs.append(output) + elif self.inference_with_DDP: + inference = InferenceResultWithRetrieval(model=self.model, + sample_selection_type="DDP") + output = inference.inference(x_[:len(y_train)].squeeze(1), y_, x_[len(y_train):].squeeze(1), + task_type="cls") + if self.softmax_temperature != 1: + output = (output[:, :self.n_classes].float() / self.softmax_temperature) + + output = output[..., self.class_permutations[id_pipe]] + outputs.append(output) + else: + self.model.to(self.device) + with(torch.autocast(device_type='cuda', enabled=self.mix_precision), torch.inference_mode()): + x_=x_.unsqueeze(0) + y_ = y_.unsqueeze(0) + output=self.model(x=x_, y=y_, eval_pos=y_.shape[1], task_type='cls') + + if self.mask_prediction: + process_config = output['process_config'] + output_feature_pred = self.PostProcessInModel(output['feature_pred'], process_config) + output_feature_pred = self.PostProcess(output_feature_pred, pipe, process_config) + mask_predictions.append(output_feature_pred) + output = output['cls_output'] + + output = output if isinstance(output, dict) else output.squeeze(0) + + if self.softmax_temperature != 1: + output = (output[:, :self.n_classes].float() / self.softmax_temperature) + + output = output[..., self.class_permutations[id_pipe]] + outputs.append(output) + + outputs = [torch.nn.functional.softmax(o, dim=1) for o in outputs] + output = torch.stack(outputs).mean(dim=0) + mask_prediction = np.stack(mask_predictions).mean(axis=0) if mask_predictions != [] and self.mask_prediction else None + output = output.float().cpu().numpy() + + if self.mask_prediction: + return output / output.sum(axis=1, keepdims=True), mask_prediction + else: + return output / output.sum(axis=1, keepdims=True) + + def PostProcessInModel(self, feature_pred:torch.tensor, config: dict) -> torch.tensor: + # Revert preprocess in model forward + feature_pred = feature_pred / torch.sqrt(config['features_per_group'] / config['num_used_features'].to(self.device)) + feature_pred = feature_pred*config['std_for_normalization'] + config['mean_for_normalization'] + feature_pred = einops.rearrange(feature_pred, "b s f n -> s b (f n)").squeeze(1).float().cpu().numpy() + if config['n_x_padding'] > 0: + feature_pred = feature_pred[:,:-config['n_x_padding']] + return feature_pred + + def PostProcess(self, feature_pred:np.ndarray, pipeline:List, config: dict, gt=False) -> np.ndarray: + # Revert preprocess in the Classifier + for id_step, step in enumerate(reversed(pipeline)): + if isinstance(step, FeatureShuffler): + if step.mode == "shuffle": + inv_p = np.argsort(step.feature_indices) + feature_pred = feature_pred[:, inv_p] + else: + 
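+                    # Only the "shuffle" mode records feature_indices, so only it can be inverted here.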
raise NotImplementedError + elif isinstance(step, CategoricalFeatureEncoder): + if step.encoding_strategy != 'onehot': + if step.category_mappings is not None: + categorical_indices = list(step.category_mappings.keys()) + feature_pred[:, categorical_indices] = np.round(feature_pred[:, categorical_indices]) + if step.transformer is not None: + for idx, p in step.category_mappings.items(): + feature_pred[:, idx] = np.clip(feature_pred[:, idx], a_min=0, a_max=max(p)) + inv_p = np.argsort(p) + feature_pred[:, idx] = inv_p[feature_pred[:, idx].astype(int)].astype(feature_pred.dtype) + inv_col = np.argsort(step.feature_indices) + feature_pred = feature_pred[:, inv_col] + else: + if len(step.categorical_features) == 0 or step.transformer is None: + continue + cont_features_indices = [idx for idx in range(feature_pred.shape[1]) if idx not in step.categorical_features] + + assert np.array_equal(step.categorical_features, np.arange(len(step.categorical_features))) + start_idx = 0 + for idx, out_category in enumerate(step.transformer.named_transformers_['one_hot_encoder'].categories_): + assert len(out_category) >= 2 + if not np.any(np.isnan(out_category)): + if len(out_category) == 2: # e.g. [3, 5.5] + feature_pred[:,start_idx] = np.round(np.clip(feature_pred[:,start_idx], a_min=0, a_max=1)) + start_idx += 1 + else: + arr = feature_pred[:, start_idx:start_idx+len(out_category)] + feature_pred[:, start_idx:start_idx+len(out_category)] = (arr == arr.max(axis=1, keepdims=True)).astype(float) + start_idx += len(out_category) + else: + if len(out_category) == 2: # e.g. [0, nan] + feature_pred[:,start_idx] = 0 + start_idx += 1 + else: + arr = feature_pred[:, start_idx:start_idx+len(out_category)-1] + feature_pred[:, start_idx:start_idx+len(out_category)-1] = (arr == arr.max(axis=1, keepdims=True)).astype(float) + feature_pred[:, start_idx+len(out_category)-1] = 0 + start_idx += len(out_category) + feature_pred = np.column_stack([step.transformer.named_transformers_['one_hot_encoder'].inverse_transform(feature_pred[:, step.categorical_features]), feature_pred[:, cont_features_indices]]) + + elif isinstance(step, RebalanceFeatureDistribution): + if step.svd_tag == 'svd' and step.svd_n_comp > 0: + feature_pred = feature_pred[:, :-step.svd_n_comp] + if step.worker_tags[0] in ["quantile_uniform_10", "quantile_uniform_5", "quantile_uniform_all_data"] and step.n_quantile_features > 0: + feature_pred = feature_pred[:, :-step.n_quantile_features] + elif step.worker_tags[0] == "power": + raise ValueError(f"Missing value imputation does not currently support the preprocessing method of power!") + cont_features_indices = [idx for idx in range(feature_pred.shape[1]) if idx not in step.dis_ix] + feature_pred[:, cont_features_indices] = step.worker.named_transformers_['feat_transform'].inverse_transform(feature_pred[:, cont_features_indices]) + # reverse feature order + if step.feature_indices is not None: + inv_p = np.argsort(step.feature_indices) + feature_pred = feature_pred[:, inv_p] + + + elif isinstance(step, FilterValidFeatures): + deleted_indices = np.where(step.invalid_indices)[0] + if len(deleted_indices) > 0: + original_cols = len(deleted_indices) + feature_pred.shape[1] + restored = np.zeros((feature_pred.shape[0], original_cols)) + all_indices = set(range(original_cols)) + kept_indices = list(all_indices - set(deleted_indices)) + for i, idx in enumerate(kept_indices): + restored[:, idx] = feature_pred[:, i] + for i, idx in enumerate(deleted_indices): + restored[:, idx] = step.invalid_features[:, i] + 
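+                # Re-insert the constant columns dropped by FilterValidFeatures at their original positions before returning.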
feature_pred = restored.copy() + return feature_pred + + def _predict_reg(self, x_train:np.ndarray, y_train:np.ndarray, x_test:np.ndarray) -> np.ndarray: + + # For some reason, they scale the data outside of the model, we do it here. + y_mean = y_train.mean() + y_std = y_train.std() + y_train = (y_train - y_mean) / y_std + + np_rng = np.random.default_rng(self.seed) + + x_train, y_train = self.validate_data(x_train, y_train, reset=True, validate_separately=False, accept_sparse=False, dtype=None, force_all_finite=False) + x_test = self.validate_data(x_test, reset=True, validate_separately=False, accept_sparse=False, dtype=None, force_all_finite=False) + + # "Concatenate x_train and x_test to ensure the preprocessing logic is completely consistent. + x = np.concatenate([x_train, x_test], axis=0) + + # preprocess x + x = self.convert_x_dtypes(x) + x = self.convert_category2num(x) + x = x.astype(float) + categorical_idx = self.get_categorical_features_indices(x) + + outputs = [] + mask_predictions = [] + for id_pipe, pipe in enumerate(self.preprocess_pipelines): + x_ = x.copy() + y_ = y_train.copy() + categorical_idx_ = categorical_idx.copy() + for id_step, step in enumerate(pipe): + if isinstance(step, RebalanceFeatureDistribution): + x_train_ = x_[:len(y_train)] + x_test_ = x_[len(y_train):] + if x_train_.shape[1] != x_test_.shape[1]: + x_test_ = x_test_[:, :x_train_.shape[1]] + x_train_, categorical_idx_ = step.fit_transform(x_train_, categorical_idx_, self.seeds[id_pipe*self.preprocess_num+id_step]) + x_test_, categorical_idx_ = step.transform(x_test_) + x_ = np.concatenate([x_train_, x_test_], axis=0) + elif isinstance(step, InferenceAttentionMap): + + feature_attention_score, sample_attention_score = step.inference(X_train=x_[:len(y_train)], + y_train=y_train, + X_test=x_[len(y_train):], + task_type="reg") + + elif isinstance(step, SubSampleData): + step.fit(torch.from_numpy(x_[:len(y_train)]), torch.from_numpy(y_train), + feature_attention_score=feature_attention_score, + sample_attention_score=sample_attention_score, + subsample_ratio=self.inference_config[id_pipe]["retrieval_config"]["subsample_ratio"]) + if self.inference_config[id_pipe]["retrieval_config"]["subsample_type"] == "feature": + x_ = step.transform(torch.from_numpy(x_[len(y_train):]).float()) + categorical_idx_ = self.get_categorical_features_indices(x_) + else: + attention_score = step.transform(torch.from_numpy(x_[len(y_train):]).float()) + else: + x_, categorical_idx_ = step.fit_transform(x_, categorical_idx_, self.seeds[id_pipe*self.preprocess_num+id_step]) + + x_ = torch.from_numpy(x_[:, :]).float().to(self.device) + y_ = torch.from_numpy(y_).float().to(self.device) + torch.manual_seed(self.seed) + torch.cuda.manual_seed_all(self.seed) + if self.inference_config[id_pipe]["retrieval_config"]["use_retrieval"] and \ + self.inference_config[id_pipe]["retrieval_config"]["subsample_type"] == "sample": + inference = InferenceResultWithRetrieval(model=self.model, + sample_selection_type="AM") + output = inference.inference(x_[:len(y_train)], y_, + x_[len(y_train):], + attention_score=attention_score, + retrieval_len=self.inference_config[id_pipe]["retrieval_config"][ + "subsample_ratio"], task_type="reg") + outputs.append(output) + elif self.inference_with_DDP: + inference = InferenceResultWithRetrieval(model=self.model, + sample_selection_type="DDP") + output = inference.inference(x_[:len(y_train)].squeeze(1), y_, x_[len(y_train):].squeeze(1), + task_type="reg") + outputs.append(output) + else: + self.model.to(self.device) 
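+                # Default path (no retrieval, no DDP): one forward pass over the concatenated train/test rows, optionally with mixed precision.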
+                with torch.autocast(device_type='cuda', enabled=self.mix_precision), torch.inference_mode():
+                    x_ = x_.unsqueeze(0)
+                    y_ = y_.unsqueeze(0)
+
+                    output = self.model(x=x_, y=y_, eval_pos=y_.shape[1], task_type='reg')
+
+                    if self.mask_prediction:
+                        process_config = output['process_config']
+                        output_feature_pred = self.PostProcessInModel(output['feature_pred'], process_config)
+                        output_feature_pred = self.PostProcess(output_feature_pred, pipe, process_config)
+                        mask_predictions.append(output_feature_pred)
+                        output = output['reg_output']
+
+                    output = output if isinstance(output, dict) else output.squeeze(0)
+                    outputs.append(output)
+
+        output = torch.stack(outputs).squeeze(2).mean(dim=0)
+        mask_prediction = np.stack(mask_predictions).mean(axis=0) if mask_predictions != [] else None
+        output = (output * y_std) + y_mean
+        output = output.cpu().numpy()
+
+        if self.mask_prediction:
+            return output, mask_prediction
+        else:
+            return output
diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/inference/preprocess.py b/tabrepo/benchmark/models/ag/limix/LimiX/inference/preprocess.py
new file mode 100644
index 00000000..f60ec146
--- /dev/null
+++ b/tabrepo/benchmark/models/ag/limix/LimiX/inference/preprocess.py
@@ -0,0 +1,589 @@
+import numpy as np
+from torch.utils.data import DataLoader, Dataset
+import torch
+import warnings
+import scipy
+from typing_extensions import override
+from typing import Literal, Any
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import (
+    OneHotEncoder,
+    OrdinalEncoder,
+    FunctionTransformer,
+    PowerTransformer,
+    StandardScaler,
+    QuantileTransformer,
+    MinMaxScaler,
+)
+from sklearn.pipeline import FeatureUnion, Pipeline
+from sklearn.impute import SimpleImputer
+from sklearn.decomposition import TruncatedSVD
+from sklearn.utils.validation import check_is_fitted
+from tabrepo.benchmark.models.ag.limix.LimiX.utils.data_utils import TabularInferenceDataset
+from functools import partial
+
+MAXINT_RANDOM_SEED = int(np.iinfo(np.int32).max)
+
+class SelectiveInversePipeline(Pipeline):
+    def __init__(self, steps, skip_inverse=None):
+        super().__init__(steps)
+        self.skip_inverse = skip_inverse or []
+
+    def inverse_transform(self, X):
+        """Apply inverse_transform in reverse step order, skipping the steps named in ``skip_inverse``."""
+        if X.shape[1] == 0:
+            return X
+        for step_idx in range(len(self.steps) - 1, -1, -1):
+            name, transformer = self.steps[step_idx]
+            try:
+                check_is_fitted(transformer)
+            except Exception:
+                continue
+
+            if name in self.skip_inverse:
+                continue
+
+            if hasattr(transformer, 'inverse_transform'):
+                X = transformer.inverse_transform(X)
+                if np.any(np.isnan(X)):
+                    print(f"After reversing step {name} of RebalanceFeatureDistribution, NaN values are present")
+        return X
+
+class RobustPowerTransformer(PowerTransformer):
+    """PowerTransformer with automatic feature reversion when variance or value constraints fail."""
+
+    def __init__(self, var_tolerance: float = 1e-3,
+                 max_abs_value: float = 100,
+                 **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+        self.var_tolerance = var_tolerance
+        self.max_abs_value = max_abs_value
+        self.restore_indices_: np.ndarray | None = None
+
+    def fit(self, X, y=None):
+        fitted = super().fit(X, y)
+        self.restore_indices_ = np.array([], dtype=int)
+        return fitted
+
+    def fit_transform(self, X, y=None):
+        Z = super().fit_transform(X, y)
+        self.restore_indices_ = self._should_revert(Z)
+        return Z
+
+    def _should_revert(self, Z: np.ndarray) -> np.ndarray:
+        """Determine which columns to revert to their original values."""
+        variances = np.nanvar(Z, axis=0)
+        bad_var = 
np.flatnonzero(np.abs(variances - 1.0) > self.var_tolerance) + + bad_large = np.flatnonzero(np.any(Z > self.max_abs_value, axis=0)) + + return np.unique(np.concatenate([bad_var, bad_large])) + + def _apply_reversion(self, Z: np.ndarray, X: np.ndarray) -> np.ndarray: + if self.restore_indices_.size > 0: + Z[:, self.restore_indices_] = X[:, self.restore_indices_] + return Z + + def transform(self, X): + Z = super().transform(X) + # self.restore_indices_ = self._should_revert(Z) + return self._apply_reversion(Z, X) + + def _yeo_johnson_optimize(self, x: np.ndarray) -> float: + "Overload_yeo_johnson_optimize to avoid crashes caused by values such as NaN and Inf." + try: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", + message=r"overflow encountered", + category=RuntimeWarning) + return super()._yeo_johnson_optimize(x) # type: ignore + except scipy.optimize._optimize.BracketError: + return np.nan + + def _yeo_johnson_transform(self, x: np.ndarray, lmbda: float) -> np.ndarray: + "_yeo_johnson_transform to avoid crashes caused by NaN" + if np.isnan(lmbda): + return x + return super()._yeo_johnson_transform(x, lmbda) # type: ignore + +class BasePreprocess: + """Abstract base class for preprocessing class""" + + def fit(self, x:np.ndarray, categorical_features:list[int], seed:int)->list[int]: + """Fit the preprocessing model to the data""" + raise NotImplementedError + + def transform(self, x:np.ndarray)->tuple[np.ndarray, list[int]]: + """Transform the data using the fitted preprocessing model""" + raise NotImplementedError + + def fit_transform(self, x:np.ndarray, categorical_features:list[int], seed:int)->tuple[np.ndarray, list[int]]: + """Fit the preprocessing model to the data and transform the data""" + self.fit(x, categorical_features, seed) + return self.transform(x) + +def infer_random_state( + random_state: int | np.random.RandomState | np.random.Generator | None, +) -> tuple[int, np.random.Generator]: + """Infer the random state and return the seed and generator""" + if random_state is None: + np_rng = np.random.default_rng() + return int(np_rng.integers(0, MAXINT_RANDOM_SEED)), np_rng + + if isinstance(random_state, (int, np.integer)): + return int(random_state), np.random.default_rng(random_state) + + if isinstance(random_state, np.random.RandomState): + seed = int(random_state.randint(0, MAXINT_RANDOM_SEED)) + return seed, np.random.default_rng(seed) + + if isinstance(random_state, np.random.Generator): + return int(random_state.integers(0, MAXINT_RANDOM_SEED)), random_state + + raise ValueError(f"Invalid random_state {random_state}") + +class FilterValidFeatures(BasePreprocess): + def __init__(self): + self.valid_features: list[bool] | None = None + self.categorical_idx: list[int] | None = None + self.invalid_indices: list[int] | None = None + self.invalid_features: list[int] | None = None + + @override + def fit(self,x: np.ndarray, categorical_idx: list[int], seed:int) -> list[int]: + self.categorical_idx = categorical_idx + self.valid_features = ((x[0:1, :] == x).mean(axis=0) < 1.0).tolist() + self.invalid_indices = ((x[0:1, :] == x).mean(axis=0) == 1.0).tolist() + if not any(self.valid_features): + raise ValueError("All features are constant! 
Please check your data.")
+
+        self.categorical_idx = [
+            index
+            for index, idx in enumerate(np.where(self.valid_features)[0])
+            if idx in categorical_idx
+        ]
+
+        return self.categorical_idx
+
+    @override
+    def transform(self, x: np.ndarray) -> tuple[np.ndarray, list[int]]:
+        assert self.valid_features is not None, "You must call fit first to get effective_features"
+        self.invalid_features = x[:, self.invalid_indices]
+        return x[:, self.valid_features], self.categorical_idx
+
+class FeatureShuffler(BasePreprocess):
+    """Feature column reordering preprocessor."""
+
+    def __init__(
+        self,
+        mode: Literal['rotate', 'shuffle'] | None = "shuffle",
+        offset: int = 0,
+    ):
+        super().__init__()
+        self.mode = mode
+        self.offset = offset
+        self.random_seed = None
+        self.feature_indices = None
+        self.categorical_indices = None
+
+    @override
+    def fit(self, data: np.ndarray, categorical_cols: list[int], seed: int) -> list[int]:
+        n_features = data.shape[1]
+        self.random_seed = seed
+
+        indices = np.arange(n_features)
+
+        if self.mode == "rotate":
+            self.feature_indices = np.roll(indices, self.offset)
+        elif self.mode == "shuffle":
+            _, rng = infer_random_state(self.random_seed)
+            self.feature_indices = rng.permutation(indices)
+        elif self.mode is None:
+            self.feature_indices = np.arange(n_features)
+        else:
+            raise ValueError(f"Unsupported reordering mode: {self.mode}")
+
+        is_categorical = np.isin(np.arange(n_features), categorical_cols)
+        self.categorical_indices = np.where(is_categorical[self.feature_indices])[0].tolist()
+
+        return self.categorical_indices
+
+    @override
+    def transform(self, data: np.ndarray, *, is_test: bool = False) -> tuple[np.ndarray, list[int]]:
+        if self.feature_indices is None:
+            raise RuntimeError("Please call the fit method first to initialize")
+        if len(self.feature_indices) != data.shape[1]:
+            raise ValueError("The number of features in the input data does not match the training data")
+
+        return data[:, self.feature_indices], self.categorical_indices or []
+
+class CategoricalFeatureEncoder(BasePreprocess):
+    """Categorical feature encoder."""
+
+    def __init__(
+        self,
+        encoding_strategy: Literal['ordinal', 'ordinal_strict_feature_shuffled', 'ordinal_shuffled', 'onehot', 'numeric'] | None = "ordinal",
+    ):
+        super().__init__()
+        self.encoding_strategy = encoding_strategy
+        self.random_seed = None
+        self.transformer = None
+        self.category_mappings = None
+        self.categorical_features = None
+
+    @override
+    def fit(self, data: np.ndarray, feature_indices: list[int], seed: int) -> list[int]:
+        self.random_seed = seed
+        self.transformer, self.categorical_features = self._create_transformer(data, feature_indices)
+
+        if self.transformer is not None:
+            self.transformer.fit(data)
+
+        if self.encoding_strategy == "ordinal_shuffled":
+            _, rng = infer_random_state(self.random_seed)
+            categories = self.transformer.named_transformers_["ordinal_encoder"].categories_
+            self.category_mappings = {
+                idx: rng.permutation(len(cat))
+                for idx, cat in enumerate(categories)
+            }
+
+        return self.categorical_features
+
+    @override
+    def transform(self, data: np.ndarray, *, is_test: bool = False) -> tuple[np.ndarray, list[int]]:
+        if self.transformer is None:
+            return data, self.categorical_features or []
+        # TODO: not taking effect?
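+        # The fitted ColumnTransformer runs first; shuffled category mappings (if present) are then re-applied column by column.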
+ transformed = self.transformer.transform(data) + + if self.category_mappings is not None: + for col_idx, mapping in self.category_mappings.items(): + col_data = transformed[:, col_idx] + valid_mask = ~np.isnan(col_data) + col_data[valid_mask] = mapping[col_data[valid_mask].astype(int)] + + return transformed, self.categorical_features + + @override + def fit_transform(self, data: np.ndarray, categorical_columns: list[int], seed:int) -> tuple[np.ndarray, list[int]]: + self.random_seed = seed + return self._fit_transform(data, categorical_columns) + + def _fit_transform( + self, + X: np.ndarray, + categorical_features: list[int], + ) -> tuple[np.ndarray, list[int]]: + ct, categorical_features = self._create_transformer(X, categorical_features) + if ct is None: + self.transformer = None + return X, categorical_features + + _, rng = infer_random_state(self.random_seed) + + if self.encoding_strategy.startswith("ordinal"): + Xt = ct.fit_transform(X) + categorical_features = list(range(len(categorical_features))) + + if self.encoding_strategy.endswith("_shuffled"): + self.category_mappings = {} + for col_ix in categorical_features: + col_cats = len( + ct.named_transformers_["ordinal_encoder"].categories_[col_ix], + ) + perm = rng.permutation(col_cats) + self.category_mappings[col_ix] = perm + + col_data = Xt[:, col_ix] + valid_mask = ~np.isnan(col_data) + col_data[valid_mask] = perm[col_data[valid_mask].astype(int)].astype(col_data.dtype) + + elif self.encoding_strategy == "onehot": + Xt = ct.fit_transform(X) + if Xt.size >= 1_000_000: + ct = None + Xt = X + else: + categorical_features = list(range(Xt.shape[1]))[ + ct.output_indices_["one_hot_encoder"] + ] + else: + raise ValueError( + f"Unknown categorical transform {self.encoding_strategy}", + ) + + self.transformer = ct + self.categorical_features = categorical_features + return Xt, categorical_features + + @staticmethod + def get_least_common_category_count(column: np.ndarray) -> int: + """Retrieve the smallest count value among categorical features""" + if len(column) == 0: + return 0 + return int(np.unique(column, return_counts=True)[1].min()) + + def _create_transformer(self, data: np.ndarray, categorical_columns: list[int]) -> tuple[ColumnTransformer | None, list[int]]: + """Create an appropriate column transformer""" + if self.encoding_strategy.startswith("ordinal"): + suffix = self.encoding_strategy[len("ordinal"):] + + if "feature_shuffled" in suffix: + categorical_columns = [ + idx for idx in categorical_columns + if self._is_valid_common_category(data[:, idx], suffix) + ] + remainder_columns = [idx for idx in range(data.shape[1]) if idx not in categorical_columns] + self.feature_indices = categorical_columns + remainder_columns + + return ColumnTransformer( + [("ordinal_encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan), categorical_columns)], + remainder="passthrough" + ), categorical_columns + + elif self.encoding_strategy == "onehot": + return ColumnTransformer( + [("one_hot_encoder", OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="ignore"), categorical_columns)], + remainder="passthrough" + ), categorical_columns + + elif self.encoding_strategy in ("numeric", "none"): + return None, categorical_columns + + raise ValueError(f"Unsupported encoding strategy: {self.encoding_strategy}") + + def _is_valid_common_category(self, column: np.ndarray, suffix: str) -> bool: + """Check whether the input data meets the common category conditions""" + min_count = 
self.get_least_common_category_count(column) + unique_count = len(np.unique(column)) + + if "strict_feature_shuffled" in suffix: + return min_count >= 10 and unique_count < (len(column) // 10) + return min_count >= 10 + +# Avoid lambda to support pickle... +def identity_function(x): + return x +def feature_shift(x): + return x + np.abs(np.nanmin(x)) +def add_epsilon(x): + return x + 1e-10 + +class RebalanceFeatureDistribution(BasePreprocess): + def __init__( + self, + *, + worker_tags: list[Literal['quantile', 'logNormal', 'quantile_uniform_10', 'quantile_uniform_5']] | None = ["quantile"], + discrete_flag: bool = False, + original_flag: bool = False, + svd_tag: Literal['svd'] | None = None, + joined_svd_feature: bool = True, + joined_log_normal: bool = True, + ): + super().__init__() + self.worker_tags = worker_tags + self.discrete_flag = discrete_flag + self.original_flag = original_flag + self.random_state = None + self.svd_tag = svd_tag + self.worker: Pipeline | ColumnTransformer | None = None + self.joined_svd_feature = joined_svd_feature + self.joined_log_normal = joined_log_normal + self.feature_indices = None + + def fit(self, X: np.ndarray, categorical_features: list[int], seed:int) -> list[int]: + self.random_state = seed + n_samples, n_features = X.shape + worker, self.dis_ix = self._set(n_samples,n_features,categorical_features) + worker.fit(X) + self.worker = worker + return self.dis_ix + + def transform(self, X: np.ndarray) -> np.ndarray: + assert self.worker is not None + return self.worker.transform(X), self.dis_ix # type: ignore + + + def _set(self,n_samples: int, + n_features: int, + categorical_features: list[int], + ): + static_seed, rng = infer_random_state(self.random_state) + all_ix = list(range(n_features)) + workers = [] + cont_ix = [i for i in all_ix if i not in categorical_features] + if self.original_flag: + trans_ixs = categorical_features + cont_ix if self.discrete_flag else cont_ix + workers.append(("original", "passthrough", all_ix)) + dis_ix = categorical_features + elif self.discrete_flag: + # trans_ixs = all_ix + # dis_ix = categorical_features + trans_ixs = categorical_features + cont_ix + self.feature_indices = categorical_features + cont_ix + dis_ix = [] + else: + workers.append(("discrete", "passthrough", categorical_features)) + trans_ixs, dis_ix = cont_ix, list(range(len(categorical_features))) + for worker_tag in self.worker_tags: + if worker_tag== "quantile": + sworker = QuantileTransformer( + output_distribution="uniform", + n_quantiles=max(n_samples // 10, 2), + random_state=static_seed, + ) + elif worker_tag == "logNormal": + sworker = Pipeline(steps=[ + ("save_standard", Pipeline(steps=[ + ("i2n_pre", + FunctionTransformer( + func=partial(np.nan_to_num, nan=np.nan, neginf=np.nan, posinf=np.nan), + inverse_func=identity_function, check_inverse=False)), + ("fill_missing_pre", + SimpleImputer(missing_values=np.nan, strategy="mean", + keep_empty_features=True)), + ("feature_shift", + FunctionTransformer(func=feature_shift)), + ("add_epsilon", FunctionTransformer(func=add_epsilon)), + ("logNormal", FunctionTransformer(np.log, validate=False)), + ("i2n_post", + FunctionTransformer( + func=partial(np.nan_to_num, nan=np.nan, neginf=np.nan, + posinf=np.nan), + inverse_func=identity_function, check_inverse=False)), + ("fill_missing_post", + SimpleImputer(missing_values=np.nan, strategy="mean", + keep_empty_features=True))])), + ]) + + + trans_ixs = cont_ix + elif worker_tag == "quantile_uniform_10": + sworker = QuantileTransformer( + 
output_distribution="uniform", + n_quantiles=max(n_samples // 10, 2), + random_state=static_seed, + ) + elif worker_tag == "quantile_uniform_5": + sworker = QuantileTransformer( + output_distribution="uniform", + n_quantiles=max(n_samples // 5, 2), + random_state=static_seed, + ) + elif worker_tag == "quantile_uniform_all_data": + sworker = QuantileTransformer( + output_distribution="uniform", + n_quantiles=max(n_samples // 5, 2), + random_state=static_seed, + subsample=n_samples, + ) + elif worker_tag == 'power': + self.feature_indices = categorical_features+cont_ix + self.dis_ix = dis_ix + nan_to_mean_transformer = SimpleImputer( + missing_values=np.nan, + strategy="mean", + keep_empty_features=True, + ) + + sworker = SelectiveInversePipeline( + steps=[ + ("power_transformer", RobustPowerTransformer(standardize=False)), + ("inf_to_nan_1", FunctionTransformer( + func=partial(np.nan_to_num, nan=np.nan, neginf=np.nan, posinf=np.nan), + inverse_func=identity_function, + check_inverse=False, + )), + ("nan_to_mean_1", nan_to_mean_transformer), + ("scaler", StandardScaler()), + ("inf_to_nan_2", FunctionTransformer( + func=partial(np.nan_to_num, nan=np.nan, neginf=np.nan, posinf=np.nan), + inverse_func=identity_function, + check_inverse=False, + )), + ("nan_to_mean_2", nan_to_mean_transformer), + ], + skip_inverse=['nan_to_mean_1', 'nan_to_mean_2'] + ) + else: + sworker = FunctionTransformer(identity_function) + if worker_tag in ["quantile_uniform_10", "quantile_uniform_5", "quantile_uniform_all_data"]: + self.n_quantile_features = len(trans_ixs) + workers.append(("feat_transform", sworker, trans_ixs)) + + CT_worker = ColumnTransformer(workers,remainder="drop",sparse_threshold=0.0) + if self.svd_tag == "svd" and n_features >= 2: + svd_worker = FeatureUnion([ + ("default", FunctionTransformer(func=identity_function)), + ("svd",Pipeline(steps=[ + ("save_standard",Pipeline(steps=[ + ("i2n_pre", FunctionTransformer(func=partial(np.nan_to_num, nan=np.nan, neginf=np.nan, posinf=np.nan),inverse_func=identity_function, check_inverse=False)), + ("fill_missing_pre", SimpleImputer(missing_values=np.nan, strategy="mean", keep_empty_features=True)), + ("standard", StandardScaler(with_mean=False)) , + ("i2n_post", FunctionTransformer(func=partial(np.nan_to_num, nan=np.nan, neginf=np.nan, posinf=np.nan),inverse_func=identity_function, check_inverse=False)), + ("fill_missing_post", SimpleImputer(missing_values=np.nan, strategy="mean", keep_empty_features=True))])), + ("svd",TruncatedSVD(algorithm="arpack",n_components=max(1,min(n_samples // 10 + 1,n_features // 2)),random_state=static_seed))])) + ]) + self.svd_n_comp = max(1,min(n_samples // 10 + 1,n_features // 2)) + worker = Pipeline([("worker", CT_worker), ("svd_worker", svd_worker)]) + else: + self.svd_n_comp = 0 + worker = CT_worker + + self.worker = worker + return worker, dis_ix + + +class SubSampleData(): + def __init__( + self, + subsample_type: Literal["feature", "sample"] = "sample", + use_type: Literal["mixed", "only_sample"] = "mixed", + ): + super().__init__() + self.subsample_type = subsample_type + self.use_type = use_type + + def fit(self, + x: torch.Tensor=None, + y: torch.Tensor = None, + feature_attention_score: torch.Tensor = None, + sample_attention_score: torch.Tensor = None, + subsample_ratio: float | int = 200, + subsample_idx:list[int] | np.ndarray[int] = None, + ): + if isinstance(subsample_ratio, float): + if self.subsample_type == "sample": + self.subsample_num = int(subsample_ratio * x.shape[0]) + else: + self.subsample_num = 
int(subsample_ratio * x.shape[1]) + else: + self.subsample_num = subsample_ratio + if self.subsample_type == "sample": + if self.use_type == "mixed": + y_feature_attention_score = feature_attention_score[:, -1, :].squeeze().permute(1, 0).unsqueeze( + 2).repeat(1, 1, + sample_attention_score.shape[2]) # shape [features,test_sample_lens,train_sample_lens] + + self.attention_score = torch.mean(sample_attention_score * y_feature_attention_score, + dim=0) # shape [test_sample_lens,train_sample_lens] + else: + self.attention_score = sample_attention_score[-1, :, :] + self.X_train = x + self.y_train = y + else: + y_feature_attention_score = torch.mean(feature_attention_score[:, -1, :].squeeze(),dim=0) # shape [test_sample_lens,features] + if subsample_idx is None: + self.subsample_idx = np.argsort(y_feature_attention_score)[-min(self.subsample_num, x.shape[0]):] + else: + self.subsample_idx = subsample_idx + self.X_train = x + + def transform(self, x: torch.Tensor=None) -> np.ndarray |torch.Tensor | TabularInferenceDataset: + if self.subsample_type == "feature": + return torch.cat([self.X_train, x], dim=0)[:, self.subsample_idx].numpy() + else: + return self.attention_score diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/model/encoders.py b/tabrepo/benchmark/models/ag/limix/LimiX/model/encoders.py new file mode 100644 index 00000000..6c7c6937 --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/model/encoders.py @@ -0,0 +1,555 @@ +import torch +import torch.nn as nn +from tabrepo.benchmark.models.ag.limix.LimiX.model.layer import EncoderBaseLayer, MLP +from typing import Any,Literal +from torch.nn.init import orthogonal_ +import numpy as np + +def calc_mean(x:torch.Tensor, dim:int): + num = torch.sum(~torch.isnan(x), dim=dim).clip(min=1.0) + return torch.nansum(x, dim=dim) / num, num + +def calc_std(x:torch.Tensor, dim:int, mean_v:torch.Tensor|None = None, value_num:torch.Tensor|None=None ): + if mean_v is None or value_num is None: + mean_v, value_num = calc_mean(x, dim) + mean_broadcast = torch.repeat_interleave(mean_v.unsqueeze(dim), x.shape[dim], dim=dim,) + return torch.sqrt(torch.nansum(torch.square(mean_broadcast - x), dim=dim) / (value_num - 1)) + +def drop_outliers( + x:torch.Tensor, + std_sigma:float=4, + eval_pos:int=-1, + lower:torch.Tensor|None = None, + upper:torch.Tensor|None = None, + dim:int=1 + ): + assert len(x.shape)==3, "x.shape must be B,S,F" + + if lower is None: + data = x[:,:eval_pos].clone() + data_mean, value_num = calc_mean(data, dim=dim) + data_std = calc_std(data, dim=dim, mean_v=data_mean, value_num=value_num) + cut_off = data_std * std_sigma + lower, upper = data_mean - cut_off, data_mean + cut_off + + data[torch.logical_or(data > upper, data < lower)] = np.nan + data_mean, value_num = calc_mean(data, dim=dim) + data_std = calc_std(data, dim=dim, mean_v=data_mean, value_num=value_num) + cut_off = data_std * std_sigma + lower, upper = data_mean - cut_off, data_mean + cut_off + + x = torch.maximum(-torch.log(1 + torch.abs(x)) + lower, x) + x = torch.minimum(torch.log(1 + torch.abs(x)) + upper, x) + + return x, lower, upper + +def normalize_mean0_std1( + x:torch.Tensor, + eval_pos:int=-1, + clip:bool=True, + dim:int=1, + mean: torch.Tensor | None = None, + std: torch.Tensor | None = None + ): + if mean is None: + mean, value_num = calc_mean(x[:,:eval_pos], dim=dim) + std = calc_std(x[:,:eval_pos], dim=dim, mean_v=mean, value_num=value_num) + 1e-20 + + if x.shape[1] == 1 or eval_pos == 1: + std[:] = 1.0 + x = (x - mean.unsqueeze(1).expand_as(x)) / 
std.unsqueeze(1).expand_as(x) + if clip: + x = torch.clip(x, min=-100, max=100) + return x, mean, std + + +class LinearEncoder(nn.Module): + """linear input encoder""" + def __init__( + self, + num_features: int, + emsize: int, + nan_to_zero: bool = False, + bias: bool = True, + in_keys:list[str]=['data'], + out_key:str='data', + ): + """Initialize the LinearEncoder. + + Args: + num_features: The number of input features. + emsize: The embedding size, i.e. the number of output features. + nan_to_zero: Whether to replace NaN values in the input by zero. Defaults to False. + bias: Whether to use a bias term in the linear layer. Defaults to True. + """ + super().__init__() + self.layer = nn.Linear(num_features, emsize, bias=bias) + self.nan_to_zero = nan_to_zero + self.in_keys = in_keys + self.out_key = out_key + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + assert 'data' in input and 'nan_encoding' in input + x = [input[key] for key in self.in_keys] + x = torch.cat(x, dim=-1) # type: ignore + if self.nan_to_zero: + x = torch.nan_to_num(x, nan=0.0) + + input[self.out_key] = self.layer(x) + return input + +class MLPEncoder(nn.Module): + """MLP input encoder""" + def __init__( + self, + num_features: int, + emsize: int, + nan_to_zero: bool = False, + bias: bool = True, + in_keys: list[str] = ['data'], + out_key: str = 'data', + ): + """Initialize the MLPEncoder. + + Args: + num_features: The number of input features. + emsize: The embedding size, i.e. the number of output features. + nan_to_zero: Whether to replace NaN values in the input by zero. Defaults to False. + bias: Whether to use a bias term in the linear layer. Defaults to True. + """ + super().__init__() + self.layer = nn.Sequential( + nn.Linear(num_features, emsize * 2, bias=bias), + nn.LayerNorm(emsize * 2), + nn.GELU(), + nn.Linear(emsize * 2, emsize, bias=bias), + nn.LayerNorm(emsize), + nn.GELU() + ) + self.nan_to_zero = nan_to_zero + self.in_keys = in_keys + self.out_key = out_key + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + assert 'data' in input and 'nan_encoding' in input + x = [input[key] for key in self.in_keys] + x = torch.cat(x, dim=-1) # type: ignore + if self.nan_to_zero: + x = torch.nan_to_num(x, nan=0.0) + input[self.out_key] = x + return input + +class MaskEmbEncoder(nn.Module): + """ + For masked features, use the mask vector to obtain their representations; + for numerical features, use a nonlinear network to obtain their representations + """ + def __init__( + self, + num_features: int, + emsize: int, + mask_embedding_size: int, + nan_to_zero: bool = False, + bias: bool = True, + in_keys: list[str] = ['data'], + out_key: str = 'data', + ): + """Initialize the MaskEmbEncoder. + + Args: + num_features: The number of input features. + emsize: The embedding size, i.e. the number of output features. + nan_to_zero: Whether to replace NaN values in the input by zero. Defaults to False. + bias: Whether to use a bias term in the linear layer. Defaults to True. 
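+            mask_embedding_size: Size of the shared learnable embedding vector used for masked (NaN) positions.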
+ """ + super().__init__() + self.embedding_dim = emsize + self.mask_embedding_size = mask_embedding_size + self.in_keys = in_keys + self.out_key = out_key + + # All masked positions use the same vector + self.mask_embedding = nn.Parameter(torch.randn(self.mask_embedding_size)) + + # MLP for numerical features: input is 1, output is embedding_dim + self.numeric_mlp = nn.Sequential( + nn.Linear(1, self.embedding_dim // 2, bias=bias), + nn.LayerNorm(self.embedding_dim // 2), + nn.ReLU(), + nn.Linear(self.embedding_dim // 2, self.embedding_dim, bias=bias), + nn.LayerNorm(self.embedding_dim), + nn.ReLU() + ) + + # Merging layer: maps the concatenated feature vectors back to embedding_dim. + self.fusion_network = nn.Sequential( + nn.Linear(num_features * self.embedding_dim, self.embedding_dim, bias=bias), + nn.LayerNorm(self.embedding_dim), + nn.ReLU(), + nn.Linear(self.embedding_dim, self.embedding_dim, bias=bias), + nn.LayerNorm(self.embedding_dim) + ) + self.nan_to_zero = nan_to_zero + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + assert 'data' in input and 'nan_encoding' in input + x = [input[key] for key in self.in_keys] + x = torch.cat(x, dim=-1) # type: ignore + batch_size, seq_len, group, feature_num = x.shape + x_flat = x.view(-1, feature_num) + is_mask = torch.isnan(x_flat) + feature_embeddings = [] + for i in range(feature_num): + feat_vals = x_flat[:, i].unsqueeze(-1) + feat_is_mask = is_mask[:, i].unsqueeze(-1) + + # Processing numerical features + numeric_input = torch.where(~feat_is_mask, feat_vals, torch.zeros_like(feat_vals)) + numeric_emb = self.numeric_mlp(numeric_input) + + # Construct mask embedding + mask_emb = self.mask_embedding.expand(numeric_emb.shape[0], -1) + + # Merge the embedding results of masked features and numerical features + combined_emb = torch.where(feat_is_mask.expand_as(numeric_emb), mask_emb, numeric_emb) + feature_embeddings.append(combined_emb) + concat_vector = torch.cat(feature_embeddings, dim=-1) + + sample_representation = self.fusion_network(concat_vector) + output = sample_representation.view(batch_size, seq_len, group, -1) + + + input[self.out_key] = output + return input + +class NanEncoder(nn.Module): + """Encoder stage that deals with NaN and infinite values in the input""" + def __init__( + self, + nan_value: float = -2.0, + inf_value: float = 2.0, + neg_info_value: float = 4.0, + in_keys:list[str]=['data'], + out_key:str='nan_encoding' + ): + """Initialize the NanEncoder. + + Args: + keep_nans: Flag to maintain NaN values as individual indicators. 
+ """ + super().__init__() + self.nan_value = nan_value + self.inf_value = inf_value + self.neg_info_value = neg_info_value + self.in_keys = in_keys + self.out_key = out_key + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + x:torch.Tensor = input[self.in_keys[0]] # type: ignore + eval_pos = input['eval_pos'] + + mean_value, _ = calc_mean(x[:,:eval_pos,:], dim=1) + + nans_indicator = torch.zeros_like(x, dtype=x.dtype) + nans_indicator[torch.isnan(x)] = self.nan_value + pos_inf_mask = torch.isinf(x) & (torch.sign(x) == 1) + nans_indicator[pos_inf_mask] = self.inf_value + neg_inf_mask = torch.isinf(x) & (torch.sign(x) == -1) + nans_indicator[neg_inf_mask] = self.neg_info_value + nan_mask = torch.logical_or(torch.isnan(x), torch.isinf(x)) + # avoid inplace operations + x = x.clone() + x[nan_mask] = mean_value.unsqueeze(1).expand_as(x)[nan_mask] + + input[self.in_keys[0]] = x + input[self.out_key ] = nans_indicator + return input + + +class ValidFeatureEncoder(nn.Module): + """Valid feature encoder""" + def __init__( + self, + num_features: int, + nan_normalize: bool=True, + sqrt_normalize: bool=True, + in_keys:list[str]=['data'], + out_key:str='data' + ): + """Initialize the ValidFeatureEncoder. + + Args: + num_features: The target number of features to transform the input into. + nan_normalize: Indicates whether to normalize based on the number of features actually used. + sqrt_normalize: Legacy option to normalize using the square root rather than the count of used features. + """ + super().__init__() + self.num_features = num_features + self.nan_normalize = nan_normalize + self.sqrt_normalize = sqrt_normalize + self.in_keys = in_keys + self.out_key = out_key + self.valid_feature_num = None + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + x:torch.Tensor = input[self.in_keys[0]] # type: ignore + valid_feature = ~torch.all(x == x[:, 0:1, :], dim=1) + self.valid_feature_num = torch.clip(valid_feature.sum(-1).unsqueeze(-1),min=1) + + if self.nan_normalize: + if self.sqrt_normalize: + x = x * torch.sqrt(self.num_features / self.valid_feature_num).unsqueeze(1).expand_as(x) + else: + x = x * (self.num_features / self.valid_feature_num) + + zeros = torch.zeros( + *x.shape[:-1], + self.num_features - x.shape[-1], + device=x.device, + dtype=x.dtype, + ) + x = torch.cat([x, zeros], -1) + + input[self.out_key] = x + return input + + +class EmbYEncoderStep(nn.Module): + """A simple linear input encoder step.""" + + def __init__( + self, + *, + emsize: int, + n_classes: int = 10, + in_keys: list[str] = ['data'], + out_key: str = 'data', + ): + """Initialize the EmbYEncoderStep. + + Args: + emsize: The embedding size, i.e. the number of output features. + n_classes: Number of classes + """ + super().__init__() + + # Ensure the embedding dimension is large enough to support orthogonal initialization. + assert emsize > n_classes + 1, (f"emsize ({emsize}) must be >= n_classes+1 ({n_classes+1}) for orthogonal initialization") + + # Generate an orthogonal matrix of size (n_classes + 1) × emsize + ortho_matrix = torch.empty(n_classes + 1, emsize) + orthogonal_(ortho_matrix) # Initialize in-place as an orthogonal matrix + + # Decompose the matrix: the first n_classes rows are used for y_embedding, and the last row is used for y_mask. 
+ y_embed_weights = ortho_matrix[:n_classes, :] # Shape (n_classes, emsize) + y_mask_weight = ortho_matrix[n_classes:n_classes+1, :] # Shape (1, emsize) + + self.y_embedding = nn.Embedding(n_classes, emsize) + self.y_embedding.weight.data = y_embed_weights.clone() + + self.y_mask = nn.Embedding(1, emsize) + self.y_mask.weight.data = y_mask_weight.clone() + self.in_keys = in_keys + self.out_key = out_key + if len(self.in_keys) > 1: + print("Warning: The EmbYEncoderStepl function is only for processing Y, and in_keys must contain exactly one key.") + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + y = input[self.in_keys[0]] + eval_pos = input['eval_pos'] + y = y.int() # type: ignore + y_train = y[:,:eval_pos] + y_test = torch.zeros_like(y[:, eval_pos:], dtype=torch.int) + y_train_emb = self.y_embedding(y_train).to(torch.float16) + y_test_emb = self.y_mask(y_test).to(torch.float16) + y_emb = torch.cat([y_train_emb, y_test_emb], dim=1) + + input[self.out_key] = y_emb + return input + +class MulticlassTargetEncoder(nn.Module): + """Use the target's index as the class value, with each class corresponding to an index""" + def __init__( + self, + in_keys:list[str]=['data'], + out_key:str='data' + ): + """Initialize the ValidFeatureEncoder. + + Args: + in_keys: the keys of the input parameter + out_key: the key of the output result. + """ + super().__init__() + self.in_keys = in_keys + self.out_key = out_key + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + x:torch.Tensor = input[self.in_keys[0]] # type: ignore + eval_pos = input['eval_pos'] + unique_xs = [ + torch.unique(x[b, :eval_pos]) for b in range(x.shape[0]) + ] + x_ = x.clone() + for b in range(x.shape[0]): + x_[b, :, :] = (x[b, :, :].unsqueeze(-1) > unique_xs[b]).sum(dim=-1) + + input[self.out_key] = x_ + return input + +class NormalizationEncoder(nn.Module): + """normalize encoder""" + def __init__( + self, + train_only:bool, + normalize_x:bool, + remove_outliers:bool, + std_sigma:float=4.0, + in_keys:list[str]=['data'], + out_key:str='data' + + ): + super().__init__() + self.train_only = train_only + self.normalize_x = normalize_x + self.remove_outliers = remove_outliers + self.std_sigma = std_sigma + self.in_keys = in_keys + self.out_key = out_key + self.mean = None + self.std = None + + def forward(self, input:dict[str, torch.Tensor|int])->dict[str, torch.Tensor]: + x = input[self.in_keys[0]] + eval_pos = input['eval_pos'] + pos = eval_pos if self.train_only else -1 + if self.remove_outliers: + x, lower, upper = drop_outliers(x, eval_pos=pos, std_sigma=self.std_sigma) + if self.normalize_x: + x, self.mean, self.std = normalize_mean0_std1(x, eval_pos=pos ) + + input[self.out_key] = x + return input + + + +def get_x_encoder( + *, + num_features: int, + embedding_size: int, + mask_embedding_size: int, + encoder_use_bias: bool, + in_keys: list = ['data'] +): + inputs_to_merge = {} + for in_key in in_keys: + inputs_to_merge[in_key] = {'dim': num_features} + + encoder_steps = [ + MaskEmbEncoder( + num_features=sum([i["dim"] for i in inputs_to_merge.values()]), + emsize=embedding_size, + mask_embedding_size=mask_embedding_size, + bias=encoder_use_bias, + ), + ] + + return nn.Sequential(*encoder_steps,) + + +def get_cls_y_encoder( + *, + num_inputs: int, + embedding_size: int, + nan_handling_y_encoder: bool, + max_num_classes: int +) -> nn.Module: + steps = [] + inputs_to_merge = [{"name": "data", "dim": num_inputs}] + if nan_handling_y_encoder: + steps += 
[NanEncoder(in_keys=['data'], out_key='nan_encoding')] + inputs_to_merge += [{"name": "nan_indicators", "dim": num_inputs}] + + if max_num_classes >= 2: + steps += [MulticlassTargetEncoder()] + + steps += [ + EmbYEncoderStep( + emsize=embedding_size, + n_classes=max_num_classes + ) + ] + return nn.Sequential(*steps) + +def get_reg_y_encoder( + *, + num_inputs: int, + embedding_size: int, + nan_handling_y_encoder: bool, + max_num_classes: int +) -> nn.Module: + steps = [] + inputs_to_merge = [{"name": "data", "dim": num_inputs}] + if nan_handling_y_encoder: + steps += [NanEncoder(in_keys=['data'], out_key='nan_encoding')] + inputs_to_merge += [{"name": "nan_indicators", "dim": num_inputs}] + + steps += [ + LinearEncoder( + num_features=sum([i["dim"] for i in inputs_to_merge]), # type: ignore + emsize=embedding_size, + in_keys=['data', 'nan_encoding'], + out_key='data' + ), + ] + return nn.Sequential(*steps) + + +def preprocesss_4_x( + *, + num_features: int, + nan_handling_enabled: bool, + normalize_on_train_only: bool, + normalize_x: bool, + remove_outliers: bool, + normalize_by_used_features: bool, + ): + """feature preprocess""" + inputs_to_merge = {"data": {"dim": num_features}} + + preprocess_steps = [] + + # Obtain the positions of features with NaN and Inf values, and replace these features with the mean of the corresponding feature + preprocess_steps += [NanEncoder(in_keys=['data'], out_key='nan_encoding')] + + if nan_handling_enabled: + inputs_to_merge["nan_encoding"] = {"dim": num_features} + preprocess_steps += [ + # Zero values are added to convert the input into a fixed number of features, without normalization (variance is not constant). + # This transformation is applied to the nan_indicators set, which shares the same shape as x. + # However, since x has been imputed prior to this step, this operation is theoretically redundant. + ValidFeatureEncoder( + num_features=num_features, + nan_normalize=False, + in_keys=["nan_encoding"], + out_key="nan_encoding" + ), + ] + + preprocess_steps += [ + NormalizationEncoder( + train_only=normalize_on_train_only, + normalize_x=normalize_x, + remove_outliers=remove_outliers, + ), + ] + + preprocess_steps += [ + # Convert the input into a fixed number of features by adding zero values, with normalization applied (variance is constant). + ValidFeatureEncoder( + num_features=num_features, + nan_normalize=normalize_by_used_features, + ), + ] + + return nn.Sequential(*preprocess_steps) \ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/model/layer.py b/tabrepo/benchmark/models/ag/limix/LimiX/model/layer.py new file mode 100644 index 00000000..85ba11d2 --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/model/layer.py @@ -0,0 +1,461 @@ +from typing import Callable, Literal, Optional +import functools + +import torch +import torch.nn as nn +from torch.utils.checkpoint import checkpoint + +try: + from flash_attn.flash_attn_interface import flash_attn_varlen_kvpacked_func, flash_attn_varlen_qkvpacked_func + + HAVE_FLASH_ATTN = True +except (ModuleNotFoundError, ImportError): + HAVE_FLASH_ATTN = False + +from functools import partial +from typing_extensions import override + +Activation = Literal['gelu'] + +ACTIVATION_FN: dict[str, Callable[[torch.Tensor], torch.Tensor]] = { + 'gelu': nn.GELU(), + 'relu': nn.ReLU(), +} + +class LayerNormMixedPrecision(nn.LayerNorm): + """ + When the embedding dimension is below 512, use half precision for computation to improve performance. 
+ If the embedding dimension exceeds 512, it may cause training instability. + """ + def forward(self, input: torch.Tensor): + if input.dtype == torch.float16 and sum(self.normalized_shape) < 512: + with torch.amp.autocast("cuda" if input.is_cuda else "cpu", enabled=False): + return super().forward(input) + else: + return super().forward(input) + +class MLP(torch.nn.Module): + """Multi-Layer Perceptron""" + def __init__(self, + in_features: int, + hidden_size:int, + out_features: int, + has_bias:bool, + device: torch.device | None, + dtype: torch.dtype | None, + activation: Activation = 'gelu', + depth:int=2): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.activation = activation + self.layers = [] + + if depth == 1: + self.layers.append(nn.Linear(in_features, out_features, bias=has_bias, device=device, dtype=dtype)) + else: + # input layer + self.layers.append(nn.Linear(in_features, hidden_size, bias=has_bias, device=device, dtype=dtype)) + self.layers.append(ACTIVATION_FN[self.activation]) + # hidden layers + for i in range(depth - 2): + self.layers.append(nn.Linear(hidden_size, hidden_size, bias=has_bias, device=device, dtype=dtype)) + self.layers.append(ACTIVATION_FN[self.activation]) + # output layer + self.layers.append(nn.Linear(hidden_size, out_features, bias=has_bias, device=device, dtype=dtype)) + torch.nn.init.normal_(self.layers[-1].weight) + self.mlp = nn.Sequential(*self.layers) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.mlp(x) + +class MultiheadAttention(torch.nn.Module): + def __init__( + self, + embed_dim: int, + num_heads: int, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + qkv_combined: bool = True, + dropout:float=0, + recompute:bool=False + ): + super().__init__() + assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads" + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.qkv_combined = qkv_combined + self.dropout = dropout + self.recompute = recompute + self.device = device + self.dtype = dtype + + self.out_proj_weight = torch.nn.Parameter(torch.empty(self.num_heads, self.head_dim, self.embed_dim, device=self.device, dtype=self.dtype)) + self.qkv_proj_weight = torch.nn.Parameter(torch.empty(3, self.num_heads, self.head_dim, self.embed_dim, device=device, dtype=dtype)) + + torch.nn.init.normal_(self.out_proj_weight) + nn.init.xavier_uniform_(self.qkv_proj_weight) + + self.q_proj_weight = None + self.kv_proj_weight = None + + if recompute: + self.forward = partial(checkpoint, self.forward, use_reentrant=False) # type: ignore + + def get_cu_seqlens(self, batch_size: int, seqlen: int, device: torch.device) -> torch.Tensor: + return torch.arange( + 0, + (batch_size + 1) * seqlen, + step=seqlen, + dtype=torch.int32, + device=device, + ) + + def compute_attention_by_torch(self, qkv:torch.Tensor|None, q:torch.Tensor|None, kv:torch.Tensor|None, attn_mask:torch.Tensor|None) -> torch.Tensor: + '''Since flash attention does not support attn_mask, use scaled_dot_product_attention to compute attention when attn_mask is not None''' + if qkv is not None: + q, k, v = qkv.unbind(dim=-3) + elif kv is not None and q is not None: + k,v = kv.unbind(dim=-3) + else: + raise ValueError("When qkv is None, q and kv cannot both be None at the same time") + assert q is not None and k is not None and v is not None, "q, k, and v must not be None" + + attention_outputs = 
torch.nn.functional.scaled_dot_product_attention( + q.transpose(1, 2), + k.transpose(1, 2), + v.transpose(1, 2), + attn_mask=attn_mask, + dropout_p=self.dropout, + ) + attention_outputs = attention_outputs.transpose(1, 2) + return attention_outputs + + def compute_attention_by_flashattn(self, qkv:torch.Tensor|None, q:torch.Tensor|None, kv:torch.Tensor|None) -> torch.Tensor: + "Compute attention using flash attention" + assert HAVE_FLASH_ATTN, "Flash attention is not supported. Please install/reinstall flash attention." + if self.qkv_combined and qkv is not None: + B,S = qkv.shape[:2] + atten_out = flash_attn_varlen_qkvpacked_func( # type: ignore + qkv.reshape(B * S, 3, self.num_heads, self.head_dim), + self.get_cu_seqlens(B, S, qkv.device), + S, + dropout_p=self.dropout, + softmax_scale=None, + causal=False, + return_attn_probs=False, + deterministic=False, + ) + elif not self.qkv_combined and q is not None and kv is not None: + B,S = q.shape[:2] + kv_shape = kv.shape + atten_out = flash_attn_varlen_kvpacked_func( # type: ignore + q.reshape(B * S, self.num_heads, self.head_dim), + kv.reshape(B * kv_shape[1], 2, self.num_heads, self.head_dim), + self.get_cu_seqlens(B, S, q.device), + self.get_cu_seqlens(B, kv_shape[1], kv.device), + S, + kv_shape[1], + dropout_p=self.dropout, + causal=False, + return_attn_probs=False, + deterministic=False, + ) + return atten_out # type: ignore + + @override + def forward(self, + x: torch.Tensor, + x_kv: Optional[torch.Tensor] = None, + copy_first_head_kv: bool = False, + attn_mask: torch.Tensor | None = None, + calculate_sample_attention:bool=False, + calculate_feature_attention:bool=False) -> tuple[torch.Tensor,torch.Tensor | None,torch.Tensor | None]: + """ + x: [batch_size, seq_len, feature, embed_dim] + kv: Optional[batch_size, seq_len_kv, feature, embed_dim] — only needed if qkv_combined=False + copy_first_head: Reuse the results from the first attention head + """ + # feature attention: [B S F E] + # item attention: [B F S E] + # B, T, C = x.shape + B, S, _, _ = x.shape + assert x.shape[-1] == self.embed_dim + + x = x.reshape(-1, *x.shape[-2:]) + BS, F, E = x.shape + + qkv = None + q = None + kv = None + feature_attention=None + sample_attention=None + # batch_size = None + # seqlen = None + if self.qkv_combined: + qkv = torch.einsum("... s, j h d s -> ... j h d", x, self.qkv_proj_weight) + else: + self.q_proj_weight = self.qkv_proj_weight[0] + self.kv_proj_weight = self.qkv_proj_weight[1:] + assert x_kv is not None, "kv combined attention requires kv input" + x_kv = x_kv.reshape(-1, *x_kv.shape[-2:]) + q = torch.einsum("... s, h d s -> ... h d", x, self.q_proj_weight) + if copy_first_head_kv: + kv_weights = self.kv_proj_weight[:,:1] + kv = torch.einsum("... s, j h d s -> ... j h d", x_kv, kv_weights) + expand_shape = [-1 for _ in kv.shape] + expand_shape[-2] = self.num_heads + kv = kv.expand(*expand_shape) + else: + kv = torch.einsum("... s, j h d s -> ... 
j h d", x_kv, self.kv_proj_weight) + + if attn_mask is None and HAVE_FLASH_ATTN: + atten_out = self.compute_attention_by_flashattn(qkv, q, kv) + else: + atten_out = self.compute_attention_by_torch(qkv, q, kv, attn_mask) + + atten_out = atten_out.reshape(BS, F, self.num_heads, self.head_dim) + + if qkv is not None: + q, k, v = qkv.unbind(dim=2) + else: + k,v=kv.unbind(dim=2) + if calculate_feature_attention: + logits = torch.einsum("b q h d, b k h d -> b q k h", q, k) + logits *= ( + torch.sqrt(torch.tensor(1.0 / (q.shape[-1]*q.shape[-2]))).to(k.device) + ) + ps = torch.softmax(logits, dim=2).to(torch.float16) + del logits + feature_attention = torch.mean(ps, dim=-1) + del ps + if calculate_sample_attention: + logits = torch.einsum("b q h d, b k h d -> b q k h", q, k) + logits *= ( + torch.sqrt(torch.tensor(1.0 / (q.shape[-1] * q.shape[-2]))).to(k.device) + ) + ps = torch.softmax(logits, dim=2).to(torch.float16) + del logits + sample_attention = torch.mean(ps, dim=-1) + del ps + out = torch.einsum( + "... h d, h d s -> ... s", + atten_out, + self.out_proj_weight, + ) + + return out.reshape(B, S, *out.shape[1:]),feature_attention,sample_attention + +class EncoderBaseLayer(nn.Module): + "Base encoder layer of the Transformer model" + def __init__(self, + nhead: int, + embed_dim: int, + hid_dim:int, + dropout: float=0, + activation: str='gelu', + layer_norm_eps: float=1e-5, + device: torch.device|None=None, + dtype: torch.dtype|None=None, + recompute_attn: bool=False, + calculate_sample_attention: bool = False, + calculate_feature_attention: bool = False, + ): + super().__init__() + self.nhead = nhead + self.embed_dim = embed_dim + self.hid_dim = hid_dim + self.dropout = dropout + self.activation = activation + self.layer_norm_eps = layer_norm_eps + self.device = device + self.dtype = dtype + self.head_dim = self.embed_dim // self.nhead + self.recompute_attn = recompute_attn + + self.feature_attentions = [] + self.sequence_attentions = [] + self.mlp = [] + self.feature_attn_num = 1 # feature attention number + self.items_attn_num = 1 # items attention number + self.mlp_num = 1 # mlp number + self.calculate_sample_attention = calculate_sample_attention + self.calculate_feature_attention = calculate_feature_attention + self.feature_attn_num = 2 + self.mlp_num = 3 + + # attention+MLP + self.feature_attentions = nn.ModuleList( + [ + MultiheadAttention( + embed_dim=self.embed_dim, + num_heads=self.nhead, + device=self.device, + dtype=self.dtype, + qkv_combined=True, + dropout=self.dropout, + recompute=self.recompute_attn, + ) + for _ in range(self.feature_attn_num) + ] + ) + self.sequence_attentions = nn.ModuleList( + [ + MultiheadAttention( + embed_dim=self.embed_dim, + num_heads=self.nhead, + device=self.device, + dtype=self.dtype, + qkv_combined=False, + dropout=self.dropout, + recompute=self.recompute_attn, + ) + for _ in range(self.items_attn_num) + ] + ) + self.mlp = nn.ModuleList( + [ + MLP( + in_features=self.embed_dim, + hidden_size=self.hid_dim, + out_features=self.embed_dim, + has_bias=False, + device=self.device, + dtype=self.dtype, + activation=self.activation, + depth=2, + ) + for _ in range(self.mlp_num) + ] + ) + + self.layer_steps = [ + partial( + self.call_features_attention, + index=0 + ), + self.mlp[0], + partial( + self.call_features_attention, + index=1 + ), + self.mlp[1], + partial( + self.call_sequence_attention, + index=0 + ), + self.mlp[2] + ] + + self.layer_norms = nn.ModuleList( + [ + LayerNormMixedPrecision(normalized_shape=self.embed_dim, eps=self.layer_norm_eps, + 
+                                        elementwise_affine=False, device=self.device, dtype=self.dtype)
+                for _ in range(len(self.layer_steps))
+            ]
+        )
+
+    def create_attn_mask(self, q_mask: torch.Tensor, k_mask: torch.Tensor) -> torch.Tensor:
+        """
+        Create a boolean attention mask (True marks pairs that must NOT attend)
+
+        Args:
+            q_mask (torch.Tensor): Query sequence mask, with shape [batch_size, head_count, q_seq_len]
+            k_mask (torch.Tensor): Key sequence mask, with shape [batch_size, head_count, k_seq_len]
+
+        Returns:
+            torch.Tensor: attention mask, with shape [batch_size, head_count, q_seq_len, k_seq_len]
+        """
+        _, _, q_seq_len = q_mask.shape
+        _, _, k_seq_len = k_mask.shape
+
+        q_mask_bool = q_mask.bool()  # [batch_size, head_count, q_seq_len]
+        k_mask_bool = k_mask.bool()  # [batch_size, head_count, k_seq_len]
+
+        q_expanded = q_mask_bool.unsqueeze(-1)
+        k_expanded = k_mask_bool.unsqueeze(-2)
+
+        # A query/key pair is valid only if both positions are unmasked; the
+        # returned mask is True where attention must be suppressed.
+        valid_attn = q_expanded & k_expanded
+        attn_mask = ~valid_attn
+        _, _, q_seq_len, k_seq_len = attn_mask.shape
+        attn_mask = attn_mask.reshape(-1, q_seq_len, k_seq_len)
+        attn_mask = attn_mask.unsqueeze(1).expand(-1, 6, -1, -1)  # NOTE: head count is hard-coded to 6 upstream
+
+        return attn_mask
+
+    def call_features_attention(self, x: torch.Tensor, feature_atten_mask: torch.Tensor | None, eval_pos: int,
+                                index: int = 0, calculate_feature_attention: bool = False):
+        assert len(self.feature_attentions) > index
+        attn_mask = None
+        if feature_atten_mask is not None:
+            attn_mask = self.create_attn_mask(feature_atten_mask, feature_atten_mask)
+        return self.feature_attentions[index](
+            x,
+            x_kv=None,
+            attn_mask=attn_mask,
+            calculate_feature_attention=calculate_feature_attention
+        )
+
+    def call_sequence_attention(self, x: torch.Tensor, feature_atten_mask: torch.Tensor | None, eval_pos: int,
+                                index: int = 0, calculate_sample_attention: bool = False):
+        assert len(self.sequence_attentions) > index
+        sample_attention = None
+        if eval_pos < x.shape[1]:
+            # Test rows attend to the training prefix only.
+            x_test, _, sample_attention = self.sequence_attentions[index](
+                x=x[:, eval_pos:].transpose(1, 2),
+                x_kv=x[:, :eval_pos].transpose(1, 2),
+                copy_first_head_kv=True,
+                calculate_sample_attention=calculate_sample_attention
+            )
+            x_test = x_test.transpose(1, 2)
+        else:
+            x_test = None
+            print("Warning: eval_pos >= x.shape[1]; there are no test rows to attend over.")
+        # Training rows attend among themselves (full self-attention on the prefix).
+        x_train = self.sequence_attentions[index](
+            x=x[:, :eval_pos].transpose(1, 2),
+            x_kv=x[:, :eval_pos].transpose(1, 2)
+        )[0].transpose(1, 2)
+
+        if x_test is not None:
+            return torch.cat([x_train, x_test], dim=1), None, sample_attention
+        # Return a consistent 3-tuple so callers can always unpack the result.
+        return x_train, None, sample_attention
+
+    def forward(self, x: torch.Tensor, feature_atten_mask: torch.Tensor, eval_pos: int, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+        feature_attention = None
+        sample_attention = None
+        for idx, (sublayer, layer_norm) in enumerate(zip(self.layer_steps, self.layer_norms)):
+            residual = x
+            x = layer_norm(x)
+            # Attention maps are only materialized for the final (12th) layer.
+            if idx == 2 and self.calculate_feature_attention and layer_idx == 11:
+                x, feature_attention, _ = sublayer(x, feature_atten_mask, eval_pos, calculate_feature_attention=True)
+            elif idx == 4 and self.calculate_sample_attention and layer_idx == 11:
+                x, _, sample_attention = sublayer(x, feature_atten_mask, eval_pos, calculate_sample_attention=True)
+            else:
+                if isinstance(sublayer, functools.partial):
+                    x = sublayer(x, feature_atten_mask, eval_pos)
+                else:
+                    x = sublayer(x)
+                if isinstance(x, tuple):
+                    x = x[0]
+            x = x + residual
+        return x, feature_attention, sample_attention
+
+class LayerStack(nn.Module):
+    """
+    A flexible container module, similar to ``nn.Sequential``, that passes
+    keyword arguments through to each layer.
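+
+    A minimal usage sketch (the ``EncoderBaseLayer`` arguments and tensor
+    shapes below are illustrative assumptions, not values from a released
+    LimiX configuration)::
+
+        layers = [EncoderBaseLayer(nhead=4, embed_dim=64, hid_dim=128) for _ in range(2)]
+        stack = LayerStack(layers)
+        x = torch.randn(8, 32, 5, 64)  # [batch, seq, feature_groups, embed_dim]
+        out, feat_attn, samp_attn = stack(x, feature_atten_mask=None, eval_pos=24)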
+ """ + def __init__(self, layers: list[nn.Module]): + super().__init__() + self.layers = nn.ModuleList(layers) + + def forward(self, x, **kwargs): + for idx,layer in enumerate(self.layers): + kwargs["layer_idx"] = idx + x,feature_attention,sample_attention = layer(x,**kwargs) + return x,feature_attention,sample_attention diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/model/transformer.py b/tabrepo/benchmark/models/ag/limix/LimiX/model/transformer.py new file mode 100644 index 00000000..75ea811d --- /dev/null +++ b/tabrepo/benchmark/models/ag/limix/LimiX/model/transformer.py @@ -0,0 +1,285 @@ +import torch +import torch.nn as nn +from tabrepo.benchmark.models.ag.limix.LimiX.model.layer import EncoderBaseLayer, MLP, LayerStack +from typing import Any, Literal +from tabrepo.benchmark.models.ag.limix.LimiX.model.encoders import get_x_encoder, get_cls_y_encoder, get_reg_y_encoder, preprocesss_4_x + + + + +class FeaturesTransformer(nn.Module): + def __init__( + self, + *, + preprocess_config_x:dict[str, Any], + encoder_config_x:dict[str, Any], + encoder_config_y:dict[str, Any], + decoder_config:dict[str, Any], + nlayers:int, + nhead: int, + embed_dim: int, + hid_dim:int, + mask_prediction: bool = False, + features_per_group:int = 2, + dropout: float=0, + activation: str='gelu', + layer_norm_eps: float=1e-5, + device: torch.device|None=None, + dtype: torch.dtype|None=None, + recompute_attn: bool=False, + calculate_sample_attention: bool = False, + calculate_feature_attention: bool = False + ): + super().__init__() + + self.preprocess_config_x = preprocess_config_x + self.encoder_config_x = encoder_config_x + self.encoder_config_y = encoder_config_y + self.decoder_config = decoder_config + self.nlayers = nlayers + self.nhead = nhead + self.embed_dim = embed_dim + self.hid_dim = hid_dim + self.mask_prediction = mask_prediction + self.features_per_group = features_per_group + self.dropout = dropout + self.activation = activation + self.layer_norm_eps = layer_norm_eps + self.device = device + self.dtype = dtype + self.recompute_attn = recompute_attn + + layer_creator = lambda: EncoderBaseLayer( + embed_dim=self.embed_dim, + hid_dim=self.hid_dim, + nhead=self.nhead, + dropout=self.dropout, + activation=self.activation, # type: ignore + layer_norm_eps=self.layer_norm_eps, + device=self.device, + dtype=self.dtype, + recompute_attn=self.recompute_attn, + calculate_sample_attention=calculate_sample_attention, + calculate_feature_attention=calculate_feature_attention + ) + + self.encoder_x = get_x_encoder( **encoder_config_x) + self.cls_y_encoder = get_cls_y_encoder(**encoder_config_y) + self.reg_y_encoder = get_reg_y_encoder(**encoder_config_y) + + self.transformer_encoder = LayerStack([layer_creator() for _ in range(self.nlayers)]) + self.encoder_out_norm = nn.LayerNorm(self.embed_dim, eps=1e-5, elementwise_affine=False) + + self.cls_y_decoder = nn.Sequential( + nn.Linear(self.embed_dim, self.hid_dim), + nn.GELU(), + nn.Linear(self.hid_dim, decoder_config['num_classes']), + ) + + self.reg_y_decoder = nn.Sequential( + nn.Linear(self.embed_dim, self.hid_dim), + nn.LayerNorm(self.hid_dim), + nn.GELU(), + nn.Linear(self.hid_dim, 1), + ) + self.feature_decoder = nn.Sequential( + nn.Linear(self.embed_dim, self.hid_dim), + nn.LayerNorm(self.hid_dim), + nn.GELU(), + nn.Linear(self.hid_dim, self.features_per_group), + ) + + self.feature_positional_embedding = nn.Linear(self.embed_dim // 4, self.embed_dim) + + self.x_preprocess = preprocesss_4_x(**preprocess_config_x) + self.calculate_sample_attention = 
calculate_sample_attention
+        self.calculate_feature_attention = calculate_feature_attention
+
+    def forward(self, x: torch.Tensor,
+                y: torch.Tensor,
+                eval_pos: int,
+                task_type: Literal['reg', 'cls'] = 'cls') -> torch.Tensor | dict[str, torch.Tensor] | tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+        '''
+        x: The input x, which includes both train x and test x. Shape: [batch, sequence, feature]
+        y: The input y, which includes both train y and test y. Shape: [batch, label]
+        eval_pos: Split point between train and test along the sequence dimension
+        task_type: Type of task, options: cls (classification), reg (regression)
+        '''
+        assert x is not None and y is not None, "x and y must not be None"
+        assert eval_pos > 0, "eval_pos must be a positive number"
+        assert len(x.shape) == 3, "x must be [Batch, seq, Feature] but is {}".format(x.shape)
+        assert len(y.shape) == 2, "y must be [Batch, label]"
+        assert eval_pos < x.shape[1] and eval_pos <= y.shape[1], "The train/test split point must be less than the sequence length of x, and at most the label length of y"
+
+        batch_size, seq_len, num_feature = x.shape
+        x = {'data': x, 'mask': torch.isnan(x).to(torch.int32).to(x.device)}
+        y = {'data': y}
+
+        feature_to_add = num_feature % self.features_per_group
+        if feature_to_add > 0:
+            # Zero-pad the feature dimension of x up to a multiple of features_per_group
+            for k in x:
+                x[k] = torch.cat(
+                    (
+                        x[k],
+                        torch.zeros(
+                            batch_size,
+                            seq_len,
+                            feature_to_add,
+                            device=x[k].device,
+                            dtype=x[k].dtype
+                        )
+                    ),
+                    dim=-1
+                )
+        for k in x:
+            x[k] = x[k].reshape(batch_size, seq_len, x[k].shape[2] // self.features_per_group, self.features_per_group)
+        x['eval_pos'] = eval_pos
+        preprocessed_x = self.x_preprocess(x)
+        preprocessed_x = self.process_4_x(preprocessed_x)
+        x_encoder_result = self.encoder_x(preprocessed_x)
+        x_emb_result = x_encoder_result['data']
+
+        for k in y:
+            # Extend the label dimension of y when it is shorter than the sequence
+            y[k] = y[k].unsqueeze(-1)
+            if y[k].shape[1] < x['data'].shape[1]:
+                y[k] = torch.cat(
+                    (
+                        y[k],
+                        torch.nan
+                        * torch.zeros(
+                            y[k].shape[0],
+                            x["data"].shape[1] - y[k].shape[1],
+                            y[k].shape[2],
+                            device=y[k].device,
+                            dtype=y[k].dtype,
+                        ),
+                    ),
+                    dim=1
+                )
+        # Mask the test y
+        y["data"][eval_pos:] = torch.nan
+
+        if task_type == 'cls':
+            y_type = torch.zeros_like(y['data'], device=y['data'].device)
+        else:
+            y_type = torch.ones_like(y['data'], device=y['data'].device)
+
+        embedded_y = self.mixed_y_embedding(y, y_type=y_type, eval_pos=eval_pos)
+
+        if torch.isnan(embedded_y).any():
+            raise ValueError("embedded_y contains NaN values; please add a NanEncoder in the encoder")
+
+        embedded_x = self.add_embeddings(x_emb_result)
+        embedded_all = torch.cat((embedded_x, embedded_y.unsqueeze(2)), dim=2)
+        if torch.isnan(embedded_all).any():
+            raise ValueError("embedded_all contains NaN values; please add a NanEncoder in the encoder")
+        if self.calculate_sample_attention or self.calculate_feature_attention:
+            # Return the raw (output, feature_attention, sample_attention) tuple.
+            return self.transformer_encoder(embedded_all, feature_atten_mask=None, eval_pos=eval_pos)
+        encoder_out = self.transformer_encoder(embedded_all, feature_atten_mask=None, eval_pos=eval_pos)[0]
+        encoder_out = self.encoder_out_norm(encoder_out)
+
+        test_encoder_out = encoder_out[:, eval_pos:, -1]
+        test_y_type = y_type[:, eval_pos:]
+        encoder_out_4_feature = encoder_out[:, :, :-1, :]
+        if self.mask_prediction:
+            cls_output, reg_output = self.y_decoder(test_encoder_out, test_y_type)
+            feature_pred = self.feature_decoder(encoder_out_4_feature)
+            output_decoded = {
"cls_output": cls_output, + "reg_output": reg_output, + "feature_pred": feature_pred, + "process_config": { + "n_x_padding": feature_to_add, + "features_per_group": self.x_preprocess[3].num_features, + "num_used_features": self.x_preprocess[3].valid_feature_num, + "mean_for_normalization": self.x_preprocess[2].mean, + "std_for_normalization": self.x_preprocess[2].std + } + } + else: + cls_output, reg_output = self.y_decoder(test_encoder_out, test_y_type) + if task_type=="cls": + output_decoded = cls_output + else: + output_decoded = reg_output + + return output_decoded + + + def mixed_y_embedding(self, y:dict, y_type:torch.Tensor, eval_pos:int): + y = y['data'] + seq_len, batch_size, y_num = y.shape + y_flat = y.reshape(-1) + y_type_flat = y_type.reshape(-1) + + idx = torch.arange(len(y_flat), device=y.device) + idx_cls = idx[y_type_flat == 0] + idx_reg = idx[y_type_flat == 1] + y_cls = y_flat[idx_cls] + y_reg = y_flat[idx_reg] + + y_cls = y_cls.reshape(seq_len, -1, y_num) + y_reg = y_reg.reshape(seq_len, -1, y_num) + y_cls = {'data': y_cls, 'eval_pos':eval_pos} + y_reg = {'data': y_reg, 'eval_pos':eval_pos} + + cls_y_emb = self.cls_y_encoder(y_cls) if len(idx_cls) > 0 else None + reg_y_emb = self.reg_y_encoder(y_reg) if len(idx_reg) > 0 else None + cls_y_emb = cls_y_emb['data'] if cls_y_emb is not None else None + reg_y_emb = reg_y_emb['data'] if reg_y_emb is not None else None + + emb_size = self.embed_dim + out = torch.empty(len(y_flat), emb_size, dtype=torch.float16, device=y_flat.device) + if cls_y_emb is not None: + cls_y_emb_flat = cls_y_emb.reshape(-1, emb_size) + out.index_put_((idx_cls,), cls_y_emb_flat) + + if reg_y_emb is not None: + reg_y_emb_flat = reg_y_emb.reshape(-1, emb_size).to(torch.float16) + out.index_put_((idx_reg,), reg_y_emb_flat) + + output = out.reshape(seq_len, batch_size, emb_size) + return output + + def process_4_x(self, data:dict): + x_input = data['data'] + mask = data['mask'].to(torch.bool) + x_input = torch.where(mask, float('nan'), x_input) + data['data'] = x_input + return data + + def add_embeddings(self, x:torch.Tensor): + with torch.cuda.amp.autocast(enabled=False): + embs = torch.randn( + (x.shape[2], x.shape[3] // 4), + device=x.device, + dtype=torch.float32, + ) + torch.nn.init.orthogonal_(embs) + embs =self.feature_positional_embedding(embs.to(x.dtype)) + x += embs[None, None] + return x + + def y_decoder(self, test_encoder_out, test_y_type): + seq_len, _, emb_size = test_encoder_out.shape + flat_test_encoder_out = test_encoder_out.reshape(-1, emb_size) + flat_test_y_type = test_y_type.reshape(-1) + + idx = torch.arange(len(flat_test_encoder_out), device=flat_test_encoder_out.device) + idx_cls = idx[flat_test_y_type == 0] + idx_reg = idx[flat_test_y_type == 1] + + cls_y_encoder_out = flat_test_encoder_out[idx_cls] + reg_y_encoder_out = flat_test_encoder_out[idx_reg] + cls_y_encoder_out = cls_y_encoder_out.reshape(seq_len, -1, emb_size) + reg_y_encoder_out = reg_y_encoder_out.reshape(seq_len, -1, emb_size) + + cls_y = self.cls_y_decoder(cls_y_encoder_out) + reg_y = self.reg_y_decoder(reg_y_encoder_out) + + return cls_y, reg_y + \ No newline at end of file diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/utils/__init__.py b/tabrepo/benchmark/models/ag/limix/LimiX/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/utils/data_utils.py b/tabrepo/benchmark/models/ag/limix/LimiX/utils/data_utils.py new file mode 100644 index 00000000..ceb48b08 --- /dev/null +++ 
b/tabrepo/benchmark/models/ag/limix/LimiX/utils/data_utils.py @@ -0,0 +1,261 @@ +import os + +import numpy as np +import pandas as pd +import torch +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import MinMaxScaler, LabelEncoder +from torch.utils.data import DataLoader, Dataset +from tqdm import tqdm + +from tabrepo.benchmark.models.ag.limix.LimiX.utils.inference_utils import shuffle_data_along_dim + + +class TabularFinetuneDataset(Dataset): + """ + A custom PyTorch Dataset for fine-tuning, supporting data shuffling and retrieval-based selection. + + This dataset prepares training and testing splits for each item. It can either shuffle the + training data randomly or select training examples based on pre-computed attention scores + (retrieval). For each 'step', it provides a unique training set and a corresponding test set. + """ + + def __init__(self, + X_train: torch.Tensor, + y_train: torch.Tensor, + attention_score: np.ndarray = None, + retrieval_len: int = 2000, + use_retrieval: bool = True, + split_ratio: float = 0.8, + ): + + """ + Initializes the FinetuneDataset. + Args: + X_train (torch.Tensor): The full set of input training data. + y_train (torch.Tensor): The full set of corresponding training labels. + attention_score (np.ndarray, optional): Pre-computed attention scores for retrieval. + Shape: (num_samples_in_X_train,num_samples_in_original_X_test). + Required if use_retrieval is True. + retrieval_len (int, optional): The number of top samples to select based on attention scores. + Used only if use_retrieval is True. + Note: The parameter in init_dataset is named 'train_len'. + use_retrieval (bool, optional): Flag to determine data selection strategy. + If True, uses attention scores for selection. + If False, uses random shuffling. + split_ratio (float, optional): Split ratio for selection strategy. + """ + self.init_dataset(X_train, y_train, attention_score, retrieval_len, use_retrieval, split_ratio) + + def __len__(self): + """ + Returns the number of steps/items in the dataset. + + Returns: + int: The number of steps, which corresponds to the size of the first dimension + of the generated X_test tensor. + """ + return self.max_steps + + def __getitem__(self, idx: int) -> dict[str, list]: + """ + Retrieves a single item (a training/test split configuration) by index. + + Args: + idx (int): The index of the item to retrieve. + + Returns: + dict[str, list]: A dictionary containing the tensors for the training and testing splits + for this specific step/index. + Keys: 'X_train', 'y_train', 'X_test', 'y_test'. 
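+
+        Example (a hedged sketch; tensor shapes and the DataLoader settings
+        are illustrative assumptions)::
+
+            ds = TabularFinetuneDataset(X_train, y_train, use_retrieval=False)
+            loader = DataLoader(ds, batch_size=1)
+            for batch in loader:
+                X_tr, y_tr = batch["X_train"], batch["y_train"]
+                X_te, y_te = batch["X_test"], batch["y_test"]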
+ """ + return dict( + X_train=self.X_train[idx], # Training features for this step + y_train=self.y_train[idx], # Training labels for this step + X_test=self.X_test[idx], # Testing features for this step + y_test=self.y_test[idx], # Testing labels for this step + ) + + def init_dataset(self, + X_train: torch.Tensor, + y_train: torch.Tensor, + attention_score: np.ndarray = None, + train_len: int = 2000, + use_retrieval: bool = False, + split_ratio: float = 0.8, + ): + + if not use_retrieval: + X_train = shuffle_data_along_dim(X_train, 0)[:min(train_len, X_train.shape[0])] + y_train = shuffle_data_along_dim(y_train, 0)[:min(train_len, X_train.shape[0])] + self.X_train = torch.cat([X_train.unsqueeze(0) for _ in range(self.max_steps)], dim=0) + self.y_train = torch.cat([y_train.unsqueeze(0) for _ in range(self.max_steps)], dim=0) + X = self.X_train + y = self.y_train + + # adapt train_test_split mode + split = int(X.shape[1] * split_ratio) + self.X_train = X[:, split:] + self.y_train = y[:, split:] + self.X_test = X[:, :split] + self.y_test = y[:, :split] + self.max_steps = self.X_test.shape[0] + else: + top_k_indices = np.argsort(attention_score)[:, -min(train_len, X_train.shape[0]):] + self.X_train = torch.cat([X_train[x_iter].unsqueeze(0) for x_iter in top_k_indices], dim=0) + self.y_train = torch.cat([y_train[x_iter].unsqueeze(0) for x_iter in top_k_indices], dim=0) + X = shuffle_data_along_dim(self.X_train, 1) + y = shuffle_data_along_dim(self.y_train, 1) + + # adapt train_test_split mode + split = int(X.shape[1] * split_ratio) + self.X_train = X[:, split:] + self.y_train = y[:, split:] + self.X_test = X[:, :split] + self.y_test = y[:, :split] + self.max_steps = self.X_train.shape[0] + + +class TabularInferenceDataset(Dataset): + """ + A PyTorch Dataset for tabular data inference scenarios. + + This dataset is designed to provide data for inference tasks where + you might have a fixed training set and varying test samples, optionally + selecting the training set based on relevance (retrieval) for each test sample. + When retrieval is used, each test sample (or step) is paired with a specific, + potentially unique, subset of the training data. When retrieval is not used, + it's assumed a single, fixed training set is used for all test samples. + """ + + def __init__(self, + X_train: torch.Tensor, + y_train: torch.Tensor, + X_test: torch.Tensor, + attention_score: np.ndarray|torch.Tensor = None, + retrieval_len: int = 2000, + use_retrieval: bool = True, + ): + """ + Initializes the TabularInferenceDataset. + + Args: + X_train (torch.Tensor): The full set of input training features. + Shape: (num_train_samples, ...). + y_train (torch.Tensor): The full set of corresponding training labels. + Shape: (num_train_samples, ...). + X_test (torch.Tensor): The set of input features for inference/test samples. + Shape: (num_test_samples, ...). + attention_score (np.ndarray, optional): Pre-computed attention scores + for retrieval logic. Shape depends + on implementation, e.g., Shape: (num_samples_in_X_train,num_samples_in_X_test). + Required if use_retrieval is True. + retrieval_len (int, optional): The number of top training samples to select + based on attention scores for each test sample. + Used only if use_retrieval is True. + use_retrieval (bool, optional): Flag to determine data preparation strategy. + If True, uses attention scores to select relevant training data + for each test sample. + If False, assumes a fixed training set is used for all. 
+ """ + self.init_dataset(X_train, y_train, X_test, attention_score, retrieval_len, use_retrieval) + # The number of inference steps equals the number of test samples + self.max_steps = self.X_test.shape[0] + self.use_retrieval = use_retrieval + + def __len__(self): + """ + Returns the number of steps/items in the dataset. + Returns: + int: The number of steps, which corresponds to the size of the first dimension + of the generated X_test tensor. + """ + return self.max_steps + + def __getitem__(self, idx: int) -> dict[str, list]: + """ + Retrieves a single item (data for one inference step) by index. + + Args: + idx (int): The index of the test sample/step to retrieve. + + Returns: + dict[str, torch.Tensor]: A dictionary containing the data needed for this inference step. + If `use_retrieval` is True, it includes the specific + `X_train`, `y_train`, and `X_test` for this step. + If `use_retrieval` is False, it only includes `X_test`, + as a fixed training set is assumed. + """ + if self.use_retrieval: + # Return the specific training data selected for this test sample + return dict( + idx=int(idx), + X_train=self.X_train[idx], # Training features for this step (retrieved) + X_test=self.X_test[idx], # Training labels for this step (retrieved) + y_train=self.y_train[idx], # The test sample features + ) + else: + # Return only the test data; training data is assumed to be fixed and + # provided. + return dict( + idx=int(idx), + X_test=self.X_test[idx], + ) + + def init_dataset(self, + X_train: torch.Tensor, + y_train: torch.Tensor, + X_test: torch.Tensor, + attention_score: np.ndarray = None, + train_len: int = 2000, + use_retrieval: bool = False, + ): + if use_retrieval: + print(X_train.shape) + top_k_indices = np.argsort(attention_score)[:, -min(train_len, X_train.shape[0]):] + self.X_train = torch.cat([X_train[x_iter].unsqueeze(0) for x_iter in top_k_indices], dim=0) + self.y_train = torch.cat([y_train[y_iter].unsqueeze(0) for y_iter in top_k_indices], dim=0).unsqueeze(-1) + self.X_test = X_test + else: + self.X_test = X_test + + + + +def load_data(data_root,folder): + le = LabelEncoder() + train_path = os.path.join(data_root,folder, folder + '_train.csv') + test_path = os.path.join(data_root,folder, folder + '_test.csv') + if os.path.exists(train_path): + train_df = pd.read_csv(train_path) + if os.path.exists(test_path): + test_df = pd.read_csv(test_path) + else: + train_df, test_df = train_test_split(train_df, test_size=0.5, random_state=42) + X_train = train_df.iloc[:, :-1] + y_train = train_df.iloc[:, -1] + X_test = test_df.iloc[:, :-1] + y_test = test_df.iloc[:, -1] + for col in X_train.columns: + if X_train[col].dtype == 'object': + try: + le = LabelEncoder() + X_train[col] = le.fit_transform(X_train[col]) + X_test[col] = le.transform(X_test[col]) + except Exception as e: + X_train = X_train.drop(columns=[col]) + X_test = X_test.drop(columns=[col]) + y_train = le.fit_transform(y_train) + y_test = le.transform(y_test) + trainX, trainy = X_train, y_train + trainX = np.asarray(trainX, dtype=np.float32) + trainy = np.asarray(trainy, dtype=np.int64) + + + testX, testy = X_test, y_test + testX = np.asarray(testX, dtype=np.float32) + testy = np.asarray(testy, dtype=np.int64) + return trainX, trainy, testX, testy +if __name__ == '__main__': + pass + diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/utils/inference_utils.py b/tabrepo/benchmark/models/ag/limix/LimiX/utils/inference_utils.py new file mode 100644 index 00000000..70cdcb80 --- /dev/null +++ 
b/tabrepo/benchmark/models/ag/limix/LimiX/utils/inference_utils.py @@ -0,0 +1,190 @@ +import argparse +import json +import logging +import os +from datetime import datetime + +import numpy as np +import torch +from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss +from torch.utils.data import DistributedSampler + + +def shuffle_data_along_dim(X: torch.Tensor | np.ndarray, dim: int = 0) -> torch.Tensor | np.ndarray: + """ + Shuffles data (torch.Tensor or np.ndarray) along a specified axis. + + Args: + X (torch.Tensor | np.ndarray): The input multidimensional tensor or array. + dim (int): The dimension along which to shuffle elements. + + Returns: + X_(torch.Tensor | np.ndarray): A new tensor or array with elements shuffled along the specified dimension. + """ + if isinstance(X, np.ndarray): + shuffled_indices = np.random.permutation(X.shape[dim]) + reshaped_indices = shuffled_indices.reshape( + tuple(1 if i != dim else -1 for i in range(X.ndim)) + ) + shuffled_array = np.take_along_axis(X, reshaped_indices, axis=dim) + return shuffled_array + elif isinstance(X, torch.Tensor): + dim_size = X.size(dim) + shuffled_indices = torch.randperm(dim_size, device=X.device) + index_shape = [1] * X.dim() + index_shape[dim] = dim_size + expanded_indices = shuffled_indices.view(index_shape) + broadcasted_indices = expanded_indices.expand_as(X) + shuffled_tensor = torch.gather(X, dim, broadcasted_indices) + return shuffled_tensor + else: + raise TypeError("Data must be a torch.Tensor or np.ndarray") + + +def auc_metric(target, pred, multi_class='ovo', numpy=False): + lib = np if numpy else torch + try: + if not numpy: + target = torch.tensor(target) if not torch.is_tensor(target) else target + pred = torch.tensor(pred) if not torch.is_tensor(pred) else pred + if len(lib.unique(target)) > 2: + if not numpy: + return torch.tensor(roc_auc_score(target, pred, multi_class=multi_class)) + return roc_auc_score(target, pred, multi_class=multi_class) + else: + if len(pred.shape) == 2: + pred = pred[:, 1] + if not numpy: + return torch.tensor(roc_auc_score(target, pred)) + return roc_auc_score(target, pred) + except ValueError as e: + print(e) + return np.nan if numpy else torch.tensor(np.nan) + + +def calculate_result(y_test_encoded, y_pred_proba): + y_pred_label = np.argmax(y_pred_proba, axis=1) + if len(np.unique(y_test_encoded)) == 2: + final_auc = roc_auc_score(y_test_encoded, y_pred_proba[:, 1]) + else: + final_auc = roc_auc_score(y_test_encoded, y_pred_proba, multi_class="ovo") + print(f"✅ AUC = {final_auc:.4f}") + + # --- Accuracy --- + acc = accuracy_score(y_test_encoded, y_pred_label) + print(f"✅ Accuracy = {acc:.4f}") + + # --- F1 Score --- + f1 = f1_score(y_test_encoded, y_pred_label, average='macro' if len(np.unique(y_test_encoded)) > 2 else 'binary') + print(f"✅ F1 Score = {f1:.4f}") + + # --- Cross Entropy / LogLoss --- + ce = log_loss(y_test_encoded, y_pred_proba) + print(f"✅ LogLoss (Cross Entropy) = {ce:.4f}") + + # --- ECE (Expected Calibration Error) --- + def compute_ece(y_true, y_prob, n_bins=10): + """Expected Calibration Error (ECE) implementation""" + bin_boundaries = np.linspace(0.0, 1.0, n_bins + 1) + ece = 0.0 + y_true = np.array(y_true) + y_prob = np.array(y_prob) + + if y_prob.ndim == 2 and y_prob.shape[1] > 1: + confidences = np.max(y_prob, axis=1) + predictions = np.argmax(y_prob, axis=1) + else: + confidences = y_prob if y_prob.ndim == 1 else y_prob[:, 1] + predictions = (confidences >= 0.5).astype(int) + + accuracies = (predictions == y_true) + + for i in 
range(n_bins):
+            bin_lower = bin_boundaries[i]
+            bin_upper = bin_boundaries[i + 1]
+            in_bin = (confidences > bin_lower) & (confidences <= bin_upper)
+            prop_in_bin = np.mean(in_bin)
+            if prop_in_bin > 0:
+                acc_in_bin = np.mean(accuracies[in_bin])
+                avg_conf_in_bin = np.mean(confidences[in_bin])
+                ece += np.abs(acc_in_bin - avg_conf_in_bin) * prop_in_bin
+        return ece
+
+    ece = compute_ece(y_test_encoded, y_pred_proba, n_bins=10)
+    print(f"✅ ECE (Expected Calibration Error, 10 bins) = {ece:.4f}")
+
+    return acc, final_auc, f1, ce, ece
+
+
+def init_args():
+    # ``init_args`` was referenced in the __main__ block below but never
+    # defined; this minimal parser is an assumed reconstruction.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--inference_config_path", type=str, required=True)
+    return parser.parse_args()
+
+
+def generate_inference_config(args):
+    retrieval_config = dict(
+        use_retrieval=False,
+        retrieval_before_preprocessing=False,
+        calculate_feature_attention=False,
+        calculate_sample_attention=False,
+        subsample_ratio=1,
+        subsample_type=None,
+        use_type=None,
+    )
+
+    config_list = [
+        dict(RebalanceFeatureDistribution=dict(worker_tags=["quantile"], discrete_flag=False, original_flag=True,
+                                               svd_tag="svd"),
+             CategoricalFeatureEncoder=dict(encoding_strategy="ordinal_strict_feature_shuffled"),
+             FeatureShuffler=dict(mode="shuffle"),
+             retrieval_config=retrieval_config,
+             ),
+        dict(RebalanceFeatureDistribution=dict(worker_tags=["quantile"], discrete_flag=False, original_flag=True,
+                                               svd_tag="svd"),
+             CategoricalFeatureEncoder=dict(encoding_strategy="ordinal_strict_feature_shuffled"),
+             FeatureShuffler=dict(mode="shuffle"),
+             retrieval_config=retrieval_config,
+             ),
+        dict(RebalanceFeatureDistribution=dict(worker_tags=[None], discrete_flag=True, original_flag=False,
+                                               svd_tag=None),
+             CategoricalFeatureEncoder=dict(encoding_strategy="numeric"),
+             FeatureShuffler=dict(mode="shuffle"),
+             retrieval_config=retrieval_config,
+             ),
+        dict(RebalanceFeatureDistribution=dict(worker_tags=[None], discrete_flag=True, original_flag=False,
+                                               svd_tag=None),
+             CategoricalFeatureEncoder=dict(encoding_strategy="numeric"),
+             FeatureShuffler=dict(mode="shuffle"),
+             retrieval_config=retrieval_config)
+    ]
+
+    with open(args.inference_config_path, 'w') as f:
+        json.dump(config_list, f)
+
+
+class NonPaddingDistributedSampler(DistributedSampler):
+    def __init__(self, dataset, num_replicas=None, rank=None, shuffle=False):
+        super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
+        self.num_samples = len(range(rank, len(dataset), num_replicas))
+        self.total_size = len(dataset)
+
+    def __iter__(self):
+        indices = list(range(len(self.dataset)))
+        indices = indices[self.rank:self.total_size:self.num_replicas]
+        return iter(indices)
+
+def swap_rows_back(tensor, indices):
+    """
+    Restore the original row order of ``tensor`` after it was permuted by ``indices``.
+
+    Args:
+        tensor (torch.Tensor): The permuted tensor.
+        indices (list|torch.Tensor): The permutation that produced ``tensor``.
+
+    Returns:
+        torch.Tensor: The tensor with its rows back in their original order.
+    """
+    inverse_indices = [0] * len(indices)
+    for i, idx in enumerate(indices):
+        inverse_indices[idx] = i
+    return tensor[inverse_indices]
+
+if __name__ == "__main__":
+    args = init_args()
+    generate_inference_config(args)
diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/utils/loading.py b/tabrepo/benchmark/models/ag/limix/LimiX/utils/loading.py
new file mode 100644
index 00000000..3ea3ec58
--- /dev/null
+++ b/tabrepo/benchmark/models/ag/limix/LimiX/utils/loading.py
@@ -0,0 +1,31 @@
+import torch
+
+from tabrepo.benchmark.models.ag.limix.LimiX.model.transformer import FeaturesTransformer
+
+
+def load_model(model_path, calculate_sample_attention: bool = False, calculate_feature_attention: bool = False, mask_prediction: bool = False):
+    state_dict = torch.load(model_path, map_location="cpu", weights_only=False)
+    config = state_dict['config']
+    model = FeaturesTransformer(
+        preprocess_config_x=config['preprocess_config_x'],
+        encoder_config_x=config['encoder_config_x'],
+        encoder_config_y=config['encoder_config_y'],
+        decoder_config=config['decoder_config'],
+        nlayers=config['nlayers'],
+        nhead=config['nhead'],
+        embed_dim=config['embed_dim'],
+        hid_dim=config['hid_dim'],
+        mask_prediction=mask_prediction,
+        features_per_group=config['features_per_group'],
+        dropout=config['dropout'],
+        layer_norm_eps=config.get('layer_norm_eps', 1e-5),
+        device=None,
+        dtype=None,
+        recompute_attn=config['recompute_attn'],
+        calculate_sample_attention=calculate_sample_attention,
+        calculate_feature_attention=calculate_feature_attention
+    )
+    model.load_state_dict(state_dict['state_dict'])
+
+    model.eval()
+    return model
\ No newline at end of file
diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/utils/retrieval_utils.py b/tabrepo/benchmark/models/ag/limix/LimiX/utils/retrieval_utils.py
new file mode 100644
index 00000000..cceb97c3
--- /dev/null
+++ b/tabrepo/benchmark/models/ag/limix/LimiX/utils/retrieval_utils.py
@@ -0,0 +1,35 @@
+import numpy as np
+import torch
+from sklearn.preprocessing import LabelEncoder
+
+
+class RelabelRetrievalY:
+    def __init__(self, y_train: torch.Tensor):
+        self.y_train = y_train.cpu().numpy()
+        self.label_encoders = [LabelEncoder() for _ in range(y_train.shape[0])]
+
+    def transform_y(self):
+        for i in range(self.y_train.shape[0]):
+            self.y_train[i] = np.expand_dims(self.label_encoders[i].fit_transform(self.y_train[i].ravel()), axis=1)
+        self.label_y = self.y_train.copy().astype(np.int32)
+        self.y_train = torch.tensor(self.y_train, dtype=torch.float32, device=torch.device('cuda'))
+        return self.y_train
+
+    def inverse_transform_y(self, X: np.ndarray) -> np.ndarray:
+        for i in range(X.shape[0]):
+            batch_label = np.unique(self.label_y[i])
+            reverse_perm = self.label_encoders[i].inverse_transform(batch_label).astype(np.int32)
+            reverse_output = np.full_like(X[i], fill_value=-np.inf)
+            reverse_output[reverse_perm] = X[i, batch_label]
+            X[i] = reverse_output
+        return X
+
+
+if __name__ == '__main__':
+    y_train = torch.tensor([[[7], [7], [8], [5]], [[4], [3], [3], [6]]])
+    output = np.array([[0.2, 2, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
+                       [0.2, 2, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]], dtype=np.float32)
+
+    relabel = RelabelRetrievalY(y_train)
+    # ``transform_y`` returns a single tensor, not a (y, label) pair.
+    y_train = relabel.transform_y()
+    output = relabel.inverse_transform_y(output)
diff --git a/tabrepo/benchmark/models/ag/limix/LimiX/utils/utils.py b/tabrepo/benchmark/models/ag/limix/LimiX/utils/utils.py
new file mode 100644
index 00000000..8698e911
--- /dev/null
+++ b/tabrepo/benchmark/models/ag/limix/LimiX/utils/utils.py
@@ -0,0 +1,30 @@
+import os
+from huggingface_hub import snapshot_download, hf_hub_download
+
+def download_datset(repo_id: str, revision: str, repo_type: str = 'dataset', save_dir: str = "./my_cache"):
+    print(f"Downloading {repo_id} ...")
+    snapshot_download(
+        repo_id=repo_id,
+        revision=revision,
+        repo_type=repo_type,
+        local_dir=save_dir,
+        ignore_patterns=None,
+        force_download=False
+    )
+
+def list_folders_to_csv(path: str, output_csv: str):
+    import csv
+    folders = [f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))]
+    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(['dataset name'])
+        for folder in folders:
+            writer.writerow([folder])
+
+def download_model(repo_id: str, filename: str, save_path: str = '.') -> str:
+    file_path = hf_hub_download(
+        repo_id=repo_id,
+        filename=filename,
+        local_dir=save_path
+    )
+    return file_path
\ No newline at end of file
diff --git a/tabrepo/benchmark/models/ag/limix/__init__.py b/tabrepo/benchmark/models/ag/limix/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tabrepo/benchmark/models/ag/limix/limix_model.py b/tabrepo/benchmark/models/ag/limix/limix_model.py
new file mode 100644
index 00000000..92dc7aaa
--- /dev/null
+++ b/tabrepo/benchmark/models/ag/limix/limix_model.py
@@ -0,0 +1,235 @@
+from __future__ import annotations
+
+import logging
+import os
+import sys
+import warnings
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from autogluon.common.utils.resource_utils import ResourceManager
+from autogluon.core.models import AbstractModel
+from autogluon.features.generators import LabelEncoderFeatureGenerator
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+
+class LimiXModel(AbstractModel):
+    """Ref: https://github.com/limix-ldm/LimiX."""
+
+    ag_key = "LIMIX"
+    ag_name = "LimiX"
+    _DEFAULT_CHECKPOINT_PATH = "LimiX-16M.ckpt"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._feature_generator = None
+        self._cat_features = None
+        self._cat_indices = None
+
+    def _preprocess(self, X: pd.DataFrame, is_train=False, **kwargs) -> pd.DataFrame:
+        X = super()._preprocess(X, **kwargs)
+
+        if is_train:
+            self._cat_indices = []
+
+            # X will be the training data.
+            self._feature_generator = LabelEncoderFeatureGenerator(verbosity=0)
+            self._feature_generator.fit(X=X)
+
+        # This converts categorical features to numeric via stateful label encoding.
+        if self._feature_generator.features_in:
+            X = X.copy()
+            X[self._feature_generator.features_in] = self._feature_generator.transform(
+                X=X
+            )
+
+        if is_train:
+            # Detect/set cat features and indices
+            if self._cat_features is None:
+                self._cat_features = self._feature_generator.features_in[:]
+            self._cat_indices = [
+                X.columns.get_loc(col) for col in self._cat_features
+            ]
+
+        return X
+
+    def _fit(
+        self,
+        X: pd.DataFrame,
+        y: pd.Series,
+        num_cpus: int = 1,
+        num_gpus: int = 0,
+        verbosity: int = 2,
+        **kwargs,
+    ):
+        import torch
+
+        from tabrepo.benchmark.models.ag.limix.LimiX.inference.predictor import (
+            LimiXPredictor,
+        )
+
+        is_classification = self.problem_type in ["binary", "multiclass"]
+        device = "cuda" if num_gpus != 0 else "cpu"
+        if (device == "cuda") and (not torch.cuda.is_available()):
+            # FIXME: warn instead and switch to CPU.
+            raise AssertionError(
+                "Fit specified to use GPU, but CUDA is not available on this machine. 
" + "Please switch to CPU usage instead.", + ) + + X = self.preprocess(X, is_train=True) + + cls_config_default_config = ( + Path(__file__).parent / "LimiX" / "config" / "cls_default_noretrieval.json" + ) + reg_config_default_config = ( + Path(__file__).parent / "LimiX" / "config" / "reg_default_noretrieval.json" + ) + inference_config = ( + cls_config_default_config + if is_classification + else reg_config_default_config + ) + + hps = self._get_model_params() + hps["device"] = device + + self.model = LimiXPredictor( + X_train=X, + y_train=y, + seed=self.random_seed, + model_path=self.download_model(), + categorical_features_indices=self._cat_indices, + inference_config=str(inference_config.resolve()), + task_type="Classification" if is_classification else "Regression", + **hps, + ) + + def _get_default_resources(self) -> tuple[int, int]: + # Use only physical cores for better performance based on benchmarks + num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True) + + num_gpus = min(1, ResourceManager.get_gpu_count_torch(cuda_only=True)) + + return num_cpus, num_gpus + + def get_minimum_resources( + self, is_gpu_available: bool = False + ) -> dict[str, int | float]: + return { + "num_cpus": 1, + "num_gpus": 1 if is_gpu_available else 0, + } + + @staticmethod + def download_model(): + from huggingface_hub import hf_hub_download + + model_dir = _user_cache_dir(platform=sys.platform, appname="limix") + model_dir.mkdir(exist_ok=True, parents=True) + + final_model_path = model_dir / LimiXModel._DEFAULT_CHECKPOINT_PATH + + if not final_model_path.exists(): + model_file = hf_hub_download( + repo_id="stableai-org/LimiX-16M", + filename=LimiXModel._DEFAULT_CHECKPOINT_PATH, + local_dir=str(model_dir), + ) + assert str(final_model_path) == model_file + return str(final_model_path) + + def _set_default_params(self): + default_params = {} + for param, val in default_params.items(): + self._set_default_param_value(param, val) + + def _get_random_seed_from_hyperparameters( + self, hyperparameters: dict + ) -> int | None | str: + return hyperparameters.get("seed", "N/A") + + @classmethod + def supported_problem_types(cls) -> list[str] | None: + return ["binary", "multiclass", "regression"] + + def _get_default_auxiliary_params(self) -> dict: + default_auxiliary_params = super()._get_default_auxiliary_params() + default_auxiliary_params.update( + { + "max_classes": 10, + } + ) + return default_auxiliary_params + + @classmethod + def _get_default_ag_args_ensemble(cls, **kwargs) -> dict: + """Set fold_fitting_strategy to sequential_local, + as parallel folding crashes if model weights aren't pre-downloaded. 
+ """ + default_ag_args_ensemble = super()._get_default_ag_args_ensemble(**kwargs) + extra_ag_args_ensemble = { + "fold_fitting_strategy": "sequential_local", + "refit_folds": True, + } + default_ag_args_ensemble.update(extra_ag_args_ensemble) + return default_ag_args_ensemble + + @classmethod + def _class_tags(cls): + return {"can_estimate_memory_usage_static": False} + + def _more_tags(self) -> dict: + return {"can_refit_full": True} + + +def _user_cache_dir(platform: str, appname: str = "tabpfn") -> Path: + use_instead_path = (Path.cwd() / ".tabpfn_models").resolve() + + # https://docs.python.org/3/library/sys.html#sys.platform + if platform == "win32": + # Honestly, I don't want to do what `platformdirs` does: + # https://github.com/tox-dev/platformdirs/blob/b769439b2a3b70769a93905944a71b3e63ef4823/src/platformdirs/windows.py#L252-L265 + APPDATA_PATH = os.environ.get("APPDATA", "") + if APPDATA_PATH.strip() != "": + return Path(APPDATA_PATH) / appname + + warnings.warn( + "Could not find APPDATA environment variable to get user cache dir," + " but detected platform 'win32'." + f" Defaulting to a path '{use_instead_path}'." + " If you would prefer, please specify a directory when creating" + " the model.", + UserWarning, + stacklevel=2, + ) + return use_instead_path + + if platform == "darwin": + return Path.home() / "Library" / "Caches" / appname + + # TODO: Not entirely sure here, Python doesn't explicitly list + # all of these and defaults to the underlying operating system + # if not sure. + linux_likes = ("freebsd", "linux", "netbsd", "openbsd") + if any(platform.startswith(linux) for linux in linux_likes): + # The reason to use "" as default is that the env var could exist but be empty. + # We catch all this with the `.strip() != ""` below + XDG_CACHE_HOME = os.environ.get("XDG_CACHE_HOME", "") + if XDG_CACHE_HOME.strip() != "": + return Path(XDG_CACHE_HOME) / appname + return Path.home() / ".cache" / appname + + warnings.warn( + f"Unknown platform '{platform}' to get user cache dir." + f" Defaulting to a path at the execution site '{use_instead_path}'." 
+ " If you would prefer, please specify a directory when creating" + " the model.", + UserWarning, + stacklevel=2, + ) + return use_instead_path diff --git a/tabrepo/benchmark/models/model_register.py b/tabrepo/benchmark/models/model_register.py index 9066e788..3a4eeca4 100644 --- a/tabrepo/benchmark/models/model_register.py +++ b/tabrepo/benchmark/models/model_register.py @@ -6,6 +6,7 @@ from tabrepo.benchmark.models.ag import ( ExplainableBoostingMachineModel, + LimiXModel, ModernNCAModel, RealMLPModel, TabDPTModel, @@ -26,6 +27,7 @@ TabDPTModel, TabMModel, ModernNCAModel, + LimiXModel, ] for _model_cls in _models_to_add: @@ -43,7 +45,10 @@ def infer_model_cls(model_cls: str, model_register: ModelRegistry = None): if real_model_cls.ag_name == model_cls: model_cls = real_model_cls break - elif model_cls in [str(real_model_cls.__name__) for real_model_cls in model_register.model_cls_list]: + elif model_cls in [ + str(real_model_cls.__name__) + for real_model_cls in model_register.model_cls_list + ]: for real_model_cls in model_register.model_cls_list: if model_cls == str(real_model_cls.__name__): model_cls = real_model_cls diff --git a/tabrepo/models/limix/__init__.py b/tabrepo/models/limix/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tabrepo/models/limix/generate.py b/tabrepo/models/limix/generate.py new file mode 100644 index 00000000..0a2079be --- /dev/null +++ b/tabrepo/models/limix/generate.py @@ -0,0 +1,6 @@ +from __future__ import annotations + +from tabrepo.benchmark.models.ag.limix.limix_model import LimiXModel +from tabrepo.utils.config_utils import ConfigGenerator + +gen_limix = ConfigGenerator(model_cls=LimiXModel, manual_configs=[{}], search_space={}) diff --git a/tabrepo/models/utils.py b/tabrepo/models/utils.py index 32e17039..7c5c2a0a 100644 --- a/tabrepo/models/utils.py +++ b/tabrepo/models/utils.py @@ -47,6 +47,7 @@ def get_configs_generator_from_name(model_name: str): "TabPFNv2": lambda: importlib.import_module("tabrepo.models.tabpfnv2.generate").gen_tabpfnv2, "XGBoost": lambda: importlib.import_module("tabrepo.models.xgboost.generate").gen_xgboost, "Mitra": lambda: importlib.import_module("tabrepo.models.mitra.generate").gen_mitra, + "LimiX": lambda: importlib.import_module("tabrepo.models.limix.generate").gen_limix, } if model_name not in name_to_import_map: diff --git a/tst/benchmark/models/test_limix.py b/tst/benchmark/models/test_limix.py new file mode 100644 index 00000000..b96726ab --- /dev/null +++ b/tst/benchmark/models/test_limix.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +import pytest + + +def test_limix(): + try: + from autogluon.tabular.testing import FitHelper + from tabrepo.benchmark.models.ag.limix.limix_model import LimiXModel + + FitHelper.verify_model(model_cls=LimiXModel, model_hyperparameters={}) + except ImportError as err: + pytest.skip( + f"Import Error, skipping test... " + f"Ensure you have the proper dependencies installed to run this test:\n" + f"{err}" + )