diff --git a/README.md b/README.md
index f4959807f..5e4cb9f8c 100644
--- a/README.md
+++ b/README.md
@@ -18,11 +18,14 @@
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit)
 [![Checked with mypy](https://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
 [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
+[![Chat](https://img.shields.io/badge/chat-discord-blue)](https://discord.gg/ybQ97B6Jjy)
 [![Readthedocs Preview](https://github.com/microsoft/RD-Agent/actions/workflows/readthedocs-preview.yml/badge.svg)](https://github.com/microsoft/RD-Agent/actions/workflows/readthedocs-preview.yml)
 
 # 📰 News
 | 🗞️ News | 📝 Description |
 | -- | ------ |
+| Official WeChat group release | We created a WeChat group; welcome to join! (🗪[QR Code](docs/WeChat_QR_code.jpg)) |
+| Official Discord release | We have launched our official Discord channel (🗪[![Chat](https://img.shields.io/badge/chat-discord-blue)](https://discord.gg/ybQ97B6Jjy)) |
 | First release | **RDAgent** is released on GitHub |
 
diff --git a/docs/WeChat_QR_code.jpg b/docs/WeChat_QR_code.jpg
new file mode 100644
index 000000000..2651ceefa
Binary files /dev/null and b/docs/WeChat_QR_code.jpg differ
diff --git a/rdagent/log/ui/app.py b/rdagent/log/ui/app.py
index e82c5d9d9..8020e30ca 100644
--- a/rdagent/log/ui/app.py
+++ b/rdagent/log/ui/app.py
@@ -443,9 +443,10 @@ def tasks_window(tasks: list[FactorTask | ModelTask]):
             st.latex(ft.factor_formulation)
 
             mks = "| Variable | Description |\n| --- | --- |\n"
-            for v, d in ft.variables.items():
-                mks += f"| ${v}$ | {d} |\n"
-            st.markdown(mks)
+            if isinstance(ft.variables, dict):
+                for v, d in ft.variables.items():
+                    mks += f"| ${v}$ | {d} |\n"
+                st.markdown(mks)
 
     elif isinstance(tasks[0], ModelTask):
         st.markdown("**Model Tasks🚩**")
diff --git a/rdagent/scenarios/kaggle/developer/feedback.py b/rdagent/scenarios/kaggle/developer/feedback.py
index 16ea5a3b9..a36ab7427 100644
--- a/rdagent/scenarios/kaggle/developer/feedback.py
+++ b/rdagent/scenarios/kaggle/developer/feedback.py
@@ -163,7 +163,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
             self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
             self.scen.vector_base.save()
         elif self.scen.if_using_graph_rag:
-            self.scen.trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)
+            trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)
 
         return HypothesisFeedback(
             observations=observations,
diff --git a/rdagent/scenarios/kaggle/developer/runner.py b/rdagent/scenarios/kaggle/developer/runner.py
index a6546efe3..95e2a9f45 100644
--- a/rdagent/scenarios/kaggle/developer/runner.py
+++ b/rdagent/scenarios/kaggle/developer/runner.py
@@ -32,6 +32,48 @@ def get_cache_key(self, exp: ASpecificExp) -> str:
         codes = "\n".join(codes)
         return md5_hash(codes)
 
+    def extract_model_task_from_code(self, code: str) -> str:
+        sys_prompt = (
+            Environment(undefined=StrictUndefined)
+            .from_string(prompt_dict["extract_model_task_from_code"]["system"])
+            .render()
+        )
+
+        user_prompt = (
+            Environment(undefined=StrictUndefined)
+            .from_string(prompt_dict["extract_model_task_from_code"]["user"])
+            .render(file_content=code)
+        )
+
+        model_task_description = APIBackend().build_messages_and_create_chat_completion(
+            user_prompt=user_prompt,
+            system_prompt=sys_prompt,
+            json_mode=True,
+        )
+
+        try:
+            response_json_analysis = json.loads(model_task_description)
+            task_desc = f"""name: {response_json_analysis['name']}
+        description: {response_json_analysis['description']}
+        """
+            task_desc += (
+                f"formulation: {response_json_analysis['formulation']}\n"
+                if response_json_analysis.get("formulation")
+                else ""
+            )
+            task_desc += f"architecture: {response_json_analysis['architecture']}\n"
+            task_desc += (
+                f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
+                if response_json_analysis.get("variables")
+                else ""
+            )
+            task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
+            task_desc += f"model_type: {response_json_analysis['model_type']}\n"
+        except json.JSONDecodeError:
+            task_desc = "Failed to parse LLM's response as JSON"
+
+        return task_desc
+
     def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorExperiment | KGModelExperiment:
         """
         For the initial development, the experiment serves as a benchmark for feature engineering.
@@ -59,21 +101,27 @@ def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorE
             feature_shape = org_data.shape[-1]
             exp.experiment_workspace.data_description.append((sub_task.get_task_information(), feature_shape))
 
-        sub_model_1_description = (
-            self.extract_model_task_from_code(
-                (exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()
-            )
-            + f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()}"""
-        )
-        sub_model_2_description = (
-            self.extract_model_task_from_code(
-                (exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()
-            )
-            + f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()}"""
-        )
+        model_map = {
+            "XGBoost": "model_xgboost.py",
+            "RandomForest": "model_randomforest.py",
+            "LightGBM": "model_lightgbm.py",
+            "NN": "model_nn.py",
+        }
+
+        workspace_path = exp.experiment_workspace.workspace_path / "model"
+
+        for model_name, model_file in model_map.items():
+            model_file_path = workspace_path / model_file
 
-        exp.experiment_workspace.model_description["XGBoost"] = sub_model_1_description
-        exp.experiment_workspace.model_description["RandomForest"] = sub_model_2_description
+            if model_file_path.exists():
+                model_description = (
+                    self.extract_model_task_from_code(model_file_path.read_text())
+                    + f"""code: {model_file_path.read_text()}"""
+                )
+            else:
+                model_description = ""
+
+            exp.experiment_workspace.model_description[model_name] = model_description
 
         if RUNNER_SETTINGS.cache_result:
             self.dump_cache_result(exp, result)
@@ -120,51 +168,7 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
 
 
 class KGFactorRunner(KGCachedRunner[KGFactorExperiment]):
-    def extract_model_task_from_code(self, code: str) -> str:
-        sys_prompt = (
-            Environment(undefined=StrictUndefined)
-            .from_string(prompt_dict["extract_model_task_from_code"]["system"])
-            .render()
-        )
-
-        user_prompt = (
-            Environment(undefined=StrictUndefined)
-            .from_string(prompt_dict["extract_model_task_from_code"]["user"])
-            .render(file_content=code)
-        )
-
-        model_task_description = APIBackend().build_messages_and_create_chat_completion(
-            user_prompt=user_prompt,
-            system_prompt=sys_prompt,
-            json_mode=True,
-        )
-
-        try:
-            response_json_analysis = json.loads(model_task_description)
-            task_desc = f"""name: {response_json_analysis['name']}
-        description: {response_json_analysis['description']}
-        """
-            task_desc += (
-                f"formulation: {response_json_analysis['formulation']}\n"
-                if response_json_analysis.get("formulation")
-                else ""
-            )
-            task_desc += f"architecture: {response_json_analysis['architecture']}\n"
-            task_desc += (
-                f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
-                if response_json_analysis.get("variables")
-                else ""
-            )
-            task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
-            task_desc += f"model_type: {response_json_analysis['model_type']}\n"
-        except json.JSONDecodeError:
-            task_desc = "Failed to parse LLM's response as JSON"
-
-        return task_desc
-
     def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
-        if exp.based_experiments and exp.based_experiments[-1].result is None:
-            exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
         current_feature_file_count = len(list(exp.experiment_workspace.workspace_path.glob("feature/feature*.py")))
         implemented_factor_count = 0
         for sub_ws in exp.sub_workspace_list:
@@ -179,6 +183,10 @@ def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
         if implemented_factor_count == 0:
             raise FactorEmptyError("No factor is implemented")
 
+        # Ensure the baseline template experiment has a result before building on it
+        if exp.based_experiments and exp.based_experiments[-1].result is None:
+            exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
+
        if RUNNER_SETTINGS.cache_result:
            cache_hit, result = self.get_cache_result(exp)
            if cache_hit:
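The extract_model_task_from_code helper above renders Jinja2 prompts, requests a JSON reply (json_mode=True), and degrades gracefully when the reply is not valid JSON or is missing optional keys. The same tolerant-parsing pattern recurs in proposal.py later in this patch; a minimal self-contained sketch of the idea (helper name and keys are illustrative, not part of the codebase):

    import json


    def parse_llm_json(raw: str, required: list[str]) -> dict:
        """Parse an LLM reply, substituting placeholders for anything missing."""
        try:
            payload = json.loads(raw)
        except json.JSONDecodeError:
            payload = {}
        return {key: payload.get(key, f"{key} not provided") for key in required}


    # parse_llm_json('{"name": "GRU"}', ["name", "architecture"])
    # -> {'name': 'GRU', 'architecture': 'architecture not provided'}
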
+ train_df["place_id"] = place_id_encoder.transform(train_df["place_id"]) + + # Split features and target for training data + X = train_df.drop(["place_id"], axis=1) + y = train_df["place_id"] + + # Prepare test data + test_row_ids = test_df["row_id"] + X_test = test_df.drop(["row_id"], axis=1) + + # Ensure X_test has the same columns as X + for col in X.columns: + if col not in X_test.columns: + X_test[col] = 0 # or some other appropriate default value + + X_test = X_test[X.columns] # Reorder columns to match X + + # Attempt stratified split, fall back to random split if necessary + try: + X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) + except ValueError: + print("Warning: Stratified split not possible. Falling back to random split.") + X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42) + + # Handle missing values + imputer = SimpleImputer(strategy="mean") + X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns) + X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns) + X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns) + + # Count the number of unique classes + n_classes = len(place_id_encoder.classes_) + + return X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids, n_classes diff --git a/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/feature/feature.py new file mode 100644 index 000000000..8ae043acf --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/feature/feature.py @@ -0,0 +1,23 @@ +import pandas as pd + +""" +Here is the feature engineering code for each task, with a class that has a fit and transform method. +Remember +""" + + +class IdentityFeature: + def fit(self, train_df: pd.DataFrame): + """ + Fit the feature engineering model to the training data. + """ + pass + + def transform(self, X: pd.DataFrame): + """ + Transform the input data. + """ + return X + + +feature_engineering_cls = IdentityFeature diff --git a/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_randomforest.py new file mode 100644 index 000000000..106674556 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_randomforest.py @@ -0,0 +1,33 @@ +""" +Motivation of the model: +The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality. +It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good +baseline model for many classification tasks. +""" + +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import accuracy_score + + +def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series): + """ + Define and train the Random Forest model. Merge feature selection into the pipeline. + """ + # Initialize the Random Forest model + model = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1) + + # Fit the model + model.fit(X_train, y_train) + + return model + + +def predict(model, X): + """ + Keep feature selection's consistency and make predictions. 
+ """ + # Predict using the trained model + y_pred = model.predict(X) + + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_xgboost.py new file mode 100644 index 000000000..804d15231 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_xgboost.py @@ -0,0 +1,51 @@ +import numpy as np +import pandas as pd +import xgboost as xgb +from sklearn.preprocessing import LabelEncoder + + +def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series): + """Define and train the model. Merge feature_select""" + + # Combine train and valid labels to get all unique labels + all_labels = np.unique(np.concatenate([y_train, y_valid])) + le = LabelEncoder().fit(all_labels) + + # Encode labels + y_train_encoded = le.transform(y_train) + y_valid_encoded = le.transform(y_valid) + + dtrain = xgb.DMatrix(X_train, label=y_train_encoded) + dvalid = xgb.DMatrix(X_valid, label=y_valid_encoded) + num_classes = len(le.classes_) + + params = { + "objective": "multi:softprob", + "num_class": num_classes, + "max_depth": 6, + "eta": 0.3, + "subsample": 0.8, + "colsample_bytree": 0.8, + "min_child_weight": 1, + "nthread": -1, + } + num_round = 100 + + evallist = [(dtrain, "train"), (dvalid, "eval")] + bst = xgb.train(params, dtrain, num_round, evallist, early_stopping_rounds=10) + + # Store the LabelEncoder in the model for later use in prediction + bst.le = le + + return bst + + +def predict(model, X): + """ + Keep feature select's consistency. + """ + dtest = xgb.DMatrix(X) + y_pred_prob = model.predict(dtest) + # Convert probabilities back to original labels if needed + # y_pred_labels = model.le.inverse_transform(y_pred_prob.argmax(axis=1)) + return y_pred_prob diff --git a/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/select_randomforest.py b/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/select_randomforest.py new file mode 100644 index 000000000..d2a15deee --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/select_randomforest.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(col)).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/select_xgboost.py b/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/select_xgboost.py new file mode 100644 index 000000000..d2a15deee --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/select_xgboost.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
diff --git a/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/train.py b/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/train.py
new file mode 100644
index 000000000..b7747a49c
--- /dev/null
+++ b/rdagent/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/train.py
@@ -0,0 +1,103 @@
+import importlib.util
+import random
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from fea_share_preprocess import preprocess_script
+from sklearn.metrics import log_loss
+
+# Set random seed for reproducibility
+SEED = 42
+random.seed(SEED)
+np.random.seed(SEED)
+DIRNAME = Path(__file__).absolute().resolve().parent
+
+
+def compute_map3(y_true, y_pred):
+    """Compute Mean Average Precision @ 3: mean of 1/rank of the true class within the top-3 predictions."""
+    top3 = np.argsort(-y_pred, axis=1)[:, :3]
+    hits = top3 == np.asarray(y_true).reshape(-1, 1)
+    return float((hits / np.arange(1, 4)).sum(axis=1).mean())
+
+
+def import_module_from_path(module_name, module_path):
+    spec = importlib.util.spec_from_file_location(module_name, module_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+# 1) Preprocess the data
+X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids, n_classes = preprocess_script()
+
+# 2) Auto feature engineering
+X_train_l, X_valid_l = [], []
+X_test_l = []
+
+for f in DIRNAME.glob("feature/feat*.py"):
+    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
+    cls.fit(X_train)
+    X_train_f = cls.transform(X_train)
+    X_valid_f = cls.transform(X_valid)
+    X_test_f = cls.transform(X_test)
+
+    if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
+        X_train_l.append(X_train_f)
+        X_valid_l.append(X_valid_f)
+        X_test_l.append(X_test_f)
+
+X_train = pd.concat(X_train_l, axis=1)
+X_valid = pd.concat(X_valid_l, axis=1)
+X_test = pd.concat(X_test_l, axis=1)
+
+# 3) Train the model
+model_l = []  # list[tuple[model, predict_func, select_func, validation_score]]
+for f in DIRNAME.glob("model/model*.py"):
+    select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
+    select_m = import_module_from_path(select_python_path.stem, select_python_path)
+    X_train_selected = select_m.select(X_train.copy())
+    X_valid_selected = select_m.select(X_valid.copy())
+
+    m = import_module_from_path(f.stem, f)
+    # Check if the fit function accepts n_classes
+    if "n_classes" in m.fit.__code__.co_varnames:
+        model = m.fit(X_train_selected, y_train, X_valid_selected, y_valid, n_classes)
+    else:
+        model = m.fit(X_train_selected, y_train, X_valid_selected, y_valid)
+
+    # Evaluate the model on the validation set
+    y_valid_pred = m.predict(model, X_valid_selected)
+    validation_score = log_loss(y_valid, y_valid_pred, labels=np.arange(n_classes))
+
+    model_l.append((model, m.predict, select_m.select, validation_score))
+
+# Sort models by validation score (lower is better for log loss)
+model_l.sort(key=lambda x: x[3])
+
+# 4) Use the best model for predictions
+best_model, best_predict_func, best_select_func, _ = model_l[0]
+
+# 5) Make predictions on the validation set using the best model
+y_valid_pred = best_predict_func(best_model, best_select_func(X_valid.copy()))
+
+# Compute metrics
+map3 = compute_map3(y_valid, y_valid_pred)
+print(f"MAP@3 on validation set: {map3}")
+
+# 6) Save the validation metrics
+pd.Series(data=[map3], index=["MAP@3"]).to_csv("submission_score.csv")
+
+# 7) Make predictions on the test set using the best model
+y_test_pred = best_predict_func(best_model, best_select_func(X_test.copy()))
+
+# Get top-3 predictions for each test sample
+top_3_indices = np.argsort(-y_test_pred, axis=1)[:, :3]
+top_3_place_ids = place_id_encoder.classes_[top_3_indices]
+
+# Create submission DataFrame
+submission_result = pd.DataFrame(
+    {"row_id": test_row_ids, "place_id": [" ".join(map(str, ids)) for ids in top_3_place_ids]}
+)
+
+submission_result.to_csv("submission.csv", index=False)
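The training loop above checks m.fit.__code__.co_varnames to decide whether a model module's fit accepts n_classes. co_varnames also lists a function's local variables, so a local named n_classes would be a false positive; inspect.signature is a more precise alternative (helper name is illustrative, a sketch rather than part of this patch):

    import inspect


    def accepts_param(fn, name: str) -> bool:
        """True only if `fn` declares a parameter with this exact name."""
        return name in inspect.signature(fn).parameters


    # Usage inside the loop:
    # model = m.fit(..., n_classes) if accepts_param(m.fit, "n_classes") else m.fit(...)
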
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
index 6222a2367..49f9d3bf3 100644
--- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
@@ -36,12 +36,8 @@ def data_cleaner(text):
 
     y_train = train[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]]
 
-    vectorizer = TfidfVectorizer()
-    X_train = vectorizer.fit_transform(train["full_text"])
-    X_test = vectorizer.transform(test["full_text"])
-
-    X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
-    X_test = pd.DataFrame.sparse.from_spmatrix(X_test)
+    X_train = train[["full_text"]]
+    X_test = test[["full_text"]]
 
     X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
 
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py
index 8ae043acf..43d226087 100644
--- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py
@@ -1,4 +1,5 @@
 import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
 
 """
 Here is the feature engineering code for each task, with a class that has a fit and transform method.
@@ -11,12 +12,15 @@ def fit(self, train_df: pd.DataFrame):
         """
         Fit the feature engineering model to the training data.
         """
-        pass
+        self.vectorizer = TfidfVectorizer()
+        self.vectorizer.fit(train_df["full_text"])
 
     def transform(self, X: pd.DataFrame):
         """
         Transform the input data.
         """
+        X = self.vectorizer.transform(X["full_text"])
+        X = pd.DataFrame.sparse.from_spmatrix(X)
         return X
 
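Moving the TfidfVectorizer out of the shared preprocessing and into the feature class keeps the raw full_text column available to other feature modules and guarantees the vocabulary is fit on the training split only. A usage sketch under the template's interface (toy data; the import path is an assumption about the workspace layout):

    import pandas as pd

    from feature.feature import feature_engineering_cls  # hypothetical import path

    fe = feature_engineering_cls()
    fe.fit(pd.DataFrame({"full_text": ["a cat sat", "a dog ran"]}))  # fit on train only
    X_valid = fe.transform(pd.DataFrame({"full_text": ["a cat ran"]}))
    print(X_valid.shape)  # (1, vocabulary_size) sparse TF-IDF frame
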
""" + X = self.vectorizer.transform(X["full_text"]) + X = pd.DataFrame.sparse.from_spmatrix(X) return X diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py index 9c032ee26..3a433595e 100644 --- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py +++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_xgboost.py @@ -22,7 +22,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v X_train = select(X_train) xgb_estimator = xgb.XGBRegressor( - n_estimators=500, random_state=0, objective="reg:squarederror", tree_method="gpu_hist", device="cuda" + n_estimators=500, random_state=0, objective="reg:squarederror", tree_method="hist", device="cuda" ) model = MultiOutputRegressor(xgb_estimator, n_jobs=-1) diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py index cd0e88bfb..0b3365960 100644 --- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py +++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py @@ -15,6 +15,10 @@ def import_module_from_path(module_name, module_path): return module +def MCRMSE(y_true, y_pred): + return np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))) + + # 1) Preprocess the data X_train, X_valid, y_train, y_valid, X_test = preprocess_script() @@ -24,6 +28,7 @@ def import_module_from_path(module_name, module_path): for f in DIRNAME.glob("feature/feat*.py"): cls = import_module_from_path(f.stem, f).feature_engineering_cls() + print(X_train.head()) cls.fit(X_train) X_train_f = cls.transform(X_train) X_valid_f = cls.transform(X_valid) @@ -62,33 +67,18 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame: # 4) Evaluate the model on the validation set y_valid_pred_l = [] +metrics_all = [] for model, predict_func in model_l: y_valid_pred = predict_func(model, X_valid) y_valid_pred_l.append(y_valid_pred) - # print(y_valid_pred) - # print(y_valid_pred.shape) - -# 5) Ensemble -# Majority vote ensemble -y_valid_pred_ensemble = np.mean(y_valid_pred_l, axis=0) - + metrics = MCRMSE(y_valid, y_valid_pred) + print(f"MCRMSE on valid set: {metrics}") + metrics_all.append(metrics) -# 6) Save the validation metrics -def MCRMSE(y_true, y_pred): - return np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))) - - -metrics = MCRMSE(y_valid, y_valid_pred_ensemble) -print(f"MCRMSE on valid set: {metrics}") -pd.Series(data=[metrics], index=["MCRMSE"]).to_csv("submission_score.csv") - -# 7) Make predictions on the test set and save them -y_test_pred_l = [] -for model, predict_func in model_l: - y_test_pred_l.append(predict_func(model, X_test)) +min_index = np.argmin(metrics_all) +pd.Series(data=[metrics_all[min_index]], index=["MCRMSE"]).to_csv("submission_score.csv") -# For multiclass classification, use the mode of the predictions -y_test_pred = np.mean(y_test_pred_l, axis=0) +y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test) submission_result = pd.read_csv("/kaggle/input/sample_submission.csv") diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py 
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py
index cd0e88bfb..0b3365960 100644
--- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py
@@ -15,6 +15,10 @@ def import_module_from_path(module_name, module_path):
     return module
 
 
+def MCRMSE(y_true, y_pred):
+    return np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)))
+
+
 # 1) Preprocess the data
 X_train, X_valid, y_train, y_valid, X_test = preprocess_script()
 
@@ -62,33 +66,18 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
 
 # 4) Evaluate the model on the validation set
 y_valid_pred_l = []
+metrics_all = []
 for model, predict_func in model_l:
     y_valid_pred = predict_func(model, X_valid)
     y_valid_pred_l.append(y_valid_pred)
-    # print(y_valid_pred)
-    # print(y_valid_pred.shape)
-
-# 5) Ensemble
-# Majority vote ensemble
-y_valid_pred_ensemble = np.mean(y_valid_pred_l, axis=0)
-
+    metrics = MCRMSE(y_valid, y_valid_pred)
+    print(f"MCRMSE on valid set: {metrics}")
+    metrics_all.append(metrics)
 
-# 6) Save the validation metrics
-def MCRMSE(y_true, y_pred):
-    return np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)))
-
-
-metrics = MCRMSE(y_valid, y_valid_pred_ensemble)
-print(f"MCRMSE on valid set: {metrics}")
-pd.Series(data=[metrics], index=["MCRMSE"]).to_csv("submission_score.csv")
-
-# 7) Make predictions on the test set and save them
-y_test_pred_l = []
-for model, predict_func in model_l:
-    y_test_pred_l.append(predict_func(model, X_test))
+min_index = np.argmin(metrics_all)
+pd.Series(data=[metrics_all[min_index]], index=["MCRMSE"]).to_csv("submission_score.csv")
 
-# For multiclass classification, use the mode of the predictions
-y_test_pred = np.mean(y_test_pred_l, axis=0)
+y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test)
 
 submission_result = pd.read_csv("/kaggle/input/sample_submission.csv")
 
diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py
index bd4cee939..b6e9841d4 100644
--- a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py
+++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py
@@ -93,6 +93,20 @@ def import_module_from_path(module_name, module_path):
 X_te = X_te.loc[:, ~X_te.columns.duplicated()]
 
 # Train the model
+def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Flatten the columns of a DataFrame with MultiIndex columns,
+    e.g. (feature_0, a), (feature_0, b) -> feature_0_a, feature_0_b
+    """
+    if df.columns.nlevels == 1:
+        return df
+    df.columns = ["_".join(map(str, col)).strip() for col in df.columns.values]
+    return df
+
+
+X_tr = flatten_columns(X_tr)
+X_val = flatten_columns(X_val)
+X_te = flatten_columns(X_te)
+
 model_l = []  # list[tuple[model, predict_func]]
 for f in DIRNAME.glob("model/model*.py"):
     m = import_module_from_path(f.stem, f)
diff --git a/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/fea_share_preprocess.py
index 2368dff5c..f3069f7d1 100644
--- a/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/fea_share_preprocess.py
+++ b/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/fea_share_preprocess.py
@@ -11,24 +11,22 @@
 
 def prepreprocess():
     # Load the training data
-    train_df = pd.read_csv("/kaggle/input/train.csv").head(1000)
+    train_df = pd.read_csv("/kaggle/input/train.csv")
 
     # Load book and trade data
-    book_train = pd.read_parquet("/kaggle/input/book_train.parquet").head(1000)
-    trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet").head(1000)
+    book_train = pd.read_parquet("/kaggle/input/book_train.parquet")
+    trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet")
 
     # Merge book and trade data with train_df
     merged_df = pd.merge(train_df, book_train, on=["stock_id", "time_id"], how="left")
     merged_df = pd.merge(merged_df, trade_train, on=["stock_id", "time_id"], how="left")
 
-    print(merged_df.head())
-
     # Split the data
     X = merged_df.drop(["target"], axis=1)
     y = merged_df["target"]
 
     X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
 
     return X_train, X_valid, y_train, y_valid
 
@@ -60,7 +58,6 @@ def preprocess_fit(X_train: pd.DataFrame):
 def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols, categorical_cols):
     X_transformed = preprocessor.transform(X)
 
-    # Convert arrays back to DataFrames
     X_transformed = pd.DataFrame(X_transformed, columns=numerical_cols + categorical_cols, index=X.index)
 
     return X_transformed
@@ -79,11 +76,6 @@ def preprocess_script():
 
     X_train, X_valid, y_train, y_valid = prepreprocess()
 
-    preprocessor, numerical_cols, categorical_cols = preprocess_fit(X_train)
-
-    X_train = preprocess_transform(X_train, preprocessor, numerical_cols, categorical_cols)
-    X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols, categorical_cols)
-
     submission_df = pd.read_csv("/kaggle/input/test.csv")
     ids = submission_df["row_id"]
 
@@ -94,10 +86,8 @@ def preprocess_script():
         if col not in submission_df.columns:
             submission_df[col] = 0  # Fill with 0 or another appropriate value
 
-    X_test = preprocess_transform(submission_df, preprocessor, numerical_cols, categorical_cols)
-
     # Handle missing values
-    for df in [X_train, X_valid, X_test]:
+    for df in [X_train, X_valid, submission_df]:
         df.fillna(df.mean(), inplace=True)
 
-    return X_train, X_valid, y_train, y_valid, X_test, ids
+    return X_train, X_valid, y_train, y_valid, submission_df, ids
diff --git a/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/model_xgboost.py
index b5b29ce4b..321c2596d 100644
--- a/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/model_xgboost.py
+++ b/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/model_xgboost.py
@@ -18,7 +18,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
     params = {
         "objective": "reg:squarederror",  # Use squared error for regression
         "nthread": -1,
-        "tree_method": "gpu_hist",
+        "tree_method": "hist",
         "device": "cuda",
     }
     num_round = 200
diff --git a/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/train.py b/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/train.py
index 41a7ae74d..c5903aef2 100644
--- a/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/train.py
+++ b/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/train.py
@@ -5,9 +5,7 @@
 import numpy as np
 import pandas as pd
 from fea_share_preprocess import preprocess_script
-from sklearn.metrics import mean_squared_error
-from sklearn.model_selection import TimeSeriesSplit
-from sklearn.preprocessing import LabelEncoder
+from sklearn.impute import SimpleImputer
 
 # Set random seed for reproducibility
 SEED = 42
@@ -16,11 +14,10 @@
 DIRNAME = Path(__file__).absolute().resolve().parent
 
 
-def compute_rmse(y_true, y_pred):
-    """Compute RMSE for regression."""
-    mse = mean_squared_error(y_true, y_pred)
-    rmse = np.sqrt(mse)
-    return rmse
+def compute_rmspe(y_true, y_pred):
+    """Compute Root Mean Squared Percentage Error (RMSPE) for regression."""
+    rmspe = np.sqrt(np.mean(((y_true - y_pred) / y_true) ** 2))
+    return rmspe
 
 
 def import_module_from_path(module_name, module_path):
@@ -30,10 +27,9 @@ def import_module_from_path(module_name, module_path):
     return module
 
 
-print("begin preprocess")
 # 1) Preprocess the data
 X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
-print("preprocess done")
+
 
 # 2) Auto feature engineering
 X_train_l, X_valid_l = [], []
@@ -61,8 +57,6 @@ def import_module_from_path(module_name, module_path):
 X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
 X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
 
-from sklearn.impute import SimpleImputer
-
 imputer = SimpleImputer(strategy="mean")
 
 X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
@@ -98,26 +92,20 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
 
 # 4) Evaluate the model on the validation set
 y_valid_pred_l = []
+metrics_all = []
+
 for model, predict_func in model_l:
     y_valid_pred_l.append(predict_func(model, X_valid))
-    print(predict_func(model, X_valid).shape)
-
-# 5) Ensemble
-y_valid_pred = np.mean(y_valid_pred_l, axis=0)
-
-rmse = compute_rmse(y_valid, y_valid_pred)
-print("Final RMSE on validation set: ", rmse)
+    metrics = compute_rmspe(y_valid, y_valid_pred_l[-1].ravel())
+    print(f"RMSPE on valid set: {metrics}")
+    metrics_all.append(metrics)
 
-# 6) Save the validation RMSE
-pd.Series(data=[rmse], index=["RMSE"]).to_csv("submission_score.csv")
+min_index = np.argmin(metrics_all)
 
-# 7) Make predictions on the test set and save them
-y_test_pred_l = []
-for m, m_pred in model_l:
-    y_test_pred_l.append(m_pred(m, X_test))
+pd.Series(data=[metrics_all[min_index]], index=["RMSPE"]).to_csv("submission_score.csv")
 
-y_test_pred = np.mean(y_test_pred_l, axis=0).ravel()
+y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test).ravel()
 
-# 8) Submit predictions for the test set
-submission_result = pd.DataFrame({"id": ids, "price": y_test_pred})
+# 5) Submit predictions for the test set
+submission_result = pd.DataFrame({"row_id": ids, "target": y_test_pred})
 
 submission_result.to_csv("submission.csv", index=False)
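compute_rmspe divides by y_true, which is fine for this competition (realized-volatility targets are strictly positive) but undefined when a target is zero. If the helper is ever reused on data where zeros can occur, a guarded variant is straightforward (a sketch, assuming that masking zero targets is acceptable):

    import numpy as np


    def compute_rmspe_safe(y_true, y_pred) -> float:
        """RMSPE computed over samples with nonzero targets only."""
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        mask = y_true != 0
        return float(np.sqrt(np.mean(((y_true[mask] - y_pred[mask]) / y_true[mask]) ** 2)))
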
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/train.py
index 733bce85e..f82e1f8e5 100644
--- a/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/train.py
+++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/train.py
@@ -62,31 +62,18 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
 
 # 4) Evaluate the model on the validation set
 y_valid_pred_l = []
+metrics_all = []
 for model, predict_func in model_l:
     y_valid_pred = predict_func(model, X_valid)
     y_valid_pred_l.append(y_valid_pred)
-    # print(y_valid_pred)
-    # print(y_valid_pred.shape)
+    metrics = mean_squared_error(y_valid, y_valid_pred, squared=False)
+    print(f"RMSLE on valid set: {metrics}")
+    metrics_all.append(metrics)
 
-# 5) Ensemble
-# Majority vote ensemble
-y_valid_pred_ensemble = np.mean(y_valid_pred_l, axis=0)
-
-
-# 6) Save the validation metrics
-metrics = mean_squared_error(y_valid, y_valid_pred_ensemble, squared=False)
-print(f"RMLSE on valid set: {metrics}")
-pd.Series(data=[metrics], index=["RMLSE"]).to_csv("submission_score.csv")
-
-# 7) Make predictions on the test set and save them
-y_test_pred_l = []
-for model, predict_func in model_l:
-    y_test_pred_l.append(predict_func(model, X_test))
-
-
-# For multiclass classification, use the mode of the predictions
-y_test_pred = np.mean(y_test_pred_l, axis=0).ravel()
+min_index = np.argmin(metrics_all)
+pd.Series(data=[metrics_all[min_index]], index=["RMSLE"]).to_csv("submission_score.csv")
+y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test)
 
 submission_result = pd.DataFrame(np.expm1(y_test_pred), columns=["cost"])
 submission_result.insert(0, "id", ids)
diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
index 4efb009ae..eddf251fe 100644
--- a/rdagent/scenarios/kaggle/experiment/scenario.py
+++ b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -135,6 +135,7 @@ def source_data(self) -> str:
         X_valid = pd.read_pickle(data_folder / "X_valid.pkl")
         # TODO: Hardcoded for now, need to be fixed
         if self.competition == "feedback-prize-english-language-learning":
+            self.input_shape = X_valid.shape
             return "This is a sparse matrix of descriptive text."
 
         buffer = io.StringIO()
         X_valid.info(verbose=True, buf=buffer, show_counts=True)
diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py
index fedb954cf..4b6a1f806 100644
--- a/rdagent/scenarios/kaggle/proposal/proposal.py
+++ b/rdagent/scenarios/kaggle/proposal/proposal.py
@@ -252,14 +252,15 @@ def convert_response(self, response: str) -> ModelHypothesis:
         response_dict = json.loads(response)
 
         hypothesis = KGHypothesis(
-            hypothesis=response_dict["hypothesis"],
-            reason=response_dict["reason"],
-            concise_reason=response_dict["concise_reason"],
-            concise_observation=response_dict["concise_observation"],
-            concise_justification=response_dict["concise_justification"],
-            concise_knowledge=response_dict["concise_knowledge"],
-            action=response_dict["action"],
+            hypothesis=response_dict.get("hypothesis", "Hypothesis not provided"),
+            reason=response_dict.get("reason", "Reason not provided"),
+            concise_reason=response_dict.get("concise_reason", "Concise reason not provided"),
+            concise_observation=response_dict.get("concise_observation", "Concise observation not provided"),
+            concise_justification=response_dict.get("concise_justification", "Concise justification not provided"),
+            concise_knowledge=response_dict.get("concise_knowledge", "Concise knowledge not provided"),
+            action=response_dict.get("action", "Action not provided"),
         )
+
         return hypothesis
 
 
@@ -304,9 +305,9 @@ def convert_feature_experiment(self, response: str, trace: Trace) -> KGFactorExp
 
         tasks = []
         for factor_name in response_dict:
-            description = response_dict[factor_name]["description"]
-            formulation = response_dict[factor_name]["formulation"]
-            variables = response_dict[factor_name]["variables"]
+            description = response_dict[factor_name].get("description", "Factor description not provided")
+            formulation = response_dict[factor_name].get("formulation", "Factor formulation not provided")
+            variables = response_dict[factor_name].get("variables", "Variables not provided")
             tasks.append(
                 FactorTask(
                     factor_name=factor_name,
@@ -327,11 +328,11 @@ def convert_model_experiment(self, response: str, trace: Trace) -> KGModelExperi
         tasks = []
         tasks.append(
             ModelTask(
-                name=response_dict["model_name"],
-                description=response_dict["description"],
-                architecture=response_dict["architecture"],
-                hyperparameters=response_dict["hyperparameters"],
-                model_type=response_dict["model_type"],
+                name=response_dict.get("model_name", "Model name not provided"),
+                description=response_dict.get("description", "Description not provided"),
+                architecture=response_dict.get("architecture", "Architecture not provided"),
+                hyperparameters=response_dict.get("hyperparameters", "Hyperparameters not provided"),
+                model_type=response_dict.get("model_type", "Model type not provided"),
                 version=2,
             )
         )
diff --git a/test/utils/test_kaggle.py b/test/utils/test_kaggle.py
index cadb7dd17..14a8030b8 100644
--- a/test/utils/test_kaggle.py
+++ b/test/utils/test_kaggle.py
@@ -27,7 +27,7 @@ def test_competition_template(self):
         ws.execute()
         success = (ws.workspace_path / "submission.csv").exists()
         self.assertTrue(success, "submission.csv is not generated")
-        ws.clear()
+        # ws.clear()  # keep the workspace for inspection; re-enable to clean up after the test
 
 
 if __name__ == "__main__":