diff --git a/config/config.yaml b/config/config.yaml
index 72cfdbf..34c08e7 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -7,16 +7,19 @@ save_artifacts:
   path: "./aga-artifacts"
   # path: "s3://autogluon-assistant-agts/outputs//aga-artifacts/"
 feature_transformers:
-  - _target_: autogluon_assistant.transformer.CAAFETransformer
-    eval_model: lightgbm
-    llm_model: gpt-3.5-turbo
-    num_iterations: 2
-    optimization_metric: roc
-  - _target_: autogluon_assistant.transformer.OpenFETransformer
-    n_jobs: 1
-    num_features_to_keep: 10
+  # - _target_: autogluon_assistant.transformer.CAAFETransformer
+  #   eval_model: lightgbm
+  #   llm_model: gpt-3.5-turbo
+  #   num_iterations: 2
+  #   optimization_metric: roc
+  # - _target_: autogluon_assistant.transformer.OpenFETransformer
+  #   n_jobs: 1
+  #   num_features_to_keep: 10
   #- _target_: autogluon_assistant.transformer.PretrainedEmbeddingTransformer
   #  model_name: 'all-mpnet-base-v2'
+  - _target_: autogluon_assistant.transformer.SimpleGenTransformer
+    n_jobs: 1
+    llm_model: bedrock-claude-3.5-sonnet
 autogluon:
   predictor_init_kwargs: {}
   predictor_fit_kwargs:
diff --git a/pyproject.toml b/pyproject.toml
index 3fe0fe1..0a3d5f3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
     "python-calamine",
     "sentence-transformers>=3.1.0",
     "tenacity>=8.2.2,<10.0",
+    "dspy-ai==2.5.0"
 ]

 [project.optional-dependencies]
@@ -63,7 +64,6 @@ target-version = ['py38', 'py39', 'py310']
 known_first_party = "autogluon_assistant"
 known_third_party = [
     "autogluon",
-    "caafe",
     "joblib",
     "langchain",
     "numpy",
diff --git a/src/autogluon_assistant/transformer/__init__.py b/src/autogluon_assistant/transformer/__init__.py
index 8ce8126..904cfa8 100644
--- a/src/autogluon_assistant/transformer/__init__.py
+++ b/src/autogluon_assistant/transformer/__init__.py
@@ -1,16 +1,16 @@
 from .base import BaseTransformer, TransformTimeoutError
 from .feature_transformers import (
     BaseFeatureTransformer,
-    CAAFETransformer,
     OpenFETransformer,
     PretrainedEmbeddingTransformer,
+    SimpleGenTransformer,
 )

 __all__ = [
     "BaseTransformer",
     "BaseFeatureTransformer",
-    "CAAFETransformer",
     "PretrainedEmbeddingTransformer",
     "OpenFETransformer",
     "TransformTimeoutError",
+    "SimpleGenTransformer",
 ]
diff --git a/src/autogluon_assistant/transformer/feature_transformers/__init__.py b/src/autogluon_assistant/transformer/feature_transformers/__init__.py
index 07d7656..cf6950c 100644
--- a/src/autogluon_assistant/transformer/feature_transformers/__init__.py
+++ b/src/autogluon_assistant/transformer/feature_transformers/__init__.py
@@ -1,11 +1,11 @@
 from .base import BaseFeatureTransformer
-from .caafe import CAAFETransformer
 from .openfe import OpenFETransformer
 from .scentenceFT import PretrainedEmbeddingTransformer
+from .simple_gen import SimpleGenTransformer

 __all__ = [
     "BaseFeatureTransformer",
-    "CAAFETransformer",
     "OpenFETransformer",
     "PretrainedEmbeddingTransformer",
+    "SimpleGenTransformer",
 ]
diff --git a/src/autogluon_assistant/transformer/feature_transformers/simple_gen.py b/src/autogluon_assistant/transformer/feature_transformers/simple_gen.py
new file mode 100644
index 0000000..7988905
--- /dev/null
+++ b/src/autogluon_assistant/transformer/feature_transformers/simple_gen.py
@@ -0,0 +1,278 @@
+from typing import Mapping, Tuple, Optional, Dict
+import logging
+import ast
+import re
+import os
+import traceback
+
+import pandas as pd
+import numpy as np
+import sklearn
+import dspy
+import copy
+from sklearn.model_selection import train_test_split
+from botocore.exceptions import ClientError
+from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential
+from .base import BaseFeatureTransformer
+from caafe.preprocessing import make_datasets_numeric
+
+
+logger = logging.getLogger(__name__)
+# pd.set_option("future.no_silent_downcasting", True)
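+
+# High-level flow of this module: SimpleGenTransformer asks an LLM (via DSPy) to guess the
+# dataset's domain and propose a few dataset-specific feature engineering ideas, turns those
+# ideas into pandas code, syntax-checks and executes the code, and keeps the generated code
+# only if it improves a held-out XGBoost score over the untransformed baseline.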
+
+
+class AWSAnthropicRetry(dspy.AWSAnthropic):
+
+    @retry(
+        stop=stop_after_attempt(5),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        # Retry only on Bedrock throttling; other ClientErrors propagate immediately.
+        retry=retry_if_exception(
+            lambda exception: isinstance(exception, ClientError)
+            and exception.response["Error"]["Code"] == "ThrottlingException"
+        ),
+    )
+    def _call_model(self, body: str) -> str:
+        return super()._call_model(body)
+
+
+def get_llm(model_str, kwargs) -> dspy.LM:
+    if model_str == "bedrock-claude-3.5-sonnet":  # load credentials from environment variables
+        return AWSAnthropicRetry(
+            aws_provider=dspy.Bedrock(region_name="us-west-2"),
+            model="anthropic.claude-3-5-sonnet-20240620-v1:0",
+        )
+    elif model_str == "bedrock-claude-3-sonnet":  # load credentials from environment variables
+        return AWSAnthropicRetry(
+            aws_provider=dspy.Bedrock(region_name="us-west-2"),
+            model="anthropic.claude-3-sonnet-20240229-v1:0",
+        )
+    elif model_str == "gpt-4-turbo":
+        return dspy.OpenAI(model="gpt-4-turbo", api_key=kwargs.get("openai_api_key", os.environ.get("OPENAI_API_KEY")))
+    else:
+        raise ValueError(f"Unsupported llm_model: {model_str}")
+
+
+def extract_python_code(raw_output: str) -> str:
+    match = re.search(r"```python\s*(.*?)\s*```", raw_output, re.DOTALL)
+    if match is None:
+        raise ValueError("No ```python``` code block found in the LLM output")
+    return match.group(1)
+
+
+def validate_python_code(code_string: str) -> Tuple[bool, str]:
+    try:
+        ast.parse(code_string)
+    except SyntaxError as e:
+        return False, f"Syntax error in the code: {e}"
+    return True, "Code has no syntax error"
+
+
+def construct_context(context: list) -> str:
+    return "\n".join(
+        [f"-----\nGenerated code:\n\n {code} \n\n Observed results: {status}\n-----" for code, status in context]
+    )
+
+
+def exec_python_code(code: str, df: pd.DataFrame) -> pd.DataFrame:
+    # Surface syntax errors before executing anything.
+    ast.parse(code)
+
+    # Single namespace exposing the dataframe plus a small set of libraries to the generated code.
+    namespace = {"df": df, "pd": pd, "np": np, "sklearn": sklearn}
+    exec(code, namespace)
+    assert "df" in namespace
+    return namespace["df"]
+
+
+def eval_xgboost(X_train, X_test, y_train, y_test, task_type) -> float:
+    from xgboost import XGBClassifier, XGBRegressor
+    from sklearn.metrics import accuracy_score, roc_auc_score, r2_score
+
+    if task_type == "binary":
+        metric_func = roc_auc_score
+        model = XGBClassifier()
+    elif task_type == "multiclass":
+        metric_func = accuracy_score
+        model = XGBClassifier()
+    else:
+        assert task_type == "regression"
+        metric_func = r2_score
+        model = XGBRegressor()
+
+    model.fit(X_train, y_train)
+    if task_type == "binary":
+        # ROC AUC expects scores/probabilities rather than hard class labels.
+        y_pred = model.predict_proba(X_test)[:, 1]
+    else:
+        y_pred = model.predict(X_test)
+    return metric_func(y_test, y_pred)
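+
+# Illustrative example (the column names below are hypothetical, not taken from any real
+# dataset): a generated snippet is expected to operate on the in-scope dataframe `df`, e.g.
+#
+#     df["income_per_household_member"] = df["income"] / (df["household_size"] + 1)
+#
+# exec_python_code() runs such a snippet with pandas/numpy/sklearn available and returns the
+# updated `df`; eval_xgboost() then scores the candidate features on a held-out split.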
+
+
+class ExtractInternalKnowledge(dspy.Signature):
+    """Given the task description and sample data, first guess the domain/application of
+    the data and then generate at most 3 feature engineering tricks specific to this
+    dataset and domain. Don't do feature selection or dimension reduction."""
+
+    task_desc = dspy.InputField(desc="Task description")
+    sample_data = dspy.InputField(desc="Sample rows and columns of the dataset")
+    context = dspy.InputField(desc="Observations from previous solutions")
+    feature_description = dspy.OutputField(
+        desc="A list of at most 3 feature engineering tricks specific to this dataset"
+    )
+
+
+class DescriptionToCode(dspy.Signature):
+    """Assume a dataframe called 'df' has been loaded in memory. Given some sample data of 'df'
+    and a list of feature engineering descriptions, generate a Python code block for each feature
+    engineering description and add these features as new columns to 'df'."""
+
+    sample_data = dspy.InputField(desc="Sample rows and columns of the dataset")
+    feature_description = dspy.InputField(desc="A list of descriptions of useful feature engineering methods")
+    context = dspy.InputField(desc="Observations from previous solutions")
+    python_code = dspy.OutputField(
+        desc="Python code to transform the dataset into features. Don't define functions in the code."
+    )
+
+
+class MultiStepGen(dspy.Module):
+
+    def __init__(self, max_iter: int):
+        super().__init__()
+        self.max_iter = max_iter
+        self.gen_domain_feature_desc = dspy.Predict(ExtractInternalKnowledge)
+        self.gen_code_from_desc = dspy.Predict(DescriptionToCode)
+
+    def forward(self, df, task_desc) -> Optional[str]:
+        sample_data_desc = df.head(5).to_string()
+        context = []
+        for _ in range(self.max_iter):
+            context_str = construct_context(context)
+
+            feat_desc = self.gen_domain_feature_desc(
+                task_desc=task_desc,
+                sample_data=sample_data_desc,
+                context=context_str,
+            ).feature_description
+
+            code = self.gen_code_from_desc(
+                sample_data=sample_data_desc,
+                feature_description=feat_desc,
+                context=context_str,
+            ).python_code
+
+            try:
+                code = extract_python_code(code)
+            except Exception as e:
+                context.append((code, f"Error extracting python code: {e}"))
+                logger.info(f"SimpleGen extract python code error\n\n {code} \n\n {e}")
+                continue
+
+            is_valid, message = validate_python_code(code)
+
+            if is_valid:
+                try:
+                    exec_python_code(code, df)
+                    context.append((code, "successfully run"))
+                    return code
+                except Exception as e:
+                    tb = traceback.format_exc()
+                    context.append((code, f"Runtime Error: {e}. Trace: {tb}"))
+                    logger.info(f"SimpleGen code has Runtime Error\n\n {code} \n\n {e} {tb}")
+                    continue
+            else:
+                context.append((code, message))
+                logger.info(f"SimpleGen code has Syntax Error\n\n {code} \n\n {message}")
+                continue
+
+        # All attempts failed.
+        return None
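+
+# MultiStepGen retries up to `max_iter` times and feeds every failed attempt back into the
+# next prompt via construct_context(); a context entry looks roughly like (illustrative):
+#
+#     -----
+#     Generated code:
+#
+#      df["ratio"] = df["a"] / df["b"]
+#
+#      Observed results: Runtime Error: 'b'. Trace: ...
+#     -----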
Trace: {tb}")) + logger.info(f"SimpleGen code has Runtime Error\n\n {code} \n\n {e} {tb}") + continue + else: + context.append((code, message)) + logger.info(f"SimpleGen code has Syntax Error\n\n {code} \n\n {message}") + continue + + +class SimpleGenTransformer(BaseFeatureTransformer): + + identifier = "simple-gen" + + def __init__( + self, + llm_model: str = "bedrock-claude-3.5-sonnet", + num_iterations: int = 3, + max_num_retries: int = 3, + eval_model: str = "xgboost", + **kwargs, + ) -> None: + assert eval_model == "xgboost", eval_model + self.iterations = num_iterations + self.max_num_retries = max_num_retries + self.eval_model = eval_model + self.metadata = {"transformer": "dspy-basic"} + self.llm_model = llm_model + llm = get_llm(llm_model, kwargs) + dspy.configure(lm=llm) + + def _fit_dataframes( + self, + train_X: pd.DataFrame, + train_y: pd.Series, + target_column_name: str, + problem_type: str, + dataset_description: str, + **kwargs, + ) -> None: + + assert problem_type in ("binary", "multiclass", "regression") + categorical_target = not pd.api.types.is_numeric_dtype(train_y) + if categorical_target: + encoded_y, _ = train_y.factorize() + + train_y = encoded_y if categorical_target else train_y + + # comptue baseline performance without any new features + best_code = "" + X_train_internal, X_test_internal, y_train_internal, y_test_internal = train_test_split( + train_X, train_y, test_size=0.2, random_state=42 + ) + transformed_train_X, transformed_test_X = make_datasets_numeric( + X_train_internal, X_test_internal, target_column_name + ) + best_perf = eval_xgboost( + transformed_train_X, + transformed_test_X, + y_train_internal, + y_test_internal, + problem_type, + ) + + for _ in range(self.iterations): + df_train = train_X.copy() + multi_step_gen = MultiStepGen(max_iter=self.max_num_retries) + code = multi_step_gen(df_train, dataset_description) + + if code is not None: + transformed_train_X = exec_python_code(code, X_train_internal) + transformed_test_X = exec_python_code(code, X_test_internal) + transformed_train_X, transformed_test_X = make_datasets_numeric( + transformed_train_X, transformed_test_X, target_column_name + ) + perf = eval_xgboost( + transformed_train_X, + transformed_test_X, + y_train_internal, + y_test_internal, + problem_type, + ) + if perf > best_perf: + best_code = code + best_perf = perf + + self.code = best_code + + if self.code == "": + logger.info("SimpleGen doesn't generate any good features") + logger.info(self.code) + else: + logger.info("SimpleGen generated features:\n") + logger.info(self.code) + + def _transform_dataframes(self, train_X: pd.DataFrame, test_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: + if self.code == "": + return train_X, test_X + transformed_train_X = exec_python_code(self.code, train_X) + transformed_test_X = exec_python_code(self.code, test_X) + transformed_train_X, transformed_test_X = make_datasets_numeric(transformed_train_X, transformed_test_X, "") + return transformed_train_X, transformed_test_X + + def get_metadata(self) -> Mapping: + return self.metadata