diff --git a/.gitignore b/.gitignore index 5f94213e2..61b1160c5 100644 --- a/.gitignore +++ b/.gitignore @@ -29,7 +29,7 @@ lib/ lib64 parts/ sdist/ -src/ +src/* var/ wheels/ pip-wheel-metadata/ @@ -145,7 +145,26 @@ data/ pyvenv.cfg *~ tmp/ -docs/_build/ *.DS_Store */catboost_info + +# Spark files +metastore_db/ + .vscode + +# VSCode scala exstention files +.metals +.bloop +.bsp + +# meld +*.orig + +# supplementary files +rsync-repo.sh +requirements.txt +airflow.yaml + +# temporary +experiments/tests diff --git a/experiments/experiment_utils.py b/experiments/experiment_utils.py new file mode 100644 index 000000000..895a3510c --- /dev/null +++ b/experiments/experiment_utils.py @@ -0,0 +1,589 @@ +import importlib +import json +import os +from typing import Tuple, Optional, cast, Dict, List + +import mlflow +import pandas as pd +from pyspark.conf import SparkConf +from pyspark.sql import SparkSession +from pyspark.sql import DataFrame +from pyspark.sql import functions as sf +from rs_datasets import MovieLens, MillionSongDataset + +from replay.models import ( + ALSWrap, + SLIM, + LightFMWrap, + ItemKNN, + Word2VecRec, + PopRec, + RandomRec, + AssociationRulesItemRec, + UserPopRec, + Wilson, + ClusterRec, + UCB, +) +from replay.models.base_rec import BaseRecommender +from replay.utils import log_exec_timer, get_number_of_allocated_executors +from replay.data_preparator import DataPreparator, Indexer +from replay.splitters import DateSplitter, UserSplitter + + +def get_nmslib_hnsw_params(spark_app_id: str): + index_params_str = os.environ.get("NMSLIB_HNSW_PARAMS") + if not index_params_str: + raise ValueError( + f"To use nmslib hnsw index you need to set the 'NMSLIB_HNSW_PARAMS' env variable! " + 'For example, {"method":"hnsw","space":"negdotprod_sparse_fast","M":16,"efS":200,"efC":200,"post":0,' + '"index_path":"/tmp/nmslib_hnsw_index_{spark_app_id}","build_index_on":"executor"}.' + ) + nmslib_hnsw_params = json.loads(index_params_str) + if ( + "index_path" in nmslib_hnsw_params + and "{spark_app_id}" in nmslib_hnsw_params["index_path"] + ): + nmslib_hnsw_params["index_path"] = nmslib_hnsw_params[ + "index_path" + ].replace("{spark_app_id}", spark_app_id) + print(f"nmslib_hnsw_params: {nmslib_hnsw_params}") + return nmslib_hnsw_params + + +def get_hnswlib_params(spark_app_id: str): + index_params_str = os.environ.get("HNSWLIB_PARAMS") + if not index_params_str: + raise ValueError( + f"To use hnswlib index you need to set the 'HNSWLIB_PARAMS' env variable! " + 'For example, {"space":"ip","M":100,"efS":2000,"efC":2000,"post":0,' + '"index_path":"/tmp/hnswlib_index_{spark_app_id}","build_index_on":"executor"}.' + ) + hnswlib_params = json.loads(index_params_str) + if ( + "index_path" in hnswlib_params + and "{spark_app_id}" in hnswlib_params["index_path"] + ): + hnswlib_params["index_path"] = hnswlib_params["index_path"].replace( + "{spark_app_id}", spark_app_id + ) + print(f"hnswlib_params: {hnswlib_params}") + return hnswlib_params + + +def get_model(model_name: str, seed: int, spark_app_id: str): + """Initializes model and returns an instance of it + + Args: + model_name: model name indicating which model to use. For example, `ALS` and `ALS_HNSWLIB`, where second is ALS with the hnsw index. + seed: seed + spark_app_id: spark application id. used for model artifacts paths. 
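+
+        Illustrative call (the *_HNSWLIB and *_NMSLIB_HNSW variants additionally require
+        the HNSWLIB_PARAMS / NMSLIB_HNSW_PARAMS env variables to be set):
+
+            model = get_model("ALS_HNSWLIB", seed=42, spark_app_id=spark.sparkContext.applicationId)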
+ """ + + if model_name == "ALS": + als_rank = int(os.environ.get("ALS_RANK", 100)) + num_blocks = int(os.environ.get("NUM_BLOCKS", 10)) + + mlflow.log_params({"num_blocks": num_blocks, "ALS_rank": als_rank}) + + model = ALSWrap( + rank=als_rank, + seed=seed, + num_item_blocks=num_blocks, + num_user_blocks=num_blocks, + ) + + elif model_name == "Explicit_ALS": + als_rank = int(os.environ.get("ALS_RANK", 100)) + mlflow.log_param("ALS_rank", als_rank) + model = ALSWrap(rank=als_rank, seed=seed, implicit_prefs=False) + elif model_name == "ALS_HNSWLIB": + als_rank = int(os.environ.get("ALS_RANK", 100)) + num_blocks = int(os.environ.get("NUM_BLOCKS", 10)) + hnswlib_params = get_hnswlib_params(spark_app_id) + mlflow.log_params( + { + "ALS_rank": als_rank, + "num_blocks": num_blocks, + "build_index_on": hnswlib_params["build_index_on"], + "hnswlib_params": hnswlib_params, + } + ) + model = ALSWrap( + rank=als_rank, + seed=seed, + num_item_blocks=num_blocks, + num_user_blocks=num_blocks, + hnswlib_params=hnswlib_params, + ) + elif model_name == "SLIM": + model = SLIM(seed=seed) + elif model_name == "SLIM_NMSLIB_HNSW": + nmslib_hnsw_params = get_nmslib_hnsw_params(spark_app_id) + mlflow.log_params( + { + "build_index_on": nmslib_hnsw_params["build_index_on"], + "nmslib_hnsw_params": nmslib_hnsw_params, + } + ) + model = SLIM(seed=seed, nmslib_hnsw_params=nmslib_hnsw_params) + elif model_name == "ItemKNN": + num_neighbours = int(os.environ.get("NUM_NEIGHBOURS", 10)) + mlflow.log_param("num_neighbours", num_neighbours) + model = ItemKNN(num_neighbours=num_neighbours) + elif model_name == "ItemKNN_NMSLIB_HNSW": + num_neighbours = int(os.environ.get("NUM_NEIGHBOURS", 10)) + nmslib_hnsw_params = get_nmslib_hnsw_params(spark_app_id) + mlflow.log_params( + { + "build_index_on": nmslib_hnsw_params["build_index_on"], + "nmslib_hnsw_params": nmslib_hnsw_params, + "num_neighbours": num_neighbours + } + ) + model = ItemKNN(num_neighbours=num_neighbours, nmslib_hnsw_params=nmslib_hnsw_params) + elif model_name == "LightFM": + model = LightFMWrap(random_state=seed) + elif model_name == "Word2VecRec": + word2vec_rank = int(os.environ.get("WORD2VEC_RANK", 100)) + mlflow.log_param("word2vec_rank", word2vec_rank) + model = Word2VecRec(rank=word2vec_rank, seed=seed) + elif model_name == "Word2VecRec_HNSWLIB": + hnswlib_params = get_hnswlib_params(spark_app_id) + word2vec_rank = int(os.environ.get("WORD2VEC_RANK", 100)) + mlflow.log_params( + { + "build_index_on": hnswlib_params["build_index_on"], + "hnswlib_params": hnswlib_params, + "word2vec_rank": word2vec_rank, + } + ) + + model = Word2VecRec( + rank=word2vec_rank, + seed=seed, + hnswlib_params=hnswlib_params, + ) + elif model_name == "PopRec": + use_relevance = os.environ.get("USE_RELEVANCE", "False") == "True" + model = PopRec(use_relevance=use_relevance) + mlflow.log_param("USE_RELEVANCE", use_relevance) + elif model_name == "UserPopRec": + model = UserPopRec() + elif model_name == "RandomRec_uniform": + model = RandomRec(seed=seed, distribution="uniform") + elif model_name == "RandomRec_popular_based": + model = RandomRec(seed=seed, distribution="popular_based") + elif model_name == "RandomRec_relevance": + model = RandomRec(seed=seed, distribution="relevance") + elif model_name == "AssociationRulesItemRec": + model = AssociationRulesItemRec() + elif model_name == "Wilson": + model = Wilson() + elif model_name == "ClusterRec": + num_clusters = int(os.environ.get("NUM_CLUSTERS", "10")) + mlflow.log_param("num_clusters", num_clusters) + model = 
ClusterRec(num_clusters=num_clusters) + elif model_name == "ClusterRec_HNSWLIB": + num_clusters = int(os.environ.get("NUM_CLUSTERS", "10")) + hnswlib_params = get_hnswlib_params(spark_app_id) + mlflow.log_params( + { + "num_clusters": num_clusters, + "build_index_on": hnswlib_params["build_index_on"], + "hnswlib_params": hnswlib_params, + } + ) + model = ClusterRec( + num_clusters=num_clusters, hnswlib_params=hnswlib_params + ) + elif model_name == "UCB": + model = UCB(seed=seed) + else: + raise ValueError("Unknown model.") + + return model + + +def get_models(models: Dict) -> List[BaseRecommender]: + + list_of_models = [] + for model_class_name, model_kwargs in models.items(): + module_name = ".".join(model_class_name.split('.')[:-1]) + class_name = model_class_name.split('.')[-1] + module = importlib.import_module(module_name) + clazz = getattr(module, class_name) + + base_model = cast(BaseRecommender, clazz(**model_kwargs)) + list_of_models.append(base_model) + + return list_of_models + +def prepare_datasets(dataset_name: str, spark: SparkSession, partition_num: int): + + if dataset_name.startswith("MovieLens"): + # name__{size} pattern + dataset_params = dataset_name.split("__") + if len(dataset_params) == 1: + dataset_version = "1m" + elif len(dataset_params) == 2: + dataset_version = dataset_params[1] + else: + raise ValueError("Too many dataset params.") + train_target_path = f"{os.environ['DATASETS_DIR']}MovieLens/train_{dataset_version}.parquet" + test_target_path = f"{os.environ['DATASETS_DIR']}MovieLens/test_{dataset_version}.parquet" + elif dataset_name.startswith("MillionSongDataset"): + # MillionSongDataset__{fraction} pattern + dataset_params = dataset_name.split("__") + if len(dataset_params) == 1: + fraction = "1.0" + else: + fraction = dataset_params[1] + train_target_path = f"{os.environ['DATASETS_DIR']}MillionSongDataset/fraction_{fraction}_train.parquet" + test_target_path = f"{os.environ['DATASETS_DIR']}MillionSongDataset/fraction_{fraction}_test.parquet" + else: + raise ValueError("Unknown dataset.") + + fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration()) + is_exists = fs.exists(spark._jvm.org.apache.hadoop.fs.Path(train_target_path)) + if is_exists and os.environ.get("FORCE_RECREATE_DATASETS", "False") != "True": + print(f"Path '{train_target_path}' already exists and FORCE_RECREATE_DATASETS != True. 
" + "Skipping datasets creation.") + return + + if dataset_name.startswith("MovieLens"): + data = MovieLens( + dataset_version, path=f"{os.environ['RS_DATASETS_DIR']}MovieLens" + ) + data = data.ratings + mapping = { + "user_id": "user_id", + "item_id": "item_id", + "relevance": "rating", + "timestamp": "timestamp", + } + elif dataset_name.startswith("MillionSongDataset"): + if fraction == "1.0": + data = MillionSongDataset( + path=f"{os.environ['RS_DATASETS_DIR']}MillionSongDataset" + ) + data = data.train + elif fraction == "train_10x_users": + data = spark.read.parquet( + "file:///opt/spark_data/replay_datasets/MillionSongDataset/train_10x_users.parquet" + ) + elif fraction == "train_100m_users_1k_items": + data = spark.read.parquet( + "file:///opt/spark_data/replay_datasets/MillionSongDataset/train_100m_users_1k_items.parquet" + ) + else: + data = pd.read_csv(f"/opt/spark_data/replay_datasets/MillionSongDataset/train_{fraction}.csv") + + mapping = { + "user_id": "user_id", + "item_id": "item_id", + "relevance": "play_count", + } + + with log_exec_timer("DataPreparator execution") as preparator_timer: + preparator = DataPreparator(columns_mapping=mapping) + log = preparator.transform(data) + log = log.repartition(partition_num).cache() + log.write.mode("overwrite").format("noop").save() + mlflow.log_metric("preparator_sec", preparator_timer.duration) + + mlflow.log_metric("log_num_partitions", log.rdd.getNumPartitions()) + + if os.getenv("FILTER_LOG") == "True": + with log_exec_timer("log filtering") as log_filtering_timer: + # will consider ratings >= 3 as positive feedback. A positive feedback is treated with relevance = 1 + only_positives_log = log.filter( + sf.col("relevance") >= 3 # 1 + ).withColumn("relevance", sf.lit(1)) + only_positives_log = only_positives_log.cache() + only_positives_log.write.mode("overwrite").format("noop").save() + log = only_positives_log + mlflow.log_metric("log_filtering_sec", log_filtering_timer.duration) + + with log_exec_timer( + "log.count() execution" + ) as log_count_timer: + log_length = log.count() + mlflow.log_metric("log_count_sec", log_count_timer.duration) + mlflow.log_param("log_length", log_length) + + with log_exec_timer("Indexer training") as indexer_fit_timer: + indexer = Indexer(user_col="user_id", item_col="item_id") + indexer.fit( + users=log.select("user_id"), items=log.select("item_id") + ) + mlflow.log_metric("indexer_fit_sec", indexer_fit_timer.duration) + + with log_exec_timer("Indexer transform") as indexer_transform_timer: + log_replay = indexer.transform(df=log) + log_replay = log_replay.cache() + log_replay.write.mode("overwrite").format("noop").save() + mlflow.log_metric( + "indexer_transform_sec", indexer_transform_timer.duration + ) + + with log_exec_timer("Splitter execution") as splitter_timer: + if dataset_name.startswith("MovieLens"): + # MovieLens + train_spl = DateSplitter( + test_start=0.2, + drop_cold_items=True, + drop_cold_users=True, + ) + else: + # MillionSongDataset + train_spl = UserSplitter( + item_test_size=0.2, + shuffle=True, + drop_cold_items=True, + drop_cold_users=True, + ) + train, test = train_spl.split(log_replay) + + train = train.cache() + test = test.cache() + train.write.mode("overwrite").format("noop").save() + test.write.mode("overwrite").format("noop").save() + test = test.repartition(partition_num).cache() + mlflow.log_metric("splitter_sec", splitter_timer.duration) + + mlflow.log_metric("train_num_partitions", train.rdd.getNumPartitions()) + mlflow.log_metric("test_num_partitions", 
test.rdd.getNumPartitions()) + + with log_exec_timer("Train/test datasets saving to parquet") as parquets_save_timer: + # WARN: 'fraction' is not fraction of test or train, it is fraction of input dataset. + train.write.mode('overwrite').parquet( + train_target_path + ) + test.write.mode('overwrite').parquet( + test_target_path + ) + mlflow.log_metric(f"parquets_write_sec", parquets_save_timer.duration) + + +def get_datasets( + dataset_name, spark: SparkSession, partition_num: int +) -> Tuple[DataFrame, DataFrame, Optional[DataFrame]]: + """ + Reads prepared datasets from hdfs or disk and returns them. + + Args: + dataset_name: Dataset name with size postfix (optional). For example `MovieLens__10m` or `MovieLens__25m`. + spark: spark session + partition_num: Number of partitions in output dataframes. + + Returns: + train: train dataset + test: test dataset + user_features: dataframe with user features (optional) + + """ + user_features = None + if dataset_name.startswith("MovieLens"): + dataset_params = dataset_name.split("__") + if len(dataset_params) == 1: + dataset_version = "1m" + else: + dataset_version = dataset_params[1] + + with log_exec_timer( + "Train/test datasets reading to parquet" + ) as parquets_read_timer: + train = spark.read.parquet( # hdfs://node21.bdcl:9000 + f"{os.environ['DATASETS_DIR']}MovieLens/train_{dataset_version}.parquet" + ) + test = spark.read.parquet( # hdfs://node21.bdcl:9000 + f"{os.environ['DATASETS_DIR']}MovieLens/test_{dataset_version}.parquet" + ) + elif dataset_name.startswith("MillionSongDataset"): + # MillionSongDataset__{fraction} pattern + dataset_params = dataset_name.split("__") + if len(dataset_params) == 1: + fraction = "1.0" + else: + fraction = dataset_params[1] + + if fraction == "train_100m_users_1k_items": + with log_exec_timer( + "Train/test datasets reading to parquet" + ) as parquets_read_timer: + train = spark.read.parquet( + f"{os.environ['DATASETS_DIR']}MillionSongDataset/fraction_{fraction}_train.parquet" + ) + test = spark.read.parquet( + f"{os.environ['DATASETS_DIR']}MillionSongDataset/fraction_{fraction}_test.parquet" + ) + else: + if partition_num in {6, 12, 24, 48}: + with log_exec_timer( + "Train/test datasets reading to parquet" + ) as parquets_read_timer: + train = spark.read.parquet( + f"{os.environ['DATASETS_DIR']}MillionSongDataset/" + f"fraction_{fraction}_train_{partition_num}_partition.parquet" + ) + test = spark.read.parquet( + f"{os.environ['DATASETS_DIR']}MillionSongDataset/" + f"fraction_{fraction}_test_{partition_num}_partition.parquet" + ) + else: + with log_exec_timer( + "Train/test datasets reading to parquet" + ) as parquets_read_timer: + train = spark.read.parquet( + f"{os.environ['DATASETS_DIR']}MillionSongDataset/" + f"fraction_{fraction}_train_24_partition.parquet" + ) + test = spark.read.parquet( + f"{os.environ['DATASETS_DIR']}MillionSongDataset/" + f"fraction_{fraction}_test_24_partition.parquet" + ) + elif dataset_name == "ml1m": + with log_exec_timer( + "Train/test/user_features datasets reading to parquet" + ) as parquets_read_timer: + train = spark.read.parquet( + f"{os.environ['DATASETS_DIR']}ml1m_train.parquet" + ) + test = spark.read.parquet( + f"{os.environ['DATASETS_DIR']}ml1m_test.parquet" + ) + # user_features = spark.read.parquet( + # "/opt/spark_data/replay_datasets/ml1m_user_features.parquet" + # ) + # .select("user_idx", "gender_idx", "age", "occupation", "zip_code_idx") + elif dataset_name == "ml1m_first_level_default": + with log_exec_timer( + "Train/test/user_features datasets reading 
to parquet" + ) as parquets_read_timer: + train = spark.read.parquet( + "file:///opt/spark_data/replay/experiments/ml1m_first_level_default/train.parquet" + ) + test = spark.read.parquet( + "file:///opt/spark_data/replay/experiments/ml1m_first_level_default/test.parquet" + ) + elif dataset_name == "ml1m_1m_users_3_7k_items": + with log_exec_timer( + "Train/test/user_features datasets reading to parquet" + ) as parquets_read_timer: + train = spark.read.parquet( + "hdfs://node21.bdcl:9000/opt/spark_data/replay_datasets/ml1m_1m_users_3_7k_items_train.parquet" + ) + test = spark.read.parquet( + "hdfs://node21.bdcl:9000/opt/spark_data/replay_datasets/ml1m_1m_users_3_7k_items_test.parquet" + ) + user_features = spark.read.parquet( + "hdfs://node21.bdcl:9000/opt/spark_data/replay_datasets/" + "ml1m_1m_users_3_7k_items_user_features.parquet" + ) + elif dataset_name == "ml1m_1m_users_37k_items": + with log_exec_timer( + "Train/test/user_features datasets reading to parquet" + ) as parquets_read_timer: + train = spark.read.parquet( + "/opt/spark_data/replay_datasets/ml1m_1m_users_37k_items_train.parquet" + ) + test = spark.read.parquet( + "/opt/spark_data/replay_datasets/ml1m_1m_users_37k_items_test.parquet" + ) + user_features = spark.read.parquet( + "/opt/spark_data/replay_datasets/ml1m_1m_users_37k_items_user_features.parquet" + ) + else: + raise ValueError("Unknown dataset.") + + train = train.repartition(partition_num, "user_idx") + test = test.repartition(partition_num, "user_idx") + + mlflow.log_metric("parquets_read_sec", parquets_read_timer.duration) + + return train, test, user_features + + +def get_spark_configs_as_dict(spark_conf: SparkConf): + return { + "spark.driver.cores": spark_conf.get("spark.driver.cores"), + "spark.driver.memory": spark_conf.get("spark.driver.memory"), + "spark.memory.fraction": spark_conf.get("spark.memory.fraction"), + "spark.executor.cores": spark_conf.get("spark.executor.cores"), + "spark.executor.memory": spark_conf.get("spark.executor.memory"), + "spark.executor.instances": spark_conf.get("spark.executor.instances"), + "spark.sql.shuffle.partitions": spark_conf.get( + "spark.sql.shuffle.partitions" + ), + "spark.default.parallelism": spark_conf.get( + "spark.default.parallelism" + ), + } + + +def check_number_of_allocated_executors(spark: SparkSession): + """ + Checks whether enough executors are allocated or not. If not, then throws an exception. 
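+    The check is performed only when the CHECK_NUMBER_OF_ALLOCATED_EXECUTORS env variable is set to "True".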
+ + Args: + spark: spark session + """ + + if os.environ.get('CHECK_NUMBER_OF_ALLOCATED_EXECUTORS') != "True": + return + + spark_conf: SparkConf = spark.sparkContext.getConf() + + # if enough executors is not allocated in the cluster mode, then we stop the experiment + if spark_conf.get("spark.executor.instances"): + if get_number_of_allocated_executors(spark) < int( + spark_conf.get("spark.executor.instances") + ): + raise Exception("Not enough executors to run experiment!") + + +def get_partition_num(spark_conf: SparkConf): + if os.environ.get("PARTITION_NUM"): + partition_num = int(os.environ.get("PARTITION_NUM")) + else: + if spark_conf.get("spark.cores.max") is None: + partition_num = os.cpu_count() + else: + partition_num = int(spark_conf.get("spark.cores.max")) + + return partition_num + + +def get_log_info( + log: DataFrame, user_col="user_idx", item_col="item_idx" +) -> Tuple[int, int, int]: + """ + Basic log statistics + + >>> from replay.session_handler import State + >>> spark = State().session + >>> log = spark.createDataFrame([(1, 2), (3, 4), (5, 2)]).toDF("user_idx", "item_idx") + >>> log.show() + +--------+--------+ + |user_idx|item_idx| + +--------+--------+ + | 1| 2| + | 3| 4| + | 5| 2| + +--------+--------+ + + >>> rows_count, users_count, items_count = get_log_info(log) + >>> print((rows_count, users_count, items_count)) + (3, 3, 2) + + :param log: interaction log containing ``user_idx`` and ``item_idx`` + :param user_col: name of a columns containing users' identificators + :param item_col: name of a columns containing items' identificators + + :returns: statistics string + """ + cnt = log.count() + user_cnt = log.select(user_col).distinct().count() + item_cnt = log.select(item_col).distinct().count() + return cnt, user_cnt, item_cnt diff --git a/experiments/run_two_stage.py b/experiments/run_two_stage.py new file mode 100644 index 000000000..d93eded57 --- /dev/null +++ b/experiments/run_two_stage.py @@ -0,0 +1,333 @@ +""" +This script is a Spark application that executes replay TwoStagesScenario. +Parameters sets via environment variables. + +launch example: + $ export DATASET=MovieLens + $ export MODEL=ALS + $ export ALS_RANK=100 + $ export SEED=22 + $ export K=10 + $ python experiments/run_experiment.py + +or run in one line: + $ DATASET=MovieLens MODEL=ALS ALS_RANK=100 SEED=22 K=10 K_LIST_METRICS=5,10 python experiments/run_experiment.py + +All params: + DATASET: dataset name + Available values: + MovieLens__100k + MovieLens==MovieLens__1m + MovieLens__10m + MovieLens__20m + MovieLens__25m + MillionSongDataset + + MODEL: model name + Available values: + LightFM + PopRec + UserPopRec + ALS + Explicit_ALS + ALS_HNSWLIB + Word2VecRec + Word2VecRec_HNSWLIB + SLIM + SLIM_NMSLIB_HNSW + ItemKNN + ItemKNN_NMSLIB_HNSW + ClusterRec + RandomRec_uniform + RandomRec_popular_based + RandomRec_relevance + AssociationRulesItemRec + Wilson + UCB + + SEED: seed + + K: number of desired recommendations per user + + K_LIST_METRICS: List of K values (separated by commas) to calculate metrics. For example, K_LIST_METRICS=5,10. + It perform NDCG@5, NDCG@10, MAP@5, MAP@10, HitRate@5 and HitRate@10 calculation. + + NMSLIB_HNSW_PARAMS: nmslib hnsw index params. 
Double quotes must be used instead of single quotes + Example: {"method":"hnsw","space":"negdotprod_sparse_fast","M":100,"efS":2000,"efC":2000,"post":0, + "index_path":"/tmp/nmslib_hnsw_index_{spark_app_id}","build_index_on":"executor"} + The "space" parameter described on the page https://github.com/nmslib/nmslib/blob/master/manual/spaces.md. + Parameters "M", "efS" and "efC" are described at https://github.com/nmslib/nmslib/blob/master/manual/methods.md#graph-based-search-methods-sw-graph-and-hnsw. + The "build_index_on" parameter specifies whether the index will be built on the "driver" or "executor". + If "build_index_on"="executor" then "index_path" must be specified. + "index_path" determines where the built index should be stored. "index_path" can be a path in hdfs. + If "build_index_on"="driver", then the index built on the driver will be passed to the executors via the `SparkContext.addFile` mechanism. + + HNSWLIB_PARAMS: hnswlib index params. Double quotes must be used instead of single quotes + Example: {"space":"ip","M":100,"efS":2000,"efC":2000,"post":0, + "index_path":"/tmp/hnswlib_index_{spark_app_id}","build_index_on":"executor"} + The "space" parameter described on the page https://github.com/nmslib/hnswlib/blob/master/README.md#supported-distances + Parameters "M", "efS" and "efC" are described at https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md + Parameters "build_index_on" and "index_path" are the same as for NMSLIB_HNSW_PARAMS + + ALS_RANK: rank for ALS model, i.e. length of ALS factor vectors + + NUM_BLOCKS: num_item_blocks and num_user_blocks values in ALS model. Default: 10. + + WORD2VEC_RANK: rank of Word2Vec model + + NUM_NEIGHBOURS: ItemKNN param + + NUM_CLUSTERS: number of clusters in Cluster model + + USE_SCALA_UDFS_METRICS: if set to "True", then metrics will be calculated via scala UDFs + + USE_BUCKETING: if set to "True", then train and test dataframes will be bucketed + + DATASETS_DIR: where train and test datasets will be stored + + FORCE_RECREATE_DATASETS: if set to "True", then train and test dataframes will be recreated + + RS_DATASETS_DIR: where files will be downloaded by the rs_datasets package + + FILTER_LOG: if set to "True", the log will be filtered by "relevance" >= 1 + + CHECK_NUMBER_OF_ALLOCATED_EXECUTORS: If set to "True", then number of allocated executors will be checked. + And if there are not enough executors, then the program will stop. + + PARTITION_NUM: number of partition to repartition test and train dataframes. 
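+
+    PATH_TO_SLAMA_TABULAR_CONFIG: path to the SLAMA tabular config used by the second-level model.
+        Default: "tabular_config.yml".
+
+    EXPERIMENT: MLflow experiment name
+
+    MLFLOW_TRACKING_URI: MLflow tracking server URI
+
+    Cluster launch example (see experiments/submit_two_stage.sh):
+        $ poetry build && ./experiments/submit_two_stage.sh experiments/run_two_stage.py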
+ +""" + +import logging +import os +from importlib.metadata import version + +import mlflow +from pyspark.conf import SparkConf +from pyspark.sql import SparkSession + +from experiment_utils import ( + get_datasets, + get_spark_configs_as_dict, + check_number_of_allocated_executors, + get_partition_num, + prepare_datasets, get_models, +) +from replay.experiment import Experiment +from replay.metrics import HitRate, MAP, NDCG +from replay.scenarios import TwoStagesScenario +from replay.session_handler import get_spark_session +from replay.splitters import UserSplitter +from replay.utils import ( + JobGroup, + log_exec_timer, +) +from replay.utils import logger + +import warnings + +warnings.filterwarnings("ignore", category=FutureWarning) +warnings.filterwarnings("ignore", category=UserWarning) + +spark_logger = logging.getLogger("py4j") +spark_logger.setLevel(logging.WARN) + +formatter = logging.Formatter( + "%(asctime)s %(levelname)s %(name)s: %(message)s", + datefmt="%d/%m/%y %H:%M:%S", +) +streamHandler = logging.StreamHandler() +streamHandler.setFormatter(formatter) +# logger.addHandler(streamHandler) + +# fileHandler = logging.FileHandler("/tmp/replay.log") +# fileHandler.setFormatter(formatter) +# logger.addHandler(fileHandler) + +# logger.setLevel(logging.DEBUG) + +logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + handlers=[ + # fileHandler, + streamHandler + ], +) + +logging.getLogger("urllib3").setLevel(logging.WARNING) + + +def main(spark: SparkSession, dataset_name: str): + spark_conf: SparkConf = spark.sparkContext.getConf() + + check_number_of_allocated_executors(spark) + + k = int(os.environ.get("K", 100)) + k_list_metrics = list( + map(int, os.environ.get("K_LIST_METRICS", "5,10,25,100").split(",")) + ) + seed = int(os.environ.get("SEED", 1234)) + mlflow_tracking_uri = os.environ.get( + "MLFLOW_TRACKING_URI", "http://node2.bdcl:8822" + ) + model_name = os.environ.get("MODEL", "some_mode") + + partition_num = get_partition_num(spark_conf) + + mlflow.set_tracking_uri(mlflow_tracking_uri) + mlflow.set_experiment(os.environ.get("EXPERIMENT", "delete")) + + with mlflow.start_run(): + params = get_spark_configs_as_dict(spark_conf) + params.update( + { + "spark.applicationId": spark.sparkContext.applicationId, + "pyspark": version("pyspark"), + "dataset": dataset_name, + "seed": seed, + "K": k, + } + ) + mlflow.log_params(params) + + prepare_datasets(dataset_name, spark, partition_num) + + train, test, user_features = get_datasets( + dataset_name, spark, partition_num + ) + + first_levels_models_params = { + "replay.models.knn.ItemKNN": {"num_neighbours": int(os.environ.get("NUM_NEIGHBOURS", 100))}, + "replay.models.als.ALSWrap": { + "rank": int(os.environ.get("ALS_RANK", 100)), + "seed": seed, + "num_item_blocks": int(os.environ.get("NUM_BLOCKS", 10)), + "num_user_blocks": int(os.environ.get("NUM_BLOCKS", 10)), + "hnswlib_params": { + "space": "ip", + "M": 100, + "efS": 2000, + "efC": 2000, + "post": 0, + "index_path": f"/tmp/als_hnswlib_index_{spark.sparkContext.applicationId}", + "build_index_on": "executor", + }, + }, + "replay.models.word2vec.Word2VecRec": { + "rank": int(os.environ.get("WORD2VEC_RANK", 100)), + "seed": seed, + "hnswlib_params": { + "space": "ip", + "M": 100, + "efS": 2000, + "efC": 2000, + "post": 0, + "index_path": f"/tmp/word2vec_hnswlib_index_{spark.sparkContext.applicationId}", + "build_index_on": "executor", + }, + }, + } + mlflow.log_params(first_levels_models_params) + + # Initialization of first 
level models + first_level_models = get_models(first_levels_models_params) + use_first_level_models_feat = [False, True, False] + assert len(first_level_models) == len(use_first_level_models_feat) + + mlflow.log_param( + "first_level_models", + [type(m).__name__ for m in first_level_models], + ) + mlflow.log_param( + "use_first_level_models_feat", use_first_level_models_feat + ) + + second_model_params = { + "cpu_limit": 80, # 20 + "memory_limit": int(80 * 0.95), # 40 + "timeout": 400, + "general_params": {"use_algos": [["lgb"]]}, + "lgb_params": { + "use_single_dataset_mode": True, + "convert_to_onnx": False, + "mini_batch_size": 1000, + }, + "linear_l2_params": {"default_params": {"regParam": [1e-5]}}, + "reader_params": {"cv": 2, "advanced_roles": False, "samples": 10_000} + } + mlflow.log_param("second_model_params", second_model_params) + + scenario = TwoStagesScenario( + train_splitter=UserSplitter( + item_test_size=0.2, shuffle=True, seed=42 + ), + first_level_models=first_level_models, + use_first_level_models_feat=use_first_level_models_feat, + second_model_type="slama", + second_model_params=second_model_params, + second_model_config_path=os.environ.get( + "PATH_TO_SLAMA_TABULAR_CONFIG", "tabular_config.yml" + ), + use_generated_features=True + ) + + # Model fitting + with log_exec_timer( + f"{type(scenario).__name__} fitting" + ) as timer, JobGroup( + f"{type(scenario).__name__} fitting", + f"{type(scenario).__name__}.fit()", + ): + scenario.fit(log=train, user_features=None, item_features=None) + mlflow.log_metric(f"{type(scenario).__name__}.fit_sec", timer.duration) + + # Model inference + with log_exec_timer( + f"{type(scenario).__name__} inference" + ) as timer, JobGroup( + f"{type(scenario).__name__} inference", + f"{type(scenario).__name__}.predict()", + ): + recs = scenario.predict( + log=train, + k=k, + users=test.select("user_idx").distinct(), + filter_seen_items=True, + ) + mlflow.log_metric( + f"{type(scenario).__name__}.predict_sec", timer.duration + ) + + with log_exec_timer("Metrics calculation") as metrics_timer, JobGroup( + "Metrics calculation", "e.add_result()" + ): + e = Experiment( + test, + { + MAP(): k_list_metrics, + NDCG(): k_list_metrics, + HitRate(): k_list_metrics, + }, + ) + e.add_result(model_name, recs) + mlflow.log_metric("metrics_sec", metrics_timer.duration) + metrics = dict() + for k in k_list_metrics: + metrics["NDCG.{}".format(k)] = e.results.at[ + model_name, "NDCG@{}".format(k) + ] + metrics["MAP.{}".format(k)] = e.results.at[ + model_name, "MAP@{}".format(k) + ] + metrics["HitRate.{}".format(k)] = e.results.at[ + model_name, "HitRate@{}".format(k) + ] + mlflow.log_metrics(metrics) + + +if __name__ == "__main__": + spark_sess = get_spark_session() + dataset = os.environ.get("DATASET", "MovieLens_1m") + main(spark=spark_sess, dataset_name=dataset) + spark_sess.stop() diff --git a/experiments/submit_two_stage.sh b/experiments/submit_two_stage.sh new file mode 100755 index 000000000..b815aad5b --- /dev/null +++ b/experiments/submit_two_stage.sh @@ -0,0 +1,120 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT="$1" + +PYSPARK_PYTHON_PATH=${PYSPARK_PYTHON_PATH:-"/python_envs/.replay_venv/bin/python3"} +DRIVER_CORES=${DRIVER_CORES:-"2"} +DRIVER_MEMORY=${DRIVER_MEMORY:-"20g"} +DRIVER_MAX_RESULT_SIZE=${DRIVER_MAX_RESULT_SIZE:-"5g"} +EXECUTOR_CORES=${EXECUTOR_CORES:-"6"} +EXECUTOR_MEMORY=${EXECUTOR_MEMORY:-"40g"} +FILTER_LOG=${FILTER_LOG:-"True"} +SEED=${SEED:-"42"} +K=${K:-"100"} +K_LIST_METRICS=${K_LIST_METRICS:-"5,10,50,100"} 
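+# NOTE (assumed rationale): double quotes inside the JSON index params below
+# (HNSWLIB_PARAMS, NMSLIB_HNSW_PARAMS) are escaped so that the values survive being
+# forwarded to the driver via the spark-submit --conf spark.yarn.appMasterEnv.* options.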
+USE_BUCKETING=${USE_BUCKETING:-"True"} +ALS_RANK=${ALS_RANK:-"100"} +NUM_NEIGHBOURS=${NUM_NEIGHBOURS:-"100"} +NUM_CLUSTERS=${NUM_CLUSTERS:-"100"} +WORD2VEC_RANK=${WORD2VEC_RANK:-"100"} +HNSWLIB_PARAMS=${HNSWLIB_PARAMS:-'{\"space\":\"ip\",\"M\":100,\"efS\":2000,\"efC\":2000,\"post\":0,\"index_path\":\"/tmp/hnswlib_index_{spark_app_id}\",\"build_index_on\":\"executor\"}'} +NMSLIB_HNSW_PARAMS=${NMSLIB_HNSW_PARAMS:-'{\"method\":\"hnsw\",\"space\":\"negdotprod_sparse_fast\",\"M\":100,\"efS\":2000,\"efC\":2000,\"post\":0,\"index_path\":\"/tmp/nmslib_hnsw_index_{spark_app_id}\",\"build_index_on\":\"executor\"}'} + +WAREHOUSE_DIR=${WAREHOUSE_DIR:-"hdfs://node21.bdcl:9000/spark-warehouse"} +DATASETS_DIR=${DATASETS_DIR:-"/opt/spark_data/replay_datasets/"} +RS_DATASETS_DIR=${RS_DATASETS_DIR:-"/opt/spark_data/replay_datasets/"} +FORCE_RECREATE_DATASETS=${FORCE_RECREATE_DATASETS:-"False"} +CHECK_NUMBER_OF_ALLOCATED_EXECUTORS=${CHECK_NUMBER_OF_ALLOCATED_EXECUTORS:-"False"} # True + +EXECUTOR_INSTANCES=${EXECUTOR_INSTANCES:-"2"} +DATASET=${DATASET:-"MovieLens__1m"} # MillionSongDataset MovieLens__25m Netflix +MODEL=${MODEL:-"two_stage"} +EXPERIMENT=$MODEL"_$(cut -d'_' -f1 <<<$DATASET)" + +# calculable variables +# shellcheck disable=SC2004 +CORES_MAX=$(($EXECUTOR_CORES * $EXECUTOR_INSTANCES)) +# shellcheck disable=SC2004 +PARTITION_NUM=$((3 * $CORES_MAX)) +NUM_BLOCKS=$((3 * $CORES_MAX)) + + + +spark-submit \ +--master yarn \ +--deploy-mode cluster \ +--conf 'spark.yarn.appMasterEnv.SCRIPT_ENV=cluster' \ +--conf 'spark.yarn.appMasterEnv.PYSPARK_PYTHON='${PYSPARK_PYTHON_PATH} \ +--conf 'spark.yarn.appMasterEnv.MLFLOW_TRACKING_URI=http://node2.bdcl:8822' \ +--conf 'spark.yarn.appMasterEnv.DATASET='${DATASET} \ +--conf 'spark.yarn.appMasterEnv.SEED='${SEED} \ +--conf 'spark.yarn.appMasterEnv.K='${K} \ +--conf 'spark.yarn.appMasterEnv.K_LIST_METRICS='${K_LIST_METRICS} \ +--conf 'spark.yarn.appMasterEnv.MODEL='${MODEL} \ +--conf 'spark.yarn.appMasterEnv.ALS_RANK='${ALS_RANK} \ +--conf 'spark.yarn.appMasterEnv.NUM_NEIGHBOURS='${NUM_NEIGHBOURS} \ +--conf 'spark.yarn.appMasterEnv.NUM_CLUSTERS='${NUM_CLUSTERS} \ +--conf 'spark.yarn.appMasterEnv.WORD2VEC_RANK='${WORD2VEC_RANK} \ +--conf 'spark.yarn.appMasterEnv.HNSWLIB_PARAMS='${HNSWLIB_PARAMS} \ +--conf 'spark.yarn.appMasterEnv.NMSLIB_HNSW_PARAMS='${NMSLIB_HNSW_PARAMS} \ +--conf 'spark.yarn.appMasterEnv.USE_RELEVANCE='${USE_RELEVANCE} \ +--conf 'spark.yarn.appMasterEnv.LOG_TO_MLFLOW='${LOG_TO_MLFLOW} \ +--conf 'spark.yarn.appMasterEnv.USE_SCALA_UDFS_METRICS='${USE_SCALA_UDFS_METRICS} \ +--conf 'spark.yarn.appMasterEnv.EXPERIMENT='${EXPERIMENT} \ +--conf 'spark.yarn.appMasterEnv.FILTER_LOG='${FILTER_LOG} \ +--conf 'spark.yarn.appMasterEnv.NUM_BLOCKS='${NUM_BLOCKS} \ +--conf 'spark.yarn.appMasterEnv.PARTITION_NUM='${PARTITION_NUM} \ +--conf 'spark.yarn.appMasterEnv.USE_BUCKETING='${USE_BUCKETING} \ +--conf 'spark.yarn.appMasterEnv.DATASETS_DIR='${DATASETS_DIR} \ +--conf 'spark.yarn.appMasterEnv.RS_DATASETS_DIR='${RS_DATASETS_DIR} \ +--conf 'spark.yarn.appMasterEnv.FORCE_RECREATE_DATASETS='${FORCE_RECREATE_DATASETS} \ +--conf 'spark.yarn.appMasterEnv.CHECK_NUMBER_OF_ALLOCATED_EXECUTORS='${CHECK_NUMBER_OF_ALLOCATED_EXECUTORS} \ +--conf 'spark.yarn.appMasterEnv.GIT_PYTHON_REFRESH=quiet' \ +--conf "spark.yarn.tags=replay" \ +--conf 'spark.kryoserializer.buffer.max=2010m' \ +--conf 'spark.driver.cores='${DRIVER_CORES} \ +--conf 'spark.driver.memory='${DRIVER_MEMORY} \ +--conf 'spark.driver.maxResultSize='${DRIVER_MAX_RESULT_SIZE} \ +--conf 
'spark.executor.instances='${EXECUTOR_INSTANCES} \ +--conf 'spark.executor.cores='${EXECUTOR_CORES} \ +--conf 'spark.executor.memory='${EXECUTOR_MEMORY} \ +--conf 'spark.cores.max='${CORES_MAX} \ +--conf 'spark.memory.fraction=0.4' \ +--conf 'spark.sql.shuffle.partitions='${PARTITION_NUM} \ +--conf 'spark.default.parallelism='${PARTITION_NUM} \ +--conf 'spark.yarn.maxAppAttempts=1' \ +--conf 'spark.rpc.message.maxSize=1024' \ +--conf 'spark.sql.autoBroadcastJoinThreshold=100MB' \ +--conf 'spark.sql.execution.arrow.pyspark.enabled=true' \ +--conf 'spark.scheduler.minRegisteredResourcesRatio=1.0' \ +--conf 'spark.scheduler.maxRegisteredResourcesWaitingTime=180s' \ +--conf 'spark.eventLog.enabled=true' \ +--conf 'spark.eventLog.dir=hdfs://node21.bdcl:9000/shared/spark-logs' \ +--conf 'spark.yarn.historyServer.allowTracking=true' \ +--conf 'spark.driver.extraJavaOptions=-Dio.netty.tryReflectionSetAccessible=true' \ +--conf 'spark.executor.extraJavaOptions=-Dio.netty.tryReflectionSetAccessible=true' \ +--conf 'spark.executor.extraClassPath=/jars/spark_3.1.1_synapseml_0.9.5_jars/*' \ +--conf 'spark.driver.extraClassPath=/jars/spark_3.1.1_synapseml_0.9.5_jars/*' \ +--conf 'spark.sql.warehouse.dir='${WAREHOUSE_DIR} \ +--conf 'spark.task.maxFailures=1' \ +--conf 'spark.excludeOnFailure.task.maxTaskAttemptsPerNode=1' \ +--conf 'spark.excludeOnFailure.stage.maxFailedTasksPerExecutor=1' \ +--conf 'spark.excludeOnFailure.stage.maxFailedExecutorsPerNode=1' \ +--conf 'spark.excludeOnFailure.application.maxFailedTasksPerExecutor=1' \ +--conf 'spark.excludeOnFailure.application.maxFailedExecutorsPerNode=1' \ +--conf 'spark.python.worker.reuse=true' \ +--conf 'spark.sql.optimizer.maxIterations=100' \ +--conf 'spark.files.overwrite=true' \ +--py-files 'dist/replay_rec-0.10.0-py3-none-any.whl,experiments/experiment_utils.py,/SLAMA/dist/SparkLightAutoML_DEV-0.3.2-py3-none-any.whl' \ +--files '/SLAMA/tabular_config.yml' \ +--num-executors ${EXECUTOR_INSTANCES} \ +--jars 'scala/target/scala-2.12/replay_2.12-0.1.jar,/SLAMA/jars/spark-lightautoml_2.12-0.1.1.jar' \ +--name ${EXPERIMENT} \ +"${SCRIPT}" + +# launch example: +# poetry build && ./experiments/submit_two_stage.sh experiments/run_two_stage.py + +#--conf 'spark.yarn.am.waitTime=1500000' \ \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index f02f14e13..c1acb83d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,8 @@ seaborn = "*" pyarrow = "*" implicit = ">=0.5" pytorch-ranger = "^0.1.1" +nmslib = "*" +hnswlib = "*" [tool.poetry.dev-dependencies] # dev only diff --git a/replay/ann/__init__.py b/replay/ann/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/replay/ann/ann_mixin.py b/replay/ann/ann_mixin.py new file mode 100644 index 000000000..50e63ba3e --- /dev/null +++ b/replay/ann/ann_mixin.py @@ -0,0 +1,195 @@ +import uuid +from abc import abstractmethod +from functools import cached_property +from typing import Optional, Dict, Union, Any + +from pyspark.sql import DataFrame +from pyspark.sql import functions as sf + +from replay.models.base_rec import BaseRecommender + + +class ANNMixin(BaseRecommender): + """ + This class overrides the `_fit_wrap` and `_inner_predict_wrap` methods of the base class, + adding an index construction in the `_fit_wrap` step and an index inference in the `_inner_predict_wrap` step. + """ + + @cached_property + def _spark_index_file_uid(self) -> str: + """ + Cached property that returns the uuid for the index file name. 
+ The unique name is needed to store the index file in `SparkFiles` without conflicts with other index files. + """ + return uuid.uuid4().hex[-12:] + + @property + @abstractmethod + def _use_ann(self) -> bool: + """ + Property that determines whether the ANN (index) is used. + """ + ... + + @abstractmethod + def _get_vectors_to_build_ann(self, log: DataFrame) -> DataFrame: + ... + + @abstractmethod + def _get_ann_build_params(self, log: DataFrame) -> Dict[str, Any]: + ... + + def _fit_wrap( + self, + log: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + ) -> None: + super()._fit_wrap(log, user_features, item_features) + + if self._use_ann: + vectors = self._get_vectors_to_build_ann(log) + ann_params = self._get_ann_build_params(log) + self._build_ann_index(vectors, **ann_params) + + @abstractmethod + def _get_vectors_to_infer_ann_inner( + self, log: DataFrame, users: DataFrame + ) -> DataFrame: + ... + + def _get_vectors_to_infer_ann( + self, log: DataFrame, users: DataFrame, filter_seen_items: bool + ) -> DataFrame: + + users = self._get_vectors_to_infer_ann_inner(log, users) + + # here we add `seen_item_idxs` to filter the viewed items in UDFs (see infer_index) + if filter_seen_items: + user_to_max_items = log.groupBy("user_idx").agg( + sf.count("item_idx").alias("num_items"), + sf.collect_set("item_idx").alias("seen_item_idxs"), + ) + users = users.join(user_to_max_items, on="user_idx") + + return users + + @abstractmethod + def _get_ann_infer_params(self) -> Dict[str, Any]: + ... + + @abstractmethod + def _build_ann_index( + self, + vectors: DataFrame, + features_col: str, + params: Dict[str, Union[int, str]], + dim: int = None, + num_elements: int = None, + id_col: Optional[str] = None, + index_type: str = None, + items_count: Optional[int] = None, + ) -> None: + ... + + @abstractmethod + def _infer_ann_index( + self, + vectors: DataFrame, + features_col: str, + params: Dict[str, Union[int, str]], + k: int, + filter_seen_items: bool, + index_dim: str = None, + index_type: str = None, + log: DataFrame = None, + ) -> DataFrame: + ... + + def _inner_predict_wrap( + self, + log: DataFrame, + k: int, + users: DataFrame, + items: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + filter_seen_items: bool = True, + ) -> DataFrame: + + if self._use_ann: + vectors = self._get_vectors_to_infer_ann( + log, users, filter_seen_items + ) + ann_params = self._get_ann_infer_params() + return self._infer_ann_index( + vectors, + k=k, + filter_seen_items=filter_seen_items, + log=log, + **ann_params, + ) + else: + return self._predict( + log, + k, + users, + items, + user_features, + item_features, + filter_seen_items, + ) + + def _unpack_infer_struct(self, inference_result: DataFrame) -> DataFrame: + """Transforms input dataframe. + Unpacks and explodes arrays from `neighbours` struct. 
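+        The negated distance from the index is used as ``relevance`` so that smaller
+        distances produce higher relevance values.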
+ + >>> inference_result.printSchema() + root + |-- user_idx: integer (nullable = true) + |-- neighbours: struct (nullable = true) + | |-- item_idx: array (nullable = true) + | | |-- element: integer (containsNull = true) + | |-- distance: array (nullable = true) + | | |-- element: double (containsNull = true) + >>> self._unpack_infer_struct(inference_result).printSchema() + root + |-- user_idx: integer (nullable = true) + |-- item_idx: integer (nullable = true) + |-- relevance: double (nullable = true) + + Args: + inference_result: output of infer_index UDF + """ + res = inference_result.select( + "user_idx", + sf.explode( + sf.arrays_zip("neighbours.item_idx", "neighbours.distance") + ).alias("zip_exp"), + ) + + # Fix arrays_zip random behavior. It can return zip_exp.0 or zip_exp.item_idx in different machines + fields = res.schema["zip_exp"].jsonValue()["type"]["fields"] + item_idx_field_name: str = fields[0]["name"] + distance_field_name: str = fields[1]["name"] + + res = res.select( + "user_idx", + sf.col(f"zip_exp.{item_idx_field_name}").alias("item_idx"), + (sf.lit(-1.0) * sf.col(f"zip_exp.{distance_field_name}")).alias( + "relevance" + ), + ) + return res + + def _filter_seen( + self, recs: DataFrame, log: DataFrame, k: int, users: DataFrame + ): + """ + Overridden _filter_seen method from base class. + Filtering is not needed for ann methods, because the data is already filtered in udf. + """ + if self._use_ann: + return recs + + return super()._filter_seen(recs, log, k, users) diff --git a/replay/ann/utils.py b/replay/ann/utils.py new file mode 100644 index 000000000..6060bccf5 --- /dev/null +++ b/replay/ann/utils.py @@ -0,0 +1,72 @@ +import os +import tempfile +from typing import Optional, Callable + +from pyarrow import fs + +from replay.utils import FileSystem, FileInfo + + +def save_index_to_destination_fs( + sparse: bool, + save_index: Callable[[str], None], + target: FileInfo, +): + + if target.filesystem == FileSystem.HDFS: + with tempfile.TemporaryDirectory() as temp_dir: + temp_file_path = os.path.join(temp_dir, "index") + destination_filesystem = fs.HadoopFileSystem.from_uri( + target.hdfs_uri + ) + + save_index(temp_file_path) + + # here we copy index files from local disk to hdfs + # if index is sparse then we need to copy two files: "index" and "index.dat" + index_file_paths = [temp_file_path] + destination_paths = [target.path] + if sparse: + index_file_paths.append(temp_file_path + ".dat") + destination_paths.append(target.path + ".dat") + for index_file_path, destination_path in zip( + index_file_paths, destination_paths + ): + fs.copy_files( + "file://" + index_file_path, + destination_path, + destination_filesystem=destination_filesystem, + ) + # param use_threads=True (?) 
+ else: + save_index(target.path) + + +def load_index_from_source_fs( + sparse: bool, + load_index: Callable[[str], None], + source: FileInfo, +): + if source.filesystem == FileSystem.HDFS: + with tempfile.TemporaryDirectory() as temp_dir: + temp_file_path = os.path.join(temp_dir, "index") + source_filesystem = fs.HadoopFileSystem.from_uri(source.hdfs_uri) + + # here we copy index files from hdfs to local disk + # if index is sparse then we need to copy two files: "index" and "index.dat" + index_file_paths = [source.path] + destination_paths = [temp_file_path] + if sparse: + index_file_paths.append(source.path + ".dat") + destination_paths.append(temp_file_path + ".dat") + for index_file_path, destination_path in zip( + index_file_paths, destination_paths + ): + fs.copy_files( + index_file_path, + "file://" + destination_path, + source_filesystem=source_filesystem, + ) + load_index(temp_file_path) + elif source.filesystem == FileSystem.LOCAL: + load_index(source.path) diff --git a/replay/history_based_fp.py b/replay/history_based_fp.py index f8e3280a1..28f6e7c5f 100644 --- a/replay/history_based_fp.py +++ b/replay/history_based_fp.py @@ -7,25 +7,43 @@ ``HistoryBasedFeaturesProcessor`` applies LogStatFeaturesProcessor and ConditionalPopularityProcessor as a pipeline. """ - +import os +import pickle from typing import Dict, Optional, List import pyspark.sql.functions as sf from datetime import datetime -from pyspark.sql import DataFrame -from pyspark.sql.types import TimestampType + +from pyarrow import fs +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql.types import TimestampType, StructType, StructField, StringType from replay.utils import ( join_or_return, join_with_col_renaming, - unpersist_if_exists, + unpersist_if_exists, get_filesystem, create_folder, AbleToSaveAndLoad, do_path_exists, save_transformer, + load_transformer, ) -class EmptyFeatureProcessor: +class EmptyFeatureProcessor(AbleToSaveAndLoad): """Do not perform any transformations on the dataframe""" + @classmethod + def load(cls, path: str, spark: Optional[SparkSession] = None): + spark = spark or cls._get_spark_session() + row = spark.read.parquet(path).first().asDict() + cls._validate_classname(row["classname"]) + return EmptyFeatureProcessor() + + def save(self, path: str, overwrite: bool = False, spark: Optional[SparkSession] = None): + spark = spark or self._get_spark_session() + spark.createDataFrame([{"data": '', "classname": self.get_classname()}]).write.parquet( + path, + mode='overwrite' if overwrite else 'error' + ) + def fit(self, log: DataFrame, features: Optional[DataFrame]) -> None: """ :param log: input DataFrame ``[user_idx, item_idx, timestamp, relevance]`` @@ -68,6 +86,28 @@ class LogStatFeaturesProcessor(EmptyFeatureProcessor): user_log_features: Optional[DataFrame] = None item_log_features: Optional[DataFrame] = None + @classmethod + def load(cls, path: str, spark: Optional[SparkSession] = None): + spark = spark or cls._get_spark_session() + + row = spark.read.parquet(os.path.join(path, "data.parquet")).first().asDict() + + if row["has_user_log_features"]: + user_log_features = spark.read.parquet(os.path.join(path, "user_log_features.parquet")) + else: + user_log_features = None + + if row["has_item_log_features"]: + item_log_features = spark.read.parquet(os.path.join(path, "item_log_features.parquet")) + else: + item_log_features = None + + processor = pickle.loads(row["data"]) + processor.user_log_features = user_log_features + processor.item_log_features = item_log_features + + 
return processor + def _create_log_aggregates(self, agg_col: str = "user_idx") -> List: """ Create features based on relevance type @@ -345,6 +385,35 @@ def transform(self, log: DataFrame) -> DataFrame: return joined + def save(self, path: str, overwrite: bool = False, spark: Optional[SparkSession] = None): + create_folder(path, delete_if_exists=overwrite) + + if self.user_log_features is not None: + self.user_log_features.write.parquet(os.path.join(path, "user_log_features.parquet")) + + if self.item_log_features is not None: + self.item_log_features.write.parquet(os.path.join(path, "item_log_features.parquet")) + + spark = spark or self._get_spark_session() + + user_log_features = self.user_log_features + item_log_features = self.item_log_features + self.user_log_features = None + self.item_log_features = None + + data = pickle.dumps(self) + self.user_log_features = user_log_features + self.item_log_features = item_log_features + + df = spark.createDataFrame([{ + "data": data, + "classname": self.get_classname(), + "has_user_log_features": self.user_log_features is not None, + "has_item_log_features": self.item_log_features is not None + }]) + + df.write.parquet(os.path.join(path, "data.parquet")) + def __del__(self): unpersist_if_exists(self.user_log_features) unpersist_if_exists(self.item_log_features) @@ -360,6 +429,25 @@ class ConditionalPopularityProcessor(EmptyFeatureProcessor): conditional_pop_dict: Optional[Dict[str, DataFrame]] entity_name: str + @classmethod + def load(cls, path: str, spark: Optional[SparkSession] = None): + spark = spark or cls._get_spark_session() + + row = spark.read.parquet(os.path.join(path, "data.parquet")).first().asDict() + + if row["has_conditional_pop_dict"]: + dfs_folder = os.path.join(path, "conditional_pop_dict") + conditional_pop_dict = dict() + for i, key in enumerate(row["key_order"]): + conditional_pop_dict[key] = spark.read.parquet(os.path.join(dfs_folder, f"{i}")) + else: + conditional_pop_dict = None + + transformer = pickle.loads(row["data"]) + transformer.conditional_pop_dict = conditional_pop_dict + + return transformer + def __init__( self, cat_features_list: List, @@ -445,13 +533,39 @@ def transform(self, log: DataFrame) -> DataFrame: joined = joined.fillna({f"{self.entity_name[0]}_pop_by_{key}": 0}) return joined + def save(self, path: str, overwrite: bool = False, spark: Optional[SparkSession] = None): + spark = spark or self._get_spark_session() + create_folder(path, delete_if_exists=overwrite) + + conditional_pop_dict = self.conditional_pop_dict + self.conditional_pop_dict = None + data = pickle.dumps(self) + self.conditional_pop_dict = conditional_pop_dict + + if self.conditional_pop_dict is not None: + dfs_folder = os.path.join(path, "conditional_pop_dict") + create_folder(dfs_folder) + key_order = [] + for i, (key, df) in enumerate(self.conditional_pop_dict.items()): + key_order.append(key) + df.write.parquet(os.path.join(dfs_folder, f"{i}")) + else: + key_order = [] + + spark.createDataFrame([{ + "classname": self.get_classname(), + "data": data, + "key_order": key_order, + "has_conditional_pop_dict": self.conditional_pop_dict is not None + }]).write.parquet(os.path.join(path, "data.parquet")) + def __del__(self): for df in self.conditional_pop_dict.values(): unpersist_if_exists(df) # pylint: disable=too-many-instance-attributes, too-many-arguments -class HistoryBasedFeaturesProcessor: +class HistoryBasedFeaturesProcessor(AbleToSaveAndLoad): """ Calculate user and item features based on interactions history (log). 
calculated features includes numbers of interactions, rating and timestamp distribution features @@ -465,6 +579,27 @@ class HistoryBasedFeaturesProcessor: user_cond_pop_proc = EmptyFeatureProcessor() item_cond_pop_proc = EmptyFeatureProcessor() + @classmethod + def load(cls, path: str, spark: Optional[SparkSession] = None): + spark = spark or cls._get_spark_session() + + row = spark.read.parquet(os.path.join(path, "data.parquet")).first().asDict() + + log_processor = load_transformer(os.path.join(path, "log_processor")) \ + if row["has_log_processor"] else None + user_cond_pop_proc = load_transformer(os.path.join(path, "user_cond_pop_proc")) \ + if row["has_user_cond_pop_proc"] else None + + item_cond_pop_proc = load_transformer(os.path.join(path, "item_cond_pop_proc")) \ + if row["has_item_cond_pop_proc"] else None + + transformer = pickle.loads(row["data"]) + transformer.log_processor = log_processor + transformer.user_cond_pop_proc = user_cond_pop_proc + transformer.item_cond_pop_proc = item_cond_pop_proc + + return transformer + def __init__( self, use_log_features: bool = True, @@ -533,3 +668,35 @@ def transform( joined = self.item_cond_pop_proc.transform(joined) return joined + + def save(self, path: str, overwrite: bool = False, spark: Optional[SparkSession] = None): + spark = spark or self._get_spark_session() + create_folder(path, delete_if_exists=overwrite) + + log_processor = self.log_processor + user_cond_pop_proc = self.user_cond_pop_proc + item_cond_pop_proc = self.item_cond_pop_proc + self.log_processor = None + self.user_cond_pop_proc = None + self.item_cond_pop_proc = None + data = pickle.dumps(self) + self.log_processor = log_processor + self.user_cond_pop_proc = user_cond_pop_proc + self.item_cond_pop_proc = item_cond_pop_proc + + if self.log_processor is not None: + save_transformer(self.log_processor, os.path.join(path, "log_processor")) + + if self.user_cond_pop_proc is not None: + save_transformer(self.user_cond_pop_proc, os.path.join(path, "user_cond_pop_proc")) + + if self.item_cond_pop_proc is not None: + save_transformer(self.item_cond_pop_proc, os.path.join(path, "item_cond_pop_proc")) + + spark.createDataFrame([{ + "classname": self.get_classname(), + "data": data, + "has_log_processor": self.log_processor is not None, + "has_user_cond_pop_proc": self.user_cond_pop_proc is not None, + "has_item_cond_pop_proc": self.item_cond_pop_proc is not None + }]).write.parquet(os.path.join(path, "data.parquet")) diff --git a/replay/model_handler.py b/replay/model_handler.py index de5860cf3..5d5f54817 100644 --- a/replay/model_handler.py +++ b/replay/model_handler.py @@ -1,10 +1,8 @@ # pylint: disable=wildcard-import,invalid-name,unused-wildcard-import,unspecified-encoding -import os import json +import os import shutil from inspect import getfullargspec - -import joblib from os.path import exists, join import pyspark.sql.types as st @@ -15,6 +13,7 @@ from replay.models.base_rec import BaseRecommender from replay.session_handler import State from replay.splitters import * +from replay.utils import save_picklable_to_parquet, load_pickled_from_parquet def prepare_dir(path): @@ -26,7 +25,7 @@ def prepare_dir(path): os.makedirs(path) -def save(model: BaseRecommender, path: str): +def save(model: BaseRecommender, path: str, overwrite: bool = False): """ Save fitted model to disk as a folder @@ -34,26 +33,32 @@ def save(model: BaseRecommender, path: str): :param path: destination where model files will be stored :return: """ - prepare_dir(path) + spark = State().session + + fs = 
spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration()) + if not overwrite: + is_exists = fs.exists(spark._jvm.org.apache.hadoop.fs.Path(path)) + if is_exists: + raise FileExistsError(f"Path '{path}' already exists. Mode is 'overwrite = False'.") + + fs.mkdirs(spark._jvm.org.apache.hadoop.fs.Path(join(path, "model"))) model._save_model(join(path, "model")) init_args = model._init_args init_args["_model_name"] = str(model) - with open(join(path, "init_args.json"), "w") as json_file: - json.dump(init_args, json_file) + sc = spark.sparkContext + df = spark.read.json(sc.parallelize([json.dumps(init_args)])) + df.coalesce(1).write.mode("overwrite").json(join(path, "init_args.json")) dataframes = model._dataframes df_path = join(path, "dataframes") - os.makedirs(df_path) for name, df in dataframes.items(): - df.write.parquet(join(df_path, name)) + if df is not None: + df.write.parquet(join(df_path, name)) + model.fit_users.write.mode("overwrite").parquet(join(df_path, "fit_users")) + model.fit_items.write.mode("overwrite").parquet(join(df_path, "fit_items")) - if hasattr(model, "fit_users"): - model.fit_users.write.parquet(join(df_path, "fit_users")) - if hasattr(model, "fit_items"): - model.fit_items.write.parquet(join(df_path, "fit_items")) - if hasattr(model, "study"): - joblib.dump(model.study, join(path, "study")) + save_picklable_to_parquet(model.study, join(path, "study")) def load(path: str) -> BaseRecommender: @@ -64,8 +69,7 @@ def load(path: str) -> BaseRecommender: :return: Restored trained model """ spark = State().session - with open(join(path, "init_args.json"), "r") as json_file: - args = json.load(json_file) + args = spark.read.json(join(path, "init_args.json")).first().asDict(recursive=True) name = args["_model_name"] del args["_model_name"] @@ -85,39 +89,46 @@ def load(path: str) -> BaseRecommender: model.arg = extra_args[arg] df_path = join(path, "dataframes") - dataframes = os.listdir(df_path) - for name in dataframes: - df = spark.read.parquet(join(df_path, name)) - setattr(model, name, df) + fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration()) + statuses = fs.listStatus(spark._jvm.org.apache.hadoop.fs.Path(df_path)) + dataframes_paths = [str(f.getPath()) for f in statuses] + for dataframe_path in dataframes_paths: + df = spark.read.parquet(dataframe_path) + attr_name = dataframe_path.split("/")[-1] + setattr(model, attr_name, df) model._load_model(join(path, "model")) - model.study = ( - joblib.load(join(path, "study")) - if os.path.exists(join(path, "study")) - else None - ) + model.study = load_pickled_from_parquet(join(path, "study")) + return model -def save_indexer(indexer: Indexer, path: str): +def save_indexer(indexer: Indexer, path: str, overwrite: bool = False): """ Save fitted indexer to disk as a folder :param indexer: Trained indexer :param path: destination where indexer files will be stored """ - prepare_dir(path) + spark = State().session + + if not overwrite: + fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration()) + is_exists = fs.exists(spark._jvm.org.apache.hadoop.fs.Path(path)) + if is_exists: + raise FileExistsError(f"Path '{path}' already exists. 
Mode is 'overwrite = False'.") init_args = indexer._init_args init_args["user_type"] = str(indexer.user_type) init_args["item_type"] = str(indexer.item_type) - with open(join(path, "init_args.json"), "w") as json_file: - json.dump(init_args, json_file) + sc = spark.sparkContext + df = spark.read.json(sc.parallelize([json.dumps(init_args)])) + df.coalesce(1).write.mode("overwrite").json(join(path, "init_args.json")) - indexer.user_indexer.save(join(path, "user_indexer")) - indexer.item_indexer.save(join(path, "item_indexer")) - indexer.inv_user_indexer.save(join(path, "inv_user_indexer")) - indexer.inv_item_indexer.save(join(path, "inv_item_indexer")) + indexer.user_indexer.write().overwrite().save(join(path, "user_indexer")) + indexer.item_indexer.write().overwrite().save(join(path, "item_indexer")) + indexer.inv_user_indexer.write().overwrite().save(join(path, "inv_user_indexer")) + indexer.inv_item_indexer.write().overwrite().save(join(path, "inv_item_indexer")) def load_indexer(path: str) -> Indexer: @@ -127,9 +138,8 @@ def load_indexer(path: str) -> Indexer: :param path: path to folder :return: restored Indexer """ - State() - with open(join(path, "init_args.json"), "r") as json_file: - args = json.load(json_file) + spark = State().session + args = spark.read.json(join(path, "init_args.json")).first().asDict() user_type = args["user_type"] del args["user_type"] @@ -153,18 +163,22 @@ def load_indexer(path: str) -> Indexer: return indexer -def save_splitter(splitter: Splitter, path: str): +def save_splitter(splitter: Splitter, path: str, overwrite: bool = False): """ Save initialized splitter :param splitter: Initialized splitter :param path: destination where splitter files will be stored """ - prepare_dir(path) init_args = splitter._init_args init_args["_splitter_name"] = str(splitter) - with open(join(path, "init_args.json"), "w") as json_file: - json.dump(init_args, json_file) + spark = State().session + sc = spark.sparkContext + df = spark.read.json(sc.parallelize([json.dumps(init_args)])) + if overwrite: + df.coalesce(1).write.mode("overwrite").json(join(path, "init_args.json")) + else: + df.coalesce(1).write.json(join(path, "init_args.json")) def load_splitter(path: str) -> Splitter: @@ -174,9 +188,8 @@ def load_splitter(path: str) -> Splitter: :param path: path to folder :return: restored Splitter """ - State() - with open(join(path, "init_args.json"), "r") as json_file: - args = json.load(json_file) + spark = State().session + args = spark.read.json(join(path, "init_args.json")).first().asDict() name = args["_splitter_name"] del args["_splitter_name"] splitter = globals()[name] diff --git a/replay/models/als.py b/replay/models/als.py index 3601e18c4..f5051bb5d 100644 --- a/replay/models/als.py +++ b/replay/models/als.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple +from typing import Optional, Tuple, Dict, Any import pyspark.sql.functions as sf @@ -7,14 +7,46 @@ from pyspark.sql.types import DoubleType from replay.models.base_rec import Recommender, ItemVectorModel +from replay.models.hnswlib import HnswlibMixin from replay.utils import list_to_vector_udf -class ALSWrap(Recommender, ItemVectorModel): +class ALSWrap(Recommender, ItemVectorModel, HnswlibMixin): """Wrapper for `Spark ALS `_. 
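A hedged construction sketch for the new ALSWrap arguments introduced in this patch; the values are illustrative only, and per the `_fit` change below both block counts fall back to the log's partition count when left as None:

from replay.models import ALSWrap

model = ALSWrap(
    rank=100,
    seed=42,
    num_item_blocks=16,   # ALS block counts; None -> number of partitions of the fitted log
    num_user_blocks=16,
    hnswlib_params=None,  # pass an hnswlib config dict here to enable the approximate inference path
)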
""" + def _get_ann_infer_params(self) -> Dict[str, Any]: + return { + "features_col": "user_factors", + "params": self._hnswlib_params, + "index_dim": self.rank, + } + + def _get_vectors_to_infer_ann_inner(self, log: DataFrame, users: DataFrame) -> DataFrame: + user_vectors, _ = self.get_features(users) + return user_vectors + + def _get_ann_build_params(self, log: DataFrame): + self.num_elements = log.select("item_idx").distinct().count() + return { + "features_col": "item_factors", + "params": self._hnswlib_params, + "dim": self.rank, + "num_elements": self.num_elements, + "id_col": "item_idx", + } + + def _get_vectors_to_build_ann(self, log: DataFrame) -> DataFrame: + item_vectors, _ = self.get_features( + log.select("item_idx").distinct() + ) + return item_vectors + + @property + def _use_ann(self) -> bool: + return self._hnswlib_params is not None + _seed: Optional[int] = None _search_space = { "rank": {"type": "loguniform_int", "args": [8, 256]}, @@ -25,6 +57,9 @@ def __init__( rank: int = 10, implicit_prefs: bool = True, seed: Optional[int] = None, + num_item_blocks: Optional[int] = None, + num_user_blocks: Optional[int] = None, + hnswlib_params: Optional[dict] = None, ): """ :param rank: hidden dimension for the approximate matrix @@ -34,6 +69,9 @@ def __init__( self.rank = rank self.implicit_prefs = implicit_prefs self._seed = seed + self._num_item_blocks = num_item_blocks + self._num_user_blocks = num_user_blocks + self._hnswlib_params = hnswlib_params @property def _init_args(self): @@ -41,24 +79,38 @@ def _init_args(self): "rank": self.rank, "implicit_prefs": self.implicit_prefs, "seed": self._seed, + "hnswlib_params": self._hnswlib_params } def _save_model(self, path: str): self.model.write().overwrite().save(path) + if self._hnswlib_params: + self._save_hnswlib_index(path) + def _load_model(self, path: str): self.model = ALSModel.load(path) self.model.itemFactors.cache() self.model.userFactors.cache() + if self._hnswlib_params: + self._load_hnswlib_index(path) + def _fit( self, log: DataFrame, user_features: Optional[DataFrame] = None, item_features: Optional[DataFrame] = None, ) -> None: + if self._num_item_blocks is None: + self._num_item_blocks = log.rdd.getNumPartitions() + if self._num_user_blocks is None: + self._num_user_blocks = log.rdd.getNumPartitions() + self.model = ALS( rank=self.rank, + numItemBlocks=self._num_item_blocks, + numUserBlocks=self._num_user_blocks, userCol="user_idx", itemCol="item_idx", ratingCol="relevance", diff --git a/replay/models/base_rec.py b/replay/models/base_rec.py index 4d07f5136..1892a3a6f 100644 --- a/replay/models/base_rec.py +++ b/replay/models/base_rec.py @@ -545,7 +545,7 @@ def _predict_wrap( message = f"k = {k} > number of items = {num_items}" self.logger.debug(message) - recs = self._predict( + recs = self._inner_predict_wrap( log, k, users, @@ -631,6 +631,48 @@ def _predict( ``[user_idx, item_idx, relevance]`` """ + def _inner_predict_wrap( + self, + log: DataFrame, + k: int, + users: DataFrame, + items: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + filter_seen_items: bool = True, + ) -> DataFrame: + """ + Inner method that wrap _predict method. Can be overwritten. 
+ + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param k: number of recommendations for each user + :param users: users to create recommendations for + dataframe containing ``[user_idx]`` or ``array-like``; + if ``None``, recommend to all users from ``log`` + :param items: candidate items for recommendations + dataframe containing ``[item_idx]`` or ``array-like``; + if ``None``, take all items from ``log``. + If it contains new items, ``relevance`` for them will be ``0``. + :param user_features: user features + ``[user_idx , timestamp]`` + feature columns + :param item_features: item features + ``[item_idx , timestamp]`` + feature columns + :param filter_seen_items: flag to remove seen items from recommendations based on ``log``. + :return: recommendation dataframe + ``[user_idx, item_idx, relevance]`` + """ + + return self._predict( + log, + k, + users, + items, + user_features, + item_features, + filter_seen_items, + ) + def _get_fit_counts(self, entity: str) -> int: if not hasattr(self, f"_num_{entity}s"): setattr( @@ -1406,7 +1448,10 @@ def predict_pairs( ) -class NeighbourRec(Recommender, ABC): +from replay.models.nmslib_hnsw import NmslibHnswMixin + + +class NeighbourRec(Recommender, NmslibHnswMixin, ABC): """Base class that requires log at prediction time""" similarity: Optional[DataFrame] @@ -1588,6 +1633,31 @@ def _get_nearest_items( "similarity" if metric is None else metric, ) + def _get_ann_build_params(self, log: DataFrame) -> Dict[str, Any]: + items_count = log.select(sf.max("item_idx")).first()[0] + 1 + return { + "features_col": None, + "params": self._nmslib_hnsw_params, + "index_type": "sparse", + "items_count": items_count, + } + + def _get_vectors_to_build_ann(self, log: DataFrame) -> DataFrame: + similarity_df = self.similarity.select( + "similarity", "item_idx_one", "item_idx_two" + ) + return similarity_df + + def _get_vectors_to_infer_ann_inner( + self, log: DataFrame, users: DataFrame + ) -> DataFrame: + + user_vectors = ( + log.groupBy("user_idx").agg( + sf.collect_list("item_idx").alias("vector_items"), + sf.collect_list("relevance").alias("vector_relevances")) + ) + return user_vectors class NonPersonalizedRecommender(Recommender, ABC): """Base class for non-personalized recommenders with popularity statistics.""" diff --git a/replay/models/base_rec.py.orig b/replay/models/base_rec.py.orig new file mode 100644 index 000000000..a1adf8442 --- /dev/null +++ b/replay/models/base_rec.py.orig @@ -0,0 +1,1962 @@ +# pylint: disable=too-many-lines +""" +Base abstract classes: +- BaseRecommender - the simplest base class +- Recommender - base class for models that fit on interaction log +- HybridRecommender - base class for models that accept user or item features +- UserRecommender - base class that accepts only user features, but not item features +- NeighbourRec - base class that requires log at prediction time +- ItemVectorModel - class for models which provides items' vectors. + Implements similar items search. 
+- NonPersonalizedRecommender - base class for non-personalized recommenders + with popularity statistics +""" +import collections +import pickle + +import joblib +import os +import logging +from abc import ABC, abstractmethod +from copy import deepcopy, copy +from os.path import join +from typing import ( + Any, + Dict, + Iterable, + List, + Optional, + Union, + Sequence, + Set, + Tuple, TypeVar, +) + +import numpy as np +import pandas as pd +from numpy.random import default_rng +from optuna import create_study +from optuna.samplers import TPESampler +from pyspark.sql import DataFrame, Window +from pyspark.sql import functions as sf +from pyspark.sql.column import Column + +from replay.constants import REC_SCHEMA +from replay.metrics import Metric, NDCG +from replay.optuna_objective import SplitData, MainObjective +from replay.session_handler import State +from replay.utils import ( + cache_temp_view, + convert2spark, + cosine_similarity, + drop_temp_view, + get_top_k, + vector_euclidean_distance_similarity, + vector_dot, +) + + +# pylint: disable=too-many-instance-attributes +class BaseRecommender(ABC): + """Base recommender""" + + model: Any + _logger: Optional[logging.Logger] = None + can_predict_cold_users: bool = False + can_predict_cold_items: bool = False + can_predict_item_to_item: bool = False + _search_space: Optional[ + Dict[str, Union[str, Sequence[Union[str, int, float]]]] + ] = None + _objective = MainObjective + study = None + fit_users: DataFrame + fit_items: DataFrame + _num_users: int + _num_items: int + _user_dim_size: int + _item_dim_size: int + cached_dfs: Optional[Set] = None + + def copy(self): + return copy(self) + + # pylint: disable=too-many-arguments, too-many-locals, no-member + def optimize( + self, + train: DataFrame, + test: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + param_borders: Optional[Dict[str, List[Any]]] = None, + criterion: Metric = NDCG(), + k: int = 10, + budget: int = 10, + new_study: bool = True, + ) -> Optional[Dict[str, Any]]: + """ + Searches best parameters with optuna. + + :param train: train data + :param test: test data + :param user_features: user features + :param item_features: item features + :param param_borders: a dictionary with search borders, where + key is the parameter name and value is the range of possible values + ``{param: [low, high]}``. In case of categorical parameters it is + all possible values: ``{cat_param: [cat_1, cat_2, cat_3]}``. 
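A hedged usage sketch of the `param_borders` format described here; `train` and `test` are assumed to be prepared interaction DataFrames with ``[user_idx, item_idx, timestamp, relevance]`` columns and are not defined by this snippet.

from replay.metrics import NDCG
from replay.models import ALSWrap

model = ALSWrap(seed=42)
best_params = model.optimize(
    train,                              # assumed train split
    test,                               # assumed test split
    param_borders={"rank": [32, 128]},  # numerical border: [low, high]
    criterion=NDCG(),                   # metric to maximize
    k=10,                               # recommendation list length
    budget=20,                          # number of optuna trials
)
print(best_params)                      # dict with the best found values, e.g. {'rank': ...}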
+ :param criterion: metric to use for optimization + :param k: recommendation list length + :param budget: number of points to try + :param new_study: keep searching with previous study or start a new study + :return: dictionary with best parameters + """ + if self._search_space is None: + self.logger.warning( + "%s has no hyper parameters to optimize", str(self) + ) + return None + + if self.study is None or new_study: + self.study = create_study( + direction="maximize", sampler=TPESampler() + ) + + search_space = self._prepare_param_borders(param_borders) + if ( + self._init_params_in_search_space(search_space) + and not self._params_tried() + ): + self.study.enqueue_trial(self._init_args) + + split_data = self._prepare_split_data( + train, test, user_features, item_features + ) + objective = self._objective( + search_space=search_space, + split_data=split_data, + recommender=self, + criterion=criterion, + k=k, + ) + + self.study.optimize(objective, budget) + best_params = self.study.best_params + self.set_params(**best_params) + return best_params + + @property + @abstractmethod + def _init_args(self): + """ + Dictionary of the model attributes passed during model initialization. + Used for model saving and loading + """ + + def _init_params_in_search_space(self, search_space): + """Check if model params are inside search space""" + params = self._init_args # pylint: disable=no-member + outside_search_space = {} + for param, value in params.items(): + if param not in search_space: + continue + borders = search_space[param]["args"] + param_type = search_space[param]["type"] + + extra_category = ( + param_type == "categorical" and value not in borders + ) + param_out_of_bounds = param_type != "categorical" and ( + value < borders[0] or value > borders[1] + ) + if extra_category or param_out_of_bounds: + outside_search_space[param] = { + "borders": borders, + "value": value, + } + + if outside_search_space: + self.logger.debug( + "Model is initialized with parameters outside the search space: %s." + "Initial parameters will not be evaluated during optimization." + "Change search spare with 'param_borders' argument if necessary", + outside_search_space, + ) + return False + else: + return True + + def _prepare_param_borders( + self, param_borders: Optional[Dict[str, List[Any]]] = None + ) -> Dict[str, Dict[str, List[Any]]]: + """ + Checks if param borders are valid and convert them to a search_space format + + :param param_borders: a dictionary with search grid, where + key is the parameter name and value is the range of possible values + ``{param: [low, high]}``. 
+ :return: + """ + search_space = deepcopy(self._search_space) + if param_borders is None: + return search_space + + for param, borders in param_borders.items(): + self._check_borders(param, borders) + search_space[param]["args"] = borders + + # Optuna trials should contain all searchable parameters + # to be able to correctly return best params + # If used didn't specify some params to be tested optuna still needs to suggest them + # This part makes sure this suggestion will be constant + args = self._init_args + missing_borders = { + param: args[param] + for param in search_space + if param not in param_borders + } + for param, value in missing_borders.items(): + if search_space[param]["type"] == "categorical": + search_space[param]["args"] = [value] + else: + search_space[param]["args"] = [value, value] + + return search_space + + def _check_borders(self, param, borders): + """Raise value error if param borders are not valid""" + if param not in self._search_space: + raise ValueError( + f"Hyper parameter {param} is not defined for {str(self)}" + ) + if not isinstance(borders, list): + raise ValueError(f"Parameter {param} borders are not a list") + if ( + self._search_space[param]["type"] != "categorical" + and len(borders) != 2 + ): + raise ValueError( + f""" + Hyper parameter {param} is numerical + but bounds are not in ([lower, upper]) format + """ + ) + + def _prepare_split_data( + self, + train: DataFrame, + test: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + ) -> SplitData: + """ + This method converts data to spark and packs it into a named tuple to pass into optuna. + + :param train: train data + :param test: test data + :param user_features: user features + :param item_features: item features + :return: packed PySpark DataFrames + """ + user_features_train, user_features_test = self._train_test_features( + train, test, user_features, "user_idx" + ) + item_features_train, item_features_test = self._train_test_features( + train, test, item_features, "item_idx" + ) + users = test.select("user_idx").distinct() + items = test.select("item_idx").distinct() + split_data = SplitData( + train, + test, + users, + items, + user_features_train, + user_features_test, + item_features_train, + item_features_test, + ) + return split_data + + @property + def _dataframes(self): + return {} + + def _save_model(self, path: str): + pass + + def _load_model(self, path: str): + pass + + @staticmethod + def _train_test_features( + train: DataFrame, + test: DataFrame, + features: Optional[DataFrame], + column: Union[str, Column], + ) -> Tuple[Optional[DataFrame], Optional[DataFrame]]: + """ + split dataframe with features into two dataframes representing + features for train and tests subset entities, defined by `column` + + :param train: spark dataframe with the train subset + :param test: spark dataframe with the train subset + :param features: spark dataframe with users'/items' features + :param column: column name to use as a key for join (e.g., user_idx or item_idx) + :return: features for train and test subsets + """ + if features is not None: + features_train = features.join( + train.select(column).distinct(), on=column + ) + features_test = features.join( + test.select(column).distinct(), on=column + ) + else: + features_train = None + features_test = None + return features_train, features_test + + def set_params(self, **params: Dict[str, Any]) -> None: + """ + Set model parameters + + :param params: dictionary param name - param value + 
:return: + """ + for param, value in params.items(): + setattr(self, param, value) + self._clear_cache() + + def __str__(self): + return type(self).__name__ + + def _fit_wrap( + self, + log: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + ) -> None: + """ + Wrapper for fit to allow for fewer arguments in a model. + + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param user_features: user features + ``[user_idx, timestamp]`` + feature columns + :param item_features: item features + ``[item_idx, timestamp]`` + feature columns + :return: + """ + self.logger.debug("Starting fit %s", type(self).__name__) + if user_features is None: + users = log.select("user_idx").distinct() + else: + users = ( + log.select("user_idx") + .union(user_features.select("user_idx")) + .distinct() + ) + if item_features is None: + items = log.select("item_idx").distinct() + else: + items = ( + log.select("item_idx") + .union(item_features.select("item_idx")) + .distinct() + ) + self.fit_users = sf.broadcast(users) + self.fit_items = sf.broadcast(items) + self._num_users = self.fit_users.count() + self._num_items = self.fit_items.count() + self._user_dim_size = ( + self.fit_users.agg({"user_idx": "max"}).collect()[0][0] + 1 + ) + self._item_dim_size = ( + self.fit_items.agg({"item_idx": "max"}).collect()[0][0] + 1 + ) + self._fit(log, user_features, item_features) + + @abstractmethod + def _fit( + self, + log: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + ) -> None: + """ + Inner method where model actually fits. + + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param user_features: user features + ``[user_idx, timestamp]`` + feature columns + :param item_features: item features + ``[item_idx, timestamp]`` + feature columns + :return: + """ + + def _cache_model_temp_view(self, df: DataFrame, df_name: str) -> None: + """ + Create Spark SQL temporary view for df, cache it and add temp view name to self.cached_dfs. + Temp view name is : "id__model__" + """ + full_name = f"id_{id(self)}_model_{str(self)}_{df_name}" + cache_temp_view(df, full_name) + + if self.cached_dfs is None: + self.cached_dfs = set() + self.cached_dfs.add(full_name) + + def _clear_model_temp_view(self, df_name: str) -> None: + """ + Uncache and drop Spark SQL temporary view and remove from self.cached_dfs + Temp view to replace will be constructed as + "id__model__" + """ + full_name = f"id_{id(self)}_model_{str(self)}_{df_name}" + drop_temp_view(full_name) + if self.cached_dfs is not None: + self.cached_dfs.discard(full_name) + + def _filter_seen( + self, recs: DataFrame, log: DataFrame, k: int, users: DataFrame + ): + """ + Filter seen items (presented in log) out of the users' recommendations. + For each user return from `k` to `k + number of seen by user` recommendations. 
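A standalone toy sketch of the filter-seen idea (not the exact implementation below): drop pairs already present in the log with an anti-join, then keep the top-k remaining rows per user by relevance.

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as sf

spark = SparkSession.builder.master("local[1]").getOrCreate()

log = spark.createDataFrame([(0, 1), (0, 2)], ["user_idx", "item_idx"])
recs = spark.createDataFrame(
    [(0, 1, 0.9), (0, 2, 0.8), (0, 3, 0.7), (0, 4, 0.6)],
    ["user_idx", "item_idx", "relevance"],
)

k = 2
unseen = recs.join(log, on=["user_idx", "item_idx"], how="anti")
top_k = unseen.withColumn(
    "rank",
    sf.row_number().over(
        Window.partitionBy("user_idx").orderBy(sf.col("relevance").desc())
    ),
).filter(sf.col("rank") <= k).drop("rank")

top_k.show()  # only items 3 and 4 survive for user 0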
+ """ + users_log = log.join(users, on="user_idx") + self._cache_model_temp_view(users_log, "filter_seen_users_log") + + # filter recommendations presented in interactions log + recs = recs.join( + users_log.withColumnRenamed("item_idx", "item") + .withColumnRenamed("user_idx", "user") + .select("user", "item"), + on=(sf.col("user_idx") == sf.col("user")) + & (sf.col("item_idx") == sf.col("item")), + how="anti", + ).drop("user", "item") + +<<<<<<< HEAD + # crop recommendations to first k + max_seen items for each user + recs = recs.withColumn( + "temp_rank", + sf.row_number().over( + Window.partitionBy("user_idx").orderBy( + sf.col("relevance").desc() + ) + ), + ).filter(sf.col("temp_rank") <= sf.lit(k)) +======= + # because relevances are already sorted, we can return the first k values + # for every user_idx + def get_top_k(iterator): + current_user_idx = None + n = 0 + for row in iterator: + if row.user_idx == current_user_idx and n <= k: + n += 1 + yield row + elif row.user_idx != current_user_idx: + current_user_idx = row.user_idx + n = 1 + yield row + + # recs = recs.rdd.mapPartitions(get_top_k).toDF(["user_idx", "item_idx", "relevance"]) + recs = recs.rdd.mapPartitions(get_top_k).toDF() +>>>>>>> 519a569 (passing tests) + + return recs + + # pylint: disable=too-many-arguments + def _predict_wrap( + self, + log: Optional[DataFrame], + k: int, + users: Optional[Union[DataFrame, Iterable]] = None, + items: Optional[Union[DataFrame, Iterable]] = None, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + filter_seen_items: bool = True, + recs_file_path: Optional[str] = None, + ) -> Optional[DataFrame]: + """ + Predict wrapper to allow for fewer parameters in models + + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param k: number of recommendations for each user + :param users: users to create recommendations for + dataframe containing ``[user_idx]`` or ``array-like``; + if ``None``, recommend to all users from ``log`` + :param items: candidate items for recommendations + dataframe containing ``[item_idx]`` or ``array-like``; + if ``None``, take all items from ``log``. + If it contains new items, ``relevance`` for them will be ``0``. + :param user_features: user features + ``[user_idx , timestamp]`` + feature columns + :param item_features: item features + ``[item_idx , timestamp]`` + feature columns + :param filter_seen_items: flag to remove seen items from recommendations based on ``log``. + :param recs_file_path: save recommendations at the given absolute path as parquet file. 
+ If None, cached and materialized recommendations dataframe will be returned + :return: cached recommendation dataframe with columns ``[user_idx, item_idx, relevance]`` + or None if `file_path` is provided + """ + self.logger.debug("Starting predict %s", type(self).__name__) + user_data = users or log or user_features or self.fit_users + users = self._get_ids(user_data, "user_idx") + users, log = self._filter_cold_for_predict(users, log, "user") + + item_data = items or self.fit_items + items = self._get_ids(item_data, "item_idx") + items, log = self._filter_cold_for_predict(items, log, "item") + + num_items = items.count() + if num_items < k: + message = f"k = {k} > number of items = {num_items}" + self.logger.debug(message) + + recs = self._inner_predict_wrap( + log, + k, + users, + items, + user_features, + item_features, + filter_seen_items, + ) + + if filter_seen_items and log: + recs = self._filter_seen(recs=recs, log=log, users=users, k=k) + + output = None + if recs_file_path is not None: + recs.write.parquet(path=recs_file_path, mode="overwrite") + else: + output = recs.cache() + output.count() + + self._clear_model_temp_view("filter_seen_users_log") + self._clear_model_temp_view("filter_seen_num_seen") + return output + + @staticmethod + def _get_ids( + log: Union[Iterable, DataFrame], + column: str, + ) -> DataFrame: + """ + Get unique values from ``array`` and put them into dataframe with column ``column``. + """ + spark = State().session + if isinstance(log, DataFrame): + unique = log.select(column).distinct() + elif isinstance(log, collections.abc.Iterable): + unique = spark.createDataFrame( + data=pd.DataFrame(pd.unique(list(log)), columns=[column]) + ) + else: + raise ValueError(f"Wrong type {type(log)}") + return unique + + def _filter_cold( + self, df: Optional[DataFrame], entity: str, suffix: str = "idx" + ) -> Tuple[int, Optional[DataFrame]]: + """ + Filter out new ids if the model cannot predict cold users/items. + Return number of new users/items and filtered dataframe. + """ + if getattr(self, f"can_predict_cold_{entity}s") or df is None: + return 0, df + + col_name = f"{entity}_{suffix}" + num_cold = ( + df.select(col_name) + .distinct() + .join(getattr(self, f"fit_{entity}s"), on=col_name, how="anti") + .count() + ) + if num_cold == 0: + return 0, df + + return num_cold, df.join( + getattr(self, f"fit_{entity}s"), on=col_name, how="inner" + ) + + def _filter_cold_for_predict( + self, + main_df: DataFrame, + log_df: DataFrame, + entity: str, + suffix: str = "idx", + ): + """ + Filter out cold entities (users/items) from the `main_df` and `log_df`. + Warn if cold entities are present in the `main_df`. + """ + num_new, main_df = self._filter_cold(main_df, entity, suffix) + if num_new > 0: + self.logger.info( + "%s model can't predict cold %ss, they will be ignored", + self, + entity, + ) + _, log_df = self._filter_cold(log_df, entity, suffix) + return main_df, log_df + + # pylint: disable=too-many-arguments + @abstractmethod + def _predict( + self, + log: DataFrame, + k: int, + users: DataFrame, + items: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + filter_seen_items: bool = True, + ) -> DataFrame: + """ + Inner method where model actually predicts. 
+ + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param k: number of recommendations for each user + :param users: users to create recommendations for + dataframe containing ``[user_idx]`` or ``array-like``; + if ``None``, recommend to all users from ``log`` + :param items: candidate items for recommendations + dataframe containing ``[item_idx]`` or ``array-like``; + if ``None``, take all items from ``log``. + If it contains new items, ``relevance`` for them will be ``0``. + :param user_features: user features + ``[user_idx , timestamp]`` + feature columns + :param item_features: item features + ``[item_idx , timestamp]`` + feature columns + :param filter_seen_items: flag to remove seen items from recommendations based on ``log``. + :return: recommendation dataframe + ``[user_idx, item_idx, relevance]`` + """ + + def _inner_predict_wrap( + self, + log: DataFrame, + k: int, + users: DataFrame, + items: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + filter_seen_items: bool = True, + ) -> DataFrame: + """ + Inner method that wrap _predict method. Can be overwritten. + + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param k: number of recommendations for each user + :param users: users to create recommendations for + dataframe containing ``[user_idx]`` or ``array-like``; + if ``None``, recommend to all users from ``log`` + :param items: candidate items for recommendations + dataframe containing ``[item_idx]`` or ``array-like``; + if ``None``, take all items from ``log``. + If it contains new items, ``relevance`` for them will be ``0``. + :param user_features: user features + ``[user_idx , timestamp]`` + feature columns + :param item_features: item features + ``[item_idx , timestamp]`` + feature columns + :param filter_seen_items: flag to remove seen items from recommendations based on ``log``. 
+ :return: recommendation dataframe + ``[user_idx, item_idx, relevance]`` + """ + + return self._predict( + log, + k, + users, + items, + user_features, + item_features, + filter_seen_items, + ) + + @property + def logger(self) -> logging.Logger: + """ + :returns: get library logger + """ + if self._logger is None: + self._logger = logging.getLogger("replay") + return self._logger + + def _get_fit_counts(self, entity: str) -> int: + if not hasattr(self, f"_num_{entity}s"): + setattr( + self, + f"_num_{entity}s", + getattr(self, f"fit_{entity}s").count(), + ) + return getattr(self, f"_num_{entity}s") + + @property + def users_count(self) -> int: + """ + :returns: number of users the model was trained on + """ + return self._get_fit_counts("user") + + @property + def items_count(self) -> int: + """ + :returns: number of items the model was trained on + """ + return self._get_fit_counts("item") + + def _get_fit_dims(self, entity: str) -> int: + if not hasattr(self, f"_{entity}_dim_size"): + setattr( + self, + f"_{entity}_dim_size", + getattr(self, f"fit_{entity}s") + .agg({f"{entity}_idx": "max"}) + .collect()[0][0] + + 1, + ) + return getattr(self, f"_{entity}_dim_size") + + @property + def _user_dim(self) -> int: + """ + :returns: dimension of users matrix (maximal user idx + 1) + """ + return self._get_fit_dims("user") + + @property + def _item_dim(self) -> int: + """ + :returns: dimension of items matrix (maximal item idx + 1) + """ + return self._get_fit_dims("item") + + def _fit_predict( + self, + log: DataFrame, + k: int, + users: Optional[Union[DataFrame, Iterable]] = None, + items: Optional[Union[DataFrame, Iterable]] = None, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + filter_seen_items: bool = True, + recs_file_path: Optional[str] = None, + ) -> Optional[DataFrame]: + self._fit_wrap(log, user_features, item_features) + return self._predict_wrap( + log, + k, + users, + items, + user_features, + item_features, + filter_seen_items, + recs_file_path=recs_file_path, + ) + + def _clear_cache(self): + """ + Clear spark cache + """ + + def _predict_pairs_wrap( + self, + pairs: DataFrame, + log: Optional[DataFrame] = None, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + recs_file_path: Optional[str] = None, + k: Optional[int] = None, + ) -> Optional[DataFrame]: + """ + This method + 1) converts data to spark + 2) removes cold users and items if model does not predict them + 3) calls inner _predict_pairs method of a model + + :param pairs: user-item pairs to get relevance for, + dataframe containing``[user_idx, item_idx]``. + :param log: train data + ``[user_idx, item_idx, timestamp, relevance]``. + :param recs_file_path: save recommendations at the given absolute path as parquet file. 
+ If None, cached and materialized recommendations dataframe will be returned + :return: cached dataframe with columns ``[user_idx, item_idx, relevance]`` + or None if `file_path` is provided + """ + log, user_features, item_features, pairs = [ + convert2spark(df) + for df in [log, user_features, item_features, pairs] + ] + if sorted(pairs.columns) != ["item_idx", "user_idx"]: + raise ValueError( + "pairs must be a dataframe with columns strictly [user_idx, item_idx]" + ) + pairs, log = self._filter_cold_for_predict(pairs, log, "user") + pairs, log = self._filter_cold_for_predict(pairs, log, "item") + + pred = self._predict_pairs( + pairs=pairs, + log=log, + user_features=user_features, + item_features=item_features, + ) + + if k: + pred = get_top_k( + dataframe=pred, + partition_by_col=sf.col("user_idx"), + order_by_col=[ + sf.col("relevance").desc(), + ], + k=k, + ) + + if recs_file_path is not None: + pred.write.parquet(path=recs_file_path, mode="overwrite") + return None + + pred.cache().count() + return pred + + def _predict_pairs( + self, + pairs: DataFrame, + log: Optional[DataFrame] = None, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + ) -> DataFrame: + """ + Fallback method to use in case ``_predict_pairs`` is not implemented. + Simply joins ``predict`` with given ``pairs``. + :param pairs: user-item pairs to get relevance for, + dataframe containing``[user_idx, item_idx]``. + :param log: train data + ``[user_idx, item_idx, timestamp, relevance]``. + """ + message = ( + "native predict_pairs is not implemented for this model. " + "Falling back to usual predict method and filtering the results." + ) + self.logger.warning(message) + + users = pairs.select("user_idx").distinct() + items = pairs.select("item_idx").distinct() + k = items.count() + pred = self._predict( + log=log, + k=k, + users=users, + items=items, + user_features=user_features, + item_features=item_features, + filter_seen_items=False, + ) + + pred = pred.join( + pairs.select("user_idx", "item_idx"), + on=["user_idx", "item_idx"], + how="inner", + ) + return pred + + def _get_features_wrap( + self, ids: DataFrame, features: Optional[DataFrame] + ) -> Optional[Tuple[DataFrame, int]]: + if "user_idx" not in ids.columns and "item_idx" not in ids.columns: + raise ValueError("user_idx or item_idx missing") + vectors, rank = self._get_features(ids, features) + return vectors, rank + + # pylint: disable=unused-argument + def _get_features( + self, ids: DataFrame, features: Optional[DataFrame] + ) -> Tuple[Optional[DataFrame], Optional[int]]: + """ + Get embeddings from model + + :param ids: id ids to get embeddings for Spark DataFrame containing user_idx or item_idx + :param features: user or item features + :return: DataFrame with biases and embeddings, and vector size + """ + + self.logger.info( + "get_features method is not defined for the model %s. Features will not be returned.", + str(self), + ) + return None, None + + def get_nearest_items( + self, + items: Union[DataFrame, Iterable], + k: int, + metric: Optional[str] = "cosine_similarity", + candidates: Optional[Union[DataFrame, Iterable]] = None, + ) -> Optional[DataFrame]: + """ + Get k most similar items be the `metric` for each of the `items`. + + :param items: spark dataframe or list of item ids to find neighbors + :param k: number of neighbors + :param metric: 'euclidean_distance_sim', 'cosine_similarity', 'dot_product' + :param candidates: spark dataframe or list of items + to consider as similar, e.g. 
popular/new items. If None, + all items presented during model training are used. + :return: dataframe with the most similar items an distance, + where bigger value means greater similarity. + spark-dataframe with columns ``[item_idx, neighbour_item_idx, similarity]`` + """ + if metric is None: + raise ValueError( + f"Distance metric is required to get nearest items with " + f"{self} model" + ) + + if self.can_predict_item_to_item: + return self._get_nearest_items_wrap( + items=items, + k=k, + metric=metric, + candidates=candidates, + ) + + raise ValueError( + "Use models with attribute 'can_predict_item_to_item' set to True to get nearest items" + ) + + def _get_nearest_items_wrap( + self, + items: Union[DataFrame, Iterable], + k: int, + metric: Optional[str] = "cosine_similarity", + candidates: Optional[Union[DataFrame, Iterable]] = None, + ) -> Optional[DataFrame]: + """ + Convert indexes and leave top-k nearest items for each item in `items`. + """ + items = self._get_ids(items, "item_idx") + if candidates is not None: + candidates = self._get_ids(candidates, "item_idx") + + nearest_items_to_filter = self._get_nearest_items( + items=items, + metric=metric, + candidates=candidates, + ) + + rel_col_name = metric if metric is not None else "similarity" + nearest_items = get_top_k( + dataframe=nearest_items_to_filter, + partition_by_col=sf.col("item_idx_one"), + order_by_col=[ + sf.col(rel_col_name).desc(), + sf.col("item_idx_two").desc(), + ], + k=k, + ) + + nearest_items = nearest_items.withColumnRenamed( + "item_idx_two", "neighbour_item_idx" + ) + nearest_items = nearest_items.withColumnRenamed( + "item_idx_one", "item_idx" + ) + return nearest_items + + def _get_nearest_items( + self, + items: DataFrame, + metric: Optional[str] = None, + candidates: Optional[DataFrame] = None, + ) -> Optional[DataFrame]: + raise NotImplementedError( + f"item-to-item prediction is not implemented for {self}" + ) + + def _params_tried(self): + """check if current parameters were already evaluated""" + if self.study is None: + return False + + params = { + name: value + for name, value in self._init_args.items() + if name in self._search_space + } + for trial in self.study.trials: + if params == trial.params: + return True + + return False + + +class ItemVectorModel(BaseRecommender): + """Parent for models generating items' vector representations""" + + can_predict_item_to_item: bool = True + + @abstractmethod + def _get_item_vectors(self) -> DataFrame: + """ + Return dataframe with items' vectors as a + spark dataframe with columns ``[item_idx, item_vector]`` + """ + + def _get_nearest_items( + self, + items: DataFrame, + metric: str = "cosine_similarity", + candidates: Optional[DataFrame] = None, + ) -> DataFrame: + """ + Return distance metric value for all available close items filtered by `candidates`. + + :param items: ids to find neighbours, spark dataframe with column ``item_idx`` + :param metric: 'euclidean_distance_sim' calculated as 1/(1 + euclidean_distance), + 'cosine_similarity', 'dot_product' + :param candidates: items among which we are looking for similar, + e.g. popular/new items. If None, all items presented during model training are used. 
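A hedged usage sketch for the similar-items API; `model` is assumed to be an already fitted item-vector model (e.g. ALSWrap) and `spark` an active session, neither of which is created by this snippet.

seed_items = spark.createDataFrame([(10,), (42,)], ["item_idx"])
similar = model.get_nearest_items(
    items=seed_items,            # items to find neighbours for
    k=5,                         # neighbours per item
    metric="cosine_similarity",  # or "euclidean_distance_sim" / "dot_product"
)
similar.show()  # [item_idx, neighbour_item_idx, <similarity column>]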
+ :return: dataframe with neighbours, + spark-dataframe with columns ``[item_idx_one, item_idx_two, similarity]`` + """ + dist_function = cosine_similarity + if metric == "euclidean_distance_sim": + dist_function = vector_euclidean_distance_similarity + elif metric == "dot_product": + dist_function = vector_dot + elif metric != "cosine_similarity": + raise NotImplementedError( + f"{metric} metric is not implemented, valid metrics are " + "'euclidean_distance_sim', 'cosine_similarity', 'dot_product'" + ) + + items_vectors = self._get_item_vectors() + left_part = ( + items_vectors.withColumnRenamed("item_idx", "item_idx_one") + .withColumnRenamed("item_vector", "item_vector_one") + .join( + items.select(sf.col("item_idx").alias("item_idx_one")), + on="item_idx_one", + ) + ) + + right_part = items_vectors.withColumnRenamed( + "item_idx", "item_idx_two" + ).withColumnRenamed("item_vector", "item_vector_two") + + if candidates is not None: + right_part = right_part.join( + candidates.withColumnRenamed("item_idx", "item_idx_two"), + on="item_idx_two", + ) + + joined_factors = left_part.join( + right_part, on=sf.col("item_idx_one") != sf.col("item_idx_two") + ) + + joined_factors = joined_factors.withColumn( + metric, + dist_function( + sf.col("item_vector_one"), sf.col("item_vector_two") + ), + ) + + similarity_matrix = joined_factors.select( + "item_idx_one", "item_idx_two", metric + ) + + return similarity_matrix + + +class PartialFitMixin(BaseRecommender): + def fit_partial(self, + log: DataFrame, + previous_log: Optional[DataFrame] = None) -> None: + self._fit_partial(log, + user_features=None, + item_features=None, + previous_log=previous_log) + + def _fit( + self, + log: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None) -> None: + self._fit_partial(log, user_features, item_features) + + @abstractmethod + def _fit_partial( + self, + log: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + previous_log: Optional[DataFrame] = None) -> None: + ... 
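A hedged sketch of the incremental-fit contract above; PopRec is used on the assumption that, as a non-personalized model, it picks up PartialFitMixin in this branch, and `log_day_1` / `log_day_2` stand for two daily interaction DataFrames not defined here.

from replay.models import PopRec

model = PopRec()
model.fit(log_day_1)                                   # full fit on the first batch
model.fit_partial(log_day_2, previous_log=log_day_1)   # update with only the new batch
recs = model.predict(log_day_1.union(log_day_2), k=10)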
+ + def _clear_cache(self): + super(PartialFitMixin, self)._clear_cache() + for df in self._dataframes.values(): + if df is not None: + df.unpersist() + + +# pylint: disable=abstract-method +class HybridRecommender(BaseRecommender, ABC): + """Base class for models that can use extra features""" + + def fit( + self, + log: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + ) -> None: + """ + Fit a recommendation model + + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param user_features: user features + ``[user_idx, timestamp]`` + feature columns + :param item_features: item features + ``[item_idx, timestamp]`` + feature columns + :return: + """ + self._fit_wrap( + log=log, + user_features=user_features, + item_features=item_features, + ) + + # pylint: disable=too-many-arguments + def predict( + self, + log: DataFrame, + k: int, + users: Optional[Union[DataFrame, Iterable]] = None, + items: Optional[Union[DataFrame, Iterable]] = None, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + filter_seen_items: bool = True, + recs_file_path: Optional[str] = None, + ) -> Optional[DataFrame]: + """ + Get recommendations + + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param k: number of recommendations for each user + :param users: users to create recommendations for + dataframe containing ``[user_idx]`` or ``array-like``; + if ``None``, recommend to all users from ``log`` + :param items: candidate items for recommendations + dataframe containing ``[item_idx]`` or ``array-like``; + if ``None``, take all items from ``log``. + If it contains new items, ``relevance`` for them will be ``0``. + :param user_features: user features + ``[user_idx , timestamp]`` + feature columns + :param item_features: item features + ``[item_idx , timestamp]`` + feature columns + :param filter_seen_items: flag to remove seen items from recommendations based on ``log``. + :param recs_file_path: save recommendations at the given absolute path as parquet file. + If None, cached and materialized recommendations dataframe will be returned + :return: cached recommendation dataframe with columns ``[user_idx, item_idx, relevance]`` + or None if `file_path` is provided + + """ + return self._predict_wrap( + log=log, + k=k, + users=users, + items=items, + user_features=user_features, + item_features=item_features, + filter_seen_items=filter_seen_items, + recs_file_path=recs_file_path, + ) + + def fit_predict( + self, + log: DataFrame, + k: int, + users: Optional[Union[DataFrame, Iterable]] = None, + items: Optional[Union[DataFrame, Iterable]] = None, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + filter_seen_items: bool = True, + recs_file_path: Optional[str] = None, + ) -> Optional[DataFrame]: + """ + Fit model and get recommendations + + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param k: number of recommendations for each user + :param users: users to create recommendations for + dataframe containing ``[user_idx]`` or ``array-like``; + if ``None``, recommend to all users from ``log`` + :param items: candidate items for recommendations + dataframe containing ``[item_idx]`` or ``array-like``; + if ``None``, take all items from ``log``. + If it contains new items, ``relevance`` for them will be ``0``. 
+ :param user_features: user features + ``[user_idx , timestamp]`` + feature columns + :param item_features: item features + ``[item_idx , timestamp]`` + feature columns + :param filter_seen_items: flag to remove seen items from recommendations based on ``log``. + :param recs_file_path: save recommendations at the given absolute path as parquet file. + If None, cached and materialized recommendations dataframe will be returned + :return: cached recommendation dataframe with columns ``[user_idx, item_idx, relevance]`` + or None if `file_path` is provided + """ + return self._fit_predict( + log=log, + k=k, + users=users, + items=items, + user_features=user_features, + item_features=item_features, + filter_seen_items=filter_seen_items, + recs_file_path=recs_file_path, + ) + + def predict_pairs( + self, + pairs: DataFrame, + log: Optional[DataFrame] = None, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + recs_file_path: Optional[str] = None, + k: Optional[int] = None, + ) -> Optional[DataFrame]: + """ + Get recommendations for specific user-item ``pairs``. + If a model can't produce recommendation + for specific pair it is removed from the resulting dataframe. + + :param pairs: dataframe with pairs to calculate relevance for, ``[user_idx, item_idx]``. + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param user_features: user features + ``[user_idx , timestamp]`` + feature columns + :param item_features: item features + ``[item_idx , timestamp]`` + feature columns + :param recs_file_path: save recommendations at the given absolute path as parquet file. + If None, cached and materialized recommendations dataframe will be returned + :param k: top-k items for each user from pairs. + :return: cached recommendation dataframe with columns ``[user_idx, item_idx, relevance]`` + or None if `file_path` is provided + """ + return self._predict_pairs_wrap( + pairs=pairs, + log=log, + user_features=user_features, + item_features=item_features, + recs_file_path=recs_file_path, + k=k, + ) + + def get_features( + self, ids: DataFrame, features: Optional[DataFrame] + ) -> Optional[Tuple[DataFrame, int]]: + """ + Returns user or item feature vectors as a Column with type ArrayType + :param ids: Spark DataFrame with unique ids + :param features: Spark DataFrame with features for provided ids + :return: feature vectors + If a model does not have a vector for some ids they are not present in the final result. 
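A hedged usage sketch for `get_features`; `model`, `spark` and `user_features` are assumed to be a fitted feature-aware model, an active session and the feature DataFrame used at fit time.

some_users = spark.createDataFrame([(0,), (1,)], ["user_idx"])
vectors, size = model.get_features(some_users, user_features)
vectors.show(truncate=False)      # ids without a vector are simply absent
print("embedding size:", size)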
+ """ + return self._get_features_wrap(ids, features) + + +# pylint: disable=abstract-method +class Recommender(BaseRecommender, ABC): + """Usual recommender class for models without features.""" + + def fit(self, log: DataFrame) -> None: + """ + Fit a recommendation model + + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :return: + """ + self._fit_wrap( + log=log, + user_features=None, + item_features=None, + ) + + # pylint: disable=too-many-arguments + def predict( + self, + log: DataFrame, + k: int, + users: Optional[Union[DataFrame, Iterable]] = None, + items: Optional[Union[DataFrame, Iterable]] = None, + filter_seen_items: bool = True, + recs_file_path: Optional[str] = None, + ) -> Optional[DataFrame]: + """ + Get recommendations + + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param k: number of recommendations for each user + :param users: users to create recommendations for + dataframe containing ``[user_idx]`` or ``array-like``; + if ``None``, recommend to all users from ``log`` + :param items: candidate items for recommendations + dataframe containing ``[item_idx]`` or ``array-like``; + if ``None``, take all items from ``log``. + If it contains new items, ``relevance`` for them will be ``0``. + :param filter_seen_items: flag to remove seen items from recommendations based on ``log``. + :param recs_file_path: save recommendations at the given absolute path as parquet file. + If None, cached and materialized recommendations dataframe will be returned + :return: cached recommendation dataframe with columns ``[user_idx, item_idx, relevance]`` + or None if `file_path` is provided + """ + return self._predict_wrap( + log=log, + k=k, + users=users, + items=items, + user_features=None, + item_features=None, + filter_seen_items=filter_seen_items, + recs_file_path=recs_file_path, + ) + + def predict_pairs( + self, + pairs: DataFrame, + log: Optional[DataFrame] = None, + recs_file_path: Optional[str] = None, + k: Optional[int] = None, + ) -> Optional[DataFrame]: + """ + Get recommendations for specific user-item ``pairs``. + If a model can't produce recommendation + for specific pair it is removed from the resulting dataframe. + + :param pairs: dataframe with pairs to calculate relevance for, ``[user_idx, item_idx]``. + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param recs_file_path: save recommendations at the given absolute path as parquet file. + If None, cached and materialized recommendations dataframe will be returned + :param k: top-k items for each user from pairs. 
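A hedged usage sketch for `predict_pairs`; `model` is assumed to be fitted, `log` to be its training DataFrame, and the parquet path is purely illustrative.

pairs = spark.createDataFrame([(0, 7), (0, 9), (1, 7)], ["user_idx", "item_idx"])
scored = model.predict_pairs(pairs, log=log, k=1)  # keep only the best item per user
# or write straight to disk instead of materializing the result:
model.predict_pairs(pairs, log=log, recs_file_path="/tmp/pair_scores.parquet")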
+ :return: cached recommendation dataframe with columns ``[user_idx, item_idx, relevance]`` + or None if `file_path` is provided + """ + return self._predict_pairs_wrap( + pairs=pairs, + log=log, + recs_file_path=recs_file_path, + k=k, + ) + + # pylint: disable=too-many-arguments + def fit_predict( + self, + log: DataFrame, + k: int, + users: Optional[Union[DataFrame, Iterable]] = None, + items: Optional[Union[DataFrame, Iterable]] = None, + filter_seen_items: bool = True, + recs_file_path: Optional[str] = None, + ) -> Optional[DataFrame]: + """ + Fit model and get recommendations + + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param k: number of recommendations for each user + :param users: users to create recommendations for + dataframe containing ``[user_idx]`` or ``array-like``; + if ``None``, recommend to all users from ``log`` + :param items: candidate items for recommendations + dataframe containing ``[item_idx]`` or ``array-like``; + if ``None``, take all items from ``log``. + If it contains new items, ``relevance`` for them will be ``0``. + :param filter_seen_items: flag to remove seen items from recommendations based on ``log``. + :param recs_file_path: save recommendations at the given absolute path as parquet file. + If None, cached and materialized recommendations dataframe will be returned + :return: cached recommendation dataframe with columns ``[user_idx, item_idx, relevance]`` + or None if `file_path` is provided + """ + return self._fit_predict( + log=log, + k=k, + users=users, + items=items, + user_features=None, + item_features=None, + filter_seen_items=filter_seen_items, + recs_file_path=recs_file_path, + ) + + def get_features(self, ids: DataFrame) -> Optional[Tuple[DataFrame, int]]: + """ + Returns user or item feature vectors as a Column with type ArrayType + + :param ids: Spark DataFrame with unique ids + :return: feature vectors. + If a model does not have a vector for some ids they are not present in the final result. + """ + return self._get_features_wrap(ids, None) + + +class UserRecommender(BaseRecommender, ABC): + """Base class for models that use user features + but not item features. ``log`` is not required for this class.""" + + def fit( + self, + log: DataFrame, + user_features: DataFrame, + ) -> None: + """ + Finds user clusters and calculates item similarity in that clusters. + + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param user_features: user features + ``[user_idx, timestamp]`` + feature columns + :return: + """ + self._fit_wrap(log=log, user_features=user_features) + + # pylint: disable=too-many-arguments + def predict( + self, + user_features: DataFrame, + k: int, + log: Optional[DataFrame] = None, + users: Optional[Union[DataFrame, Iterable]] = None, + items: Optional[Union[DataFrame, Iterable]] = None, + filter_seen_items: bool = True, + recs_file_path: Optional[str] = None, + ) -> Optional[DataFrame]: + """ + Get recommendations + + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param k: number of recommendations for each user + :param users: users to create recommendations for + dataframe containing ``[user_idx]`` or ``array-like``; + if ``None``, recommend to all users from ``log`` + :param items: candidate items for recommendations + dataframe containing ``[item_idx]`` or ``array-like``; + if ``None``, take all items from ``log``. 
+ If it contains new items, ``relevance`` for them will be ``0``. + :param user_features: user features + ``[user_idx , timestamp]`` + feature columns + :param filter_seen_items: flag to remove seen items from recommendations based on ``log``. + :param recs_file_path: save recommendations at the given absolute path as parquet file. + If None, cached and materialized recommendations dataframe will be returned + :return: cached recommendation dataframe with columns ``[user_idx, item_idx, relevance]`` + or None if `file_path` is provided + """ + return self._predict_wrap( + log=log, + user_features=user_features, + k=k, + filter_seen_items=filter_seen_items, + users=users, + items=items, + recs_file_path=recs_file_path, + ) + + def predict_pairs( + self, + pairs: DataFrame, + user_features: DataFrame, + log: Optional[DataFrame] = None, + recs_file_path: Optional[str] = None, + k: Optional[int] = None, + ) -> Optional[DataFrame]: + """ + Get recommendations for specific user-item ``pairs``. + If a model can't produce recommendation + for specific pair it is removed from the resulting dataframe. + + :param pairs: dataframe with pairs to calculate relevance for, ``[user_idx, item_idx]``. + :param user_features: user features + ``[user_idx , timestamp]`` + feature columns + :param log: historical log of interactions + ``[user_idx, item_idx, timestamp, relevance]`` + :param recs_file_path: save recommendations at the given absolute path as parquet file. + If None, cached and materialized recommendations dataframe will be returned + :param k: top-k items for each user from pairs. + :return: cached recommendation dataframe with columns ``[user_idx, item_idx, relevance]`` + or None if `file_path` is provided + """ + return self._predict_pairs_wrap( + pairs=pairs, + log=log, + user_features=user_features, + recs_file_path=recs_file_path, + k=k, + ) + + +from replay.models.nmslib_hnsw import NmslibHnswMixin + + +class NeighbourRec(Recommender, NmslibHnswMixin, ABC): + """Base class that requires log at prediction time""" + + similarity: Optional[DataFrame] + can_predict_item_to_item: bool = True + can_predict_cold_users: bool = True + can_change_metric: bool = False + item_to_item_metrics = ["similarity"] + _similarity_metric = "similarity" + + @property + def _dataframes(self): + return {"similarity": self.similarity} + + def _clear_cache(self): + if hasattr(self, "similarity"): + self.similarity.unpersist() + + # pylint: disable=missing-function-docstring + @property + def similarity_metric(self): + return self._similarity_metric + + @similarity_metric.setter + def similarity_metric(self, value): + if not self.can_change_metric: + raise ValueError("This class does not support changing similarity metrics") + if value not in self.item_to_item_metrics: + raise ValueError( + f"Select one of the valid metrics for predict: " + f"{self.item_to_item_metrics}" + ) + self._similarity_metric = value + + def _predict_pairs_inner( + self, + log: DataFrame, + filter_df: DataFrame, + condition: Column, + users: DataFrame, + ) -> DataFrame: + """ + Get recommendations for all provided users + and filter results with ``filter_df`` by ``condition``. + It allows to implement both ``predict_pairs`` and usual ``predict``@k. + + :param log: historical interactions, DataFrame + ``[user_idx, item_idx, timestamp, relevance]``. + :param filter_df: DataFrame use to filter items: + ``[item_idx_filter]`` or ``[user_idx_filter, item_idx_filter]``. 
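A standalone toy illustration of the join-and-aggregate scheme this method implements: a candidate's relevance for a user is the sum of similarities between that candidate and the items in the user's history.

from pyspark.sql import SparkSession
from pyspark.sql import functions as sf

spark = SparkSession.builder.master("local[1]").getOrCreate()

log = spark.createDataFrame([(0, 1), (0, 2)], ["user_idx", "item_idx"])
similarity = spark.createDataFrame(
    [(1, 3, 0.9), (2, 3, 0.4), (2, 4, 0.7)],
    ["item_idx_one", "item_idx_two", "similarity"],
)

recs = (
    log.join(similarity, on=sf.col("item_idx") == sf.col("item_idx_one"))
    .groupBy("user_idx", "item_idx_two")
    .agg(sf.sum("similarity").alias("relevance"))
    .withColumnRenamed("item_idx_two", "item_idx")
)
recs.show()  # item 3 scores 1.3 for user 0, item 4 scores 0.7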
+ :param condition: condition used for inner join with ``filter_df`` + :param users: users to calculate recommendations for + :return: DataFrame ``[user_idx, item_idx, relevance]`` + """ + if log is None: + raise ValueError( + "log is not provided, but it is required for prediction" + ) + + recs = ( + log.join(users, how="inner", on="user_idx") + .join( + self.similarity, + how="inner", + on=sf.col("item_idx") == sf.col("item_idx_one"), + ) + .join( + filter_df, + how="inner", + on=condition, + ) + .groupby("user_idx", "item_idx_two") + .agg(sf.sum(self.similarity_metric).alias("relevance")) + .withColumnRenamed("item_idx_two", "item_idx") + ) + return recs + + # pylint: disable=too-many-arguments + def _predict( + self, + log: DataFrame, + k: int, + users: DataFrame, + items: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + filter_seen_items: bool = True, + ) -> DataFrame: + + return self._predict_pairs_inner( + log=log, + filter_df=items.withColumnRenamed("item_idx", "item_idx_filter"), + condition=sf.col("item_idx_two") == sf.col("item_idx_filter"), + users=users, + ) + + def _predict_pairs( + self, + pairs: DataFrame, + log: Optional[DataFrame] = None, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + ) -> DataFrame: + + if log is None: + raise ValueError( + "log is not provided, but it is required for prediction" + ) + + return self._predict_pairs_inner( + log=log, + filter_df=( + pairs.withColumnRenamed( + "user_idx", "user_idx_filter" + ).withColumnRenamed("item_idx", "item_idx_filter") + ), + condition=(sf.col("user_idx") == sf.col("user_idx_filter")) + & (sf.col("item_idx_two") == sf.col("item_idx_filter")), + users=pairs.select("user_idx").distinct(), + ) + + def get_nearest_items( + self, + items: Union[DataFrame, Iterable], + k: int, + metric: Optional[str] = None, + candidates: Optional[Union[DataFrame, Iterable]] = None, + ) -> DataFrame: + """ + Get k most similar items be the `metric` for each of the `items`. + + :param items: spark dataframe or list of item ids to find neighbors + :param k: number of neighbors + :param metric: metric is not used to find neighbours in NeighbourRec, + the parameter is ignored + :param candidates: spark dataframe or list of items + to consider as similar, e.g. popular/new items. If None, + all items presented during model training are used. + :return: dataframe with the most similar items an distance, + where bigger value means greater similarity. 
+ spark-dataframe with columns ``[item_idx, neighbour_item_idx, similarity]`` + """ + + if metric is not None: + self.logger.debug( + "Metric is not used to determine nearest items in %s model", + str(self), + ) + + return self._get_nearest_items_wrap( + items=items, + k=k, + metric=metric, + candidates=candidates, + ) + + def _get_nearest_items( + self, + items: DataFrame, + metric: Optional[str] = None, + candidates: Optional[DataFrame] = None, + ) -> DataFrame: + + similarity_filtered = self.similarity.join( + items.withColumnRenamed("item_idx", "item_idx_one"), + on="item_idx_one", + ) + + if candidates is not None: + similarity_filtered = similarity_filtered.join( + candidates.withColumnRenamed("item_idx", "item_idx_two"), + on="item_idx_two", + ) + + return similarity_filtered.select( + "item_idx_one", "item_idx_two", "similarity" if metric is None else metric + ) + + def _get_ann_build_params(self, log: DataFrame) -> Dict[str, Any]: + items_count = log.select(sf.max("item_idx")).first()[0] + 1 + return { + "features_col": None, + "params": self._nmslib_hnsw_params, + "index_type": "sparse", + "items_count": items_count, + } + + def _get_vectors_to_build_ann(self, log: DataFrame) -> DataFrame: + similarity_df = self.similarity.select( + "similarity", "item_idx_one", "item_idx_two" + ) + return similarity_df + + def _get_vectors_to_infer_ann_inner( + self, log: DataFrame, users: DataFrame + ) -> DataFrame: + + user_vectors = ( + log.groupBy("user_idx").agg( + sf.collect_list("item_idx").alias("vector_items"), + sf.collect_list("relevance").alias("vector_relevances")) + ) + return user_vectors + + +class NonPersonalizedRecommender(Recommender, PartialFitMixin, ABC): + """Base class for non-personalized recommenders with popularity statistics.""" + + can_predict_cold_users = True + can_predict_cold_items = True + item_popularity: DataFrame + add_cold_items: bool + cold_weight: float + sample: bool + fill: float + seed: Optional[int] = None + + def __init__(self, add_cold_items: bool, cold_weight: float): + self.add_cold_items = add_cold_items + if 0 < cold_weight <= 1: + self.cold_weight = cold_weight + else: + raise ValueError( + "`cold_weight` value should be in interval (0, 1]" + ) + + @property + def _dataframes(self): + return {"item_popularity": self.item_popularity} + + def _save_model(self, path: str): + spark = State().session + sc = spark.sparkContext + # TODO: simplify it and move it to utils + # maybe it can just be saved in json + pickled_instance = pickle.dumps({"fill": self.fill}) + Record = collections.namedtuple("Record", ["params"]) + rdd = sc.parallelize([Record(pickled_instance)]) + instance_df = rdd.map(lambda rec: Record(bytearray(rec.params))).toDF() + instance_df.write.mode("overwrite").parquet(join(path, "params.dump")) + + def _load_model(self, path: str): + spark = State().session + df = spark.read.parquet(join(path, "params.dump")) + pickled_instance = df.rdd.map(lambda row: bytes(row.params)).first() + self.fill = pickle.loads(pickled_instance)["fill"] + + def _clear_cache(self): + if hasattr(self, "item_popularity"): + self.item_popularity.unpersist() + + @staticmethod + def _calc_fill(item_popularity: DataFrame, weight: float) -> float: + """ + Calculating a fill value a the minimal relevance + calculated during model training multiplied by weight. 
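+ For example, a minimal training relevance of ``0.2`` combined with ``weight=0.5`` gives a fill value of ``0.1``.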
+ """ + return item_popularity.select(sf.min("relevance")).first()[0] * weight + + @staticmethod + def _check_relevance(log: Optional[DataFrame] = None): + if log is None: + return + + vals = log.select("relevance").where( + (sf.col("relevance") != 1) & (sf.col("relevance") != 0) + ) + if vals.count() > 0: + raise ValueError("Relevance values in log must be 0 or 1") + + def _get_selected_item_popularity(self, items: DataFrame) -> DataFrame: + """ + Choose only required item from `item_popularity` dataframe + for further recommendations generation. + """ + return self.item_popularity.join( + items, + on="item_idx", + how="right" if self.add_cold_items else "inner", + ).fillna(value=self.fill, subset=["relevance"]) + + @staticmethod + def _calc_max_hist_len(log: DataFrame, users: DataFrame) -> int: + max_hist_len = ( + ( + log.join(users, on="user_idx") + .groupBy("user_idx") + .agg(sf.countDistinct("item_idx").alias("items_count")) + ) + .select(sf.max("items_count")) + .collect()[0][0] + ) + # all users have empty history + if max_hist_len is None: + max_hist_len = 0 + + return max_hist_len + + # pylint: disable=too-many-arguments + def _predict_without_sampling( + self, + log: DataFrame, + k: int, + users: DataFrame, + items: DataFrame, + filter_seen_items: bool = True, + ) -> DataFrame: + """ + Regular prediction for popularity-based models, + top-k most relevant items from `items` are chosen for each user + """ + selected_item_popularity = self._get_selected_item_popularity(items) + selected_item_popularity = selected_item_popularity.withColumn( + "rank", + sf.row_number().over( + Window.orderBy( + sf.col("relevance").desc(), sf.col("item_idx").desc() + ) + ), + ) + + max_hist_len = ( + self._calc_max_hist_len(log, users) + if filter_seen_items and log is not None + else 0 + ) + + return users.crossJoin( + selected_item_popularity.filter(sf.col("rank") <= k + max_hist_len) + ).drop("rank") + + def _predict_with_sampling( + self, + log: DataFrame, + k: int, + users: DataFrame, + items: DataFrame, + filter_seen_items: bool = True, + ) -> DataFrame: + """ + Randomized prediction for popularity-based models, + top-k items from `items` are sampled for each user based with + probability proportional to items' popularity + """ + selected_item_popularity = self._get_selected_item_popularity(items) + selected_item_popularity = selected_item_popularity.withColumn( + "relevance", + sf.when(sf.col("relevance") == sf.lit(0.0), 0.1**6).otherwise( + sf.col("relevance") + ), + ) + + items_pd = selected_item_popularity.withColumn( + "probability", + sf.col("relevance") + / selected_item_popularity.select(sf.sum("relevance")).first()[0], + ).toPandas() + + if items_pd.shape[0] == 0: + return State().session.createDataFrame([], REC_SCHEMA) + + seed = self.seed + class_name = self.__class__.__name__ + + def grouped_map(pandas_df: pd.DataFrame) -> pd.DataFrame: + user_idx = pandas_df["user_idx"][0] + cnt = pandas_df["cnt"][0] + + if seed is not None: + local_rng = default_rng(seed + user_idx) + else: + local_rng = default_rng() + + items_positions = local_rng.choice( + np.arange(items_pd.shape[0]), + size=cnt, + p=items_pd["probability"].values, + replace=False, + ) + + # workaround to unify RandomRec and UCB + if class_name == "RandomRec": + relevance = 1 / np.arange(1, cnt + 1) + else: + relevance = items_pd["probability"].values[items_positions] + + return pd.DataFrame( + { + "user_idx": cnt * [user_idx], + "item_idx": items_pd["item_idx"].values[items_positions], + "relevance": relevance, + } + ) + + if 
log is not None and filter_seen_items: + recs = ( + log.select("user_idx", "item_idx") + .distinct() + .join(users, how="right", on="user_idx") + .groupby("user_idx") + .agg(sf.countDistinct("item_idx").alias("cnt")) + .selectExpr( + "user_idx", + f"LEAST(cnt + {k}, {items_pd.shape[0]}) AS cnt", + ) + ) + else: + recs = users.withColumn("cnt", sf.lit(min(k, items_pd.shape[0]))) + + return recs.groupby("user_idx").applyInPandas(grouped_map, REC_SCHEMA) + + # pylint: disable=too-many-arguments + def _predict( + self, + log: DataFrame, + k: int, + users: DataFrame, + items: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + filter_seen_items: bool = True, + ) -> DataFrame: + + if self.sample: + return self._predict_with_sampling( + log=log, + k=k, + users=users, + items=items, + filter_seen_items=filter_seen_items, + ) + else: + return self._predict_without_sampling( + log, k, users, items, filter_seen_items + ) + + def _predict_pairs( + self, + pairs: DataFrame, + log: Optional[DataFrame] = None, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + ) -> DataFrame: + return pairs.join( + self.item_popularity, + on="item_idx", + how="left" if self.add_cold_items else "inner", + ).fillna(value=self.fill, subset=["relevance"]).select("user_idx", "item_idx", "relevance") diff --git a/replay/models/hnswlib.py b/replay/models/hnswlib.py new file mode 100644 index 000000000..1e341e99d --- /dev/null +++ b/replay/models/hnswlib.py @@ -0,0 +1,402 @@ +import logging +import os +import shutil +import tempfile +import uuid +import weakref +from typing import Any, Dict, Iterator, Optional, Union + +import hnswlib +import numpy as np +import pandas as pd +from pyarrow import fs +from pyspark import SparkFiles +from pyspark.sql import DataFrame +from pyspark.sql import SparkSession +from pyspark.sql.functions import pandas_udf + +from replay.ann.ann_mixin import ANNMixin +from replay.ann.utils import ( + save_index_to_destination_fs, + load_index_from_source_fs, +) +from replay.session_handler import State +from replay.utils import FileSystem, get_filesystem, FileInfo + +logger = logging.getLogger("replay") + +INDEX_FILENAME = "hnswlib_index" + + +class HnswlibIndexFileManager: + """Loads index from hdfs, local disk or SparkFiles dir and keep it in a memory. + Instance of `HnswlibIndexFileManager` broadcasts to executors and is used in pandas_udf. + """ + + def __init__( + self, + index_params, + index_dim: int, + index_file: Union[FileInfo, str] + ) -> None: + + self._space = index_params["space"] + self._efS = index_params.get("efS") + self._dim = index_dim + self._index_file = index_file + self._index = None + + @property + def index(self): + if self._index: + return self._index + + self._index = hnswlib.Index(space=self._space, dim=self._dim) + if isinstance(self._index_file, FileInfo): + load_index_from_source_fs( + sparse=False, + load_index=lambda path: self._index.load_index(path), + source=self._index_file + ) + else: + self._index.load_index(SparkFiles.get(self._index_file)) + + if self._efS: + self._index.set_ef(self._efS) + return self._index + + +class HnswlibMixin(ANNMixin): + """Mixin that provides methods to build hnswlib index and infer it. + Also provides methods to saving and loading index to/from disk. 
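+
+ An illustrative ``hnswlib_params`` dictionary (the values here are only an example, not a recommendation;
+ ``index_path`` is needed only when ``build_index_on`` is ``"executor"``):
+
+ {"space": "ip", "M": 100, "efC": 2000, "efS": 2000, "build_index_on": "driver"}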
+ """ + + def _infer_ann_index( + self, + vectors: DataFrame, + features_col: str, + params: Dict[str, Union[int, str]], + k: int, + filter_seen_items: bool, + index_dim: str = None, + index_type: str = None, + log: DataFrame = None, + ) -> DataFrame: + return self._infer_hnsw_index( + vectors, features_col, params, k, filter_seen_items, index_dim + ) + + def _build_ann_index( + self, + vectors: DataFrame, + features_col: str, + params: Dict[str, Union[int, str]], + dim: int = None, + num_elements: int = None, + id_col: Optional[str] = None, + index_type: str = None, + items_count: Optional[int] = None, + ) -> None: + self._build_hnsw_index( + vectors, features_col, params, dim, num_elements, id_col + ) + + def _build_hnsw_index( + self, + vectors: DataFrame, + features_col: str, + params: Dict[str, Any], + dim: int, + num_elements: int, + id_col: Optional[str] = None, + ) -> None: + """Builds hnsw index and dump it to hdfs or disk. + + Args: + vectors: DataFrame with vectors. Schema: [{id_col}: int, {features_col}: array] + features_col: the name of the column in the `vectors` dataframe that contains features (vectors). + params: index params + dim: feature (vector) length + num_elements: how many elements will be stored in the index + id_col: the name of the column in the `vectors` dataframe that contains ids (of vectors) + """ + + if params["build_index_on"] == "executor": + # to execution in one executor + vectors = vectors.repartition(1) + + target_index_file = get_filesystem(params["index_path"]) + + def build_index(iterator: Iterator[pd.DataFrame]): + """Builds index on executor and writes it to shared disk or hdfs. + + Args: + iterator: iterates on dataframes with vectors/features + + """ + index = hnswlib.Index(space=params["space"], dim=dim) + + # Initializing index - the maximum number of elements should be known beforehand + index.init_index( + max_elements=num_elements, + ef_construction=params["efC"], + M=params["M"], + ) + + # pdf is a pandas dataframe that contains ids and features (vectors) + for pdf in iterator: + vectors_np = np.squeeze(pdf[features_col].values) + if id_col: + index.add_items( + np.stack(vectors_np), pdf[id_col].values + ) + else: + # ids will be from [0, ..., len(vectors_np)] + index.add_items(np.stack(vectors_np)) + + save_index_to_destination_fs( + sparse=False, + save_index=lambda path: index.save_index(path), + target=target_index_file, + ) + + yield pd.DataFrame(data={"_success": 1}, index=[0]) + + # Here we perform materialization (`.collect()`) to build the hnsw index. 
+ logger.info("Started building the hnsw index") + cols = [id_col, features_col] if id_col else [features_col] + vectors.select(*cols).mapInPandas( + build_index, "_success int" + ).collect() + logger.info("Finished building the hnsw index") + else: + vectors = vectors.toPandas() + vectors_np = np.squeeze(vectors[features_col].values) + + index = hnswlib.Index(space=params["space"], dim=dim) + + # Initializing index - the maximum number of elements should be known beforehand + index.init_index( + max_elements=num_elements, + ef_construction=params["efC"], + M=params["M"], + ) + + if id_col: + index.add_items(np.stack(vectors_np), vectors[id_col].values) + else: + index.add_items(np.stack(vectors_np)) + + # saving index to local temp file and sending it to executors + temp_dir = tempfile.mkdtemp() + weakref.finalize(self, shutil.rmtree, temp_dir) + tmp_file_path = os.path.join( + temp_dir, f"{INDEX_FILENAME}_{self._spark_index_file_uid}" + ) + index.save_index(tmp_file_path) + spark = SparkSession.getActiveSession() + spark.sparkContext.addFile("file://" + tmp_file_path) + + def _update_hnsw_index( + self, + item_vectors: DataFrame, + features_col: str, + params: Dict[str, Any], + dim: int, + num_elements: int, + ): + index = hnswlib.Index(space=params["space"], dim=dim) + index_path = SparkFiles.get( + f"{INDEX_FILENAME}_{self._spark_index_file_uid}" + ) + index.load_index(index_path, max_elements=num_elements) + item_vectors = item_vectors.toPandas() + item_vectors_np = np.squeeze(item_vectors[features_col].values) + index.add_items(np.stack(item_vectors_np), item_vectors["id"].values) + + self._spark_index_file_uid = uuid.uuid4().hex[-12:] + # saving index to local temp file and sending it to executors + temp_dir = tempfile.mkdtemp() + tmp_file_path = os.path.join( + temp_dir, f"{INDEX_FILENAME}_{self._spark_index_file_uid}" + ) + index.save_index(tmp_file_path) + spark = SparkSession.getActiveSession() + spark.sparkContext.addFile("file://" + tmp_file_path) + + def _infer_hnsw_index( + self, + vectors: DataFrame, + features_col: str, + params: Dict[str, Any], + k: int, + filter_seen_items: bool, + index_dim: str = None, + ): + + if params["build_index_on"] == "executor": + index_file = get_filesystem(params["index_path"]) + else: + index_file = f"{INDEX_FILENAME}_{self._spark_index_file_uid}" + + _index_file_manager = HnswlibIndexFileManager( + params, + index_dim, + index_file=index_file, + ) + + index_file_manager_broadcast = State().session.sparkContext.broadcast( + _index_file_manager + ) + + return_type = "item_idx array, distance array" + + if filter_seen_items: + + @pandas_udf(return_type) + def infer_index( + vectors: pd.Series, + num_items: pd.Series, + seen_item_idxs: pd.Series, + ) -> pd.DataFrame: + index_file_manager = index_file_manager_broadcast.value + index = index_file_manager.index + + # max number of items to retrieve per batch + max_items_to_retrieve = num_items.max() + + labels, distances = index.knn_query( + np.stack(vectors.values), + k=k + max_items_to_retrieve, + num_threads=1, + ) + + filtered_labels = [] + filtered_distances = [] + for i, item_idxs in enumerate(labels): + non_seen_item_indexes = ~np.isin( + item_idxs, seen_item_idxs[i], assume_unique=True + ) + filtered_labels.append( + (item_idxs[non_seen_item_indexes])[:k] + ) + filtered_distances.append( + (distances[i][non_seen_item_indexes])[:k] + ) + + pd_res = pd.DataFrame( + { + "item_idx": filtered_labels, + "distance": filtered_distances, + } + ) + + return pd_res + + else: + + 
@pandas_udf(return_type) + def infer_index(vectors: pd.Series) -> pd.DataFrame: + index_file_manager = index_file_manager_broadcast.value + index = index_file_manager.index + + labels, distances = index.knn_query( + np.stack(vectors.values), + k=k, + num_threads=1, + ) + + pd_res = pd.DataFrame( + {"item_idx": list(labels), "distance": list(distances)} + ) + + return pd_res + + cols = [] + if filter_seen_items: + cols = ["num_items", "seen_item_idxs"] + + res = vectors.select( + "user_idx", + infer_index(features_col, *cols).alias("neighbours"), + ) + res = self._unpack_infer_struct(res) + + return res + + def _save_hnswlib_index(self, path: str): + """Method save (copy) index from hdfs (or local) to `path` directory. + `path` can be a hdfs path or a local path. + + Args: + path (str): directory where to dump (copy) the index + """ + + params = self._hnswlib_params + + if params["build_index_on"] == "executor": + index_path = params["index_path"] + elif params["build_index_on"] == "driver": + index_path = SparkFiles.get( + f"{INDEX_FILENAME}_{self._spark_index_file_uid}" + ) + else: + raise ValueError("Unknown 'build_index_on' param.") + + source = get_filesystem(index_path) + target = get_filesystem(path) + self.logger.debug(f"Index file coping from '{index_path}' to '{path}'") + + if source.filesystem == FileSystem.HDFS: + source_filesystem = fs.HadoopFileSystem.from_uri(source.hdfs_uri) + else: + source_filesystem = fs.LocalFileSystem() + if target.filesystem == FileSystem.HDFS: + destination_filesystem = fs.HadoopFileSystem.from_uri( + target.hdfs_uri + ) + else: + destination_filesystem = fs.LocalFileSystem() + + fs.copy_files( + source.path, + os.path.join(target.path, INDEX_FILENAME), + source_filesystem=source_filesystem, + destination_filesystem=destination_filesystem, + ) + # param use_threads=True (?) + + def _load_hnswlib_index(self, path: str): + """Loads hnsw index from `path` directory to local dir. + Index file name is 'hnswlib_index'. + And adds index file to the `SparkFiles`. + `path` can be a hdfs path or a local path. 
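+ After loading, ``build_index_on`` is switched to ``"driver"`` so subsequent inference uses the local copy.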
+ + + Args: + path: directory path, where index file is stored + """ + source = get_filesystem(path + f"/{INDEX_FILENAME}") + + temp_dir = tempfile.mkdtemp() + weakref.finalize(self, shutil.rmtree, temp_dir) + target_path = os.path.join( + temp_dir, f"{INDEX_FILENAME}_{self._spark_index_file_uid}" + ) + + if source.filesystem == FileSystem.HDFS: + source_filesystem = fs.HadoopFileSystem.from_uri(source.hdfs_uri) + else: + source_filesystem = fs.LocalFileSystem() + destination_filesystem = fs.LocalFileSystem() + fs.copy_files( + source.path, + target_path, + source_filesystem=source_filesystem, + destination_filesystem=destination_filesystem, + ) + + spark = SparkSession.getActiveSession() + spark.sparkContext.addFile("file://" + target_path) + + self._hnswlib_params["build_index_on"] = "driver" diff --git a/replay/models/knn.py b/replay/models/knn.py index da6675af8..ce2b7fdf0 100644 --- a/replay/models/knn.py +++ b/replay/models/knn.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Dict, Any from pyspark.sql import DataFrame from pyspark.sql import functions as sf @@ -11,6 +11,17 @@ class ItemKNN(NeighbourRec): """Item-based ItemKNN with modified cosine similarity measure.""" + def _get_ann_infer_params(self) -> Dict[str, Any]: + return { + "features_col": "", + "params": self._nmslib_hnsw_params, + "index_type": "sparse", + } + + @property + def _use_ann(self) -> bool: + return self._nmslib_hnsw_params is not None + all_items: Optional[DataFrame] dot_products: Optional[DataFrame] item_norms: Optional[DataFrame] @@ -29,12 +40,32 @@ def __init__( use_relevance: bool = False, shrink: float = 0.0, weighting: str = None, + nmslib_hnsw_params: Optional[dict] = None, ): """ :param num_neighbours: number of neighbours :param use_relevance: flag to use relevance values as is or to treat them as 1 :param shrink: term added to the denominator when calculating similarity :param weighting: item reweighting type, one of [None, 'tf_idf', 'bm25'] + :param nmslib_hnsw_params: parameters for nmslib-hnsw methods: + {"method":"hnsw", + "space":"negdotprod_sparse_fast", + "M":16,"efS":200,"efC":200, + ...} + The reasonable range of values for M parameter is 5-100, + for efC and eFS is 100-2000. + Increasing these values improves the prediction quality but increases index_time and inference_time too. 
+ We recommend using these settings: + - M=16, efC=200 and efS=200 for simple datasets like MovieLens + - M=50, efC=1000 and efS=1000 for average quality with an average prediction time + - M=75, efC=2000 and efS=2000 for the highest quality with a long prediction time + + note: choosing these parameters depends on the dataset and quality/time tradeoff + note: while reducing parameter values the highest range metrics like Metric@1000 suffer first + note: even in a case with a long training time, + profit from ann could be obtained while inference will be used multiple times + + for more details see https://github.com/nmslib/nmslib/blob/master/manual/methods.md """ self.shrink = shrink self.use_relevance = use_relevance @@ -44,6 +75,7 @@ def __init__( if weighting not in valid_weightings: raise ValueError(f"weighting must be one of {valid_weightings}") self.weighting = weighting + self._nmslib_hnsw_params = nmslib_hnsw_params @property def _init_args(self): @@ -52,8 +84,17 @@ def _init_args(self): "use_relevance": self.use_relevance, "num_neighbours": self.num_neighbours, "weighting": self.weighting, + "nmslib_hnsw_params": self._nmslib_hnsw_params, } + def _save_model(self, path: str): + if self._nmslib_hnsw_params: + self._save_nmslib_hnsw_index(path, sparse=True) + + def _load_model(self, path: str): + if self._nmslib_hnsw_params: + self._load_nmslib_hnsw_index(path, sparse=True) + @staticmethod def _shrink(dot_products: DataFrame, shrink: float) -> DataFrame: return dot_products.withColumn( diff --git a/replay/models/nmslib_hnsw.py b/replay/models/nmslib_hnsw.py new file mode 100644 index 000000000..2fe4aaca7 --- /dev/null +++ b/replay/models/nmslib_hnsw.py @@ -0,0 +1,639 @@ +import logging +import os +import shutil +import tempfile +import weakref +from typing import Any, Dict, Optional, Iterator, Union + +import nmslib +import numpy as np +import pandas as pd +from pyarrow import fs +from pyspark import SparkFiles +from pyspark.sql import DataFrame +from pyspark.sql import SparkSession +from pyspark.sql.functions import pandas_udf +from scipy.sparse import csr_matrix + +from replay.ann.ann_mixin import ANNMixin +from replay.ann.utils import ( + save_index_to_destination_fs, + load_index_from_source_fs, +) +from replay.session_handler import State +from replay.utils import FileSystem, get_filesystem, FileInfo + +logger = logging.getLogger("replay") + +INDEX_FILENAME = "nmslib_hnsw_index" + + +class NmslibIndexFileManager: + """Loads index from hdfs, local disk or SparkFiles dir and keep it in a memory. + Instance of `NmslibIndexFileManager` broadcasts to executors and is used in pandas_udf. 
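+ The index is loaded lazily on first access to the ``index`` property and then cached in memory.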
+ """ + + def __init__( + self, + index_params, + index_type: str, + index_file: Union[FileInfo, str] + ) -> None: + + self._method = index_params["method"] + self._space = index_params["space"] + self._efS = index_params.get("efS") + self._index_type = index_type + self._index_file = index_file + self._index = None + + @property + def index(self): + if self._index: + return self._index + + if self._index_type == "sparse": + self._index = nmslib.init( + method=self._method, + space=self._space, + data_type=nmslib.DataType.SPARSE_VECTOR, + ) + if isinstance(self._index_file, FileInfo): + load_index_from_source_fs( + sparse=True, + load_index=lambda path: self._index.loadIndex( + path, load_data=True + ), + source=self._index_file + ) + else: + self._index.loadIndex( + SparkFiles.get(self._index_file), load_data=True + ) + else: + self._index = nmslib.init( + method=self._method, + space=self._space, + data_type=nmslib.DataType.DENSE_VECTOR, + ) + if isinstance(self._index_file, FileInfo): + load_index_from_source_fs( + sparse=False, + load_index=lambda path: self._index.loadIndex(path), + source=self._index_file + ) + else: + self._index.loadIndex(SparkFiles.get(self._index_file)) + + if self._efS: + self._index.setQueryTimeParams({"efSearch": self._efS}) + return self._index + + +class NmslibHnswMixin(ANNMixin): + """Mixin that provides methods to build nmslib hnsw index and infer it. + Also provides methods to saving and loading index to/from disk. + """ + + def _infer_ann_index( + self, + vectors: DataFrame, + features_col: str, + params: Dict[str, Union[int, str]], + k: int, + filter_seen_items: bool, + index_dim: str = None, + index_type: str = None, + log: DataFrame = None, + ) -> DataFrame: + return self._infer_nmslib_hnsw_index( + vectors, + features_col, + params, + k, + filter_seen_items, + index_type, + log, + ) + + def _build_ann_index( + self, + vectors: DataFrame, + features_col: str, + params: Dict[str, Union[int, str]], + dim: int = None, + num_elements: int = None, + id_col: Optional[str] = None, + index_type: str = None, + items_count: Optional[int] = None, + ) -> None: + self._build_nmslib_hnsw_index( + vectors, features_col, params, index_type, items_count + ) + + def _build_nmslib_hnsw_index( + self, + item_vectors: DataFrame, + features_col: str, + params: Dict[str, Any], + index_type: str = None, + items_count: Optional[int] = None, + ) -> None: + """Builds hnsw index and dump it to hdfs or disk. + + Args: + item_vectors (DataFrame): DataFrame with item vectors + params (Dict[str, Any]): hnsw params + """ + + index_params = { + "M": params.get("M", 200), + "efConstruction": params.get("efC", 20000), + "post": params.get("post", 2000), + } + + if params["build_index_on"] == "executor": + # to execution in one executor + item_vectors = item_vectors.repartition(1) + + target_index_file = get_filesystem(params["index_path"]) + + if index_type == "sparse": + + def build_index(iterator: Iterator[pd.DataFrame]): + """Builds index on executor and writes it to shared disk or hdfs. + + Args: + iterator: iterates on dataframes with vectors/features + + """ + index = nmslib.init( + method=params["method"], + space=params["space"], + data_type=nmslib.DataType.SPARSE_VECTOR, + ) + + pdfs = [] + for pdf in iterator: + pdfs.append(pdf) + + pdf = pd.concat(pdfs, copy=False) + + # We collect all iterator values into one dataframe, + # because we cannot guarantee that `pdf` will contain rows with the same `item_idx_two`. 
+ # And therefore we cannot call the `addDataPointBatch` iteratively. + data = pdf["similarity"].values + row_ind = pdf["item_idx_two"].values + col_ind = pdf["item_idx_one"].values + + sim_matrix_tmp = csr_matrix( + (data, (row_ind, col_ind)), + shape=(items_count, items_count), + ) + index.addDataPointBatch(data=sim_matrix_tmp) + index.createIndex(index_params) + + save_index_to_destination_fs( + sparse=True, + save_index=lambda path: index.saveIndex( + path, save_data=True + ), + target=target_index_file, + ) + + yield pd.DataFrame(data={"_success": 1}, index=[0]) + + else: + + def build_index(iterator: Iterator[pd.DataFrame]): + """Builds index on executor and writes it to shared disk or hdfs. + + Args: + iterator: iterates on dataframes with vectors/features + + """ + index = nmslib.init( + method=params["method"], + space=params["space"], + data_type=nmslib.DataType.DENSE_VECTOR, + ) + for pdf in iterator: + item_vectors_np = np.squeeze(pdf[features_col].values) + index.addDataPointBatch( + data=np.stack(item_vectors_np), + ids=pdf["item_idx"].values, + ) + index.createIndex(index_params) + + save_index_to_destination_fs( + sparse=False, + save_index=lambda path: index.saveIndex(path), + target=target_index_file, + ) + + yield pd.DataFrame(data={"_success": 1}, index=[0]) + + # Here we perform materialization (`.collect()`) to build the hnsw index. + logger.info("Started building the hnsw index") + if index_type == "sparse": + item_vectors.select( + "similarity", "item_idx_one", "item_idx_two" + ).mapInPandas(build_index, "_success int").collect() + else: + item_vectors.select("item_idx", features_col).mapInPandas( + build_index, "_success int" + ).collect() + logger.info("Finished building the hnsw index") + else: + if index_type == "sparse": + item_vectors = item_vectors.toPandas() + + index = nmslib.init( + method=params["method"], + space=params["space"], + data_type=nmslib.DataType.SPARSE_VECTOR, + ) + + data = item_vectors["similarity"].values + row_ind = item_vectors["item_idx_two"].values + col_ind = item_vectors["item_idx_one"].values + + sim_matrix = csr_matrix( + (data, (row_ind, col_ind)), + shape=(items_count, items_count), + ) + index.addDataPointBatch(data=sim_matrix) + index.createIndex(index_params) + + # saving index to local temp file and sending it to executors + temp_dir = tempfile.mkdtemp() + weakref.finalize(self, shutil.rmtree, temp_dir) + self.__dict__.pop('_spark_index_file_uid', None) + tmp_file_path = os.path.join( + temp_dir, f"{INDEX_FILENAME}_{self._spark_index_file_uid}" + ) + index.saveIndex(tmp_file_path, save_data=True) + spark = SparkSession.getActiveSession() + # for the "sparse" type we need to store two files + spark.sparkContext.addFile("file://" + tmp_file_path) + spark.sparkContext.addFile("file://" + tmp_file_path + ".dat") + + else: + item_vectors = item_vectors.toPandas() + item_vectors_np = np.squeeze(item_vectors[features_col].values) + index = nmslib.init( + method=params["method"], + space=params["space"], + data_type=nmslib.DataType.DENSE_VECTOR, + ) + index.addDataPointBatch( + data=np.stack(item_vectors_np), + ids=item_vectors["item_idx"].values, + ) + index.createIndex(index_params) + + # saving index to local temp file and sending it to executors + temp_dir = tempfile.mkdtemp() + weakref.finalize(self, shutil.rmtree, temp_dir) + tmp_file_path = os.path.join( + temp_dir, f"{INDEX_FILENAME}_{self._spark_index_file_uid}" + ) + index.saveIndex(tmp_file_path) + spark = SparkSession.getActiveSession() + 
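+ # register the saved index file with SparkFiles so executors can load it at inference time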
spark.sparkContext.addFile("file://" + tmp_file_path) + + def _update_hnsw_index( + self, + item_vectors: DataFrame, + features_col: str, + params: Dict[str, Any], + ): + index = nmslib.init( + method=params["method"], + space=params["space"], + data_type=nmslib.DataType.DENSE_VECTOR, + ) + index_path = SparkFiles.get( + f"{INDEX_FILENAME}_{self._spark_index_file_uid}" + ) + index.loadIndex(index_path) + item_vectors = item_vectors.toPandas() + item_vectors_np = np.squeeze(item_vectors[features_col].values) + index.addDataPointBatch( + data=np.stack(item_vectors_np), + ids=item_vectors["id"].values, + ) + index_params = { + "M": params.get("M", 200), + "efConstruction": params.get("efC", 20000), + "post": params.get("post", 2000), + } + index.createIndex(index_params) + + # saving index to local temp file and sending it to executors + temp_dir = tempfile.mkdtemp() + tmp_file_path = os.path.join( + temp_dir, f"{INDEX_FILENAME}_{self._spark_index_file_uid}" + ) + index.saveIndex(tmp_file_path) + spark = SparkSession.getActiveSession() + spark.sparkContext.addFile("file://" + tmp_file_path) + + def _infer_nmslib_hnsw_index( + self, + user_vectors: DataFrame, + features_col: str, + params: Dict[str, Any], + k: int, + filter_seen_items: bool, + index_type: str = None, + log: DataFrame = None, + ) -> DataFrame: + + if params["build_index_on"] == "executor": + index_file = get_filesystem(params["index_path"]) + else: + index_file = f"{INDEX_FILENAME}_{self._spark_index_file_uid}" + + _index_file_manager = NmslibIndexFileManager( + params, + index_type, + index_file=index_file, + ) + + index_file_manager_broadcast = State().session.sparkContext.broadcast( + _index_file_manager + ) + + return_type = "item_idx array, distance array" + + if index_type == "sparse": + + def get_csr_matrix( + user_idx: pd.Series, + vector_items: pd.Series, + vector_relevances: pd.Series, + ) -> csr_matrix: + + return csr_matrix( + ( + vector_relevances.explode().values.astype(float), + (user_idx.repeat(vector_items.apply(lambda x: len(x))).values, + vector_items.explode().values.astype(int)), + ), + shape=(user_idx.max() + 1, vector_items.apply(lambda x: max(x)).max() + 1), + ) + + if filter_seen_items: + + @pandas_udf(return_type) + def infer_index( + user_idx: pd.Series, + vector_items: pd.Series, + vector_relevances: pd.Series, + num_items: pd.Series, + seen_item_idxs: pd.Series, + ) -> pd.DataFrame: + index_file_manager = index_file_manager_broadcast.value + index = index_file_manager.index + + # max number of items to retrieve per batch + max_items_to_retrieve = num_items.max() + + user_vectors = get_csr_matrix(user_idx, vector_items, vector_relevances) + + # take slice + m = user_vectors[user_idx.values, :] + + neighbours = index.knnQueryBatch( + m, k=k + max_items_to_retrieve, num_threads=1 + ) + + neighbours_filtered = [] + for i, (item_idxs, distances) in enumerate(neighbours): + non_seen_item_indexes = ~np.isin( + item_idxs, seen_item_idxs[i], assume_unique=True + ) + neighbours_filtered.append( + ( + (item_idxs[non_seen_item_indexes])[:k], + (distances[non_seen_item_indexes])[:k], + ) + ) + + pd_res = pd.DataFrame( + neighbours_filtered, columns=["item_idx", "distance"] + ) + + # pd_res looks like + # item_idx distances + # [1, 2, 3, ...] [-0.5, -0.3, -0.1, ...] + # [1, 3, 4, ...] [-0.1, -0.8, -0.2, ...] 
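+ # distances are whatever nmslib returns for the configured space
+ # (e.g. negated dot products for the negdotprod spaces)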
+ + return pd_res + + else: + + @pandas_udf(return_type) + def infer_index(user_idx: pd.Series, + vector_items: pd.Series, + vector_relevances: pd.Series,) -> pd.DataFrame: + + index_file_manager = index_file_manager_broadcast.value + index = index_file_manager.index + + user_vectors = get_csr_matrix(user_idx, vector_items, vector_relevances) + # take slice + m = user_vectors[user_idx.values, :] + neighbours = index.knnQueryBatch(m, num_threads=1) + + pd_res = pd.DataFrame( + neighbours, columns=["item_idx", "distance"] + ) + + # pd_res looks like + # item_idx distances + # [1, 2, 3, ...] [-0.5, -0.3, -0.1, ...] + # [1, 3, 4, ...] [-0.1, -0.8, -0.2, ...] + + return pd_res + + else: + if filter_seen_items: + + @pandas_udf(return_type) + def infer_index( + vectors: pd.Series, + num_items: pd.Series, + seen_item_idxs: pd.Series, + ) -> pd.DataFrame: + index_file_manager = index_file_manager_broadcast.value + index = index_file_manager.index + + # max number of items to retrieve per batch + max_items_to_retrieve = num_items.max() + + neighbours = index.knnQueryBatch( + np.stack(vectors.values), + k=k + max_items_to_retrieve, + num_threads=1, + ) + + neighbours_filtered = [] + for i, (item_idxs, distances) in enumerate(neighbours): + non_seen_item_indexes = ~np.isin( + item_idxs, seen_item_idxs[i], assume_unique=True + ) + neighbours_filtered.append( + ( + (item_idxs[non_seen_item_indexes])[:k], + (distances[non_seen_item_indexes])[:k], + ) + ) + + pd_res = pd.DataFrame( + neighbours_filtered, columns=["item_idx", "distance"] + ) + + return pd_res + + else: + + @pandas_udf(return_type) + def infer_index(vectors: pd.Series) -> pd.DataFrame: + index_file_manager = index_file_manager_broadcast.value + index = index_file_manager.index + + neighbours = index.knnQueryBatch( + np.stack(vectors.values), + k=k, + num_threads=1, + ) + pd_res = pd.DataFrame( + neighbours, columns=["item_idx", "distance"] + ) + + return pd_res + + cols = [] + if index_type == "sparse": + cols += ["user_idx", "vector_items", "vector_relevances"] + else: + cols.append(features_col) + if filter_seen_items: + cols = cols + ["num_items", "seen_item_idxs"] + + res = user_vectors.select( + "user_idx", + infer_index(*cols).alias("neighbours"), + ) + + res = self._unpack_infer_struct(res) + + return res + + def _save_nmslib_hnsw_index(self, path, sparse=False): + """Method save (copy) index from hdfs (or local) to `path` directory. + `path` can be an hdfs path or a local path. 
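+ For a sparse index both the index file and its companion ``.dat`` data file are copied.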
+ + Args: + path (_type_): directory where to dump (copy) the index + """ + + params = self._nmslib_hnsw_params + + if params["build_index_on"] == "executor": + index_path = params["index_path"] + elif params["build_index_on"] == "driver": + index_path = SparkFiles.get( + f"{INDEX_FILENAME}_{self._spark_index_file_uid}" + ) + else: + raise ValueError("Unknown 'build_index_on' param.") + + source = get_filesystem(index_path) + target = get_filesystem(path) + self.logger.debug(f"Index file coping from '{index_path}' to '{path}'") + + source_paths = [] + target_paths = [] + if sparse: + source_paths.append(source.path) + source_paths.append(source.path + ".dat") + index_file_target_path = os.path.join(target.path, INDEX_FILENAME) + target_paths.append(index_file_target_path) + target_paths.append(index_file_target_path + ".dat") + else: + source_paths.append(source.path) + index_file_target_path = os.path.join(target.path, INDEX_FILENAME) + target_paths.append(index_file_target_path) + + if source.filesystem == FileSystem.HDFS: + source_filesystem = fs.HadoopFileSystem.from_uri(source.hdfs_uri) + else: + source_filesystem = fs.LocalFileSystem() + if target.filesystem == FileSystem.HDFS: + destination_filesystem = fs.HadoopFileSystem.from_uri( + target.hdfs_uri + ) + else: + destination_filesystem = fs.LocalFileSystem() + + for source_path, target_path in zip(source_paths, target_paths): + fs.copy_files( + source_path, + target_path, + source_filesystem=source_filesystem, + destination_filesystem=destination_filesystem, + ) + # param use_threads=True (?) + + def _load_nmslib_hnsw_index(self, path: str, sparse=False): + """Loads hnsw index from `path` directory to local dir. + Index file name is 'hnswlib_index'. + And adds index file to the `SparkFiles`. + `path` can be a hdfs path or a local path. 
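+ For a sparse index the companion ``.dat`` data file is downloaded and registered as well.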
+ + + Args: + path: directory path, where index file is stored + """ + source = get_filesystem(path + f"/{INDEX_FILENAME}") + + temp_dir = tempfile.mkdtemp() + weakref.finalize(self, shutil.rmtree, temp_dir) + target_path = os.path.join( + temp_dir, f"{INDEX_FILENAME}_{self._spark_index_file_uid}" + ) + + source_paths = [] + target_paths = [] + if sparse: + source_paths.append(source.path) + source_paths.append(source.path + ".dat") + target_paths.append(target_path) + target_paths.append(target_path + ".dat") + else: + source_paths.append(source.path) + target_paths.append(target_path) + + if source.filesystem == FileSystem.HDFS: + source_filesystem = fs.HadoopFileSystem.from_uri(source.hdfs_uri) + else: + source_filesystem = fs.LocalFileSystem() + + destination_filesystem = fs.LocalFileSystem() + + for source_path, target_path in zip(source_paths, target_paths): + fs.copy_files( + source_path, + target_path, + source_filesystem=source_filesystem, + destination_filesystem=destination_filesystem, + ) + + spark = SparkSession.getActiveSession() + for target_path in target_paths: + spark.sparkContext.addFile("file://" + target_path) + + self._nmslib_hnsw_params["build_index_on"] = "driver" diff --git a/replay/models/slim.py b/replay/models/slim.py index e304ea02e..b865f82bf 100644 --- a/replay/models/slim.py +++ b/replay/models/slim.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Dict, Any import numpy as np import pandas as pd @@ -15,6 +15,17 @@ class SLIM(NeighbourRec): """`SLIM: Sparse Linear Methods for Top-N Recommender Systems `_""" + def _get_ann_infer_params(self) -> Dict[str, Any]: + return { + "features_col": "", + "params": self._nmslib_hnsw_params, + "index_type": "sparse", + } + + @property + def _use_ann(self) -> bool: + return self._nmslib_hnsw_params is not None + _search_space = { "beta": {"type": "loguniform", "args": [1e-6, 5]}, "lambda_": {"type": "loguniform", "args": [1e-6, 2]}, @@ -25,21 +36,55 @@ def __init__( beta: float = 0.01, lambda_: float = 0.01, seed: Optional[int] = None, + nmslib_hnsw_params: Optional[dict] = None, ): """ :param beta: l2 regularization :param lambda_: l1 regularization :param seed: random seed + :param nmslib_hnsw_params: parameters for nmslib-hnsw methods: + {"method":"hnsw", + "space":"negdotprod_sparse_fast", + "M":16,"efS":200,"efC":200, + ...} + The reasonable range of values for M parameter is 5-100, + for efC and eFS is 100-2000. + Increasing these values improves the prediction quality but increases index_time and inference_time too. 
+ We recommend using these settings: + - M=16, efC=200 and efS=200 for simple datasets like MovieLens + - M=50, efC=1000 and efS=1000 for average quality with an average prediction time + - M=75, efC=2000 and efS=2000 for the highest quality with a long prediction time + + note: choosing these parameters depends on the dataset and quality/time tradeoff + note: while reducing parameter values the highest range metrics like Metric@1000 suffer first + note: even in a case with a long training time, + profit from ann could be obtained while inference will be used multiple times + + for more details see https://github.com/nmslib/nmslib/blob/master/manual/methods.md """ if beta < 0 or lambda_ <= 0: raise ValueError("Invalid regularization parameters") self.beta = beta self.lambda_ = lambda_ self.seed = seed + self._nmslib_hnsw_params = nmslib_hnsw_params @property def _init_args(self): - return {"beta": self.beta, "lambda_": self.lambda_, "seed": self.seed} + return { + "beta": self.beta, + "lambda_": self.lambda_, + "seed": self.seed, + "nmslib_hnsw_params": self._nmslib_hnsw_params, + } + + def _save_model(self, path: str): + if self._nmslib_hnsw_params: + self._save_nmslib_hnsw_index(path, sparse=True) + + def _load_model(self, path: str): + if self._nmslib_hnsw_params: + self._load_nmslib_hnsw_index(path, sparse=True) def _fit( self, diff --git a/replay/models/ucb.py.orig b/replay/models/ucb.py.orig new file mode 100644 index 000000000..2f63aed89 --- /dev/null +++ b/replay/models/ucb.py.orig @@ -0,0 +1,219 @@ +<<<<<<< HEAD +import os + +import joblib +======= +>>>>>>> main +import math +from os.path import join +from typing import Any, Dict, List, Optional + +import joblib +from pyspark.sql import DataFrame +from pyspark.sql import functions as sf + +from replay.metrics import Metric, NDCG +from replay.models.base_rec import NonPersonalizedRecommender +<<<<<<< HEAD +from replay.utils import create_folder +======= +from replay.utils import unpersist_after, unionify +>>>>>>> main + + +class UCB(NonPersonalizedRecommender): + """Simple bandit model, which caclulate item relevance as upper confidence bound + (`UCB `_) + for the confidence interval of true fraction of positive ratings. + Should be used in iterative (online) mode to achive proper recommendation quality. + + ``relevance`` from log must be converted to binary 0-1 form. + + .. math:: + pred_i = ctr_i + \\sqrt{\\frac{c\\ln{n}}{n_i}} + + :math:`pred_i` -- predicted relevance of item :math:`i` + :math:`c` -- exploration coeficient + :math:`n` -- number of interactions in log + :math:`n_i` -- number of interactions with item :math:`i` + + >>> import pandas as pd + >>> data_frame = pd.DataFrame({"user_idx": [1, 2, 3, 3], "item_idx": [1, 2, 1, 2], "relevance": [1, 0, 0, 0]}) + >>> from replay.utils import convert2spark + >>> data_frame = convert2spark(data_frame) + >>> model = UCB() + >>> model.fit(data_frame) + >>> model.predict(data_frame,k=2,users=[1,2,3,4], items=[1,2,3] + ... ).toPandas().sort_values(["user_idx","relevance","item_idx"], + ... 
ascending=[True,False,True]).reset_index(drop=True) + user_idx item_idx relevance + 0 1 3 2.665109 + 1 1 2 1.177410 + 2 2 3 2.665109 + 3 2 1 1.677410 + 4 3 3 2.665109 + 5 4 3 2.665109 + 6 4 1 1.677410 + + """ + + # attributes which are needed for refit method + full_count: int + items_counts_aggr: DataFrame + + def __init__( + self, + exploration_coef: float = 2, + sample: bool = False, + seed: Optional[int] = None, + ): + """ + :param exploration_coef: exploration coefficient + :param sample: flag to choose recommendation strategy. + If True, items are sampled with a probability proportional + to the calculated predicted relevance. + Could be changed after model training by setting the `sample` attribute. + :param seed: random seed. Provides reproducibility if fixed + """ + # pylint: disable=super-init-not-called + self.coef = exploration_coef + self.sample = sample + self.seed = seed + self.items_counts_aggr: Optional[DataFrame] = None + self.item_popularity: Optional[DataFrame] = None + self.full_count = 0 + super().__init__(add_cold_items=True, cold_weight=1) + + @property + def _init_args(self): + return { + "exploration_coef": self.coef, + "sample": self.sample, + "seed": self.seed, + } + + @property + def _dataframes(self): + return { + "items_counts_aggr": self.items_counts_aggr, + "item_popularity": self.item_popularity + } + + def _clear_cache(self): + for df in self._dataframes.values(): + if df is not None: + df.unpersist() + + def _save_model(self, path: str): +<<<<<<< HEAD + create_folder(os.path.dirname(path), exists_ok=True) + # TODO: need to fix such saving, won't work with HDFS + joblib.dump({"fill": self.fill}, join(path)) +======= + joblib.dump({"fill": self.fill}, join(path, "params.dump")) +>>>>>>> main + + def _load_model(self, path: str): + self.fill = joblib.load(join(path, "params.dump"))["fill"] + + # pylint: disable=too-many-arguments + def optimize( + self, + train: DataFrame, + test: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + param_borders: Optional[Dict[str, List[Any]]] = None, + criterion: Metric = NDCG(), + k: int = 10, + budget: int = 10, + new_study: bool = True, + ) -> None: + """ + Searches best parameters with optuna. + + :param train: train data + :param test: test data + :param user_features: user features + :param item_features: item features + :param param_borders: a dictionary with search borders, where + key is the parameter name and value is the range of possible values + ``{param: [low, high]}``. In case of categorical parameters it is + all possible values: ``{cat_param: [cat_1, cat_2, cat_3]}``. 
+ :param criterion: metric to use for optimization + :param k: recommendation list length + :param budget: number of points to try + :param new_study: keep searching with previous study or start a new study + :return: dictionary with best parameters + """ + self.logger.warning( + "The UCB model has only exploration coefficient parameter, " + "which cannot not be directly optimized" + ) + + def _fit_partial(self, + log: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + previous_log: Optional[DataFrame] = None) -> None: + with unpersist_after(self._dataframes): + self._check_relevance(log) + self._check_relevance(previous_log) + + # we save this dataframe for the refit() method + self.items_counts_aggr = unionify( + log.select("item_idx", sf.col("relevance").alias("pos"), sf.lit(1).alias("total")), + self.items_counts_aggr + ).groupby("item_idx").agg( + sf.sum("pos").alias("pos"), + sf.sum("total").alias("total") + # sf.count("relevance").alias("total"), + ).cache() + + # we save this variable for the refit() method + self.full_count += log.count() + self.item_popularity = self.items_counts_aggr.withColumn( + "relevance", + sf.col("pos") / sf.col("total") + sf.sqrt(sf.log(sf.lit(self.coef * self.full_count)) / sf.col("total")) + ).drop("pos", "total").cache() + + self.item_popularity.cache().count() + + self.fill = 1 + math.sqrt(math.log(self.coef * self.full_count)) + + # pylint: disable=too-many-arguments + def _predict( + self, + log: DataFrame, + k: int, + users: DataFrame, + items: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + filter_seen_items: bool = True, + ) -> DataFrame: + + if self.sample: + return self._predict_with_sampling( + log=log, + k=k, + users=users, + items=items, + filter_seen_items=filter_seen_items + ) + else: + return self._predict_without_sampling( + log, k, users, items, filter_seen_items + ) + + def _predict_pairs( + self, + pairs: DataFrame, + log: Optional[DataFrame] = None, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + ) -> DataFrame: + + return pairs.join( + self.item_popularity, on="item_idx", how="left" + ).fillna(value=self.fill, subset=["relevance"]) diff --git a/replay/models/word2vec.py b/replay/models/word2vec.py index 5f82d92cd..267716472 100644 --- a/replay/models/word2vec.py +++ b/replay/models/word2vec.py @@ -1,21 +1,64 @@ -from typing import Optional +from typing import Optional, Dict, Any from pyspark.ml.feature import Word2Vec +from pyspark.ml.functions import vector_to_array from pyspark.sql import DataFrame from pyspark.sql import functions as sf from pyspark.sql import types as st from pyspark.ml.stat import Summarizer from replay.models.base_rec import Recommender, ItemVectorModel -from replay.utils import vector_dot, vector_mult, join_with_col_renaming +from replay.models.hnswlib import HnswlibMixin +from replay.utils import vector_dot, multiply_scala_udf, join_with_col_renaming # pylint: disable=too-many-instance-attributes -class Word2VecRec(Recommender, ItemVectorModel): +class Word2VecRec(Recommender, ItemVectorModel, HnswlibMixin): """ Trains word2vec model where items ar treated as words and users as sentences. 
""" + def _get_ann_infer_params(self) -> Dict[str, Any]: + return { + "features_col": "user_vector", + "params": self._hnswlib_params, + "index_dim": self.rank, + } + + def _get_vectors_to_infer_ann_inner(self, log: DataFrame, users: DataFrame) -> DataFrame: + user_vectors = self._get_user_vectors(users, log) + # converts to pandas_udf compatible format + user_vectors = user_vectors.select( + "user_idx", vector_to_array("user_vector").alias("user_vector") + ) + return user_vectors + + def _get_ann_build_params(self, log: DataFrame) -> Dict[str, Any]: + self.num_elements = log.select("item_idx").distinct().count() + self.logger.debug(f"index 'num_elements' = {self.num_elements}") + return { + "features_col": "item_vector", + "params": self._hnswlib_params, + "dim": self.rank, + "num_elements": self.num_elements, + "id_col": "item_idx" + } + + def _get_vectors_to_build_ann(self, log: DataFrame) -> DataFrame: + item_vectors = self._get_item_vectors() + item_vectors = ( + item_vectors + .select( + "item_idx", + vector_to_array("item_vector").alias("item_vector") + ) + ) + return item_vectors + + @property + def _use_ann(self) -> bool: + return self._hnswlib_params is not None + idf: DataFrame vectors: DataFrame @@ -36,6 +79,8 @@ def __init__( window_size: int = 1, use_idf: bool = False, seed: Optional[int] = None, + num_partitions: Optional[int] = None, + hnswlib_params: Optional[dict] = None, ): """ :param rank: embedding size @@ -55,6 +100,8 @@ def __init__( self.step_size = step_size self.max_iter = max_iter self._seed = seed + self._num_partitions = num_partitions + self._hnswlib_params = hnswlib_params @property def _init_args(self): @@ -66,8 +113,17 @@ def _init_args(self): "step_size": self.step_size, "max_iter": self.max_iter, "seed": self._seed, + "hnswlib_params": self._hnswlib_params, } + def _save_model(self, path: str): + if self._hnswlib_params: + self._save_hnswlib_index(path) + + def _load_model(self, path: str): + if self._hnswlib_params: + self._load_hnswlib_index(path) + def _fit( self, log: DataFrame, @@ -106,9 +162,13 @@ def _fit( self.logger.debug("Model training") + if self._num_partitions is None: + self._num_partitions = log_by_users.rdd.getNumPartitions() + word_2_vec = Word2Vec( vectorSize=self.rank, minCount=self.min_count, + numPartitions=self._num_partitions, stepSize=self.step_size, maxIter=self.max_iter, inputCol="items", @@ -151,7 +211,7 @@ def _get_user_vectors( res, self.idf, on_col_name="item_idx", how="inner" ) res = res.join( - self.vectors, + self.vectors.hint("broadcast"), how="inner", on=sf.col("item_idx") == sf.col("item"), ).drop("item") @@ -159,7 +219,7 @@ def _get_user_vectors( res.groupby("user_idx") .agg( Summarizer.mean( - vector_mult(sf.col("idf"), sf.col("vector")) + multiply_scala_udf(sf.col("idf"), sf.col("vector")) ).alias("user_vector") ) .select("user_idx", "user_vector") diff --git a/replay/scenarios/basescenario.py b/replay/scenarios/basescenario.py index eb34ccdd4..e3a7c68ce 100644 --- a/replay/scenarios/basescenario.py +++ b/replay/scenarios/basescenario.py @@ -9,7 +9,7 @@ from replay.filters import filter_by_min_count from replay.metrics import Metric, NDCG from replay.models.base_rec import BaseRecommender -from replay.utils import convert2spark, get_unique_entities +from replay.utils import convert2spark class BaseScenario(BaseRecommender): @@ -78,7 +78,7 @@ def predict( """ log = convert2spark(log) users = users or log or user_features or self.fit_users - users = get_unique_entities(users, "user_idx") + users = self._get_ids(users, 
"user_idx") hot_data = filter_by_min_count(log, self.threshold, "user_idx") hot_users = hot_data.select("user_idx").distinct() if not self.can_predict_cold_users: diff --git a/replay/scenarios/fallback.py b/replay/scenarios/fallback.py index 7612a3960..4d847451a 100644 --- a/replay/scenarios/fallback.py +++ b/replay/scenarios/fallback.py @@ -8,7 +8,7 @@ from replay.metrics import Metric, NDCG from replay.models import PopRec from replay.models.base_rec import BaseRecommender -from replay.utils import fallback, get_unique_entities +from replay.utils import fallback class Fallback(BaseRecommender): @@ -94,7 +94,7 @@ def predict( ``[user_idx, item_idx, relevance]`` """ users = users or log or user_features or self.fit_users - users = get_unique_entities(users, "user_idx") + users = self._get_ids(users, "user_idx") hot_data = filter_by_min_count(log, self.threshold, "user_idx") hot_users = hot_data.select("user_idx").distinct() hot_users = hot_users.join(self.hot_users, on="user_idx") diff --git a/replay/scenarios/two_stages/__init__.py b/replay/scenarios/two_stages/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/replay/scenarios/two_stages/reranker.py b/replay/scenarios/two_stages/reranker.py index c06c6f5c3..9ca205e54 100644 --- a/replay/scenarios/two_stages/reranker.py +++ b/replay/scenarios/two_stages/reranker.py @@ -1,18 +1,19 @@ import logging +import pickle from abc import abstractmethod from typing import Dict, Optional from lightautoml.automl.presets.tabular_presets import TabularAutoML from lightautoml.tasks import Task -from pyspark.sql import DataFrame +from pyspark.sql import DataFrame, SparkSession from replay.utils import ( convert2spark, - get_top_k_recs, + get_top_k_recs, AbleToSaveAndLoad, ) -class ReRanker: +class ReRanker(AbleToSaveAndLoad): """ Base class for models which re-rank recommendations produced by other models. May be used as a part of two-stages recommendation pipeline. 
@@ -56,6 +57,24 @@ class LamaWrap(ReRanker): Read more: https://github.com/sberbank-ai-lab/LightAutoML """ + @classmethod + def load(cls, path: str, spark: Optional[SparkSession] = None): + spark = spark or cls._get_spark_session() + row = spark.read.parquet(path).first().asDict() + model = pickle.loads(row["data"]) + wrap = LamaWrap() + wrap.model = model + return wrap + + def save(self, path: str, overwrite: bool = False, spark: Optional[SparkSession] = None): + spark = spark or self._get_spark_session() + data = pickle.dumps(self.model) + + spark.createDataFrame([{ + "classname": self.get_classname(), + "data": data + }]).write.parquet(path, mode='overwrite' if overwrite else 'error') + def __init__( self, params: Optional[Dict] = None, diff --git a/replay/scenarios/two_stages/slama_reranker.py b/replay/scenarios/two_stages/slama_reranker.py new file mode 100644 index 000000000..359494e4f --- /dev/null +++ b/replay/scenarios/two_stages/slama_reranker.py @@ -0,0 +1,217 @@ +import logging +from typing import Optional, Dict + +import mlflow +from pyspark.ml import PipelineModel, Transformer +from pyspark.ml.functions import vector_to_array +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql.pandas.functions import pandas_udf +from pyspark.sql.functions import expr +from pyspark.sql import functions as sf + +from sparklightautoml.automl.presets.tabular_presets import SparkTabularAutoML +from sparklightautoml.tasks.base import SparkTask +from sparklightautoml.utils import WrappingSelectingPipelineModel +from sparklightautoml.tasks.base import SparkTask +from pyspark.sql.types import TimestampType, DoubleType, NumericType, DateType, ArrayType, StringType + +from replay.scenarios.two_stages.reranker import ReRanker +from replay.session_handler import State +from replay.utils import get_top_k_recs, log_exec_timer, JobGroup, JobGroupWithMetrics, \ + cache_and_materialize_if_in_debug + +import pandas as pd +import numpy as np + + +logger = logging.getLogger("replay") + + +class SlamaWrap(ReRanker): + """ + LightAutoML TabularPipeline binary classification model wrapper for recommendations re-ranking. + Read more: https://github.com/sberbank-ai-lab/LightAutoML + """ + + def save(self, path: str, overwrite: bool = False, spark: Optional[SparkSession] = None): + transformer = self.model.transformer() + + if overwrite: + transformer.write().overwrite().save(path) + else: + transformer.write().save(path) + + @classmethod + def load(cls, path: str, spark: Optional[SparkSession] = None): + pipeline_model = PipelineModel.load(path) + + return SlamaWrap(transformer=pipeline_model) + + def __init__( + self, + params: Optional[Dict] = None, + config_path: Optional[str] = None, + transformer: Optional[Transformer] = None + ): + """ + Initialize LightAutoML TabularPipeline with passed params/configuration file. 
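+ Exactly one of ``transformer`` or ``params``/``config_path`` must be provided
+ (enforced by the assert below).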
+ + :param params: dict of model parameters + :param config_path: path to configuration file + """ + assert (transformer is not None) != (params is not None or config_path is not None) + + if transformer is not None: + self.model = None + self.transformer = transformer + else: + self.model = SparkTabularAutoML( + spark=State().session, + task=SparkTask("binary"), + config_path=config_path, + **(params if params is not None else {}), + ) + self.transformer = None + + @staticmethod + def handle_columns(df: DataFrame, convert_target: bool = False) -> DataFrame: + def explode_vec(col_name: str, size: int): + return [sf.col(col_name).getItem(i).alias(f'{col_name}_{i}') for i in range(size)] + + supported_types = (NumericType, TimestampType, DateType, StringType) + + wrong_type_fields = [ + field for field in df.schema.fields + if not (isinstance(field.dataType, supported_types) + or (isinstance(field.dataType, ArrayType) and isinstance(field.dataType.elementType, + NumericType))) + ] + assert len( + wrong_type_fields) == 0, f"Fields with wrong types have been found: {wrong_type_fields}. "\ + "Only the following types are supported: {supported_types} "\ + "and ArrayType with Numeric type of elements" + + array_fields = [field.name for field in df.schema.fields if isinstance(field.dataType, ArrayType)] + + arrays_to_explode = { + field.name: df.where(sf.col(field.name).isNotNull()).select(sf.size(field.name).alias("size")).first()[ + "size"] + for field in df.schema.fields if isinstance(field.dataType, ArrayType) + } + + timestamp_fields = [field.name for field in df.schema.fields if + isinstance(field.dataType, TimestampType)] + + if convert_target: + additional_columns = [sf.col('target').astype('int').alias('target')] + else: + additional_columns = [] + + df = ( + df + .select( + *(c for c in df.columns if c not in timestamp_fields + array_fields + ['target']), + *(sf.col(c).astype('int').alias(c) for c in timestamp_fields), + *additional_columns, + # *(c for c in [sf.col('target').astype('int').alias('target') if convert_target else None] if c), + *(c for f, size in arrays_to_explode.items() for c in explode_vec(f, size)) + ) + # .drop(*(f.name for f in array_fields)) + # # `withColumns` method is only available since version 3.3.0 + # .withColumns({c: sf.col(c).astype('int') for c in timestamp_fields}) + # .withColumn('target', sf.col('target').astype('int')) + ) + + return df + + def fit(self, data: DataFrame, fit_params: Optional[Dict] = None) -> None: + """ + Fit the LightAutoML TabularPipeline model with binary classification task. + Data should include negative and positive user-item pairs. + + :param data: spark dataframe with obligatory ``[user_idx, item_idx, target]`` + columns and features' columns. `Target` column should consist of zeros and ones + as the model is a binary classification model. + :param fit_params: dict of parameters to pass to model.fit() + See LightAutoML TabularPipeline fit_predict parameters. 
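handle_columns flattens the feature frame so that SLAMA only sees scalar columns: fixed-size array features are exploded into one numeric column per position and timestamp columns are cast to int. A small self-contained sketch of the same transformation, with made-up column names and values:

    from pyspark.sql import SparkSession, functions as sf

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame(
        [(1, 10, [0.1, 0.2]), (2, 11, [0.3, 0.4])],
        ["user_idx", "item_idx", "user_factors"],
    )

    # the array size is probed from the first non-null row, as handle_columns does
    size = (
        df.where(sf.col("user_factors").isNotNull())
        .select(sf.size("user_factors").alias("size"))
        .first()["size"]
    )

    df = df.select(
        "user_idx",
        "item_idx",
        *[sf.col("user_factors").getItem(i).alias(f"user_factors_{i}") for i in range(size)],
    )
    df.show()  # columns: user_idx, item_idx, user_factors_0, user_factors_1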
+ """ + + if self.transformer is not None: + raise RuntimeError("The ranker is already fitted") + + data = data.drop("user_idx", "item_idx") + + data = self.handle_columns(data, convert_target=True) + + roles = { + "target": "target", + "numeric": [field.name for field in data.schema.fields if + isinstance(field.dataType, NumericType) and field.name != 'target'], + } + + params = { + "roles": roles, + "verbose": 1, + **({} if fit_params is None else fit_params) + } + + # this part is required to cut the plan of the dataframe because it may be huge + temp_checkpoint = f"/tmp/{type(self.model).__name__}_transform.parquet" + data.write.mode("overwrite").parquet(temp_checkpoint) + data = SparkSession.getActiveSession().read.parquet(temp_checkpoint).cache() + data.write.mode('overwrite').format('noop').save() + + self.model.fit_predict(data, **params) + + data.unpersist() + + def predict(self, data: DataFrame, k: int) -> DataFrame: + """ + Re-rank data with the model and get top-k recommendations for each user. + + :param data: spark dataframe with obligatory ``[user_idx, item_idx]`` + columns and features' columns + :param k: number of recommendations for each user + :return: spark dataframe with top-k recommendations for each user + the dataframe columns are ``[user_idx, item_idx, relevance]`` + """ + self.logger.info("Starting re-ranking") + + transformer = self.transformer if self.transformer else self.model.transformer() + logger.info(f"transformer type: {str(type(transformer))}") + + data = self.handle_columns(data) + + data.write.mode("overwrite").parquet(f"/tmp/{type(self.model).__name__}_transform.parquet") + data = SparkSession.getActiveSession().read.parquet(f"/tmp/{type(self.model).__name__}_transform.parquet").cache() + data.write.mode('overwrite').format('noop').save() + + model_name = type(self.model).__name__ + + with JobGroupWithMetrics("slama_predict", f"{model_name}.infer_sec"): + sdf = transformer.transform(data) + logger.info(f"sdf.columns: {sdf.columns}") + data.unpersist() + + candidates_pred_sdf = sdf.select( + 'user_idx', + 'item_idx', + vector_to_array('prediction').getItem(1).alias('relevance') + ) + + self.logger.info("Re-ranking is finished") + + # TODO: strange, but the further process would hang without maetrialization + # probably, it may be related to optimization and lightgbm models + # need to dig deeper later + candidates_pred_sdf = candidates_pred_sdf.cache() + candidates_pred_sdf.write.mode('overwrite').format('noop').save() + + with JobGroupWithMetrics("slama_predict", "top_k_recs_sec"): + self.logger.info("top-k") + top_k_recs = get_top_k_recs( + recs=candidates_pred_sdf, k=k, id_type="idx" + ) + cache_and_materialize_if_in_debug(top_k_recs, "slama_predict_top_k_recs_sec") + + return top_k_recs diff --git a/replay/scenarios/two_stages/two_stages_scenario.py b/replay/scenarios/two_stages/two_stages_scenario.py index 26816845d..6c7ed386d 100644 --- a/replay/scenarios/two_stages/two_stages_scenario.py +++ b/replay/scenarios/two_stages/two_stages_scenario.py @@ -1,4 +1,8 @@ # pylint: disable=too-many-lines +import functools +import logging +import os +import pickle from collections.abc import Iterable from typing import Dict, Optional, Tuple, List, Union, Any @@ -12,25 +16,26 @@ from replay.models import ALSWrap, RandomRec, PopRec from replay.models.base_rec import BaseRecommender, HybridRecommender from replay.scenarios.two_stages.reranker import LamaWrap - +from replay.scenarios.two_stages.slama_reranker import SlamaWrap from replay.session_handler import 
State from replay.splitters import Splitter, UserSplitter from replay.utils import ( array_mult, cache_if_exists, - fallback, get_log_info, get_top_k_recs, - horizontal_explode, join_or_return, join_with_col_renaming, - unpersist_if_exists, + unpersist_if_exists, create_folder, save_transformer, do_path_exists, load_transformer, list_folder, JobGroup, + cache_and_materialize_if_in_debug, JobGroupWithMetrics, ) +logger = logging.getLogger("replay") + # pylint: disable=too-many-locals, too-many-arguments def get_first_level_model_features( - model: DataFrame, + model: BaseRecommender, pairs: DataFrame, user_features: Optional[DataFrame] = None, item_features: Optional[DataFrame] = None, @@ -69,7 +74,6 @@ def get_first_level_model_features( on="item_idx", ) - factors_to_explode = [] if user_factors is not None: pairs_with_features = pairs_with_features.withColumn( "user_factors", @@ -78,7 +82,6 @@ def get_first_level_model_features( sf.array([sf.lit(0.0)] * user_vector_len), ), ) - factors_to_explode.append(("user_factors", "uf")) if item_factors is not None: pairs_with_features = pairs_with_features.withColumn( @@ -88,7 +91,6 @@ def get_first_level_model_features( sf.array([sf.lit(0.0)] * item_vector_len), ), ) - factors_to_explode.append(("item_factors", "if")) if model.__str__() == "LightFMWrap": pairs_with_features = ( @@ -106,17 +108,6 @@ def get_first_level_model_features( "factors_mult", array_mult(sf.col("item_factors"), sf.col("user_factors")), ) - factors_to_explode.append(("factors_mult", "fm")) - - for col_name, feature_prefix in factors_to_explode: - col_set = set(pairs_with_features.columns) - col_set.remove(col_name) - pairs_with_features = horizontal_explode( - data_frame=pairs_with_features, - column_to_explode=col_name, - other_columns=[sf.col(column) for column in sorted(list(col_set))], - prefix=f"{prefix}_{feature_prefix}", - ) return pairs_with_features @@ -167,6 +158,7 @@ def __init__( ] = ALSWrap(rank=128), fallback_model: Optional[BaseRecommender] = PopRec(), use_first_level_models_feat: Union[List[bool], bool] = False, + second_model_type: str = "lama", second_model_params: Optional[Union[Dict, str]] = None, second_model_config_path: Optional[str] = None, num_negatives: int = 100, @@ -175,7 +167,7 @@ def __init__( user_cat_features_list: Optional[List] = None, item_cat_features_list: Optional[List] = None, custom_features_processor: HistoryBasedFeaturesProcessor = None, - seed: int = 123, + seed: int = 123 ) -> None: """ :param train_splitter: splitter to get ``first_level_train`` and ``second_level_train``. 
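The new second_model_type argument switches the second-level re-ranker between the in-memory LightAutoML wrapper and the Spark-based SLAMA wrapper, as the next hunk shows; a hedged construction sketch with illustrative parameter values:

    from replay.models import ALSWrap, ItemKNN
    from replay.scenarios.two_stages.two_stages_scenario import TwoStagesScenario

    scenario = TwoStagesScenario(
        first_level_models=[ALSWrap(rank=128), ItemKNN(num_neighbours=10)],
        use_first_level_models_feat=[True, False],   # one flag per first-level model
        second_model_type="slama",                   # "lama" selects LamaWrap instead
        second_model_params={"timeout": 3600},       # forwarded to the chosen re-ranker (assumed value)
        num_negatives=100,
    )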
@@ -234,9 +226,16 @@ def __init__( self.use_first_level_models_feat = use_first_level_models_feat - self.second_stage_model = LamaWrap( - params=second_model_params, config_path=second_model_config_path - ) + if second_model_type == "lama": + self.second_stage_model = LamaWrap( + params=second_model_params, config_path=second_model_config_path + ) + elif second_model_type == "slama": + self.second_stage_model = SlamaWrap( + params=second_model_params, config_path=second_model_config_path + ) + else: + raise Exception(f"Unsupported second model type: {second_model_type}") self.num_negatives = num_negatives if negatives_type not in ["random", "first_level"]: @@ -256,11 +255,116 @@ def __init__( ) self.seed = seed + self._job_group_id = "" + # TO DO: add save/load for scenarios @property def _init_args(self): return {} + def _save_model(self, path: str): + from replay.model_handler import save + spark = State().session + create_folder(path, exists_ok=True) + + # save features + if self.first_level_user_features_transformer is not None: + save_transformer( + self.first_level_user_features_transformer, + os.path.join(path, "first_level_user_features_transformer") + ) + + if self.first_level_item_features_transformer is not None: + save_transformer( + self.first_level_item_features_transformer, + os.path.join(path, "first_level_item_features_transformer") + ) + + if self.features_processor is not None: + save_transformer(self.features_processor, os.path.join(path, "features_processor")) + + # Save first level models + first_level_models_path = os.path.join(path, "first_level_models") + create_folder(first_level_models_path) + for i, model in enumerate(self.first_level_models): + save(model, os.path.join(first_level_models_path, f"model_{i}")) + + # save auxillary models + if self.random_model is not None: + save(self.random_model, os.path.join(path, "random_model")) + + if self.fallback_model is not None: + save(self.fallback_model, os.path.join(path, "fallback_model")) + + # save second stage model + if self.second_stage_model is not None: + save_transformer(self.second_stage_model, os.path.join(path, "second_stage_model")) + + # save general data and settings + data = { + "train_splitter": pickle.dumps(self.train_splitter), + "first_level_item_len": self.first_level_item_len, + "first_level_user_len": self.first_level_user_len, + "use_first_level_models_feat": self.use_first_level_models_feat, + "num_negatives": self.num_negatives, + "negatives_type": self.negatives_type, + "use_generated_features": self.use_generated_features, + "seed": self.seed + } + + spark.createDataFrame([data]).write.parquet(os.path.join(path, "data.parquet")) + + def _load_model(self, path: str): + from replay.model_handler import load + spark = State().session + + # load general data and settings + data = spark.read.parquet(os.path.join(path, "data.parquet")).first().asDict() + + # load transformers for features + comp_path = os.path.join(path, "first_level_user_features_transformer") + first_level_user_features_transformer = load_transformer(comp_path) if do_path_exists(comp_path) else None #TODO: check why this dir exists if user_features=None + + comp_path = os.path.join(path, "first_level_item_features_transformer") + first_level_item_features_transformer = load_transformer(comp_path) if do_path_exists(comp_path) else None #TODO same + + comp_path = os.path.join(path, "features_processor") + features_processor = load_transformer(comp_path) if do_path_exists(comp_path) else None # TODO same + + # load first level 
models + first_level_models_path = os.path.join(path, "first_level_models") + if do_path_exists(first_level_models_path): + model_paths = [ + os.path.join(first_level_models_path, model_path) + for model_path in list_folder(first_level_models_path) + ] + first_level_models = [load(model_path) for model_path in model_paths] + else: + first_level_models = None + + # load auxillary models + comp_path = os.path.join(path, "random_model") + random_model = load(comp_path) if do_path_exists(comp_path) else None + + comp_path = os.path.join(path, "fallback_model") + fallback_model = load(comp_path) if do_path_exists(comp_path) else None + + # load second stage model + comp_path = os.path.join(path, "second_stage_model") + # second_stage_model = load_transformer(comp_path) if do_path_exists(comp_path) else None # TODO: fix it + second_stage_model = None + + self.__dict__.update({ + **data, + "first_level_user_features_transformer": first_level_user_features_transformer, + "first_level_item_features_transformer": first_level_item_features_transformer, + "features_processor": features_processor, + "first_level_models": first_level_models, + "random_model": random_model, + "fallback_model": fallback_model, + "second_stage_model": second_stage_model + }) + # pylint: disable=too-many-locals def _add_features_for_second_level( self, @@ -291,31 +395,19 @@ def _add_features_for_second_level( self.first_level_user_features_transformer.transform(user_features) ) - pairs = log_to_add_features.select("user_idx", "item_idx") for idx, model in enumerate(self.first_level_models): - current_pred = self._predict_pairs_with_first_level_model( - model=model, - log=log_for_first_level_models, - pairs=pairs, - user_features=first_level_user_features_cached, - item_features=first_level_item_features_cached, - ).withColumnRenamed("relevance", f"rel_{idx}_{model}") - full_second_level_train = full_second_level_train.join( - sf.broadcast(current_pred), - on=["user_idx", "item_idx"], - how="left", - ) - if self.use_first_level_models_feat[idx]: features = get_first_level_model_features( model=model, - pairs=full_second_level_train.select( - "user_idx", "item_idx" - ), + pairs=full_second_level_train.select("user_idx", "item_idx"), user_features=first_level_user_features_cached, item_features=first_level_item_features_cached, prefix=f"m_{idx}", ) + + with JobGroup(self._job_group_id, f"features_caching_{type(self).__name__}") as job_desc: + cache_and_materialize_if_in_debug(features, job_desc) + full_second_level_train = join_with_col_renaming( left=full_second_level_train, right=features, @@ -326,9 +418,7 @@ def _add_features_for_second_level( unpersist_if_exists(first_level_user_features_cached) unpersist_if_exists(first_level_item_features_cached) - full_second_level_train_cached = full_second_level_train.fillna( - 0 - ).cache() + full_second_level_train_cached = full_second_level_train.fillna(0) self.logger.info("Adding features from the dataset") full_second_level_train = join_or_return( @@ -345,32 +435,38 @@ def _add_features_for_second_level( ) if self.use_generated_features: - if not self.features_processor.fitted: - self.features_processor.fit( - log=log_for_first_level_models, - user_features=user_features, - item_features=item_features, + with JobGroupWithMetrics(self._job_group_id, "fitting_the_feature_processor"): + if not self.features_processor.fitted: + # PERF - preventing potential losing time on repeated expensive computations + full_second_level_train = full_second_level_train.cache() + 
self.cached_list.append(full_second_level_train) + + self.features_processor.fit( + log=log_for_first_level_models, + user_features=user_features, + item_features=item_features, + ) + + self.logger.info("Adding generated features") + full_second_level_train = self.features_processor.transform( + log=full_second_level_train ) - self.logger.info("Adding generated features") - full_second_level_train = self.features_processor.transform( - log=full_second_level_train - ) self.logger.info( "Columns at second level: %s", " ".join(full_second_level_train.columns), ) - full_second_level_train_cached.unpersist() + return full_second_level_train def _split_data(self, log: DataFrame) -> Tuple[DataFrame, DataFrame]: """Write statistics""" first_level_train, second_level_train = self.train_splitter.split(log) - State().logger.debug("Log info: %s", get_log_info(log)) - State().logger.debug( + logger.info("Log info: %s", get_log_info(log)) + logger.info( "first_level_train info: %s", get_log_info(first_level_train) ) - State().logger.debug( + logger.info( "second_level_train info: %s", get_log_info(second_level_train) ) return first_level_train, second_level_train @@ -391,7 +487,8 @@ def _predict_with_first_level_model( user_features: DataFrame, item_features: DataFrame, log_to_filter: DataFrame, - ): + prediction_label: str = '' + ) -> DataFrame: """ Filter users and items using can_predict_cold_items and can_predict_cold_users, and predict """ @@ -419,23 +516,29 @@ def _predict_with_first_level_model( ).cache() max_positives_to_filter = 0 - if log_to_filter_cached.count() > 0: - max_positives_to_filter = ( - log_to_filter_cached.groupBy("user_idx") - .agg(sf.count("item_idx").alias("num_positives")) - .select(sf.max("num_positives")) - .collect()[0][0] + with JobGroupWithMetrics(self._job_group_id, "calculating_max_positives_to_filter"): + if log_to_filter_cached.count() > 0: + max_positives_to_filter = ( + log_to_filter_cached.groupBy("user_idx") + .agg(sf.count("item_idx").alias("num_positives")) + .select(sf.max("num_positives")) + .collect()[0][0] + ) + + with JobGroupWithMetrics(__file__, f"{type(model).__name__}._{prediction_label}_predict") as job_desc: + pred = model._inner_predict_wrap( + log, + k=k + max_positives_to_filter, + users=users, + items=items, + user_features=user_features, + item_features=item_features, + filter_seen_items=False, ) + cache_and_materialize_if_in_debug(pred, job_desc) - pred = model._predict( - log, - k=k + max_positives_to_filter, - users=users, - items=items, - user_features=user_features, - item_features=item_features, - filter_seen_items=False, - ) + logger.info(f"{type(model).__name__} prediction: {pred}") + logger.info(f"Length of {type(model).__name__} prediction: {pred.count()}") pred = pred.join( log_to_filter_cached.select("user_idx", "item_idx"), @@ -443,7 +546,9 @@ def _predict_with_first_level_model( how="anti", ).drop("user", "item") - log_to_filter_cached.unpersist() + # PERF - preventing potential losing time on repeated expensive computations + pred = pred.cache() + self.cached_list.extend([pred, log_to_filter_cached]) return get_top_k_recs(pred, k) @@ -493,6 +598,7 @@ def _get_first_level_candidates( user_features: DataFrame, item_features: DataFrame, log_to_filter: DataFrame, + prediction_label: str = '' ) -> DataFrame: """ Combining the base model predictions with the fallback model @@ -500,20 +606,130 @@ def _get_first_level_candidates( """ passed_arguments = locals() passed_arguments.pop("self") - candidates = 
self._predict_with_first_level_model(**passed_arguments) - if self.fallback_model is not None: - passed_arguments.pop("model") - fallback_candidates = self._predict_with_first_level_model( - model=self.fallback_model, **passed_arguments - ) + with JobGroupWithMetrics(self._job_group_id, f"{type(self).__name__}._predict_with_first_level_model"): + candidates = self._predict_with_first_level_model(**passed_arguments) + + candidates = candidates.cache() + candidates.write.mode("overwrite").format("noop").save() + self.cached_list.append(candidates) + + # TODO: temporary commenting + # if self.fallback_model is not None: + # passed_arguments.pop("model") + # with JobGroup("fit", "fallback candidates"): + # fallback_candidates = self._predict_with_first_level_model( + # model=self.fallback_model, **passed_arguments + # ) + # fallback_candidates = fallback_candidates.cache() + # + # with JobGroup("fit", "fallback"): + # # TODO: PERF - no cache and repeated computations for candidate and fallback_candidates? + # candidates = fallback( + # base=candidates, + # fill=fallback_candidates, + # k=self.num_negatives, + # ) + return candidates - candidates = fallback( - base=candidates, - fill=fallback_candidates, - k=self.num_negatives, + def _combine( + self, + log: DataFrame, + k: int, + users: DataFrame, + items: DataFrame, + user_features: DataFrame, + item_features: DataFrame, + log_to_filter: DataFrame, + mode: str, + model_names: List[str] = None, + prediction_label: str = '' + ) -> DataFrame: + + partial_dfs = [] + + for idx, model in enumerate(self.first_level_models): + with JobGroupWithMetrics(self._job_group_id, f"{type(model).__name__}._predict_with_first_level_model"): + candidates = self._predict_with_first_level_model( + model=model, + log=log, + k=k, + users=users, + items=items, + user_features=user_features, + item_features=item_features, + log_to_filter=log_to_filter, + prediction_label=prediction_label + ).withColumnRenamed("relevance", f"rel_{idx}_{model}") + + # we need this caching if mpairs will be counted later in this method + candidates = candidates.cache() + candidates.write.mode("overwrite").format("noop").save() + self.cached_list.append(candidates) + + partial_dfs.append(candidates) + + if mode == 'union': + required_pairs = ( + functools.reduce( + lambda acc, x: acc.unionByName(x), + (df.select('user_idx', 'item_idx') for df in partial_dfs) + ).distinct() ) - return candidates + else: + # "leading_" + leading_model_name = mode.split('_')[-1] + required_pairs = ( + partial_dfs[model_names.index(leading_model_name)] + .select('user_idx', 'item_idx') + .distinct() + ) + + logger.info("Selecting missing pairs") + + missing_pairs = [ + required_pairs.join(df, on=['user_idx', 'item_idx'], how='anti').select('user_idx', 'item_idx').distinct() + for df in partial_dfs + ] + + def get_rel_col(df: DataFrame) -> str: + logger.info(f"columns: {str(df.columns)}") + rel_col = [c for c in df.columns if c.startswith('rel_')][0] + return rel_col + + def make_missing_predictions(model: BaseRecommender, mpairs: DataFrame, partial_df: DataFrame) -> DataFrame: + + mpairs = mpairs.cache() + + if mpairs.count() == 0: + mpairs.unpersist() + return partial_df + + current_pred = model._predict_pairs( + mpairs, + log=log, + user_features=user_features, + item_features=item_features + ).withColumnRenamed('relevance', get_rel_col(partial_df)) + + mpairs.unpersist() + return partial_df.unionByName(current_pred.select(*partial_df.columns)) + + logger.info("Making missing predictions") + 
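The "union" mode above gathers the distinct candidate pairs produced by all first-level models; the list comprehension just below then asks each model to score the pairs it did not generate itself. A self-contained sketch of the union and anti-join steps with two made-up per-model candidate frames:

    import functools
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    pred_a = spark.createDataFrame(
        [(1, 10, 0.9), (1, 11, 0.5)], ["user_idx", "item_idx", "rel_0_ALSWrap"]
    )
    pred_b = spark.createDataFrame(
        [(1, 10, 0.7), (1, 12, 0.4)], ["user_idx", "item_idx", "rel_1_ItemKNN"]
    )

    required_pairs = functools.reduce(
        lambda acc, x: acc.unionByName(x),
        (df.select("user_idx", "item_idx") for df in [pred_a, pred_b]),
    ).distinct()

    # pairs each model still has to score (later filled in via _predict_pairs)
    missing_for_a = required_pairs.join(pred_a, on=["user_idx", "item_idx"], how="anti")
    missing_for_b = required_pairs.join(pred_b, on=["user_idx", "item_idx"], how="anti")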
extended_train_dfs = [ + make_missing_predictions(model, mpairs, partial_df) + for model, mpairs, partial_df in zip(self.first_level_models, missing_pairs, partial_dfs) + ] + + # we apply left here because some algorithms like itemknn cannot predict beyond their inbuilt top + combined_df = functools.reduce( + lambda acc, x: acc.join(x, on=['user_idx', 'item_idx'], how='left'), + extended_train_dfs + ) + + logger.info("Combination completed.") + + return combined_df # pylint: disable=too-many-locals,too-many-statements def _fit( @@ -522,13 +738,13 @@ def _fit( user_features: Optional[DataFrame] = None, item_features: Optional[DataFrame] = None, ) -> None: + self._job_group_id = "2stage_fit" self.cached_list = [] - self.logger.info("Data split") + # 1. Split train data between first and second levels + self.logger.info("Data splitting") first_level_train, second_level_positive = self._split_data(log) - # second_level_positive = second_level_positive - # .join(first_level_train.select("user_idx"), on="user_idx", how="left") self.first_level_item_len = ( first_level_train.select("item_idx").distinct().count() @@ -544,6 +760,7 @@ def _fit( [log, first_level_train, second_level_positive] ) + # 2. Transform user and item features if applicable if user_features is not None: user_features.cache() self.cached_list.append(user_features) @@ -552,8 +769,13 @@ def _fit( item_features.cache() self.cached_list.append(item_features) - self.first_level_item_features_transformer.fit(item_features) - self.first_level_user_features_transformer.fit(user_features) + with JobGroupWithMetrics(self._job_group_id, "item_features_transformer"): + if not self.first_level_item_features_transformer.fitted: + self.first_level_item_features_transformer.fit(item_features) + + with JobGroupWithMetrics(self._job_group_id, "user_features_transformer"): + if not self.first_level_user_features_transformer.fitted: + self.first_level_user_features_transformer.fit(user_features) first_level_item_features = cache_if_exists( self.first_level_item_features_transformer.transform(item_features) @@ -562,21 +784,25 @@ def _fit( self.first_level_user_features_transformer.transform(user_features) ) - for base_model in [ - *self.first_level_models, - self.random_model, - self.fallback_model, - ]: - base_model._fit_wrap( - log=first_level_train, - user_features=first_level_user_features.filter( - sf.col("user_idx") < self.first_level_user_len - ), - item_features=first_level_item_features.filter( - sf.col("item_idx") < self.first_level_item_len - ), - ) + first_level_user_features = first_level_user_features.filter(sf.col("user_idx") < self.first_level_user_len) \ + if first_level_user_features is not None else None + + first_level_item_features = first_level_item_features.filter(sf.col("item_idx") < self.first_level_item_len) \ + if first_level_item_features is not None else None + # 3. Fit first level models + logger.info(f"first_level_train: {str(first_level_train.columns)}") + + for base_model in [*self.first_level_models, self.random_model, self.fallback_model]: + with JobGroupWithMetrics(self._job_group_id, f"{type(base_model).__name__}._fit_wrap"): + base_model._fit_wrap( + log=first_level_train, + user_features=first_level_user_features, + item_features=first_level_item_features, + ) + + # 4. 
Generate negative examples + # by making predictions with first level models and combining them into final recommendation lists self.logger.info("Generate negative examples") negatives_source = ( self.first_level_models[0] @@ -584,59 +810,87 @@ def _fit( else self.random_model ) - first_level_candidates = self._get_first_level_candidates( - model=negatives_source, - log=first_level_train, - k=self.num_negatives, - users=log.select("user_idx").distinct(), - items=log.select("item_idx").distinct(), - user_features=first_level_user_features, - item_features=first_level_item_features, - log_to_filter=first_level_train, - ).select("user_idx", "item_idx") + with JobGroupWithMetrics(self._job_group_id, f"{type(self).__name__}._combine"): + first_level_candidates = self._combine( + log=first_level_train, + k=self.num_negatives, + users=log.select("user_idx").distinct(), + items=log.select("item_idx").distinct(), + user_features=first_level_user_features, + item_features=first_level_item_features, + log_to_filter=first_level_train, + mode="union", + prediction_label='1' + ) + + # may be skipped due to join caching in the end + first_level_candidates = first_level_candidates.cache() + first_level_candidates.write.mode('overwrite').format('noop').save() + self.cached_list.append(first_level_candidates) + + logger.info(f"first_level_candidates.columns: {str(first_level_candidates.columns)}") unpersist_if_exists(first_level_user_features) unpersist_if_exists(first_level_item_features) - self.logger.info("Crate train dataset for second level") + # 5. Create user/ item pairs for the train dataset of the second level (no features except relevance) + self.logger.info("Creating train dataset for second level") second_level_train = ( first_level_candidates.join( second_level_positive.select( "user_idx", "item_idx" - ).withColumn("target", sf.lit(1.0)), + ).withColumn("target", sf.lit(1)), on=["user_idx", "item_idx"], how="left", - ).fillna(0.0, subset="target") + ).fillna(0, subset="target") ).cache() self.cached_list.append(second_level_train) - self.logger.info( - "Distribution of classes in second-level train dataset:/n %s", - ( - second_level_train.groupBy("target") - .agg(sf.count(sf.col("target")).alias("count_for_class")) - .take(2) - ), - ) + # Apply negative sampling to balance postive / negative combination in the resulting train dataset + neg = second_level_train.filter(second_level_train.target == 0) + pos = second_level_train.filter(second_level_train.target == 1) + neg_new = neg.sample(fraction=10 * pos.count() / neg.count()) + second_level_train = pos.union(neg_new) + + with JobGroupWithMetrics(self._job_group_id, "inferring_class_distribution"): + self.logger.info( + "Distribution of classes in second-level train dataset:\n %s", + ( + second_level_train.groupBy("target") + .agg(sf.count(sf.col("target")).alias("count_for_class")) + .take(2) + ), + ) - self.features_processor.fit( - log=first_level_train, - user_features=user_features, - item_features=item_features, - ) + # 6. 
Fill the second level train dataset with user/item features and features of the first level models + with JobGroupWithMetrics(self._job_group_id, "feature_processor_fit"): + if not self.features_processor.fitted: + self.features_processor.fit( + log=first_level_train, + user_features=user_features, + item_features=item_features, + ) self.logger.info("Adding features to second-level train dataset") - second_level_train_to_convert = self._add_features_for_second_level( - log_to_add_features=second_level_train, - log_for_first_level_models=first_level_train, - user_features=user_features, - item_features=item_features, - ).cache() + + with JobGroupWithMetrics(self._job_group_id, "_add_features_for_second_level"): + second_level_train_to_convert = self._add_features_for_second_level( + log_to_add_features=second_level_train, + log_for_first_level_models=first_level_train, + user_features=user_features, + item_features=item_features, + ).cache() self.cached_list.append(second_level_train_to_convert) - self.second_stage_model.fit(second_level_train_to_convert) + + # 7. Fit the second level model + logger.info(f"Fitting {type(self.second_stage_model).__name__} on {second_level_train_to_convert}") + # second_level_train_to_convert.write.parquet("hdfs://node21.bdcl:9000/tmp/second_level_train_to_convert.parquet") + with JobGroupWithMetrics(self._job_group_id, f"{type(self.second_stage_model).__name__}_fitting"): + self.second_stage_model.fit(second_level_train_to_convert) + for dataframe in self.cached_list: unpersist_if_exists(dataframe) @@ -651,8 +905,11 @@ def _predict( item_features: Optional[DataFrame] = None, filter_seen_items: bool = True, ) -> DataFrame: + self._job_group_id = "2stage_predict" + self.cached_list = [] - State().logger.debug(msg="Generating candidates to rerank") + # 1. Transform user and item features if applicable + logger.debug(msg="Generating candidates to rerank") first_level_user_features = cache_if_exists( self.first_level_user_features_transformer.transform(user_features) @@ -661,35 +918,64 @@ def _predict( self.first_level_item_features_transformer.transform(item_features) ) - candidates = self._get_first_level_candidates( - model=self.first_level_models[0], - log=log, - k=self.num_negatives, - users=users, - items=items, - user_features=first_level_user_features, - item_features=first_level_item_features, - log_to_filter=log, - ).select("user_idx", "item_idx") + # 2. Create user/ item pairs for the train dataset of the second level (no features except relevance) + # by making predictions with first level models and combining them into final recommendation lists + with JobGroupWithMetrics(self._job_group_id, f"{type(self).__name__}._combine"): + candidates = self._combine( + log=log, + k=self.num_negatives, + users=users, + items=items, + user_features=first_level_user_features, + item_features=first_level_item_features, + log_to_filter=log, + mode="union", + prediction_label='2' + ) + + # PERF - preventing potential losing time on repeated expensive computations + # may be removed after testing + candidates_cached = candidates.cache() + candidates_cached.write.mode('overwrite').format('noop').save() + self.cached_list.append(candidates_cached) + + logger.info(f"2candidates.columns: {candidates.columns}") - candidates_cached = candidates.cache() unpersist_if_exists(first_level_user_features) unpersist_if_exists(first_level_item_features) + + # 3. 
Fill the second level recommendations dataset with user/item features + # and features of the first level models + self.logger.info("Adding features") - candidates_features = self._add_features_for_second_level( - log_to_add_features=candidates_cached, - log_for_first_level_models=log, - user_features=user_features, - item_features=item_features, - ) - candidates_features.cache() - candidates_cached.unpersist() - self.logger.info( - "Generated %s candidates for %s users", - candidates_features.count(), - candidates_features.select("user_idx").distinct().count(), - ) - return self.second_stage_model.predict(data=candidates_features, k=k) + with JobGroupWithMetrics(self._job_group_id, "_add_features_for_second_level"): + candidates_features = self._add_features_for_second_level( + log_to_add_features=candidates_cached, + log_for_first_level_models=log, + user_features=user_features, + item_features=item_features, + ) + + logger.info(f"rel_ columns in candidates_features: {[x for x in candidates_features.columns if 'rel_' in x]}") + + # PERF - preventing potential losing time on repeated expensive computations + candidates_features = candidates_features.cache() + self.cached_list.extend([candidates_features, candidates_cached]) + + with JobGroupWithMetrics(self._job_group_id, "candidates_features info logging"): + self.logger.info( + "Generated %s candidates for %s users", + candidates_features.count(), + candidates_features.select("user_idx").distinct().count(), + ) + + # 4. Rerank recommendations with the second level model and produce final version of recommendations + with JobGroupWithMetrics(self._job_group_id, f"{type(self.second_stage_model).__name__}_predict"): + predictions = self.second_stage_model.predict(data=candidates_features, k=k) + + logger.info(f"predictions.columns: {predictions.columns}") + + return predictions def fit_predict( self, @@ -843,3 +1129,7 @@ def optimize( unpersist_if_exists(first_level_item_features) unpersist_if_exists(first_level_user_features) return params_found, fallback_params + + def _get_nearest_items(self, items: DataFrame, metric: Optional[str] = None, + candidates: Optional[DataFrame] = None) -> Optional[DataFrame]: + raise NotImplementedError("Unsupported method") diff --git a/replay/scenarios/two_stages/two_stages_scenario.py.orig b/replay/scenarios/two_stages/two_stages_scenario.py.orig new file mode 100644 index 000000000..96d188a58 --- /dev/null +++ b/replay/scenarios/two_stages/two_stages_scenario.py.orig @@ -0,0 +1,966 @@ +# pylint: disable=too-many-lines +import os +import pickle +from collections.abc import Iterable +from typing import Dict, Optional, Tuple, List, Union, Any + +import pyspark.sql.functions as sf +from pyspark.sql import DataFrame + +from replay.constants import AnyDataFrame +from replay.data_preparator import ToNumericFeatureTransformer +from replay.history_based_fp import HistoryBasedFeaturesProcessor +from replay.metrics import Metric, Precision +from replay.models import ALSWrap, RandomRec, PopRec +from replay.models.base_rec import BaseRecommender, HybridRecommender +<<<<<<< HEAD +from replay.scenarios.two_stages.reranker import LamaWrap +======= +from replay.scenarios.two_stages.reranker import LamaWrap, ReRanker + +>>>>>>> feature/two_stage_preset +from replay.session_handler import State +from replay.splitters import Splitter, UserSplitter +from replay.utils import ( + array_mult, + cache_if_exists, + fallback, + get_log_info, + get_top_k_recs, + horizontal_explode, + join_or_return, + join_with_col_renaming, + 
unpersist_if_exists, create_folder, save_transformer, do_path_exists, load_transformer, list_folder, +) + + +# pylint: disable=too-many-locals, too-many-arguments +def get_first_level_model_features( + model: BaseRecommender, + pairs: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + add_factors_mult: bool = True, + prefix: str = "", +) -> DataFrame: + """ + Get user and item embeddings from replay model. + Can also compute elementwise multiplication between them with ``add_factors_mult`` parameter. + Zero vectors are returned if a model does not have embeddings for specific users/items. + + :param model: trained model + :param pairs: user-item pairs to get vectors for `[user_id/user_idx, item_id/item_id]` + :param user_features: user features `[user_id/user_idx, feature_1, ....]` + :param item_features: item features `[item_id/item_idx, feature_1, ....]` + :param add_factors_mult: flag to add elementwise multiplication + :param prefix: name to add to the columns + :return: DataFrame + """ + users = pairs.select("user_idx").distinct() + items = pairs.select("item_idx").distinct() + user_factors, user_vector_len = model._get_features_wrap( + users, user_features + ) + item_factors, item_vector_len = model._get_features_wrap( + items, item_features + ) + + pairs_with_features = join_or_return( + pairs, user_factors, how="left", on="user_idx" + ) + pairs_with_features = join_or_return( + pairs_with_features, + item_factors, + how="left", + on="item_idx", + ) + + factors_to_explode = [] + if user_factors is not None: + pairs_with_features = pairs_with_features.withColumn( + "user_factors", + sf.coalesce( + sf.col("user_factors"), + sf.array([sf.lit(0.0)] * user_vector_len), + ), + ) + factors_to_explode.append(("user_factors", "uf")) + + if item_factors is not None: + pairs_with_features = pairs_with_features.withColumn( + "item_factors", + sf.coalesce( + sf.col("item_factors"), + sf.array([sf.lit(0.0)] * item_vector_len), + ), + ) + factors_to_explode.append(("item_factors", "if")) + + if model.__str__() == "LightFMWrap": + pairs_with_features = ( + pairs_with_features.fillna({"user_bias": 0, "item_bias": 0}) + .withColumnRenamed("user_bias", f"{prefix}_user_bias") + .withColumnRenamed("item_bias", f"{prefix}_item_bias") + ) + + if ( + add_factors_mult + and user_factors is not None + and item_factors is not None + ): + pairs_with_features = pairs_with_features.withColumn( + "factors_mult", + array_mult(sf.col("item_factors"), sf.col("user_factors")), + ) + factors_to_explode.append(("factors_mult", "fm")) + + for col_name, feature_prefix in factors_to_explode: + col_set = set(pairs_with_features.columns) + col_set.remove(col_name) + pairs_with_features = horizontal_explode( + data_frame=pairs_with_features, + column_to_explode=col_name, + other_columns=[sf.col(column) for column in sorted(list(col_set))], + prefix=f"{prefix}_{feature_prefix}", + ) + + return pairs_with_features + + +# pylint: disable=too-many-instance-attributes +class TwoStagesScenario(HybridRecommender): + """ + *train*: + + 1) take input ``log`` and split it into first_level_train and second_level_train + default splitter splits each user's data 50/50 + 2) train ``first_stage_models`` on ``first_stage_train`` + 3) create negative examples to train second stage model using one of: + + - wrong recommendations from first stage + - random examples + + use ``num_negatives`` to specify number of negatives per user + 4) augments dataset with features: + + - get 1 level 
recommendations for positive examples + from second_level_train and for generated negative examples + - add user and item features + - generate statistical and pair features + + 5) train ``TabularAutoML`` from LightAutoML + + *inference*: + + 1) take ``log`` + 2) generate candidates, their number can be specified with ``num_candidates`` + 3) add features as in train + 4) get recommendations + + """ + + can_predict_cold_users: bool = True + can_predict_cold_items: bool = True + + # pylint: disable=too-many-arguments + def __init__( + self, + train_splitter: Splitter = UserSplitter( + item_test_size=0.5, shuffle=True, seed=42 + ), + first_level_models: Union[ + List[BaseRecommender], BaseRecommender + ] = ALSWrap(rank=128), + fallback_model: Optional[BaseRecommender] = PopRec(), + use_first_level_models_feat: Union[List[bool], bool] = False, + second_model: Optional[ReRanker] = None, + second_model_params: Optional[Union[Dict, str]] = None, + second_model_config_path: Optional[str] = None, + num_negatives: int = 100, + negatives_type: str = "first_level", + use_generated_features: bool = False, + user_cat_features_list: Optional[List] = None, + item_cat_features_list: Optional[List] = None, + custom_features_processor: HistoryBasedFeaturesProcessor = None, + seed: int = 123, + ) -> None: + """ + :param train_splitter: splitter to get ``first_level_train`` and ``second_level_train``. + Default is random 50% split. + :param first_level_models: model or a list of models + :param fallback_model: model used to fill missing recommendations at first level models + :param use_first_level_models_feat: flag or a list of flags to use + features created by first level models + : + :param second_model_params: TabularAutoML parameters + :param second_model_config_path: path to config file for TabularAutoML + :param num_negatives: number of negative examples used during train + :param negatives_type: negative examples creation strategy,``random`` + or most relevant examples from ``first-level`` + :param use_generated_features: flag to use generated features to train second level + :param user_cat_features_list: list of user categorical features + :param item_cat_features_list: list of item categorical features + :param custom_features_processor: you can pass custom feature processor + :param seed: random seed + + """ + self.train_splitter = train_splitter + self.cached_list = [] + + self.first_level_models = ( + first_level_models + if isinstance(first_level_models, Iterable) + else [first_level_models] + ) + + self.first_level_item_len = 0 + self.first_level_user_len = 0 + + self.random_model = RandomRec(seed=seed) + self.fallback_model = fallback_model + self.first_level_user_features_transformer = ( + ToNumericFeatureTransformer() + ) + self.first_level_item_features_transformer = ( + ToNumericFeatureTransformer() + ) + + if isinstance(use_first_level_models_feat, bool): + self.use_first_level_models_feat = [ + use_first_level_models_feat + ] * len(self.first_level_models) + else: + if len(self.first_level_models) != len( + use_first_level_models_feat + ): + raise ValueError( + f"For each model from first_level_models specify " + f"flag to use first level features." 
+ f"Length of first_level_models is {len(first_level_models)}, " + f"Length of use_first_level_models_feat is {len(use_first_level_models_feat)}" + ) + + self.use_first_level_models_feat = use_first_level_models_feat + + assert (second_model is not None) != (second_model_params or second_model_config_path), \ + "Either second_model should be defined or params for second models" + if second_model: + self.second_stage_model = second_model + else: + self.second_stage_model = LamaWrap( + params=second_model_params, config_path=second_model_config_path + ) + + self.num_negatives = num_negatives + if negatives_type not in ["random", "first_level"]: + raise ValueError( + f"Invalid negatives_type value: {negatives_type}. Use 'random' or 'first_level'" + ) + self.negatives_type = negatives_type + + self.use_generated_features = use_generated_features + self.features_processor = ( + custom_features_processor + if custom_features_processor + else HistoryBasedFeaturesProcessor( + user_cat_features_list=user_cat_features_list, + item_cat_features_list=item_cat_features_list, + ) + ) + self.seed = seed + + # TO DO: add save/load for scenarios + @property + def _init_args(self): + return {} + + def _save_model(self, path: str): + from replay.model_handler import save + spark = State().session + create_folder(path) + + # save features + if self.first_level_user_features_transformer is not None: + save_transformer( + self.first_level_user_features_transformer, + os.path.join(path, "first_level_user_features_transformer") + ) + + if self.first_level_item_features_transformer is not None: + save_transformer( + self.first_level_item_features_transformer, + os.path.join(path, "first_level_item_features_transformer") + ) + + if self.features_processor is not None: + save_transformer(self.features_processor, os.path.join(path, "features_processor")) + + # Save first level models + first_level_models_path = os.path.join(path, "first_level_models") + create_folder(first_level_models_path) + for i, model in enumerate(self.first_level_models): + save(model, os.path.join(first_level_models_path, f"model_{i}")) + + # save auxillary models + if self.random_model is not None: + save(self.random_model, os.path.join(path, "random_model")) + + if self.fallback_model is not None: + save(self.fallback_model, os.path.join(path, "fallback_model")) + + # save second stage model + if self.second_stage_model is not None: + save_transformer(self.second_stage_model, os.path.join(path, "second_stage_model")) + + # save general data and settings + data = { + "train_splitter": pickle.dumps(self.train_splitter), + "first_level_item_len": self.first_level_item_len, + "first_level_user_len": self.first_level_user_len, + "use_first_level_models_feat": self.use_first_level_models_feat, + "num_negatives": self.num_negatives, + "negatives_type": self.negatives_type, + "use_generated_features": self.use_generated_features, + "seed": self.seed + } + + spark.createDataFrame([data]).write.parquet(os.path.join(path, "data.parquet")) + + def _load_model(self, path: str): + from replay.model_handler import load + spark = State().session + + # load general data and settings + data = spark.read.parquet(os.path.join(path, "data.parquet")).first().asDict() + + # load transformers for features + comp_path = os.path.join(path, "first_level_user_features_transformer") + first_level_user_features_transformer = load_transformer(comp_path) if do_path_exists(comp_path) else None + + comp_path = os.path.join(path, "first_level_item_features_transformer") + 
first_level_item_features_transformer = load_transformer(comp_path) if do_path_exists(comp_path) else None + + comp_path = os.path.join(path, "features_processor") + features_processor = load_transformer(comp_path) if do_path_exists(comp_path) else None + + # load first level models + first_level_models_path = os.path.join(path, "first_level_models") + if do_path_exists(first_level_models_path): + model_paths = [ + os.path.join(first_level_models_path, model_path) + for model_path in list_folder(first_level_models_path) + ] + first_level_models = [load(model_path) for model_path in model_paths] + else: + first_level_models = None + + # load auxillary models + comp_path = os.path.join(path, "random_model") + random_model = load(comp_path) if do_path_exists(comp_path) else None + + comp_path = os.path.join(path, "fallback_model") + fallback_model = load(comp_path) if do_path_exists(comp_path) else None + + # load second stage model + comp_path = os.path.join(path, "second_stage_model") + second_stage_model = load_transformer(comp_path) if do_path_exists(comp_path) else None + + self.__dict__.update({ + **data, + "first_level_user_features_transformer": first_level_user_features_transformer, + "first_level_item_features_transformer": first_level_item_features_transformer, + "features_processor": features_processor, + "first_level_models": first_level_models, + "random_model": random_model, + "fallback_model": fallback_model, + "second_stage_model": second_stage_model + }) + + # pylint: disable=too-many-locals + def _add_features_for_second_level( + self, + log_to_add_features: DataFrame, + log_for_first_level_models: DataFrame, + user_features: DataFrame, + item_features: DataFrame, + ) -> DataFrame: + """ + Added features are: + - relevance from first level models + - user and item features from first level models + - dataset features + - FeatureProcessor features + + :param log_to_add_features: input DataFrame``[user_idx, item_idx, timestamp, relevance]`` + :param log_for_first_level_models: DataFrame``[user_idx, item_idx, timestamp, relevance]`` + :param user_features: user features``[user_idx]`` + feature columns + :param item_features: item features``[item_idx]`` + feature columns + :return: DataFrame + """ + self.logger.info("Generating features") + full_second_level_train = log_to_add_features + first_level_item_features_cached = cache_if_exists( + self.first_level_item_features_transformer.transform(item_features) + ) + first_level_user_features_cached = cache_if_exists( + self.first_level_user_features_transformer.transform(user_features) + ) + + pairs = log_to_add_features.select("user_idx", "item_idx") + for idx, model in enumerate(self.first_level_models): + current_pred = self._predict_pairs_with_first_level_model( + model=model, + log=log_for_first_level_models, + pairs=pairs, + user_features=first_level_user_features_cached, + item_features=first_level_item_features_cached, + ).withColumnRenamed("relevance", f"rel_{idx}_{model}") + full_second_level_train = full_second_level_train.join( + sf.broadcast(current_pred), + on=["user_idx", "item_idx"], + how="left", + ) + + if self.use_first_level_models_feat[idx]: + features = get_first_level_model_features( + model=model, + pairs=full_second_level_train.select( + "user_idx", "item_idx" + ), + user_features=first_level_user_features_cached, + item_features=first_level_item_features_cached, + prefix=f"m_{idx}", + ) + full_second_level_train = join_with_col_renaming( + left=full_second_level_train, + right=features, + 
on_col_name=["user_idx", "item_idx"], + how="left", + ) + + unpersist_if_exists(first_level_user_features_cached) + unpersist_if_exists(first_level_item_features_cached) + + full_second_level_train_cached = full_second_level_train.fillna( + 0 + ).cache() + + self.logger.info("Adding features from the dataset") + full_second_level_train = join_or_return( + full_second_level_train_cached, + user_features, + on="user_idx", + how="left", + ) + full_second_level_train = join_or_return( + full_second_level_train, + item_features, + on="item_idx", + how="left", + ) + + if self.use_generated_features: + if not self.features_processor.fitted: + self.features_processor.fit( + log=log_for_first_level_models, + user_features=user_features, + item_features=item_features, + ) + self.logger.info("Adding generated features") + full_second_level_train = self.features_processor.transform( + log=full_second_level_train + ) + + self.logger.info( + "Columns at second level: %s", + " ".join(full_second_level_train.columns), + ) + full_second_level_train_cached.unpersist() + return full_second_level_train + + def _split_data(self, log: DataFrame) -> Tuple[DataFrame, DataFrame]: + """Write statistics""" + first_level_train, second_level_train = self.train_splitter.split(log) + State().logger.debug("Log info: %s", get_log_info(log)) + State().logger.debug( + "first_level_train info: %s", get_log_info(first_level_train) + ) + State().logger.debug( + "second_level_train info: %s", get_log_info(second_level_train) + ) + return first_level_train, second_level_train + + @staticmethod + def _filter_or_return(dataframe, condition): + if dataframe is None: + return dataframe + return dataframe.filter(condition) + + def _predict_with_first_level_model( + self, + model: BaseRecommender, + log: DataFrame, + k: int, + users: DataFrame, + items: DataFrame, + user_features: DataFrame, + item_features: DataFrame, + log_to_filter: DataFrame, + ): + """ + Filter users and items using can_predict_cold_items and can_predict_cold_users, and predict + """ + if not model.can_predict_cold_items: + log, items, item_features = [ + self._filter_or_return( + dataframe=df, + condition=sf.col("item_idx") < self.first_level_item_len, + ) + for df in [log, items, item_features] + ] + if not model.can_predict_cold_users: + log, users, user_features = [ + self._filter_or_return( + dataframe=df, + condition=sf.col("user_idx") < self.first_level_user_len, + ) + for df in [log, users, user_features] + ] + + log_to_filter_cached = join_with_col_renaming( + left=log_to_filter, + right=users, + on_col_name="user_idx", + ).cache() + max_positives_to_filter = 0 + + if log_to_filter_cached.count() > 0: + max_positives_to_filter = ( + log_to_filter_cached.groupBy("user_idx") + .agg(sf.count("item_idx").alias("num_positives")) + .select(sf.max("num_positives")) + .collect()[0][0] + ) + + pred = model._predict( + log, + k=k + max_positives_to_filter, + users=users, + items=items, + user_features=user_features, + item_features=item_features, + filter_seen_items=False, + ) + + pred = pred.join( + log_to_filter_cached.select("user_idx", "item_idx"), + on=["user_idx", "item_idx"], + how="anti", + ).drop("user", "item") + + log_to_filter_cached.unpersist() + + return get_top_k_recs(pred, k) + + def _predict_pairs_with_first_level_model( + self, + model: BaseRecommender, + log: DataFrame, + pairs: DataFrame, + user_features: DataFrame, + item_features: DataFrame, + ): + """ + Get relevance for selected user-item pairs. 
+ """ + if not model.can_predict_cold_items: + log, pairs, item_features = [ + self._filter_or_return( + dataframe=df, + condition=sf.col("item_idx") < self.first_level_item_len, + ) + for df in [log, pairs, item_features] + ] + if not model.can_predict_cold_users: + log, pairs, user_features = [ + self._filter_or_return( + dataframe=df, + condition=sf.col("user_idx") < self.first_level_user_len, + ) + for df in [log, pairs, user_features] + ] + + return model._predict_pairs( + pairs=pairs, + log=log, + user_features=user_features, + item_features=item_features, + ) + + # pylint: disable=unused-argument + def _get_first_level_candidates( + self, + model: BaseRecommender, + log: DataFrame, + k: int, + users: DataFrame, + items: DataFrame, + user_features: DataFrame, + item_features: DataFrame, + log_to_filter: DataFrame, + ) -> DataFrame: + """ + Combining the base model predictions with the fallback model + predictions. + """ + passed_arguments = locals() + passed_arguments.pop("self") + candidates = self._predict_with_first_level_model(**passed_arguments) + + if self.fallback_model is not None: + passed_arguments.pop("model") + fallback_candidates = self._predict_with_first_level_model( + model=self.fallback_model, **passed_arguments + ) + + candidates = fallback( + base=candidates, + fill=fallback_candidates, + k=self.num_negatives, + ) + return candidates + + # pylint: disable=too-many-locals,too-many-statements + def _fit( + self, + log: DataFrame, + user_features: Optional[DataFrame] = None, + item_features: Optional[DataFrame] = None, + ) -> None: + + self.cached_list = [] + + self.logger.info("Data split") + first_level_train, second_level_positive = self._split_data(log) + # second_level_positive = second_level_positive + # .join(first_level_train.select("user_idx"), on="user_idx", how="left") + + self.first_level_item_len = ( + first_level_train.select("item_idx").distinct().count() + ) + self.first_level_user_len = ( + first_level_train.select("user_idx").distinct().count() + ) + + log.cache() + first_level_train.cache() + second_level_positive.cache() + self.cached_list.extend( + [log, first_level_train, second_level_positive] + ) + + if user_features is not None: + user_features.cache() + self.cached_list.append(user_features) + + if item_features is not None: + item_features.cache() + self.cached_list.append(item_features) + + if not self.first_level_item_features_transformer.fitted: + self.first_level_item_features_transformer.fit(item_features) + + if not self.first_level_user_features_transformer.fitted: + self.first_level_user_features_transformer.fit(user_features) + + first_level_item_features = cache_if_exists( + self.first_level_item_features_transformer.transform(item_features) + ) + first_level_user_features = cache_if_exists( + self.first_level_user_features_transformer.transform(user_features) + ) + + first_level_user_features = first_level_user_features.filter(sf.col("user_idx") < self.first_level_user_len) \ + if first_level_user_features is not None else None + + first_level_item_features = first_level_item_features.filter(sf.col("item_idx") < self.first_level_item_len) \ + if first_level_item_features is not None else None + + for base_model in [ + *self.first_level_models, + self.random_model, + self.fallback_model, + ]: + base_model._fit_wrap( + log=first_level_train, + user_features=first_level_user_features, + item_features=first_level_item_features, + ) + + self.logger.info("Generate negative examples") + negatives_source = ( + self.first_level_models[0] + 
if self.negatives_type == "first_level"
+            else self.random_model
+        )
+
+        first_level_candidates = self._get_first_level_candidates(
+            model=negatives_source,
+            log=first_level_train,
+            k=self.num_negatives,
+            users=log.select("user_idx").distinct(),
+            items=log.select("item_idx").distinct(),
+            user_features=first_level_user_features,
+            item_features=first_level_item_features,
+            log_to_filter=first_level_train,
+        ).select("user_idx", "item_idx")
+
+        unpersist_if_exists(first_level_user_features)
+        unpersist_if_exists(first_level_item_features)
+
+        self.logger.info("Create train dataset for second level")
+
+        second_level_train = (
+            first_level_candidates.join(
+                second_level_positive.select(
+                    "user_idx", "item_idx"
+                ).withColumn("target", sf.lit(1.0)),
+                on=["user_idx", "item_idx"],
+                how="left",
+            ).fillna(0.0, subset="target")
+        ).cache()
+
+        self.cached_list.append(second_level_train)
+
+        self.logger.info(
+            "Distribution of classes in second-level train dataset:\n %s",
+            (
+                second_level_train.groupBy("target")
+                .agg(sf.count(sf.col("target")).alias("count_for_class"))
+                .take(2)
+            ),
+        )
+
+        if not self.features_processor.fitted:
+            self.features_processor.fit(
+                log=first_level_train,
+                user_features=user_features,
+                item_features=item_features,
+            )
+
+        self.logger.info("Adding features to second-level train dataset")
+        second_level_train_to_convert = self._add_features_for_second_level(
+            log_to_add_features=second_level_train,
+            log_for_first_level_models=first_level_train,
+            user_features=user_features,
+            item_features=item_features,
+        ).cache()
+
+        self.cached_list.append(second_level_train_to_convert)
+        self.second_stage_model.fit(second_level_train_to_convert)
+        for dataframe in self.cached_list:
+            unpersist_if_exists(dataframe)
+
+    # pylint: disable=too-many-arguments
+    def _predict(
+        self,
+        log: DataFrame,
+        k: int,
+        users: DataFrame,
+        items: DataFrame,
+        user_features: Optional[DataFrame] = None,
+        item_features: Optional[DataFrame] = None,
+        filter_seen_items: bool = True,
+    ) -> DataFrame:
+
+        State().logger.debug(msg="Generating candidates to rerank")
+
+        first_level_user_features = cache_if_exists(
+            self.first_level_user_features_transformer.transform(user_features)
+        )
+        first_level_item_features = cache_if_exists(
+            self.first_level_item_features_transformer.transform(item_features)
+        )
+
+        candidates = self._get_first_level_candidates(
+            model=self.first_level_models[0],
+            log=log,
+            k=self.num_negatives,
+            users=users,
+            items=items,
+            user_features=first_level_user_features,
+            item_features=first_level_item_features,
+            log_to_filter=log,
+        ).select("user_idx", "item_idx")
+
+        candidates_cached = candidates.cache()
+        unpersist_if_exists(first_level_user_features)
+        unpersist_if_exists(first_level_item_features)
+        self.logger.info("Adding features")
+        candidates_features = self._add_features_for_second_level(
+            log_to_add_features=candidates_cached,
+            log_for_first_level_models=log,
+            user_features=user_features,
+            item_features=item_features,
+        )
+        candidates_features.cache()
+        candidates_cached.unpersist()
+        self.logger.info(
+            "Generated %s candidates for %s users",
+            candidates_features.count(),
+            candidates_features.select("user_idx").distinct().count(),
+        )
+        return self.second_stage_model.predict(data=candidates_features, k=k)
+
+    def fit_predict(
+        self,
+        log: AnyDataFrame,
+        k: int,
+        users: Optional[Union[AnyDataFrame, Iterable]] = None,
+        items: Optional[Union[AnyDataFrame, Iterable]] = None,
+        user_features: Optional[AnyDataFrame] = None,
+        item_features: Optional[AnyDataFrame] = None,
+        filter_seen_items: bool = True,
+    ) -> DataFrame:
+        """
+        :param log: input DataFrame ``[user_id, item_id, timestamp, relevance]``
+        :param k: length of a recommendation list, must be smaller than the number of ``items``
+        :param users: users to get recommendations for
+        :param items: items to get recommendations for
+        :param user_features: user features ``[user_id]`` + feature columns
+        :param item_features: item features ``[item_id]`` + feature columns
+        :param filter_seen_items: flag to remove seen items from recommendations
+        :return: DataFrame ``[user_id, item_id, relevance]``
+        """
+        self.fit(log, user_features, item_features)
+        return self.predict(
+            log,
+            k,
+            users,
+            items,
+            user_features,
+            item_features,
+            filter_seen_items,
+        )
+
+    @staticmethod
+    def _optimize_one_model(
+        model: BaseRecommender,
+        train: AnyDataFrame,
+        test: AnyDataFrame,
+        user_features: Optional[AnyDataFrame] = None,
+        item_features: Optional[AnyDataFrame] = None,
+        param_borders: Optional[Dict[str, List[Any]]] = None,
+        criterion: Metric = Precision(),
+        k: int = 10,
+        budget: int = 10,
+        new_study: bool = True,
+    ):
+        params = model.optimize(
+            train,
+            test,
+            user_features,
+            item_features,
+            param_borders,
+            criterion,
+            k,
+            budget,
+            new_study,
+        )
+        return params
+
+    # pylint: disable=too-many-arguments, too-many-locals
+    def optimize(
+        self,
+        train: AnyDataFrame,
+        test: AnyDataFrame,
+        user_features: Optional[AnyDataFrame] = None,
+        item_features: Optional[AnyDataFrame] = None,
+        param_borders: Optional[List[Dict[str, List[Any]]]] = None,
+        criterion: Metric = Precision(),
+        k: int = 10,
+        budget: int = 10,
+        new_study: bool = True,
+    ) -> Tuple[List[Dict[str, Any]], Optional[Dict[str, Any]]]:
+        """
+        Optimize first level models with optuna.
+
+        :param train: train DataFrame ``[user_id, item_id, timestamp, relevance]``
+        :param test: test DataFrame ``[user_id, item_id, timestamp, relevance]``
+        :param user_features: user features ``[user_id]`` + feature columns
+        :param item_features: item features ``[item_id]`` + feature columns
+        :param param_borders: list with param grids for first level models and a fallback model.
+            Empty dict skips optimization for that model.
+            Param grid is a dict ``{param: [low, high]}``.
+ :param criterion: metric to optimize + :param k: length of a recommendation list + :param budget: number of points to train each model + :param new_study: keep searching with previous study or start a new study + :return: list of dicts of parameters + """ + number_of_models = len(self.first_level_models) + if self.fallback_model is not None: + number_of_models += 1 + if number_of_models != len(param_borders): + raise ValueError( + "Provide search grid or None for every first level model" + ) + + first_level_user_features_tr = ToNumericFeatureTransformer() + first_level_user_features = first_level_user_features_tr.fit_transform( + user_features + ) + first_level_item_features_tr = ToNumericFeatureTransformer() + first_level_item_features = first_level_item_features_tr.fit_transform( + item_features + ) + + first_level_user_features = cache_if_exists(first_level_user_features) + first_level_item_features = cache_if_exists(first_level_item_features) + + params_found = [] + for i, model in enumerate(self.first_level_models): + if param_borders[i] is None or ( + isinstance(param_borders[i], dict) and param_borders[i] + ): + self.logger.info( + "Optimizing first level model number %s, %s", + i, + model.__str__(), + ) + params_found.append( + self._optimize_one_model( + model=model, + train=train, + test=test, + user_features=first_level_user_features, + item_features=first_level_item_features, + param_borders=param_borders[i], + criterion=criterion, + k=k, + budget=budget, + new_study=new_study, + ) + ) + else: + params_found.append(None) + + if self.fallback_model is None or ( + isinstance(param_borders[-1], dict) and not param_borders[-1] + ): + return params_found, None + + self.logger.info("Optimizing fallback-model") + fallback_params = self._optimize_one_model( + model=self.fallback_model, + train=train, + test=test, + user_features=first_level_user_features, + item_features=first_level_item_features, + param_borders=param_borders[-1], + criterion=criterion, + new_study=new_study, + ) + unpersist_if_exists(first_level_item_features) + unpersist_if_exists(first_level_user_features) + return params_found, fallback_params diff --git a/replay/session_handler.py b/replay/session_handler.py index 9a2d26c5b..0cb508680 100644 --- a/replay/session_handler.py +++ b/replay/session_handler.py @@ -24,6 +24,9 @@ def get_spark_session( 70% of RAM by default. 
:param shuffle_partitions: number of partitions for Spark; triple CPU count by default """ + if os.environ.get("SCRIPT_ENV", None) == "cluster": + return SparkSession.builder.getOrCreate() + os.environ["PYSPARK_PYTHON"] = sys.executable os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable @@ -46,6 +49,7 @@ def get_spark_session( .config("spark.driver.host", "localhost") .config("spark.sql.execution.arrow.pyspark.enabled", "true") .config("spark.kryoserializer.buffer.max", "256m") + .config("spark.files.overwrite", "true") .master("local[*]") .enableHiveSupport() .getOrCreate() diff --git a/replay/utils.py b/replay/utils.py index f0451ac86..c72a63870 100644 --- a/replay/utils.py +++ b/replay/utils.py @@ -1,20 +1,34 @@ -from typing import Any, Iterable, List, Optional, Set, Tuple, Union - import collections +import logging +import os +import pickle +import shutil +from abc import ABC, abstractmethod +from contextlib import contextmanager +from dataclasses import dataclass +from datetime import datetime +from enum import Enum +from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import Iterable + import numpy as np import pandas as pd import pyspark.sql.types as st from numpy.random import default_rng +from pyarrow import fs from pyspark.ml.linalg import DenseVector, Vectors, VectorUDT -from pyspark.sql import Column, DataFrame, Window, functions as sf +from pyspark.sql import SparkSession, Column, DataFrame, Window, functions as sf from scipy.sparse import csr_matrix from replay.constants import AnyDataFrame, NumType, REC_SCHEMA from replay.session_handler import State +from pyspark.sql.column import _to_java_column, _to_seq # pylint: disable=invalid-name +logger = logging.getLogger("replay") + def convert2spark(data_frame: Optional[AnyDataFrame]) -> Optional[DataFrame]: """ @@ -128,6 +142,30 @@ def get_top_k_recs(recs: DataFrame, k: int, id_type: str = "idx") -> DataFrame: ) +def delete_folder(path: str): + file_info = get_filesystem(path) + + if file_info.filesystem == FileSystem.HDFS: + fs.HadoopFileSystem.from_uri(file_info.hdfs_uri).delete_dir(path) + else: + fs.LocalFileSystem().delete_dir(file_info.path) + + +def create_folder(path: str, delete_if_exists: bool = False, exists_ok: bool = False): + file_info = get_filesystem(path) + + is_exists = do_path_exists(path) + if is_exists and delete_if_exists: + delete_folder(path) + elif is_exists and not exists_ok: + raise FileExistsError(f"The path already exists: {path}") + + if file_info.filesystem == FileSystem.HDFS: + fs.HadoopFileSystem.from_uri(file_info.hdfs_uri).create_dir(file_info.path) + else: + fs.LocalFileSystem().create_dir(file_info.path) + + @sf.udf(returnType=st.DoubleType()) def vector_dot(one: DenseVector, two: DenseVector) -> float: """ @@ -208,6 +246,20 @@ def vector_mult( return one * two +def multiply_scala_udf(scalar, vector): + """Multiplies a scalar by a vector + + Args: + scalar: column with scalars + vector: column with vectors + + Returns: column expression + """ + sc = SparkSession.getActiveSession().sparkContext + _f = sc._jvm.org.apache.spark.replay.utils.ScalaPySparkUDFs.multiplyUDF() + return Column(_f.apply(_to_seq(sc, [scalar, vector], _to_java_column))) + + @sf.udf(returnType=st.ArrayType(st.DoubleType())) def array_mult(first: st.ArrayType, second: st.ArrayType): """ @@ -724,6 +776,136 @@ def drop_temp_view(temp_view_name: str) -> None: spark.catalog.dropTempView(temp_view_name) +class log_exec_timer: + def __init__(self, name: Optional[str] = None): + self.name = name + 
self._start = None
+        self._duration = None
+
+    def __enter__(self):
+        self._start = datetime.now()
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self._duration = (datetime.now() - self._start).total_seconds()
+        msg = (
+            f"Exec time of {self.name}: {self._duration}"
+            if self.name
+            else f"Exec time: {self._duration}"
+        )
+        logger.info(msg)
+
+    @property
+    def duration(self):
+        return self._duration
+
+
+@contextmanager
+def JobGroup(group_id: str, description: str):
+    sc = SparkSession.getActiveSession().sparkContext
+    sc.setJobGroup(group_id, description)
+    yield f"{group_id} - {description}"
+    sc._jsc.clearJobGroup()
+
+
+def cache_and_materialize_if_in_debug(df: DataFrame, description: str = "no-desc"):
+    if os.environ.get("REPLAY_DEBUG_MODE", None):
+        with log_exec_timer(description):
+            df = df.cache()
+            df.write.mode('overwrite').format('noop').save()
+
+
+@contextmanager
+def JobGroupWithMetrics(group_id: str, description: str):
+    metric_name = f"{group_id}__{description}"
+    with JobGroup(group_id, description), log_exec_timer(metric_name) as timer:
+        yield
+
+    if os.environ.get("REPLAY_DEBUG_MODE", None):
+        import mlflow
+        mlflow.log_metric(timer.name, timer.duration)
+
+
+def get_number_of_allocated_executors(spark: SparkSession):
+    sc = spark._jsc.sc()
+    return (
+        len(
+            [
+                executor.host()
+                for executor in sc.statusTracker().getExecutorInfos()
+            ]
+        )
+        - 1
+    )
+
+
+class FileSystem(Enum):
+    HDFS = 1
+    LOCAL = 2
+
+
+def get_default_fs() -> str:
+    spark = SparkSession.getActiveSession()
+    hadoop_conf = spark._jsc.hadoopConfiguration()
+    default_fs = hadoop_conf.get("fs.defaultFS")
+    logger.debug(f"hadoop_conf.get('fs.defaultFS'): {default_fs}")
+    return default_fs
+
+
+@dataclass(frozen=True)
+class FileInfo:
+    path: str
+    filesystem: FileSystem
+    hdfs_uri: Optional[str] = None
+
+
+def get_filesystem(path: str) -> FileInfo:
+    """Analyzes the path and the hadoop config and returns a `FileInfo` with the
+    filesystem type, the hdfs uri (if the filesystem is HDFS)
+    and the cleaned path (without the scheme prefix).
+
+    For example:
+
+    >>> path = 'hdfs://node21.bdcl:9000/tmp/file'
+    >>> get_filesystem(path)
+    FileInfo(path='/tmp/file', filesystem=<FileSystem.HDFS: 1>, hdfs_uri='hdfs://node21.bdcl:9000')
+    or
+    >>> path = 'file:///tmp/file'
+    >>> get_filesystem(path)
+    FileInfo(path='/tmp/file', filesystem=<FileSystem.LOCAL: 2>, hdfs_uri=None)
+
+    Args:
+        path (str): path to file on hdfs or local disk
+
+    Returns:
+        FileInfo: filesystem type, cleaned path (without the scheme prefix)
+        and hdfs uri (if the filesystem is HDFS)
+    """
+    prefix_len = 7  # 'hdfs://' and 'file://' length
+    if path.startswith("hdfs://"):
+        if path.startswith("hdfs:///"):
+            default_fs = get_default_fs()
+            if default_fs.startswith("hdfs://"):
+                return FileInfo(path[prefix_len:], FileSystem.HDFS, default_fs)
+            else:
+                raise Exception(
+                    f"Can't get default hdfs uri for path = '{path}'. "
+                    "Specify an explicit path, such as 'hdfs://host:port/dir/file', "
+                    "or set 'fs.defaultFS' in hadoop configuration."
+ ) + else: + hostname = path[prefix_len:].split("/", 1)[0] + hdfs_uri = "hdfs://" + hostname + return FileInfo(path[len(hdfs_uri):], FileSystem.HDFS, hdfs_uri) + elif path.startswith("file://"): + return FileInfo(path[prefix_len:], FileSystem.LOCAL) + else: + default_fs = get_default_fs() + if default_fs.startswith("hdfs://"): + return FileInfo(path, FileSystem.HDFS, default_fs) + else: + return FileInfo(path, FileSystem.LOCAL) + + def sample_top_k_recs(pairs: DataFrame, k: int, seed: int = None): """ Sample k items for each user with probability proportional to the relevance score. @@ -847,3 +1029,167 @@ def return_recs( recs.write.parquet(path=recs_file_path, mode="overwrite") return None + + +def unionify(df: DataFrame, df_2: Optional[DataFrame] = None) -> DataFrame: + if df_2 is not None: + df = df.unionByName(df_2) + return df + + +@contextmanager +def unpersist_after(dfs: Dict[str, Optional[DataFrame]]): + yield + + for df in dfs.values(): + if df is not None: + df.unpersist() + + +class AbleToSaveAndLoad(ABC): + @classmethod + @abstractmethod + def load(cls, path: str, spark: Optional[SparkSession] = None): + """ + load an instance of this class from saved state + + :return: an instance of the current class + """ + + @abstractmethod + def save(self, path: str, overwrite: bool = False, spark: Optional[SparkSession] = None): + """ + Saves the current instance + """ + + @staticmethod + def _get_spark_session() -> SparkSession: + return State().session + + @classmethod + def _validate_classname(cls, classname: str): + assert classname == cls.get_classname() + + @classmethod + def get_classname(cls): + return ".".join([cls.__module__, cls.__name__]) + + +def prepare_dir(path): + """ + Create empty `path` dir + """ + if os.path.exists(path): + shutil.rmtree(path) + os.makedirs(path) + + +def get_class_by_name(classname: str) -> type: + parts = classname.split(".") + module = ".".join(parts[:-1]) + m = __import__(module) + for comp in parts[1:]: + m = getattr(m, comp) + return m + + +def do_path_exists(path: str) -> bool: + spark = State().session + + # due to the error: pyspark.sql.utils.IllegalArgumentException: Wrong FS: file:/... + if path.startswith("file:/"): + return os.path.exists(path) + + fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration()) + is_exists = fs.exists(spark._jvm.org.apache.hadoop.fs.Path(path)) + return is_exists + + +def list_folder(path: str) -> List[str]: + """ + List files in a given directory + :path: a directory to list files in + :return: names of files from the given directory (not absolute names) + """ + spark = State().session + # if True: + # # if path.startswith("file:/"): + # + # files = [x for x in os.listdir(path)] + # logging.info("Files", files) + # return files + fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration()) + base_path = spark._jvm.org.apache.hadoop.fs.Path(path) + + if not fs.isDirectory(base_path): + raise RuntimeError(f"The path is not directory. Cannot list it. The path: {path}") + + entries = fs.listStatus(base_path) + files = [entry.getPath().getName() for entry in entries] + return files + + +def save_transformer( + transformer: AbleToSaveAndLoad, + path: str, + overwrite: bool = False): + + logger.info(f"Saving transformer on path: {path}") + spark = State().session + + is_exists = do_path_exists(path) + + if is_exists and not overwrite: + raise FileExistsError(f"Path '{path}' already exists. 
Mode is 'overwrite = False'.")
+    elif is_exists:
+        delete_folder(path)
+
+    create_folder(path)
+
+    spark.createDataFrame([{
+        "classname": transformer.get_classname()
+    }]).write.parquet(os.path.join(path, "metadata.parquet"))
+
+    transformer.save(os.path.join(path, "transformer"), overwrite, spark=spark)
+    logger.info(f"The transformer is saved to path {path}")
+
+
+def load_transformer(path: str):
+    spark = State().session
+    metadata_row = spark.read.parquet(os.path.join(path, "metadata.parquet")).first().asDict()
+    clazz = get_class_by_name(metadata_row["classname"])
+    instance = clazz.load(os.path.join(path, "transformer"), spark)
+    return instance
+
+
+def save_picklable_to_parquet(obj: Any, path: str) -> None:
+    """
+    Function dumps an object to disk or hdfs in parquet format.
+
+    Args:
+        obj: object to be saved
+        path: path to dump
+    """
+    sc = SparkSession.getActiveSession().sparkContext
+    # We could use `RDD.saveAsPickleFile`, but it has no "overwrite" parameter
+    pickled_instance = pickle.dumps(obj)
+    Record = collections.namedtuple("Record", ["data"])
+    rdd = sc.parallelize([Record(pickled_instance)])
+    instance_df = rdd.map(lambda rec: Record(bytearray(rec.data))).toDF()
+    instance_df.write.mode("overwrite").parquet(path)
+
+
+def load_pickled_from_parquet(path: str) -> Any:
+    """
+    Function loads an object from disk or hdfs that was dumped via the `save_picklable_to_parquet` function.
+
+    Args:
+        path: source path
+
+    Returns: unpickled object
+
+    """
+    spark = SparkSession.getActiveSession()
+    df = spark.read.parquet(path)
+    pickled_instance = df.rdd.map(lambda row: bytes(row.data)).first()
+    return pickle.loads(pickled_instance)
diff --git a/scala/.gitignore b/scala/.gitignore
new file mode 100644
index 000000000..a1e25674c
--- /dev/null
+++ b/scala/.gitignore
@@ -0,0 +1,9 @@
+.idea
+.bloop
+.bsp
+.metals
+.vscode
+target
+project/target
+work
+assembly
diff --git a/scala/build.sbt b/scala/build.sbt
new file mode 100644
index 000000000..4049fa97e
--- /dev/null
+++ b/scala/build.sbt
@@ -0,0 +1,18 @@
+import sbt.Keys.resolvers
+
+name := "replay"
+
+version := "0.1"
+
+scalaVersion := "2.12.15"
+
+resolvers ++= Seq(
+    ("Confluent" at "http://packages.confluent.io/maven")
+        .withAllowInsecureProtocol(true)
+)
+
+libraryDependencies ++= Seq(
+    "org.apache.spark" %% "spark-core" % "3.1.3",
+    "org.apache.spark" %% "spark-sql" % "3.1.3",
+    "org.apache.spark" %% "spark-mllib" % "3.1.3",
+)
\ No newline at end of file
diff --git a/scala/project/build.properties b/scala/project/build.properties
new file mode 100644
index 000000000..22af2628c
--- /dev/null
+++ b/scala/project/build.properties
@@ -0,0 +1 @@
+sbt.version=1.7.1
diff --git a/scala/src/main/scala/org/apache/spark/replay/utils/ScalaPySparkUDFs.scala b/scala/src/main/scala/org/apache/spark/replay/utils/ScalaPySparkUDFs.scala
new file mode 100644
index 000000000..27fb43457
--- /dev/null
+++ b/scala/src/main/scala/org/apache/spark/replay/utils/ScalaPySparkUDFs.scala
@@ -0,0 +1,18 @@
+package org.apache.spark.replay.utils
+
+import org.apache.spark.sql.functions.udf
+import org.apache.spark.ml.linalg.DenseVector
+import org.apache.spark.ml.linalg.Vectors
+import org.apache.spark.sql.expressions.UserDefinedFunction
+
+object ScalaPySparkUDFs {
+
+    def multiplyUDF: UserDefinedFunction = udf { (scalar: Double, vector: DenseVector) =>
+        val resultVector = new Array[Double](vector.size)
+        vector.foreachActive {(index: Int, value: Double) =>
+            resultVector(index) = scalar*value
+        }
+        Vectors.dense(resultVector)
+    }
+
+}
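
Usage sketch for the Scala UDF above, called from Python through the new `multiply_scala_udf` wrapper in replay/utils.py. This is not part of the diff: the jar path, column names and data are assumptions, and the sbt project must be packaged and put on the Spark classpath before the session is created.

# Hypothetical local check of ScalaPySparkUDFs.multiplyUDF via replay.utils.multiply_scala_udf.
# The jar location below is the default sbt output path and is an assumption; adjust it to the actual build artifact.
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf
from pyspark.ml.linalg import Vectors

from replay.utils import multiply_scala_udf

spark = (
    SparkSession.builder
    .config("spark.jars", "scala/target/scala-2.12/replay_2.12-0.1.jar")  # assumed jar location
    .master("local[*]")
    .getOrCreate()
)

df = spark.createDataFrame(
    [(2.0, Vectors.dense([1.0, 2.0, 3.0]))],
    schema=["scalar", "vector"],
)

# Expected result: one row containing DenseVector([2.0, 4.0, 6.0])
df.select(multiply_scala_udf(sf.col("scalar"), sf.col("vector")).alias("scaled")).show(truncate=False)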
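
A small usage sketch (not part of the diff) for the new filesystem helpers and the pickle-to-parquet round trip added to replay/utils.py. The paths are illustrative only, and an active Spark session is assumed (see replay.session_handler.get_spark_session).

from replay.session_handler import get_spark_session
from replay.utils import (
    FileSystem,
    get_filesystem,
    load_pickled_from_parquet,
    save_picklable_to_parquet,
)

spark = get_spark_session()

# Explicit schemes are parsed directly; scheme-less paths fall back to fs.defaultFS
info = get_filesystem("file:///tmp/replay_demo")
assert info.filesystem == FileSystem.LOCAL and info.path == "/tmp/replay_demo"

# Any picklable object can be stored as a single-column parquet dataset and read back
params = {"rank": 100, "seed": 42}
save_picklable_to_parquet(params, "/tmp/replay_demo/params.parquet")
assert load_pickled_from_parquet("/tmp/replay_demo/params.parquet") == params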
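
A minimal sketch (not part of the diff) of the `log_exec_timer` and `JobGroup` helpers added to replay/utils.py; the group id, description and row count are arbitrary, and MLflow or the REPLAY_DEBUG_MODE flag is not required here.

from replay.session_handler import get_spark_session
from replay.utils import JobGroup, log_exec_timer

spark = get_spark_session()

with log_exec_timer("toy_count") as timer:
    spark.range(10_000).count()  # any Spark action works here
print(f"toy_count took {timer.duration} seconds")  # duration stays available after the block

# JobGroup labels the jobs triggered inside the block in the Spark UI
with JobGroup("demo-group", "count a small range"):
    spark.range(10_000).count()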