From 16770c4f446da80be19bcd75525542afa2375935 Mon Sep 17 00:00:00 2001 From: Yifei Ma Date: Wed, 1 Jul 2020 22:55:24 -0700 Subject: [PATCH 1/3] configure diagnose.yaml --- data_science/diagnose/diagnose.py | 314 +++++++++++++++------------- data_science/diagnose/diagnose.yaml | 37 ++++ 2 files changed, 205 insertions(+), 146 deletions(-) create mode 100644 data_science/diagnose/diagnose.yaml diff --git a/data_science/diagnose/diagnose.py b/data_science/diagnose/diagnose.py index 728ec58..e9ad075 100644 --- a/data_science/diagnose/diagnose.py +++ b/data_science/diagnose/diagnose.py @@ -5,35 +5,23 @@ import traceback import scipy.sparse as ss import time +import argparse +import os +import re +import yaml +parser = argparse.ArgumentParser(description=""" + A tool to provide dataset statistics. + If datasets are provided, conduct the analysis. + Otherwise, it will load the functions with the defaults provided in yaml. + This is useful if this function is called via jupyter magic `%run script`. + """) +parser.add_argument('--yaml', default='diagnose.yaml') +parser.add_argument('--datasets', nargs='*', help='interactions, users, items') +args, _ = parser.parse_known_args() -INTERACTIONS_REQUIRED_FIELDS = ["USER_ID", "ITEM_ID", "TIMESTAMP"] -# general -NA_RATE_THRESHOLD = 0.1 -DUP_RATE_THRESHOLD = 0.1 -REPEAT_RATE_THRESHOLD = 0.5 -COLDSTART_RATE_THRESHOLD = 0.1 - -# loglog warning thresholds -LOGLOG_RMSE_THRESHOLD = 5 -LOGLOG_MIN_CATS = 30 -LOGLOG_HEAD_HEAVY = -2 -LOGLOG_HEAVY_TAIL = -0.75 - -# metadata -CATS_FREQ_HEAD = 10 - -# temporal analysis -EPS_GREEDY = 0.01 -TIMEDELTA_REFERENCES = [ - ('min', 60), ('hour',3600), ('day',3600*24), - ('week',3600*24*7), ('month',3600*24*31), - ('year',3600*24*365)] -ROLLING_HISTORY_LEN = [1, 10, 100, 1000] -RETRAIN_FREQUENCY = ['1y','1q','1m','5d','1d','6h'] -TEMPORAL_FREQUENCY = ['5d', '1d', '6h'] -TEMPORAL_LOSS_METHODS = ['total variation', 'out-sample items'] -TEMPORAL_PLOT_LIMIT = 50 +with open(args.yaml) as fp: + config = yaml.safe_load(fp) def plot_loglog(val, name='', show=True): @@ -56,25 +44,27 @@ def plot_loglog(val, name='', show=True): def describe_categorical(sr, name=''): - print("\n=== {} top {} categories ===".format(name, CATS_FREQ_HEAD)) + print("\n=== {} top {} categories ===".format(name, config['num_top_categories_to_show'])) parts = sr.astype(str).apply(lambda x: x.split('|')) cats = pd.Series(np.hstack(parts.values)) cats_freq = cats.groupby(cats).size().sort_values(ascending=False) - print(cats_freq.head(CATS_FREQ_HEAD)) + print(cats_freq.head(config['num_top_categories_to_show'])) - if len(cats_freq) <= LOGLOG_MIN_CATS: + if len(cats_freq) <= config['loglog_min_categories_to_show']: return None (slope, intercept, rmse) = plot_loglog(cats_freq, name) - if len(cats_freq) > LOGLOG_MIN_CATS and rmse < LOGLOG_RMSE_THRESHOLD: - if slope > LOGLOG_HEAVY_TAIL: + if len(cats_freq) > config['loglog_min_categories_to_show'] and \ + rmse < config['loglog_rmse_threshold']: + + if slope > config['loglog_tail_heavy_threshold']: warnings.warn(""" Heavy-tail {0} distributions are usually hard to learn (slope={1})! Consider rolling up {0} or dropping its rare values. """.format(name, slope)) - elif slope < LOGLOG_HEAD_HEAVY: + elif slope < config['loglog_head_heavy_threshold']: warnings.warn(""" Head-heavy {0} distributions are usually uninteresting or spammy (slope={1})! Consider using finer-grade {0} or thresholding its dominate values. 
@@ -85,9 +75,8 @@ def describe_categorical(sr, name=''): def describe_dataframe(df, name=''): print("\n=== Describe {} ===\n".format(name)) - print(df.describe()) - if object in df.dtypes: - print(df.describe(include=['O'])) + with pd.option_context("display.max_rows", 100): + print(df.describe(include='all')) summary = {} for cn, dtype in df.dtypes.iteritems(): @@ -193,16 +182,19 @@ def compute_distribution_shift(index, df_wgt, Y, X, method, hist_len, freq=None, q = _normalize_distribution(X) if method.lower() in ['kl', 'kl-divergence']: - eps_ratio = (1-EPS_GREEDY) / (EPS_GREEDY / N) + eps_ratio = (1-config['cross_entropy_with_epsilon_greedy']) / \ + (config['cross_entropy_with_epsilon_greedy'] / N) log_p = (p * eps_ratio).log1p() log_q = (q * eps_ratio).log1p() temporal_loss = (p .multiply (log_p - log_q)).sum(axis=1) loss_fmt = '{:.2f}' elif method.lower() in ['ce', 'cross-entropy']: - eps_ratio = (1-EPS_GREEDY) / (EPS_GREEDY / N) + eps_ratio = (1-config['cross_entropy_with_epsilon_greedy']) / \ + (config['cross_entropy_with_epsilon_greedy'] / N) log_q = (q * eps_ratio).log1p() - temporal_loss = -((p .multiply (log_q)).sum(axis=1) + np.log(EPS_GREEDY/N)) + temporal_loss = -((p .multiply (log_q)).sum(axis=1) + \ + np.log(config['cross_entropy_with_epsilon_greedy'] / N)) loss_fmt = '{:.2f}' elif method.lower() in ['oov', 'out-sample items']: @@ -213,6 +205,15 @@ def compute_distribution_shift(index, df_wgt, Y, X, method, hist_len, freq=None, temporal_loss = (p-q).multiply(p>q).sum(axis=1) loss_fmt = '{:.1%}' + elif method.lower().startswith('prec'): + topk = int(re.findall(r'\d+', method)[0]) + q = q.toarray() + indices = np.argpartition(-q, topk) + q[:,:] = 0 + q[np.arange(q.shape[0])[:,None], indices[:,:topk]] = 1 + temporal_loss = -p.multiply(q).sum(axis=1) + loss_fmt = '{:.1%}' + else: raise NotImplementedError @@ -234,76 +235,85 @@ def diagnose_interactions(df): df['USER_ID'] = df['USER_ID'].astype(str) df.index = df["TIMESTAMP"].values.astype("datetime64[s]") + if config['na_rate_threshold'] < 1: + na_rate = df[config['interactions_required_fields']].isnull().any(axis=1).mean() + print("missing rate in fields", config['interactions_required_fields'], na_rate) + if na_rate > config['na_rate_threshold']: + warnings.warn("High data missing rate for required fields ({:.1%})!".format(na_rate)) + df = df.dropna(subset=config['interactions_required_fields']) + print("dropna shape", df.shape) - na_rate = df[INTERACTIONS_REQUIRED_FIELDS].isnull().any(axis=1).mean() - print("missing rate in fields", INTERACTIONS_REQUIRED_FIELDS, na_rate) - if na_rate > NA_RATE_THRESHOLD: - warnings.warn("High data missing rate for required fields ({:.1%})!".format(na_rate)) - df = df.dropna(subset=INTERACTIONS_REQUIRED_FIELDS) - print("dropna shape", df.shape) + if config['dup_rate_threshold'] < 1: + dup_rate = (df.groupby(config['interactions_required_fields']).size() - 1.0).sum() / df.shape[0] + print("duplication rate", dup_rate) + if dup_rate > config['dup_rate_threshold']: + warnings.warn(""" + High duplication rate ({:.1%})! + Only one event can be taken at the same (user,item,timestamp) index. + """.format(dup_rate)) - dup_rate = (df.groupby(INTERACTIONS_REQUIRED_FIELDS).size() - 1.0).sum() / df.shape[0] - print("duplication rate", dup_rate) - if dup_rate > DUP_RATE_THRESHOLD: - warnings.warn(""" - High duplication rate ({:.1%})! - Only one event can be taken at the same (user,item,timestamp) index. 
- """.format(dup_rate)) - df = df.drop_duplicates(subset=INTERACTIONS_REQUIRED_FIELDS) - print("drop_duplicates shape", df.shape) + df = df.drop_duplicates(subset=config['interactions_required_fields']) + print("drop_duplicates shape", df.shape) - repeat_rate = (df.groupby(["USER_ID", "ITEM_ID"]).size() - 1.0).sum() / df.shape[0] - print("user item repeat rate", repeat_rate) - if repeat_rate > REPEAT_RATE_THRESHOLD: - warnings.warn(""" - High rate of repeated consumptions ({:.1%})! - We would not do anything, but it may beneficial to - (1) consider keeping only the last interaction between the same user-item pair, - (2) consider if the ITEM_IDs have collisions, and/or - (3) use high-order hierarchical models. - """.format(repeat_rate)) + if config['repeat_rate_threshold'] < 1: + repeat_rate = (df.groupby(["USER_ID", "ITEM_ID"]).size() - 1.0).sum() / df.shape[0] + print("user item repeat rate", repeat_rate) + if repeat_rate > config['repeat_rate_threshold']: + warnings.warn(""" + High rate of repeated consumptions ({:.1%})! + We would not do anything, but it may beneficial to + (1) consider keeping only the last interaction between the same user-item pair, + (2) consider if the ITEM_IDs have collisions, and/or + (3) use high-order hierarchical models. + """.format(repeat_rate)) - summary = describe_dataframe(df, 'interactions table') + if config['describe_dataframe']: + summary = describe_dataframe(df, 'interactions table') - print("\n=== Hourly activity pattern ===") - print(df.groupby(df.index.hour).size()) + if config['show_activity_patterns']: + print("\n=== Hourly activity pattern ===") + print(df.groupby(df.index.hour).size()) - print("\n=== Day of week activity pattern ===") - print(df.groupby(df.index.dayofweek).size()) + print("\n=== Day of week activity pattern ===") + print(df.groupby(df.index.dayofweek).size()) - plot_patterns = { - "date": df.index.date, - "hour": df.index.hour, - "dayofweek": df.index.dayofweek} + plot_patterns = { + "date": df.index.date, + "hour": df.index.hour, + "dayofweek": df.index.dayofweek} - for k,v in plot_patterns.items(): - pl.plot(df.groupby(v).size(), '.-') - pl.gcf().autofmt_xdate() - pl.title("Activity pattern by %s" %k) - pl.grid() - pl.show() + for k,v in plot_patterns.items(): + pl.plot(df.groupby(v).size(), '.-') + pl.gcf().autofmt_xdate() + pl.title("Activity pattern by %s" %k) + pl.grid() + pl.show() print("\n=== Temporal shift analysis ===\n") - print("Sorting and removing repeated user-items for temporal shift analysis...") + print("Sorting...") df.sort_index(inplace=True, kind='mergesort') - df_dedup = df.drop_duplicates(['USER_ID','ITEM_ID'], keep='last') + if config['temporal_shift_dedup']: + print("Removing repeated user-items...") + df_dedup = df.drop_duplicates(['USER_ID','ITEM_ID'], keep='last') + else: + df_dedup = df.copy() print("\n=== Temporal shift - retrain frequency ===\n") - for method in TEMPORAL_LOSS_METHODS: + for method in config['temporal_loss_methods']: bootstrap_avg = [] past_fut_avg = [] - for freq in RETRAIN_FREQUENCY: + for freq in config['retrain_frequencies']: _, _, _bs_avg, loss_fmt = compute_bootstrap_loss(df_dedup, freq, method) _, _, _ts_avg, loss_fmt = compute_temporal_loss(df_dedup, freq, method, 1) bootstrap_avg.append(_bs_avg) past_fut_avg.append(_ts_avg) - pl.plot(RETRAIN_FREQUENCY, bootstrap_avg, '.--', label='same-period bootstrap') - pl.plot(RETRAIN_FREQUENCY, past_fut_avg, '.-', label='lagged popularity') + pl.plot(config['retrain_frequencies'], bootstrap_avg, '.--', label='same-period bootstrap') 
+ pl.plot(config['retrain_frequencies'], past_fut_avg, '.-', label='lagged popularity') pl.legend() pl.xlabel('retrain frequency') pl.title(method + ' loss at different frequencies') @@ -314,52 +324,57 @@ def diagnose_interactions(df): print("\n=== Temporal shift - history cutoffs ===\n") - for method in TEMPORAL_LOSS_METHODS: - for freq in TEMPORAL_FREQUENCY: - bootstrap_loss, _, avg_loss, loss_fmt = compute_bootstrap_loss(df_dedup, freq, method) - pl.plot(bootstrap_loss.iloc[-TEMPORAL_PLOT_LIMIT:], '.--', - label = 'boostrap baseline={}'.format(loss_fmt.format(avg_loss))) + freq = config['retrain_frequencies'][-1] + for i,(p,b) in list(enumerate(zip(past_fut_avg, bootstrap_avg)))[::-1]: + if p NA_RATE_THRESHOLD: - warnings.warn("High missing rate of all user meta-data ({:%})!" - .format(missing_rate)) + if config['na_rate_threshold'] < 1: + missing_rate = 1 - df.USER_ID.astype(str).isin(set(users.index.values)).mean() + print("Missing rate of all user meta-data", missing_rate) + if missing_rate > config['na_rate_threshold']: + warnings.warn("High missing rate of all user meta-data ({:%})!" + .format(missing_rate)) - coldstart_rate = 1 - users.index.isin(set(df.USER_ID.astype(str).values)).mean() - print("User coldstart rate", coldstart_rate) - if coldstart_rate > COLDSTART_RATE_THRESHOLD: - warnings.warn("High user coldstart rate ({:%})!" - .format(coldstart_rate)) + if config['coldstart_rate_threshold'] < 1: + coldstart_rate = 1 - users.index.isin(set(df.USER_ID.astype(str).values)).mean() + print("User coldstart rate", coldstart_rate) + if coldstart_rate > config['coldstart_rate_threshold']: + warnings.warn("High user coldstart rate ({:%})!" + .format(coldstart_rate)) describe_dataframe(users) @@ -406,17 +423,19 @@ def diagnose_items(df, items): items['ITEM_ID'] = items['ITEM_ID'].astype(str) items = items.set_index('ITEM_ID') - missing_rate = 1 - df.ITEM_ID.astype(str).isin(set(items.index.values)).mean() - print("Missing rate of all item meta-data", missing_rate) - if missing_rate > NA_RATE_THRESHOLD: - warnings.warn("High missing rate of all item meta-data ({:%})!" - .format(missing_rate)) + if config['na_rate_threshold'] < 1: + missing_rate = 1 - df.ITEM_ID.astype(str).isin(set(items.index.values)).mean() + print("Missing rate of all item meta-data", missing_rate) + if missing_rate > config['na_rate_threshold']: + warnings.warn("High missing rate of all item meta-data ({:%})!" + .format(missing_rate)) - coldstart_rate = 1 - items.index.isin(set(df.ITEM_ID.astype(str).values)).mean() - print("Item coldstart rate", coldstart_rate) - if coldstart_rate > NA_RATE_THRESHOLD: - warnings.warn("High item coldstart rate ({:%})!" - .format(coldstart_rate)) + if config['coldstart_rate_threshold'] < 1: + coldstart_rate = 1 - items.index.isin(set(df.ITEM_ID.astype(str).values)).mean() + print("Item coldstart rate", coldstart_rate) + if coldstart_rate > config['coldstart_rate_threshold']: + warnings.warn("High item coldstart rate ({:%})!" 
+ .format(coldstart_rate)) describe_dataframe(items) @@ -438,14 +457,14 @@ def diagnose(df, users=None, items=None): print("########################################") print("# DIAGNOSING INTERACTIONS TABLE, SAMPLE:") print("########################################") - print(df.sample(min(len(df), 10))) + print(df.sample(min(len(df), 10), random_state=42)) diagnose_interactions(df) if users is not None: print("########################################") print("# DIAGNOSING USERS TABLE, SAMPLE:") print("########################################") - print(users.sample(min(len(users), 10))) + print(users.sample(min(len(users), 10), random_state=42)) diagnose_users(df, users) else: print("########################################") @@ -456,9 +475,12 @@ def diagnose(df, users=None, items=None): print("########################################") print("# DIAGNOSING ITEMS TABLE, SAMPLE:") print("########################################") - print(items.sample(min(len(items), 10))) + print(items.sample(min(len(items), 10), random_state=42)) diagnose_items(df, items) else: print("########################################") print("# ITEMS TABLE NOT FOUND") print("########################################") + +if args.datasets is not None and len(args.datasets): + diagnose(*[pd.read_csv(d) for d in args.datasets]) \ No newline at end of file diff --git a/data_science/diagnose/diagnose.yaml b/data_science/diagnose/diagnose.yaml new file mode 100644 index 0000000..8369b44 --- /dev/null +++ b/data_science/diagnose/diagnose.yaml @@ -0,0 +1,37 @@ +--- +interactions_required_fields: [USER_ID, ITEM_ID, TIMESTAMP] + +# general +na_rate_threshold: 0.1 +dup_rate_threshold: 0.1 +repeat_rate_threshold: 0.5 +coldstart_rate_threshold: 0.1 + +# describe options +num_top_categories_to_show: 10 +loglog_min_categories_to_show: 30 +loglog_rmse_threshold: 5 +loglog_min_threshold: 5 +loglog_head_heavy_threshold: -2.0 +loglog_tail_heavy_threshold: -0.75 + +# temporal analysis +cross_entropy_with_epsilon_greedy: 0.01 +rolling_history_lags: [1, 10, 100, 1000] +retrain_frequencies: ['1y','1q','1m','1w','1d','6h'] +temporal_loss_methods: ['total variation', 'out-sample items'] +temporal_shift_dedup: true +temporal_plot_limit: 50 + +# misc +describe_sessions: true +describe_dataframe: true +show_activity_patterns: true + +timedelta_references: + min: 60 + hour: 3600 + day: 86400 + week: 604800 + month: 2678400 + year: 31536000 From e04e458969b933bce789df573eefa1e63ed0b129 Mon Sep 17 00:00:00 2001 From: Yifei Ma Date: Sun, 5 Jul 2020 01:11:42 -0700 Subject: [PATCH 2/3] add hit_at_k and mrr_at_k --- data_science/diagnose/diagnose.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/data_science/diagnose/diagnose.py b/data_science/diagnose/diagnose.py index e9ad075..ef29c42 100644 --- a/data_science/diagnose/diagnose.py +++ b/data_science/diagnose/diagnose.py @@ -174,6 +174,15 @@ def compute_temporal_loss(df, freq, method, hist_len): return compute_distribution_shift(index, df_wgt, Y, X, method, hist_len, freq, tic) +def _fit_transform(method_at_k, q, p): + topk = int(method_at_k.split('@')[1]) + indices = np.argsort( + -q.toarray() + np.random.rand(*q.shape)*1e-10, + axis=1)[:, :topk] + p_topk = np.take_along_axis(p.toarray(), indices, axis=1) + return p_topk + + def compute_distribution_shift(index, df_wgt, Y, X, method, hist_len, freq=None, tic=0): """ Y:target (unobserved), X:data (observed) """ @@ -205,13 +214,15 @@ def compute_distribution_shift(index, df_wgt, Y, X, method, hist_len, 
freq=None, temporal_loss = (p-q).multiply(p>q).sum(axis=1) loss_fmt = '{:.1%}' - elif method.lower().startswith('prec'): - topk = int(re.findall(r'\d+', method)[0]) - q = q.toarray() - indices = np.argpartition(-q, topk) - q[:,:] = 0 - q[np.arange(q.shape[0])[:,None], indices[:,:topk]] = 1 - temporal_loss = -p.multiply(q).sum(axis=1) + elif method.lower().startswith('hit@'): + p_topk = _fit_transform(method, q, p) + temporal_loss = -p_topk.sum(axis=1) + loss_fmt = '{:.1%}' + + elif method.lower().startswith('mrr@'): + topk = int(method.split('@')[1]) + p_topk = _fit_transform(method, q, p) + temporal_loss = -(p_topk / (1+np.arange(topk))[None,:]).sum(axis=1) loss_fmt = '{:.1%}' else: From f2e47c3950af50816a3e611349a4c133d5619175 Mon Sep 17 00:00:00 2001 From: Yifei Ma Date: Sun, 5 Jul 2020 01:20:24 -0700 Subject: [PATCH 3/3] use short keys --- data_science/diagnose/diagnose.py | 16 +++++++--------- data_science/diagnose/diagnose.yaml | 2 +- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/data_science/diagnose/diagnose.py b/data_science/diagnose/diagnose.py index ef29c42..a781d95 100644 --- a/data_science/diagnose/diagnose.py +++ b/data_science/diagnose/diagnose.py @@ -191,19 +191,17 @@ def compute_distribution_shift(index, df_wgt, Y, X, method, hist_len, freq=None, q = _normalize_distribution(X) if method.lower() in ['kl', 'kl-divergence']: - eps_ratio = (1-config['cross_entropy_with_epsilon_greedy']) / \ - (config['cross_entropy_with_epsilon_greedy'] / N) + eps_ratio = (1-config['eps_greedy']) / (config['eps_greedy'] / N) log_p = (p * eps_ratio).log1p() log_q = (q * eps_ratio).log1p() temporal_loss = (p .multiply (log_p - log_q)).sum(axis=1) loss_fmt = '{:.2f}' elif method.lower() in ['ce', 'cross-entropy']: - eps_ratio = (1-config['cross_entropy_with_epsilon_greedy']) / \ - (config['cross_entropy_with_epsilon_greedy'] / N) + eps_ratio = (1-config['eps_greedy']) / (config['eps_greedy'] / N) log_q = (q * eps_ratio).log1p() temporal_loss = -((p .multiply (log_q)).sum(axis=1) + \ - np.log(config['cross_entropy_with_epsilon_greedy'] / N)) + np.log(config['eps_greedy'] / N)) loss_fmt = '{:.2f}' elif method.lower() in ['oov', 'out-sample items']: @@ -468,14 +466,14 @@ def diagnose(df, users=None, items=None): print("########################################") print("# DIAGNOSING INTERACTIONS TABLE, SAMPLE:") print("########################################") - print(df.sample(min(len(df), 10), random_state=42)) + print(df.sample(min(len(df), 10))) diagnose_interactions(df) if users is not None: print("########################################") print("# DIAGNOSING USERS TABLE, SAMPLE:") print("########################################") - print(users.sample(min(len(users), 10), random_state=42)) + print(users.sample(min(len(users), 10))) diagnose_users(df, users) else: print("########################################") @@ -486,7 +484,7 @@ def diagnose(df, users=None, items=None): print("########################################") print("# DIAGNOSING ITEMS TABLE, SAMPLE:") print("########################################") - print(items.sample(min(len(items), 10), random_state=42)) + print(items.sample(min(len(items), 10))) diagnose_items(df, items) else: print("########################################") @@ -494,4 +492,4 @@ def diagnose(df, users=None, items=None): print("########################################") if args.datasets is not None and len(args.datasets): - diagnose(*[pd.read_csv(d) for d in args.datasets]) \ No newline at end of file + diagnose(*[pd.read_csv(d) for d in 
args.datasets])
diff --git a/data_science/diagnose/diagnose.yaml b/data_science/diagnose/diagnose.yaml
index 8369b44..adad0e9 100644
--- a/data_science/diagnose/diagnose.yaml
+++ b/data_science/diagnose/diagnose.yaml
@@ -16,7 +16,7 @@ loglog_head_heavy_threshold: -2.0
 loglog_tail_heavy_threshold: -0.75
 
 # temporal analysis
-cross_entropy_with_epsilon_greedy: 0.01
+eps_greedy: 0.01
 rolling_history_lags: [1, 10, 100, 1000]
 retrain_frequencies: ['1y','1q','1m','1w','1d','6h']
 temporal_loss_methods: ['total variation', 'out-sample items']
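
A quick usage sketch for the refactored entry point, based on the argparse block added in PATCH 1/3; the CSV file names below are placeholders, not part of the patch:

    # Command line: thresholds come from --yaml (default: diagnose.yaml); the tables
    # passed via --datasets are loaded with pd.read_csv and fed to diagnose() in the
    # order interactions, users, items:
    #
    #     python diagnose.py --datasets interactions.csv users.csv items.csv
    #
    # Jupyter: `%run diagnose.py` with no --datasets only loads the functions with
    # the yaml defaults, after which diagnose() can be called by hand:
    import pandas as pd
    interactions = pd.read_csv('interactions.csv')  # placeholder path
    diagnose(interactions)                          # users and items tables are optional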