|
| 1 | +# |
| 2 | +# Copyright (c) 2023 salesforce.com, inc. |
| 3 | +# All rights reserved. |
| 4 | +# SPDX-License-Identifier: BSD-3-Clause |
| 5 | +# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause |
| 6 | +# |
| 7 | +""" |
| 8 | +Window Statistics anomaly detection model for data with monthly seasonality. |
| 9 | +""" |
| 10 | +import datetime |
| 11 | +import logging |
| 12 | + |
| 13 | +import numpy |
| 14 | +import pandas as pd |
| 15 | + |
| 16 | +from merlion.evaluate.anomaly import TSADMetric |
| 17 | +from merlion.models.anomaly.base import DetectorConfig, DetectorBase |
| 18 | +from merlion.post_process.threshold import AggregateAlarms |
| 19 | +from merlion.transform.moving_average import DifferenceTransform |
| 20 | +from merlion.utils import UnivariateTimeSeries, TimeSeries |
| 21 | + |
| 22 | +logger = logging.getLogger(__name__) |
| 23 | + |
| 24 | + |
class WindStatsConfig(DetectorConfig):
    """
    Config class for `WindStats`.
    """

    _default_transform = DifferenceTransform()

    @property
    def _default_threshold(self):
        # Calibrated anomaly scores live on a different scale than raw
        # z-scores, so the alarm threshold depends on whether the
        # calibrator is enabled.
        if self.enable_calibrator:
            alarm_level = 3.0
        else:
            alarm_level = 8.8
        return AggregateAlarms(
            alm_threshold=alarm_level,
            alm_window_minutes=self.wind_sz,
            alm_suppress_minutes=120,
            min_alm_in_window=1,
        )

    def __init__(self, wind_sz=30, max_day=4, **kwargs):
        """
        :param wind_sz: the window size in minutes, default is 30 minute window
        :param max_day: maximum number of month days stored in memory (only mean
            and std of each window are stored). Here, the days are first
            bucketed by month day and then by window id.
        """
        self.wind_sz = wind_sz
        self.max_day = max_day
        super().__init__(**kwargs)
| 49 | + |
| 50 | + |
class MonthlyWindStats(DetectorBase):
    """
    Sliding Window Statistics based Anomaly Detector.
    This detector assumes the time series comes with a monthly seasonality.
    It divides the month into buckets of the specified size (in minutes). For
    a given (t, v) it computes an anomaly score by comparing the current
    value v against the historical values (mean and standard deviation) for
    that window of time.
    Note that if multiple matches (specified by the parameter max_day) can be
    found in history with the same day and same time window, then the
    minimum of the scores is returned.
    """

    config_class = WindStatsConfig

    def __init__(self, config: WindStatsConfig = None):
        """
        :param config: model configuration; if ``None``, a default
            `WindStatsConfig` is used (``wind_sz=30`` minute windows,
            ``max_day=4`` days of history per bucket).
        """
        super().__init__(WindStatsConfig() if config is None else config)
        # Maps (day of month, window id) -> list of (day of year, mean, std)
        # tuples learned by `_train`.
        self.table = {}

    @property
    def require_even_sampling(self) -> bool:
        return False

    @property
    def require_univariate(self) -> bool:
        return True

    @property
    def _default_post_rule_train_config(self):
        return dict(metric=TSADMetric.F1, unsup_quantile=None)

    def _get_anomaly_score(self, time_series: pd.DataFrame, time_series_prev: pd.DataFrame = None) -> pd.DataFrame:
        """
        Score each point by its z-score relative to the historical (mean, std)
        statistics of the matching (day of month, window id) bucket. If several
        historical days match the bucket, the score with the smallest magnitude
        is returned; buckets never seen in training score 0.

        :param time_series: univariate time series to score, one column.
        :param time_series_prev: unused; accepted for interface compatibility.
        :return: DataFrame of anomaly scores indexed like ``time_series``.
        """
        scores = []
        for timestamp, (x,) in zip(time_series.index, time_series.values):
            t = timestamp.timetuple()
            key = (t.tm_mday, (t.tm_hour * 60 + t.tm_min) // self.config.wind_sz)
            if key in self.table:
                score = []
                for _, mu, sigma in self.table[key]:
                    if sigma == 0:  # handle missing value
                        score.append(0)
                    else:
                        score.append((x - mu) / sigma)
            else:
                score = [0]
            scores.append(min(score, key=abs))

        return pd.DataFrame(scores, index=time_series.index)

    def _train(self, train_data: pd.DataFrame, train_config=None) -> pd.DataFrame:
        """
        Learn per-bucket (mean, std) statistics from ``train_data`` and return
        the anomaly scores of the training data itself.

        :param train_data: univariate training time series, one column.
        :param train_config: unused; accepted for interface compatibility.
        :return: DataFrame of training anomaly scores indexed like ``train_data``.
        """
        # Reset previously learned statistics so that re-training does not
        # accumulate stale entries on top of the old table.
        self.table = {}

        # First build a hashtable keyed by (day of month, day of year, window
        # id within the day); the value is the list of observed values.
        table = {}
        for timestamp, x in zip(train_data.index, train_data.values):
            t = timestamp.timetuple()
            code = (t.tm_mday, t.tm_yday, (t.tm_hour * 60 + t.tm_min) // self.config.wind_sz)
            if code in table:
                table[code].append(x)
            else:
                table[code] = [x]

        # For each bucket, compute the mean and standard deviation, and group
        # the per-day statistics by (day of month, window id).
        for (month_day, year_day, window_id), values in table.items():
            v1 = numpy.array(values)
            entry = (year_day, numpy.mean(v1), numpy.std(v1))
            key = (month_day, window_id)
            if key in self.table:
                self.table[key].append(entry)
            else:
                self.table[key] = [entry]

        # Cap the history per bucket: keep only the latest `max_day` days
        # (entries are sorted chronologically by day of year).
        for key, entries in self.table.items():
            self.table[key] = sorted(entries, key=lambda e: e[0])
            if len(self.table[key]) > self.config.max_day:
                self.table[key] = self.table[key][-self.config.max_day :]

        return self._get_anomaly_score(train_data)
| 137 | + |
| 138 | + |
0 commit comments