Skip to content

Commit d3e29ad

Browse files
committed
feedback: take the latest datetime from the group
1 parent 23a7e4b commit d3e29ad

File tree

3 files changed

+327
-8
lines changed

3 files changed

+327
-8
lines changed

packages/api/src/tasks/checkAlerts/__tests__/checkAlerts.test.ts

Lines changed: 310 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3018,4 +3018,314 @@ describe('checkAlerts', () => {
30183018
expect(slack.postMessageToWebhook).toHaveBeenCalledTimes(2);
30193019
});
30203020
});
3021+
3022+
describe('group-by alert with disappearing group', () => {
3023+
const server = getServer();
3024+
3025+
beforeAll(async () => {
3026+
await server.start();
3027+
});
3028+
3029+
afterEach(async () => {
3030+
await server.clearDBs();
3031+
jest.clearAllMocks();
3032+
});
3033+
3034+
afterAll(async () => {
3035+
await server.stop();
3036+
});
3037+
3038+
it('should use latest createdAt from any group and not rescan old timeframe when one group disappears', async () => {
3039+
jest.spyOn(slack, 'postMessageToWebhook').mockResolvedValue(null as any);
3040+
3041+
const team = await createTeam({ name: 'My Team' });
3042+
3043+
const now = new Date('2023-11-16T22:18:00.000Z');
3044+
3045+
// Insert logs in first time bucket (22:00-22:05):
3046+
// - service-a: 3 errors (ALERT - exceeds threshold of 2)
3047+
// - service-b: 3 errors (ALERT - exceeds threshold of 2)
3048+
await bulkInsertLogs([
3049+
{
3050+
ServiceName: 'service-a',
3051+
Timestamp: new Date('2023-11-16T22:00:00.000Z'),
3052+
SeverityText: 'error',
3053+
Body: 'Error from service-a',
3054+
},
3055+
{
3056+
ServiceName: 'service-a',
3057+
Timestamp: new Date('2023-11-16T22:01:00.000Z'),
3058+
SeverityText: 'error',
3059+
Body: 'Error from service-a',
3060+
},
3061+
{
3062+
ServiceName: 'service-a',
3063+
Timestamp: new Date('2023-11-16T22:02:00.000Z'),
3064+
SeverityText: 'error',
3065+
Body: 'Error from service-a',
3066+
},
3067+
{
3068+
ServiceName: 'service-b',
3069+
Timestamp: new Date('2023-11-16T22:00:00.000Z'),
3070+
SeverityText: 'error',
3071+
Body: 'Error from service-b',
3072+
},
3073+
{
3074+
ServiceName: 'service-b',
3075+
Timestamp: new Date('2023-11-16T22:01:00.000Z'),
3076+
SeverityText: 'error',
3077+
Body: 'Error from service-b',
3078+
},
3079+
{
3080+
ServiceName: 'service-b',
3081+
Timestamp: new Date('2023-11-16T22:02:00.000Z'),
3082+
SeverityText: 'error',
3083+
Body: 'Error from service-b',
3084+
},
3085+
// Second time bucket (22:05-22:10):
3086+
// - service-a: 1 error (OK - below threshold)
3087+
// - service-b: 3 errors (ALERT - exceeds threshold)
3088+
{
3089+
ServiceName: 'service-a',
3090+
Timestamp: new Date('2023-11-16T22:05:00.000Z'),
3091+
SeverityText: 'error',
3092+
Body: 'Error from service-a',
3093+
},
3094+
{
3095+
ServiceName: 'service-b',
3096+
Timestamp: new Date('2023-11-16T22:05:00.000Z'),
3097+
SeverityText: 'error',
3098+
Body: 'Error from service-b',
3099+
},
3100+
{
3101+
ServiceName: 'service-b',
3102+
Timestamp: new Date('2023-11-16T22:06:00.000Z'),
3103+
SeverityText: 'error',
3104+
Body: 'Error from service-b',
3105+
},
3106+
{
3107+
ServiceName: 'service-b',
3108+
Timestamp: new Date('2023-11-16T22:07:00.000Z'),
3109+
SeverityText: 'error',
3110+
Body: 'Error from service-b',
3111+
},
3112+
// Third time bucket (22:10-22:15):
3113+
// - service-a: 1 error (OK - below threshold)
3114+
// - service-b: DISAPPEARS (no logs)
3115+
{
3116+
ServiceName: 'service-a',
3117+
Timestamp: new Date('2023-11-16T22:10:00.000Z'),
3118+
SeverityText: 'error',
3119+
Body: 'Error from service-a',
3120+
},
3121+
]);
3122+
3123+
const webhook = await new Webhook({
3124+
team: team._id,
3125+
service: 'slack',
3126+
url: 'https://hooks.slack.com/services/123',
3127+
name: 'My Webhook',
3128+
}).save();
3129+
const teamWebhooksById = new Map<string, typeof webhook>([
3130+
[webhook._id.toString(), webhook],
3131+
]);
3132+
const connection = await Connection.create({
3133+
team: team._id,
3134+
name: 'Default',
3135+
host: config.CLICKHOUSE_HOST,
3136+
username: config.CLICKHOUSE_USER,
3137+
password: config.CLICKHOUSE_PASSWORD,
3138+
});
3139+
const source = await Source.create({
3140+
kind: 'log',
3141+
team: team._id,
3142+
from: {
3143+
databaseName: 'default',
3144+
tableName: 'otel_logs',
3145+
},
3146+
timestampValueExpression: 'Timestamp',
3147+
connection: connection.id,
3148+
name: 'Logs',
3149+
});
3150+
const savedSearch = await new SavedSearch({
3151+
team: team._id,
3152+
name: 'My Error Search',
3153+
select: 'Body',
3154+
where: 'SeverityText: "error"',
3155+
whereLanguage: 'lucene',
3156+
orderBy: 'Timestamp',
3157+
source: source.id,
3158+
tags: ['test'],
3159+
}).save();
3160+
const mockUserId = new mongoose.Types.ObjectId();
3161+
const alert = await createAlert(
3162+
team._id,
3163+
{
3164+
source: AlertSource.SAVED_SEARCH,
3165+
channel: {
3166+
type: 'webhook',
3167+
webhookId: webhook._id.toString(),
3168+
},
3169+
interval: '5m',
3170+
thresholdType: AlertThresholdType.ABOVE,
3171+
threshold: 2,
3172+
savedSearchId: savedSearch.id,
3173+
groupBy: 'ServiceName', // Group by ServiceName
3174+
},
3175+
mockUserId,
3176+
);
3177+
3178+
const enhancedAlert: any = await Alert.findById(alert.id).populate([
3179+
'team',
3180+
'savedSearch',
3181+
]);
3182+
3183+
// Create previous alert histories at 22:00 for initial baseline (one per service)
3184+
await AlertHistory.create([
3185+
{
3186+
alert: alert.id,
3187+
createdAt: new Date('2023-11-16T22:00:00.000Z'),
3188+
state: 'OK',
3189+
counts: 0,
3190+
lastValues: [],
3191+
group: 'ServiceName:service-a',
3192+
},
3193+
{
3194+
alert: alert.id,
3195+
createdAt: new Date('2023-11-16T22:00:00.000Z'),
3196+
state: 'OK',
3197+
counts: 0,
3198+
lastValues: [],
3199+
group: 'ServiceName:service-b',
3200+
},
3201+
]);
3202+
3203+
const previousMap = await getPreviousAlertHistories(
3204+
[enhancedAlert.id],
3205+
now,
3206+
);
3207+
3208+
const details = {
3209+
alert: enhancedAlert,
3210+
source,
3211+
taskType: AlertTaskType.SAVED_SEARCH,
3212+
savedSearch,
3213+
previousMap,
3214+
} satisfies AlertDetails;
3215+
3216+
const clickhouseClient = new ClickhouseClient({
3217+
host: connection.host,
3218+
username: connection.username,
3219+
password: connection.password,
3220+
});
3221+
3222+
const mockMetadata = {
3223+
getColumn: jest.fn().mockImplementation(({ column }) => {
3224+
const columnMap = {
3225+
Body: { name: 'Body', type: 'String' },
3226+
Timestamp: { name: 'Timestamp', type: 'DateTime' },
3227+
SeverityText: { name: 'SeverityText', type: 'String' },
3228+
ServiceName: { name: 'ServiceName', type: 'String' },
3229+
};
3230+
return Promise.resolve(columnMap[column]);
3231+
}),
3232+
};
3233+
3234+
jest.mock('@hyperdx/common-utils/dist/metadata', () => ({
3235+
...jest.requireActual('@hyperdx/common-utils/dist/metadata'),
3236+
getMetadata: jest.fn().mockReturnValue(mockMetadata),
3237+
}));
3238+
3239+
// First run: process alert at 22:18
3240+
// Should check buckets: 22:00-22:05, 22:05-22:10, 22:10-22:15
3241+
await processAlert(
3242+
now,
3243+
details,
3244+
clickhouseClient,
3245+
connection.id,
3246+
alertProvider,
3247+
teamWebhooksById,
3248+
);
3249+
3250+
// Alert should be in ALERT state (service-b still alerting)
3251+
const updatedAlert = await Alert.findById(enhancedAlert.id);
3252+
expect(updatedAlert!.state).toBe('ALERT');
3253+
3254+
// Check alert histories after first run
3255+
const firstRunHistories = await AlertHistory.find({
3256+
alert: alert.id,
3257+
}).sort({ createdAt: 1 });
3258+
3259+
// Should have histories: 1 previous + 2 groups (service-a, service-b)
3260+
// service-a should resolve, service-b should alert
3261+
expect(firstRunHistories.length).toBeGreaterThan(1);
3262+
3263+
// Find the latest createdAt from all group histories (should be from 22:15)
3264+
const latestCreatedAt = firstRunHistories
3265+
.slice(1) // Skip the initial previous history
3266+
.reduce(
3267+
(latest, history) => {
3268+
return !latest || history.createdAt > latest
3269+
? history.createdAt
3270+
: latest;
3271+
},
3272+
null as Date | null,
3273+
);
3274+
3275+
expect(latestCreatedAt).toEqual(new Date('2023-11-16T22:15:00.000Z'));
3276+
3277+
// Second run: process alert at 22:23:00
3278+
// The check was already done at 22:15 (latest createdAt from any group)
3279+
// Even though service-b doesn't exist in that bucket, we shouldn't recheck old buckets
3280+
// Should use the LATEST createdAt (22:15) from any group, not the earliest
3281+
// This means it should only check NEW bucket: 22:15-22:20
3282+
// Should NOT rescan 22:00-22:05 where service-b had data but was already checked
3283+
const nextRun = new Date('2023-11-16T22:23:00.000Z');
3284+
const previousMapNextRun = await getPreviousAlertHistories(
3285+
[enhancedAlert.id],
3286+
nextRun,
3287+
);
3288+
3289+
// Verify that previousMapNextRun has the latest createdAt from any group
3290+
const previousDates = Array.from(previousMapNextRun.values()).map(
3291+
h => h.createdAt,
3292+
);
3293+
const maxPreviousDate = previousDates.reduce(
3294+
(max, date) => (date > max ? date : max),
3295+
new Date(0),
3296+
);
3297+
expect(maxPreviousDate).toEqual(new Date('2023-11-16T22:15:00.000Z'));
3298+
3299+
await processAlert(
3300+
nextRun,
3301+
{
3302+
...details,
3303+
previousMap: previousMapNextRun,
3304+
},
3305+
clickhouseClient,
3306+
connection.id,
3307+
alertProvider,
3308+
teamWebhooksById,
3309+
);
3310+
3311+
// Check all alert histories after second run
3312+
const allHistories = await AlertHistory.find({
3313+
alert: alert.id,
3314+
}).sort({ createdAt: 1 });
3315+
3316+
// Verify no histories were created for old timeframes (22:00-22:05)
3317+
// All new histories should have createdAt >= 22:15
3318+
const firstRunEndTime = new Date('2023-11-16T22:15:00.000Z').getTime();
3319+
const newHistories = allHistories.filter(
3320+
h => h.createdAt.getTime() > firstRunEndTime, // Exclude first run histories (which have createdAt <= 22:15)
3321+
);
3322+
3323+
// New histories should be for the new bucket (22:20) only
3324+
newHistories.forEach(history => {
3325+
expect(history.createdAt.getTime()).toBeGreaterThanOrEqual(
3326+
new Date('2023-11-16T22:20:00.000Z').getTime(),
3327+
);
3328+
});
3329+
});
3330+
});
30213331
});

packages/api/src/tasks/checkAlerts/index.ts

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,14 @@ const fireChannelEvent = async ({
157157
// MongoDB ObjectIds are hex strings (0-9, a-f), so pipes are safe
158158
const ALERT_GROUP_DELIMITER = '||';
159159

160+
/**
161+
* Get the alert key prefix for filtering grouped alert histories.
162+
* Returns "alertId||" which is used to match all group keys for this alert.
163+
*/
164+
const getAlertKeyPrefix = (alertId: string): string => {
165+
return `${alertId}${ALERT_GROUP_DELIMITER}`;
166+
};
167+
160168
/**
161169
* Compute a composite map key for tracking alert history per group.
162170
* For non-grouped alerts, returns just the alertId.
@@ -165,7 +173,7 @@ const ALERT_GROUP_DELIMITER = '||';
165173
* or in typical group key values.
166174
*/
167175
const computeHistoryMapKey = (alertId: string, groupKey: string): string => {
168-
return groupKey ? `${alertId}${ALERT_GROUP_DELIMITER}${groupKey}` : alertId;
176+
return groupKey ? `${getAlertKeyPrefix(alertId)}${groupKey}` : alertId;
169177
};
170178

171179
/**
@@ -174,7 +182,7 @@ const computeHistoryMapKey = (alertId: string, groupKey: string): string => {
174182
* by using the alert ID prefix with the delimiter to identify the split point.
175183
*/
176184
const extractGroupKeyFromMapKey = (mapKey: string, alertId: string): string => {
177-
const alertIdPrefix = `${alertId}${ALERT_GROUP_DELIMITER}`;
185+
const alertIdPrefix = getAlertKeyPrefix(alertId);
178186
return mapKey.startsWith(alertIdPrefix)
179187
? mapKey.substring(alertIdPrefix.length)
180188
: '';
@@ -196,7 +204,7 @@ export const processAlert = async (
196204
// Check if we should skip this alert run
197205
// Skip if ANY previous history for this alert was created in the current window
198206
const hasGroupBy = alert.groupBy && alert.groupBy.length > 0;
199-
const alertKeyPrefix = `${alert.id}${ALERT_GROUP_DELIMITER}`;
207+
const alertKeyPrefix = getAlertKeyPrefix(alert.id);
200208

201209
const shouldSkip = Array.from(previousMap.entries()).some(
202210
([key, history]) => {
@@ -228,14 +236,15 @@ export const processAlert = async (
228236
}
229237

230238
// Calculate date range for the query
231-
// Find the earliest createdAt among all histories for this alert
239+
// Find the latest createdAt among all histories for this alert
232240
let previousCreatedAt: Date | undefined;
233241
if (hasGroupBy) {
234-
// For grouped alerts, find the earliest createdAt among all groups
235-
const alertKeyPrefix = `${alert.id}${ALERT_GROUP_DELIMITER}`;
242+
// For grouped alerts, find the latest createdAt among all groups
243+
// Use the latest to avoid checking from old groups that might no longer exist
244+
const alertKeyPrefix = getAlertKeyPrefix(alert.id);
236245
for (const [key, history] of previousMap.entries()) {
237246
if (key.startsWith(alertKeyPrefix)) {
238-
if (!previousCreatedAt || history.createdAt < previousCreatedAt) {
247+
if (!previousCreatedAt || history.createdAt > previousCreatedAt) {
239248
previousCreatedAt = history.createdAt;
240249
}
241250
}

packages/app/src/TeamPage.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1227,7 +1227,7 @@ function ClickhouseSettingForm({
12271227
? 'Disabled'
12281228
: 'Enabled'
12291229
: 'Enabled'
1230-
: currentValue ?? defaultValue ?? '',
1230+
: (currentValue ?? defaultValue ?? ''),
12311231
},
12321232
});
12331233

0 commit comments

Comments
 (0)