@@ -287,38 +287,51 @@ async function generateSqlQuery(apiKey: string, schemaInfo: string, question: st
287
287
* Second CTE: Segment parents based on characteristics
288
288
* Final CTE: Calculate overall totals if needed
289
289
- For entity-level averages:
290
- * WRONG: AVG(detail.value)
291
- * RIGHT: AVG(parent_totals.total_value)
290
+ * WRONG: AVG(detail.value) directly from details
291
+ * RIGHT: AVG(parent_totals.total_value) from parent-level CTE
292
292
- For entity grouping:
293
293
* WRONG: GROUP BY detail.attribute > 0
294
294
* RIGHT: GROUP BY parent_level.has_attribute
295
295
- For MECE (Mutually Exclusive, Collectively Exhaustive) results:
296
- * WRONG: COUNT(DISTINCT parent_id) directly from details
297
- * RIGHT: COUNT(*) from parent-level CTE
298
- Example pattern:
299
- WITH detail_totals AS (
300
- -- First aggregate all details to parent level
296
+ * WRONG: Segmenting at detail level then counting parents
297
+ * RIGHT: First aggregate to parent level, then segment
298
+ Example pattern for segmentation:
299
+ WITH parent_totals AS (
300
+ -- First aggregate ALL metrics to parent level
301
301
SELECT
302
302
parent_id,
303
303
SUM(quantity) as total_quantity,
304
304
SUM(amount) as total_amount,
305
- MAX(CASE WHEN attribute > 0 THEN 1 ELSE 0 END) as has_attribute,
306
- SUM(amount * attribute) as attribute_amount
305
+ SUM(amount * attribute) as attribute_amount,
306
+ MAX(CASE WHEN attribute > 0 THEN 1 ELSE 0 END) as has_attribute
307
307
FROM detail_table
308
308
GROUP BY parent_id
309
309
),
310
- parent_segments AS (
310
+ segments AS (
311
311
-- Then segment based on parent-level characteristics
312
312
SELECT
313
313
CASE WHEN has_attribute = 1 THEN 'With Attribute'
314
314
ELSE 'Without Attribute' END as segment,
315
315
COUNT(*) as total_parents,
316
- ROUND(CAST(AVG(total_quantity) AS NUMERIC), 2) as avg_quantity,
317
- ROUND(CAST(AVG(total_amount) AS NUMERIC), 2) as avg_amount,
318
- ROUND(CAST(AVG(attribute_amount) AS NUMERIC), 2) as avg_attr_amount,
319
- ROUND(CAST(SUM(attribute_amount) * 100.0 / NULLIF(SUM(total_amount), 0) AS NUMERIC), 2) as attr_percentage
320
- FROM detail_totals
316
+ ROUND(CAST(AVG(total_quantity) AS NUMERIC), 2) as avg_items,
317
+ ROUND(CAST(SUM(total_amount) AS NUMERIC), 2) as total_value,
318
+ ROUND(CAST(SUM(attribute_amount) AS NUMERIC), 2) as attr_value,
319
+ ROUND(CAST(SUM(attribute_amount) * 100.0 /
320
+ NULLIF(SUM(total_amount), 0) AS NUMERIC), 2) as attr_percentage
321
+ FROM parent_totals
321
322
GROUP BY has_attribute
323
+
324
+ UNION ALL
325
+
326
+ SELECT
327
+ 'Total' as segment,
328
+ COUNT(*) as total_parents,
329
+ ROUND(CAST(AVG(total_quantity) AS NUMERIC), 2) as avg_items,
330
+ ROUND(CAST(SUM(total_amount) AS NUMERIC), 2) as total_value,
331
+ ROUND(CAST(SUM(attribute_amount) AS NUMERIC), 2) as attr_value,
332
+ ROUND(CAST(SUM(attribute_amount) * 100.0 /
333
+ NULLIF(SUM(total_amount), 0) AS NUMERIC), 2) as attr_percentage
334
+ FROM parent_totals
322
335
)
-- Finally select from the segmented CTE (a WITH clause needs a main query)
SELECT * FROM segments
323
336
324
337
8. Query Optimization:
@@ -498,41 +511,42 @@ function formatQueryResponse(sqlQuery: string): string {
498
511
* - Never use window function results directly in GROUP BY
499
512
*
500
513
* 7. "Non-MECE Results in Multi-Level Aggregations"
501
- * Problem: Incorrect aggregation levels leading to wrong averages and counts
514
+ * Problem: Incorrect aggregation levels leading to wrong segment totals
502
515
* Solution:
503
- * - Always use three-step aggregation for hierarchical data:
504
- * 1. Aggregate details to parent level (all metrics per parent)
505
- * 2. Segment parents based on characteristics
506
- * 3. Calculate overall totals if needed
507
- * - Common mistakes and fixes:
508
- * Instead of:
516
+ * - Never segment at detail level when counting or averaging parent entities
517
+ * - Always follow this pattern:
518
+ * 1. Aggregate ALL metrics to parent level first
519
+ * 2. Then segment based on parent characteristics
520
+ * 3. Finally add totals if needed
521
+ * Common mistakes and fixes:
522
+ * Instead of:
523
+ * SELECT
524
+ * CASE WHEN d.attribute > 0 THEN 'With' ELSE 'Without' END as segment,
525
+ * COUNT(DISTINCT d.parent_id) as total,
526
+ * AVG(d.amount) as avg_amount
527
+ * FROM details d
528
+ * GROUP BY CASE WHEN d.attribute > 0 THEN 'With' ELSE 'Without' END
529
+ * Use:
530
+ * WITH parent_totals AS (
509
531
* SELECT
510
- * has_attribute,
511
- * COUNT(DISTINCT parent_id) as total,
512
- * AVG(amount) as avg_amount
532
+ * parent_id,
533
+ * SUM(amount) as total_amount,
534
+ * MAX(CASE WHEN attribute > 0 THEN 1 ELSE 0 END) as has_attribute
513
535
* FROM details
514
- * GROUP BY has_attribute
515
- * Use:
516
- * WITH parent_totals AS (
517
- * SELECT
518
- * parent_id,
519
- * MAX(CASE WHEN attribute > 0 THEN 1 ELSE 0 END) as has_attribute,
520
- * SUM(amount) as total_amount
521
- * FROM details
522
- * GROUP BY parent_id
523
- * )
524
- * SELECT
525
- * has_attribute,
526
- * COUNT(*) as total,
527
- * AVG(total_amount) as avg_amount
528
- * FROM parent_totals
529
- * GROUP BY has_attribute
536
+ * GROUP BY parent_id
537
+ * )
538
+ * SELECT
539
+ * CASE WHEN has_attribute = 1 THEN 'With' ELSE 'Without' END as segment,
540
+ * COUNT(*) as total,
541
+ * AVG(total_amount) as avg_amount
542
+ * FROM parent_totals
543
+ * GROUP BY has_attribute
530
544
* Testing:
531
- * - Compare results with manual calculations for a small dataset
532
- * - Verify parent counts match between segments and totals
533
- * - Check that averages are calculated at the correct level
534
- * - Test with parents having varying numbers of detail records
535
- * - Test with mixed attribute values within the same parent
545
+ * - Compare segment counts: sum should equal total parents
546
+ * - Check that each parent appears in exactly one segment
547
+ * - Verify averages match manual calculations
548
+ * - Test with a parent having mixed attribute values
549
+ * - Test with uneven distribution of details per parent
536
550
*
537
551
* 8. "Overloaded Error"
538
552
* Problem: Query too complex or taking too long
0 commit comments