@@ -336,6 +336,44 @@ async function generateSqlQuery(apiKey: string, schemaInfo: string, question: st
336
336
- For ratios/divisions: ROUND(CAST(CAST(numerator AS NUMERIC) / NULLIF(denominator, 0) AS NUMERIC), 2)
337
337
- Handle NULLs: COALESCE(value, 0)
338
338
339
+ 10. Statistical Calculations:
340
+ - For complex statistics requiring multiple aggregations:
341
+ * WRONG: Mixing window and aggregate functions directly
342
+ * RIGHT: Use staged CTEs to build up calculations
343
+ - For correlations and statistical measures:
344
+ * Step 1: Calculate base metrics per entity
345
+ * Step 2: Calculate statistical components
346
+ * Step 3: Combine into final formula
347
+ Example pattern for correlation:
348
+ WITH base_metrics AS (
349
+ -- First get metrics per entity
350
+ SELECT
351
+ parent_id,
352
+ SUM(amount) / COUNT(*) as avg_amount,
353
+ SUM(attribute) / COUNT(*) as avg_attribute
354
+ FROM details
355
+ GROUP BY parent_id
356
+ ),
357
+ stats AS (
358
+ -- Then calculate statistical components
359
+ SELECT
360
+ COUNT(*) as n,
361
+ AVG(avg_amount) as avg_x,
362
+ AVG(avg_attribute) as avg_y,
363
+ STDDEV_POP(avg_amount) as stddev_x,
364
+ STDDEV_POP(avg_attribute) as stddev_y,
365
+ SUM((avg_amount * avg_attribute)) as sum_xy,
366
+ SUM(avg_amount) as sum_x,
367
+ SUM(avg_attribute) as sum_y
368
+ FROM base_metrics
369
+ )
370
+ -- Finally combine into correlation formula
371
+ SELECT
372
+ (n * sum_xy - sum_x * sum_y) /
373
+ (SQRT(n * SUM(POWER(avg_amount, 2)) - POWER(sum_x, 2)) *
374
+ SQRT(n * SUM(POWER(avg_attribute, 2)) - POWER(sum_y, 2))) as correlation
375
+ FROM stats, base_metrics GROUP BY n, sum_xy, sum_x, sum_y;
376
+
339
377
IMPLEMENTATION REQUIREMENTS:
340
378
- Generate only SELECT queries (no modifications)
341
379
- Include LIMIT ${ maxRows } in final results
@@ -486,11 +524,32 @@ function formatQueryResponse(sqlQuery: string): string {
486
524
* - No aggregates in GROUP BY clause
487
525
*
488
526
* 5. "aggregate function calls cannot contain window function calls"
489
- * Problem: Mixing window functions with aggregates
527
+ * Problem: Attempting to combine window functions with aggregates
490
528
* Solution:
491
- * - Calculate window functions in separate CTE
529
+ * - Break calculation into separate CTEs
530
+ * - Calculate window functions first
492
531
* - Use results in subsequent aggregations
493
- * - Keep window functions and aggregates separate
532
+ * Example fix:
533
+ * Instead of:
534
+ * SELECT CORR(AVG(amount) OVER (PARTITION BY parent_id),
535
+ * AVG(attribute) OVER (PARTITION BY parent_id))
536
+ * FROM details
537
+ * Use:
538
+ * WITH per_parent AS (
539
+ * SELECT parent_id,
540
+ * AVG(amount) as avg_amount,
541
+ * AVG(attribute) as avg_attribute
542
+ * FROM details
543
+ * GROUP BY parent_id
544
+ * )
545
+ * SELECT CORR(avg_amount, avg_attribute)
546
+ * FROM per_parent
547
+ * Testing:
548
+ * - Verify results match manual calculations
549
+ * - Test with different window sizes
550
+ * - Check handling of NULL values
551
+ * - Test with single-row groups
552
+ * - Validate statistical significance
494
553
*
495
554
* 6. "window functions are not allowed in GROUP BY"
496
555
* Problem: Window functions in GROUP BY clause
@@ -633,6 +692,44 @@ function formatQueryResponse(sqlQuery: string): string {
633
692
* - Test with parents having mixed attribute values
634
693
* - Check that parent-level metrics match when calculated different ways
635
694
*
695
+ * 14. Statistical Calculations:
696
+ * - For complex statistics requiring multiple aggregations:
697
+ * * WRONG: Mixing window and aggregate functions directly
698
+ * * RIGHT: Use staged CTEs to build up calculations
699
+ * - For correlations and statistical measures:
700
+ * * Step 1: Calculate base metrics per entity
701
+ * * Step 2: Calculate statistical components
702
+ * * Step 3: Combine into final formula
703
+ * Example pattern for correlation:
704
+ * WITH base_metrics AS (
705
+ * -- First get metrics per parent
706
+ * SELECT
707
+ * parent_id,
708
+ * SUM(amount) / COUNT(*) as avg_amount,
709
+ * SUM(attribute) / COUNT(*) as avg_attribute
710
+ * FROM details
711
+ * GROUP BY parent_id
712
+ * ),
713
+ * stats AS (
714
+ * -- Then calculate statistical components
715
+ * SELECT
716
+ * COUNT(*) as n,
717
+ * AVG(avg_amount) as avg_x,
718
+ * AVG(avg_attribute) as avg_y,
719
+ * STDDEV_POP(avg_amount) as stddev_x,
720
+ * STDDEV_POP(avg_attribute) as stddev_y,
721
+ * SUM((avg_amount * avg_attribute)) as sum_xy,
722
+ * SUM(avg_amount) as sum_x,
723
+ * SUM(avg_attribute) as sum_y
724
+ * FROM base_metrics
725
+ * )
726
+ * -- Finally combine into correlation formula
727
+ * SELECT
728
+ * (n * sum_xy - sum_x * sum_y) /
729
+ * (SQRT(n * SUM(POWER(avg_amount, 2)) - POWER(sum_x, 2)) *
730
+ * SQRT(n * SUM(POWER(avg_attribute, 2)) - POWER(sum_y, 2))) as correlation
731
+ * FROM stats, base_metrics GROUP BY n, sum_xy, sum_x, sum_y;
732
+ *
636
733
* IMPLEMENTATION REQUIREMENTS:
637
734
* 1. Schema Awareness
638
735
* - All queries must be built using actual schema information
0 commit comments