@@ -1501,8 +1501,33 @@ def to_input_dataframe(
1501
1501
1502
1502
return df
1503
1503
1504
def to_input_dict(self) -> dict:
    """Export the simulation's known values as a nested dictionary.

    The result maps each variable name to a ``{period: values}`` dictionary,
    where the period key is ``str(period)`` and the values are the calculated
    array converted with ``tolist()``. Only periods reported by the
    variable's holder as known are exported, and variables that end up with
    no entries are left out entirely. The dictionary is intended to be loaded
    back into a new Simulation to reproduce the same results.

    Returns:
        dict: The dictionary containing the input values.
    """
    exported = {}

    for name in self.tax_benefit_system.variables:
        by_period = {}
        for period in self.get_holder(name).get_known_periods():
            # NOTE(review): every variable is requested with
            # map_to="person", whatever its own entity is -- confirm this
            # round-trips correctly when the dictionary is loaded back.
            result = self.calculate(name, period, map_to="person")
            if result is None:
                continue
            by_period[str(period)] = result.tolist()

        # Variables without any exported period are omitted from the output.
        if by_period:
            exported[name] = by_period

    return exported
1523
+
1504
1524
def subsample (
1505
- self , n = None , frac = None , seed = None , time_period = None
1525
+ self ,
1526
+ n = None ,
1527
+ frac = None ,
1528
+ seed = None ,
1529
+ time_period = None ,
1530
+ quantize_weights : bool = True ,
1506
1531
) -> "Simulation" :
1507
1532
"""Quantize the simulation to a smaller size by sampling households.
1508
1533
@@ -1515,6 +1540,7 @@ def subsample(
1515
1540
Returns:
1516
1541
Simulation: The quantized simulation.
1517
1542
"""
1543
+ default_calculation_period = self .default_calculation_period
1518
1544
# Set default key if not provided
1519
1545
if seed is None :
1520
1546
seed = self .dataset .name
@@ -1529,6 +1555,7 @@ def subsample(
1529
1555
# Extract time period from DataFrame columns
1530
1556
df_time_period = df .columns .values [0 ].split ("__" )[1 ]
1531
1557
df_household_id_column = f"household_id__{ df_time_period } "
1558
+ df_person_id_column = f"person_id__{ df_time_period } "
1532
1559
1533
1560
# Determine the appropriate household weight column
1534
1561
if f"household_weight__{ time_period } " in df .columns :
@@ -1545,34 +1572,59 @@ def subsample(
1545
1572
n = int (len (h_ids ) * frac )
1546
1573
h_weights = pd .Series (h_df [household_weight_column ].values )
1547
1574
1548
- if n > len (h_weights ):
1549
- # Don't need to subsample!
1550
- return self
1575
+ frac = n / len (h_ids )
1551
1576
1552
1577
# Seed the random number generators for reproducibility
1553
1578
random .seed (str (seed ))
1554
1579
state = random .randint (0 , 2 ** 32 - 1 )
1555
1580
np .random .seed (state )
1556
1581
1582
+ h_ids = h_ids [h_weights > 0 ]
1583
+ h_weights = h_weights [h_weights > 0 ]
1584
+
1557
1585
# Sample household IDs based on their weights
1558
- chosen_household_ids = np .random .choice (
1559
- h_ids ,
1560
- n ,
1561
- p = h_weights .values / h_weights .values .sum (),
1562
- replace = False ,
1586
+ chosen_household_ids = pd .Series (
1587
+ np .random .choice (
1588
+ h_ids ,
1589
+ n ,
1590
+ p = (
1591
+ h_weights .values / h_weights .values .sum ()
1592
+ if quantize_weights
1593
+ else None
1594
+ ),
1595
+ replace = True ,
1596
+ )
1563
1597
)
1564
1598
1565
- # Filter DataFrame to include only the chosen households
1566
- df = df [df [df_household_id_column ].isin (chosen_household_ids )]
1599
+ household_id_to_count = {}
1600
+ for household_id in chosen_household_ids :
1601
+ if household_id not in household_id_to_count :
1602
+ household_id_to_count [household_id ] = 0
1603
+ household_id_to_count [household_id ] += 1
1567
1604
1568
- # Adjust household weights to maintain the total weight
1569
- df [household_weight_column ] *= (
1570
- h_weights . sum ()
1571
- / df . groupby ( df_household_id_column )
1572
- . first ()[ household_weight_column ]
1573
- . sum ( )
1605
+ subset_df = df [
1606
+ df [df_household_id_column ]. isin ( chosen_household_ids )
1607
+ ]. copy ()
1608
+
1609
+ household_counts = subset_df [ df_household_id_column ]. map (
1610
+ lambda x : household_id_to_count . get ( x , 0 )
1574
1611
)
1575
1612
1613
+ # Adjust household weights to maintain the total weight
1614
+
1615
+ for col in subset_df .columns :
1616
+ if "weight__" in col :
1617
+ target_total_weight = df [col ].values .sum ()
1618
+ if not quantize_weights :
1619
+ subset_df [col ] *= household_counts .values
1620
+ else :
1621
+ subset_df [col ] = household_counts .values
1622
+ subset_df [col ] *= (
1623
+ target_total_weight / subset_df [col ].values .sum ()
1624
+ )
1625
+
1626
+ df = subset_df
1627
+
1576
1628
# Update the dataset and rebuild the simulation
1577
1629
self .dataset = Dataset .from_dataframe (df , self .dataset .time_period )
1578
1630
self .build_from_dataset ()
@@ -1584,6 +1636,8 @@ def subsample(
1584
1636
].tax_benefit_system
1585
1637
self .branches ["baseline" ] = self .clone ()
1586
1638
self .branches ["tax_benefit_system" ] = baseline_tax_benefit_system
1639
+
1640
+ self .default_calculation_period = default_calculation_period
1587
1641
return self
1588
1642
1589
1643
0 commit comments