From e50eff828a90c5f0e91bb48c39189d678ef7e21d Mon Sep 17 00:00:00 2001
From: Agatha Schmidt
Date: Thu, 4 Jul 2024 13:44:36 +0200
Subject: [PATCH 01/33] data generation graphODE without dampings

---
 .../surrogatemodel/GNN/data_generation_nodamp | 314 ++++++++++++++++++
 1 file changed, 314 insertions(+)
 create mode 100644 pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp

diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp
new file mode 100644
index 0000000000..3beecebdff
--- /dev/null
+++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp
@@ -0,0 +1,314 @@
+import copy
+import os
+import pickle
+import random
+import json
+from datetime import date
+
+import numpy as np
+from progress.bar import Bar
+from sklearn.preprocessing import FunctionTransformer
+
+from memilio.simulation import (AgeGroup, LogLevel, set_log_level)
+
+from memilio.simulation.osecir import (Index_InfectionState,
+                                       interpolate_simulation_result,
+                                       ParameterStudy, InfectionState, Model,
+                                       ModelGraph, set_edges)
+
+from memilio.epidata import geoModificationGermany as geoger
+from memilio.epidata import transformMobilityData as tmd
+from memilio.epidata import getDataIntoPandasDataFrame as gd
+
+
+def run_secir_groups_simulation(days, populations):
+    """! Uses an ODE SECIR model allowing for asymptomatic infection with 6 different age groups. The model is not stratified by region.
+    Virus-specific parameters are fixed and the initial numbers of persons in the particular infection states are chosen randomly from defined ranges.
+    @param days Describes how many days we simulate within a single run.
+    @param populations List containing the population of each age group for every region.
+    @return List containing the simulation results for every region.
+    """
+    set_log_level(LogLevel.Off)
+
+    start_day = 1
+    start_month = 1
+    start_year = 2019
+    dt = 0.1
+
+    # get county ids
+    countykey_list = geoger.get_county_ids(merge_eisenach=True, zfill=True)
+
+    # Define age groups
+    groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80+']
+    num_groups = len(groups)
+    num_regions = len(populations)
+    models = []
+
+    # Initialize Parameters
+    for region in range(num_regions):
+        model = Model(num_groups)
+
+        # Set parameters
+        for i in range(num_groups):
+            # Compartment transition duration
+            model.parameters.TimeExposed[AgeGroup(i)] = 3.2
+            model.parameters.TimeInfectedNoSymptoms[AgeGroup(i)] = 2.
+            model.parameters.TimeInfectedSymptoms[AgeGroup(i)] = 6.
+            model.parameters.TimeInfectedSevere[AgeGroup(i)] = 12.
+            model.parameters.TimeInfectedCritical[AgeGroup(i)] = 8.
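+            # (The stage durations above are mean stay times in days; they
+            # are the same for every age group and are kept fixed across
+            # all runs.)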
+ + # Initial number of people in each compartment with random numbers + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.Exposed)] = random.uniform( + 0.00025, 0.005) * populations[region][i] + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedNoSymptoms)] = random.uniform( + 0.0001, 0.0035) * populations[region][i] + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedNoSymptomsConfirmed)] = 0 + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedSymptoms)] = random.uniform( + 0.00007, 0.001) * populations[region][i] + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedSymptomsConfirmed)] = 0 + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedSevere)] = random.uniform( + 0.00003, 0.0006) * populations[region][i] + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedCritical)] = random.uniform( + 0.00001, 0.0002) * populations[region][i] + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.Recovered)] = random.uniform( + 0.002, 0.08) * populations[region][i] + model.populations[AgeGroup(i), + Index_InfectionState(InfectionState.Dead)] = random.uniform( + 0, 0.0003) * populations[region][i] + model.populations.set_difference_from_group_total_AgeGroup( + (AgeGroup(i), Index_InfectionState(InfectionState.Susceptible)), populations[region][i]) + + # Compartment transition propabilities + model.parameters.RelativeTransmissionNoSymptoms[AgeGroup(i)] = 0.5 + model.parameters.TransmissionProbabilityOnContact[AgeGroup( + i)] = 0.1 + model.parameters.RecoveredPerInfectedNoSymptoms[AgeGroup(i)] = 0.09 + model.parameters.RiskOfInfectionFromSymptomatic[AgeGroup(i)] = 0.25 + model.parameters.SeverePerInfectedSymptoms[AgeGroup(i)] = 0.2 + model.parameters.CriticalPerSevere[AgeGroup(i)] = 0.25 + model.parameters.DeathsPerCritical[AgeGroup(i)] = 0.3 + # twice the value of RiskOfInfectionFromSymptomatic + model.parameters.MaxRiskOfInfectionFromSymptomatic[AgeGroup( + i)] = 0.5 + + # StartDay is the n-th day of the year + model.parameters.StartDay = ( + date(start_year, start_month, start_day) - date(start_year, 1, 1)).days + + # Load baseline and minimum contact matrix and assign them to the model + baseline = getBaselineMatrix() + #minimum = getMinimumMatrix() + + model.parameters.ContactPatterns.cont_freq_mat[0].baseline = baseline + model.parameters.ContactPatterns.cont_freq_mat[0].minimum = np.ones( + (num_groups, num_groups)) * 0 + + # Apply mathematical constraints to parameters + model.apply_constraints() + models.append(model) + + graph = ModelGraph() + for i in range(num_regions): + graph.add_node(int(countykey_list[i]), models[i]) + + # get mobility data directory + arg_dict = gd.cli("commuter_official") + + directory = arg_dict['out_folder'].split('/pydata')[0] + directory = os.path.join(directory, 'mobility/') + + # Merge Eisenach and Wartbugkreis in Input Data + tmd.updateMobility2022(directory, mobility_file='twitter_scaled_1252') + tmd.updateMobility2022(directory, mobility_file='commuter_migration_scaled') + + num_locations = 4 + + set_edges(os.path.abspath(os.path.join(directory, os.pardir)), graph, num_locations) + + study = ParameterStudy(graph, 0, days, dt=dt, num_runs=1) + study.run() + + graph_run = study.run()[0] + results = interpolate_simulation_result(graph_run) + + for result_indx in range(len(results)): + results[result_indx] = remove_confirmed_compartments( + 
np.transpose(results[result_indx].as_ndarray()[1:, :]), num_groups) + + # Omit first column, as the time points are not of interest here. + dataset_entry = copy.deepcopy(results) + + return dataset_entry + +def remove_confirmed_compartments(dataset_entries, num_groups): + """! The compartments which contain confirmed cases are not needed and are therefore omitted by summarizing the confirmed + compartment with the original compartment. + @param dataset_entries Array that contains the compartmental data with confirmed compartments. + @param num_groups Number of age groups. + @return Array that contains the compartmental data without confirmed compartments. + """ + new_dataset_entries = [] + for i in dataset_entries : + dataset_entries_reshaped = i.reshape([num_groups, int(np.asarray(dataset_entries).shape[1]/num_groups) ]) + sum_inf_no_symp = np.sum(dataset_entries_reshaped [:, [2, 3]], axis=1) + sum_inf_symp = np.sum(dataset_entries_reshaped [:, [4, 5]], axis=1) + dataset_entries_reshaped[:, 2] = sum_inf_no_symp + dataset_entries_reshaped[:, 4] = sum_inf_symp + new_dataset_entries.append(np.delete(dataset_entries_reshaped , [3, 5], axis=1).flatten()) + return new_dataset_entries + + +def get_population(path="data/pydata/Germany/county_population.json"): + """! loads population data + @param path Path to population file. + @return List with all 400 populations and 6 age groups. + """ + with open(path) as f: + data = json.load(f) + population = [] + for data_entry in data: + population_county = [] + population_county.append( + data_entry['<3 years'] + data_entry['3-5 years'] / 2) + population_county.append(data_entry['6-14 years']) + population_county.append( + data_entry['15-17 years'] + data_entry['18-24 years'] + + data_entry['25-29 years'] + data_entry['30-39 years'] / 2) + population_county.append( + data_entry['30-39 years'] / 2 + data_entry['40-49 years'] + + data_entry['50-64 years'] * 2 / 3) + population_county.append( + data_entry['65-74 years'] + data_entry['>74 years'] * 0.2 + + data_entry['50-64 years'] * 1 / 3) + population_county.append( + data_entry['>74 years'] * 0.8) + + population.append(population_county) + return population + +def getBaselineMatrix(): + """! loads the baselinematrix + """ + + baseline_contact_matrix0 = os.path.join( + "./data/contacts/baseline_home.txt") + baseline_contact_matrix1 = os.path.join( + "./data/contacts/baseline_school_pf_eig.txt") + baseline_contact_matrix2 = os.path.join( + "./data/contacts/baseline_work.txt") + baseline_contact_matrix3 = os.path.join( + "./data/contacts/baseline_other.txt") + + baseline = np.loadtxt(baseline_contact_matrix0) \ + + np.loadtxt(baseline_contact_matrix1) + \ + np.loadtxt(baseline_contact_matrix2) + \ + np.loadtxt(baseline_contact_matrix3) + + return baseline + +def generate_data( + num_runs, path, input_width, days, save_data=True): + """! Generate dataset by calling run_secir_simulation (num_runs)-often + @param num_runs Number of times, the function run_secir_simulation is called. + @param path Path, where the datasets are stored. + @param input_width number of time steps used for model input. + @param label_width number of time steps (days) used as model output/label. + @param save_data Option to deactivate the save of the dataset. Per default true. 
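+    The dataset is written to '<path>/data_secir_age_groups.pickle' if
+    save_data is True.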
+ """ + population = get_population() + days_sum = days+input_width-1 + + + # days = damping_days[-1]+20 + data = {"inputs": [], + "labels": [], + } + + # show progess in terminal for longer runs + # Due to the random structure, theres currently no need to shuffle the data + bar = Bar('Number of Runs done', max=num_runs) + + for _ in range(num_runs): + + data_run = run_secir_groups_simulation( + days_sum, population) + + inputs = np.asarray(data_run).transpose(1,2,0)[:input_width] + data["inputs"].append(inputs) + + data["labels"].append(np.asarray(data_run).transpose(1,2,0)[input_width:]) + + bar.next() + + bar.finish() + + if save_data: + num_groups = int(np.asarray(data['inputs']).shape[2]/8) + transformer = FunctionTransformer(np.log1p, validate=True) + + # Scale inputs + inputs = np.asarray( + data['inputs']).transpose(2,0,1,3).reshape(num_groups*8,-1) + scaled_inputs = transformer.transform(inputs) + original_shape_input = np.asarray(data['inputs']).shape + + # Step 1: Reverse the reshape + reshaped_back = scaled_inputs.reshape(original_shape_input[2], original_shape_input[0], + original_shape_input[1], original_shape_input[3]) + + # Step 2: Reverse the transpose + original_inputs = reshaped_back.transpose(1, 2, 0, 3) + scaled_inputs = original_inputs.transpose(0, 3, 1, 2) + + + # Scale labels + labels = np.asarray( + data['labels']).transpose(2,0,1,3).reshape(num_groups*8, -1) + scaled_labels = transformer.transform(labels) + original_shape_labels = np.asarray(data['labels']).shape + + # Step 1: Reverse the reshape + reshaped_back = scaled_labels.reshape(original_shape_labels[2], original_shape_labels[0], + original_shape_labels[1], original_shape_labels[3]) + + # Step 2: Reverse the transpose + original_labels = reshaped_back.transpose(1, 2, 0, 3) + scaled_labels = original_labels.transpose(0, 3, 1, 2) + + all_data = {"inputs": scaled_inputs, + "labels": scaled_labels, + } + + # check if data directory exists. If necessary create it. 
+ if not os.path.isdir(path): + os.mkdir(path) + + # save dict to json file + with open(os.path.join(path, 'data_secir_age_groups.pickle'), 'wb') as f: + pickle.dump(all_data, f) + + +if __name__ == "__main__": + + path = os.path.dirname(os.path.realpath(__file__)) + path_data = os.path.join( + os.path.dirname( + os.path.realpath(os.path.dirname(os.path.realpath(path)))), + 'data_GNN_nodamp') + + + input_width = 5 + days = 30 + num_runs = 1000 + number_of_populations = 400 + generate_data(num_runs, path_data, input_width, + days, number_of_populations) + From 21e6df5fc7587977e7fd5bdd6c45ff01f154541e Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Tue, 9 Jul 2024 09:41:47 +0200 Subject: [PATCH 02/33] GNN data generator with dampings - graphODE --- .../GNN/data_generation_withdamp.py | 417 ++++++++++++++++++ 1 file changed, 417 insertions(+) create mode 100644 pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py new file mode 100644 index 0000000000..4056108962 --- /dev/null +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py @@ -0,0 +1,417 @@ +import copy +import os +import pickle +import random +import json +from datetime import date + +import numpy as np +from progress.bar import Bar +from sklearn.preprocessing import FunctionTransformer + +from memilio.simulation import (AgeGroup, LogLevel, set_log_level, Damping) + +from memilio.simulation.osecir import (Index_InfectionState, interpolate_simulation_result, ParameterStudy, + InfectionState, Model, ModelGraph, + interpolate_simulation_result, set_edges) + +from memilio.epidata import geoModificationGermany as geoger +from memilio.epidata import transformMobilityData as tmd +from memilio.epidata import getDataIntoPandasDataFrame as gd + + +def run_secir_groups_simulation(days, populations, num_dampings): + """! Uses an ODE SECIR model allowing for asymptomatic infection with 6 different age groups. The model is not stratified by region. + Virus-specific parameters are fixed and initial number of persons in the particular infection states are chosen randomly from defined ranges. + @param Days Describes how many days we simulate within a single run. + @param damping_day The day when damping is applied. + @param populations List containing the population in each age group. + @return List containing the populations in each compartment used to initialize the run. + """ + set_log_level(LogLevel.Off) + + start_day = 1 + start_month = 1 + start_year = 2019 + dt = 0.1 + + # get county ids + countykey_list = geoger.get_county_ids(merge_eisenach=True, zfill=True) + + # Define age Groups + groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80+'] + num_groups = len(groups) + num_regions = len(populations) + models = [] + + # Initialize Parameters + for region in range(num_regions): + model = Model(num_groups) + + # Set parameters + for i in range(num_groups): + # Compartment transition duration + model.parameters.TimeExposed[AgeGroup(i)] = 3.2 + model.parameters.TimeInfectedNoSymptoms[AgeGroup(i)] = 2. + model.parameters.TimeInfectedSymptoms[AgeGroup(i)] = 6. + model.parameters.TimeInfectedSevere[AgeGroup(i)] = 12. + model.parameters.TimeInfectedCritical[AgeGroup(i)] = 8. 
+ + # Initial number of people in each compartment with random numbers + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.Exposed)] = random.uniform( + 0.00025, 0.005) * populations[region][i] + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedNoSymptoms)] = random.uniform( + 0.0001, 0.0035) * populations[region][i] + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedNoSymptomsConfirmed)] = 0 + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedSymptoms)] = random.uniform( + 0.00007, 0.001) * populations[region][i] + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedSymptomsConfirmed)] = 0 + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedSevere)] = random.uniform( + 0.00003, 0.0006) * populations[region][i] + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedCritical)] = random.uniform( + 0.00001, 0.0002) * populations[region][i] + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.Recovered)] = random.uniform( + 0.002, 0.08) * populations[region][i] + model.populations[AgeGroup(i), + Index_InfectionState(InfectionState.Dead)] = random.uniform( + 0, 0.0003) * populations[region][i] + model.populations.set_difference_from_group_total_AgeGroup( + (AgeGroup(i), Index_InfectionState(InfectionState.Susceptible)), populations[region][i]) + + # Compartment transition propabilities + model.parameters.RelativeTransmissionNoSymptoms[AgeGroup(i)] = 0.5 + model.parameters.TransmissionProbabilityOnContact[AgeGroup( + i)] = 0.1 + model.parameters.RecoveredPerInfectedNoSymptoms[AgeGroup(i)] = 0.09 + model.parameters.RiskOfInfectionFromSymptomatic[AgeGroup(i)] = 0.25 + model.parameters.SeverePerInfectedSymptoms[AgeGroup(i)] = 0.2 + model.parameters.CriticalPerSevere[AgeGroup(i)] = 0.25 + model.parameters.DeathsPerCritical[AgeGroup(i)] = 0.3 + # twice the value of RiskOfInfectionFromSymptomatic + model.parameters.MaxRiskOfInfectionFromSymptomatic[AgeGroup( + i)] = 0.5 + + # StartDay is the n-th day of the year + model.parameters.StartDay = ( + date(start_year, start_month, start_day) - date(start_year, 1, 1)).days + + # Load baseline and minimum contact matrix and assign them to the model + baseline = getBaselineMatrix() + minimum = getMinimumMatrix() + + model.parameters.ContactPatterns.cont_freq_mat[0].baseline = baseline + model.parameters.ContactPatterns.cont_freq_mat[0].minimum = minimum + + # Generate a damping matrix and assign it to the model + damped_matrices = [] + damping_coeff = [] + for day in num_dampings: + + # generat a random damping factor + damping = np.ones((num_groups, num_groups) + ) * np.float16(random.uniform(0, 0.5)) + + # add damping to model + model.parameters.ContactPatterns.cont_freq_mat.add_damping(Damping( + coeffs=(damping), t=day, level=0, type=0)) + + damped_matrices.append(model.parameters.ContactPatterns.cont_freq_mat.get_matrix_at( + day+1)) + damping_coeff.append(damping[0][0]) + + # Apply mathematical constraints to parameters + model.apply_constraints() + models.append(model) + + graph = ModelGraph() + for i in range(num_regions): + graph.add_node(int(countykey_list[i]), models[i]) + + # get mobility data directory + arg_dict = gd.cli("commuter_official") + + directory = arg_dict['out_folder'].split('/pydata')[0] + directory = os.path.join(directory, 'mobility/') + + # Merge Eisenach and Wartbugkreis in Input Data + tmd.updateMobility2022(directory, 
mobility_file='twitter_scaled_1252') + tmd.updateMobility2022(directory, mobility_file='commuter_migration_scaled') + + num_locations = 4 + + set_edges(os.path.abspath(os.path.join(directory, os.pardir)), graph, num_locations) + + study = ParameterStudy(graph, 0, days, dt=dt, num_runs=1) + study.run() + + graph_run = study.run()[0] + results = interpolate_simulation_result(graph_run) + + for result_indx in range(len(results)): + results[result_indx] = remove_confirmed_compartments( + np.transpose(results[result_indx].as_ndarray()[1:, :]), num_groups) + + # Omit first column, as the time points are not of interest here. + dataset_entry = copy.deepcopy(results) + + return dataset_entry, damped_matrices, num_dampings, damping_coeff + +def remove_confirmed_compartments(dataset_entries, num_groups): + """! The compartments which contain confirmed cases are not needed and are therefore omitted by summarizing the confirmed + compartment with the original compartment. + @param dataset_entries Array that contains the compartmental data with confirmed compartments. + @param num_groups Number of age groups. + @return Array that contains the compartmental data without confirmed compartments. + """ + new_dataset_entries = [] + for i in dataset_entries : + dataset_entries_reshaped = i.reshape([num_groups, int(np.asarray(dataset_entries).shape[1]/num_groups) ]) + sum_inf_no_symp = np.sum(dataset_entries_reshaped [:, [2, 3]], axis=1) + sum_inf_symp = np.sum(dataset_entries_reshaped [:, [4, 5]], axis=1) + dataset_entries_reshaped[:, 2] = sum_inf_no_symp + dataset_entries_reshaped[:, 4] = sum_inf_symp + new_dataset_entries.append(np.delete(dataset_entries_reshaped , [3, 5], axis=1).flatten()) + return new_dataset_entries + + +def get_population(path="data/pydata/Germany/county_population.json"): + """! loads population data + @param path Path to population file. + @return List with all 400 populations and 6 age groups. + """ + with open(path) as f: + data = json.load(f) + population = [] + for data_entry in data: + population_county = [] + population_county.append( + data_entry['<3 years'] + data_entry['3-5 years'] / 2) + population_county.append(data_entry['6-14 years']) + population_county.append( + data_entry['15-17 years'] + data_entry['18-24 years'] + + data_entry['25-29 years'] + data_entry['30-39 years'] / 2) + population_county.append( + data_entry['30-39 years'] / 2 + data_entry['40-49 years'] + + data_entry['50-64 years'] * 2 / 3) + population_county.append( + data_entry['65-74 years'] + data_entry['>74 years'] * 0.2 + + data_entry['50-64 years'] * 1 / 3) + population_county.append( + data_entry['>74 years'] * 0.8) + + population.append(population_county) + return population + +def getBaselineMatrix(): + """! loads the baselinematrix + """ + + baseline_contact_matrix0 = os.path.join( + "./data/contacts/baseline_home.txt") + baseline_contact_matrix1 = os.path.join( + "./data/contacts/baseline_school_pf_eig.txt") + baseline_contact_matrix2 = os.path.join( + "./data/contacts/baseline_work.txt") + baseline_contact_matrix3 = os.path.join( + "./data/contacts/baseline_other.txt") + + baseline = np.loadtxt(baseline_contact_matrix0) \ + + np.loadtxt(baseline_contact_matrix1) + \ + np.loadtxt(baseline_contact_matrix2) + \ + np.loadtxt(baseline_contact_matrix3) + + return baseline + +def getMinimumMatrix(): + """! 
loads the minimum matrix
+    """
+
+    minimum_contact_matrix0 = os.path.join(
+        "./data/contacts/minimum_home.txt")
+    minimum_contact_matrix1 = os.path.join(
+        "./data/contacts/minimum_school_pf_eig.txt")
+    minimum_contact_matrix2 = os.path.join(
+        "./data/contacts/minimum_work.txt")
+    minimum_contact_matrix3 = os.path.join(
+        "./data/contacts/minimum_other.txt")
+
+    minimum = np.loadtxt(minimum_contact_matrix0) \
+        + np.loadtxt(minimum_contact_matrix1) + \
+        np.loadtxt(minimum_contact_matrix2) + \
+        np.loadtxt(minimum_contact_matrix3)
+
+    return minimum
+
+
+def generate_dampings_withshadowdamp(number_of_dampings, days, min_distance, min_damping_day, n_runs):
+    """! Draw damping days while keeping a minimum distance between the damping days. This method aims to create a
+    uniform distribution of drawn damping days.
+    @param number_of_dampings Number of dampings that have to be drawn.
+    @param days Number of days which are simulated (label_width).
+    @param min_distance The minimum number of days between two dampings.
+    @param min_damping_day The earliest day of the simulation where a damping can take place.
+    @param n_runs Number of simulation runs.
+    """
+
+    all_dampings = []
+    count_runs = 0
+    count_shadow = 0
+    while len(all_dampings) < n_runs:
+        # Candidate days with a "shadow" margin of min_distance on both
+        # sides, so that days near the boundaries are drawn as often as
+        # days in the middle of the simulated period.
+        days_list = list(
+            range(min_damping_day - min_distance, days + min_distance))
+        dampings = []
+        while len(days_list) > 0 and len(dampings) < number_of_dampings:
+            damping_day = random.choice(days_list)
+            # remove all days closer than min_distance to the drawn day
+            days_list = [day for day in days_list
+                         if abs(day - damping_day) >= min_distance]
+            if min_damping_day <= damping_day < days:
+                dampings.append(damping_day)
+            else:
+                # shadow draw outside the simulated period: discarded, but
+                # it still blocks nearby days
+                count_shadow += 1
+        count_runs += 1
+        # only keep draws that produced the requested number of damping days
+        if len(dampings) >= number_of_dampings:
+            all_dampings.append(sorted(dampings))
+
+    return np.asarray(all_dampings)
+
+def generate_data(
+        num_runs, path, input_width, label_width, save_data=True):
+    """! Generate dataset by calling run_secir_simulation num_runs times.
+    @param num_runs Number of times the function run_secir_simulation is called.
+    @param path Path where the datasets are stored.
+    @param input_width Number of time steps used for model input.
+    @param label_width Number of time steps (days) used as model output/label.
+    @param save_data Option to deactivate saving the dataset. True per default.
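+    Note: the call to generate_dampings_withshadowdamp below relies on the
+    module-level variable number_of_dampings defined under __main__.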
+ """ + population = get_population() + days_sum = label_width+input_width-1 + + #generate dampings + damping_days = generate_dampings_withshadowdamp(number_of_dampings = number_of_dampings, days= label_width, min_distance=7, min_damping_day=input_width, n_runs = num_runs) + + # all data including damping information + all_data = {"inputs": [], + "labels": [], + "damping_coeff": [], + "damping_day": [], + "damped_matrix": []} + + # data that needs to be scaled + data = {"inputs": [], + "labels": []} + + # show progess in terminal for longer runs + # Due to the random structure, theres currently no need to shuffle the data + bar = Bar('Number of Runs done', max=num_runs) + + for i in range(num_runs): + + data_run, damped_contact_matrix, damping_days_s, damping_factor = run_secir_groups_simulation( + days_sum, population, damping_days[i]) + + inputs = np.asarray(data_run).transpose(1,2,0)[:input_width] + data["inputs"].append(inputs) + data["labels"].append(np.asarray(data_run).transpose(1,2,0)[input_width:]) + all_data["damping_coeff"].append(damping_factor) + all_data["damping_day"].append(damping_days_s) + all_data["damped_matrix"].append(damped_contact_matrix) + + bar.next() + + bar.finish() + + if save_data: + num_groups = int(np.asarray(data['inputs']).shape[2]/8) + transformer = FunctionTransformer(np.log1p, validate=True) + + # Scale inputs + inputs = np.asarray( + data['inputs']).transpose(2,0,1,3).reshape(num_groups*8,-1) + scaled_inputs = transformer.transform(inputs) + original_shape_input = np.asarray(data['inputs']).shape + + # Step 1: Reverse the reshape + reshaped_back = scaled_inputs.reshape(original_shape_input[2], original_shape_input[0], + original_shape_input[1], original_shape_input[3]) + + # Step 2: Reverse the transpose + original_inputs = reshaped_back.transpose(1, 2, 0, 3) + scaled_inputs = original_inputs.transpose(0, 3, 1, 2) + + + # Scale labels + labels = np.asarray( + data['labels']).transpose(2,0,1,3).reshape(num_groups*8, -1) + scaled_labels = transformer.transform(labels) + original_shape_labels = np.asarray(data['labels']).shape + + # Step 1: Reverse the reshape + reshaped_back = scaled_labels.reshape(original_shape_labels[2], original_shape_labels[0], + original_shape_labels[1], original_shape_labels[3]) + + # Step 2: Reverse the transpose + original_labels = reshaped_back.transpose(1, 2, 0, 3) + scaled_labels = original_labels.transpose(0, 3, 1, 2) + + all_data = {"inputs": scaled_inputs, + "labels": scaled_labels, + } + + # check if data directory exists. If necessary create it. 
+ if not os.path.isdir(path): + os.mkdir(path) + + # save dict to json file + with open(os.path.join(path, 'data_secir_age_groups.pickle'), 'wb') as f: + pickle.dump(all_data, f) + + +if __name__ == "__main__": + + input_width = 5 + label_width = 100 + number_of_dampings = 3 + num_runs = 4 + number_of_populations = 400 + + path = os.path.dirname(os.path.realpath(__file__)) + path_data = os.path.join( + os.path.dirname( + os.path.realpath(os.path.dirname(os.path.realpath(path)))), + 'data_GNN_with '+str(number_of_dampings)+' dampings') + + generate_data(num_runs, path_data, input_width, + label_width, number_of_populations) + From 95a9d154604907dbed0c59b0cf05f111f34ce43c Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Tue, 9 Jul 2024 10:25:00 +0200 Subject: [PATCH 03/33] add damping information to output --- .../GNN/data_generation_withdamp.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py index 4056108962..0bae453fee 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py @@ -330,7 +330,10 @@ def generate_data( # data that needs to be scaled data = {"inputs": [], - "labels": []} + "labels": [], + "damping_coeff": [], + "damping_day": [], + "damped_matrix": []} # show progess in terminal for longer runs # Due to the random structure, theres currently no need to shuffle the data @@ -344,9 +347,9 @@ def generate_data( inputs = np.asarray(data_run).transpose(1,2,0)[:input_width] data["inputs"].append(inputs) data["labels"].append(np.asarray(data_run).transpose(1,2,0)[input_width:]) - all_data["damping_coeff"].append(damping_factor) - all_data["damping_day"].append(damping_days_s) - all_data["damped_matrix"].append(damped_contact_matrix) + data["damping_coeff"].append(damping_factor) + data["damping_day"].append(damping_days_s) + data["damped_matrix"].append(damped_contact_matrix) bar.next() @@ -387,7 +390,9 @@ def generate_data( all_data = {"inputs": scaled_inputs, "labels": scaled_labels, - } + "damping_coeff": data['damping_coeff'], + "damping_day": data['damping_day'], + "damped_matrix": data['damped_matrix']} # check if data directory exists. If necessary create it. 
if not os.path.isdir(path): @@ -401,16 +406,16 @@ def generate_data( if __name__ == "__main__": input_width = 5 - label_width = 100 + label_width = 30 number_of_dampings = 3 - num_runs = 4 + num_runs = 2 number_of_populations = 400 path = os.path.dirname(os.path.realpath(__file__)) path_data = os.path.join( os.path.dirname( os.path.realpath(os.path.dirname(os.path.realpath(path)))), - 'data_GNN_with '+str(number_of_dampings)+' dampings') + 'data_GNN_with_'+str(number_of_dampings)+'_dampings') generate_data(num_runs, path_data, input_width, label_width, number_of_populations) From 300b9562bf97b6da34f1288391bd240ec18f3226 Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Wed, 7 Aug 2024 15:17:57 +0200 Subject: [PATCH 04/33] adjust to code guidelines --- .../GNN/data_generation_nodamp.py | 328 ++++++++++++++++++ 1 file changed, 328 insertions(+) create mode 100644 pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py new file mode 100644 index 0000000000..e15fccf453 --- /dev/null +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py @@ -0,0 +1,328 @@ +import copy +import os +import pickle +import random +import json +from datetime import date +import numpy as np + +from progress.bar import Bar +from sklearn.preprocessing import FunctionTransformer + +from memilio.simulation import (AgeGroup, LogLevel, set_log_level) +from memilio.simulation.osecir import (Index_InfectionState, interpolate_simulation_result, ParameterStudy, + InfectionState, Model, ModelGraph, + interpolate_simulation_result, set_edges) +from memilio.epidata import geoModificationGermany as geoger +from memilio.epidata import transformMobilityData as tmd +from memilio.epidata import getDataIntoPandasDataFrame as gd + + +def run_secir_groups_simulation(days, populations): + """! Uses an ODE SECIR model allowing for asymptomatic infection with 6 + different age groups. The model is not stratified by region. + Virus-specific parameters are fixed and initial number of persons + in the particular infection states are chosen randomly from defined ranges. + @param Days Describes how many days we simulate within a single run. + @param damping_day The day when damping is applied. + @param populations List containing the population in each age group. + @return List containing the populations in each compartment used to initialize + the run. + """ + + set_log_level(LogLevel.Off) + + start_day = 1 + start_month = 1 + start_year = 2019 + dt = 0.1 + + # get county ids + countykey_list = geoger.get_county_ids(merge_eisenach=True, zfill=True) + + # Define age Groups + groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80+'] + num_groups = len(groups) + num_regions = len(populations) + models = [] + + # Initialize Parameters + for region in range(num_regions): + model = Model(num_groups) + + # Set parameters + for i in range(num_groups): + # Compartment transition duration + model.parameters.TimeExposed[AgeGroup(i)] = 3.2 + model.parameters.TimeInfectedNoSymptoms[AgeGroup(i)] = 2. + model.parameters.TimeInfectedSymptoms[AgeGroup(i)] = 6. + model.parameters.TimeInfectedSevere[AgeGroup(i)] = 12. + model.parameters.TimeInfectedCritical[AgeGroup(i)] = 8. 
+ + # Initial number of people in each compartment with random numbers + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.Exposed)] = random.uniform( + 0.00025, 0.005) * populations[region][i] + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedNoSymptoms)] = random.uniform( + 0.0001, 0.0035) * populations[region][i] + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedNoSymptomsConfirmed)] = 0 + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedSymptoms)] = random.uniform( + 0.00007, 0.001) * populations[region][i] + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedSymptomsConfirmed)] = 0 + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedSevere)] = random.uniform( + 0.00003, 0.0006) * populations[region][i] + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.InfectedCritical)] = random.uniform( + 0.00001, 0.0002) * populations[region][i] + model.populations[AgeGroup(i), Index_InfectionState( + InfectionState.Recovered)] = random.uniform( + 0.002, 0.08) * populations[region][i] + model.populations[AgeGroup(i), + Index_InfectionState(InfectionState.Dead)] = random.uniform( + 0, 0.0003) * populations[region][i] + model.populations.set_difference_from_group_total_AgeGroup( + (AgeGroup(i), Index_InfectionState(InfectionState.Susceptible)), + populations[region][i]) + + # Compartment transition propabilities + model.parameters.RelativeTransmissionNoSymptoms[AgeGroup(i)] = 0.5 + model.parameters.TransmissionProbabilityOnContact[AgeGroup( + i)] = 0.1 + model.parameters.RecoveredPerInfectedNoSymptoms[AgeGroup(i)] = 0.09 + model.parameters.RiskOfInfectionFromSymptomatic[AgeGroup(i)] = 0.25 + model.parameters.SeverePerInfectedSymptoms[AgeGroup(i)] = 0.2 + model.parameters.CriticalPerSevere[AgeGroup(i)] = 0.25 + model.parameters.DeathsPerCritical[AgeGroup(i)] = 0.3 + # twice the value of RiskOfInfectionFromSymptomatic + model.parameters.MaxRiskOfInfectionFromSymptomatic[AgeGroup( + i)] = 0.5 + + # StartDay is the n-th day of the year + model.parameters.StartDay = ( + date(start_year, start_month, start_day) - date(start_year, 1, 1)).days + + # Load baseline and minimum contact matrix and assign them to the model + baseline = getBaselineMatrix() + #minimum = getMinimumMatrix() + + model.parameters.ContactPatterns.cont_freq_mat[0].baseline = baseline + model.parameters.ContactPatterns.cont_freq_mat[0].minimum = np.ones( + (num_groups, num_groups)) * 0 + + # Apply mathematical constraints to parameters + model.apply_constraints() + models.append(model) + + graph = ModelGraph() + for i in range(num_regions): + graph.add_node(int(countykey_list[i]), models[i]) + + # get mobility data directory + arg_dict = gd.cli("commuter_official") + + directory = arg_dict['out_folder'].split('/pydata')[0] + directory = os.path.join(directory, 'mobility/') + + # Merge Eisenach and Wartbugkreis in Input Data + tmd.updateMobility2022(directory, mobility_file='twitter_scaled_1252') + tmd.updateMobility2022(directory, mobility_file='commuter_migration_scaled') + + num_locations = 4 + + set_edges(os.path.abspath(os.path.join(directory, os.pardir)), + graph, num_locations) + + study = ParameterStudy(graph, 0, days, dt=dt, num_runs=1) + study.run() + + graph_run = study.run()[0] + results = interpolate_simulation_result(graph_run) + + for result_indx in range(len(results)): + results[result_indx] = remove_confirmed_compartments( + 
np.transpose(results[result_indx].as_ndarray()[1:, :]), num_groups) + + # Omit first column, as the time points are not of interest here. + dataset_entry = copy.deepcopy(results) + + return dataset_entry + +def remove_confirmed_compartments(dataset_entries, num_groups): + """! The compartments which contain confirmed cases are not needed and are + therefore omitted by summarizing the confirmed compartment with the + original compartment. + @param dataset_entries Array that contains the compartmental data with + confirmed compartments. + @param num_groups Number of age groups. + @return Array that contains the compartmental data without confirmed compartments. + """ + + new_dataset_entries = [] + for i in dataset_entries : + dataset_entries_reshaped = i.reshape( + [num_groups, int(np.asarray(dataset_entries).shape[1]/num_groups)] + ) + sum_inf_no_symp = np.sum(dataset_entries_reshaped [:, [2, 3]], axis=1) + sum_inf_symp = np.sum(dataset_entries_reshaped [:, [4, 5]], axis=1) + dataset_entries_reshaped[:, 2] = sum_inf_no_symp + dataset_entries_reshaped[:, 4] = sum_inf_symp + new_dataset_entries.append( + np.delete(dataset_entries_reshaped , [3, 5], axis=1).flatten() + ) + return new_dataset_entries + + +def get_population(path="data/pydata/Germany/county_population.json"): + """! loads population data + @param path Path to population file. + @return List with all 400 populations and 6 age groups. + """ + + with open(path) as f: + data = json.load(f) + population = [] + for data_entry in data: + population_county = [] + population_county.append( + data_entry['<3 years'] + data_entry['3-5 years'] / 2) + population_county.append(data_entry['6-14 years']) + population_county.append( + data_entry['15-17 years'] + data_entry['18-24 years'] + + data_entry['25-29 years'] + data_entry['30-39 years'] / 2) + population_county.append( + data_entry['30-39 years'] / 2 + data_entry['40-49 years'] + + data_entry['50-64 years'] * 2 / 3) + population_county.append( + data_entry['65-74 years'] + data_entry['>74 years'] * 0.2 + + data_entry['50-64 years'] * 1 / 3) + population_county.append( + data_entry['>74 years'] * 0.8) + + population.append(population_county) + return population + +def getBaselineMatrix(): + """! loads the baselinematrix + """ + + baseline_contact_matrix0 = os.path.join( + "./data/contacts/baseline_home.txt") + baseline_contact_matrix1 = os.path.join( + "./data/contacts/baseline_school_pf_eig.txt") + baseline_contact_matrix2 = os.path.join( + "./data/contacts/baseline_work.txt") + baseline_contact_matrix3 = os.path.join( + "./data/contacts/baseline_other.txt") + + baseline = np.loadtxt(baseline_contact_matrix0) \ + + np.loadtxt(baseline_contact_matrix1) + \ + np.loadtxt(baseline_contact_matrix2) + \ + np.loadtxt(baseline_contact_matrix3) + + return baseline + +def generate_data( + num_runs, path, input_width, days, save_data=True): + """! Generate dataset by calling run_secir_simulation (num_runs)-often + @param num_runs Number of times, the function run_secir_simulation is called. + @param path Path, where the datasets are stored. + @param input_width number of time steps used for model input. + @param label_width number of time steps (days) used as model output/label. + @param save_data Option to deactivate the save of the dataset. Per default true. 
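+    @return None; the scaled dataset is only stored on disk.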
+ """ + + population = get_population() + days_sum = days + input_width - 1 + + data = {"inputs": [], + "labels": [], + } + + # show progess in terminal for longer runs + # Due to the random structure, theres currently no need to shuffle the data + bar = Bar('Number of Runs done', max=num_runs) + + for _ in range(num_runs): + + data_run = run_secir_groups_simulation( + days_sum, population) + + inputs = np.asarray(data_run).transpose(1, 2, 0)[: input_width] + data["inputs"].append(inputs) + + data["labels"].append(np.asarray(data_run).transpose(1, 2, 0)[input_width:]) + + bar.next() + + bar.finish() + + if save_data: + num_groups = int(np.asarray(data['inputs']).shape[2] / 8) + transformer = FunctionTransformer(np.log1p, validate=True) + + # Scale inputs + inputs = np.asarray( + data['inputs']).transpose(2, 0, 1, 3).reshape(num_groups * 8, -1) + scaled_inputs = transformer.transform(inputs) + original_shape_input = np.asarray(data['inputs']).shape + + # Step 1: Reverse the reshape + reshaped_back = scaled_inputs.reshape(original_shape_input[2], + original_shape_input[0], + original_shape_input[1], + original_shape_input[3]) + + # Step 2: Reverse the transpose + original_inputs = reshaped_back.transpose(1, 2, 0, 3) + scaled_inputs = original_inputs.transpose(0, 3, 1, 2) + + + # Scale labels + labels = np.asarray( + data['labels']).transpose(2, 0, 1, 3).reshape(num_groups * 8, -1) + scaled_labels = transformer.transform(labels) + original_shape_labels = np.asarray(data['labels']).shape + + # Step 1: Reverse the reshape + reshaped_back = scaled_labels.reshape(original_shape_labels[2], + original_shape_labels[0], + original_shape_labels[1], + original_shape_labels[3]) + + # Step 2: Reverse the transpose + original_labels = reshaped_back.transpose(1, 2, 0, 3) + scaled_labels = original_labels.transpose(0, 3, 1, 2) + + all_data = {"inputs": scaled_inputs, + "labels": scaled_labels, + } + + # check if data directory exists. If necessary create it. 
+ if not os.path.isdir(path): + os.mkdir(path) + + # save dict to json file + with open(os.path.join(path, 'data_secir_age_groups.pickle'), 'wb') as f: + pickle.dump(all_data, f) + + +if __name__ == "__main__": + + path = os.path.dirname(os.path.realpath(__file__)) + path_data = os.path.join( + os.path.dirname( + os.path.realpath(os.path.dirname(os.path.realpath(path)))), + 'data_GNN_nodamp') + + input_width = 5 + days = 30 + num_runs = 1000 + number_of_populations = 400 + generate_data(num_runs, path_data, input_width, + days, number_of_populations) + From 75ce80d9d054b4bd997c9bc14ee9c0db9f346f01 Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Wed, 7 Aug 2024 15:19:55 +0200 Subject: [PATCH 05/33] delete file --- .../surrogatemodel/GNN/data_generation_nodamp | 314 ------------------ 1 file changed, 314 deletions(-) delete mode 100644 pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp deleted file mode 100644 index 3beecebdff..0000000000 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp +++ /dev/null @@ -1,314 +0,0 @@ -import copy -import os -import pickle -import random -import json -from datetime import date - -import numpy as np -from progress.bar import Bar -from sklearn.preprocessing import FunctionTransformer - -from memilio.simulation import (AgeGroup, LogLevel, set_log_level) - -from memilio.simulation.osecir import (Index_InfectionState, interpolate_simulation_result, ParameterStudy, - InfectionState, Model, ModelGraph, - interpolate_simulation_result, set_edges) - -from memilio.epidata import geoModificationGermany as geoger -from memilio.epidata import transformMobilityData as tmd -from memilio.epidata import getDataIntoPandasDataFrame as gd - - -def run_secir_groups_simulation(days, populations): - """! Uses an ODE SECIR model allowing for asymptomatic infection with 6 different age groups. The model is not stratified by region. - Virus-specific parameters are fixed and initial number of persons in the particular infection states are chosen randomly from defined ranges. - @param Days Describes how many days we simulate within a single run. - @param damping_day The day when damping is applied. - @param populations List containing the population in each age group. - @return List containing the populations in each compartment used to initialize the run. - """ - set_log_level(LogLevel.Off) - - start_day = 1 - start_month = 1 - start_year = 2019 - dt = 0.1 - - # get county ids - countykey_list = geoger.get_county_ids(merge_eisenach=True, zfill=True) - - # Define age Groups - groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80+'] - num_groups = len(groups) - num_regions = len(populations) - models = [] - - # Initialize Parameters - for region in range(num_regions): - model = Model(num_groups) - - # Set parameters - for i in range(num_groups): - # Compartment transition duration - model.parameters.TimeExposed[AgeGroup(i)] = 3.2 - model.parameters.TimeInfectedNoSymptoms[AgeGroup(i)] = 2. - model.parameters.TimeInfectedSymptoms[AgeGroup(i)] = 6. - model.parameters.TimeInfectedSevere[AgeGroup(i)] = 12. - model.parameters.TimeInfectedCritical[AgeGroup(i)] = 8. 
- - # Initial number of people in each compartment with random numbers - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.Exposed)] = random.uniform( - 0.00025, 0.005) * populations[region][i] - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.InfectedNoSymptoms)] = random.uniform( - 0.0001, 0.0035) * populations[region][i] - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.InfectedNoSymptomsConfirmed)] = 0 - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.InfectedSymptoms)] = random.uniform( - 0.00007, 0.001) * populations[region][i] - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.InfectedSymptomsConfirmed)] = 0 - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.InfectedSevere)] = random.uniform( - 0.00003, 0.0006) * populations[region][i] - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.InfectedCritical)] = random.uniform( - 0.00001, 0.0002) * populations[region][i] - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.Recovered)] = random.uniform( - 0.002, 0.08) * populations[region][i] - model.populations[AgeGroup(i), - Index_InfectionState(InfectionState.Dead)] = random.uniform( - 0, 0.0003) * populations[region][i] - model.populations.set_difference_from_group_total_AgeGroup( - (AgeGroup(i), Index_InfectionState(InfectionState.Susceptible)), populations[region][i]) - - # Compartment transition propabilities - model.parameters.RelativeTransmissionNoSymptoms[AgeGroup(i)] = 0.5 - model.parameters.TransmissionProbabilityOnContact[AgeGroup( - i)] = 0.1 - model.parameters.RecoveredPerInfectedNoSymptoms[AgeGroup(i)] = 0.09 - model.parameters.RiskOfInfectionFromSymptomatic[AgeGroup(i)] = 0.25 - model.parameters.SeverePerInfectedSymptoms[AgeGroup(i)] = 0.2 - model.parameters.CriticalPerSevere[AgeGroup(i)] = 0.25 - model.parameters.DeathsPerCritical[AgeGroup(i)] = 0.3 - # twice the value of RiskOfInfectionFromSymptomatic - model.parameters.MaxRiskOfInfectionFromSymptomatic[AgeGroup( - i)] = 0.5 - - # StartDay is the n-th day of the year - model.parameters.StartDay = ( - date(start_year, start_month, start_day) - date(start_year, 1, 1)).days - - # Load baseline and minimum contact matrix and assign them to the model - baseline = getBaselineMatrix() - #minimum = getMinimumMatrix() - - model.parameters.ContactPatterns.cont_freq_mat[0].baseline = baseline - model.parameters.ContactPatterns.cont_freq_mat[0].minimum = np.ones( - (num_groups, num_groups)) * 0 - - # Apply mathematical constraints to parameters - model.apply_constraints() - models.append(model) - - graph = ModelGraph() - for i in range(num_regions): - graph.add_node(int(countykey_list[i]), models[i]) - - # get mobility data directory - arg_dict = gd.cli("commuter_official") - - directory = arg_dict['out_folder'].split('/pydata')[0] - directory = os.path.join(directory, 'mobility/') - - # Merge Eisenach and Wartbugkreis in Input Data - tmd.updateMobility2022(directory, mobility_file='twitter_scaled_1252') - tmd.updateMobility2022(directory, mobility_file='commuter_migration_scaled') - - num_locations = 4 - - set_edges(os.path.abspath(os.path.join(directory, os.pardir)), graph, num_locations) - - study = ParameterStudy(graph, 0, days, dt=dt, num_runs=1) - study.run() - - graph_run = study.run()[0] - results = interpolate_simulation_result(graph_run) - - for result_indx in range(len(results)): - results[result_indx] = remove_confirmed_compartments( - 
np.transpose(results[result_indx].as_ndarray()[1:, :]), num_groups) - - # Omit first column, as the time points are not of interest here. - dataset_entry = copy.deepcopy(results) - - return dataset_entry - -def remove_confirmed_compartments(dataset_entries, num_groups): - """! The compartments which contain confirmed cases are not needed and are therefore omitted by summarizing the confirmed - compartment with the original compartment. - @param dataset_entries Array that contains the compartmental data with confirmed compartments. - @param num_groups Number of age groups. - @return Array that contains the compartmental data without confirmed compartments. - """ - new_dataset_entries = [] - for i in dataset_entries : - dataset_entries_reshaped = i.reshape([num_groups, int(np.asarray(dataset_entries).shape[1]/num_groups) ]) - sum_inf_no_symp = np.sum(dataset_entries_reshaped [:, [2, 3]], axis=1) - sum_inf_symp = np.sum(dataset_entries_reshaped [:, [4, 5]], axis=1) - dataset_entries_reshaped[:, 2] = sum_inf_no_symp - dataset_entries_reshaped[:, 4] = sum_inf_symp - new_dataset_entries.append(np.delete(dataset_entries_reshaped , [3, 5], axis=1).flatten()) - return new_dataset_entries - - -def get_population(path="data/pydata/Germany/county_population.json"): - """! loads population data - @param path Path to population file. - @return List with all 400 populations and 6 age groups. - """ - with open(path) as f: - data = json.load(f) - population = [] - for data_entry in data: - population_county = [] - population_county.append( - data_entry['<3 years'] + data_entry['3-5 years'] / 2) - population_county.append(data_entry['6-14 years']) - population_county.append( - data_entry['15-17 years'] + data_entry['18-24 years'] + - data_entry['25-29 years'] + data_entry['30-39 years'] / 2) - population_county.append( - data_entry['30-39 years'] / 2 + data_entry['40-49 years'] + - data_entry['50-64 years'] * 2 / 3) - population_county.append( - data_entry['65-74 years'] + data_entry['>74 years'] * 0.2 + - data_entry['50-64 years'] * 1 / 3) - population_county.append( - data_entry['>74 years'] * 0.8) - - population.append(population_county) - return population - -def getBaselineMatrix(): - """! loads the baselinematrix - """ - - baseline_contact_matrix0 = os.path.join( - "./data/contacts/baseline_home.txt") - baseline_contact_matrix1 = os.path.join( - "./data/contacts/baseline_school_pf_eig.txt") - baseline_contact_matrix2 = os.path.join( - "./data/contacts/baseline_work.txt") - baseline_contact_matrix3 = os.path.join( - "./data/contacts/baseline_other.txt") - - baseline = np.loadtxt(baseline_contact_matrix0) \ - + np.loadtxt(baseline_contact_matrix1) + \ - np.loadtxt(baseline_contact_matrix2) + \ - np.loadtxt(baseline_contact_matrix3) - - return baseline - -def generate_data( - num_runs, path, input_width, days, save_data=True): - """! Generate dataset by calling run_secir_simulation (num_runs)-often - @param num_runs Number of times, the function run_secir_simulation is called. - @param path Path, where the datasets are stored. - @param input_width number of time steps used for model input. - @param label_width number of time steps (days) used as model output/label. - @param save_data Option to deactivate the save of the dataset. Per default true. 
- """ - population = get_population() - days_sum = days+input_width-1 - - - # days = damping_days[-1]+20 - data = {"inputs": [], - "labels": [], - } - - # show progess in terminal for longer runs - # Due to the random structure, theres currently no need to shuffle the data - bar = Bar('Number of Runs done', max=num_runs) - - for _ in range(num_runs): - - data_run = run_secir_groups_simulation( - days_sum, population) - - inputs = np.asarray(data_run).transpose(1,2,0)[:input_width] - data["inputs"].append(inputs) - - data["labels"].append(np.asarray(data_run).transpose(1,2,0)[input_width:]) - - bar.next() - - bar.finish() - - if save_data: - num_groups = int(np.asarray(data['inputs']).shape[2]/8) - transformer = FunctionTransformer(np.log1p, validate=True) - - # Scale inputs - inputs = np.asarray( - data['inputs']).transpose(2,0,1,3).reshape(num_groups*8,-1) - scaled_inputs = transformer.transform(inputs) - original_shape_input = np.asarray(data['inputs']).shape - - # Step 1: Reverse the reshape - reshaped_back = scaled_inputs.reshape(original_shape_input[2], original_shape_input[0], - original_shape_input[1], original_shape_input[3]) - - # Step 2: Reverse the transpose - original_inputs = reshaped_back.transpose(1, 2, 0, 3) - scaled_inputs = original_inputs.transpose(0, 3, 1, 2) - - - # Scale labels - labels = np.asarray( - data['labels']).transpose(2,0,1,3).reshape(num_groups*8, -1) - scaled_labels = transformer.transform(labels) - original_shape_labels = np.asarray(data['labels']).shape - - # Step 1: Reverse the reshape - reshaped_back = scaled_labels.reshape(original_shape_labels[2], original_shape_labels[0], - original_shape_labels[1], original_shape_labels[3]) - - # Step 2: Reverse the transpose - original_labels = reshaped_back.transpose(1, 2, 0, 3) - scaled_labels = original_labels.transpose(0, 3, 1, 2) - - all_data = {"inputs": scaled_inputs, - "labels": scaled_labels, - } - - # check if data directory exists. If necessary create it. 
- if not os.path.isdir(path): - os.mkdir(path) - - # save dict to json file - with open(os.path.join(path, 'data_secir_age_groups.pickle'), 'wb') as f: - pickle.dump(all_data, f) - - -if __name__ == "__main__": - - path = os.path.dirname(os.path.realpath(__file__)) - path_data = os.path.join( - os.path.dirname( - os.path.realpath(os.path.dirname(os.path.realpath(path)))), - 'data_GNN_nodamp') - - - input_width = 5 - days = 30 - num_runs = 1000 - number_of_populations = 400 - generate_data(num_runs, path_data, input_width, - days, number_of_populations) - From 10b6e923a1c71b29a2137f367bbd17aaf845f4a4 Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Wed, 7 Aug 2024 15:28:49 +0200 Subject: [PATCH 06/33] adjust to code guidelines --- .../GNN/data_generation_withdamp.py | 82 +++++++++++-------- 1 file changed, 50 insertions(+), 32 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py index 0bae453fee..18c73565bd 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py @@ -3,30 +3,30 @@ import pickle import random import json +import numpy as np from datetime import date -import numpy as np from progress.bar import Bar from sklearn.preprocessing import FunctionTransformer from memilio.simulation import (AgeGroup, LogLevel, set_log_level, Damping) - from memilio.simulation.osecir import (Index_InfectionState, interpolate_simulation_result, ParameterStudy, InfectionState, Model, ModelGraph, interpolate_simulation_result, set_edges) - from memilio.epidata import geoModificationGermany as geoger from memilio.epidata import transformMobilityData as tmd from memilio.epidata import getDataIntoPandasDataFrame as gd - def run_secir_groups_simulation(days, populations, num_dampings): - """! Uses an ODE SECIR model allowing for asymptomatic infection with 6 different age groups. The model is not stratified by region. - Virus-specific parameters are fixed and initial number of persons in the particular infection states are chosen randomly from defined ranges. + """! Uses an ODE SECIR model allowing for asymptomatic infection + with 6 different age groups. The model is not stratified by region. + Virus-specific parameters are fixed and initial number of persons + in the particular infection states are chosen randomly from defined ranges. @param Days Describes how many days we simulate within a single run. @param damping_day The day when damping is applied. @param populations List containing the population in each age group. - @return List containing the populations in each compartment used to initialize the run. + @return List containing the populations in each compartment + used to initialize the run. """ set_log_level(LogLevel.Off) @@ -121,7 +121,7 @@ def run_secir_groups_simulation(days, populations, num_dampings): # add damping to model model.parameters.ContactPatterns.cont_freq_mat.add_damping(Damping( - coeffs=(damping), t=day, level=0, type=0)) + coeffs = (damping), t=day, level=0, type=0)) damped_matrices.append(model.parameters.ContactPatterns.cont_freq_mat.get_matrix_at( day+1)) @@ -165,20 +165,28 @@ def run_secir_groups_simulation(days, populations, num_dampings): return dataset_entry, damped_matrices, num_dampings, damping_coeff def remove_confirmed_compartments(dataset_entries, num_groups): - """! 
The compartments which contain confirmed cases are not needed and are therefore omitted by summarizing the confirmed + """! The compartments which contain confirmed cases are not + needed and are therefore omitted by summarizing the confirmed compartment with the original compartment. - @param dataset_entries Array that contains the compartmental data with confirmed compartments. + @param dataset_entries Array that contains the compartmental data with + confirmed compartments. @param num_groups Number of age groups. - @return Array that contains the compartmental data without confirmed compartments. + @return Array that contains the compartmental data without + confirmed compartments. """ + new_dataset_entries = [] for i in dataset_entries : - dataset_entries_reshaped = i.reshape([num_groups, int(np.asarray(dataset_entries).shape[1]/num_groups) ]) + dataset_entries_reshaped = i.reshape( + [num_groups, int(np.asarray(dataset_entries).shape[1]/num_groups)] + ) sum_inf_no_symp = np.sum(dataset_entries_reshaped [:, [2, 3]], axis=1) sum_inf_symp = np.sum(dataset_entries_reshaped [:, [4, 5]], axis=1) dataset_entries_reshaped[:, 2] = sum_inf_no_symp dataset_entries_reshaped[:, 4] = sum_inf_symp - new_dataset_entries.append(np.delete(dataset_entries_reshaped , [3, 5], axis=1).flatten()) + new_dataset_entries.append(np.delete( + dataset_entries_reshaped , [3, 5], axis=1).flatten() + ) return new_dataset_entries @@ -187,6 +195,7 @@ def get_population(path="data/pydata/Germany/county_population.json"): @param path Path to population file. @return List with all 400 populations and 6 age groups. """ + with open(path) as f: data = json.load(f) population = [] @@ -252,12 +261,14 @@ def getMinimumMatrix(): def generate_dampings_withshadowdamp(number_of_dampings, days, min_distance, min_damping_day, n_runs): - """! Draw damping days while keeping a minimum distance between the damping days. This method aims to create a - uniform ditribution of drawn damping days. + """! Draw damping days while keeping a minimum distance between the + damping days. This method aims to create a uniform ditribution of + drawn damping days. @param num_of_dampings Number of dampings that have to be drawn. @param days Number of days which are simulated (label_width). @param min_distance The minimum number of days between two dampings. - @param min_damping_day The earliest day of the simualtion where a damping can take place. + @param min_damping_day The earliest day of the simualtion where a damping + can take place. @param n_runs Number of simulation runs. 
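+    @return Array of shape (n_runs, number_of_dampings) containing the
+        sorted damping days of each run.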
""" @@ -266,10 +277,11 @@ def generate_dampings_withshadowdamp(number_of_dampings, days, min_distance, min count_shadow = 0 while len(all_dampings) Date: Mon, 12 Aug 2024 13:44:51 +0200 Subject: [PATCH 07/33] create a make_graph function --- .../GNN/data_generation_nodamp.py | 51 ++++++++++++------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py index e15fccf453..c3d1bf4dcc 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py @@ -118,24 +118,7 @@ def run_secir_groups_simulation(days, populations): model.apply_constraints() models.append(model) - graph = ModelGraph() - for i in range(num_regions): - graph.add_node(int(countykey_list[i]), models[i]) - - # get mobility data directory - arg_dict = gd.cli("commuter_official") - - directory = arg_dict['out_folder'].split('/pydata')[0] - directory = os.path.join(directory, 'mobility/') - - # Merge Eisenach and Wartbugkreis in Input Data - tmd.updateMobility2022(directory, mobility_file='twitter_scaled_1252') - tmd.updateMobility2022(directory, mobility_file='commuter_migration_scaled') - - num_locations = 4 - - set_edges(os.path.abspath(os.path.join(directory, os.pardir)), - graph, num_locations) + graph = make_graph(num_regions, countykey_list, models) study = ParameterStudy(graph, 0, days, dt=dt, num_runs=1) study.run() @@ -152,6 +135,7 @@ def run_secir_groups_simulation(days, populations): return dataset_entry + def remove_confirmed_compartments(dataset_entries, num_groups): """! The compartments which contain confirmed cases are not needed and are therefore omitted by summarizing the confirmed compartment with the @@ -226,6 +210,35 @@ def getBaselineMatrix(): return baseline +def make_graph(num_regions, countykey_list, models): + """! + @param num_regions Number (int) of counties that should be added to the + grap-ODE model. Equals 400 for whole Germany. + @param countykey_list List of keys/IDs for each county. + @models models List of osecir Model with one model per population. + @return graph Graph-ODE model. + """ + graph = ModelGraph() + for i in range(num_regions): + graph.add_node(int(countykey_list[i]), models[i]) + + # get mobility data directory + arg_dict = gd.cli("commuter_official") + + directory = arg_dict['out_folder'].split('/pydata')[0] + directory = os.path.join(directory, 'mobility/') + + # Merge Eisenach and Wartbugkreis in Input Data + tmd.updateMobility2022(directory, mobility_file='twitter_scaled_1252') + tmd.updateMobility2022(directory, mobility_file='commuter_migration_scaled') + + num_locations = 4 + + set_edges(os.path.abspath(os.path.join(directory, os.pardir)), + graph, num_locations) + return graph + + def generate_data( num_runs, path, input_width, days, save_data=True): """! 
Generate dataset by calling run_secir_simulation (num_runs)-often @@ -317,7 +330,7 @@ def generate_data( path_data = os.path.join( os.path.dirname( os.path.realpath(os.path.dirname(os.path.realpath(path)))), - 'data_GNN_nodamp') + 'data_GNN_nodamp_test') input_width = 5 days = 30 From c9b26cf01a5759eabd64d5bfbeb62592f55813a9 Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Mon, 12 Aug 2024 14:22:37 +0200 Subject: [PATCH 08/33] create make graph function --- .../GNN/data_generation_withdamp.py | 53 ++++++++++++------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py index 18c73565bd..852cb0dfbf 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py @@ -131,23 +131,7 @@ def run_secir_groups_simulation(days, populations, num_dampings): model.apply_constraints() models.append(model) - graph = ModelGraph() - for i in range(num_regions): - graph.add_node(int(countykey_list[i]), models[i]) - - # get mobility data directory - arg_dict = gd.cli("commuter_official") - - directory = arg_dict['out_folder'].split('/pydata')[0] - directory = os.path.join(directory, 'mobility/') - - # Merge Eisenach and Wartbugkreis in Input Data - tmd.updateMobility2022(directory, mobility_file='twitter_scaled_1252') - tmd.updateMobility2022(directory, mobility_file='commuter_migration_scaled') - - num_locations = 4 - - set_edges(os.path.abspath(os.path.join(directory, os.pardir)), graph, num_locations) + graph = make_graph(num_regions, countykey_list, models) study = ParameterStudy(graph, 0, days, dt=dt, num_runs=1) study.run() @@ -260,6 +244,35 @@ def getMinimumMatrix(): return minimum +def make_graph(num_regions, countykey_list, models): + """! + @param num_regions Number (int) of counties that should be added to the + grap-ODE model. Equals 400 for whole Germany. + @param countykey_list List of keys/IDs for each county. + @models models List of osecir Model with one model per population. + @return graph Graph-ODE model. + """ + graph = ModelGraph() + for i in range(num_regions): + graph.add_node(int(countykey_list[i]), models[i]) + + # get mobility data directory + arg_dict = gd.cli("commuter_official") + + directory = arg_dict['out_folder'].split('/pydata')[0] + directory = os.path.join(directory, 'mobility/') + + # Merge Eisenach and Wartbugkreis in Input Data + tmd.updateMobility2022(directory, mobility_file='twitter_scaled_1252') + tmd.updateMobility2022(directory, mobility_file='commuter_migration_scaled') + + num_locations = 4 + + set_edges(os.path.abspath(os.path.join(directory, os.pardir)), + graph, num_locations) + return graph + + def generate_dampings_withshadowdamp(number_of_dampings, days, min_distance, min_damping_day, n_runs): """! Draw damping days while keeping a minimum distance between the damping days. 
This method aims to create a uniform ditribution of @@ -425,15 +438,15 @@ def generate_data( input_width = 5 label_width = 30 - number_of_dampings = 3 + number_of_dampings = 1 num_runs = 2 - number_of_populations = 400 + number_of_populations = 20 path = os.path.dirname(os.path.realpath(__file__)) path_data = os.path.join( os.path.dirname( os.path.realpath(os.path.dirname(os.path.realpath(path)))), - 'data_GNN_with_'+str(number_of_dampings)+'_dampings') + 'data_GNN_with_'+str(number_of_dampings)+'_dampings_test') generate_data(num_runs, path_data, input_width, label_width, number_of_populations) From d97a1f7df16636947b8608d09cbec6bec7b88527 Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Wed, 14 Aug 2024 12:44:27 +0200 Subject: [PATCH 09/33] fix population number at 400 --- .../memilio/surrogatemodel/GNN/data_generation_nodamp.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py index c3d1bf4dcc..06a4a4c67b 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py @@ -323,6 +323,7 @@ def generate_data( with open(os.path.join(path, 'data_secir_age_groups.pickle'), 'wb') as f: pickle.dump(all_data, f) + return data if __name__ == "__main__": @@ -330,12 +331,12 @@ def generate_data( path_data = os.path.join( os.path.dirname( os.path.realpath(os.path.dirname(os.path.realpath(path)))), - 'data_GNN_nodamp_test') + 'data_GNN_nodamp') input_width = 5 days = 30 num_runs = 1000 - number_of_populations = 400 + generate_data(num_runs, path_data, input_width, - days, number_of_populations) + days) From e20fe2d6386995a493e4a8391585f3a613a045c1 Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Wed, 14 Aug 2024 12:50:28 +0200 Subject: [PATCH 10/33] fix number of population to 400 and rename damping information --- .../GNN/data_generation_withdamp.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py index 852cb0dfbf..5bc060c84b 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py @@ -17,7 +17,7 @@ from memilio.epidata import transformMobilityData as tmd from memilio.epidata import getDataIntoPandasDataFrame as gd -def run_secir_groups_simulation(days, populations, num_dampings): +def run_secir_groups_simulation(days, populations, dampings): """! Uses an ODE SECIR model allowing for asymptomatic infection with 6 different age groups. The model is not stratified by region. Virus-specific parameters are fixed and initial number of persons @@ -113,7 +113,7 @@ def run_secir_groups_simulation(days, populations, num_dampings): # Generate a damping matrix and assign it to the model damped_matrices = [] damping_coeff = [] - for day in num_dampings: + for day in dampings: # generat a random damping factor damping = np.ones((num_groups, num_groups) @@ -146,7 +146,7 @@ def run_secir_groups_simulation(days, populations, num_dampings): # Omit first column, as the time points are not of interest here. 
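     # (After interpolation to full days and the merging of the confirmed
     # compartments above, each region's entry holds days+1 daily snapshots
     # with 6 age groups * 8 compartments = 48 values per day.)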
dataset_entry = copy.deepcopy(results) - return dataset_entry, damped_matrices, num_dampings, damping_coeff + return dataset_entry, damped_matrices, dampings, damping_coeff def remove_confirmed_compartments(dataset_entries, num_groups): """! The compartments which contain confirmed cases are not @@ -331,7 +331,7 @@ def generate_dampings_withshadowdamp(number_of_dampings, days, min_distance, min return np.asarray(all_dampings) def generate_data( - num_runs, path, input_width, label_width, save_data=True): + num_runs, path, input_width, label_width, number_of_dampings, save_data=True): """! Generate dataset by calling run_secir_simulation (num_runs)-often @param num_runs Number of times, the function run_secir_simulation is called. @param path Path, where the datasets are stored. @@ -437,11 +437,9 @@ def generate_data( if __name__ == "__main__": input_width = 5 - label_width = 30 - number_of_dampings = 1 - num_runs = 2 - number_of_populations = 20 - + label_width = 100 + number_of_dampings = 2 + num_runs = 1000 path = os.path.dirname(os.path.realpath(__file__)) path_data = os.path.join( os.path.dirname( @@ -449,5 +447,5 @@ def generate_data( 'data_GNN_with_'+str(number_of_dampings)+'_dampings_test') generate_data(num_runs, path_data, input_width, - label_width, number_of_populations) + label_width, number_of_dampings) From d9110e5c6289cbdf7965a910df5bfff2502b1ff3 Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Thu, 15 Aug 2024 11:07:57 +0200 Subject: [PATCH 11/33] add "return data" for case without saving --- .../memilio/surrogatemodel/GNN/data_generation_withdamp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py index 5bc060c84b..ac7c7d4599 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py @@ -432,6 +432,7 @@ def generate_data( # save dict to json file with open(os.path.join(path, 'data_secir_age_groups.pickle'), 'wb') as f: pickle.dump(all_data, f) + return data if __name__ == "__main__": From 8823aa1fccfd6e107f46b63759a3242a38e5a50e Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Thu, 15 Aug 2024 13:50:37 +0200 Subject: [PATCH 12/33] tests for simulation run and datageneration for models with and without dampings --- .../test_surrogatemodel_GNN.py | 284 ++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py new file mode 100644 index 0000000000..7f86a6d469 --- /dev/null +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py @@ -0,0 +1,284 @@ +############################################################################# +# Copyright (C) 2020-2023 German Aerospace Center (DLR-SC) +# +# Authors: +# +# Contact: Martin J. Kuehn +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################# +from pyfakefs import fake_filesystem_unittest + +from memilio.surrogatemodel.GNN import (data_generation_nodamp, data_generation_withdamp) +import memilio.simulation as mio +import memilio.simulation.osecir as osecir + +from unittest.mock import patch +import os +import unittest + +import numpy as np +import logging +import json +# suppress all autograph warnings from tensorflow + +#logging.getLogger("tensorflow").setLevel(logging.ERROR) +import tensorflow as tf +tf.get_logger().setLevel('ERROR') + +class TestSurrogatemodelGNN(fake_filesystem_unittest.TestCase): + + path = '/home/' + + num_groups = 6 + model = osecir.Model(num_groups) + graph = osecir.ModelGraph() + graph.add_node(0, model) + graph.add_node(1, model) + mobility_coefficients = 0.01 * np.ones(model.populations.numel()) + for i in range(num_groups): + flat_index = model.populations.get_flat_index( + osecir.MultiIndex_PopulationsArray(mio.AgeGroup(i), osecir.InfectionState.Dead)) + mobility_coefficients[flat_index] = 0 + graph.add_edge(0, 1, mobility_coefficients) + graph.add_edge(1, 0, mobility_coefficients) + + def setUp(self): + self.setUpPyfakefs() + +#### test simulation no damp #### + + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.getBaselineMatrix', + return_value=0.6 * np.ones((6, 6))) + + # create mock graph + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.make_graph', + return_value= graph) + + def test_simulation_run_nodamp(self, mock_baseline, mockgraph): + days_1 = 10 + days_2 = 30 + days_3 = 50 + + population = [[5256.0, 10551, 32368.5, + 43637.833333333336, 22874.066666666666, 8473.6], + [4000, 8000, 40000, + 28000, 15000, 6000]] + + simulation_1 = data_generation_nodamp.run_secir_groups_simulation( + days_1, population) + simulation_2 = data_generation_nodamp.run_secir_groups_simulation( + days_2, population) + simulation_3 = data_generation_nodamp.run_secir_groups_simulation( + days_3, population) + + # result length + self.assertEqual(len(simulation_1[0]), days_1+1) + self.assertEqual(len(simulation_2[0]), days_2+1) + self.assertEqual(len(simulation_3[0]), days_3+1) + +#### test data genertion no damp #### + + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.getBaselineMatrix', + return_value=0.6 * np.ones((6, 6))) + + # create mock graph + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.make_graph', + return_value= graph) + + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.get_population', + return_value= np.random.randint(0, 700001, size=(400, 6))) + + def test_data_generation_runs_nodamp( + self, mock_population, mock_baseline, mock_graph): + + input_width_1 = 1 + input_width_2 = 5 + + label_width_1 = 1 + label_width_2 = 10 + + num_runs_1 = 1 + num_runs_2 = 2 + + # test data generation without damping + data_1 = data_generation_nodamp.generate_data( + num_runs_1, self.path, input_width_1, label_width_1, + save_data=False) + self.assertEqual(len(data_1['inputs']), num_runs_1) + self.assertEqual(len(data_1['inputs'][0]), input_width_1) + 
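# 48 features per day and region = 6 age groups * 8 compartments
+        # (the confirmed compartments are merged away during data generation)
+        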
self.assertEqual(len(data_1['inputs'][0][0]), 48) + self.assertEqual(len(data_1['labels']), num_runs_1) + self.assertEqual(len(data_1['labels'][0]), label_width_1) + self.assertEqual(len(data_1['labels'][0][0]), 48) + + data_2 = data_generation_nodamp.generate_data( + num_runs_2, self.path, input_width_2, label_width_2, + save_data=False) + self.assertEqual(len(data_2['inputs']), num_runs_2) + self.assertEqual(len(data_2['inputs'][0]), input_width_2) + self.assertEqual(len(data_2['inputs'][0][0]), 48) + self.assertEqual(len(data_2['labels']), num_runs_2) + self.assertEqual(len(data_2['labels'][0]), label_width_2) + self.assertEqual(len(data_2['labels'][0][0]), 48) + + + + # @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.getBaselineMatrix', + # return_value=0.6 * np.ones((6, 6))) + + # @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.get_population', + # return_value= np.random.randint(0, 700001, size=(400, 6))) + # # create mock graph + # @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.make_graph', + # return_value= graph) + + # def test_data_generation_save_nodamp( + # self, mock_population, mock_baseline, mock_graph): + + # input_width = 2 + # label_width = 3 + # num_runs = 1 + + # data_generation_nodamp.generate_data(num_runs, self.path, input_width, + # label_width) + # self.assertEqual(len(os.listdir(self.path)), 1) + + # self.assertEqual(os.listdir(self.path), + # ['data_secir_groups.pickle']) + + + + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.getMinimumMatrix', + return_value=0 * np.ones((6, 6))) + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.getBaselineMatrix', + return_value=0.6 * np.ones((6, 6))) + # create mock graph + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.make_graph', + return_value= graph) + + def test_simulation_run_withdamp(self, mock_baseline, mock_minimum, mock_graph): + + days_1 = 10 + days_2 = 30 + days_3 = 50 + + dampings1 = [5] + dampings2 = [6, 15] + dampings3 = [8, 18, 35] + + + population = [[5256.0, 10551, 32368.5, + 43637.833333333336, 22874.066666666666, 8473.6], + [4000, 8000, 40000, + 28000, 15000, 6000]] + + dataset_entry1, damped_matrices1, num_damp1, damping_coeff1 = data_generation_withdamp.run_secir_groups_simulation( + days_1, population, dampings1) + dataset_entry2, damped_matrices2, num_damp2, damping_coeff2 = data_generation_withdamp.run_secir_groups_simulation( + days_2, population, dampings2) + dataset_entry3, damped_matrices3, num_damp3, damping_coeff3 = data_generation_withdamp.run_secir_groups_simulation( + days_3, population, dampings3) + + # result length + self.assertEqual(len(dataset_entry1[0]), days_1+1) + self.assertEqual(len(dataset_entry2[0]), days_2+1) + self.assertEqual(len(dataset_entry3[0]), days_3+1) + + + baseline = data_generation_withdamp.getBaselineMatrix() + #damping factor + self.assertEqual(damped_matrices1[0].all(), + (baseline * damping_coeff1[0]).all()) + self.assertEqual( + damped_matrices2[1].all(), + (baseline * damping_coeff2[1]).all()) + self.assertEqual( + damped_matrices3[2].all(), + (baseline * damping_coeff3[2]).all()) + + # number of dampings length + self.assertEqual(len(damping_coeff1), len(dampings1)) + self.assertEqual(len(damping_coeff2), len(dampings2)) + self.assertEqual(len(damping_coeff3), len(dampings3)) + + + + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.getMinimumMatrix', + return_value=0 * np.ones((6, 6))) + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.getBaselineMatrix', + 
return_value=0.6 * np.ones((6, 6))) + # create mock graph + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.make_graph', + return_value= graph) + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.get_population', + return_value= np.random.randint(0, 700001, size=(400, 6))) + + def test_data_generation_runs_withdamp( + self, mock_population, mock_baseline, mock_minimum, mock_graph): + + input_width_1 = 1 + input_width_2 = 5 + + label_width_1 = 10 + label_width_2 = 30 + + num_runs_1 = 1 + num_runs_2 = 2 + + damping1 = 1 + damping2 = 2 + + data_1 = data_generation_withdamp.generate_data( + num_runs_1, self.path, input_width_1, label_width_1, + damping1, save_data=False) + self.assertEqual(len(data_1['inputs']), num_runs_1) + self.assertEqual(len(data_1['inputs'][0]), input_width_1) + self.assertEqual(len(data_1['inputs'][0][0]), 48) + self.assertEqual(len(data_1['labels']), num_runs_1) + self.assertEqual(len(data_1['labels'][0]), label_width_1) + self.assertEqual(len(data_1['labels'][0][0]), 48) + + data_2 = data_generation_withdamp.generate_data( + num_runs_2, self.path, input_width_2, label_width_2, + damping2, save_data=False) + self.assertEqual(len(data_2['inputs']), num_runs_2) + self.assertEqual(len(data_2['inputs'][0]), input_width_2) + self.assertEqual(len(data_2['inputs'][0][0]), 48) + self.assertEqual(len(data_2['labels']), num_runs_2) + self.assertEqual(len(data_2['labels'][0]), label_width_2) + self.assertEqual(len(data_2['labels'][0][0]), 48) + + + + + + + # def test_data_generation_save_withdamp( + # self, mock_population, mock_baseline, mock_minimum): + + # input_width = 2 + # label_width = 3 + # num_runs = 1 + # dampings = 3 + + # data_generation_withdamp.generate_data(num_runs, self.path, "", input_width, + # label_width) + # self.assertEqual(len(os.listdir(self.path)), 1) + + # self.assertEqual(os.listdir(self.path), + # ['data_secir_groups.pickle']) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From 74da6d8dcd132f1bd4ded0037d571c38ed993b6b Mon Sep 17 00:00:00 2001 From: AgathaSchmidt <62878663+AgathaSchmidt@users.noreply.github.com> Date: Mon, 26 Aug 2024 09:50:59 +0200 Subject: [PATCH 13/33] Apply suggestions from code review Co-authored-by: Henrik Zunker <69154294+HenrZu@users.noreply.github.com> --- .../memilio/surrogatemodel/GNN/data_generation_nodamp.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py index 06a4a4c67b..a55fe77a0e 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py @@ -24,7 +24,6 @@ def run_secir_groups_simulation(days, populations): Virus-specific parameters are fixed and initial number of persons in the particular infection states are chosen randomly from defined ranges. @param Days Describes how many days we simulate within a single run. - @param damping_day The day when damping is applied. @param populations List containing the population in each age group. @return List containing the populations in each compartment used to initialize the run. 
@@ -98,7 +97,6 @@ def run_secir_groups_simulation(days, populations): model.parameters.SeverePerInfectedSymptoms[AgeGroup(i)] = 0.2 model.parameters.CriticalPerSevere[AgeGroup(i)] = 0.25 model.parameters.DeathsPerCritical[AgeGroup(i)] = 0.3 - # twice the value of RiskOfInfectionFromSymptomatic model.parameters.MaxRiskOfInfectionFromSymptomatic[AgeGroup( i)] = 0.5 @@ -108,7 +106,6 @@ def run_secir_groups_simulation(days, populations): # Load baseline and minimum contact matrix and assign them to the model baseline = getBaselineMatrix() - #minimum = getMinimumMatrix() model.parameters.ContactPatterns.cont_freq_mat[0].baseline = baseline model.parameters.ContactPatterns.cont_freq_mat[0].minimum = np.ones( From 47f99eb2e0d0f1a090b25d3f37b20d2cda8b110a Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Thu, 29 Aug 2024 13:29:31 +0200 Subject: [PATCH 14/33] apply suggestions from code review --- .../GNN/data_generation_nodamp.py | 266 ++++++------------ 1 file changed, 90 insertions(+), 176 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py index a55fe77a0e..0bbb2a6bc3 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py @@ -2,20 +2,20 @@ import os import pickle import random -import json -from datetime import date import numpy as np - +import pandas as pd + from progress.bar import Bar from sklearn.preprocessing import FunctionTransformer +from datetime import date from memilio.simulation import (AgeGroup, LogLevel, set_log_level) from memilio.simulation.osecir import (Index_InfectionState, interpolate_simulation_result, ParameterStudy, - InfectionState, Model, ModelGraph, - interpolate_simulation_result, set_edges) + InfectionState, Model, interpolate_simulation_result) from memilio.epidata import geoModificationGermany as geoger -from memilio.epidata import transformMobilityData as tmd -from memilio.epidata import getDataIntoPandasDataFrame as gd +import memilio.epidata.getPopulationData as gpd +from memilio.epidata import modifyDataframeSeries as mdfs +from .GNN_utils import getBaselineMatrix, transform_mobility_directory, make_graph, remove_confirmed_compartments def run_secir_groups_simulation(days, populations): @@ -28,7 +28,7 @@ def run_secir_groups_simulation(days, populations): @return List containing the populations in each compartment used to initialize the run. """ - + set_log_level(LogLevel.Off) start_day = 1 @@ -59,6 +59,7 @@ def run_secir_groups_simulation(days, populations): model.parameters.TimeInfectedCritical[AgeGroup(i)] = 8. 
# Initial number of people in each compartment with random numbers + # Numbers are chosen heuristically based on experience model.populations[AgeGroup(i), Index_InfectionState( InfectionState.Exposed)] = random.uniform( 0.00025, 0.005) * populations[region][i] @@ -82,11 +83,11 @@ def run_secir_groups_simulation(days, populations): InfectionState.Recovered)] = random.uniform( 0.002, 0.08) * populations[region][i] model.populations[AgeGroup(i), - Index_InfectionState(InfectionState.Dead)] = random.uniform( + Index_InfectionState(InfectionState.Dead)] = random.uniform( 0, 0.0003) * populations[region][i] model.populations.set_difference_from_group_total_AgeGroup( - (AgeGroup(i), Index_InfectionState(InfectionState.Susceptible)), - populations[region][i]) + (AgeGroup(i), Index_InfectionState(InfectionState.Susceptible)), + populations[region][i]) # Compartment transition propabilities model.parameters.RelativeTransmissionNoSymptoms[AgeGroup(i)] = 0.5 @@ -109,13 +110,14 @@ def run_secir_groups_simulation(days, populations): model.parameters.ContactPatterns.cont_freq_mat[0].baseline = baseline model.parameters.ContactPatterns.cont_freq_mat[0].minimum = np.ones( - (num_groups, num_groups)) * 0 + (num_groups, num_groups)) * 0 # Apply mathematical constraints to parameters model.apply_constraints() models.append(model) - graph = make_graph(num_regions, countykey_list, models) + directory = transform_mobility_directory() + graph = make_graph(directory, num_regions, countykey_list, models) study = ParameterStudy(graph, 0, days, dt=dt, num_runs=1) study.run() @@ -133,109 +135,6 @@ def run_secir_groups_simulation(days, populations): return dataset_entry -def remove_confirmed_compartments(dataset_entries, num_groups): - """! The compartments which contain confirmed cases are not needed and are - therefore omitted by summarizing the confirmed compartment with the - original compartment. - @param dataset_entries Array that contains the compartmental data with - confirmed compartments. - @param num_groups Number of age groups. - @return Array that contains the compartmental data without confirmed compartments. - """ - - new_dataset_entries = [] - for i in dataset_entries : - dataset_entries_reshaped = i.reshape( - [num_groups, int(np.asarray(dataset_entries).shape[1]/num_groups)] - ) - sum_inf_no_symp = np.sum(dataset_entries_reshaped [:, [2, 3]], axis=1) - sum_inf_symp = np.sum(dataset_entries_reshaped [:, [4, 5]], axis=1) - dataset_entries_reshaped[:, 2] = sum_inf_no_symp - dataset_entries_reshaped[:, 4] = sum_inf_symp - new_dataset_entries.append( - np.delete(dataset_entries_reshaped , [3, 5], axis=1).flatten() - ) - return new_dataset_entries - - -def get_population(path="data/pydata/Germany/county_population.json"): - """! loads population data - @param path Path to population file. - @return List with all 400 populations and 6 age groups. 
- """ - - with open(path) as f: - data = json.load(f) - population = [] - for data_entry in data: - population_county = [] - population_county.append( - data_entry['<3 years'] + data_entry['3-5 years'] / 2) - population_county.append(data_entry['6-14 years']) - population_county.append( - data_entry['15-17 years'] + data_entry['18-24 years'] + - data_entry['25-29 years'] + data_entry['30-39 years'] / 2) - population_county.append( - data_entry['30-39 years'] / 2 + data_entry['40-49 years'] + - data_entry['50-64 years'] * 2 / 3) - population_county.append( - data_entry['65-74 years'] + data_entry['>74 years'] * 0.2 + - data_entry['50-64 years'] * 1 / 3) - population_county.append( - data_entry['>74 years'] * 0.8) - - population.append(population_county) - return population - -def getBaselineMatrix(): - """! loads the baselinematrix - """ - - baseline_contact_matrix0 = os.path.join( - "./data/contacts/baseline_home.txt") - baseline_contact_matrix1 = os.path.join( - "./data/contacts/baseline_school_pf_eig.txt") - baseline_contact_matrix2 = os.path.join( - "./data/contacts/baseline_work.txt") - baseline_contact_matrix3 = os.path.join( - "./data/contacts/baseline_other.txt") - - baseline = np.loadtxt(baseline_contact_matrix0) \ - + np.loadtxt(baseline_contact_matrix1) + \ - np.loadtxt(baseline_contact_matrix2) + \ - np.loadtxt(baseline_contact_matrix3) - - return baseline - -def make_graph(num_regions, countykey_list, models): - """! - @param num_regions Number (int) of counties that should be added to the - grap-ODE model. Equals 400 for whole Germany. - @param countykey_list List of keys/IDs for each county. - @models models List of osecir Model with one model per population. - @return graph Graph-ODE model. - """ - graph = ModelGraph() - for i in range(num_regions): - graph.add_node(int(countykey_list[i]), models[i]) - - # get mobility data directory - arg_dict = gd.cli("commuter_official") - - directory = arg_dict['out_folder'].split('/pydata')[0] - directory = os.path.join(directory, 'mobility/') - - # Merge Eisenach and Wartbugkreis in Input Data - tmd.updateMobility2022(directory, mobility_file='twitter_scaled_1252') - tmd.updateMobility2022(directory, mobility_file='commuter_migration_scaled') - - num_locations = 4 - - set_edges(os.path.abspath(os.path.join(directory, os.pardir)), - graph, num_locations) - return graph - - def generate_data( num_runs, path, input_width, days, save_data=True): """! Generate dataset by calling run_secir_simulation (num_runs)-often @@ -245,13 +144,23 @@ def generate_data( @param label_width number of time steps (days) used as model output/label. @param save_data Option to deactivate the save of the dataset. Per default true. 
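+    @return Dictionary with the (unscaled) inputs and labels of all runs.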
""" - - population = get_population() + + df_population = pd.read_json( + 'data/pydata/Germany/county_population.json') + age_groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80-130'] + + df_population_agegroups = pd.DataFrame( + columns=[df_population.columns[0]] + age_groups) + for region_id in df_population.iloc[:, 0]: + df_population_agegroups.loc[len(df_population_agegroups.index), :] = [int(region_id)] + list( + mdfs.fit_age_group_intervals(df_population[df_population.iloc[:, 0] == int(region_id)].iloc[:, 2:], age_groups)) + + population = df_population_agegroups.values.tolist days_sum = days + input_width - 1 data = {"inputs": [], - "labels": [], - } + "labels": [], + } # show progess in terminal for longer runs # Due to the random structure, theres currently no need to shuffle the data @@ -259,69 +168,75 @@ def generate_data( for _ in range(num_runs): - data_run = run_secir_groups_simulation( - days_sum, population) + data_run = run_secir_groups_simulation( + days_sum, population) - inputs = np.asarray(data_run).transpose(1, 2, 0)[: input_width] - data["inputs"].append(inputs) + inputs = np.asarray(data_run).transpose(1, 2, 0)[: input_width] + data["inputs"].append(inputs) - data["labels"].append(np.asarray(data_run).transpose(1, 2, 0)[input_width:]) + data["labels"].append(np.asarray( + data_run).transpose(1, 2, 0)[input_width:]) - bar.next() + bar.next() bar.finish() if save_data: - num_groups = int(np.asarray(data['inputs']).shape[2] / 8) - transformer = FunctionTransformer(np.log1p, validate=True) - - # Scale inputs - inputs = np.asarray( - data['inputs']).transpose(2, 0, 1, 3).reshape(num_groups * 8, -1) - scaled_inputs = transformer.transform(inputs) - original_shape_input = np.asarray(data['inputs']).shape - - # Step 1: Reverse the reshape - reshaped_back = scaled_inputs.reshape(original_shape_input[2], - original_shape_input[0], - original_shape_input[1], - original_shape_input[3]) - - # Step 2: Reverse the transpose - original_inputs = reshaped_back.transpose(1, 2, 0, 3) - scaled_inputs = original_inputs.transpose(0, 3, 1, 2) - - - # Scale labels - labels = np.asarray( - data['labels']).transpose(2, 0, 1, 3).reshape(num_groups * 8, -1) - scaled_labels = transformer.transform(labels) - original_shape_labels = np.asarray(data['labels']).shape - - # Step 1: Reverse the reshape - reshaped_back = scaled_labels.reshape(original_shape_labels[2], - original_shape_labels[0], - original_shape_labels[1], - original_shape_labels[3]) - - # Step 2: Reverse the transpose - original_labels = reshaped_back.transpose(1, 2, 0, 3) - scaled_labels = original_labels.transpose(0, 3, 1, 2) - - all_data = {"inputs": scaled_inputs, - "labels": scaled_labels, - } - - # check if data directory exists. If necessary create it. 
- if not os.path.isdir(path): - os.mkdir(path) - - # save dict to json file - with open(os.path.join(path, 'data_secir_age_groups.pickle'), 'wb') as f: - pickle.dump(all_data, f) + + # we use a logistic transformer to reduce the + # influence of ouliers and to correct the skewness of out dataset + # as a result we obtaina dataset which is handled better by the NN + + num_groups = int(np.asarray(data['inputs']).shape[2] / 8) + transformer = FunctionTransformer(np.log1p, validate=True) + + # Scale inputs + inputs = np.asarray( + data['inputs']).transpose(2, 0, 1, 3).reshape(num_groups * 8, -1) + scaled_inputs = transformer.transform(inputs) + original_shape_input = np.asarray(data['inputs']).shape + + # Reverse the reshape + reshaped_back = scaled_inputs.reshape(original_shape_input[2], + original_shape_input[0], + original_shape_input[1], + original_shape_input[3]) + + # Reverse the transpose + original_inputs = reshaped_back.transpose(1, 2, 0, 3) + scaled_inputs = original_inputs.transpose(0, 3, 1, 2) + + # Scale labels + labels = np.asarray( + data['labels']).transpose(2, 0, 1, 3).reshape(num_groups * 8, -1) + scaled_labels = transformer.transform(labels) + original_shape_labels = np.asarray(data['labels']).shape + + # Reverse the reshape + reshaped_back = scaled_labels.reshape(original_shape_labels[2], + original_shape_labels[0], + original_shape_labels[1], + original_shape_labels[3]) + + # Reverse the transpose + original_labels = reshaped_back.transpose(1, 2, 0, 3) + scaled_labels = original_labels.transpose(0, 3, 1, 2) + + all_data = {"inputs": scaled_inputs, + "labels": scaled_labels, + } + + # check if data directory exists. If necessary create it. + if not os.path.isdir(path): + os.mkdir(path) + + # save dict to json file + with open(os.path.join(path, 'data_secir_age_groups.pickle'), 'wb') as f: + pickle.dump(all_data, f) return data + if __name__ == "__main__": path = os.path.dirname(os.path.realpath(__file__)) @@ -329,11 +244,10 @@ def generate_data( os.path.dirname( os.path.realpath(os.path.dirname(os.path.realpath(path)))), 'data_GNN_nodamp') - + input_width = 5 days = 30 num_runs = 1000 - + generate_data(num_runs, path_data, input_width, days) - From 58fc8e87f9a5c2e5fd8427bd68e1f735391b81f3 Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Thu, 29 Aug 2024 13:30:12 +0200 Subject: [PATCH 15/33] a python file with functions frequently used for our GNNs --- .../memilio/surrogatemodel/GNN/GNN_utils.py | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py new file mode 100644 index 0000000000..d2b333bb30 --- /dev/null +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py @@ -0,0 +1,85 @@ +import numpy as np +import os + +from memilio.epidata import transformMobilityData as tmd +from memilio.epidata import getDataIntoPandasDataFrame as gd +from memilio.simulation.osecir import (ModelGraph, set_edges) + + +def remove_confirmed_compartments(dataset_entries, num_groups): + """! The compartments which contain confirmed cases are not needed and are + therefore omitted by summarizing the confirmed compartment with the + original compartment. + @param dataset_entries Array that contains the compartmental data with + confirmed compartments. + @param num_groups Number of age groups. 
+    @return Array that contains the compartmental data without confirmed compartments.
+    """
+
+    new_dataset_entries = []
+    for i in dataset_entries :
+        dataset_entries_reshaped = i.reshape(
+            [num_groups, int(np.asarray(dataset_entries).shape[1]/num_groups)]
+            )
+        sum_inf_no_symp = np.sum(dataset_entries_reshaped [:, [2, 3]], axis=1)
+        sum_inf_symp = np.sum(dataset_entries_reshaped [:, [4, 5]], axis=1)
+        dataset_entries_reshaped[:, 2] = sum_inf_no_symp
+        dataset_entries_reshaped[:, 4] = sum_inf_symp
+        new_dataset_entries.append(
+            np.delete(dataset_entries_reshaped , [3, 5], axis=1).flatten()
+            )
+    return new_dataset_entries
+
+def getBaselineMatrix():
+    """! Loads the baseline contact matrix.
+    """
+
+    baseline_contact_matrix0 = os.path.join(
+        "./data/contacts/baseline_home.txt")
+    baseline_contact_matrix1 = os.path.join(
+        "./data/contacts/baseline_school_pf_eig.txt")
+    baseline_contact_matrix2 = os.path.join(
+        "./data/contacts/baseline_work.txt")
+    baseline_contact_matrix3 = os.path.join(
+        "./data/contacts/baseline_other.txt")
+
+    baseline = np.loadtxt(baseline_contact_matrix0) \
+        + np.loadtxt(baseline_contact_matrix1) + \
+        np.loadtxt(baseline_contact_matrix2) + \
+        np.loadtxt(baseline_contact_matrix3)
+
+    return baseline
+
+def make_graph(directory, num_regions, countykey_list, models):
+    """!
+    @param directory Directory with mobility data.
+    @param num_regions Number (int) of counties that should be added to the
+        graph-ODE model. Equals 400 for whole Germany.
+    @param countykey_list List of keys/IDs for each county.
+    @param models List of osecir Model objects, one model per population.
+    @return graph Graph-ODE model.
+    """
+    graph = ModelGraph()
+    for i in range(num_regions):
+        graph.add_node(int(countykey_list[i]), models[i])
+
+    num_locations = 4
+
+    set_edges(os.path.abspath(os.path.join(directory, os.pardir)),
+    graph, num_locations)
+    return graph
+
+
+def transform_mobility_directory():
+    """!
Transforms the mobility data by merging Eisenach and Wartburgkreis
+    """
+    # get mobility data directory
+    arg_dict = gd.cli("commuter_official")
+
+    directory = arg_dict['out_folder'].split('/pydata')[0]
+    directory = os.path.join(directory, 'mobility/')
+
+    # Merge Eisenach and Wartburgkreis in Input Data
+    tmd.updateMobility2022(directory, mobility_file='twitter_scaled_1252')
+    tmd.updateMobility2022(directory, mobility_file='commuter_migration_scaled')
+    return directory
\ No newline at end of file
From ffa076e37012e0d1d3accb1746a183794df4fb49 Mon Sep 17 00:00:00 2001
From: Agatha Schmidt
Date: Mon, 2 Sep 2024 09:48:30 +0200
Subject: [PATCH 16/33] add get population and get minimum matrix to utils

---
 .../memilio/surrogatemodel/GNN/GNN_utils.py   | 82 ++++++++++++++-----
 1 file changed, 61 insertions(+), 21 deletions(-)

diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py
index d2b333bb30..97e19d71ea 100644
--- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py
+++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py
@@ -1,9 +1,11 @@
-import numpy as np
-import os
+import numpy as np
+import pandas as pd
+import os
 
 from memilio.epidata import transformMobilityData as tmd
 from memilio.epidata import getDataIntoPandasDataFrame as gd
 from memilio.simulation.osecir import (ModelGraph, set_edges)
+from memilio.epidata import modifyDataframeSeries as mdfs
 
 
 def remove_confirmed_compartments(dataset_entries, num_groups):
@@ -15,21 +17,22 @@ def remove_confirmed_compartments(dataset_entries, num_groups):
     @param num_groups Number of age groups.
     @return Array that contains the compartmental data without confirmed compartments.
     """
-
+
     new_dataset_entries = []
-    for i in dataset_entries :
-        dataset_entries_reshaped = i.reshape(
-            [num_groups, int(np.asarray(dataset_entries).shape[1]/num_groups)]
-            )
-        sum_inf_no_symp = np.sum(dataset_entries_reshaped [:, [2, 3]], axis=1)
-        sum_inf_symp = np.sum(dataset_entries_reshaped [:, [4, 5]], axis=1)
-        dataset_entries_reshaped[:, 2] = sum_inf_no_symp
-        dataset_entries_reshaped[:, 4] = sum_inf_symp
-        new_dataset_entries.append(
-            np.delete(dataset_entries_reshaped , [3, 5], axis=1).flatten()
-            )
+    for i in dataset_entries:
+        dataset_entries_reshaped = i.reshape(
+            [num_groups, int(np.asarray(dataset_entries).shape[1]/num_groups)]
+        )
+        sum_inf_no_symp = np.sum(dataset_entries_reshaped[:, [2, 3]], axis=1)
+        sum_inf_symp = np.sum(dataset_entries_reshaped[:, [4, 5]], axis=1)
+        dataset_entries_reshaped[:, 2] = sum_inf_no_symp
+        dataset_entries_reshaped[:, 4] = sum_inf_symp
+        new_dataset_entries.append(
+            np.delete(dataset_entries_reshaped, [3, 5], axis=1).flatten()
+        )
     return new_dataset_entries
 
+
 def getBaselineMatrix():
     """! Loads the baseline contact matrix.
     """
@@ -50,6 +53,28 @@ def getBaselineMatrix():
 
     return baseline
 
+
+def getMinimumMatrix():
+    """!
Loads the minimum contact matrix.
+    """
+
+    minimum_contact_matrix0 = os.path.join(
+        "./data/contacts/minimum_home.txt")
+    minimum_contact_matrix1 = os.path.join(
+        "./data/contacts/minimum_school_pf_eig.txt")
+    minimum_contact_matrix2 = os.path.join(
+        "./data/contacts/minimum_work.txt")
+    minimum_contact_matrix3 = os.path.join(
+        "./data/contacts/minimum_other.txt")
+
+    minimum = np.loadtxt(minimum_contact_matrix0) \
+        + np.loadtxt(minimum_contact_matrix1) + \
+        np.loadtxt(minimum_contact_matrix2) + \
+        np.loadtxt(minimum_contact_matrix3)
+
+    return minimum
+
+
 def make_graph(directory, num_regions, countykey_list, models):
     """!
     @param directory Directory with mobility data.
@@ -65,10 +90,10 @@ def make_graph(directory, num_regions, countykey_list, models):
 
     num_locations = 4
 
-    set_edges(os.path.abspath(os.path.join(directory, os.pardir)),
-    graph, num_locations)
+    set_edges(os.path.abspath(os.path.join(directory, os.pardir)),
+              graph, num_locations)
     return graph
-
+
 
 def transform_mobility_directory():
     """!
     Transforms the mobility data by merging Eisenach and Wartburgkreis
@@ -77,9 +102,24 @@ def transform_mobility_directory():
     arg_dict = gd.cli("commuter_official")
 
     directory = arg_dict['out_folder'].split('/pydata')[0]
-    directory = os.path.join(directory, 'mobility/')
+    directory = os.path.join(directory, 'mobility/')
 
-    # Merge Eisenach and Wartburgkreis in Input Data
+    # Merge Eisenach and Wartburgkreis in Input Data
     tmd.updateMobility2022(directory, mobility_file='twitter_scaled_1252')
-    tmd.updateMobility2022(directory, mobility_file='commuter_migration_scaled')
-    return directory
\ No newline at end of file
+    tmd.updateMobility2022(
+        directory, mobility_file='commuter_migration_scaled')
+    return directory
+
+
+def get_population():
+    """! Loads the county population data and aggregates it into the six
+    age groups used by the model.
+    @return List with the population of every county, resolved by age group.
+    """
+    df_population = pd.read_json(
+        'data/pydata/Germany/county_population.json')
+    age_groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80-130']
+
+    df_population_agegroups = pd.DataFrame(
+        columns=[df_population.columns[0]] + age_groups)
+    for region_id in df_population.iloc[:, 0]:
+        df_population_agegroups.loc[len(df_population_agegroups.index), :] = [int(region_id)] + list(
+            mdfs.fit_age_group_intervals(df_population[df_population.iloc[:, 0] == int(region_id)].iloc[:, 2:], age_groups))
+
+    population = df_population_agegroups.values.tolist()
+    return population
From 0cbda2b0d2f7b492205a497bcd72db520e480e75 Mon Sep 17 00:00:00 2001
From: Agatha Schmidt
Date: Mon, 2 Sep 2024 09:48:59 +0200
Subject: [PATCH 17/33] remove get population function from this file and
 import it from utils

---
 .../surrogatemodel/GNN/data_generation_nodamp.py | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py
index 0bbb2a6bc3..dc78c1ccc6 100644
--- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py
+++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py
@@ -14,8 +14,8 @@
 InfectionState, Model, interpolate_simulation_result)
 from memilio.epidata import geoModificationGermany as geoger
 import memilio.epidata.getPopulationData as gpd
-from memilio.epidata import modifyDataframeSeries as mdfs
-from .GNN_utils import getBaselineMatrix, transform_mobility_directory, make_graph, remove_confirmed_compartments
+from .GNN_utils import (getBaselineMatrix, transform_mobility_directory,
+                        make_graph, remove_confirmed_compartments, get_population)
 
 
 def
run_secir_groups_simulation(days, populations): @@ -145,17 +145,7 @@ def generate_data( @param save_data Option to deactivate the save of the dataset. Per default true. """ - df_population = pd.read_json( - 'data/pydata/Germany/county_population.json') - age_groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80-130'] - - df_population_agegroups = pd.DataFrame( - columns=[df_population.columns[0]] + age_groups) - for region_id in df_population.iloc[:, 0]: - df_population_agegroups.loc[len(df_population_agegroups.index), :] = [int(region_id)] + list( - mdfs.fit_age_group_intervals(df_population[df_population.iloc[:, 0] == int(region_id)].iloc[:, 2:], age_groups)) - - population = df_population_agegroups.values.tolist + population = get_population() days_sum = days + input_width - 1 data = {"inputs": [], From e8fb58036b1f133fb95a45bb94e7b15920c51ddc Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Mon, 2 Sep 2024 09:49:27 +0200 Subject: [PATCH 18/33] remove functons and import them from utlis --- .../GNN/data_generation_withdamp.py | 335 ++++++------------ 1 file changed, 109 insertions(+), 226 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py index ac7c7d4599..d882194713 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py @@ -5,17 +5,19 @@ import json import numpy as np from datetime import date - + from progress.bar import Bar from sklearn.preprocessing import FunctionTransformer from memilio.simulation import (AgeGroup, LogLevel, set_log_level, Damping) from memilio.simulation.osecir import (Index_InfectionState, interpolate_simulation_result, ParameterStudy, - InfectionState, Model, ModelGraph, - interpolate_simulation_result, set_edges) + InfectionState, Model, + interpolate_simulation_result) from memilio.epidata import geoModificationGermany as geoger -from memilio.epidata import transformMobilityData as tmd -from memilio.epidata import getDataIntoPandasDataFrame as gd + +from .GNN_utils import (getBaselineMatrix, make_graph, + remove_confirmed_compartments, getMinimumMatrix, get_population) + def run_secir_groups_simulation(days, populations, dampings): """! 
Uses an ODE SECIR model allowing for asymptomatic infection @@ -81,7 +83,7 @@ def run_secir_groups_simulation(days, populations, dampings): InfectionState.Recovered)] = random.uniform( 0.002, 0.08) * populations[region][i] model.populations[AgeGroup(i), - Index_InfectionState(InfectionState.Dead)] = random.uniform( + Index_InfectionState(InfectionState.Dead)] = random.uniform( 0, 0.0003) * populations[region][i] model.populations.set_difference_from_group_total_AgeGroup( (AgeGroup(i), Index_InfectionState(InfectionState.Susceptible)), populations[region][i]) @@ -117,11 +119,11 @@ def run_secir_groups_simulation(days, populations, dampings): # generat a random damping factor damping = np.ones((num_groups, num_groups) - ) * np.float16(random.uniform(0, 0.5)) - + ) * np.float16(random.uniform(0, 0.5)) + # add damping to model model.parameters.ContactPatterns.cont_freq_mat.add_damping(Damping( - coeffs = (damping), t=day, level=0, type=0)) + coeffs=(damping), t=day, level=0, type=0)) damped_matrices.append(model.parameters.ContactPatterns.cont_freq_mat.get_matrix_at( day+1)) @@ -148,130 +150,6 @@ def run_secir_groups_simulation(days, populations, dampings): return dataset_entry, damped_matrices, dampings, damping_coeff -def remove_confirmed_compartments(dataset_entries, num_groups): - """! The compartments which contain confirmed cases are not - needed and are therefore omitted by summarizing the confirmed - compartment with the original compartment. - @param dataset_entries Array that contains the compartmental data with - confirmed compartments. - @param num_groups Number of age groups. - @return Array that contains the compartmental data without - confirmed compartments. - """ - - new_dataset_entries = [] - for i in dataset_entries : - dataset_entries_reshaped = i.reshape( - [num_groups, int(np.asarray(dataset_entries).shape[1]/num_groups)] - ) - sum_inf_no_symp = np.sum(dataset_entries_reshaped [:, [2, 3]], axis=1) - sum_inf_symp = np.sum(dataset_entries_reshaped [:, [4, 5]], axis=1) - dataset_entries_reshaped[:, 2] = sum_inf_no_symp - dataset_entries_reshaped[:, 4] = sum_inf_symp - new_dataset_entries.append(np.delete( - dataset_entries_reshaped , [3, 5], axis=1).flatten() - ) - return new_dataset_entries - - -def get_population(path="data/pydata/Germany/county_population.json"): - """! loads population data - @param path Path to population file. - @return List with all 400 populations and 6 age groups. - """ - - with open(path) as f: - data = json.load(f) - population = [] - for data_entry in data: - population_county = [] - population_county.append( - data_entry['<3 years'] + data_entry['3-5 years'] / 2) - population_county.append(data_entry['6-14 years']) - population_county.append( - data_entry['15-17 years'] + data_entry['18-24 years'] + - data_entry['25-29 years'] + data_entry['30-39 years'] / 2) - population_county.append( - data_entry['30-39 years'] / 2 + data_entry['40-49 years'] + - data_entry['50-64 years'] * 2 / 3) - population_county.append( - data_entry['65-74 years'] + data_entry['>74 years'] * 0.2 + - data_entry['50-64 years'] * 1 / 3) - population_county.append( - data_entry['>74 years'] * 0.8) - - population.append(population_county) - return population - -def getBaselineMatrix(): - """! 
loads the baselinematrix - """ - - baseline_contact_matrix0 = os.path.join( - "./data/contacts/baseline_home.txt") - baseline_contact_matrix1 = os.path.join( - "./data/contacts/baseline_school_pf_eig.txt") - baseline_contact_matrix2 = os.path.join( - "./data/contacts/baseline_work.txt") - baseline_contact_matrix3 = os.path.join( - "./data/contacts/baseline_other.txt") - - baseline = np.loadtxt(baseline_contact_matrix0) \ - + np.loadtxt(baseline_contact_matrix1) + \ - np.loadtxt(baseline_contact_matrix2) + \ - np.loadtxt(baseline_contact_matrix3) - - return baseline - -def getMinimumMatrix(): - """! loads the minimum matrix - """ - - minimum_contact_matrix0 = os.path.join( - "./data/contacts/minimum_home.txt") - minimum_contact_matrix1 = os.path.join( - "./data/contacts/minimum_school_pf_eig.txt") - minimum_contact_matrix2 = os.path.join( - "./data/contacts/minimum_work.txt") - minimum_contact_matrix3 = os.path.join( - "./data/contacts/minimum_other.txt") - - minimum = np.loadtxt(minimum_contact_matrix0) \ - + np.loadtxt(minimum_contact_matrix1) + \ - np.loadtxt(minimum_contact_matrix2) + \ - np.loadtxt(minimum_contact_matrix3) - - return minimum - - -def make_graph(num_regions, countykey_list, models): - """! - @param num_regions Number (int) of counties that should be added to the - grap-ODE model. Equals 400 for whole Germany. - @param countykey_list List of keys/IDs for each county. - @models models List of osecir Model with one model per population. - @return graph Graph-ODE model. - """ - graph = ModelGraph() - for i in range(num_regions): - graph.add_node(int(countykey_list[i]), models[i]) - - # get mobility data directory - arg_dict = gd.cli("commuter_official") - - directory = arg_dict['out_folder'].split('/pydata')[0] - directory = os.path.join(directory, 'mobility/') - - # Merge Eisenach and Wartbugkreis in Input Data - tmd.updateMobility2022(directory, mobility_file='twitter_scaled_1252') - tmd.updateMobility2022(directory, mobility_file='commuter_migration_scaled') - - num_locations = 4 - - set_edges(os.path.abspath(os.path.join(directory, os.pardir)), - graph, num_locations) - return graph - def generate_dampings_withshadowdamp(number_of_dampings, days, min_distance, min_damping_day, n_runs): """! Draw damping days while keeping a minimum distance between the @@ -286,9 +164,9 @@ def generate_dampings_withshadowdamp(number_of_dampings, days, min_distance, min """ all_dampings = [] - count_runs = 0 + count_runs = 0 count_shadow = 0 - while len(all_dampings)= number_of_dampings: - #dampings = random.sample(dampings, 5) + # dampings = random.sample(dampings, 5) all_dampings.append(sorted(dampings)) # if count_runs % 2 == 0: - + return np.asarray(all_dampings) + def generate_data( num_runs, path, input_width, label_width, number_of_dampings, save_data=True): """! 
Generate dataset by calling run_secir_simulation (num_runs)-often @@ -342,96 +226,96 @@ def generate_data( population = get_population() days_sum = label_width+input_width-1 - #generate dampings + # generate dampings damping_days = generate_dampings_withshadowdamp( - number_of_dampings = number_of_dampings, days = label_width, - min_distance = 7, min_damping_day = input_width, n_runs = num_runs - ) - + number_of_dampings=number_of_dampings, days=label_width, + min_distance=7, min_damping_day=input_width, n_runs=num_runs + ) + # all data including damping information all_data = {"inputs": [], - "labels": [], - "damping_coeff": [], - "damping_day": [], - "damped_matrix": []} + "labels": [], + "damping_coeff": [], + "damping_day": [], + "damped_matrix": []} - # data that needs to be scaled + # data that needs to be scaled data = {"inputs": [], "labels": [], "damping_coeff": [], - "damping_day": [], + "damping_day": [], "damped_matrix": []} - + # show progess in terminal for longer runs # Due to the random structure, theres currently no need to shuffle the data - bar = Bar('Number of Runs done', max = num_runs) + bar = Bar('Number of Runs done', max=num_runs) for i in range(num_runs): - - data_run, damped_contact_matrix, damping_days_s, damping_factor = run_secir_groups_simulation( - days_sum, population, damping_days[i]) - inputs = np.asarray(data_run).transpose(1, 2, 0)[:input_width] - data["inputs"].append(inputs) - data["labels"].append(np.asarray(data_run).transpose(1, 2, 0)[input_width:]) - data["damping_coeff"].append(damping_factor) - data["damping_day"].append(damping_days_s) - data["damped_matrix"].append(damped_contact_matrix) + data_run, damped_contact_matrix, damping_days_s, damping_factor = run_secir_groups_simulation( + days_sum, population, damping_days[i]) - bar.next() + inputs = np.asarray(data_run).transpose(1, 2, 0)[:input_width] + data["inputs"].append(inputs) + data["labels"].append(np.asarray( + data_run).transpose(1, 2, 0)[input_width:]) + data["damping_coeff"].append(damping_factor) + data["damping_day"].append(damping_days_s) + data["damped_matrix"].append(damped_contact_matrix) + + bar.next() bar.finish() if save_data: - num_groups = int(np.asarray(data['inputs']).shape[2]/8) - transformer = FunctionTransformer(np.log1p, validate=True) - - # Scale inputs - inputs = np.asarray( - data['inputs']).transpose(2, 0, 1, 3).reshape(num_groups*8, -1) - scaled_inputs = transformer.transform(inputs) - original_shape_input = np.asarray(data['inputs']).shape - - # Step 1: Reverse the reshape - reshaped_back = scaled_inputs.reshape( - original_shape_input[2], original_shape_input[0], - original_shape_input[1], original_shape_input[3] - ) - - # Step 2: Reverse the transpose - original_inputs = reshaped_back.transpose(1, 2, 0, 3) - scaled_inputs = original_inputs.transpose(0, 3, 1, 2) - - - # Scale labels - labels = np.asarray( - data['labels']).transpose(2, 0, 1, 3).reshape(num_groups*8, -1) - scaled_labels = transformer.transform(labels) - original_shape_labels = np.asarray(data['labels']).shape - - # Step 1: Reverse the reshape - reshaped_back = scaled_labels.reshape( - original_shape_labels[2], original_shape_labels[0], - original_shape_labels[1], original_shape_labels[3] - ) - - # Step 2: Reverse the transpose - original_labels = reshaped_back.transpose(1, 2, 0, 3) - scaled_labels = original_labels.transpose(0, 3, 1, 2) - - all_data = {"inputs": scaled_inputs, - "labels": scaled_labels, - "damping_coeff": data['damping_coeff'], - "damping_day": data['damping_day'], - 
"damped_matrix": data['damped_matrix']} - - # check if data directory exists. If necessary create it. - if not os.path.isdir(path): - os.mkdir(path) - - # save dict to json file - with open(os.path.join(path, 'data_secir_age_groups.pickle'), 'wb') as f: - pickle.dump(all_data, f) + num_groups = int(np.asarray(data['inputs']).shape[2]/8) + transformer = FunctionTransformer(np.log1p, validate=True) + + # Scale inputs + inputs = np.asarray( + data['inputs']).transpose(2, 0, 1, 3).reshape(num_groups*8, -1) + scaled_inputs = transformer.transform(inputs) + original_shape_input = np.asarray(data['inputs']).shape + + # Reverse the reshape of inputs + reshaped_back = scaled_inputs.reshape( + original_shape_input[2], original_shape_input[0], + original_shape_input[1], original_shape_input[3] + ) + + # Reverse the transpose of inputs + original_inputs = reshaped_back.transpose(1, 2, 0, 3) + scaled_inputs = original_inputs.transpose(0, 3, 1, 2) + + # Scale labels + labels = np.asarray( + data['labels']).transpose(2, 0, 1, 3).reshape(num_groups*8, -1) + scaled_labels = transformer.transform(labels) + original_shape_labels = np.asarray(data['labels']).shape + + # Reverse the reshape of labels + reshaped_back = scaled_labels.reshape( + original_shape_labels[2], original_shape_labels[0], + original_shape_labels[1], original_shape_labels[3] + ) + + # Reverse the transpose of labels + original_labels = reshaped_back.transpose(1, 2, 0, 3) + scaled_labels = original_labels.transpose(0, 3, 1, 2) + + all_data = {"inputs": scaled_inputs, + "labels": scaled_labels, + "damping_coeff": data['damping_coeff'], + "damping_day": data['damping_day'], + "damped_matrix": data['damped_matrix']} + + # check if data directory exists. If necessary create it. + if not os.path.isdir(path): + os.mkdir(path) + + # save dict to json file + with open(os.path.join(path, 'data_secir_age_groups.pickle'), 'wb') as f: + pickle.dump(all_data, f) return data @@ -439,7 +323,7 @@ def generate_data( input_width = 5 label_width = 100 - number_of_dampings = 2 + number_of_dampings = 3 num_runs = 1000 path = os.path.dirname(os.path.realpath(__file__)) path_data = os.path.join( @@ -449,4 +333,3 @@ def generate_data( generate_data(num_runs, path_data, input_width, label_width, number_of_dampings) - From de35ca64efa4ab73b520ddf3d45c7f6995e0c9ab Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Mon, 2 Sep 2024 10:06:16 +0200 Subject: [PATCH 19/33] add comments as proposed by review --- .../GNN/data_generation_withdamp.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py index d882194713..9d36331859 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py @@ -2,7 +2,6 @@ import os import pickle import random -import json import numpy as np from datetime import date @@ -24,8 +23,8 @@ def run_secir_groups_simulation(days, populations, dampings): with 6 different age groups. The model is not stratified by region. Virus-specific parameters are fixed and initial number of persons in the particular infection states are chosen randomly from defined ranges. - @param Days Describes how many days we simulate within a single run. - @param damping_day The day when damping is applied. 
+ @param days (int) Describes how many days we simulate within a single run.
+ @param dampings (list) The days on which the contact dampings take effect.
 @param populations List containing the population in each age group.
 @return List containing the populations in each compartment used to
 initialize the run.
@@ -60,6 +59,7 @@ def run_secir_groups_simulation(days, populations, dampings):
 model.parameters.TimeInfectedCritical[AgeGroup(i)] = 8.

 # Initial number of people in each compartment with random numbers
+ # Numbers are chosen heuristically based on experience
 model.populations[AgeGroup(i), Index_InfectionState(
 InfectionState.Exposed)] = random.uniform(
 0.00025, 0.005) * populations[region][i]
@@ -155,12 +155,12 @@ def generate_dampings_withshadowdamp(number_of_dampings, days, min_distance, min
 """! Draw damping days while keeping a minimum distance between the
 damping days. This method aims to create a uniform distribution of drawn
 damping days.
- @param num_of_dampings Number of dampings that have to be drawn.
- @param days Number of days which are simulated (label_width).
- @param min_distance The minimum number of days between two dampings.
- @param min_damping_day The earliest day of the simualtion where a damping
+ @param number_of_dampings (int) Number of dampings that have to be drawn.
+ @param days (int) Number of days which are simulated (label_width).
+ @param min_distance (int) The minimum number of days between two dampings.
+ @param min_damping_day (int) The earliest day of the simulation where a damping
 can take place.
- @param n_runs Number of simulation runs.
+ @param n_runs (int) Number of simulation runs.
 """

 all_dampings = []
@@ -207,9 +207,7 @@ def generate_dampings_withshadowdamp(number_of_dampings, days, min_distance, min
 count_shadow += 1
 # select first or last five dampings
 if len(dampings) >= number_of_dampings:
- # dampings = random.sample(dampings, 5)
 all_dampings.append(sorted(dampings))
- # if count_runs % 2 == 0:

 return np.asarray(all_dampings)

@@ -221,6 +219,7 @@ def generate_data(
 @param path Path, where the datasets are stored.
 @param input_width Number of time steps used for model input.
 @param label_width Number of time steps (days) used as model output/label.
+ @param number_of_dampings (int) The number of contact change points applied to the simulation.
 @param save_data Option to deactivate the save of the dataset. Per default true.
""" population = get_population() @@ -268,6 +267,11 @@ def generate_data( bar.finish() if save_data: + + # we use a logistic transformer to reduce the + # influence of ouliers and to correct the skewness of out dataset + # as a result we obtaina dataset which is handled better by the NN + num_groups = int(np.asarray(data['inputs']).shape[2]/8) transformer = FunctionTransformer(np.log1p, validate=True) From 3d5a644587f27a8b10184e49367abda1848aeef6 Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Mon, 16 Sep 2024 14:29:24 +0200 Subject: [PATCH 20/33] pre commit --- .../test_surrogatemodel_GNN.py | 140 ++++++++---------- 1 file changed, 59 insertions(+), 81 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py index 7f86a6d469..0f67fdb022 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py @@ -19,10 +19,11 @@ ############################################################################# from pyfakefs import fake_filesystem_unittest -from memilio.surrogatemodel.GNN import (data_generation_nodamp, data_generation_withdamp) +from memilio.surrogatemodel.GNN import ( + data_generation_nodamp, data_generation_withdamp) import memilio.simulation as mio import memilio.simulation.osecir as osecir - + from unittest.mock import patch import os import unittest @@ -32,14 +33,15 @@ import json # suppress all autograph warnings from tensorflow -#logging.getLogger("tensorflow").setLevel(logging.ERROR) +# logging.getLogger("tensorflow").setLevel(logging.ERROR) import tensorflow as tf tf.get_logger().setLevel('ERROR') + class TestSurrogatemodelGNN(fake_filesystem_unittest.TestCase): path = '/home/' - + num_groups = 6 model = osecir.Model(num_groups) graph = osecir.ModelGraph() @@ -55,16 +57,14 @@ class TestSurrogatemodelGNN(fake_filesystem_unittest.TestCase): def setUp(self): self.setUpPyfakefs() - + #### test simulation no damp #### @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.getBaselineMatrix', return_value=0.6 * np.ones((6, 6))) - - # create mock graph - @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.make_graph', - return_value= graph) - + # create mock graph + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.make_graph', + return_value=graph) def test_simulation_run_nodamp(self, mock_baseline, mockgraph): days_1 = 10 days_2 = 30 @@ -91,14 +91,11 @@ def test_simulation_run_nodamp(self, mock_baseline, mockgraph): @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.getBaselineMatrix', return_value=0.6 * np.ones((6, 6))) - - # create mock graph - @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.make_graph', - return_value= graph) - + # create mock graph + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.make_graph', + return_value=graph) @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.get_population', - return_value= np.random.randint(0, 700001, size=(400, 6))) - + return_value=np.random.randint(0, 700001, size=(400, 6))) def test_data_generation_runs_nodamp( self, mock_population, mock_baseline, mock_graph): @@ -132,17 +129,15 @@ def test_data_generation_runs_nodamp( self.assertEqual(len(data_2['labels'][0]), label_width_2) self.assertEqual(len(data_2['labels'][0][0]), 48) - - # @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.getBaselineMatrix', # 
return_value=0.6 * np.ones((6, 6))) - + # @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.get_population', # return_value= np.random.randint(0, 700001, size=(400, 6))) - # # create mock graph - # @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.make_graph', + # # create mock graph + # @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.make_graph', # return_value= graph) - + # def test_data_generation_save_nodamp( # self, mock_population, mock_baseline, mock_graph): @@ -155,29 +150,25 @@ def test_data_generation_runs_nodamp( # self.assertEqual(len(os.listdir(self.path)), 1) # self.assertEqual(os.listdir(self.path), - # ['data_secir_groups.pickle']) - + # ['data_secir_groups.pickle']) - @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.getMinimumMatrix', return_value=0 * np.ones((6, 6))) @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.getBaselineMatrix', return_value=0.6 * np.ones((6, 6))) - # create mock graph - @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.make_graph', - return_value= graph) - + # create mock graph + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.make_graph', + return_value=graph) def test_simulation_run_withdamp(self, mock_baseline, mock_minimum, mock_graph): - + days_1 = 10 days_2 = 30 days_3 = 50 dampings1 = [5] - dampings2 = [6, 15] + dampings2 = [6, 15] dampings3 = [8, 18, 35] - population = [[5256.0, 10551, 32368.5, 43637.833333333336, 22874.066666666666, 8473.6], [4000, 8000, 40000, @@ -195,75 +186,66 @@ def test_simulation_run_withdamp(self, mock_baseline, mock_minimum, mock_graph): self.assertEqual(len(dataset_entry2[0]), days_2+1) self.assertEqual(len(dataset_entry3[0]), days_3+1) - baseline = data_generation_withdamp.getBaselineMatrix() - #damping factor + # damping factor self.assertEqual(damped_matrices1[0].all(), - (baseline * damping_coeff1[0]).all()) + (baseline * damping_coeff1[0]).all()) self.assertEqual( damped_matrices2[1].all(), (baseline * damping_coeff2[1]).all()) self.assertEqual( damped_matrices3[2].all(), (baseline * damping_coeff3[2]).all()) - + # number of dampings length self.assertEqual(len(damping_coeff1), len(dampings1)) self.assertEqual(len(damping_coeff2), len(dampings2)) self.assertEqual(len(damping_coeff3), len(dampings3)) - - @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.getMinimumMatrix', return_value=0 * np.ones((6, 6))) @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.getBaselineMatrix', return_value=0.6 * np.ones((6, 6))) - # create mock graph - @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.make_graph', - return_value= graph) + # create mock graph + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.make_graph', + return_value=graph) @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.get_population', - return_value= np.random.randint(0, 700001, size=(400, 6))) - + return_value=np.random.randint(0, 700001, size=(400, 6))) def test_data_generation_runs_withdamp( - self, mock_population, mock_baseline, mock_minimum, mock_graph): - - input_width_1 = 1 - input_width_2 = 5 + self, mock_population, mock_baseline, mock_minimum, mock_graph): - label_width_1 = 10 - label_width_2 = 30 - - num_runs_1 = 1 - num_runs_2 = 2 - - damping1 = 1 - damping2 = 2 - - data_1 = data_generation_withdamp.generate_data( - num_runs_1, self.path, input_width_1, label_width_1, - damping1, save_data=False) - self.assertEqual(len(data_1['inputs']), num_runs_1) - self.assertEqual(len(data_1['inputs'][0]), input_width_1) - 
self.assertEqual(len(data_1['inputs'][0][0]), 48) - self.assertEqual(len(data_1['labels']), num_runs_1) - self.assertEqual(len(data_1['labels'][0]), label_width_1) - self.assertEqual(len(data_1['labels'][0][0]), 48) + input_width_1 = 1 + input_width_2 = 5 - data_2 = data_generation_withdamp.generate_data( - num_runs_2, self.path, input_width_2, label_width_2, - damping2, save_data=False) - self.assertEqual(len(data_2['inputs']), num_runs_2) - self.assertEqual(len(data_2['inputs'][0]), input_width_2) - self.assertEqual(len(data_2['inputs'][0][0]), 48) - self.assertEqual(len(data_2['labels']), num_runs_2) - self.assertEqual(len(data_2['labels'][0]), label_width_2) - self.assertEqual(len(data_2['labels'][0][0]), 48) + label_width_1 = 10 + label_width_2 = 30 + num_runs_1 = 1 + num_runs_2 = 2 + damping1 = 1 + damping2 = 2 + data_1 = data_generation_withdamp.generate_data( + num_runs_1, self.path, input_width_1, label_width_1, + damping1, save_data=False) + self.assertEqual(len(data_1['inputs']), num_runs_1) + self.assertEqual(len(data_1['inputs'][0]), input_width_1) + self.assertEqual(len(data_1['inputs'][0][0]), 48) + self.assertEqual(len(data_1['labels']), num_runs_1) + self.assertEqual(len(data_1['labels'][0]), label_width_1) + self.assertEqual(len(data_1['labels'][0][0]), 48) + data_2 = data_generation_withdamp.generate_data( + num_runs_2, self.path, input_width_2, label_width_2, + damping2, save_data=False) + self.assertEqual(len(data_2['inputs']), num_runs_2) + self.assertEqual(len(data_2['inputs'][0]), input_width_2) + self.assertEqual(len(data_2['inputs'][0][0]), 48) + self.assertEqual(len(data_2['labels']), num_runs_2) + self.assertEqual(len(data_2['labels'][0]), label_width_2) + self.assertEqual(len(data_2['labels'][0][0]), 48) - # def test_data_generation_save_withdamp( # self, mock_population, mock_baseline, mock_minimum): @@ -271,14 +253,10 @@ def test_data_generation_runs_withdamp( # label_width = 3 # num_runs = 1 # dampings = 3 - # data_generation_withdamp.generate_data(num_runs, self.path, "", input_width, # label_width) # self.assertEqual(len(os.listdir(self.path)), 1) - # self.assertEqual(os.listdir(self.path), # ['data_secir_groups.pickle']) - - if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() From 2dfbe4f54a35a5142f39315c1b202a3e767d4ef3 Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Wed, 18 Sep 2024 09:42:12 +0200 Subject: [PATCH 21/33] mock transform mobility function --- .../test_surrogatemodel_GNN.py | 65 ++++++++++++++----- 1 file changed, 49 insertions(+), 16 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py index 0f67fdb022..f3fc0794a0 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py @@ -62,19 +62,27 @@ def setUp(self): @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.getBaselineMatrix', return_value=0.6 * np.ones((6, 6))) - # create mock graph @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.make_graph', return_value=graph) - def test_simulation_run_nodamp(self, mock_baseline, mockgraph): + @patch('memilio.epidata.transformMobilityData.updateMobility2022') + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.transform_mobility_directory', + autospec=True) + def test_simulation_run_nodamp(self, 
mock_transform_mobility, + mock_update_mobility, mock_baseline, mock_graph): + # Mock the behavior of the mobility transformation function, + # but it doesn't return anything. + # So, we just need it to be called without side effects. + mock_transform_mobility.side_effect = lambda: None + mock_update_mobility.side_effect = lambda directory, mobility_file: None + days_1 = 10 days_2 = 30 days_3 = 50 - population = [[5256.0, 10551, 32368.5, - 43637.833333333336, 22874.066666666666, 8473.6], - [4000, 8000, 40000, - 28000, 15000, 6000]] + population = [[5256.0, 10551, 32368.5, 43637.833333333336, 22874.066666666666, 8473.6], + [4000, 8000, 40000, 28000, 15000, 6000]] + # Call the actual function being tested simulation_1 = data_generation_nodamp.run_secir_groups_simulation( days_1, population) simulation_2 = data_generation_nodamp.run_secir_groups_simulation( @@ -82,22 +90,32 @@ def test_simulation_run_nodamp(self, mock_baseline, mockgraph): simulation_3 = data_generation_nodamp.run_secir_groups_simulation( days_3, population) - # result length - self.assertEqual(len(simulation_1[0]), days_1+1) - self.assertEqual(len(simulation_2[0]), days_2+1) - self.assertEqual(len(simulation_3[0]), days_3+1) + # Ensure that the results are of the correct length. + self.assertEqual(len(simulation_1[0]), days_1 + 1) + self.assertEqual(len(simulation_2[0]), days_2 + 1) + self.assertEqual(len(simulation_3[0]), days_3 + 1) + #### test data genertion no damp #### @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.getBaselineMatrix', return_value=0.6 * np.ones((6, 6))) - # create mock graph @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.make_graph', return_value=graph) @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.get_population', return_value=np.random.randint(0, 700001, size=(400, 6))) + @patch('memilio.epidata.transformMobilityData.updateMobility2022') + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.transform_mobility_directory', + autospec=True) def test_data_generation_runs_nodamp( - self, mock_population, mock_baseline, mock_graph): + self, mock_transform_mobility, mock_update_mobility, + mock_baseline, mock_graph, mock_population): + + # Mock the behavior of the mobility transformation function, + # but it doesn't return anything. + # So, we just need it to be called without side effects. + mock_transform_mobility.side_effect = lambda: None + mock_update_mobility.side_effect = lambda directory, mobility_file: None input_width_1 = 1 input_width_2 = 5 @@ -108,7 +126,7 @@ def test_data_generation_runs_nodamp( num_runs_1 = 1 num_runs_2 = 2 - # test data generation without damping + # test data generation without dampings data_1 = data_generation_nodamp.generate_data( num_runs_1, self.path, input_width_1, label_width_1, save_data=False) @@ -159,7 +177,15 @@ def test_data_generation_runs_nodamp( # create mock graph @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.make_graph', return_value=graph) - def test_simulation_run_withdamp(self, mock_baseline, mock_minimum, mock_graph): + @patch('memilio.epidata.transformMobilityData.updateMobility2022') + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.transform_mobility_directory', + autospec=True) + def test_simulation_run_withdamp(self, mock_transform_mobility, mock_update_mobility, + mock_minimum, mock_baseline, mock_graph): + # Mock the behavior of the mobility transformation function, but it doesn't return anything. + # So, we just need it to be called without side effects. 
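A short aside on the mocking idiom these hunks introduce: stacked @patch decorators hand their mocks to the test function bottom-up (the decorator nearest the def supplies the first argument), and assigning a side_effect replaces the patched callable's behavior entirely. A minimal self-contained sketch, patching two standard-library functions purely for illustration; none of these names appear in the patch itself:

    import os
    from unittest import mock

    @mock.patch('os.listdir')   # outermost decorator -> last mock argument
    @mock.patch('os.getcwd')    # innermost decorator -> first mock argument
    def demo(mock_getcwd, mock_listdir):
        # a side_effect of lambda: None mimics a function that is only
        # called for its effect and returns nothing
        mock_getcwd.side_effect = lambda: None
        mock_listdir.side_effect = lambda path: ['fake_file.txt']
        assert os.getcwd() is None
        assert os.listdir('/anywhere') == ['fake_file.txt']

    demo()

Returning to the patch, the same side_effect = lambda: None assignments below let the data-generation tests run without any mobility files on disk.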
+ mock_transform_mobility.side_effect = lambda: None
+ mock_update_mobility.side_effect = lambda directory, mobility_file: None

 days_1 = 10
 days_2 = 30
@@ -211,8 +237,15 @@ def test_simulation_run_withdamp(self, mock_baseline, mock_minimum, mock_graph):
 return_value=graph)
 @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.get_population',
 return_value=np.random.randint(0, 700001, size=(400, 6)))
+ # mock transform directory function
+ @patch('memilio.epidata.transformMobilityData.updateMobility2022')
+ @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.transform_mobility_directory', autospec=True)
 def test_data_generation_runs_withdamp(
- self, mock_population, mock_baseline, mock_minimum, mock_graph):
+ self, mock_transform_mobility, mock_update_mobility,
+ mock_minimum, mock_baseline, mock_graph, mock_population):
+
+ mock_transform_mobility.side_effect = lambda: None
+ mock_update_mobility.side_effect = lambda directory, mobility_file: None

 input_width_1 = 1
 input_width_2 = 5
@@ -279,9 +298,9 @@ def test_data_generation_runs_withdamp(
 self.assertEqual(len(data_2['labels'][0]), label_width_2)
 self.assertEqual(len(data_2['labels'][0][0]), 48)

+
 # def test_data_generation_save_withdamp(
 # self, mock_population, mock_baseline, mock_minimum):
-
 # input_width = 2
 # label_width = 3
 # num_runs = 1

From 86a2e8923f50b9e7d36cf87c3b7b0ee42a52c65 Mon Sep 17 00:00:00 2001
From: Agatha Schmidt
Date: Thu, 19 Sep 2024 11:20:48 +0200
Subject: [PATCH 22/33] adjust utils: add surrogate utils and add scaling to
 GNN utils

---
 .../memilio/surrogatemodel/GNN/GNN_utils.py | 116 ++++++------------
 .../GNN/data_generation_nodamp.py | 47 +------
 .../GNN/data_generation_withdamp.py | 48 ++------
 .../surrogatemodel/utils_surrogatemodel.py | 86 +++++++++++++
 4 files changed, 136 insertions(+), 161 deletions(-)
 create mode 100644 pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils_surrogatemodel.py

diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py
index 97e19d71ea..4e88f33b0c 100644
--- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py
+++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py
@@ -2,77 +2,11 @@
 import pandas as pd
 import os

+from sklearn.preprocessing import FunctionTransformer
+
 from memilio.epidata import transformMobilityData as tmd
 from memilio.epidata import getDataIntoPandasDataFrame as gd
 from memilio.simulation.osecir import (ModelGraph, set_edges)
-from memilio.epidata import modifyDataframeSeries as mdfs
-
-
-def remove_confirmed_compartments(dataset_entries, num_groups):
- """! The compartments which contain confirmed cases are not needed and are
- therefore omitted by summarizing the confirmed compartment with the
- original compartment.
- @param dataset_entries Array that contains the compartmental data with
- confirmed compartments.
- @param num_groups Number of age groups.
- @return Array that contains the compartmental data without confirmed compartments.
- """ - - new_dataset_entries = [] - for i in dataset_entries: - dataset_entries_reshaped = i.reshape( - [num_groups, int(np.asarray(dataset_entries).shape[1]/num_groups)] - ) - sum_inf_no_symp = np.sum(dataset_entries_reshaped[:, [2, 3]], axis=1) - sum_inf_symp = np.sum(dataset_entries_reshaped[:, [4, 5]], axis=1) - dataset_entries_reshaped[:, 2] = sum_inf_no_symp - dataset_entries_reshaped[:, 4] = sum_inf_symp - new_dataset_entries.append( - np.delete(dataset_entries_reshaped, [3, 5], axis=1).flatten() - ) - return new_dataset_entries - - -def getBaselineMatrix(): - """! loads the baselinematrix - """ - - baseline_contact_matrix0 = os.path.join( - "./data/contacts/baseline_home.txt") - baseline_contact_matrix1 = os.path.join( - "./data/contacts/baseline_school_pf_eig.txt") - baseline_contact_matrix2 = os.path.join( - "./data/contacts/baseline_work.txt") - baseline_contact_matrix3 = os.path.join( - "./data/contacts/baseline_other.txt") - - baseline = np.loadtxt(baseline_contact_matrix0) \ - + np.loadtxt(baseline_contact_matrix1) + \ - np.loadtxt(baseline_contact_matrix2) + \ - np.loadtxt(baseline_contact_matrix3) - - return baseline - - -def getMinimumMatrix(): - """! loads the minimum matrix - """ - - minimum_contact_matrix0 = os.path.join( - "./data/contacts/minimum_home.txt") - minimum_contact_matrix1 = os.path.join( - "./data/contacts/minimum_school_pf_eig.txt") - minimum_contact_matrix2 = os.path.join( - "./data/contacts/minimum_work.txt") - minimum_contact_matrix3 = os.path.join( - "./data/contacts/minimum_other.txt") - - minimum = np.loadtxt(minimum_contact_matrix0) \ - + np.loadtxt(minimum_contact_matrix1) + \ - np.loadtxt(minimum_contact_matrix2) + \ - np.loadtxt(minimum_contact_matrix3) - - return minimum def make_graph(directory, num_regions, countykey_list, models): @@ -110,16 +44,40 @@ def transform_mobility_directory(): directory, mobility_file='commuter_migration_scaled') -def get_population(): - df_population = pd.read_json( - 'data/pydata/Germany/county_population.json') - age_groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80-130'] +def scale_data(data): + num_groups = int(np.asarray(data['inputs']).shape[2] / 8) + transformer = FunctionTransformer(np.log1p, validate=True) + + # Scale inputs + inputs = np.asarray( + data['inputs']).transpose(2, 0, 1, 3).reshape(num_groups * 8, -1) + scaled_inputs = transformer.transform(inputs) + original_shape_input = np.asarray(data['inputs']).shape + + # Reverse the reshape + reshaped_back = scaled_inputs.reshape(original_shape_input[2], + original_shape_input[0], + original_shape_input[1], + original_shape_input[3]) + + # Reverse the transpose + original_inputs = reshaped_back.transpose(1, 2, 0, 3) + scaled_inputs = original_inputs.transpose(0, 3, 1, 2) + + # Scale labels + labels = np.asarray( + data['labels']).transpose(2, 0, 1, 3).reshape(num_groups * 8, -1) + scaled_labels = transformer.transform(labels) + original_shape_labels = np.asarray(data['labels']).shape + + # Reverse the reshape + reshaped_back = scaled_labels.reshape(original_shape_labels[2], + original_shape_labels[0], + original_shape_labels[1], + original_shape_labels[3]) - df_population_agegroups = pd.DataFrame( - columns=[df_population.columns[0]] + age_groups) - for region_id in df_population.iloc[:, 0]: - df_population_agegroups.loc[len(df_population_agegroups.index), :] = [int(region_id)] + list( - mdfs.fit_age_group_intervals(df_population[df_population.iloc[:, 0] == int(region_id)].iloc[:, 2:], age_groups)) + # Reverse the transpose + original_labels = 
reshaped_back.transpose(1, 2, 0, 3) + scaled_labels = original_labels.transpose(0, 3, 1, 2) - population = df_population_agegroups.values.tolist - return population + return scaled_inputs, scaled_labels diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py index dc78c1ccc6..6e440af4c1 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py @@ -6,7 +6,7 @@ import pandas as pd from progress.bar import Bar -from sklearn.preprocessing import FunctionTransformer + from datetime import date from memilio.simulation import (AgeGroup, LogLevel, set_log_level) @@ -14,8 +14,10 @@ InfectionState, Model, interpolate_simulation_result) from memilio.epidata import geoModificationGermany as geoger import memilio.epidata.getPopulationData as gpd -from .GNN_utils import (getBaselineMatrix, transform_mobility_directory, - make_graph, remove_confirmed_compartments, get_population) +from .GNN_utils import (transform_mobility_directory, + make_graph, scale_data) +from memilio.surrogatemodel.utils_surrogatemodel import ( + getBaselineMatrix, remove_confirmed_compartments, get_population) def run_secir_groups_simulation(days, populations): @@ -173,44 +175,7 @@ def generate_data( if save_data: - # we use a logistic transformer to reduce the - # influence of ouliers and to correct the skewness of out dataset - # as a result we obtaina dataset which is handled better by the NN - - num_groups = int(np.asarray(data['inputs']).shape[2] / 8) - transformer = FunctionTransformer(np.log1p, validate=True) - - # Scale inputs - inputs = np.asarray( - data['inputs']).transpose(2, 0, 1, 3).reshape(num_groups * 8, -1) - scaled_inputs = transformer.transform(inputs) - original_shape_input = np.asarray(data['inputs']).shape - - # Reverse the reshape - reshaped_back = scaled_inputs.reshape(original_shape_input[2], - original_shape_input[0], - original_shape_input[1], - original_shape_input[3]) - - # Reverse the transpose - original_inputs = reshaped_back.transpose(1, 2, 0, 3) - scaled_inputs = original_inputs.transpose(0, 3, 1, 2) - - # Scale labels - labels = np.asarray( - data['labels']).transpose(2, 0, 1, 3).reshape(num_groups * 8, -1) - scaled_labels = transformer.transform(labels) - original_shape_labels = np.asarray(data['labels']).shape - - # Reverse the reshape - reshaped_back = scaled_labels.reshape(original_shape_labels[2], - original_shape_labels[0], - original_shape_labels[1], - original_shape_labels[3]) - - # Reverse the transpose - original_labels = reshaped_back.transpose(1, 2, 0, 3) - scaled_labels = original_labels.transpose(0, 3, 1, 2) + scaled_inputs, scaled_labels = scale_data(data) all_data = {"inputs": scaled_inputs, "labels": scaled_labels, diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py index 9d36331859..b11616f843 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py @@ -14,8 +14,10 @@ interpolate_simulation_result) from memilio.epidata import geoModificationGermany as geoger -from .GNN_utils import (getBaselineMatrix, make_graph, - remove_confirmed_compartments, getMinimumMatrix, 
get_population) +from .GNN_utils import (transform_mobility_directory, + make_graph, scale_data) +from memilio.surrogatemodel.utils_surrogatemodel import ( + getBaselineMatrix, remove_confirmed_compartments, get_population, getMinimumMatrix) def run_secir_groups_simulation(days, populations, dampings): @@ -133,7 +135,8 @@ def run_secir_groups_simulation(days, populations, dampings): model.apply_constraints() models.append(model) - graph = make_graph(num_regions, countykey_list, models) + directory = transform_mobility_directory() + graph = make_graph(directory, num_regions, countykey_list, models) study = ParameterStudy(graph, 0, days, dt=dt, num_runs=1) study.run() @@ -268,44 +271,7 @@ def generate_data( if save_data: - # we use a logistic transformer to reduce the - # influence of ouliers and to correct the skewness of out dataset - # as a result we obtaina dataset which is handled better by the NN - - num_groups = int(np.asarray(data['inputs']).shape[2]/8) - transformer = FunctionTransformer(np.log1p, validate=True) - - # Scale inputs - inputs = np.asarray( - data['inputs']).transpose(2, 0, 1, 3).reshape(num_groups*8, -1) - scaled_inputs = transformer.transform(inputs) - original_shape_input = np.asarray(data['inputs']).shape - - # Reverse the reshape of inputs - reshaped_back = scaled_inputs.reshape( - original_shape_input[2], original_shape_input[0], - original_shape_input[1], original_shape_input[3] - ) - - # Reverse the transpose of inputs - original_inputs = reshaped_back.transpose(1, 2, 0, 3) - scaled_inputs = original_inputs.transpose(0, 3, 1, 2) - - # Scale labels - labels = np.asarray( - data['labels']).transpose(2, 0, 1, 3).reshape(num_groups*8, -1) - scaled_labels = transformer.transform(labels) - original_shape_labels = np.asarray(data['labels']).shape - - # Reverse the reshape of labels - reshaped_back = scaled_labels.reshape( - original_shape_labels[2], original_shape_labels[0], - original_shape_labels[1], original_shape_labels[3] - ) - - # Reverse the transpose of labels - original_labels = reshaped_back.transpose(1, 2, 0, 3) - scaled_labels = original_labels.transpose(0, 3, 1, 2) + scaled_inputs, scaled_labels = scale_data(data) all_data = {"inputs": scaled_inputs, "labels": scaled_labels, diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils_surrogatemodel.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils_surrogatemodel.py new file mode 100644 index 0000000000..72f3335df6 --- /dev/null +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils_surrogatemodel.py @@ -0,0 +1,86 @@ +import numpy as np +import pandas as pd +import os +from memilio.epidata import modifyDataframeSeries as mdfs + + +def remove_confirmed_compartments(dataset_entries, num_groups): + """! The compartments which contain confirmed cases are not needed and are + therefore omitted by summarizing the confirmed compartment with the + original compartment. + @param dataset_entries Array that contains the compartmental data with + confirmed compartments. + @param num_groups Number of age groups. + @return Array that contains the compartmental data without confirmed compartments. 
+ """ + + new_dataset_entries = [] + for i in dataset_entries: + dataset_entries_reshaped = i.reshape( + [num_groups, int(np.asarray(dataset_entries).shape[1]/num_groups)] + ) + sum_inf_no_symp = np.sum(dataset_entries_reshaped[:, [2, 3]], axis=1) + sum_inf_symp = np.sum(dataset_entries_reshaped[:, [4, 5]], axis=1) + dataset_entries_reshaped[:, 2] = sum_inf_no_symp + dataset_entries_reshaped[:, 4] = sum_inf_symp + new_dataset_entries.append( + np.delete(dataset_entries_reshaped, [3, 5], axis=1).flatten() + ) + return new_dataset_entries + + +def getBaselineMatrix(): + """! loads the baselinematrix + """ + + baseline_contact_matrix0 = os.path.join( + "./data/contacts/baseline_home.txt") + baseline_contact_matrix1 = os.path.join( + "./data/contacts/baseline_school_pf_eig.txt") + baseline_contact_matrix2 = os.path.join( + "./data/contacts/baseline_work.txt") + baseline_contact_matrix3 = os.path.join( + "./data/contacts/baseline_other.txt") + + baseline = np.loadtxt(baseline_contact_matrix0) \ + + np.loadtxt(baseline_contact_matrix1) + \ + np.loadtxt(baseline_contact_matrix2) + \ + np.loadtxt(baseline_contact_matrix3) + + return baseline + + +def getMinimumMatrix(): + """! loads the minimum matrix + """ + + minimum_contact_matrix0 = os.path.join( + "./data/contacts/minimum_home.txt") + minimum_contact_matrix1 = os.path.join( + "./data/contacts/minimum_school_pf_eig.txt") + minimum_contact_matrix2 = os.path.join( + "./data/contacts/minimum_work.txt") + minimum_contact_matrix3 = os.path.join( + "./data/contacts/minimum_other.txt") + + minimum = np.loadtxt(minimum_contact_matrix0) \ + + np.loadtxt(minimum_contact_matrix1) + \ + np.loadtxt(minimum_contact_matrix2) + \ + np.loadtxt(minimum_contact_matrix3) + + return minimum + + +def get_population(): + df_population = pd.read_json( + 'data/pydata/Germany/county_population.json') + age_groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80-130'] + + df_population_agegroups = pd.DataFrame( + columns=[df_population.columns[0]] + age_groups) + for region_id in df_population.iloc[:, 0]: + df_population_agegroups.loc[len(df_population_agegroups.index), :] = [int(region_id)] + list( + mdfs.fit_age_group_intervals(df_population[df_population.iloc[:, 0] == int(region_id)].iloc[:, 2:], age_groups)) + + population = df_population_agegroups.values.tolist + return population From 77af0ab5a2ec51841da045fc6f418e83cc2de36c Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Thu, 19 Sep 2024 13:29:43 +0200 Subject: [PATCH 23/33] add test for saving mechanism --- .../test_surrogatemodel_GNN.py | 106 +++++++++++++----- 1 file changed, 76 insertions(+), 30 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py index f3fc0794a0..a28e076c1b 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py @@ -98,6 +98,7 @@ def test_simulation_run_nodamp(self, mock_transform_mobility, #### test data genertion no damp #### + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.getBaselineMatrix', return_value=0.6 * np.ones((6, 6))) @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.make_graph', @@ -147,28 +148,44 @@ def test_data_generation_runs_nodamp( self.assertEqual(len(data_2['labels'][0]), label_width_2) self.assertEqual(len(data_2['labels'][0][0]), 48) - # 
@patch('memilio.surrogatemodel.GNN.data_generation_nodamp.getBaselineMatrix', - # return_value=0.6 * np.ones((6, 6))) + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.getBaselineMatrix', + return_value=0.6 * np.ones((6, 6))) + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.get_population', + return_value=np.random.randint(0, 700001, size=(400, 6))) + # create mock graph + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.make_graph', + return_value=graph) + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.scale_data', + autospec=True) + @patch('memilio.epidata.transformMobilityData.updateMobility2022') + @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.transform_mobility_directory', + autospec=True) + def test_data_generation_save_nodamp( + self, mock_transform_mobility, mock_update_mobility, mock_scale_data, + mock_population, mock_baseline, mock_graph): + + mock_transform_mobility.side_effect = lambda: None + mock_update_mobility.side_effect = lambda directory, mobility_file: None - # @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.get_population', - # return_value= np.random.randint(0, 700001, size=(400, 6))) - # # create mock graph - # @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.make_graph', - # return_value= graph) + # Mock the return value of scale_data with dummy inputs and labels + # Assuming scaled inputs and labels are 4D arrays based on the original function's output + mock_scale_data.return_value = (np.random.rand( + 10, 8, 10, 6), np.random.rand(10, 8, 10, 6)) - # def test_data_generation_save_nodamp( - # self, mock_population, mock_baseline, mock_graph): + input_width = 2 + label_width = 3 + num_runs = 1 - # input_width = 2 - # label_width = 3 - # num_runs = 1 + # Call the function being tested + data_generation_nodamp.generate_data(num_runs, self.path, input_width, + label_width) - # data_generation_nodamp.generate_data(num_runs, self.path, input_width, - # label_width) - # self.assertEqual(len(os.listdir(self.path)), 1) + # Check the number of generated files + self.assertEqual(len(os.listdir(self.path)), 1) - # self.assertEqual(os.listdir(self.path), - # ['data_secir_groups.pickle']) + # Check the contents of the directory + self.assertEqual(os.listdir(self.path), + ['data_secir_age_groups.pickle']) @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.getMinimumMatrix', return_value=0 * np.ones((6, 6))) @@ -178,7 +195,7 @@ def test_data_generation_runs_nodamp( @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.make_graph', return_value=graph) @patch('memilio.epidata.transformMobilityData.updateMobility2022') - @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.transform_mobility_directory', + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.transform_mobility_directory', autospec=True) def test_simulation_run_withdamp(self, mock_transform_mobility, mock_update_mobility, mock_minimum, mock_baseline, mock_graph): @@ -228,6 +245,8 @@ def test_simulation_run_withdamp(self, mock_transform_mobility, mock_update_mobi self.assertEqual(len(damping_coeff2), len(dampings2)) self.assertEqual(len(damping_coeff3), len(dampings3)) +# test data generation with dampings + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.getMinimumMatrix', return_value=0 * np.ones((6, 6))) @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.getBaselineMatrix', @@ -239,7 +258,7 @@ def test_simulation_run_withdamp(self, mock_transform_mobility, mock_update_mobi 
return_value=np.random.randint(0, 700001, size=(400, 6))) # mock transform directory function @patch('memilio.epidata.transformMobilityData.updateMobility2022') - @patch('memilio.surrogatemodel.GNN.data_generation_nodamp.transform_mobility_directory', autospec=True) + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.transform_mobility_directory', autospec=True) def test_data_generation_runs_withdamp( self, mock_transform_mobility, mock_update_mobility, mock_minimum, mock_baseline, mock_graph, mock_population): @@ -279,17 +298,44 @@ def test_data_generation_runs_withdamp( self.assertEqual(len(data_2['labels'][0]), label_width_2) self.assertEqual(len(data_2['labels'][0][0]), 48) + # test saving for model with dampings + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.getMinimumMatrix', + return_value=0 * np.ones((6, 6))) + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.getBaselineMatrix', + return_value=0.6 * np.ones((6, 6))) + # create mock graph + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.make_graph', + return_value=graph) + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.get_population', + return_value=np.random.randint(0, 700001, size=(400, 6))) + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.scale_data', + autospec=True) + # @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.generate_dampings_withshadowdamp', + # return_value=[4, 6]) + @patch('memilio.epidata.transformMobilityData.updateMobility2022') + @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.transform_mobility_directory', + autospec=True) + def test_data_generation_save_withdamp( + self, mock_transform_mobility, mock_update_mobility, mock_scale_data, + mock_population, mock_baseline, mcok_minimum, mock_graph): + + mock_transform_mobility.side_effect = lambda: None + mock_update_mobility.side_effect = lambda directory, mobility_file: None + + mock_scale_data.return_value = (np.random.rand( + 10, 8, 10, 6), np.random.rand(10, 8, 10, 6)) + + input_width = 5 + label_width = 20 + num_runs = 2 + num_dampings = 2 + data = data_generation_withdamp.generate_data(num_runs, self.path, input_width, + label_width, num_dampings, save_data=True) + + self.assertEqual(len(os.listdir(self.path)), 1) + self.assertEqual(os.listdir(self.path), + ['data_secir_age_groups.pickle']) + - # def test_data_generation_save_withdamp( - # self, mock_population, mock_baseline, mock_minimum): - # input_width = 2 - # label_width = 3 - # num_runs = 1 - # dampings = 3 - # data_generation_withdamp.generate_data(num_runs, self.path, "", input_width, - # label_width) - # self.assertEqual(len(os.listdir(self.path)), 1) - # self.assertEqual(os.listdir(self.path), - # ['data_secir_groups.pickle']) if __name__ == '__main__': unittest.main() From ce4a494f49594484d649647a0fc4297e1cf1d791 Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Thu, 19 Sep 2024 13:49:55 +0200 Subject: [PATCH 24/33] import functions from new utils file --- .../ode_secir_groups/data_generation.py | 65 ++----------------- .../ode_secir_simple/data_generation.py | 18 ++--- .../surrogatemodel/utils_surrogatemodel.py | 3 +- 3 files changed, 14 insertions(+), 72 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/data_generation.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/data_generation.py index 1cb1dac82d..5e94ebf689 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/data_generation.py +++ 
b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/data_generation.py @@ -34,6 +34,9 @@ InfectionState, Model, interpolate_simulation_result, simulate) +from memilio.surrogatemodel.utils_surrogatemodel import ( + getBaselineMatrix, getMinimumMatrix, get_population) + def interpolate_age_groups(data_entry): """! Interpolates the age groups from the population data into the age groups used in the simulation. @@ -214,7 +217,8 @@ def generate_data( days = input_width + label_width - 1 # Load population data - population = get_population(path_population) + # population = get_population(path_population) + population = get_population() # show progess in terminal for longer runs # Due to the random structure, there's currently no need to shuffle the data @@ -256,61 +260,6 @@ def generate_data( return data -def getBaselineMatrix(): - """! loads the baselinematrix - """ - - baseline_contact_matrix0 = os.path.join( - "./data/contacts/baseline_home.txt") - baseline_contact_matrix1 = os.path.join( - "./data/contacts/baseline_school_pf_eig.txt") - baseline_contact_matrix2 = os.path.join( - "./data/contacts/baseline_work.txt") - baseline_contact_matrix3 = os.path.join( - "./data/contacts/baseline_other.txt") - - baseline = np.loadtxt(baseline_contact_matrix0) \ - + np.loadtxt(baseline_contact_matrix1) + \ - np.loadtxt(baseline_contact_matrix2) + \ - np.loadtxt(baseline_contact_matrix3) - - return baseline - - -def getMinimumMatrix(): - """! loads the minimum matrix - """ - - minimum_contact_matrix0 = os.path.join( - "./data/contacts/minimum_home.txt") - minimum_contact_matrix1 = os.path.join( - "./data/contacts/minimum_school_pf_eig.txt") - minimum_contact_matrix2 = os.path.join( - "./data/contacts/minimum_work.txt") - minimum_contact_matrix3 = os.path.join( - "./data/contacts/minimum_other.txt") - - minimum = np.loadtxt(minimum_contact_matrix0) \ - + np.loadtxt(minimum_contact_matrix1) + \ - np.loadtxt(minimum_contact_matrix2) + \ - np.loadtxt(minimum_contact_matrix3) - - return minimum - - -def get_population(path): - """! read population data in list from dataset - @param path Path to the dataset containing the population data - """ - - with open(path) as f: - data = json.load(f) - population = [] - for data_entry in data: - population.append(interpolate_age_groups(data_entry)) - return population - - if __name__ == "__main__": # Store data relative to current file two levels higher. 
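Alongside the helpers imported above, the shared utils_surrogatemodel module also provides remove_confirmed_compartments, which the ode_secir_simple changes below switch to: it folds the two confirmed compartments into their unconfirmed counterparts, reducing ten compartments per age group to eight while conserving the total population. A small self-contained check; the function body mirrors the utils implementation shown earlier, and the toy numbers are made up for illustration:

    import numpy as np

    def remove_confirmed_compartments(dataset_entries, num_groups):
        # merge InfectedNoSymptomsConfirmed into InfectedNoSymptoms (cols 2, 3)
        # and InfectedSymptomsConfirmed into InfectedSymptoms (cols 4, 5),
        # then drop the confirmed columns
        new_dataset_entries = []
        for entry in dataset_entries:
            reshaped = entry.reshape(
                [num_groups, int(np.asarray(dataset_entries).shape[1] / num_groups)])
            reshaped[:, 2] = np.sum(reshaped[:, [2, 3]], axis=1)
            reshaped[:, 4] = np.sum(reshaped[:, [4, 5]], axis=1)
            new_dataset_entries.append(
                np.delete(reshaped, [3, 5], axis=1).flatten())
        return new_dataset_entries

    num_groups, num_compartments = 6, 10
    timeline = np.random.rand(31, num_groups * num_compartments) * 100
    merged = np.asarray(remove_confirmed_compartments(timeline.copy(), num_groups))

    assert merged.shape == (31, num_groups * 8)
    # merging compartments must not change the total population per day
    assert np.allclose(merged.sum(axis=1), timeline.sum(axis=1))

The script's __main__ block continues below.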
path = os.path.dirname(os.path.realpath(__file__)) @@ -322,6 +271,6 @@ def get_population(path): input_width = 5 label_width = 30 - num_runs = 10000 + num_runs = 10 data = generate_data(num_runs, path_output, path_population, input_width, - label_width) + label_width, save_data=True) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/data_generation.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/data_generation.py index 96eff3ac94..9dee453c70 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/data_generation.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/data_generation.py @@ -35,13 +35,7 @@ InfectionState, Model, Simulation, interpolate_simulation_result, simulate) - -def remove_confirmed_compartments(result_array): - sum_inf_no_symp = np.sum(result_array[:, [2, 3]], axis=1) - sum_inf_symp = np.sum(result_array[:, [2, 3]], axis=1) - result_array[:, 2] = sum_inf_no_symp - result_array[:, 4] = sum_inf_symp - return np.delete(result_array, [3, 5], axis=1) +from memilio.surrogatemodel.utils_surrogatemodel import remove_confirmed_compartments def run_secir_simple_simulation(days): @@ -121,13 +115,11 @@ def run_secir_simple_simulation(days): result_array = result.as_ndarray() result_array = remove_confirmed_compartments( - result_array[1:, :].transpose()) - - dataset = [] + result_array[1:, :].transpose(), 1) dataset_entries = copy.deepcopy(result_array) - return dataset_entries.tolist() + return dataset_entries def generate_data( @@ -205,6 +197,6 @@ def generate_data( input_width = 5 label_width = 30 - num_runs = 1000 + num_runs = 10000 data = generate_data(num_runs, path_data, input_width, - label_width) + label_width, save_data=True) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils_surrogatemodel.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils_surrogatemodel.py index 72f3335df6..0fc619092d 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils_surrogatemodel.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils_surrogatemodel.py @@ -82,5 +82,6 @@ def get_population(): df_population_agegroups.loc[len(df_population_agegroups.index), :] = [int(region_id)] + list( mdfs.fit_age_group_intervals(df_population[df_population.iloc[:, 0] == int(region_id)].iloc[:, 2:], age_groups)) - population = df_population_agegroups.values.tolist + population = df_population_agegroups.values.tolist() + return population From 63980d97843a786ba5565cbdcbc71dddb5783c90 Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Thu, 19 Sep 2024 14:00:44 +0200 Subject: [PATCH 25/33] adjust import --- .../memilio/surrogatemodel/GNN/GNN_utils.py | 2 ++ .../surrogatemodel/GNN/data_generation_nodamp.py | 10 ++++------ .../surrogatemodel/GNN/data_generation_withdamp.py | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py index 4e88f33b0c..6108d2d5a0 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py @@ -43,6 +43,8 @@ def transform_mobility_directory(): tmd.updateMobility2022( directory, mobility_file='commuter_migration_scaled') + return directory + def scale_data(data): num_groups = int(np.asarray(data['inputs']).shape[2] / 8) diff --git 
a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py index 6e440af4c1..3901fc9ebd 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py @@ -3,7 +3,6 @@ import pickle import random import numpy as np -import pandas as pd from progress.bar import Bar @@ -13,9 +12,8 @@ from memilio.simulation.osecir import (Index_InfectionState, interpolate_simulation_result, ParameterStudy, InfectionState, Model, interpolate_simulation_result) from memilio.epidata import geoModificationGermany as geoger -import memilio.epidata.getPopulationData as gpd -from .GNN_utils import (transform_mobility_directory, - make_graph, scale_data) +from memilio.surrogatemodel.GNN.GNN_utils import (transform_mobility_directory, + make_graph, scale_data) from memilio.surrogatemodel.utils_surrogatemodel import ( getBaselineMatrix, remove_confirmed_compartments, get_population) @@ -202,7 +200,7 @@ def generate_data( input_width = 5 days = 30 - num_runs = 1000 + num_runs = 10000 generate_data(num_runs, path_data, input_width, - days) + days, save_data=True) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py index b11616f843..e255f4e3ce 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py @@ -14,8 +14,8 @@ interpolate_simulation_result) from memilio.epidata import geoModificationGermany as geoger -from .GNN_utils import (transform_mobility_directory, - make_graph, scale_data) +from memilio.surrogatemodel.GNN.GNN_utils import (transform_mobility_directory, + make_graph, scale_data) from memilio.surrogatemodel.utils_surrogatemodel import ( getBaselineMatrix, remove_confirmed_compartments, get_population, getMinimumMatrix) @@ -292,9 +292,9 @@ def generate_data( if __name__ == "__main__": input_width = 5 - label_width = 100 - number_of_dampings = 3 - num_runs = 1000 + label_width = 30 + number_of_dampings = 1 + num_runs = 2 path = os.path.dirname(os.path.realpath(__file__)) path_data = os.path.join( os.path.dirname( @@ -302,4 +302,4 @@ def generate_data( 'data_GNN_with_'+str(number_of_dampings)+'_dampings_test') generate_data(num_runs, path_data, input_width, - label_width, number_of_dampings) + label_width, number_of_dampings, save_data=False) From cabf5a16a292687613a5249a700f2722672242b2 Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Thu, 19 Sep 2024 14:02:38 +0200 Subject: [PATCH 26/33] adjust imports --- .../test_surrogatemodel_GNN.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py index a28e076c1b..8371aed65a 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_GNN.py @@ -29,11 +29,7 @@ import unittest import numpy as np -import logging -import json -# suppress all autograph warnings from tensorflow -# logging.getLogger("tensorflow").setLevel(logging.ERROR) import tensorflow as tf 
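Worth noting before the remaining test clean-ups: every saving test in this module goes through pyfakefs, so nothing is written to the real disk. A minimal self-contained sketch of that pattern, independent of MEmilio; the file name and payload are invented for illustration:

    import os
    import pickle
    import unittest

    from pyfakefs import fake_filesystem_unittest


    class TestPickleSave(fake_filesystem_unittest.TestCase):
        path = '/home/'

        def setUp(self):
            # redirect all file system calls into an in-memory fake
            self.setUpPyfakefs()

        def test_save_creates_single_pickle(self):
            os.mkdir(self.path)
            payload = {'inputs': [1, 2, 3], 'labels': [4, 5, 6]}
            with open(os.path.join(self.path, 'data.pickle'), 'wb') as f:
                pickle.dump(payload, f)
            # the fake file system sees exactly one new file
            self.assertEqual(os.listdir(self.path), ['data.pickle'])


    if __name__ == '__main__':
        unittest.main()

This is why the save tests can assert on os.listdir(self.path) without any tear-down.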
tf.get_logger().setLevel('ERROR')
@@ -69,9 +65,7 @@ def setUp(self):
 autospec=True)
 def test_simulation_run_nodamp(self, mock_transform_mobility,
 mock_update_mobility, mock_baseline, mock_graph):
-
- # Mock the behavior of the mobility transformation function,
- # but it doesn't return anything.
- # So, we just need it to be called without side effects.
+
 mock_transform_mobility.side_effect = lambda: None
 mock_update_mobility.side_effect = lambda directory, mobility_file: None
@@ -112,9 +106,6 @@ def test_data_generation_runs_nodamp(
 self, mock_transform_mobility, mock_update_mobility,
 mock_baseline, mock_graph, mock_population):

- # Mock the behavior of the mobility transformation function,
- # but it doesn't return anything.
- # So, we just need it to be called without side effects.
 mock_transform_mobility.side_effect = lambda: None
 mock_update_mobility.side_effect = lambda directory, mobility_file: None
@@ -199,8 +190,7 @@ def test_data_generation_save_nodamp(
 autospec=True)
 def test_simulation_run_withdamp(self, mock_transform_mobility, mock_update_mobility,
 mock_minimum, mock_baseline, mock_graph):
-
- # Mock the behavior of the mobility transformation function, but it doesn't return anything.
- # So, we just need it to be called without side effects.
+
 mock_transform_mobility.side_effect = lambda: None
 mock_update_mobility.side_effect = lambda directory, mobility_file: None
@@ -310,8 +300,6 @@ def test_data_generation_save_withdamp(
 return_value=np.random.randint(0, 700001, size=(400, 6)))
 @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.scale_data',
 autospec=True)
- # @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.generate_dampings_withshadowdamp',
- # return_value=[4, 6])
 @patch('memilio.epidata.transformMobilityData.updateMobility2022')
 @patch('memilio.surrogatemodel.GNN.data_generation_withdamp.transform_mobility_directory',
 autospec=True)

From 06fb8dda94b65b0665a2a2b9c18a78be14a3f4a3 Mon Sep 17 00:00:00 2001
From: Agatha Schmidt
Date: Tue, 24 Sep 2024 15:30:09 +0200
Subject: [PATCH 27/33] put functions which read files outside of the
 run_simulation function

---
 .../memilio/surrogatemodel/GNN/GNN_utils.py | 85 ++++++++++++++++++-
 .../GNN/data_generation_nodamp.py | 24 +++---
 .../GNN/data_generation_withdamp.py | 22 ++---
 3 files changed, 106 insertions(+), 25 deletions(-)

diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py
index 6108d2d5a0..0f66e938b6 100644
--- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py
+++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py
@@ -1,12 +1,79 @@
 import numpy as np
 import pandas as pd
 import os
-
 from sklearn.preprocessing import FunctionTransformer

 from memilio.epidata import transformMobilityData as tmd
 from memilio.epidata import getDataIntoPandasDataFrame as gd
 from memilio.simulation.osecir import (ModelGraph, set_edges)
+from memilio.epidata import modifyDataframeSeries as mdfs
+
+
+def remove_confirmed_compartments(dataset_entries, num_groups):
+ """! The compartments which contain confirmed cases are not needed and are
+ therefore omitted by summarizing the confirmed compartment with the
+ original compartment.
+ @param dataset_entries Array that contains the compartmental data with
+ confirmed compartments.
+ @param num_groups Number of age groups.
+ @return Array that contains the compartmental data without confirmed compartments.
---
 .../memilio/surrogatemodel/GNN/GNN_utils.py   | 85 ++++++++++++++++++-
 .../GNN/data_generation_nodamp.py             | 24 +++---
 .../GNN/data_generation_withdamp.py           | 22 ++---
 3 files changed, 106 insertions(+), 25 deletions(-)

diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py
index 6108d2d5a0..0f66e938b6 100644
--- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py
+++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py
@@ -1,12 +1,79 @@
 import numpy as np
 import pandas as pd
 import os
-
 from sklearn.preprocessing import FunctionTransformer
 from memilio.epidata import transformMobilityData as tmd
 from memilio.epidata import getDataIntoPandasDataFrame as gd
 from memilio.simulation.osecir import (ModelGraph, set_edges)
+from memilio.epidata import modifyDataframeSeries as mdfs
+
+
+def remove_confirmed_compartments(dataset_entries, num_groups):
+    """! The compartments which contain confirmed cases are not needed and are
+    therefore omitted by summarizing the confirmed compartment with the
+    original compartment.
+    @param dataset_entries Array that contains the compartmental data with
+    confirmed compartments.
+    @param num_groups Number of age groups.
+    @return Array that contains the compartmental data without confirmed compartments.
+    """
+
+    new_dataset_entries = []
+    for i in dataset_entries:
+        dataset_entries_reshaped = i.reshape(
+            [num_groups, int(np.asarray(dataset_entries).shape[1]/num_groups)]
+        )
+        sum_inf_no_symp = np.sum(dataset_entries_reshaped[:, [2, 3]], axis=1)
+        sum_inf_symp = np.sum(dataset_entries_reshaped[:, [4, 5]], axis=1)
+        dataset_entries_reshaped[:, 2] = sum_inf_no_symp
+        dataset_entries_reshaped[:, 4] = sum_inf_symp
+        new_dataset_entries.append(
+            np.delete(dataset_entries_reshaped, [3, 5], axis=1).flatten()
+        )
+    return new_dataset_entries
+
+
+def getBaselineMatrix():
+    """! loads the baseline matrix
+    """
+
+    baseline_contact_matrix0 = os.path.join(
+        "./data/contacts/baseline_home.txt")
+    baseline_contact_matrix1 = os.path.join(
+        "./data/contacts/baseline_school_pf_eig.txt")
+    baseline_contact_matrix2 = os.path.join(
+        "./data/contacts/baseline_work.txt")
+    baseline_contact_matrix3 = os.path.join(
+        "./data/contacts/baseline_other.txt")
+
+    baseline = np.loadtxt(baseline_contact_matrix0) \
+        + np.loadtxt(baseline_contact_matrix1) + \
+        np.loadtxt(baseline_contact_matrix2) + \
+        np.loadtxt(baseline_contact_matrix3)
+
+    return baseline
+
+
+def getMinimumMatrix():
+    """! loads the minimum matrix
+    """
+
+    minimum_contact_matrix0 = os.path.join(
+        "./data/contacts/minimum_home.txt")
+    minimum_contact_matrix1 = os.path.join(
+        "./data/contacts/minimum_school_pf_eig.txt")
+    minimum_contact_matrix2 = os.path.join(
+        "./data/contacts/minimum_work.txt")
+    minimum_contact_matrix3 = os.path.join(
+        "./data/contacts/minimum_other.txt")
+
+    minimum = np.loadtxt(minimum_contact_matrix0) \
+        + np.loadtxt(minimum_contact_matrix1) + \
+        np.loadtxt(minimum_contact_matrix2) + \
+        np.loadtxt(minimum_contact_matrix3)
+
+    return minimum


 def make_graph(directory, num_regions, countykey_list, models):
     """!
     @param directory Directory with mobility data.
     @param num_regions Number (int) of counties that should be added to the
@@ -42,10 +109,24 @@ def transform_mobility_directory():
     tmd.updateMobility2022(directory, mobility_file='twitter_scaled_1252')
     tmd.updateMobility2022(
         directory, mobility_file='commuter_migration_scaled')
-
     return directory
 
 
+def get_population():
+    df_population = pd.read_json(
+        'data/pydata/Germany/county_population.json')
+    age_groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80-130']
+
+    df_population_agegroups = pd.DataFrame(
+        columns=[df_population.columns[0]] + age_groups)
+    for region_id in df_population.iloc[:, 0]:
+        df_population_agegroups.loc[len(df_population_agegroups.index), :] = [int(region_id)] + list(
+            mdfs.fit_age_group_intervals(df_population[df_population.iloc[:, 0] == int(region_id)].iloc[:, 2:], age_groups))
+
+    population = df_population_agegroups.values.tolist()
+    return population
+
+
 def scale_data(data):
     num_groups = int(np.asarray(data['inputs']).shape[2] / 8)
     transformer = FunctionTransformer(np.log1p, validate=True)
diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py
index 3901fc9ebd..0e4f7caa30 100644
--- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py
+++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py
@@ -18,7 +18,7 @@
     getBaselineMatrix, remove_confirmed_compartments, get_population)
 
 
-def run_secir_groups_simulation(days, populations):
+def run_secir_groups_simulation(days, populations, countykey_list, directory, baseline):
     """! Uses an ODE SECIR model allowing for asymptomatic infection with 6
     different age groups. The model is not stratified by region.
    Virus-specific parameters are fixed and initial number of persons
@@ -36,9 +36,6 @@ def run_secir_groups_simulation(days, populations):
     start_year = 2019
     dt = 0.1
 
-    # get county ids
-    countykey_list = geoger.get_county_ids(merge_eisenach=True, zfill=True)
-
     # Define age Groups
     groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80+']
     num_groups = len(groups)
@@ -105,9 +102,6 @@ def run_secir_groups_simulation(days, populations):
     model.parameters.StartDay = (
         date(start_year, start_month, start_day) - date(start_year, 1, 1)).days
 
-        # Load baseline and minimum contact matrix and assign them to the model
-        baseline = getBaselineMatrix()
-
         model.parameters.ContactPatterns.cont_freq_mat[0].baseline = baseline
         model.parameters.ContactPatterns.cont_freq_mat[0].minimum = np.ones(
             (num_groups, num_groups)) * 0
@@ -116,7 +110,6 @@ def run_secir_groups_simulation(days, populations):
         model.apply_constraints()
         models.append(model)
 
-    directory = transform_mobility_directory()
     graph = make_graph(directory, num_regions, countykey_list, models)
 
     study = ParameterStudy(graph, 0, days, dt=dt, num_runs=1)
@@ -152,6 +145,13 @@ def generate_data(
         "labels": [],
     }
 
+    # get county ids
+    countykey_list = geoger.get_county_ids(merge_eisenach=True, zfill=True)
+    # Load baseline and minimum contact matrix and assign them to the model
+    baseline = getBaselineMatrix()
+    # transform directory by merging Eisenach and Wartburgkreis
+    directory = transform_mobility_directory()
+
     # show progess in terminal for longer runs
     # Due to the random structure, theres currently no need to shuffle the data
     bar = Bar('Number of Runs done', max=num_runs)
@@ -159,7 +159,7 @@ def generate_data(
 
     for _ in range(num_runs):
         data_run = run_secir_groups_simulation(
-            days_sum, population)
+            days_sum, population, countykey_list, directory, baseline)
         inputs = np.asarray(data_run).transpose(1, 2, 0)[: input_width]
         data["inputs"].append(inputs)
@@ -196,11 +196,11 @@ def generate_data(
     path_data = os.path.join(
         os.path.dirname(
             os.path.realpath(os.path.dirname(os.path.realpath(path)))),
-        'data_GNN_nodamp')
+        'data_GNN_nodamp_test')
 
     input_width = 5
-    days = 30
-    num_runs = 10000
+    days = 3
+    num_runs = 2
 
     generate_data(num_runs, path_data, input_width,
                   days, save_data=True)
diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py
index e255f4e3ce..a8485b34aa 100644
--- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py
+++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py
@@ -20,7 +20,7 @@
     getBaselineMatrix, remove_confirmed_compartments, get_population, getMinimumMatrix)
 
 
-def run_secir_groups_simulation(days, populations, dampings):
+def run_secir_groups_simulation(days, populations, dampings, countykey_list, directory, baseline):
     """! Uses an ODE SECIR model allowing for asymptomatic infection with 6
     different age groups. The model is not stratified by region.
Virus-specific parameters are fixed and initial number of persons @@ -38,9 +38,6 @@ def run_secir_groups_simulation(days, populations, dampings): start_year = 2019 dt = 0.1 - # get county ids - countykey_list = geoger.get_county_ids(merge_eisenach=True, zfill=True) - # Define age Groups groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80+'] num_groups = len(groups) @@ -107,12 +104,9 @@ def run_secir_groups_simulation(days, populations, dampings): model.parameters.StartDay = ( date(start_year, start_month, start_day) - date(start_year, 1, 1)).days - # Load baseline and minimum contact matrix and assign them to the model - baseline = getBaselineMatrix() - minimum = getMinimumMatrix() - model.parameters.ContactPatterns.cont_freq_mat[0].baseline = baseline - model.parameters.ContactPatterns.cont_freq_mat[0].minimum = minimum + model.parameters.ContactPatterns.cont_freq_mat[0].minimum = np.ones( + (num_groups, num_groups)) * 0 # Generate a damping matrix and assign it to the model damped_matrices = [] @@ -135,7 +129,6 @@ def run_secir_groups_simulation(days, populations, dampings): model.apply_constraints() models.append(model) - directory = transform_mobility_directory() graph = make_graph(directory, num_regions, countykey_list, models) study = ParameterStudy(graph, 0, days, dt=dt, num_runs=1) @@ -248,6 +241,13 @@ def generate_data( "damping_day": [], "damped_matrix": []} + # get county ids + countykey_list = geoger.get_county_ids(merge_eisenach=True, zfill=True) + directory = transform_mobility_directory() + + # Load baseline matrix + baseline = getBaselineMatrix() + # show progess in terminal for longer runs # Due to the random structure, theres currently no need to shuffle the data bar = Bar('Number of Runs done', max=num_runs) @@ -255,7 +255,7 @@ def generate_data( for i in range(num_runs): data_run, damped_contact_matrix, damping_days_s, damping_factor = run_secir_groups_simulation( - days_sum, population, damping_days[i]) + days_sum, population, damping_days[i], countykey_list, countykey_list, baseline) inputs = np.asarray(data_run).transpose(1, 2, 0)[:input_width] data["inputs"].append(inputs) From f2345c710857a0785c2eeaaf5aa02b0e0d93095f Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Tue, 24 Sep 2024 15:40:54 +0200 Subject: [PATCH 28/33] add directory as parameter --- .../memilio/surrogatemodel/GNN/data_generation_withdamp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py index a8485b34aa..d800bc7520 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py @@ -255,7 +255,7 @@ def generate_data( for i in range(num_runs): data_run, damped_contact_matrix, damping_days_s, damping_factor = run_secir_groups_simulation( - days_sum, population, damping_days[i], countykey_list, countykey_list, baseline) + days_sum, population, damping_days[i], countykey_list, directory, baseline) inputs = np.asarray(data_run).transpose(1, 2, 0)[:input_width] data["inputs"].append(inputs) From d26528e97633581c6e3751fe17044da603e695da Mon Sep 17 00:00:00 2001 From: Agatha Schmidt Date: Wed, 25 Sep 2024 16:16:25 +0200 Subject: [PATCH 29/33] set edges only one time --- .../memilio/surrogatemodel/GNN/GNN_utils.py | 6 +++--- .../GNN/data_generation_withdamp.py | 17 +++++++++++++---- 2 files 
changed, 16 insertions(+), 7 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py index 0f66e938b6..274b310809 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py @@ -76,7 +76,7 @@ def getMinimumMatrix(): return minimum -def make_graph(directory, num_regions, countykey_list, models): +def make_graph(directory, num_regions, countykey_list, models, edges): """! @param directory Directory with mobility data. @param num_regions Number (int) of counties that should be added to the @@ -91,8 +91,8 @@ def make_graph(directory, num_regions, countykey_list, models): num_locations = 4 - set_edges(os.path.abspath(os.path.join(directory, os.pardir)), - graph, num_locations) + # set_edges(os.path.abspath(os.path.join(directory, os.pardir)), + # graph, num_locations) return graph diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py index d800bc7520..868effff97 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py @@ -11,7 +11,7 @@ from memilio.simulation import (AgeGroup, LogLevel, set_log_level, Damping) from memilio.simulation.osecir import (Index_InfectionState, interpolate_simulation_result, ParameterStudy, InfectionState, Model, - interpolate_simulation_result) + interpolate_simulation_result, ModelGraph, set_edges) from memilio.epidata import geoModificationGermany as geoger from memilio.surrogatemodel.GNN.GNN_utils import (transform_mobility_directory, @@ -20,7 +20,7 @@ getBaselineMatrix, remove_confirmed_compartments, get_population, getMinimumMatrix) -def run_secir_groups_simulation(days, populations, dampings, countykey_list, directory, baseline): +def run_secir_groups_simulation(days, populations, dampings, countykey_list, directory, baseline, edges): """! Uses an ODE SECIR model allowing for asymptomatic infection with 6 different age groups. The model is not stratified by region. 
Virus-specific parameters are fixed and initial number of persons @@ -129,7 +129,7 @@ def run_secir_groups_simulation(days, populations, dampings, countykey_list, dir model.apply_constraints() models.append(model) - graph = make_graph(directory, num_regions, countykey_list, models) + graph = make_graph(directory, num_regions, countykey_list, models, edges) study = ParameterStudy(graph, 0, days, dt=dt, num_runs=1) study.run() @@ -248,6 +248,15 @@ def generate_data( # Load baseline matrix baseline = getBaselineMatrix() + num_locations = 4 + graph = ModelGraph() + for i in range(len(countykey_list)): + graph.add_node(int(countykey_list[i]), Model(6)) + + set_edges(os.path.abspath(os.path.join(directory, os.pardir)), + graph, num_locations) + edges = graph.edges() + # show progess in terminal for longer runs # Due to the random structure, theres currently no need to shuffle the data bar = Bar('Number of Runs done', max=num_runs) @@ -255,7 +264,7 @@ def generate_data( for i in range(num_runs): data_run, damped_contact_matrix, damping_days_s, damping_factor = run_secir_groups_simulation( - days_sum, population, damping_days[i], countykey_list, directory, baseline) + days_sum, population, damping_days[i], countykey_list, directory, baseline, edges) inputs = np.asarray(data_run).transpose(1, 2, 0)[:input_width] data["inputs"].append(inputs) From e39fd71a98d8d7857c4603eb7c583cc45206025a Mon Sep 17 00:00:00 2001 From: HenrZu <69154294+HenrZu@users.noreply.github.com> Date: Wed, 2 Oct 2024 10:07:10 +0200 Subject: [PATCH 30/33] new structure for no damp --- .../memilio/surrogatemodel/GNN/GNN_utils.py | 6 +- .../GNN/data_generation_nodamp.py | 228 +++++++++++------- 2 files changed, 139 insertions(+), 95 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py index 274b310809..0f66e938b6 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/GNN_utils.py @@ -76,7 +76,7 @@ def getMinimumMatrix(): return minimum -def make_graph(directory, num_regions, countykey_list, models, edges): +def make_graph(directory, num_regions, countykey_list, models): """! @param directory Directory with mobility data. 
@param num_regions Number (int) of counties that should be added to the @@ -91,8 +91,8 @@ def make_graph(directory, num_regions, countykey_list, models, edges): num_locations = 4 - # set_edges(os.path.abspath(os.path.join(directory, os.pardir)), - # graph, num_locations) + set_edges(os.path.abspath(os.path.join(directory, os.pardir)), + graph, num_locations) return graph diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py index 0e4f7caa30..c0f84261b5 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_nodamp.py @@ -2,6 +2,9 @@ import os import pickle import random +import time +import memilio.simulation as mio +import memilio.simulation.osecir as osecir import numpy as np from progress.bar import Bar @@ -16,104 +19,147 @@ make_graph, scale_data) from memilio.surrogatemodel.utils_surrogatemodel import ( getBaselineMatrix, remove_confirmed_compartments, get_population) - - -def run_secir_groups_simulation(days, populations, countykey_list, directory, baseline): - """! Uses an ODE SECIR model allowing for asymptomatic infection with 6 - different age groups. The model is not stratified by region. - Virus-specific parameters are fixed and initial number of persons +from enum import Enum + + +class Location(Enum): + Home = 0 + School = 1 + Work = 2 + Other = 3 + + +start_date = mio.Date(2019, 1, 1) +end_date = mio.Date(2020, 12, 31) + + +def set_covid_parameters(model, num_groups=6): + for i in range(num_groups): + # Compartment transition duration + model.parameters.TimeExposed[AgeGroup(i)] = 3.2 + model.parameters.TimeInfectedNoSymptoms[AgeGroup(i)] = 2. + model.parameters.TimeInfectedSymptoms[AgeGroup(i)] = 6. + model.parameters.TimeInfectedSevere[AgeGroup(i)] = 12. + model.parameters.TimeInfectedCritical[AgeGroup(i)] = 8. 
+
+        # Compartment transition probabilities
+        model.parameters.RelativeTransmissionNoSymptoms[AgeGroup(i)] = 0.5
+        model.parameters.TransmissionProbabilityOnContact[AgeGroup(
+            i)] = 0.1
+        model.parameters.RecoveredPerInfectedNoSymptoms[AgeGroup(i)] = 0.09
+        model.parameters.RiskOfInfectionFromSymptomatic[AgeGroup(i)] = 0.25
+        model.parameters.SeverePerInfectedSymptoms[AgeGroup(i)] = 0.2
+        model.parameters.CriticalPerSevere[AgeGroup(i)] = 0.25
+        model.parameters.DeathsPerCritical[AgeGroup(i)] = 0.3
+        model.parameters.MaxRiskOfInfectionFromSymptomatic[AgeGroup(
+            i)] = 0.5
+
+    # StartDay is the n-th day of the year
+    model.parameters.StartDay = start_date.day_in_year
+
+
+def set_contact_matrices(model, data_dir, num_groups=6):
+    contact_matrices = mio.ContactMatrixGroup(
+        len(list(Location)), num_groups)
+    locations = ["home", "school_pf_eig", "work", "other"]
+
+    for i, location in enumerate(locations):
+        baseline_file = os.path.join(
+            data_dir, "contacts", "baseline_" + location + ".txt")
+        minimum_file = os.path.join(
+            data_dir, "contacts", "minimum_" + location + ".txt")
+        contact_matrices[i] = mio.ContactMatrix(
+            mio.read_mobility_plain(baseline_file),
+            mio.read_mobility_plain(minimum_file)
+        )
+    model.parameters.ContactPatterns.cont_freq_mat = contact_matrices
+
+
+def get_graph(num_groups, data_dir):
+    model = Model(num_groups)
+    set_covid_parameters(model)
+    set_contact_matrices(model, data_dir)
+
+    graph = osecir.ModelGraph()
+
+    scaling_factor_infected = [2.5, 2.5, 2.5, 2.5, 2.5, 2.5]
+    scaling_factor_icu = 1.0
+    tnt_capacity_factor = 7.5 / 100000.
+
+    path_population_data = os.path.join(
+        data_dir, "pydata", "Germany",
+        "county_current_population.json")
+
+    mio.osecir.set_nodes(
+        model.parameters,
+        mio.Date(start_date.year,
+                 start_date.month, start_date.day),
+        mio.Date(end_date.year,
+                 end_date.month, end_date.day), data_dir,
+        path_population_data, True, graph, scaling_factor_infected,
+        scaling_factor_icu, tnt_capacity_factor, 0, False)
+
+    mio.osecir.set_edges(
+        data_dir, graph, len(Location))
+
+    return graph
+
+
+def run_secir_groups_simulation(days, graph, num_groups=6):
+    """! Uses an ODE SECIR model allowing for asymptomatic infection with 6
+    different age groups. The model is not stratified by region.
+    Virus-specific parameters are fixed and initial number of persons
     in the particular infection states are chosen randomly from defined ranges.
     @param Days Describes how many days we simulate within a single run.
-    @param populations List containing the population in each age group.
-    @return List containing the populations in each compartment used to initialize
+    @param Graph Graph initialized for the start_date with the population data which
+    is sampled during the run.
+    @return List containing the populations in each compartment used to initialize
     the run.
     """
-
-    set_log_level(LogLevel.Off)
-
-    start_day = 1
-    start_month = 1
-    start_year = 2019
-    dt = 0.1
-
-    # Define age Groups
-    groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80+']
-    num_groups = len(groups)
-    num_regions = len(populations)
-    models = []
-
-    # Initialize Parameters
-    for region in range(num_regions):
-        model = Model(num_groups)
+    for node_indx in range(graph.num_nodes):
+        model = graph.get_node(node_indx).property
 
         # Set parameters
+        # TODO: Put This in the draw_sample function in the ParameterStudy
         for i in range(num_groups):
-            # Compartment transition duration
-            model.parameters.TimeExposed[AgeGroup(i)] = 3.2
-            model.parameters.TimeInfectedNoSymptoms[AgeGroup(i)] = 2.
- model.parameters.TimeInfectedSymptoms[AgeGroup(i)] = 6. - model.parameters.TimeInfectedSevere[AgeGroup(i)] = 12. - model.parameters.TimeInfectedCritical[AgeGroup(i)] = 8. + age_group = AgeGroup(i) + pop_age_group = model.populations.get_group_total_AgeGroup( + age_group) # Initial number of people in each compartment with random numbers # Numbers are chosen heuristically based on experience - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.Exposed)] = random.uniform( - 0.00025, 0.005) * populations[region][i] - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.InfectedNoSymptoms)] = random.uniform( - 0.0001, 0.0035) * populations[region][i] - model.populations[AgeGroup(i), Index_InfectionState( + model.populations[age_group, Index_InfectionState(InfectionState.Exposed)] = random.uniform( + 0.00025, 0.005) * pop_age_group + model.populations[age_group, Index_InfectionState(InfectionState.InfectedNoSymptoms)] = random.uniform( + 0.0001, 0.0035) * pop_age_group + model.populations[age_group, Index_InfectionState( InfectionState.InfectedNoSymptomsConfirmed)] = 0 - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.InfectedSymptoms)] = random.uniform( - 0.00007, 0.001) * populations[region][i] - model.populations[AgeGroup(i), Index_InfectionState( + model.populations[age_group, Index_InfectionState(InfectionState.InfectedSymptoms)] = random.uniform( + 0.00007, 0.001) * pop_age_group + model.populations[age_group, Index_InfectionState( InfectionState.InfectedSymptomsConfirmed)] = 0 - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.InfectedSevere)] = random.uniform( - 0.00003, 0.0006) * populations[region][i] - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.InfectedCritical)] = random.uniform( - 0.00001, 0.0002) * populations[region][i] - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.Recovered)] = random.uniform( - 0.002, 0.08) * populations[region][i] - model.populations[AgeGroup(i), - Index_InfectionState(InfectionState.Dead)] = random.uniform( - 0, 0.0003) * populations[region][i] + model.populations[age_group, Index_InfectionState(InfectionState.InfectedSevere)] = random.uniform( + 0.00003, 0.0006) * pop_age_group + model.populations[age_group, Index_InfectionState(InfectionState.InfectedCritical)] = random.uniform( + 0.00001, 0.0002) * pop_age_group + model.populations[age_group, Index_InfectionState(InfectionState.Recovered)] = random.uniform( + 0.002, 0.08) * pop_age_group + model.populations[age_group, Index_InfectionState(InfectionState.Dead)] = random.uniform( + 0, 0.0003) * pop_age_group model.populations.set_difference_from_group_total_AgeGroup( - (AgeGroup(i), Index_InfectionState(InfectionState.Susceptible)), - populations[region][i]) - - # Compartment transition propabilities - model.parameters.RelativeTransmissionNoSymptoms[AgeGroup(i)] = 0.5 - model.parameters.TransmissionProbabilityOnContact[AgeGroup( - i)] = 0.1 - model.parameters.RecoveredPerInfectedNoSymptoms[AgeGroup(i)] = 0.09 - model.parameters.RiskOfInfectionFromSymptomatic[AgeGroup(i)] = 0.25 - model.parameters.SeverePerInfectedSymptoms[AgeGroup(i)] = 0.2 - model.parameters.CriticalPerSevere[AgeGroup(i)] = 0.25 - model.parameters.DeathsPerCritical[AgeGroup(i)] = 0.3 - model.parameters.MaxRiskOfInfectionFromSymptomatic[AgeGroup( - i)] = 0.5 - - # StartDay is the n-th day of the year - model.parameters.StartDay = ( - date(start_year, start_month, start_day) - date(start_year, 1, 1)).days - - 
model.parameters.ContactPatterns.cont_freq_mat[0].baseline = baseline
-        model.parameters.ContactPatterns.cont_freq_mat[0].minimum = np.ones(
-            (num_groups, num_groups)) * 0
+            (age_group, Index_InfectionState(InfectionState.Susceptible)),
+            pop_age_group)
 
         # Apply mathematical constraints to parameters
         model.apply_constraints()
-        models.append(model)
 
-    graph = make_graph(directory, num_regions, countykey_list, models)
+        # set model to graph
+        graph.get_node(node_indx).property.populations = model.populations
 
-    study = ParameterStudy(graph, 0, days, dt=dt, num_runs=1)
+    study = ParameterStudy(graph, 0, days, dt=0.5, num_runs=1)
+    start_time = time.time()
     study.run()
+    print("Simulation took: ", time.time() - start_time)
 
     graph_run = study.run()[0]
     results = interpolate_simulation_result(graph_run)
@@ -129,28 +175,24 @@
 
 
 def generate_data(
-        num_runs, path, input_width, days, save_data=True):
+        num_runs, data_dir, path, input_width, days, save_data=True):
     """! Generate dataset by calling run_secir_simulation (num_runs)-often
     @param num_runs Number of times, the function run_secir_simulation is called.
+    @param data_dir Directory with all data needed to initialize the models.
     @param path Path, where the datasets are stored.
     @param input_width number of time steps used for model input.
     @param label_width number of time steps (days) used as model output/label.
     @param save_data Option to deactivate the save of the dataset. Per default true.
     """
-
-    population = get_population()
+    set_log_level(mio.LogLevel.Error)
     days_sum = days + input_width - 1
 
     data = {"inputs": [],
             "labels": [],
             }
 
-    # get county ids
-    countykey_list = geoger.get_county_ids(merge_eisenach=True, zfill=True)
-    # Load baseline and minimum contact matrix and assign them to the model
-    baseline = getBaselineMatrix()
-    # transform directory by merging Eisenach and Wartburgkreis
-    directory = transform_mobility_directory()
+    num_groups = 6
+    graph = get_graph(num_groups, data_dir)
 
     # show progess in terminal for longer runs
     # Due to the random structure, theres currently no need to shuffle the data
     bar = Bar('Number of Runs done', max=num_runs)
@@ -159,7 +201,7 @@
 
     for _ in range(num_runs):
         data_run = run_secir_groups_simulation(
-            days_sum, population, countykey_list, directory, baseline)
+            days_sum, graph)
         inputs = np.asarray(data_run).transpose(1, 2, 0)[: input_width]
         data["inputs"].append(inputs)
@@ -198,9 +240,11 @@
             os.path.realpath(os.path.dirname(os.path.realpath(path)))),
         'data_GNN_nodamp_test')
 
+    data_dir = os.path.join(os.getcwd(), 'data')
+
     input_width = 5
-    days = 3
-    num_runs = 2
+    days = 30
+    num_runs = 1
 
-    generate_data(num_runs, path_data, input_width,
+    generate_data(num_runs, data_dir, path_data, input_width,
                   days, save_data=True)

From ff6c57149d4b579d0d59029525b5a1357e9a0073 Mon Sep 17 00:00:00 2001
From: HenrZu <69154294+HenrZu@users.noreply.github.com>
Date: Wed, 2 Oct 2024 10:07:10 +0200
Subject: [PATCH 31/33] timing graph sim (Delete before Merge!)
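Adds ad-hoc wall-clock timing around the graph advance loop so the cost of a
single graph simulation can be measured while benchmarking the data
generation. It mirrors the timing the Python scripts now print around the
parameter study (a sketch of that pattern, as added in the previous patch):

    study = ParameterStudy(graph, 0, days, dt=0.5, num_runs=1)
    start_time = time.time()
    study.run()
    print("Simulation took: ", time.time() - start_time)

As the subject says, this instrumentation is for local measurements only and
must be deleted before merging.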
---
 cpp/memilio/mobility/graph_simulation.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/cpp/memilio/mobility/graph_simulation.h b/cpp/memilio/mobility/graph_simulation.h
index 27f3942689..94000937f3 100644
--- a/cpp/memilio/mobility/graph_simulation.h
+++ b/cpp/memilio/mobility/graph_simulation.h
@@ -22,6 +22,7 @@
 #include "memilio/mobility/graph.h"
 #include "memilio/utils/random_number_generator.h"
+#include <chrono>
 
 namespace mio
 {
@@ -60,7 +61,9 @@ class GraphSimulationBase
 
     void advance(double t_max = 1.0)
     {
-        auto dt = m_dt;
+        auto dt         = m_dt;
+        auto start_time = std::chrono::high_resolution_clock::now(); // record the start time
+
         while (m_t < t_max) {
             if (m_t + dt > t_max) {
                 dt = t_max - m_t;
@@ -77,6 +80,12 @@ class GraphSimulationBase
                                  m_graph.nodes()[e.end_node_idx].property);
             }
         }
+
+        auto end_time = std::chrono::high_resolution_clock::now(); // record the end time
+        std::chrono::duration<double> execution_time = end_time - start_time; // compute the execution time
+
+        std::cout << "t = " << m_t << " execution time (Graph Simulation): " << execution_time.count() << "sec"
+                  << std::endl;
     }
 
     double get_t() const

From ac705a46fc23671b94d2d59ab2a0d82d72a80bb4 Mon Sep 17 00:00:00 2001
From: HenrZu <69154294+HenrZu@users.noreply.github.com>
Date: Wed, 2 Oct 2024 10:20:12 +0200
Subject: [PATCH 32/33] with_dampings
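Applies the restructuring from the no-damping script (patch 30) to
data_generation_withdamp.py: the graph is built once via get_graph(), and
each run only resamples the initial compartment values and applies the
damping days. Since run_secir_groups_simulation() adds dampings to the node
parameters, the contact patterns are reset from a reference before every run
(a sketch of the loop introduced in this patch, taken from the diff below):

    model_params = graph.get_node(0).property.parameters

    for i in range(num_runs):
        # reset contact matrix in each node
        for node_indx in range(graph.num_nodes):
            graph.get_node(node_indx).property.parameters = model_params

        data_run, damped_contact_matrix, damping_days_s, damping_factor = \
            run_secir_groups_simulation(days_sum, graph, damping_days[i])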
---
 .../GNN/data_generation_withdamp.py | 234 +++++++++++-------
 1 file changed, 139 insertions(+), 95 deletions(-)

diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py
index 868effff97..857c71f7da 100644
--- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py
+++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py
@@ -3,7 +3,11 @@
 import pickle
 import random
 import numpy as np
+import time
+import memilio.simulation as mio
+import memilio.simulation.osecir as osecir
 from datetime import date
+from enum import Enum
 
 from progress.bar import Bar
 from sklearn.preprocessing import FunctionTransformer
@@ -20,93 +24,137 @@
     getBaselineMatrix, remove_confirmed_compartments, get_population, getMinimumMatrix)
 
 
-def run_secir_groups_simulation(days, populations, dampings, countykey_list, directory, baseline, edges):
+class Location(Enum):
+    Home = 0
+    School = 1
+    Work = 2
+    Other = 3
+
+
+start_date = mio.Date(2019, 1, 1)
+end_date = mio.Date(2020, 12, 31)
+
+
+def set_covid_parameters(model, num_groups=6):
+    for i in range(num_groups):
+        # Compartment transition duration
+        model.parameters.TimeExposed[AgeGroup(i)] = 3.2
+        model.parameters.TimeInfectedNoSymptoms[AgeGroup(i)] = 2.
+        model.parameters.TimeInfectedSymptoms[AgeGroup(i)] = 6.
+        model.parameters.TimeInfectedSevere[AgeGroup(i)] = 12.
+        model.parameters.TimeInfectedCritical[AgeGroup(i)] = 8.
+
+        # Compartment transition probabilities
+        model.parameters.RelativeTransmissionNoSymptoms[AgeGroup(i)] = 0.5
+        model.parameters.TransmissionProbabilityOnContact[AgeGroup(
+            i)] = 0.1
+        model.parameters.RecoveredPerInfectedNoSymptoms[AgeGroup(i)] = 0.09
+        model.parameters.RiskOfInfectionFromSymptomatic[AgeGroup(i)] = 0.25
+        model.parameters.SeverePerInfectedSymptoms[AgeGroup(i)] = 0.2
+        model.parameters.CriticalPerSevere[AgeGroup(i)] = 0.25
+        model.parameters.DeathsPerCritical[AgeGroup(i)] = 0.3
+        model.parameters.MaxRiskOfInfectionFromSymptomatic[AgeGroup(
+            i)] = 0.5
+
+    # StartDay is the n-th day of the year
+    model.parameters.StartDay = start_date.day_in_year
+
+
+def set_contact_matrices(model, data_dir, num_groups=6):
+    contact_matrices = mio.ContactMatrixGroup(
+        len(list(Location)), num_groups)
+    locations = ["home", "school_pf_eig", "work", "other"]
+
+    for i, location in enumerate(locations):
+        baseline_file = os.path.join(
+            data_dir, "contacts", "baseline_" + location + ".txt")
+        minimum_file = os.path.join(
+            data_dir, "contacts", "minimum_" + location + ".txt")
+        contact_matrices[i] = mio.ContactMatrix(
+            mio.read_mobility_plain(baseline_file),
+            mio.read_mobility_plain(minimum_file)
+        )
+    model.parameters.ContactPatterns.cont_freq_mat = contact_matrices
+
+
+def get_graph(num_groups, data_dir):
+    model = Model(num_groups)
+    set_covid_parameters(model)
+    set_contact_matrices(model, data_dir)
+
+    graph = osecir.ModelGraph()
+
+    scaling_factor_infected = [2.5, 2.5, 2.5, 2.5, 2.5, 2.5]
+    scaling_factor_icu = 1.0
+    tnt_capacity_factor = 7.5 / 100000.
+
+    path_population_data = os.path.join(
+        data_dir, "pydata", "Germany",
+        "county_current_population.json")
+
+    mio.osecir.set_nodes(
+        model.parameters,
+        mio.Date(start_date.year,
+                 start_date.month, start_date.day),
+        mio.Date(end_date.year,
+                 end_date.month, end_date.day), data_dir,
+        path_population_data, True, graph, scaling_factor_infected,
+        scaling_factor_icu, tnt_capacity_factor, 0, False)
+
+    mio.osecir.set_edges(
+        data_dir, graph, len(Location))
+
+    return graph
+
+
+def run_secir_groups_simulation(days, graph, dampings, num_groups=6):
     """! Uses an ODE SECIR model allowing for asymptomatic infection with 6
     different age groups. The model is not stratified by region.
     Virus-specific parameters are fixed and initial number of persons
     in the particular infection states are chosen randomly from defined ranges.
     @param days (int) Describes how many days we simulate within a single run.
+    @param Graph Graph initialized for the start_date with the population data which
+    is sampled during the run.
     @param damping_day (int) The day when damping is applied.
-    @param populations List containing the population in each age group.
     @return List containing the populations in each compartment used to initialize
     the run.
     """
-    set_log_level(LogLevel.Off)
-
-    start_day = 1
-    start_month = 1
-    start_year = 2019
-    dt = 0.1
-
-    # Define age Groups
-    groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80+']
-    num_groups = len(groups)
-    num_regions = len(populations)
-    models = []
-
-    # Initialize Parameters
-    for region in range(num_regions):
-        model = Model(num_groups)
+    for node_indx in range(graph.num_nodes):
+        model = graph.get_node(node_indx).property
 
         # Set parameters
+        # TODO: Put This in the draw_sample function in the ParameterStudy
        for i in range(num_groups):
-            # Compartment transition duration
-            model.parameters.TimeExposed[AgeGroup(i)] = 3.2
-            model.parameters.TimeInfectedNoSymptoms[AgeGroup(i)] = 2.
- model.parameters.TimeInfectedSymptoms[AgeGroup(i)] = 6. - model.parameters.TimeInfectedSevere[AgeGroup(i)] = 12. - model.parameters.TimeInfectedCritical[AgeGroup(i)] = 8. + age_group = AgeGroup(i) + pop_age_group = model.populations.get_group_total_AgeGroup( + age_group) # Initial number of people in each compartment with random numbers # Numbers are chosen heuristically based on experience - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.Exposed)] = random.uniform( - 0.00025, 0.005) * populations[region][i] - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.InfectedNoSymptoms)] = random.uniform( - 0.0001, 0.0035) * populations[region][i] - model.populations[AgeGroup(i), Index_InfectionState( + model.populations[age_group, Index_InfectionState(InfectionState.Exposed)] = random.uniform( + 0.00025, 0.005) * pop_age_group + model.populations[age_group, Index_InfectionState(InfectionState.InfectedNoSymptoms)] = random.uniform( + 0.0001, 0.0035) * pop_age_group + model.populations[age_group, Index_InfectionState( InfectionState.InfectedNoSymptomsConfirmed)] = 0 - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.InfectedSymptoms)] = random.uniform( - 0.00007, 0.001) * populations[region][i] - model.populations[AgeGroup(i), Index_InfectionState( + model.populations[age_group, Index_InfectionState(InfectionState.InfectedSymptoms)] = random.uniform( + 0.00007, 0.001) * pop_age_group + model.populations[age_group, Index_InfectionState( InfectionState.InfectedSymptomsConfirmed)] = 0 - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.InfectedSevere)] = random.uniform( - 0.00003, 0.0006) * populations[region][i] - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.InfectedCritical)] = random.uniform( - 0.00001, 0.0002) * populations[region][i] - model.populations[AgeGroup(i), Index_InfectionState( - InfectionState.Recovered)] = random.uniform( - 0.002, 0.08) * populations[region][i] - model.populations[AgeGroup(i), - Index_InfectionState(InfectionState.Dead)] = random.uniform( - 0, 0.0003) * populations[region][i] + model.populations[age_group, Index_InfectionState(InfectionState.InfectedSevere)] = random.uniform( + 0.00003, 0.0006) * pop_age_group + model.populations[age_group, Index_InfectionState(InfectionState.InfectedCritical)] = random.uniform( + 0.00001, 0.0002) * pop_age_group + model.populations[age_group, Index_InfectionState(InfectionState.Recovered)] = random.uniform( + 0.002, 0.08) * pop_age_group + model.populations[age_group, Index_InfectionState(InfectionState.Dead)] = random.uniform( + 0, 0.0003) * pop_age_group model.populations.set_difference_from_group_total_AgeGroup( - (AgeGroup(i), Index_InfectionState(InfectionState.Susceptible)), populations[region][i]) - - # Compartment transition propabilities - model.parameters.RelativeTransmissionNoSymptoms[AgeGroup(i)] = 0.5 - model.parameters.TransmissionProbabilityOnContact[AgeGroup( - i)] = 0.1 - model.parameters.RecoveredPerInfectedNoSymptoms[AgeGroup(i)] = 0.09 - model.parameters.RiskOfInfectionFromSymptomatic[AgeGroup(i)] = 0.25 - model.parameters.SeverePerInfectedSymptoms[AgeGroup(i)] = 0.2 - model.parameters.CriticalPerSevere[AgeGroup(i)] = 0.25 - model.parameters.DeathsPerCritical[AgeGroup(i)] = 0.3 - # twice the value of RiskOfInfectionFromSymptomatic - model.parameters.MaxRiskOfInfectionFromSymptomatic[AgeGroup( - i)] = 0.5 - - # StartDay is the n-th day of the year - model.parameters.StartDay = ( - date(start_year, 
start_month, start_day) - date(start_year, 1, 1)).days - - model.parameters.ContactPatterns.cont_freq_mat[0].baseline = baseline - model.parameters.ContactPatterns.cont_freq_mat[0].minimum = np.ones( - (num_groups, num_groups)) * 0 + (age_group, Index_InfectionState(InfectionState.Susceptible)), + pop_age_group) + + # Apply mathematical constraints to parameters + model.apply_constraints() # Generate a damping matrix and assign it to the model damped_matrices = [] @@ -127,12 +175,13 @@ def run_secir_groups_simulation(days, populations, dampings, countykey_list, dir # Apply mathematical constraints to parameters model.apply_constraints() - models.append(model) + # set model to graph + graph.get_node(node_indx).property.populations = model.populations - graph = make_graph(directory, num_regions, countykey_list, models, edges) - - study = ParameterStudy(graph, 0, days, dt=dt, num_runs=1) + study = ParameterStudy(graph, 0, days, dt=0.5, num_runs=1) + start_time = time.time() study.run() + print("Simulation took: ", time.time() - start_time) graph_run = study.run()[0] results = interpolate_simulation_result(graph_run) @@ -209,18 +258,22 @@ def generate_dampings_withshadowdamp(number_of_dampings, days, min_distance, min def generate_data( - num_runs, path, input_width, label_width, number_of_dampings, save_data=True): + num_runs, data_dir, path, input_width, label_width, number_of_dampings, save_data=True): """! Generate dataset by calling run_secir_simulation (num_runs)-often @param num_runs Number of times, the function run_secir_simulation is called. + @param data_dir Directory with all data needed to initialize the models. @param path Path, where the datasets are stored. @param input_width Number of time steps used for model input. @param label_width Number of time steps (days) used as model output/label. @param number_of_dampings (int) The number of contact change points applied to the simulation. @param save_data Option to deactivate the save of the dataset. Per default true. 
""" - population = get_population() + set_log_level(mio.LogLevel.Error) days_sum = label_width+input_width-1 + num_groups = 6 + graph = get_graph(num_groups, data_dir) + # generate dampings damping_days = generate_dampings_withshadowdamp( number_of_dampings=number_of_dampings, days=label_width, @@ -241,30 +294,19 @@ def generate_data( "damping_day": [], "damped_matrix": []} - # get county ids - countykey_list = geoger.get_county_ids(merge_eisenach=True, zfill=True) - directory = transform_mobility_directory() - - # Load baseline matrix - baseline = getBaselineMatrix() - - num_locations = 4 - graph = ModelGraph() - for i in range(len(countykey_list)): - graph.add_node(int(countykey_list[i]), Model(6)) - - set_edges(os.path.abspath(os.path.join(directory, os.pardir)), - graph, num_locations) - edges = graph.edges() - # show progess in terminal for longer runs # Due to the random structure, theres currently no need to shuffle the data bar = Bar('Number of Runs done', max=num_runs) + model_params = graph.get_node(0).property.parameters + for i in range(num_runs): + # reset contact matrix in each node + for node_indx in range(graph.num_nodes): + graph.get_node(node_indx).property.parameters = model_params data_run, damped_contact_matrix, damping_days_s, damping_factor = run_secir_groups_simulation( - days_sum, population, damping_days[i], countykey_list, directory, baseline, edges) + days_sum, graph, damping_days[i]) inputs = np.asarray(data_run).transpose(1, 2, 0)[:input_width] data["inputs"].append(inputs) @@ -301,14 +343,16 @@ def generate_data( if __name__ == "__main__": input_width = 5 - label_width = 30 + label_width = 95 number_of_dampings = 1 - num_runs = 2 + num_runs = 1 path = os.path.dirname(os.path.realpath(__file__)) path_data = os.path.join( os.path.dirname( os.path.realpath(os.path.dirname(os.path.realpath(path)))), 'data_GNN_with_'+str(number_of_dampings)+'_dampings_test') - generate_data(num_runs, path_data, input_width, + data_dir = os.path.join(os.getcwd(), 'data') + + generate_data(num_runs, data_dir, path_data, input_width, label_width, number_of_dampings, save_data=False) From 10d5a9822e15bb279af2484c2484ad4f9421a814 Mon Sep 17 00:00:00 2001 From: HenrZu <69154294+HenrZu@users.noreply.github.com> Date: Wed, 2 Oct 2024 11:15:26 +0200 Subject: [PATCH 33/33] [ci skip] damping correctly setted and reseted after each run --- .../surrogatemodel/GNN/data_generation_withdamp.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py index 857c71f7da..1bf7857461 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py @@ -157,6 +157,7 @@ def run_secir_groups_simulation(days, graph, dampings, num_groups=6): model.apply_constraints() # Generate a damping matrix and assign it to the model + # TODO: This can be done outside and is (currently) static for all models damped_matrices = [] damping_coeff = [] for day in dampings: @@ -177,6 +178,7 @@ def run_secir_groups_simulation(days, graph, dampings, num_groups=6): model.apply_constraints() # set model to graph graph.get_node(node_indx).property.populations = model.populations + graph.get_node(node_indx).property.parameters = model.parameters study = ParameterStudy(graph, 0, days, dt=0.5, num_runs=1) start_time = 
---
 .../surrogatemodel/GNN/data_generation_withdamp.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py
index 857c71f7da..1bf7857461 100644
--- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py
+++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/GNN/data_generation_withdamp.py
@@ -157,6 +157,7 @@ def run_secir_groups_simulation(days, graph, dampings, num_groups=6):
         model.apply_constraints()
 
         # Generate a damping matrix and assign it to the model
+        # TODO: This can be done outside and is (currently) static for all models
         damped_matrices = []
         damping_coeff = []
         for day in dampings:
@@ -177,6 +178,7 @@ def run_secir_groups_simulation(days, graph, dampings, num_groups=6):
         model.apply_constraints()
         # set model to graph
         graph.get_node(node_indx).property.populations = model.populations
+        graph.get_node(node_indx).property.parameters = model.parameters
 
     study = ParameterStudy(graph, 0, days, dt=0.5, num_runs=1)
     start_time = time.time()
@@ -298,13 +300,13 @@
     # Due to the random structure, theres currently no need to shuffle the data
     bar = Bar('Number of Runs done', max=num_runs)
 
-    model_params = graph.get_node(0).property.parameters
+    model_params = copy.deepcopy(graph.get_node(0).property.parameters)
 
     for i in range(num_runs):
-
+        params_run = copy.deepcopy(model_params)
         # reset contact matrix in each node
         for node_indx in range(graph.num_nodes):
-            graph.get_node(node_indx).property.parameters = model_params
+            graph.get_node(node_indx).property.parameters = params_run
 
         data_run, damped_contact_matrix, damping_days_s, damping_factor = run_secir_groups_simulation(
             days_sum, graph, damping_days[i])
@@ -344,8 +346,8 @@
 
     input_width = 5
     label_width = 95
-    number_of_dampings = 1
-    num_runs = 1
+    number_of_dampings = 3
+    num_runs = 5
     path = os.path.dirname(os.path.realpath(__file__))
     path_data = os.path.join(
        os.path.dirname(