change duration to deterministic and adapt create_xes

bptlab · Jan 17, 2024 · 1c223bd · 1c223bd
1 parent 9f06697
commit 1c223bd
Show file tree

Hide file tree

Showing 3 changed files with 85 additions and 62 deletions.
diff --git a/test.py b/test.py
@@ -10,20 +10,21 @@
 from tracex.extraction.prototype import utils as u
 from tracex.extraction.prototype import function_calls as fc
 from tracex.extraction.prototype import prompts as p
+from tracex.extraction.prototype import create_xes as x
 
-text = open(u.input_path / "journey_synth_covid_1.txt").read()
-df = ih.convert_text_to_bulletpoints(text)
-print(df)
-df = ih.add_start_dates(text, df)
-df = ih.add_end_dates(text, df)
-# df = ih.add_durations(text, df)
-# df = ih.add_event_types(df)
-# df = ih.add_locations(df)
+# text = open(u.input_path / "journey_synth_covid_0.txt").read()
+# df = ih.convert_text_to_bulletpoints(text)
+# print(df)
+# df = ih.add_start_dates(text, df)
+# df = ih.add_end_dates(text, df)
+# df = ih.add_durations(df)
+# # df = ih.add_event_types(df)
+# # df = ih.add_locations(df)
 
-print(df)
+# print(df)
 
-df.to_csv(u.output_path / "intermediates/dataframe.csv")
-ih.convert_dataframe_to_csv(df)
+# ih.convert_dataframe_to_csv(df)
+x.create_xes(u.output_path / "single_trace.csv", "test", "event_information")
 
 # output = 'The end date for the bulletpoint "Experiencing mild cough and fatigue" and the start date 20220108T0000 is 20230911T0000.'
 

diff --git a/tracex/extraction/prototype/create_xes.py b/tracex/extraction/prototype/create_xes.py
@@ -19,16 +19,16 @@ def get_activity_key():
 def create_xes(csv_file, name, key):
     """Creates a xes with all traces from the regarding csv."""
     dataframe = pd.read_csv(csv_file, sep=",")
-    dataframe["caseID"] = dataframe["caseID"].astype(str)
-    dataframe["start"] = pd.to_datetime(dataframe["start"])
-    dataframe["end"] = pd.to_datetime(dataframe["end"])
+    dataframe["case_id"] = dataframe["case_id"].astype(str)
+    dataframe["start_date"] = pd.to_datetime(dataframe["start_date"])
+    dataframe["end_date"] = pd.to_datetime(dataframe["end_date"])
     dataframe["duration"] = pd.to_timedelta(dataframe["duration"])
     dataframe = dataframe.rename(
         columns={
             key: "concept:name",
-            "caseID": "case:concept:name",
-            "start": "time:timestamp",
-            "end": "time:endDate",
+            "case_id": "case:concept:name",
+            "start_date": "time:timestamp",
+            "end_date": "time:end_date",
             "duration": "time:duration",
         }
     )

diff --git a/tracex/extraction/prototype/input_handling.py b/tracex/extraction/prototype/input_handling.py
@@ -1,5 +1,6 @@
 """Module providing functions for converting text to CSV."""
 import pandas as pd
+from datetime import datetime
 
 from . import utils as u
 from . import prompts as p
@@ -86,7 +87,7 @@ def add_start_dates(text, df):
             },
             {"role": "assistant", "content": p.START_DATE_ANSWER},
         ]
-        
+
         output = u.query_gpt(messages)
 
         fc_message = [
@@ -103,8 +104,10 @@ def add_start_dates(text, df):
 
         if start_date == "N/A" and row_count > 1:
             last_index = new_df.index[-1]
-            previous_index = last_index -1
-            new_df.at[last_index, 'start_date'] = new_df.at[previous_index, 'start_date']
+            previous_index = last_index - 1
+            new_df.at[last_index, "start_date"] = new_df.at[
+                previous_index, "start_date"
+            ]
 
         print(name + ": " + str(i) + "      ", end="\r")
         i = i + 1
@@ -161,49 +164,68 @@ def add_end_dates(text, df):
     return df
 
 
-def add_durations(text, df):
-    """Adds durations to the bulletpoints."""
-    name = "duration"
-    new_df = pd.DataFrame([], columns=[name])
-    values_list = df.values.tolist()
-    i = 0
-    for item in values_list:
-        messages = [
-            {"role": "system", "content": p.DURATION_CONTEXT},
-            {
-                "role": "user",
-                "content": p.DURATION_PROMPT
-                + "\nThe text: "
-                + text
-                + "\nThe bulletpoint: "
-                + item[0]
-                + "\nThe start date: "
-                + item[1]
-                + "\nThe end date: "
-                + item[2],
-            },
-            {"role": "assistant", "content": p.DURATION_ANSWER},
-        ]
-        output = u.query_gpt(messages)
+# def add_durations(text, df):
+#     """Adds durations to the bulletpoints."""
+#     name = "duration"
+#     new_df = pd.DataFrame([], columns=[name])
+#     values_list = df.values.tolist()
+#     i = 0
+#     for item in values_list:
+#         messages = [
+#             {"role": "system", "content": p.DURATION_CONTEXT},
+#             {
+#                 "role": "user",
+#                 "content": p.DURATION_PROMPT
+#                 + "\nThe text: "
+#                 + text
+#                 + "\nThe bulletpoint: "
+#                 + item[0]
+#                 + "\nThe start date: "
+#                 + item[1]
+#                 + "\nThe end date: "
+#                 + item[2],
+#             },
+#             {"role": "assistant", "content": p.DURATION_ANSWER},
+#         ]
+#         output = u.query_gpt(messages)
+
+#         fc_message = [
+#             {"role": "system", "content": p.FC_DURATION_CONTEXT},
+#             {"role": "user", "content": p.FC_DURATION_PROMPT + "The text: " + output},
+#         ]
+#         duration = u.query_gpt(
+#             fc_message,
+#             tool_choice={"type": "function", "function": {"name": "add_duration"}},
+#         )
+#         new_row = pd.DataFrame([duration], columns=[name])
+#         new_df = pd.concat([new_df, new_row], ignore_index=True)
+#         print(name + ": " + str(i) + "      ", end="\r")
+#         i = i + 1
+#         with open(
+#             (u.output_path / "intermediates/bulletpoints.txt"),
+#             "a",
+#         ) as f:
+#             f.write("\n" + output)
+#     df = pd.concat([df, new_df], axis=1)
+#     return df
+
+
+def add_durations(df):
+    # Helper that computes the duration in the desired format
+    def calculate_row_duration(row):
+        if row["start_date"] == "N/A" or row["end_date"] == "N/A":
+            return "N/A"
+
+        start_date = datetime.strptime(row["start_date"], "%Y%m%dT%H%M")
+        end_date = datetime.strptime(row["end_date"], "%Y%m%dT%H%M")
+        duration = end_date - start_date
+        hours, remainder = divmod(duration.total_seconds(), 3600)
+        minutes, seconds = divmod(remainder, 60)
+        return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"
+
+    # Create the new 'duration' column and compute the duration for each row
+    df["duration"] = df.apply(calculate_row_duration, axis=1)
 
-        fc_message = [
-            {"role": "system", "content": p.FC_DURATION_CONTEXT},
-            {"role": "user", "content": p.FC_DURATION_PROMPT + "The text: " + output},
-        ]
-        duration = u.query_gpt(
-            fc_message,
-            tool_choice={"type": "function", "function": {"name": "add_duration"}},
-        )
-        new_row = pd.DataFrame([duration], columns=[name])
-        new_df = pd.concat([new_df, new_row], ignore_index=True)
-        print(name + ": " + str(i) + "      ", end="\r")
-        i = i + 1
-        with open(
-            (u.output_path / "intermediates/bulletpoints.txt"),
-            "a",
-        ) as f:
-            f.write("\n" + output)
-    df = pd.concat([df, new_df], axis=1)
     return df