🔥implement backup time extractor

bptlab · Jan 29, 2024 · db1c7d8 · db1c7d8
1 parent 14371bd
commit db1c7d8
Show file tree

Hide file tree

Showing 6 changed files with 143 additions and 7 deletions.
diff --git a/tracex/extraction/logic/modules/__init__.py b/tracex/extraction/logic/modules/__init__.py
@@ -1,5 +1,5 @@
 from .module_patient_journey_generator import PatientJourneyGenerator
 from .module_activity_labeler import ActivityLabeler
-from .module_time_extractor import TimeExtractor
+from .module_time_extractor_backup import TimeExtractorBackup
 from .module_location_extractor import LocationExtractor
 from .module_event_type_classifier import EventTypeClassifier
diff --git a/tracex/extraction/logic/modules/module_time_extractor.py b/tracex/extraction/logic/modules/module_time_extractor.py
@@ -36,6 +36,7 @@ def __extract_start_date(self, activity_label):
             {"role": "assistant", "content": p.START_DATE_ANSWER},
         ]
         output = u.query_gpt(messages)
+        print(output + "\n")
         fc_message = [
             {"role": "system", "content": p.FC_START_DATE_CONTEXT},
             {"role": "user", "content": p.FC_START_DATE_PROMPT + "The text: " + output},

diff --git a/tracex/extraction/logic/modules/module_time_extractor_backup.py b/tracex/extraction/logic/modules/module_time_extractor_backup.py
@@ -0,0 +1,133 @@
+from datetime import datetime
+from pathlib import Path
+
+from ..logging import log_execution_time
+from ..module import Module
+from .. import prompts as p
+from .. import utils as u
+
+
+class TimeExtractorBackup(Module):
+    """
+    This is the backup module that extracts the time information from the patient journey. This includes start
+    dates, end dates and durations.
+
+    Start and end dates are requested from the language model one event at a time and are handled as strings in
+    the format ``DATE_FORMAT``. Durations are computed locally from those two strings.
+    """
+
+    # strptime format every extracted date has to match, e.g. "20200101T0000".
+    DATE_FORMAT = "%Y%m%dT%H%M"
+
+    def __init__(self):
+        super().__init__()
+        self.name = "Time Extractor Backup"
+        self.description = "Extracts the timestamps for the corresponding activity labels from a patient journey."
+
+    @log_execution_time(Path("extraction/logs/execution_time.log"))
+    def execute(self, df, patient_journey=None):
+        """
+        Adds the columns "start", "end" and "duration" to the dataframe and stores it in ``self.result``.
+
+        ``df`` needs an "event_information" column. ``patient_journey`` is forwarded to ``Module.execute``; the
+        prompts below read ``self.patient_journey``, which is presumably set there (verify against the base class).
+        Raises a ValueError if the model answers with a date that does not match ``DATE_FORMAT``.
+        """
+        super().execute(df, patient_journey)
+        # Order matters: the end date prompt includes the start date of the row.
+        df["start"] = df["event_information"].apply(self.__extract_start_date)
+        df["end"] = df.apply(self.__extract_end_date, axis=1)
+        df["duration"] = df.apply(self.__calculate_row_duration, axis=1)
+        self.result = df
+
+    def __extract_start_date(self, activity_label):
+        """Asks the model for the start date of the given activity label and returns it as a date string."""
+        messages = [
+            {"role": "system", "content": START_DATE_CONTEXT},
+            {
+                "role": "user",
+                "content": f"The text: {self.patient_journey} \nThe activity label: {activity_label}",
+            },
+        ]
+
+        return self.__query_date(messages)
+
+    def __extract_end_date(self, row):
+        """Asks the model for the end date of the event in the given row and returns it as a date string."""
+        messages = [
+            {"role": "system", "content": END_DATE_CONTEXT},
+            {
+                "role": "user",
+                "content": f"\nThe text: {self.patient_journey} \nThe bulletpoint: "
+                           f"{row['event_information']} \nThe start date: {row['start']}",
+            },
+        ]
+
+        return self.__query_date(messages)
+
+    def __query_date(self, messages):
+        """
+        Sends the messages to the model and returns the answer as a validated date string.
+
+        Raises a ValueError instead of using ``assert``, because assertions are removed when Python runs with -O
+        and an unchecked answer would only fail later when the duration is calculated.
+        """
+        # Surrounding whitespace or a trailing newline would make strptime reject an otherwise valid answer.
+        date = u.query_gpt(messages).strip()
+        if not self.is_valid_date_format(date, self.DATE_FORMAT):
+            raise ValueError(f"Date {date} has no valid format.")
+
+        return date
+
+    @staticmethod
+    def __calculate_row_duration(row):
+        """
+        Returns the time between the "start" and "end" of the row as "HH:MM:SS", or "N/A" if either is "N/A".
+
+        If the end lies before the start, the result is the absolute duration prefixed with "-".
+        """
+        if row["start"] == "N/A" or row["end"] == "N/A":
+            return "N/A"
+        start_date = datetime.strptime(row["start"], TimeExtractorBackup.DATE_FORMAT)
+        end_date = datetime.strptime(row["end"], TimeExtractorBackup.DATE_FORMAT)
+        total_seconds = int((end_date - start_date).total_seconds())
+        # divmod floors towards negative infinity, so the sign is handled separately from the magnitude.
+        sign = "-" if total_seconds < 0 else ""
+        hours, remainder = divmod(abs(total_seconds), 3600)
+        minutes, seconds = divmod(remainder, 60)
+
+        return f"{sign}{hours:02d}:{minutes:02d}:{seconds:02d}"
+
+    @staticmethod
+    def is_valid_date_format(date_string, date_format):
+        """Returns True if ``date_string`` can be parsed with ``date_format`` by ``datetime.strptime``."""
+        try:
+            datetime.strptime(date_string, date_format)
+            return True
+        except ValueError:
+            return False
+
+# START_DATE_CONTEXT = """
+#     You are an expert in text understanding and your job is to take a given text and given summarizing bulletpoints and to add a start date to every bulletpoint.
+#     Edit the bulletpoints in a way, that you just take the existing bulletpoints and add a start date at the end of it.
+#     The information about the start date should be extracted from the text or from the context and should be as precise as possible.
+#     Do not modify the content of the bulletpoint and keep ending commas.
+#     Please use the format YYYYMMDD for the dates and extend every date by "T0000".
+#     Keep in mind, that the start date of a bullet point is not necessarily later than the start of the previous one.
+#     Also, the start date doesn't have to be the next date information in the text, but can be related to the previous.
+#     If the text talks about getting medication and then improving and the bullet point says 'improving', you should return the date of getting the medication as start date.
+#     If there is a conclusion at the end of the text and an outlook set the start date of the last bullet point to the start date of the corresponding bulletpoint.
+#     If there is really no information about the start date to be extracted from the text but there is information about events happening at the same time,
+#     use that information to draw conclusions about the start dates.
+#     If there is no information about the start date at all and there is no way of finding some, delete that bulletpoint.
+#     The only output should be the updated bullet points, nothing else!
+# """
+
+# System prompt for extracting the start date of a single activity label from the patient journey text.
+START_DATE_CONTEXT = """
+    You are provided with a natural language text containing various events. Your task is to identify the start date of 
+    a specific activity mentioned in the text. The activity label will be provided, and it is your job to extract only 
+    the start date associated with this activity from the text. 
+    Under no circumstances put anything else in the output apart from the extracted start date.
+    Please follow the following rules:
+    1. The format of the date should always be YYYYMMDDT0000. For example, 20200101T0000.
+    2. If only a month is mentioned then the date should always be the first day of the month. For example for March it should be 20200301T0000.
+    3. If the date is mentioned in a different format, please convert it to the format mentioned above.
+    4. Also consider context information from previous activities and their start dates.
+"""
+
+# END_DATE_CONTEXT = """
+#     You are an expert in text understanding and your job is to take a given text and given summarizing bulletpoints with a start date and to add an end date to every bulletpoint.
+#     It is important, that every bullet point gets an end date, even if it is the same as the start date.
+#     Edit the bulletpoints in a way, that you just take the existing bulletpoints and add an end date to it.
+#     The information about the end date should be extracted from the text or from the context and should be as precise as possible.
+#     Please use the format YYYYMMDD for the dates and extend every date by "T0000".
+#     If the duration of an event is given, use that information to draw conclusions about the end date.
+#     If the duration of an event is not given, use the context to draw conclusions about the end date.
+#     If two bulletpoints are related, it is possible, that the end dates should match.
+#     Think about how long humans tend to stay in hospitals, how long it takes to recover from a disease, how long they practice new habits and so on.
+#     If there is no information about the end date at all, please state the start date also as the end date.
+#     The only output should be the updated bullet points, nothing else!
+# """
+
+# System prompt for extracting the end date of a single activity, given its label and start date.
+END_DATE_CONTEXT = """
+    You are provided with a natural language text containing various events. Your task is to identify the end date of 
+    a specific activity mentioned in the text. The activity label and the corresponding start date will be provided, 
+    and it is your job to extract only the end date associated with this activity from the text. 
+    Under no circumstances put anything else in the output apart from the extracted end date.
+    Please follow the following rules:
+    1. The format of the date should always be YYYYMMDDT0000. For example, 20200101T0000.
+    2. If only a month is mentioned then the date should always be the first day of the month. For example for March it should be 20200301T0000.
+    3. If the date is mentioned in a different format, please convert it to the format mentioned above.
+    4. Also consider context information from previous activities and their start dates and end dates. The end dates should
+    follow logically from the start dates.
+    5. End dates can not be earlier than the start dates.
+"""
+
diff --git a/tracex/extraction/logic/orchestrator.py b/tracex/extraction/logic/orchestrator.py
@@ -6,7 +6,7 @@
 from . import Module, logging
 from .modules.module_patient_journey_generator import PatientJourneyGenerator
 from .modules.module_activity_labeler import ActivityLabeler
-from .modules.module_time_extractor import TimeExtractor
+from .modules.module_time_extractor_backup import TimeExtractorBackup
 from .modules.module_location_extractor import LocationExtractor
 from .modules.module_event_type_classifier import EventTypeClassifier
 
@@ -28,7 +28,7 @@ class ExtractionConfiguration:
         "patient_journey_generation": PatientJourneyGenerator,
         "activity_labeling": ActivityLabeler,
         "event_type_classification": EventTypeClassifier,
-        "time_extraction": TimeExtractor,
+        "time_extraction": TimeExtractorBackup,
         "location_extraction": LocationExtractor,
     }
     activity_key: Optional[str] = "event_type"

diff --git a/tracex/extraction/logic/utils.py b/tracex/extraction/logic/utils.py
@@ -43,9 +43,10 @@ def get_decision(question):
 
 def query_gpt(
     messages,
+    max_tokens=MAX_TOKENS,
+    temperature=TEMPERATURE_SUMMARIZING,
     tools=function_calls.TOOLS,
     tool_choice="none",
-    temperature=TEMPERATURE_SUMMARIZING,
 ):
     @log_tokens_used(Path("extraction/logs/tokens_used.log"))
     def make_api_call():
@@ -54,7 +55,7 @@ def make_api_call():
         _response = client.chat.completions.create(
             model=MODEL,
             messages=messages,
-            max_tokens=MAX_TOKENS,
+            max_tokens=max_tokens,
             temperature=temperature,
             tools=tools,
             tool_choice=tool_choice,

diff --git a/tracex/extraction/views.py b/tracex/extraction/views.py
@@ -115,7 +115,7 @@ def get_context_data(self, **kwargs):
             "attribute_location": orchestrator.configuration.locations,
         }
         is_extracted = (
-            True
+            False
             if self.request.session.get("is_extracted") is None
             else self.request.session.get("is_extracted")
         )
@@ -124,6 +124,7 @@ def get_context_data(self, **kwargs):
         if not (IS_TEST or is_extracted):
             orchestrator.run()
             single_trace_df = orchestrator.data
+            print(single_trace_df)
             single_trace_df["caseID"] = single_trace_df["caseID"].astype(str)
             single_trace_df["start"] = pd.to_datetime(single_trace_df["start"])
             single_trace_df["end"] = pd.to_datetime(single_trace_df["end"])
@@ -141,6 +142,7 @@ def get_context_data(self, **kwargs):
                 name="single_trace",
                 key=orchestrator.configuration.activity_key,
             )
+            self.request.session["is_extracted"] = True
         else:
             output_path_xes = (
                 f"{str(utils.output_path / 'single_trace')}_event_type.xes"
@@ -178,7 +180,6 @@ def get_context_data(self, **kwargs):
         context["all_xes_html"] = utils.Conversion.create_html_from_xes(
             all_traces_df_filtered
         ).getvalue()
-        self.request.session["is_extracted"] = False
 
         return context