Merge pull request #138 from bptlab/refactor/96-improve-prompts

Refactor/96 improve prompts
bptlab · May 24, 2024 · 085a70a
2 parents 89334d7 + 33f69bb
commit 085a70a
Show file tree

Hide file tree

Showing 3 changed files with 41 additions and 70 deletions.
diff --git a/tracex_project/db.sqlite3 b/tracex_project/db.sqlite3
diff --git a/tracex_project/extraction/logic/modules/module_activity_labeler.py b/tracex_project/extraction/logic/modules/module_activity_labeler.py
@@ -25,7 +25,7 @@ def execute(
         _input=None,
         patient_journey=None,
         patient_journey_sentences=None,
-        cohort=None
+        cohort=None,
     ):
         """
         Extracts the activity labels from the patient journey with the following steps:
@@ -76,14 +76,12 @@ def __extract_activities(patient_journey_numbered, condition):
             messages.append(
                 {
                     "role": "user",
-                    "content": patient_journey_numbered
-                    + "\n\nConsider all important points regarding the course of the disease of "
-                    + condition,
+                    "content": f"Focus on those events that are related to the course of the disease of {condition}."
+                    f"\n\n{patient_journey_numbered}",
                 }
             )
         else:
             messages.append({"role": "user", "content": patient_journey_numbered})
-        messages.append({"role": "user", "content": patient_journey_numbered})
         activity_labels = u.query_gpt(messages).split("\n")
         df = pd.DataFrame(activity_labels, columns=[column_name])
         df[["activity", "sentence_id"]] = df["activity"].str.split(" #", expand=True)

diff --git a/tracex_project/extraction/logic/modules/module_patient_journey_preprocessor.py b/tracex_project/extraction/logic/modules/module_patient_journey_preprocessor.py
@@ -24,81 +24,54 @@ def execute(
         self, _input=None, patient_journey=None, patient_journey_sentences=None
     ):
         """Preprocesses the patient input for better data quality."""
-        super().execute(_input, patient_journey=patient_journey, patient_journey_sentences=patient_journey_sentences)
-        preprocessed_text = self.__spellcheck(patient_journey)
-        preprocessed_text = self.__punctuationcheck(preprocessed_text)
-        preprocessed_text = self.__identify_timestamps(preprocessed_text)
-        preprocessed_text = self.__transform_timestamps(preprocessed_text)
-        preprocessed_text = self.__interpret_timestamps(preprocessed_text)
-        preprocessed_text = self.__calculate_timestamps(preprocessed_text)
+        super().execute(
+            _input,
+            patient_journey=patient_journey,
+            patient_journey_sentences=patient_journey_sentences,
+        )
+        preprocessed_text = self.__apply_preprocessing_step(
+            patient_journey, "SPELLCHECK"
+        )
+        preprocessed_text = self.__apply_preprocessing_step(
+            preprocessed_text, "PUNCTUATION"
+        )
+        preprocessed_text = self.__apply_preprocessing_step(
+            preprocessed_text, "TIME_IDENTIFICATION"
+        )
+        preprocessed_text = self.__apply_preprocessing_step(
+            preprocessed_text, "TIME_HOLIDAYS"
+        )
+        preprocessed_text = self.__apply_preprocessing_step(
+            preprocessed_text, "TIME_GENERAL"
+        )
+        preprocessed_text = self.__apply_preprocessing_step(
+            preprocessed_text, "TIME_IDENTIFICATION"
+        )
+        preprocessed_text = self.__apply_preprocessing_step(
+            preprocessed_text, "TIME_RELATIVE"
+        )
+        preprocessed_text = self.__apply_preprocessing_step(
+            preprocessed_text, "TIME_PROPAGATE"
+        )
+
         patient_journey_sentences = self.__make_sentences(preprocessed_text)
 
         return patient_journey_sentences
 
     @staticmethod
-    def __make_sentences(text):
-        """Splits the input into a list of its sentences."""
-        text = text.replace("\n", " ")
-        text = text.split(". ")
-
-        return text
-
-    @staticmethod
-    def __spellcheck(text):
-        """Checks and corrects spelling and grammar in the input."""
-        messages = Prompt.objects.get(name="PREPROCESSING_SPELLCHECK").text
-        new_user_message = {"role": "user", "content": text}
-        messages.append(new_user_message)
-        preprocessed_text = u.query_gpt(messages)
-
-        return preprocessed_text
-
-    @staticmethod
-    def __punctuationcheck(text):
-        """Checks and corrects punctuations and commas in the input."""
-        messages = Prompt.objects.get(name="PREPROCESSING_PUNCTUATION").text
-        new_user_message = {"role": "user", "content": text}
-        messages.append(new_user_message)
-        preprocessed_text = u.query_gpt(messages)
-
-        return preprocessed_text
-
-    @staticmethod
-    def __identify_timestamps(text):
-        """Identifies and formats time specifications in the input."""
-        messages = Prompt.objects.get(name="PREPROCESSING_IDENTIFY_TIMESTAMPS").text
+    def __apply_preprocessing_step(text, prompt_name):
+        """Applies a preprocessing step based on the step name."""
+        messages = Prompt.objects.get(name=f"PREPROCESSING_{prompt_name}").text
         new_user_message = {"role": "user", "content": text}
         messages.append(new_user_message)
         preprocessed_text = u.query_gpt(messages)
 
         return preprocessed_text
 
     @staticmethod
-    def __transform_timestamps(text):
-        """Adds a timeline to the input for better understanding of events."""
-        messages = Prompt.objects.get(name="PREPROCESSING_TRANSFORM_TIMESTAMPS").text
-        new_user_message = {"role": "user", "content": text}
-        messages.append(new_user_message)
-        preprocessed_text = u.query_gpt(messages)
-
-        return preprocessed_text
-
-    @staticmethod
-    def __calculate_timestamps(text):
-        """Calculate a Timestamp to the input for better understanding of events."""
-        messages = Prompt.objects.get(name="PREPROCESSING_TIME_CALCULATION").text
-        new_user_message = {"role": "user", "content": text}
-        messages.append(new_user_message)
-        preprocessed_text = u.query_gpt(messages)
-
-        return preprocessed_text
-
-    @staticmethod
-    def __interpret_timestamps(text):
-        """Interpret a Timestamp to the input for better understanding of events."""
-        messages = Prompt.objects.get(name="PREPROCESSING_TIME_INTERPRETATION").text
-        new_user_message = {"role": "user", "content": text}
-        messages.append(new_user_message)
-        preprocessed_text = u.query_gpt(messages)
+    def __make_sentences(text):
+        """Splits the input into a list of its sentences."""
+        text = text.replace("\n", " ")
+        text = text.split(". ")
 
-        return preprocessed_text
+        return text