Added log-prob metrics and removed the chain-of-thought (CoT) and function-calling (FC) steps from the event type and location prompts
bptlab · Jan 28, 2024 · e8eced4 · e8eced4
1 parent cb7db64
commit e8eced4
Show file tree

Hide file tree

Showing 7 changed files with 180 additions and 62 deletions.
diff --git a/test.py b/test.py
@@ -0,0 +1,35 @@
+# pylint: skip-file
+# pylint: enable=wrong-import-position
+import os
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "tracex.tracex.settings")
+
+from tracex.extraction.prototype import input_inquiry as ii
+from tracex.extraction.prototype import input_handling as ih
+from tracex.extraction.prototype import utils as u
+from tracex.extraction.prototype import function_calls as fc
+from tracex.extraction.prototype import metrics as m
+from tracex.extraction.prototype import create_xes as x
+
+# Read the synthetic patient journey; the context manager closes the file handle.
+with open(u.input_path / "journey_synth_covid_0.txt") as journey_file:
+    text = journey_file.read()
+
+# Measure the event-type classification, then the location classification.
+df = m.measure_event_types(text)
+print(df)
+df = m.measure_location(text)
+print(df)
+# df now holds the location measurement only, so that is what gets exported.
+ih.convert_dataframe_to_csv(df)
+
+# Remaining pipeline steps, currently disabled:
+# df = ih.convert_text_to_bulletpoints(text)
+# df = ih.add_start_dates(text, df)
+# df = ih.add_end_dates(text, df)
+# df = ih.add_durations(df)
+# df = ih.add_event_types(df)
+# df = ih.add_locations(df)
+# ih.convert_dataframe_to_csv(df)
+# x.create_xes(u.output_path / "single_trace.csv", "test", "event_information")
+
diff --git a/tracex/extraction/content/outputs/single_trace.csv b/tracex/extraction/content/outputs/single_trace.csv
@@ -1,9 +1,9 @@
-case_id,event_information,start_date,end_date,duration,event_type,attribute_location
-0,"experiencing first Covid-19 symptoms: mild cough, fatigue",20220601T0000,20220608T0000,168:00:00,Symptom Onset,Home
-0,brushing off symptoms as common cold,20220601T0000,20220608T0000,168:00:00,Other,Home
-0,developing high fever and difficulty breathing,20220601T0000,20220611T0000,240:00:00,Symptom Onset,Hospital
-0,deciding to get tested for Covid-19,20220601T0000,20220617T0000,384:00:00,Diagnosis,Doctors
-0,going to local testing center,20220617T0000,20220617T0000,00:00:00,Diagnosis,Hospital
-0,undergoing PCR test,20220617T0000,20220617T0000,00:00:00,Diagnosis,Home
-0,receiving negative test results,20220617T0000,20220619T0000,48:00:00,Diagnosis,Doctors
-0,getting infected and testing positive,20220617T0000,20220617T0000,00:00:00,Symptom Onset,Home
+case_id,event_information,event_type,location,"(token1, lin_prob1)","(token2, lin_prob2)"
+0,experiencing first Covid-19 symptoms in June 2022,Symptom Onset,Doctors,"('Doctors', 47.42)","('Home', 45.07)"
+0,brushing off symptoms as common cold,Lifestyle Change,Home,"('Home', 86.14)","('Doctors', 12.56)"
+0,developing high fever and difficulty breathing,Symptom Onset,Doctors,"('Doctors', 68.56)","('Hospital', 13.31)"
+0,getting tested for Covid-19,Diagnosis,Doctors,"('Doctors', 91.6)","('Hospital', 3.18)"
+0,visiting local testing center,Doctor visit,Doctors,"('Doctors', 99.36)","('Doctor', 0.27)"
+0,undergoing PCR test,Diagnosis,Doctors,"('Doctors', 82.7)","('Di', 7.96)"
+0,receiving negative test results,"The bulletpoint ""receiving negative test results"" can be classified as ""Symptom Offset"".",Doctors,"('Doctors', 55.6)","('Home', 39.8)"
+0,getting infected and testing positive,Diagnosis,Doctors,"('Doctors', 90.93)","('Hospital', 3.57)"
diff --git a/tracex/extraction/prototype/input_handling.py b/tracex/extraction/prototype/input_handling.py
@@ -175,19 +175,19 @@ def add_event_types(df):
             },
             {"role": "assistant", "content": p.EVENT_TYPE_ANSWER},
         ]
-        output = u.query_gpt(messages)
+        event_type = u.query_gpt(messages)
 
-        fc_message = [
-            {"role": "system", "content": p.FC_EVENT_TYPE_CONTEXT},
-            {"role": "user", "content": p.FC_EVENT_TYPE_PROMPT + "The text: " + output},
-        ]
-        event_type = u.query_gpt(
-            fc_message,
-            tool_choice={"type": "function", "function": {"name": "add_event_type"}},
-        )
+        # fc_message = [
+        #     {"role": "system", "content": p.FC_EVENT_TYPE_CONTEXT},
+        #     {"role": "user", "content": p.FC_EVENT_TYPE_PROMPT + "The text: " + output},
+        # ]
+        # event_type = u.query_gpt(
+        #     fc_message,
+        #     tool_choice={"type": "function", "function": {"name": "add_event_type"}},
+        # )
         new_row = pd.DataFrame([event_type], columns=[name])
         new_df = pd.concat([new_df, new_row], ignore_index=True)
-        document_intermediates(output)
+        document_intermediates(event_type)
     df = pd.concat([df, new_df], axis=1)
     return df
 
@@ -211,16 +211,16 @@ def add_locations(df):
             },
             {"role": "assistant", "content": p.LOCATION_ANSWER},
         ]
-        output = u.query_gpt(messages)
+        location = u.query_gpt(messages)
 
-        fc_message = [
-            {"role": "system", "content": p.FC_LOCATION_CONTEXT},
-            {"role": "user", "content": p.FC_LOCATION_PROMPT + "The text: " + output},
-        ]
-        location = u.query_gpt(
-            fc_message,
-            tool_choice={"type": "function", "function": {"name": "add_location"}},
-        )
+        # fc_message = [
+        #     {"role": "system", "content": p.FC_LOCATION_CONTEXT},
+        #     {"role": "user", "content": p.FC_LOCATION_PROMPT + "The text: " + output},
+        # ]
+        # location = u.query_gpt(
+        #     fc_message,
+        #     tool_choice={"type": "function", "function": {"name": "add_location"}},
+        # )
         new_row = pd.DataFrame([location], columns=[name])
         new_df = pd.concat([new_df, new_row], ignore_index=True)
         document_intermediates(output)

diff --git a/tracex/extraction/prototype/metrics.py b/tracex/extraction/prototype/metrics.py
@@ -0,0 +1,73 @@
+import pandas as pd
+import numpy as np
+
+from . import utils as u
+from . import prompts as p
+from . import input_handling as ih
+
+
+def _measure_column(df, column, context, answer, build_prompt):
+    """Classify every row of df via GPT and append the answer with its top-2 token odds.
+
+    build_prompt maps one row (a list of cell values) to the user message; context and
+    answer are the system and assistant messages. The returned frame is df extended by
+    `column` and two (token, percentage) tuples for the most likely first answer tokens.
+    """
+    columns = [column, "(token1, lin_prob1)", "(token2, lin_prob2)"]
+    rows = []
+    for item in df.values.tolist():
+        messages = [
+            {"role": "system", "content": context},
+            {"role": "user", "content": build_prompt(item)},
+            {"role": "assistant", "content": answer},
+        ]
+        content, top_logprobs = u.query_gpt(messages, logprobs=True, top_logprobs=2)
+        row = [content]
+        # query_gpt returns the candidates for the first generated token only.
+        for candidate in top_logprobs:
+            row.append((candidate.token, calculate_linear_probability(candidate.logprob)))
+        rows.append(row)
+        # Log each result as a one-row table with the same columns as the output.
+        ih.document_intermediates(pd.DataFrame([row], columns=columns).to_string())
+    # Build the frame once instead of calling pd.concat per row, which re-copies all
+    # previously collected rows on every iteration.
+    new_df = pd.DataFrame(rows, columns=columns)
+    return pd.concat([df, new_df], axis=1)
+
+
+def measure_event_types(text):
+    """Extract bulletpoints from text and measure how confidently GPT assigns event types.
+
+    Returns the bulletpoint frame extended by "event_type" and the top-2 token columns.
+    """
+    df = ih.convert_text_to_bulletpoints(text)
+    return _measure_column(
+        df,
+        "event_type",
+        p.EVENT_TYPE_CONTEXT,
+        p.EVENT_TYPE_ANSWER,
+        lambda item: p.EVENT_TYPE_PROMPT + "\nThe bulletpoint: " + item[0],
+    )
+
+
+def measure_location(text):
+    """Extract bulletpoints and event types from text, then measure the location choice.
+
+    Returns the event-type frame extended by "location" and the top-2 token columns.
+    """
+    df = ih.add_event_types(ih.convert_text_to_bulletpoints(text))
+    event_type_key = df.columns.get_loc("event_type")
+    return _measure_column(
+        df,
+        "location",
+        p.LOCATION_CONTEXT,
+        p.LOCATION_ANSWER,
+        lambda item: p.LOCATION_PROMPT + item[0] + "\nThe category: " + item[event_type_key],
+    )
+
+
+
+
+def calculate_linear_probability(logprob):
+    """Convert a natural-log probability to a percentage rounded to two decimals."""
+    return np.around(100 * np.exp(logprob), decimals=2)
diff --git a/tracex/extraction/prototype/pipeline_without_user_io.py b/tracex/extraction/prototype/pipeline_without_user_io.py
@@ -1,7 +1,6 @@
 """Module to run the pipeline without user interaction."""
 import input_inquiry as ii
 import input_handling as ih
-import output_handling as oh
 import utils as u
 
 
@@ -10,7 +9,6 @@ def run_pipeline():
     input_text = ii.create_patient_journey()
     u.pause_between_queries()
     ih.convert_text_to_csv(input_text)
-    oh.get_output_without_user_io()
 
 
 REPS = 1

diff --git a/tracex/extraction/prototype/prompts.py b/tracex/extraction/prototype/prompts.py
@@ -218,51 +218,57 @@ def life_circumstances_prompt(sex):
 
 # Adding of a event type to every bulletpoint
 EVENT_TYPE_CONTEXT = """
-    You are an expert in text categorization and your job is to take a given bulletpoint and to add one of given event type to it.
-    The given event types are 'Symptom Onset', 'Symptom Offset', 'Diagnosis', 'Doctor visit', 'Treatment', 'Hospital stay', 'Medication', 'Lifestyle Change' and 'Feelings'.
-    It is important, that every bulletpoint gets an event type.
+    You are an expert in text categorization and your job is to take given bulletpoints and to add one of given event type to every bulletpoint.
+    The given event types are 'Symptom Onset', 'Symptom Offset', 'Diagnosis', 'Doctor visit', 'Treatment', 'Hospital admission', 'Hospital discharge', 'Medication', 'Lifestyle Change' and 'Feelings'.
+    It is important, that every bullet point gets an event type.
     Furthermore it is really important, that that event type is correct and not 'Other'.
-    The only output should be the event type!
+    The only output should be the updated bullet points, nothing else!
 """
 EVENT_TYPE_PROMPT = """
-    Here is the bulletpoint for which you should extract the event type.
-    Explain step by step your conclusions your choice of location: 'Symptom Onset', 'Symptom Offset', 'Diagnosis', 'Doctor visit', 'Treatment', 'Hospital stay', 'Medication', 'Lifestyle Change' and 'Feelings'
+    You will be given a bulletpoint of a patient journey.
+    Classify the bulletpoint into one of the following event types: Symptom Onset, Symptom Offset, Diagnosis, Doctor visit, Treatment, Hospital stay, Medication, Lifestyle Change and Feelings.
+    Return only the name of the event type, and nothing else.
+    MAKE SURE your output is one of the nine event types stated. ONLY return the name of the event type, and nothing else!
+
 """
 EVENT_TYPE_ANSWER = """
     For example for the bulletpoint 'visiting doctor's' you should return 'Doctors Visit'.
     For 'testing positive for Covid19' you should return 'Diagnosis' and for 'getting hospitalized' you should return 'Hospital stay'.
 """
-FC_EVENT_TYPE_CONTEXT = """
-    You are an expert in extracting information. You easily detect event types and extract them as they are without changing any format. The only possible event types are
-    'Symptom Onset', 'Symptom Offset', 'Diagnosis', 'Doctor visit', 'Treatment', 'Hospital stay', 'Medication', 'Lifestyle Change' and 'Feelings'.
-"""
-FC_EVENT_TYPE_PROMPT = """
-    Please extract the following event type of the text without changing the given format:
-"""
+# FC_EVENT_TYPE_CONTEXT = """
+#     You are an expert in extracting information. You easily detect event types and extract them as they are without changing any format. The only possible event types are
+#     'Symptom Onset', 'Symptom Offset', 'Diagnosis', 'Doctor visit', 'Treatment', 'Hospital stay', 'Medication', 'Lifestyle Change' and 'Feelings'.
+# """
+# FC_EVENT_TYPE_PROMPT = """
+#     Please extract the following event type of the text without changing the given format:
+# """
 
 # Adding of a location type to every bulletpoint
 LOCATION_CONTEXT = """
-    You are an expert in text categorization and your job is to take a given bulletpoint and a category and to add one of given locations to it.
+    You are an expert in text categorization and your job is to take given bulletpoints and to add one of given locations to every bulletpoint.
     The given locations are 'Home', 'Hospital' and 'Doctors'.
-    Take the category but also the content of the bulletpoint into account.
     If it is unclear, where the person is, please use 'Home'.
-    It is important, that every bulletpoint gets a location.
-    Furthermore it is really important, that that location is correct.
-    The only output should be the location.
+    It is important, that every bullet point gets an event type.
+    Furthermore it is really important, that that event type is correct.
+    The only (!) output should be the updated bullet points, nothing else!
+    Please do not add a phrase like "here are your bulletpoints" or something like that..
 """
 LOCATION_PROMPT = """
-    Here is the bulletpoint for which you should extract the location.
-    Explain step by step your conclusions your choice of location: 'Home' or 'Hospital' or 'Doctors' or 'Other'.
+    You will be given a bulletpoint and the according event type of a patient journey.
+    Classify the bulletpoint into one of the following locations: Home, Hospital and Doctors.
+    Return only the name of the location, and nothing else.
+    MAKE SURE your output is one of the three locations stated. ONLY return the name of the location, and nothing else!
+    Here is the bulletpoint and the event type for which you should extract the location:
 """
 LOCATION_ANSWER = """
     For example for the bulletpoints 'visiting doctor's', you should return 'Doctors'.
     For the point 'testing positive for Covid19', you also should return 'Doctors'.
     For 'getting hospitalized' the output is 'Hospital'.
 """
-FC_LOCATION_CONTEXT = """
-    You are an expert in extracting information. You easily detect locations and extract them as they are without changing any format.
-    The only possible locations are 'Home', 'Hospital', 'Doctors' and 'Other'.
-"""
-FC_LOCATION_PROMPT = """
-    Please extract the following location of the text without changing the given date format:
-"""
+# FC_LOCATION_CONTEXT = """
+#     You are an expert in extracting information. You easily detect locations and extract them as they are without changing any format.
+#     The only possible locations are 'Home', 'Hospital', 'Doctors' and 'Other'.
+# """
+# FC_LOCATION_PROMPT = """
+#     Please extract the following location of the text without changing the given date format:
+# """
diff --git a/tracex/extraction/prototype/utils.py b/tracex/extraction/prototype/utils.py
@@ -44,8 +44,7 @@ def get_decision(question):
 
 
 def query_gpt(
-    messages, tools=fc.TOOLS, tool_choice="none", temperature=TEMPERATURE_SUMMARIZING
-):
+    messages, tools=fc.TOOLS, tool_choice="none", temperature=TEMPERATURE_SUMMARIZING, logprobs=False, top_logprobs=None):
     """Queries the GPT engine."""
     response = client.chat.completions.create(
         model=MODEL,
@@ -54,10 +53,17 @@ def query_gpt(
         temperature=temperature,
         tools=tools,
         tool_choice=tool_choice,
+        logprobs=logprobs,
+        top_logprobs=top_logprobs,
     )
-    if tool_choice == "none":
-        output = response.choices[0].message.content
-    else:
+    if tool_choice != "none":
         api_response = response.choices[0].message.tool_calls[0].function.arguments
         output = json.loads(api_response)["output"][0]
+
+    elif logprobs:
+        top_logprobs = response.choices[0].logprobs.content[0].top_logprobs
+        content = response.choices[0].message.content
+        return content, top_logprobs
+    else:
+        output = response.choices[0].message.content
     return output