Skip to content

Commit

Permalink
added log probs metrics and remove cot+fc of event type and location …
Browse files Browse the repository at this point in the history
…prompts
  • Loading branch information
tkv29 committed Jan 28, 2024
1 parent cb7db64 commit e8eced4
Show file tree
Hide file tree
Showing 7 changed files with 180 additions and 62 deletions.
35 changes: 35 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# pylint: skip-file
# pylint: enable=wrong-import-position
import os

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "tracex.tracex.settings")

from tracex.extraction.prototype import input_inquiry as ii
from tracex.extraction.prototype import input_handling as ih
from tracex.extraction.prototype import utils as u
from tracex.extraction.prototype import function_calls as fc
from tracex.extraction.prototype import metrics as m
from tracex.extraction.prototype import create_xes as x

# Load the synthetic patient journey. The context manager closes the file
# handle as soon as the text is read instead of leaving that to the garbage
# collector.
with open(u.input_path / "journey_synth_covid_0.txt") as journey_file:
    text = journey_file.read()
# df = ih.convert_text_to_bulletpoints(text)
# print(df)

# Measure the event type classification together with its token probabilities.
df = m.measure_event_types(text)
print(df)
# Measure the location classification; only this second frame is exported.
df = m.measure_location(text)
print(df)
ih.convert_dataframe_to_csv(df)
# df = ih.add_start_dates(text, df)
# print(df)
# df = ih.add_end_dates(text, df)
# print(df)
# df = ih.add_durations(df)
# print(df)
# df = ih.add_event_types(df)
# print(df)
# df = ih.add_locations(df)
# print(df)
# ih.convert_dataframe_to_csv(df)
# x.create_xes(u.output_path / "single_trace.csv", "test", "event_information")

18 changes: 9 additions & 9 deletions tracex/extraction/content/outputs/single_trace.csv
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
case_id,event_information,start_date,end_date,duration,event_type,attribute_location
0,"experiencing first Covid-19 symptoms: mild cough, fatigue",20220601T0000,20220608T0000,168:00:00,Symptom Onset,Home
0,brushing off symptoms as common cold,20220601T0000,20220608T0000,168:00:00,Other,Home
0,developing high fever and difficulty breathing,20220601T0000,20220611T0000,240:00:00,Symptom Onset,Hospital
0,deciding to get tested for Covid-19,20220601T0000,20220617T0000,384:00:00,Diagnosis,Doctors
0,going to local testing center,20220617T0000,20220617T0000,00:00:00,Diagnosis,Hospital
0,undergoing PCR test,20220617T0000,20220617T0000,00:00:00,Diagnosis,Home
0,receiving negative test results,20220617T0000,20220619T0000,48:00:00,Diagnosis,Doctors
0,getting infected and testing positive,20220617T0000,20220617T0000,00:00:00,Symptom Onset,Home
case_id,event_information,event_type,location,"(token1, lin_prob1)","(token2, lin_prob2)"
0,experiencing first Covid-19 symptoms in June 2022,Symptom Onset,Doctors,"('Doctors', 47.42)","('Home', 45.07)"
0,brushing off symptoms as common cold,Lifestyle Change,Home,"('Home', 86.14)","('Doctors', 12.56)"
0,developing high fever and difficulty breathing,Symptom Onset,Doctors,"('Doctors', 68.56)","('Hospital', 13.31)"
0,getting tested for Covid-19,Diagnosis,Doctors,"('Doctors', 91.6)","('Hospital', 3.18)"
0,visiting local testing center,Doctor visit,Doctors,"('Doctors', 99.36)","('Doctor', 0.27)"
0,undergoing PCR test,Diagnosis,Doctors,"('Doctors', 82.7)","('Di', 7.96)"
0,receiving negative test results,"The bulletpoint ""receiving negative test results"" can be classified as ""Symptom Offset"".",Doctors,"('Doctors', 55.6)","('Home', 39.8)"
0,getting infected and testing positive,Diagnosis,Doctors,"('Doctors', 90.93)","('Hospital', 3.57)"
38 changes: 19 additions & 19 deletions tracex/extraction/prototype/input_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,19 +175,19 @@ def add_event_types(df):
},
{"role": "assistant", "content": p.EVENT_TYPE_ANSWER},
]
output = u.query_gpt(messages)
event_type = u.query_gpt(messages)

fc_message = [
{"role": "system", "content": p.FC_EVENT_TYPE_CONTEXT},
{"role": "user", "content": p.FC_EVENT_TYPE_PROMPT + "The text: " + output},
]
event_type = u.query_gpt(
fc_message,
tool_choice={"type": "function", "function": {"name": "add_event_type"}},
)
# fc_message = [
# {"role": "system", "content": p.FC_EVENT_TYPE_CONTEXT},
# {"role": "user", "content": p.FC_EVENT_TYPE_PROMPT + "The text: " + output},
# ]
# event_type = u.query_gpt(
# fc_message,
# tool_choice={"type": "function", "function": {"name": "add_event_type"}},
# )
new_row = pd.DataFrame([event_type], columns=[name])
new_df = pd.concat([new_df, new_row], ignore_index=True)
document_intermediates(output)
document_intermediates(event_type)
df = pd.concat([df, new_df], axis=1)
return df

Expand All @@ -211,16 +211,16 @@ def add_locations(df):
},
{"role": "assistant", "content": p.LOCATION_ANSWER},
]
output = u.query_gpt(messages)
location = u.query_gpt(messages)

fc_message = [
{"role": "system", "content": p.FC_LOCATION_CONTEXT},
{"role": "user", "content": p.FC_LOCATION_PROMPT + "The text: " + output},
]
location = u.query_gpt(
fc_message,
tool_choice={"type": "function", "function": {"name": "add_location"}},
)
# fc_message = [
# {"role": "system", "content": p.FC_LOCATION_CONTEXT},
# {"role": "user", "content": p.FC_LOCATION_PROMPT + "The text: " + output},
# ]
# location = u.query_gpt(
# fc_message,
# tool_choice={"type": "function", "function": {"name": "add_location"}},
# )
new_row = pd.DataFrame([location], columns=[name])
new_df = pd.concat([new_df, new_row], ignore_index=True)
document_intermediates(output)
Expand Down
73 changes: 73 additions & 0 deletions tracex/extraction/prototype/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import pandas as pd
import numpy as np

from . import utils as u
from . import prompts as p
from . import input_handling as ih


def measure_event_types(text):
    """Classify each bulletpoint's event type and record token probabilities.

    The text is turned into bulletpoints with
    ``ih.convert_text_to_bulletpoints``. For every row of that frame the
    first column is sent to the model with log probabilities enabled, and
    the answer plus the returned top token candidates (converted to
    percentages) are stored as one result row. Each result row is also
    written to the intermediates log and printed.

    Args:
        text: The patient journey text to extract bulletpoints from.

    Returns:
        The bulletpoint DataFrame with the columns ``event_type``,
        ``(token1, lin_prob1)`` and ``(token2, lin_prob2)`` appended.
    """
    columns = ["event_type", "(token1, lin_prob1)", "(token2, lin_prob2)"]
    df = ih.convert_text_to_bulletpoints(text)

    # Collect plain rows and build the result frame once after the loop.
    # Growing a DataFrame with pd.concat per iteration copies all previous
    # rows every time and starts from an empty frame, which recent pandas
    # versions warn about.
    rows = []
    for item in df.values.tolist():
        messages = [
            {"role": "system", "content": p.EVENT_TYPE_CONTEXT},
            {
                "role": "user",
                "content": p.EVENT_TYPE_PROMPT + "\nThe bulletpoint: " + item[0],
            },
            {"role": "assistant", "content": p.EVENT_TYPE_ANSWER},
        ]
        content, top_logprobs = u.query_gpt(messages, logprobs=True, top_logprobs=2)

        metrics = [content]
        metrics.extend(
            (candidate.token, calculate_linear_probability(candidate.logprob))
            for candidate in top_logprobs
        )

        # A single-row frame is still built so the logged/printed text keeps
        # its previous format (header line plus one row with index 0).
        new_row = pd.DataFrame([metrics], columns=columns)
        rows.append(metrics)
        ih.document_intermediates(new_row.to_string())
        print(new_row.to_string())

    new_df = pd.DataFrame(rows, columns=columns)
    return pd.concat([df, new_df], axis=1)

def measure_location(text):
    """Classify each bulletpoint's location and record token probabilities.

    The text is turned into bulletpoints and passed through
    ``ih.add_event_types``; the resulting frame must contain an
    ``event_type`` column. For every row, the first column and the event
    type are sent to the model with log probabilities enabled, and the
    answer plus the returned top token candidates (converted to
    percentages) are stored as one result row. Each result row is also
    written to the intermediates log.

    Args:
        text: The patient journey text to extract bulletpoints from.

    Returns:
        The event-type DataFrame with the columns ``location``,
        ``(token1, lin_prob1)`` and ``(token2, lin_prob2)`` appended.
    """
    columns = ["location", "(token1, lin_prob1)", "(token2, lin_prob2)"]
    df = ih.add_event_types(ih.convert_text_to_bulletpoints(text))
    event_type_key = df.columns.get_loc("event_type")

    # Collect plain rows and build the result frame once after the loop.
    # Growing a DataFrame with pd.concat per iteration copies all previous
    # rows every time and starts from an empty frame, which recent pandas
    # versions warn about.
    rows = []
    for item in df.values.tolist():
        messages = [
            {"role": "system", "content": p.LOCATION_CONTEXT},
            {
                "role": "user",
                "content": p.LOCATION_PROMPT
                + item[0]
                + "\nThe category: "
                + item[event_type_key],
            },
            {"role": "assistant", "content": p.LOCATION_ANSWER},
        ]
        content, top_logprobs = u.query_gpt(messages, logprobs=True, top_logprobs=2)

        metrics = [content]
        metrics.extend(
            (candidate.token, calculate_linear_probability(candidate.logprob))
            for candidate in top_logprobs
        )

        # A single-row frame is still built so the logged text keeps its
        # previous format (header line plus one row with index 0).
        new_row = pd.DataFrame([metrics], columns=columns)
        rows.append(metrics)
        ih.document_intermediates(new_row.to_string())

    new_df = pd.DataFrame(rows, columns=columns)
    return pd.concat([df, new_df], axis=1)




def calculate_linear_probability(logprob):
    """Convert a log probability into a percentage rounded to two decimals.

    Args:
        logprob: Natural-log probability of a token.

    Returns:
        ``exp(logprob)`` scaled to the 0-100 range and rounded to two
        decimal places.
    """
    percentage = np.exp(logprob) * 100
    return np.round(percentage, 2)
2 changes: 0 additions & 2 deletions tracex/extraction/prototype/pipeline_without_user_io.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Module to run the pipeline without user interaction."""
import input_inquiry as ii
import input_handling as ih
import output_handling as oh
import utils as u


Expand All @@ -10,7 +9,6 @@ def run_pipeline():
input_text = ii.create_patient_journey()
u.pause_between_queries()
ih.convert_text_to_csv(input_text)
oh.get_output_without_user_io()


REPS = 1
Expand Down
60 changes: 33 additions & 27 deletions tracex/extraction/prototype/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,51 +218,57 @@ def life_circumstances_prompt(sex):

# Adding of a event type to every bulletpoint
EVENT_TYPE_CONTEXT = """
You are an expert in text categorization and your job is to take a given bulletpoint and to add one of given event type to it.
The given event types are 'Symptom Onset', 'Symptom Offset', 'Diagnosis', 'Doctor visit', 'Treatment', 'Hospital stay', 'Medication', 'Lifestyle Change' and 'Feelings'.
It is important, that every bulletpoint gets an event type.
You are an expert in text categorization and your job is to take given bulletpoints and to add one of given event type to every bulletpoint.
The given event types are 'Symptom Onset', 'Symptom Offset', 'Diagnosis', 'Doctor visit', 'Treatment', 'Hospital admission', 'Hospital discharge', 'Medication', 'Lifestyle Change' and 'Feelings'.
It is important, that every bullet point gets an event type.
Furthermore it is really important, that that event type is correct and not 'Other'.
The only output should be the event type!
The only output should be the updated bullet points, nothing else!
"""
EVENT_TYPE_PROMPT = """
Here is the bulletpoint for which you should extract the event type.
Explain step by step your conclusions your choice of location: 'Symptom Onset', 'Symptom Offset', 'Diagnosis', 'Doctor visit', 'Treatment', 'Hospital stay', 'Medication', 'Lifestyle Change' and 'Feelings'
You will be given a bulletpoint of a patient journey.
Classify the bulletpoint into one of the following event types: Symptom Onset, Symptom Offset, Diagnosis, Doctor visit, Treatment, Hospital stay, Medication, Lifestyle Change and Feelings.
Return only the name of the event type, and nothing else.
MAKE SURE your output is one of the nine event types stated. ONLY return the name of the event type, and nothing else!
"""
EVENT_TYPE_ANSWER = """
For example for the bulletpoint 'visiting doctor's' you should return 'Doctors Visit'.
For 'testing positive for Covid19' you should return 'Diagnosis' and for 'getting hospitalized' you should return 'Hospital stay'.
"""
FC_EVENT_TYPE_CONTEXT = """
You are an expert in extracting information. You easily detect event types and extract them as they are without changing any format. The only possible event types are
'Symptom Onset', 'Symptom Offset', 'Diagnosis', 'Doctor visit', 'Treatment', 'Hospital stay', 'Medication', 'Lifestyle Change' and 'Feelings'.
"""
FC_EVENT_TYPE_PROMPT = """
Please extract the following event type of the text without changing the given format:
"""
# FC_EVENT_TYPE_CONTEXT = """
# You are an expert in extracting information. You easily detect event types and extract them as they are without changing any format. The only possible event types are
# 'Symptom Onset', 'Symptom Offset', 'Diagnosis', 'Doctor visit', 'Treatment', 'Hospital stay', 'Medication', 'Lifestyle Change' and 'Feelings'.
# """
# FC_EVENT_TYPE_PROMPT = """
# Please extract the following event type of the text without changing the given format:
# """

# Adding of a location type to every bulletpoint
LOCATION_CONTEXT = """
You are an expert in text categorization and your job is to take a given bulletpoint and a category and to add one of given locations to it.
You are an expert in text categorization and your job is to take given bulletpoints and to add one of given locations to every bulletpoint.
The given locations are 'Home', 'Hospital' and 'Doctors'.
Take the category but also the content of the bulletpoint into account.
If it is unclear, where the person is, please use 'Home'.
It is important, that every bulletpoint gets a location.
Furthermore it is really important, that that location is correct.
The only output should be the location.
It is important, that every bullet point gets an event type.
Furthermore it is really important, that that event type is correct.
The only (!) output should be the updated bullet points, nothing else!
Please do not add a phrase like "here are your bulletpoints" or something like that..
"""
LOCATION_PROMPT = """
Here is the bulletpoint for which you should extract the location.
Explain step by step your conclusions your choice of location: 'Home' or 'Hospital' or 'Doctors' or 'Other'.
You will be given a bulletpoint and the according event type of a patient journey.
Classify the bulletpoint into one of the following locations: Home, Hospital and Doctors.
Return only the name of the location, and nothing else.
MAKE SURE your output is one of the three locations stated. ONLY return the name of the location, and nothing else!
Here is the bulletpoint and the event type for which you should extract the location:
"""
LOCATION_ANSWER = """
For example for the bulletpoints 'visiting doctor's', you should return 'Doctors'.
For the point 'testing positive for Covid19', you also should return 'Doctors'.
For 'getting hospitalized' the output is 'Hospital'.
"""
FC_LOCATION_CONTEXT = """
You are an expert in extracting information. You easily detect locations and extract them as they are without changing any format.
The only possible locations are 'Home', 'Hospital', 'Doctors' and 'Other'.
"""
FC_LOCATION_PROMPT = """
Please extract the following location of the text without changing the given date format:
"""
# FC_LOCATION_CONTEXT = """
# You are an expert in extracting information. You easily detect locations and extract them as they are without changing any format.
# The only possible locations are 'Home', 'Hospital', 'Doctors' and 'Other'.
# """
# FC_LOCATION_PROMPT = """
# Please extract the following location of the text without changing the given date format:
# """
16 changes: 11 additions & 5 deletions tracex/extraction/prototype/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ def get_decision(question):


def query_gpt(
messages, tools=fc.TOOLS, tool_choice="none", temperature=TEMPERATURE_SUMMARIZING
):
messages, tools=fc.TOOLS, tool_choice="none", temperature=TEMPERATURE_SUMMARIZING, logprobs=False, top_logprobs=None):
"""Queries the GPT engine."""
response = client.chat.completions.create(
model=MODEL,
Expand All @@ -54,10 +53,17 @@ def query_gpt(
temperature=temperature,
tools=tools,
tool_choice=tool_choice,
logprobs=logprobs,
top_logprobs=top_logprobs,
)
if tool_choice == "none":
output = response.choices[0].message.content
else:
if tool_choice != "none":
api_response = response.choices[0].message.tool_calls[0].function.arguments
output = json.loads(api_response)["output"][0]

elif logprobs:
top_logprobs = response.choices[0].logprobs.content[0].top_logprobs
content = response.choices[0].message.content
return content, top_logprobs
else:
output = response.choices[0].message.content
return output

0 comments on commit e8eced4

Please sign in to comment.