Skip to content

Commit

Permalink
add improved prompts
Browse files Browse the repository at this point in the history
  • Loading branch information
nils-schmitt committed Jan 10, 2024
1 parent bfe30f6 commit 894962e
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 99 deletions.
2 changes: 1 addition & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from tracex.extraction.prototype import function_calls as fc
from tracex.extraction.prototype import prompts as p

text = open(u.input_path / "journey_synth_covid_0.txt").read()
text = open(u.input_path / "journey_synth_covid_1.txt").read()
df = ih.convert_text_to_bulletpoints(text)
print(df)
df = ih.add_start_dates(text, df)
Expand Down
132 changes: 66 additions & 66 deletions tracex/extraction/prototype/function_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,101 +2,101 @@
{
"type": "function",
"function": {
"name": "add_start_dates",
"description": "this function extract the start date",
"parameters": {
"type": "object",
"properties": {
"output": {
"type": "array",
"items": {
"type": "string",
"description": "a start date",
}
"name": "add_start_dates",
"description": "this function extract the start date",
"parameters": {
"type": "object",
"properties": {
"output": {
"type": "array",
"items": {
"type": "string",
"description": "a start date in the format YYYYMMDDT0000",
},
},
"required": ["output"]
},
}
"required": ["output"],
},
},
},
{
"type": "function",
"function": {
"name": "add_end_dates",
"description": "this function extract the end date",
"parameters": {
"type": "object",
"properties": {
"output": {
"type": "array",
"items": {
"type": "string",
"description": "a end date",
}
"name": "add_end_dates",
"description": "this function extract the end date",
"parameters": {
"type": "object",
"properties": {
"output": {
"type": "array",
"items": {
"type": "string",
"description": "a end date in the format YYYYMMDDT0000",
},
},
"required": ["output"]
},
}
"required": ["output"],
},
},
},
{
"type": "function",
"function": {
"name": "add_duration",
"description": "this function extract the duration",
"parameters": {
"type": "object",
"properties": {
"output": {
"type": "array",
"items": {
"type": "string",
"description": "a duration",
}
"name": "add_duration",
"description": "this function extract the duration",
"parameters": {
"type": "object",
"properties": {
"output": {
"type": "array",
"items": {
"type": "string",
"description": "a duration",
},
},
"required": ["output"]
},
}
"required": ["output"],
},
},
},
{
"type": "function",
"function": {
"name": "add_event_type",
"description": "this function extract the event type",
"parameters": {
"type": "object",
"properties": {
"output": {
"type": "array",
"items": {
"type": "string",
"description": "an event type",
}
"name": "add_event_type",
"description": "this function extract the event type",
"parameters": {
"type": "object",
"properties": {
"output": {
"type": "array",
"items": {
"type": "string",
"description": "an event type",
},
},
"required": ["output"]
},
}
"required": ["output"],
},
},
},
{
"type": "function",
"function": {
"name": "add_location",
"description": "this function extract the location",
"parameters": {
"type": "object",
"properties": {
"output": {
"type": "array",
"items": {
"type": "string",
"description": "a location",
}
"name": "add_location",
"description": "this function extract the location",
"parameters": {
"type": "object",
"properties": {
"output": {
"type": "array",
"items": {
"type": "string",
"description": "a location",
},
},
"required": ["output"]
},
}
"required": ["output"],
},
},
},
]
]
75 changes: 57 additions & 18 deletions tracex/extraction/prototype/input_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,25 +80,23 @@ def add_start_dates(text, df):
{
"role": "user",
"content": p.BULLETPOINTS_START_DATE_PROMPT
+ "The text: "
+ "\nThe text: "
+ text
+ "\nThe bulletpoint: "
+ item[0],
},
{"role": "assistant", "content": p.BULLETPOINTS_START_DATE_ANSWER},
]
output = u.query_gpt(messages)

fc_message = [
{"role": "system", "content": p.FC_START_DATE_CONTEXT},
{
"role": "user",
"content": p.FC_START_DATE_PROMPT
+ "The text: "
+ output
},
{"role": "user", "content": p.FC_START_DATE_PROMPT + "The text: " + output},
]
start_date = u.query_gpt(fc_message, tool_choice={ "type": "function", "function": {"name": "add_start_dates"}})
start_date = u.query_gpt(
fc_message,
tool_choice={"type": "function", "function": {"name": "add_start_dates"}},
)
new_row = pd.DataFrame([start_date], columns=["start_date"])
start_date_df = pd.concat([start_date_df, new_row], ignore_index=True)
print(i, end="\r")
Expand All @@ -123,7 +121,7 @@ def add_end_dates(text, df):
{
"role": "user",
"content": p.BULLETPOINTS_END_DATE_PROMPT
+ "The text: "
+ "\nThe text: "
+ text
+ "\nThe bulletpoint: "
+ item[0]
Expand All @@ -136,14 +134,12 @@ def add_end_dates(text, df):

fc_message = [
{"role": "system", "content": p.FC_END_DATE_CONTEXT},
{
"role": "user",
"content": p.FC_END_DATE_PROMPT
+ "The text: "
+ output
},
{"role": "user", "content": p.FC_END_DATE_PROMPT + "The text: " + output},
]
end_date = u.query_gpt(fc_message, tool_choice={ "type": "function", "function": {"name": "add_end_dates"}})
end_date = u.query_gpt(
fc_message,
tool_choice={"type": "function", "function": {"name": "add_end_dates"}},
)
new_row = pd.DataFrame([end_date], columns=["end_date"])
end_date_df = pd.concat([end_date_df, new_row], ignore_index=True)
print(i, end="\r")
Expand All @@ -159,11 +155,54 @@ def add_end_dates(text, df):

def add_durations(text, bulletpoints_start):
"""Adds durations to the bulletpoints."""
duration_df = pd.DataFrame([], columns=["duration"])
list = df.values.tolist()
i = 0
for item in list:
messages = [
{"role": "system", "content": p.BULLETPOINTS_END_DATE_CONTEXT},
{
"role": "user",
"content": p.BULLETPOINTS_END_DATE_PROMPT
+ "\nThe text: "
+ text
+ "\nThe bulletpoint: "
+ item[0]
+ "\nThe start date: "
+ item[1],
},
{"role": "assistant", "content": p.BULLETPOINTS_END_DATE_ANSWER},
]
output = u.query_gpt(messages)

fc_message = [
{"role": "system", "content": p.FC_END_DATE_CONTEXT},
{"role": "user", "content": p.FC_END_DATE_PROMPT + "The text: " + output},
]
end_date = u.query_gpt(
fc_message,
tool_choice={"type": "function", "function": {"name": "add_end_dates"}},
)
new_row = pd.DataFrame([end_date], columns=["end_date"])
end_date_df = pd.concat([end_date_df, new_row], ignore_index=True)
print(i, end="\r")
i = i + 1
with open(
(u.output_path / "intermediates/bulletpoints.txt"),
"a",
) as f:
f.write("\n\n" + output)
df = pd.concat([df, end_date_df], axis=1)
return df

messages = [
{"role": "system", "content": p.BULLETPOINTS_DURATION_CONTEXT},
{
"role": "user",
"content": p.BULLETPOINTS_DURATION_PROMPT + text + "\n" + bulletpoints_start,
"content": p.BULLETPOINTS_DURATION_PROMPT
+ text
+ "\n"
+ bulletpoints_start,
},
{"role": "assistant", "content": p.BULLETPOINTS_DURATION_ANSWER},
]
Expand Down
26 changes: 12 additions & 14 deletions tracex/extraction/prototype/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,23 +131,23 @@ def life_circumstances_prompt(sex):
If there is a conclusion at the end of the text and an outlook set the start date of the last bullet point to the start date of the corresponding bulletpoint.
If there is really no information about the start date to be extracted from the text but there is information about events happening at the same time,
use that information to draw conclusions about the start dates.
Only return the date! Nothing else!
If there is only a month specified, use the first of this month as start date.
"""
BULLETPOINTS_START_DATE_PROMPT = """
Here is the text and the bulletpoint for which you should extract the start date:
Here is the text and the bulletpoint for which you should extract the start date in the format YYYYMMDD with the postfix T0000:
"""
BULLETPOINTS_START_DATE_ANSWER = """
For example for the text 'On April 1, 2020, I started experiencing mild symptoms such as a persistent cough, fatigue, and a low-grade fever.
Four days later I went to the doctor and got tested positive for Covid19.' and the bullet points
Four days later I went to the doctor and got tested positive for Covid19. In June I got infected again.' and the bullet points
'experiencing mild symptoms' you should return '20200401T0000'.
If the bullet point is 'testing positive for Covid19' you should return '20200405T0000'.
If the bulletpoint is 'testing positive for Covid19' you should return '20200405T0000'.
The bulletpoint 'getting infected again' should be returned as '20200601T0000'.
"""
FC_START_DATE_CONTEXT = """
You are an expert in extracting information. You easily detect the start dates and extract them as they are without changing any format.
You are an expert in extracting information. You easily detect the start dates in the format YYYYMMDD with the postfix 'T0000' and extract them as they are without changing any format.
"""

FC_START_DATE_PROMPT = """
Please extract the following start date of the text without changing the given date format:
Please extract the following start date of the text without changing the given date format:
"""

# Adding an end date to every bullet point
Expand All @@ -174,11 +174,10 @@ def life_circumstances_prompt(sex):
The text 'In the next time I made sure to improve my mental wellbeing.' and the bulletpoint 'improving mental wellbeing' with the start date '20210610T0000', you should output '20210710T0000'.
"""
FC_END_DATE_CONTEXT = """
You are an expert in extracting information. You easily detect the end dates and extract them as they are without changing any format.
You are an expert in extracting information. You easily detect the end dates and extract them as they are without changing any format.
"""

FC_END_DATE_PROMPT = """
Please extract the following end date of the text without changing the given date format:
Please extract the following end date of the text without changing the given date format:
"""

# Adding a duration to every bullet point
Expand Down Expand Up @@ -206,11 +205,10 @@ def life_circumstances_prompt(sex):
The text 'In the next time I made sure to improve my mental wellbeing.' and the bulletpoint '(improving mental wellbeing, 20210610T0000)' could be updated to '(improving mental wellbeing, 20210610T0000, 720:00:00)'.
"""
FC_DURATION_CONTEXT = """
You are an expert in extracting information. You easily detect durations and extract them as they are without changing any format.
You are an expert in extracting information. You easily detect durations and extract them as they are without changing any format.
"""

FC_DURATION_PROMPT = """
Please extract the following duration of the text without changing the given date format:
Please extract the following duration of the text without changing the given date format:
"""


Expand Down Expand Up @@ -265,4 +263,4 @@ def life_circumstances_prompt(sex):

FC_LOCATION_PROMPT = """
Please extract the following location of the text without changing the given date format:
"""
"""

0 comments on commit 894962e

Please sign in to comment.