Skip to content

Commit

Permalink
🔥implement backup time extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
PitButtchereit committed Jan 29, 2024
1 parent 14371bd commit db1c7d8
Show file tree
Hide file tree
Showing 6 changed files with 143 additions and 7 deletions.
2 changes: 1 addition & 1 deletion tracex/extraction/logic/modules/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .module_patient_journey_generator import PatientJourneyGenerator
from .module_activity_labeler import ActivityLabeler
from .module_time_extractor import TimeExtractor
from .module_time_extractor_backup import TimeExtractorBackup
from .module_location_extractor import LocationExtractor
from .module_event_type_classifier import EventTypeClassifier
1 change: 1 addition & 0 deletions tracex/extraction/logic/modules/module_time_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def __extract_start_date(self, activity_label):
{"role": "assistant", "content": p.START_DATE_ANSWER},
]
output = u.query_gpt(messages)
print(output + "\n")
fc_message = [
{"role": "system", "content": p.FC_START_DATE_CONTEXT},
{"role": "user", "content": p.FC_START_DATE_PROMPT + "The text: " + output},
Expand Down
133 changes: 133 additions & 0 deletions tracex/extraction/logic/modules/module_time_extractor_backup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
from datetime import datetime
from pathlib import Path

from ..logging import log_execution_time
from ..module import Module
from .. import prompts as p
from .. import utils as u


class TimeExtractorBackup(Module):
    """
    Backup module that extracts the time information from the patient journey.

    For every row of the event DataFrame the language model is queried once for a start date and once for an
    end date; the duration is then derived from the two. All timestamps are strings in ``DATE_FORMAT``.
    """

    # strptime format of every timestamp this module validates and parses, e.g. "20200101T0000".
    DATE_FORMAT = "%Y%m%dT%H%M"

    def __init__(self):
        super().__init__()
        self.name = "Time Extractor Backup"
        self.description = "Extracts the timestamps for the corresponding activity labels from a patient journey."

    @log_execution_time(Path("extraction/logs/execution_time.log"))
    def execute(self, df, patient_journey=None):
        """
        Add the columns "start", "end" and "duration" to ``df`` and store the frame in ``self.result``.

        :param df: DataFrame with an "event_information" column; it is modified in place.
        :param patient_journey: Forwarded to ``Module.execute`` (presumably stored as ``self.patient_journey``,
            which the extraction helpers read - confirm against ``Module``).
        :raises AssertionError: If the model returns a date that does not match ``DATE_FORMAT``.
        """
        super().execute(df, patient_journey)
        # Order matters: the end date query uses row["start"], the duration needs both columns.
        df["start"] = df["event_information"].apply(self.__extract_start_date)
        df["end"] = df.apply(self.__extract_end_date, axis=1)
        df["duration"] = df.apply(self.__calculate_row_duration, axis=1)
        self.result = df

    def __extract_start_date(self, activity_label):
        """Query the model for the start date of ``activity_label`` and return it as a validated string."""
        messages = [
            {"role": "system", "content": START_DATE_CONTEXT},
            {
                "role": "user",
                "content": f"The text: {self.patient_journey} \nThe activity label: {activity_label}",
            },
        ]
        start_date = u.query_gpt(messages)
        print(start_date + "\n")

        return self.__checked_date(start_date)

    def __extract_end_date(self, row):
        """Query the model for the end date of the event in ``row`` and return it as a validated string."""
        messages = [
            {"role": "system", "content": END_DATE_CONTEXT},
            {
                "role": "user",
                "content": f"\nThe text: {self.patient_journey} \nThe bulletpoint: "
                f"{row['event_information']} \nThe start date: {row['start']}",
            },
        ]
        end_date = u.query_gpt(messages)
        print(end_date + "\n")

        return self.__checked_date(end_date)

    def __checked_date(self, raw_date):
        """
        Strip surrounding whitespace from a model answer and make sure it matches ``DATE_FORMAT``.

        :raises AssertionError: If the cleaned answer is not a valid date. The error is raised explicitly instead
            of via ``assert`` so the check is not removed when Python runs with ``-O``; the exception type and
            message are the ones the former ``assert`` produced.
        """
        # Model answers may carry a trailing newline or spaces, which strptime would reject.
        date = raw_date.strip()
        if not self.is_valid_date_format(date, self.DATE_FORMAT):
            raise AssertionError(f"Date {date} has no valid format.")

        return date

    @staticmethod
    def __calculate_row_duration(row):
        """
        Return the time between ``row["start"]`` and ``row["end"]`` formatted as "HH:MM:SS".

        "N/A" is returned if either timestamp is "N/A". If the end lies before the start, the absolute duration
        is formatted and prefixed with "-" (e.g. "-00:30:00").
        """
        if row["start"] == "N/A" or row["end"] == "N/A":
            return "N/A"
        start_date = datetime.strptime(row["start"], TimeExtractorBackup.DATE_FORMAT)
        end_date = datetime.strptime(row["end"], TimeExtractorBackup.DATE_FORMAT)
        total_seconds = int((end_date - start_date).total_seconds())
        # divmod floors towards negative infinity, so split the magnitude and add the sign separately;
        # otherwise -30 minutes would be rendered as "-1:30:00".
        sign = "-" if total_seconds < 0 else ""
        hours, remainder = divmod(abs(total_seconds), 3600)
        minutes, seconds = divmod(remainder, 60)

        return f"{sign}{hours:02d}:{minutes:02d}:{seconds:02d}"

    @staticmethod
    def is_valid_date_format(date_string, date_format):
        """
        Tell whether ``date_string`` can be parsed with the strptime pattern ``date_format``.

        Non-string input (e.g. ``None``) is reported as invalid instead of raising ``TypeError``.
        """
        try:
            datetime.strptime(date_string, date_format)
        except (ValueError, TypeError):
            return False

        return True

# START_DATE_CONTEXT = """
# You are an expert in text understanding and your job is to take a given text and given summarizing bulletpoints and to add a start date to every bulletpoint.
# Edit the bulletpoints in a way, that you just take the existing bulletpoints and add a start date at the end of it.
# The information about the start date should be extracted from the text or from the context and should be as precise as possible.
# Do not modify the content of the bulletpoint and keep ending commas.
# Please use the format YYYYMMDD for the dates and extend every date by "T0000".
# Keep in mind, that the start date of a bullet point is not necessarily later than the start of the previous one.
# Also, the start date doesn't have to be the next date information in the text, but can be related to the previous.
# If the text talks about getting medication and then improving and the bullet point says 'improving', you should return the date of getting the medication as start date.
# If there is a conclusion at the end of the text and an outlook set the start date of the last bullet point to the start date of the corresponding bulletpoint.
# If there is really no information about the start date to be extracted from the text but there is information about events happening at the same time,
# use that information to draw conclusions about the start dates.
# If there is no information about the start date at all and there is no way of finding some, delete that bulletpoint.
# The only output should be the updated bullet points, nothing else!
# """

# System prompt used by TimeExtractorBackup when asking the model for the start date of a single activity.
# It requests exactly one timestamp in the form YYYYMMDDT0000 and nothing else.
# NOTE(review): the prompt text contains typos ("ouptut", "in mentioned"). They are left untouched here because
# the string is sent to the model verbatim; fix them deliberately and re-check the model output afterwards.
START_DATE_CONTEXT = """
You are provided with a natural language text containing various events. Your task is to identify the start date of
a specific activity mentioned in the text. The activity label will be provided, and it is your job to extract only
the start date associated with this activity from the text.
Under no circumstances put anything else in the ouptut apart from the extracted start date.
Please follow the following rules:
1. The format of the date should always be YYYYMMDDT0000. For example, 20200101T0000.
2. If only a month in mentioned then the date should always be the first day of the month. For example for March it should be 20200301T0000.
3. If the date is mentioned in a different format, please convert it to the format mentioned above.
4. Also consider context information from previous activities and their start dates.
"""

# END_DATE_CONTEXT = """
# You are an expert in text understanding and your job is to take a given text and given summarizing bulletpoints with a start date and to add a end date to every bulletpoint.
# It is important, that every bullet point gets an end date, even if it is the same as the start date.
# Edit the bulletpoints in a way, that you just take the existing bulletpoints and add a end date to it.
# The information about the end date should be extracted from the text or from the context and should be as precise as possible.
# Please use the format YYYYMMDD for the dates and extend every date by "T0000".
# If the duration of an event is given, use that information to draw conclusions about the end date.
# If the duration of an event is not given, use the context to draw conclusions about the end date.
# If two bulletpoints are related, it is possible, that the end dates should match.
# Think about how long humans tend to stay in hospitals, how long it takes to recover from a disease, how long they practice new habits and so on.
# If there is no information about the end date at all, please state the start date also as the end date.
# The only output should be the updated bullet points, nothing else!
# """

# System prompt used by TimeExtractorBackup when asking the model for the end date of a single activity.
# The user message that accompanies it supplies the text, the bulletpoint and the already extracted start date.
# It requests exactly one timestamp in the form YYYYMMDDT0000 and nothing else.
# NOTE(review): the prompt text contains typos ("ouptut", "in mentioned"). They are left untouched here because
# the string is sent to the model verbatim; fix them deliberately and re-check the model output afterwards.
END_DATE_CONTEXT = """
You are provided with a natural language text containing various events. Your task is to identify the end date of
a specific activity mentioned in the text. The activity label and the corresponding start date will be provided,
and it is your job to extract only the end date associated with this activity from the text.
Under no circumstances put anything else in the ouptut apart from the extracted end date.
Please follow the following rules:
1. The format of the date should always be YYYYMMDDT0000. For example, 20200101T0000.
2. If only a month in mentioned then the date should always be the first day of the month. For example for March it should be 20200301T0000.
3. If the date is mentioned in a different format, please convert it to the format mentioned above.
4. Also consider context information from previous activities and their start dates and end dates. The end dates should
follow logically from the start dates.
5. End dates can not be earlier than the start dates.
"""

4 changes: 2 additions & 2 deletions tracex/extraction/logic/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from . import Module, logging
from .modules.module_patient_journey_generator import PatientJourneyGenerator
from .modules.module_activity_labeler import ActivityLabeler
from .modules.module_time_extractor import TimeExtractor
from .modules.module_time_extractor_backup import TimeExtractorBackup
from .modules.module_location_extractor import LocationExtractor
from .modules.module_event_type_classifier import EventTypeClassifier

Expand All @@ -28,7 +28,7 @@ class ExtractionConfiguration:
"patient_journey_generation": PatientJourneyGenerator,
"activity_labeling": ActivityLabeler,
"event_type_classification": EventTypeClassifier,
"time_extraction": TimeExtractor,
"time_extraction": TimeExtractorBackup,
"location_extraction": LocationExtractor,
}
activity_key: Optional[str] = "event_type"
Expand Down
5 changes: 3 additions & 2 deletions tracex/extraction/logic/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,10 @@ def get_decision(question):

def query_gpt(
messages,
max_tokens=MAX_TOKENS,
temperature=TEMPERATURE_SUMMARIZING,
tools=function_calls.TOOLS,
tool_choice="none",
temperature=TEMPERATURE_SUMMARIZING,
):
@log_tokens_used(Path("extraction/logs/tokens_used.log"))
def make_api_call():
Expand All @@ -54,7 +55,7 @@ def make_api_call():
_response = client.chat.completions.create(
model=MODEL,
messages=messages,
max_tokens=MAX_TOKENS,
max_tokens=max_tokens,
temperature=temperature,
tools=tools,
tool_choice=tool_choice,
Expand Down
5 changes: 3 additions & 2 deletions tracex/extraction/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def get_context_data(self, **kwargs):
"attribute_location": orchestrator.configuration.locations,
}
is_extracted = (
True
False
if self.request.session.get("is_extracted") is None
else self.request.session.get("is_extracted")
)
Expand All @@ -124,6 +124,7 @@ def get_context_data(self, **kwargs):
if not (IS_TEST or is_extracted):
orchestrator.run()
single_trace_df = orchestrator.data
print(single_trace_df)
single_trace_df["caseID"] = single_trace_df["caseID"].astype(str)
single_trace_df["start"] = pd.to_datetime(single_trace_df["start"])
single_trace_df["end"] = pd.to_datetime(single_trace_df["end"])
Expand All @@ -141,6 +142,7 @@ def get_context_data(self, **kwargs):
name="single_trace",
key=orchestrator.configuration.activity_key,
)
self.request.session["is_extracted"] = True
else:
output_path_xes = (
f"{str(utils.output_path / 'single_trace')}_event_type.xes"
Expand Down Expand Up @@ -178,7 +180,6 @@ def get_context_data(self, **kwargs):
context["all_xes_html"] = utils.Conversion.create_html_from_xes(
all_traces_df_filtered
).getvalue()
self.request.session["is_extracted"] = False

return context

Expand Down

0 comments on commit db1c7d8

Please sign in to comment.