Skip to content

Commit

Permalink
change duration to deterministic and adapt create_xes
Browse files Browse the repository at this point in the history
  • Loading branch information
tkv29 committed Jan 17, 2024
1 parent 9f06697 commit 1c223bd
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 62 deletions.
23 changes: 12 additions & 11 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,21 @@
from tracex.extraction.prototype import utils as u
from tracex.extraction.prototype import function_calls as fc
from tracex.extraction.prototype import prompts as p
from tracex.extraction.prototype import create_xes as x

text = open(u.input_path / "journey_synth_covid_1.txt").read()
df = ih.convert_text_to_bulletpoints(text)
print(df)
df = ih.add_start_dates(text, df)
df = ih.add_end_dates(text, df)
# df = ih.add_durations(text, df)
# df = ih.add_event_types(df)
# df = ih.add_locations(df)
# text = open(u.input_path / "journey_synth_covid_0.txt").read()
# df = ih.convert_text_to_bulletpoints(text)
# print(df)
# df = ih.add_start_dates(text, df)
# df = ih.add_end_dates(text, df)
# df = ih.add_durations(df)
# # df = ih.add_event_types(df)
# # df = ih.add_locations(df)

print(df)
# print(df)

df.to_csv(u.output_path / "intermediates/dataframe.csv")
ih.convert_dataframe_to_csv(df)
# ih.convert_dataframe_to_csv(df)
x.create_xes(u.output_path / "single_trace.csv", "test", "event_information")

# output = 'The end date for the bulletpoint "Experiencing mild cough and fatigue" and the start date 20220108T0000 is 20230911T0000.'

Expand Down
12 changes: 6 additions & 6 deletions tracex/extraction/prototype/create_xes.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,16 @@ def get_activity_key():
def create_xes(csv_file, name, key):
"""Creates a xes with all traces from the regarding csv."""
dataframe = pd.read_csv(csv_file, sep=",")
dataframe["caseID"] = dataframe["caseID"].astype(str)
dataframe["start"] = pd.to_datetime(dataframe["start"])
dataframe["end"] = pd.to_datetime(dataframe["end"])
dataframe["case_id"] = dataframe["case_id"].astype(str)
dataframe["start_date"] = pd.to_datetime(dataframe["start_date"])
dataframe["end_date"] = pd.to_datetime(dataframe["end_date"])
dataframe["duration"] = pd.to_timedelta(dataframe["duration"])
dataframe = dataframe.rename(
columns={
key: "concept:name",
"caseID": "case:concept:name",
"start": "time:timestamp",
"end": "time:endDate",
"case_id": "case:concept:name",
"start_date": "time:timestamp",
"end_date": "time:end_date",
"duration": "time:duration",
}
)
Expand Down
112 changes: 67 additions & 45 deletions tracex/extraction/prototype/input_handling.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Module providing functions for converting text to CSV."""
import pandas as pd
from datetime import datetime

from . import utils as u
from . import prompts as p
Expand Down Expand Up @@ -86,7 +87,7 @@ def add_start_dates(text, df):
},
{"role": "assistant", "content": p.START_DATE_ANSWER},
]

output = u.query_gpt(messages)

fc_message = [
Expand All @@ -103,8 +104,10 @@ def add_start_dates(text, df):

if start_date == "N/A" and row_count > 1:
last_index = new_df.index[-1]
previous_index = last_index -1
new_df.at[last_index, 'start_date'] = new_df.at[previous_index, 'start_date']
previous_index = last_index - 1
new_df.at[last_index, "start_date"] = new_df.at[
previous_index, "start_date"
]

print(name + ": " + str(i) + " ", end="\r")
i = i + 1
Expand Down Expand Up @@ -161,49 +164,68 @@ def add_end_dates(text, df):
return df


def add_durations(text, df):
"""Adds durations to the bulletpoints."""
name = "duration"
new_df = pd.DataFrame([], columns=[name])
values_list = df.values.tolist()
i = 0
for item in values_list:
messages = [
{"role": "system", "content": p.DURATION_CONTEXT},
{
"role": "user",
"content": p.DURATION_PROMPT
+ "\nThe text: "
+ text
+ "\nThe bulletpoint: "
+ item[0]
+ "\nThe start date: "
+ item[1]
+ "\nThe end date: "
+ item[2],
},
{"role": "assistant", "content": p.DURATION_ANSWER},
]
output = u.query_gpt(messages)
# def add_durations(text, df):
# """Adds durations to the bulletpoints."""
# name = "duration"
# new_df = pd.DataFrame([], columns=[name])
# values_list = df.values.tolist()
# i = 0
# for item in values_list:
# messages = [
# {"role": "system", "content": p.DURATION_CONTEXT},
# {
# "role": "user",
# "content": p.DURATION_PROMPT
# + "\nThe text: "
# + text
# + "\nThe bulletpoint: "
# + item[0]
# + "\nThe start date: "
# + item[1]
# + "\nThe end date: "
# + item[2],
# },
# {"role": "assistant", "content": p.DURATION_ANSWER},
# ]
# output = u.query_gpt(messages)

# fc_message = [
# {"role": "system", "content": p.FC_DURATION_CONTEXT},
# {"role": "user", "content": p.FC_DURATION_PROMPT + "The text: " + output},
# ]
# duration = u.query_gpt(
# fc_message,
# tool_choice={"type": "function", "function": {"name": "add_duration"}},
# )
# new_row = pd.DataFrame([duration], columns=[name])
# new_df = pd.concat([new_df, new_row], ignore_index=True)
# print(name + ": " + str(i) + " ", end="\r")
# i = i + 1
# with open(
# (u.output_path / "intermediates/bulletpoints.txt"),
# "a",
# ) as f:
# f.write("\n" + output)
# df = pd.concat([df, new_df], axis=1)
# return df


def add_durations(df):
    """Add a "duration" column computed from each row's start and end date.

    The duration is derived deterministically as ``end_date - start_date``
    and formatted as ``"HH:MM:SS"``. The hour field is not wrapped at 24,
    so multi-day spans yield values such as ``"49:30:00"``. Rows whose
    start or end date is the ``"N/A"`` sentinel (or a missing value) get
    the duration ``"N/A"``.

    Args:
        df: DataFrame with "start_date" and "end_date" columns holding
            strings in the ``%Y%m%dT%H%M`` format (e.g. "20220108T0000")
            or "N/A".

    The column is written to ``df`` in place.

    Raises:
        ValueError: If a date string does not match ``%Y%m%dT%H%M``.
    """

    def calculate_row_duration(start, end):
        """Return ``end - start`` as a signed "HH:MM:SS" string, or "N/A"."""
        # Real missing values (None/NaN, e.g. after a CSV round trip) are
        # treated like the explicit "N/A" sentinel instead of crashing strptime.
        if pd.isna(start) or pd.isna(end) or "N/A" in (start, end):
            return "N/A"

        start_date = datetime.strptime(start, "%Y%m%dT%H%M")
        end_date = datetime.strptime(end, "%Y%m%dT%H%M")
        total_seconds = int((end_date - start_date).total_seconds())

        # Split off the sign first: divmod floors towards negative infinity,
        # so applying it to a negative value would give e.g. "-1:59:00" for
        # a span of minus one minute.
        sign = "-" if total_seconds < 0 else ""
        hours, remainder = divmod(abs(total_seconds), 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{sign}{hours:02d}:{minutes:02d}:{seconds:02d}"

    # Build the column from a plain list rather than df.apply(axis=1):
    # on an empty frame apply() returns a DataFrame, which cannot be
    # assigned to a single column.
    df["duration"] = [
        calculate_row_duration(start, end)
        for start, end in zip(df["start_date"], df["end_date"])
    ]

fc_message = [
{"role": "system", "content": p.FC_DURATION_CONTEXT},
{"role": "user", "content": p.FC_DURATION_PROMPT + "The text: " + output},
]
duration = u.query_gpt(
fc_message,
tool_choice={"type": "function", "function": {"name": "add_duration"}},
)
new_row = pd.DataFrame([duration], columns=[name])
new_df = pd.concat([new_df, new_row], ignore_index=True)
print(name + ": " + str(i) + " ", end="\r")
i = i + 1
with open(
(u.output_path / "intermediates/bulletpoints.txt"),
"a",
) as f:
f.write("\n" + output)
df = pd.concat([df, new_df], axis=1)
return df


Expand Down

0 comments on commit 1c223bd

Please sign in to comment.