Skip to content

Commit

Permalink
♻️ Refactoring: add logging module
Browse files Browse the repository at this point in the history
  • Loading branch information
FR-SON committed Jan 25, 2024
1 parent 1a129a4 commit c21b8b4
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 50 deletions.
102 changes: 102 additions & 0 deletions tracex/extraction/logic/logging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import ast
import time
import functools
import inspect
import logging

import pandas as pd


def log_execution_time(log_file_path):
    """Decorator factory: log the wall-clock execution time of a function.

    Each call of the decorated function appends one record to
    ``log_file_path`` containing the function name, the file and line of
    the call site, and the elapsed time in seconds.

    :param log_file_path: Path of the log file records are appended to.
    :return: A decorator that wraps a function with timing instrumentation.
    """
    logger = setup_logger(
        "execution_time_logger", log_file_path, "%(asctime)s - %(message)s"
    )

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # perf_counter is monotonic and high-resolution, unlike time.time,
            # so the measured duration cannot be skewed by system clock changes.
            start_time = time.perf_counter()
            result = func(*args, **kwargs)
            execution_time = time.perf_counter() - start_time

            # stack()[0] is this wrapper; [1] is the caller of the wrapped function.
            calling_frame = inspect.stack()[1]
            log_entry = {
                "function_name": func.__name__,
                "calling_file": calling_frame[1],
                "calling_line": calling_frame[2],
                "execution_time": f"{execution_time:.5f} seconds",
            }
            logger.info(log_entry)

            return result

        return wrapper

    return decorator


def log_tokens_used(log_file_path):
    """Decorator to log the tokens used in an API call to openAI."""
    logger = setup_logger("token_logger", log_file_path, "%(asctime)s - %(message)s")

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            response = func(*args, **kwargs)

            token_count = response.usage.total_tokens
            # Two frames up: past this wrapper and the immediate caller,
            # to record where the API-calling function itself was invoked.
            grandparent_frame = inspect.stack()[2]
            logger.info(
                {
                    "calling_function_name": grandparent_frame[3],
                    "calling_file": grandparent_frame[1],
                    "calling_line": grandparent_frame[2],
                    "tokens_used": token_count,
                }
            )

            return response

        return wrapper

    return decorator


def setup_logger(logger_name, log_file_path, log_format):
    """Set up and return a logger writing to the given file with the given format.

    Repeated calls with the same ``logger_name`` return the same cached
    logger without attaching another handler, so each record is written
    exactly once.

    :param logger_name: Name under which the logger is registered.
    :param log_file_path: Path of the file the handler appends to.
    :param log_format: ``logging.Formatter`` format string for the records.
    :return: The configured ``logging.Logger`` instance.
    """
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)

    # logging.getLogger caches loggers by name; without this guard every
    # call would add one more FileHandler and duplicate every log line.
    if not logger.handlers:
        file_handler = logging.FileHandler(log_file_path)
        file_handler.setFormatter(logging.Formatter(log_format))
        logger.addHandler(file_handler)

    return logger


def calculate_tokens_sum(log_file_path):
    """Aggregate the tokens used per (function, file) from a token-usage log.

    The file is expected to contain lines of the form
    ``<timestamp> - <dict-literal>`` as written by ``log_tokens_used``.

    :param log_file_path: Path of the log file to read.
    :return: DataFrame with columns ``calling_function_name``,
        ``parent_calling_file`` and the summed ``tokens_used``.
    """
    df = pd.read_csv(
        log_file_path,
        sep=" - ",
        engine="python",  # multi-character separators require the python engine
        header=None,
        names=["timestamp", "message"],
    )

    # Each message is the repr of a dict; literal_eval parses it safely.
    df["message"] = df["message"].apply(ast.literal_eval)

    # log_tokens_used writes the key "calling_function_name" (not
    # "function_name"); reading the wrong key yields None for every row and
    # groupby then drops all of them. Fall back to "function_name" so
    # records written by log_execution_time still aggregate.
    df["calling_function_name"] = df["message"].apply(
        lambda x: x.get("calling_function_name", x.get("function_name"))
    )
    df["parent_calling_file"] = df["message"].apply(lambda x: x.get("calling_file"))
    df["tokens_used"] = df["message"].apply(lambda x: x.get("tokens_used", 0))

    result_df = (
        df.groupby(["calling_function_name", "parent_calling_file"])["tokens_used"]
        .sum()
        .reset_index()
    )

    return result_df
4 changes: 4 additions & 0 deletions tracex/extraction/logic/module.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from abc import ABC, abstractmethod
from pathlib import Path

from .logging import log_execution_time


class Module(ABC):
Expand All @@ -19,6 +22,7 @@ def __init__(self):
self.patient_journey = None
self.result = None

@log_execution_time(Path("extraction/execution_time.log"))
@abstractmethod
def execute(self, _input, patient_journey=None):
"""
Expand Down
12 changes: 6 additions & 6 deletions tracex/extraction/logic/modules/module_time_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ def __init__(self):

def execute(self, df, patient_journey=None):
super().execute(df, patient_journey)
df["start_date"] = df["event_information"].apply(self.__extract_start_date)
df["end_date"] = df.apply(self.__extract_end_date, axis=1)
df["start"] = df["event_information"].apply(self.__extract_start_date)
df["end"] = df.apply(self.__extract_end_date, axis=1)
df["duration"] = df.apply(self.__calculate_row_duration, axis=1)
self.result = df

Expand Down Expand Up @@ -52,7 +52,7 @@ def __extract_end_date(self, row):
{
"role": "user",
"content": f"{p.END_DATE_PROMPT} \nThe text: {self.patient_journey} \nThe bulletpoint: "
f"{row['event_information']} \nThe start date: {row['start_date']}",
f"{row['event_information']} \nThe start date: {row['start']}",
},
{"role": "assistant", "content": p.END_DATE_ANSWER},
]
Expand All @@ -70,10 +70,10 @@ def __extract_end_date(self, row):

@staticmethod
def __calculate_row_duration(row):
if row["start_date"] == "N/A" or row["end_date"] == "N/A":
if row["start"] == "N/A" or row["end"] == "N/A":
return "N/A"
start_date = datetime.strptime(row["start_date"], "%Y%m%dT%H%M")
end_date = datetime.strptime(row["end_date"], "%Y%m%dT%H%M")
start_date = datetime.strptime(row["start"], "%Y%m%dT%H%M")
end_date = datetime.strptime(row["end"], "%Y%m%dT%H%M")
duration = end_date - start_date
hours, remainder = divmod(duration.total_seconds(), 3600)
minutes, seconds = divmod(remainder, 60)
Expand Down
7 changes: 4 additions & 3 deletions tracex/extraction/logic/orchestrator.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import csv
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, List

from . import Module
from . import Module, logging
from .modules.module_patient_journey_generator import PatientJourneyGenerator
from .modules.module_activity_labeler import ActivityLabeler
from .modules.module_time_extractor import TimeExtractor
Expand Down Expand Up @@ -84,8 +85,8 @@ def run(self):
for module in modules:
module.execute(self.data, self.configuration.patient_journey)
self.data = module.result
# replace with self.data = self.__convert_bulletpoints_to_csv(self.data) when dataframes are implemented
return self.data.to_csv(utils.CSV_OUTPUT, index=False)
self.data.insert(0, "caseID", 1)
self.data.to_csv(utils.CSV_OUTPUT, index=False, header=True)

# This method may be deleted later. The original idea was to always call Orchestrator.run() and depending on if
# a configuration was given or not, the patient journey generation may be executed.
Expand Down
46 changes: 18 additions & 28 deletions tracex/extraction/logic/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,16 @@
import base64
import json
import tempfile
import time
import functools
import warnings
import pandas as pd
import pm4py
from dataclasses import dataclass
from io import StringIO, BytesIO
from typing import Optional
from openai import OpenAI

from .constants import *
from . import function_calls
from .logging import log_tokens_used


def deprecated(func):
Expand Down Expand Up @@ -49,16 +47,22 @@ def query_gpt(
tool_choice="none",
temperature=TEMPERATURE_SUMMARIZING,
):
client = OpenAI(api_key=oaik)
"""Queries the GPT engine."""
response = client.chat.completions.create(
model=MODEL,
messages=messages,
max_tokens=MAX_TOKENS,
temperature=temperature,
tools=tools,
tool_choice=tool_choice,
)
@log_tokens_used(Path("extraction/tokens_used.log"))
def make_api_call():
client = OpenAI(api_key=oaik)
"""Queries the GPT engine."""
_response = client.chat.completions.create(
model=MODEL,
messages=messages,
max_tokens=MAX_TOKENS,
temperature=temperature,
tools=tools,
tool_choice=tool_choice,
)

return _response

response = make_api_call()
if tool_choice == "none":
output = response.choices[0].message.content
else:
Expand Down Expand Up @@ -104,20 +108,6 @@ def append_csv():
f.writelines(row)


@deprecated
@dataclass
class ExtractionConfiguration:
patient_journey: str
event_types: list
locations: list
activity_key: Optional[str] = "event_type"

def update(self, **kwargs):
"""Update the configuration with a dictionary."""
for key, value in kwargs.items():
setattr(self, key, value)


class Conversion:
@staticmethod
def create_xes(csv_file, name, key):
Expand Down Expand Up @@ -157,7 +147,7 @@ def create_html_from_xes(df):
return xes_html_buffer

@staticmethod
def create_dfg_png_from_df(df):
def create_dfg_from_df(df):
"""Create png image from xes file."""
dfg_img_buffer = BytesIO()
output_dfg_file = pm4py.discover_dfg(
Expand Down
38 changes: 25 additions & 13 deletions tracex/extraction/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class JourneyInputView(generic.FormView):

def form_valid(self, form):
"""Save the uploaded journey in the cache."""
cache.set("is_extracted", False)
self.request.session["is_extracted"] = False
orchestrator = Orchestrator.get_instance()
orchestrator.configuration.update(
event_types=form.cleaned_data["event_types"],
Expand Down Expand Up @@ -78,7 +78,7 @@ def get_context_data(self, **kwargs):

def form_valid(self, form):
"""Save the generated journey in the orchestrator's configuration."""
cache.set("is_extracted", False)
self.request.session["is_extracted"] = False
orchestrator = Orchestrator.get_instance()
orchestrator.configuration.update(
patient_journey=orchestrator.configuration.patient_journey, # This should not be necessary, unspecefied values should be unchanged
Expand Down Expand Up @@ -115,25 +115,37 @@ def get_context_data(self, **kwargs):
"attribute_location": orchestrator.configuration.locations,
}
is_extracted = (
True if cache.get("is_extracted") is None else cache.get("is_extracted")
True
if self.request.session.get("is_extracted") is None
else self.request.session.get("is_extracted")
)

# 1. Run the pipeline to create the single trace
if not (IS_TEST or is_extracted):
output_path_csv = orchestrator.run()
orchestrator.run()
single_trace_df = orchestrator.data
single_trace_df["caseID"] = single_trace_df["caseID"].astype(str)
single_trace_df["start"] = pd.to_datetime(single_trace_df["start"])
single_trace_df["end"] = pd.to_datetime(single_trace_df["end"])
single_trace_df = single_trace_df.rename(
columns={
orchestrator.configuration.activity_key: "concept:name",
"caseID": "case:concept:name",
"start": "time:timestamp",
"end": "time:endDate",
"duration": "time:duration",
}
)
output_path_xes = utils.Conversion.create_xes(
output_path_csv,
utils.CSV_OUTPUT,
name="single_trace",
key=orchestrator.configuration.activity_key,
)
else:
output_path_xes = (
str(utils.output_path / "single_trace")
+ "_"
+ orchestrator.configuration.activity_key
+ ".xes"
f"{str(utils.output_path / 'single_trace')}_event_type.xes"
)
single_trace_df = pm4py.read_xes(output_path_xes)
single_trace_df = pm4py.read_xes(output_path_xes)

# 2. Sort and filter the single journey dataframe
single_trace_df = self.sort_dataframe(single_trace_df)
Expand All @@ -156,17 +168,17 @@ def get_context_data(self, **kwargs):
}
)
context["journey"] = orchestrator.configuration.patient_journey
context["dfg_img"] = utils.Conversion.create_dfg_png_from_df(output_df_filtered)
context["dfg_img"] = utils.Conversion.create_dfg_from_df(output_df_filtered)
context["xes_html"] = utils.Conversion.create_html_from_xes(
output_df_filtered
).getvalue()
context["all_dfg_img"] = utils.Conversion.create_dfg_png_from_df(
context["all_dfg_img"] = utils.Conversion.create_dfg_from_df(
all_traces_df_filtered
)
context["all_xes_html"] = utils.Conversion.create_html_from_xes(
all_traces_df_filtered
).getvalue()
cache.set("is_extracted", True)
self.request.session["is_extracted"] = False

return context

Expand Down

0 comments on commit c21b8b4

Please sign in to comment.