fix pipeline issues

bptlab · Jan 17, 2024 · 680932e · 680932e
1 parent 2652a1b
commit 680932e
Show file tree

Hide file tree

Showing 9 changed files with 71 additions and 857 deletions.
diff --git a/command_line_tool.py b/command_line_tool.py
@@ -7,16 +7,13 @@
 
 from tracex.extraction.prototype import input_inquiry as ii
 from tracex.extraction.prototype import input_handling as ih
-from tracex.extraction.prototype import output_handling as oh
 
 
 def main():
     """Main function calling every pipeline step needed to run the program."""
     ii.greeting()
     input_text = ii.get_input()
-    ih.convert_text_to_csv(input_text)
-    oh.get_output()
-    oh.farewell()
+    outputpath = ih.convert_text_to_csv(input_text)
 
 
 main()
diff --git a/test.py b/test.py
@@ -12,7 +12,7 @@
 from tracex.extraction.prototype import prompts as p
 from tracex.extraction.prototype import create_xes as x
 
-text = open(u.input_path / "journey_synth_covid_0.txt").read()
+""" text = open(u.input_path / "journey_synth_covid_0.txt").read()
 df = ih.convert_text_to_bulletpoints(text)
 print(df)
 df = ih.add_start_dates(text, df)
@@ -27,19 +27,6 @@
 print(df)
 
 ih.convert_dataframe_to_csv(df)
-x.create_xes(u.output_path / "single_trace.csv", "test", "event_information")
+x.create_xes(u.output_path / "single_trace.csv", "test", "event_information") """
 
-# output = 'The end date for the bulletpoint "Experiencing mild cough and fatigue" and the start date 20220108T0000 is 20230911T0000.'
-
-# fc_message = [
-#     {"role": "system", "content": p.END_DATE_CONTEXT},
-#     {
-#         "role": "user",
-#         "content": p.END_DATE_FUNCTION_CALL
-#         + "The text: "
-#         + output
-#     },
-# ]
-
-
-# print(u.query_gpt(fc_message, tool_choice={ "type": "function", "function": {"name": "add_end_dates"}}))
+oh.get_output(u.output_path / "single_trace.csv")
diff --git a/tracex/extraction/content/outputs/all_traces.csv b/tracex/extraction/content/outputs/all_traces.csv
diff --git a/tracex/extraction/content/outputs/dataframe.csv b/tracex/extraction/content/outputs/dataframe.csv
diff --git a/tracex/extraction/content/outputs/single_trace.csv b/tracex/extraction/content/outputs/single_trace.csv
@@ -0,0 +1,9 @@
+case_id,event_information,start_date,end_date,duration,event_type,attribute_location
+0,"experiencing first Covid-19 symptoms: mild cough, fatigue",20220601T0000,20220608T0000,168:00:00,Symptom Onset,Home
+0,brushing off symptoms as common cold,20220601T0000,20220608T0000,168:00:00,Other,Home
+0,developing high fever and difficulty breathing,20220601T0000,20220611T0000,240:00:00,Symptom Onset,Hospital
+0,deciding to get tested for Covid-19,20220601T0000,20220617T0000,384:00:00,Diagnosis,Doctors
+0,going to local testing center,20220617T0000,20220617T0000,00:00:00,Diagnosis,Hospital
+0,undergoing PCR test,20220617T0000,20220617T0000,00:00:00,Diagnosis,Home
+0,receiving negative test results,20220617T0000,20220619T0000,48:00:00,Diagnosis,Doctors
+0,getting infected and testing positive,20220617T0000,20220617T0000,00:00:00,Symptom Onset,Home
diff --git a/tracex/extraction/prototype/create_xes.py b/tracex/extraction/prototype/create_xes.py
@@ -16,7 +16,7 @@ def get_activity_key():
     return get_activity_key()
 
 
-def create_xes(csv_file, name, key):
+def create_xes(csv_file, name="all_traces", key="event_type"):
     """Creates a xes with all traces from the regarding csv."""
     dataframe = pd.read_csv(csv_file, sep=",")
     dataframe["case_id"] = dataframe["case_id"].astype(str)

diff --git a/tracex/extraction/prototype/input_handling.py b/tracex/extraction/prototype/input_handling.py
@@ -10,39 +10,40 @@ def convert_text_to_csv(text):
     """Converts the input to CSV with intermediate steps."""
     steps = str(7)
     print("Converting Data: Summarizing the text. (1/" + steps + ")", end="\r")
-    bulletpoints = convert_text_to_bulletpoints(text)
+    dataframe = convert_text_to_bulletpoints(text)
     print(
         "Converting Data: Extracting start date information. (2/" + steps + ")",
         end="\r",
     )
     u.pause_between_queries()
-    start = add_start_dates(text, bulletpoints)
+    dataframe = add_start_dates(text, dataframe)
     print(
         "Converting Data: Extracting end date information. (3/" + steps + ")   ",
         end="\r",
     )
     u.pause_between_queries()
-    end = add_end_dates(text, start)
+    dataframe = add_end_dates(text, dataframe)
     print(
         "Converting Data: Extracting duration information. (4/" + steps + ") ", end="\r"
     )
     u.pause_between_queries()
-    duration = add_durations(text, end)
+    dataframe = add_durations(dataframe)
     print(
         "Converting Data: Extracting event types. (5/" + steps + ")          ", end="\r"
     )
     u.pause_between_queries()
-    event_type = add_event_types(duration)
+    dataframe = add_event_types(dataframe)
     print(
         "Converting Data: Extracting location information. (6/" + steps + ")", end="\r"
     )
     u.pause_between_queries()
-    location = add_locations(event_type)
+    dataframe = add_locations(dataframe)
     print(
         "Converting Data: Creating output CSV. (7/" + steps + ")             ", end="\r"
     )
-    output_path = convert_dataframe_to_csv(location)
+    output_path = convert_dataframe_to_csv(dataframe)
     print("Dataconversion finished.                    ")
+    output(dataframe)
     return output_path
 
 
@@ -318,3 +319,42 @@ def convert_dataframe_to_csv(df):
         path_or_buf=output_path, sep=",", encoding="utf-8", header=True, index=False
     )
     return output_path
+
+
+def output(df):
+    decision = u.get_decision("Would you like to see the output? (y/n)\n")
+    if decision:
+        print(df)
+    else:
+        print("The output can be found at: " + u.output_path / "single_trace.csv.")
+    decision = u.get_decision(
+        "Would you like to append this trace to all_traces.csv? (y/n)\n"
+    )
+    if decision:
+        append_csv()
+    farewell()
+
+
+def append_csv():
+    """Appends the current trace to the CSV containing all traces."""
+    trace_count = 0
+    with open(u.CSV_ALL_TRACES, "r") as f:
+        rows = f.readlines()[1:]
+        if len(rows) >= 2:
+            trace_count = max(int(row.split(",")[0]) for row in rows if row)
+    with open(u.CSV_OUTPUT, "r") as f:
+        previous_content = f.readlines()
+        content = []
+        for row in previous_content:
+            if row != "\n":
+                content.append(row)
+        content = content[1:]
+    with open(u.CSV_ALL_TRACES, "a") as f:
+        for row in content:
+            row = row.replace(row[0], str(int(row[0]) + trace_count + 1), 1)
+            f.writelines(row)
+
+
+def farewell():
+    """Prints a farewell message."""
+    print("-----------------------------------\nThank you for using TracEX!\n\n")
diff --git a/tracex/extraction/prototype/output_handling.py b/tracex/extraction/prototype/output_handling.py
diff --git a/tracex/extraction/prototype/utils.py b/tracex/extraction/prototype/utils.py
@@ -23,7 +23,7 @@
 MAX_TOKENS = 1100
 TEMPERATURE_SUMMARIZING = 0
 TEMPERATURE_CREATION = 1
-CSV_OUTPUT = settings.BASE_DIR / "extraction/content/outputs/intermediates/7_output.csv"
+CSV_OUTPUT = settings.BASE_DIR / "extraction/content/outputs/single_trace.csv"
 CSV_ALL_TRACES = settings.BASE_DIR / "extraction/content/outputs/all_traces.csv"