-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvalidation.py
More file actions
666 lines (512 loc) · 33 KB
/
validation.py
File metadata and controls
666 lines (512 loc) · 33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
import os
import openpyxl
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
import matplotlib.dates as mdates
def load_data(filepath):
"""Load data from the given Excel file."""
return pd.read_excel(filepath)
# Function to extract year, month from filename
def extract_date_from_filename(filename):
'''
# Extract the date from the filename
Parameters
--------------
filename: filename of the postprocessed data. These are in the structure/format "STN_YYYYMM_SF" or "STN_YYYYMM_HD" using the data inventory, where:
STN is the three digit station number,
YYYY is the year
MM is the month,
SF represents Standard Format
HD represents a hand drawn form / or photocopied form of the standard format
Returns
--------------
year, month : Year (YYYY), Month (MM)
'''
match = re.search(r'_(\d{4})(\d{2})_', filename)
if match:
year = int(match.group(1))
month = int(match.group(2))
return year, month
return None, None
def is_highlighted_green(cell, color):
''' Checks if a cell is highlighted 'GREEN' during the earlier post processing steps that symbolizes that this data was confirmed
Parameters
--------------
cell: coordinates of cell to check
color: highlighted color
Returns
--------------
boolean: 1 (True) if the cell is highlighted with GREEN (i.e. confirmed in Quality Control)
'''
fill = cell.fill.start_color
if isinstance(fill, openpyxl.styles.colors.Color):
return fill.rgb == color
return False
def select_and_convert_postprocessed_data(filepath, date_column, columns_to_check, header_rows, multi_day_totals, multi_day_averages, excluded_rows, additional_excluded_rows, final_totals_rows):
'''
Selects confirmed transcribed data after QA/QC checks and converts it into an .xlsx format ready for conversion to the Station Exchange Format (SEF).
This function processes a QA/QC-verified Excel file, extracting validated temperature data for max, min, and average values.
It filters out invalid rows, handles sharp transitions, and identifies outliers. The resulting DataFrame is structured for further analysis or export.
Parameters
--------------
filepath : str
Path to the QA/QC-verified Excel file.
date_column : str
Column letter containing the date values (e.g., 'B').
columns_to_check : list of str
List of column letters to extract temperature values (e.g., ['D', 'E', 'F'] for max, min, and average temperatures).
header_rows : int
Number of header rows to exclude from processing.
multi_day_totals : bool
Whether the dataset includes multi-day total rows.
multi_day_averages : bool
Whether the dataset includes multi-day average rows.
excluded_rows : list of int
List of row indices to exclude from processing.
additional_excluded_rows : list of int
Additional row indices to exclude if multi-day averages are included.
final_totals_rows : list of int
Row indices of final totals to exclude if included
Returns
--------------
merged_df : pandas.DataFrame
A DataFrame containing validated and cleaned daily temperature data with the following columns:
- "Year": Year of the record.
- "Month": Month of the record.
- "Day": Day of the record.
- "Max_Temperature": Maximum temperature (validated or NaN if invalid/missing).
- "Min_Temperature": Minimum temperature (validated or NaN if invalid/missing).
- "Avg_Temperature": Average temperature (validated or NaN if invalid/missing).
'''
# Extract year and month from the filename
filename = os.path.basename(filepath) # Get the filename from the full file path
year, month = extract_date_from_filename(filename) # Assuming `extract_date_from_filename` is a utility that parses the year/month from the filename
if year and month:
workbook = openpyxl.load_workbook(filepath)
worksheet = workbook.active
if not (year and month):
raise ValueError(f"Could not extract year and month from filename: {filename}")
# List to hold all data
data = [] # All the variables
# Lists to hold all data for each temperature type
data_max = []
data_min = []
data_avg = []
# Rows to exclude. Adjust these according to your specific sheet
if multi_day_totals and not multi_day_averages:
excluded_rows = excluded_rows # These include titles and multi-day (e.g. 5/6) day totals
if multi_day_totals and multi_day_averages:
excluded_rows = excluded_rows + additional_excluded_rows # including both multi day totals and averages
if not multi_day_totals:
excluded_rows = final_totals_rows # Exlude only the final totals
# Convert the day column letter and temperature columns to numeric indices
date_column_idx = ord(date_column) - ord('A') + 1 # Convert 'B' -> 2 # Date
column_indices = [ord(col.strip()) - ord('A') + 1 for col in columns_to_check] # Max, min and average temperatures
# Now `column_indices` will contain [4, 5, 6] for 'D', 'E', 'F'
max_temp_column_idx = column_indices[0] # 'D' column index -> Maximum temperature
min_temp_column_idx = column_indices[1] # 'E' column index -> Minimum temperature
avg_temp_column_idx = column_indices[2] # 'F' column index -> Avergae temperature
total_transcribed_cells = 0
# Extract data from rows and columns, excluding specific rows.
for row_num in range(header_rows+1, worksheet.max_row+1): #Here this represents Max, Min and Average Temperatures
if row_num not in excluded_rows:
day_cell = worksheet.cell(row=row_num, column=date_column_idx) # Assuming the day is in the first column
max_temperature_cell = worksheet.cell(row=row_num, column=max_temp_column_idx) # Column for Max Temperature
min_temperature_cell = worksheet.cell(row=row_num, column=min_temp_column_idx) # Column for Min Temperature
average_temperature_cell = worksheet.cell(row=row_num, column=avg_temp_column_idx) # Column for Avg Temperature
# Count the total transcribed values, including even the non-confirmed (GREEN) ones
total_transcribed_cells += sum(cell.value is not None for cell in [max_temperature_cell, min_temperature_cell, average_temperature_cell])
if day_cell.value is not None and day_cell.value.isdigit():
day = int(day_cell.value)
max_temperature = max_temperature_cell.value if is_highlighted_green(max_temperature_cell, 'FF6DCD57') else 'NaN'
min_temperature = min_temperature_cell.value if is_highlighted_green(min_temperature_cell, 'FF6DCD57') else 'NaN'
average_temperature = average_temperature_cell.value if is_highlighted_green(average_temperature_cell, 'FF6DCD57') else 'NaN'
data.append([year, month, day, max_temperature, min_temperature, average_temperature])
data_max.append([year, month, day, max_temperature])
data_min.append([year, month, day, min_temperature])
data_avg.append([year, month, day, average_temperature])
# Create a DataFrame from the data
df = pd.DataFrame(data, columns=["Year", "Month", "Day", "Max_Temperature", "Min_Temperature", "Avg_Temperature"])
# Generate a complete date range for each year and month combination
years_months = df[['Year', 'Month']].drop_duplicates()
complete_data = []
for _, row in years_months.iterrows():
year = row['Year']
month = row['Month']
num_days = pd.Period(f'{year}-{month}').days_in_month
for day in range(1, num_days + 1):
complete_data.append([year, month, day])
complete_df = pd.DataFrame(complete_data, columns=["Year", "Month", "Day"])
# Merge the complete date range with the extracted data
merged_df = pd.merge(complete_df, df, on=["Year", "Month", "Day"], how='left')
# Fill missing temperatures with a placeholder value (e.g., NaN or a specific value)
for column in ["Max_Temperature", "Min_Temperature", "Avg_Temperature"]: # Since this is temperature, missing vales cannot be zero (0)
merged_df[column] = merged_df[column].fillna(np.nan)
# Convert temperature columns to numeric, coerce errors to NaN.
merged_df['Max_Temperature'] = pd.to_numeric(merged_df['Max_Temperature'], errors='coerce')
merged_df['Min_Temperature'] = pd.to_numeric(merged_df['Min_Temperature'], errors='coerce')
merged_df['Avg_Temperature'] = pd.to_numeric(merged_df['Avg_Temperature'], errors='coerce')
# Sort DataFrame by Year, Month, Day
merged_df = merged_df.sort_values(by=['Year', 'Month', 'Day'])
# Standard deviation and outlier detection
for column in ["Max_Temperature", "Min_Temperature", "Avg_Temperature"]:
# Calculate the standard deviation and mean for the column
std = merged_df[column].std()
mean = merged_df[column].mean()
# # Condition 1: Remove values more than 3 standard deviations from the mean
# merged_df[column] = merged_df[column].apply(lambda x: np.nan if abs(x - mean) > 3 * std else x)
# Condition 2: Detect sharp transitions between days (e.g., +4sd to -4sd)
# Calculate the standard deviation differences for each day relative to the mean
merged_df['std_diff'] = (merged_df[column] - mean) / std
for i in range(1, len(merged_df) - 1): # Avoid the first and last rows to prevent boundary issues
prev_std_diff = merged_df.loc[i - 1, 'std_diff'] if not pd.isna(merged_df.loc[i - 1, 'std_diff']) else 0 # std deviation difference of previous day
curr_std_diff = merged_df.loc[i, 'std_diff'] # std deviation difference of current day
next_std_diff = merged_df.loc[i + 1, 'std_diff'] if not pd.isna(merged_df.loc[i + 1, 'std_diff']) else 0 # std deviation difference of following day
if not pd.isna(curr_std_diff):
# Detect sharp opposite changes (e.g., large negative difference to large positive difference or vice versa)
if (prev_std_diff < -4 and curr_std_diff > 4 and next_std_diff < -4) or \
(prev_std_diff > 4 and curr_std_diff < -4 and next_std_diff > 4):
merged_df.loc[i, column] = np.nan # Mark the current value as an outlier
# Drop the temporary 'std_diff' column
merged_df.drop(columns=['std_diff'], inplace=True)
print(f"Total automatically transcribed Cells: {total_transcribed_cells}")
return merged_df
def select_and_convert_manually_transcribed_data(filepath, date_column, columns_to_check, header_rows, multi_day_totals, multi_day_averages, excluded_rows, additional_excluded_rows, final_totals_rows):
'''
Processes and selects manually transcribed meteorological data for analysis and conversion to a standardized format.
This function reads an Excel file containing manually transcribed meteorological observations, selects relevant
data columns, excludes rows as specified, and generates a complete time series. Missing values are handled,
and the data is returned as a structured DataFrame, ready for conversion to the Station Exchange Format (SEF).
Parameters
----------
filepath : str
Path to the manually transcribed data sheet (Excel file).
date_column : str
Column letter representing the day/date of observation (e.g., 'B').
columns_to_check : list of str
List of column letters to extract (e.g., ['D', 'E', 'F'] for Max, Min, and Average temperatures).
header_rows : int
Number of rows at the top of the sheet used as headers (not part of the data).
multi_day_totals : bool
Indicates whether the sheet contains multi-day totals that should be excluded.
multi_day_averages : bool
Indicates whether the sheet contains multi-day averages that should be excluded.
excluded_rows : list of int
Row indices to exclude from the processing (e.g., rows with multi-day totals).
additional_excluded_rows : list of int
Additional rows to exclude if multi-day averages are present.
final_totals_rows : list of int
Rows containing final totals to exclude if no multi-day totals are present.
Returns
-------
pandas.DataFrame
A DataFrame containing the processed data with columns for year, month, day,
maximum temperature, minimum temperature, and average temperature.
'''
# Extract year and month from the filename
filename = os.path.basename(filepath) # Get the filename from the full file path
year, month = extract_date_from_filename(filename) # Assuming `extract_date_from_filename` is a utility that parses the year/month from the filename
if year and month:
workbook = openpyxl.load_workbook(filepath)
worksheet = workbook.active
if not (year and month):
raise ValueError(f"Could not extract year and month from filename: {filename}")
# List to hold all data
data = [] # All the variables
# Lists to hold all data for each temperature type
data_max = []
data_min = []
data_avg = []
# Rows to exclude. Adjust these according to your specific sheet
if multi_day_totals and not multi_day_averages:
excluded_rows = excluded_rows # These include titles and multi-day (e.g. 5/6) day totals
if multi_day_totals and multi_day_averages:
excluded_rows = excluded_rows + additional_excluded_rows # including both multi day totals and averages
if not multi_day_totals:
excluded_rows = final_totals_rows # Exlude only the final totals
# Convert the day column letter and temperature columns to numeric indices
date_column_idx = ord(date_column) - ord('A') + 1 # Convert 'B' -> 2 # Date
column_indices = [ord(col.strip()) - ord('A') + 1 for col in columns_to_check] # Max, min and average temperatures
# Now `column_indices` will contain [4, 5, 6] for 'D', 'E', 'F'
max_temp_column_idx = column_indices[0] # 'D' column index -> Maximum temperature
min_temp_column_idx = column_indices[1] # 'E' column index -> Minimum temperature
avg_temp_column_idx = column_indices[2] # 'F' column index -> Avergae temperature
total_transcribed_cells = 0
# Extract data from rows and columns, excluding specific rows.
for row_num in range(header_rows+1, worksheet.max_row+1): #Here this represents Max, Min and Average Temperatures
if row_num not in excluded_rows:
day_cell = worksheet.cell(row=row_num, column=date_column_idx) # Assuming the day is in the first column
max_temperature_cell = worksheet.cell(row=row_num, column=max_temp_column_idx) # Column for Max Temperature
min_temperature_cell = worksheet.cell(row=row_num, column=min_temp_column_idx) # Column for Min Temperature
average_temperature_cell = worksheet.cell(row=row_num, column=avg_temp_column_idx) # Column for Avg Temperature
# Count the total manually transcribed values
total_transcribed_cells += sum(cell.value is not None for cell in [max_temperature_cell, min_temperature_cell, average_temperature_cell])
if day_cell.value is not None and day_cell.value.isdigit():
day = int(day_cell.value)
max_temperature = float(max_temperature_cell.value) if max_temperature_cell.value is not None else 'NaN'
min_temperature = float(min_temperature_cell.value) if min_temperature_cell.value is not None else 'NaN'
average_temperature = float(average_temperature_cell.value) if average_temperature_cell.value is not None else 'NaN'
data.append([year, month, day, max_temperature, min_temperature, average_temperature])
data_max.append([year, month, day, max_temperature])
data_min.append([year, month, day, min_temperature])
data_avg.append([year, month, day, average_temperature])
# Create a DataFrame from the data
df = pd.DataFrame(data, columns=["Year", "Month", "Day", "Max_Temperature", "Min_Temperature", "Avg_Temperature"])
# Generate a complete date range for each year and month combination
years_months = df[['Year', 'Month']].drop_duplicates()
complete_data = []
for _, row in years_months.iterrows():
year = row['Year']
month = row['Month']
num_days = pd.Period(f'{year}-{month}').days_in_month
for day in range(1, num_days + 1):
complete_data.append([year, month, day])
complete_df = pd.DataFrame(complete_data, columns=["Year", "Month", "Day"])
# Merge the complete date range with the extracted data
merged_df = pd.merge(complete_df, df, on=["Year", "Month", "Day"], how='left')
# Fill missing temperatures with a placeholder value (e.g., NaN or a specific value)
for column in ["Max_Temperature", "Min_Temperature", "Avg_Temperature"]: # Since this is temperature, missing vales cannot be zero (0)
merged_df[column] = merged_df[column].fillna(np.nan)
# Sort DataFrame by Year, Month, Day
merged_df = merged_df.sort_values(by=['Year', 'Month', 'Day'])
print(f"Total manually transcribed Cells: {total_transcribed_cells}")
return merged_df
def compare_dataframes(df1, df2, uncertainty_margin):
"""
Compare temperature data between two dataframes and calculate accuracy percentage and mean absolute error (MAE).
This function evaluates the similarity between temperature values in two dataframes, typically one containing
manually transcribed data and the other containing automatically transcribed post-processed data. It calculates the percentage of values
that fall within a specified uncertainty margin and computes the mean absolute error (MAE) for the compared data.
Parameters
----------
df1 : pandas.DataFrame
The first dataframe, generally containing manually transcribed data. It should have columns for 'Year',
'Month', 'Day', and temperature variables ('Max_Temperature', 'Min_Temperature', 'Avg_Temperature').
df2 : pandas.DataFrame
The second dataframe, typically containing automatically transcribed data. It must match
the structure and columns of df1.
uncertainty_margin : float, optional
The allowable difference between corresponding values in the two dataframes for them to be considered a match.
Returns
-------
tuple
A tuple containing:
- accuracy_percentage (float): The percentage of values that fall within the uncertainty margin.
- mae (float): The mean absolute error of the compared values.
"""
total_highlighted_cells = 0
accurate_matches = 0
absolute_errors = [] # List to store absolute errors for MAE calculation
# Ensure the dataframes are aligned on the same date
merged_df = pd.merge(df1, df2, on=['Year', 'Month', 'Day'], suffixes=('_manual', '_post'))
# Convert temperature columns to numeric, coercing errors to NaN
for col in ['Max_Temperature', 'Min_Temperature', 'Avg_Temperature']:
merged_df[col + '_manual'] = pd.to_numeric(merged_df[col + '_manual'], errors='coerce')
merged_df[col + '_post'] = pd.to_numeric(merged_df[col + '_post'], errors='coerce')
# Iterate through the relevant columns to compare values
for col in ['Max_Temperature', 'Min_Temperature', 'Avg_Temperature']:
col_manual = col + '_manual'
col_post = col + '_post'
for i in range(len(merged_df)):
if not pd.isna(merged_df[col_manual].iloc[i]) and not pd.isna(merged_df[col_post].iloc[i]):
total_highlighted_cells += 1
difference = abs(merged_df[col_manual].iloc[i] - merged_df[col_post].iloc[i])
# Add the absolute difference to the list for MAE calculation
absolute_errors.append(difference)
# Check if the difference is within the uncertainty margin
if difference <= uncertainty_margin:
accurate_matches += 1
# Calculate accuracy percentage
accuracy_percentage = (accurate_matches / total_highlighted_cells) * 100 if total_highlighted_cells > 0 else 0.0
# Calculate Mean Absolute Error (MAE)
mae = sum(absolute_errors) / len(absolute_errors) if absolute_errors else 0.0
print(f"Total Highlighted Cells: {total_highlighted_cells}")
print(f"Accuracy Percentage: {accuracy_percentage:.2f}%")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
return accuracy_percentage, mae
def plot_comparison(manual_df, post_processed_df, output_folder_path, station, post_file, accuracy_percentage_and_mean_absolute_error, uncertainty_margin):
"""
Generate a comparison plot for daily temperature data from manually transcribed and post-processed datasets.
This function creates a time series plot comparing daily maximum, minimum, and average temperatures from two datasets:
manually transcribed data and automatically transcribed post-processed data. The post-processed data is displayed as solid markers, while the
manually transcribed data is visualized as lighter bands representing the uncertainty margin. The plot highlights
the accuracy percentage and mean absolute error (MAE) between the datasets.
Parameters
----------
manual_df : pandas.DataFrame
Dataframe containing manually transcribed temperature data, with columns for 'Year', 'Month', 'Day',
and temperature variables ('Max_Temperature', 'Min_Temperature', 'Avg_Temperature').
post_processed_df : pandas.DataFrame
Dataframe containing post-processed temperature data, with the same structure as `manual_df`.
output_folder_path : str
Path to the directory where the generated plot will be saved.
station : str
Name or identifier of the station (station number), used for labeling the plot and the output file.
post_file : str
Filename of the post-processed data file, used to generate a unique name for the saved plot.
accuracy_percentage_and_mean_absolute_error : tuple
A tuple containing the accuracy percentage (float) and mean absolute error (float) calculated from the
comparison of the datasets.
uncertainty_margin : float
The uncertainty margin applied to the manually transcribed data to create the lighter bands in the plot.
Returns
-------
None
This function saves the generated plot as an image file in the specified output directory and displays it.
"""
# Ensure Date column exists in both dataframes
manual_df['Date'] = pd.to_datetime(manual_df[['Year', 'Month', 'Day']])
post_processed_df['Date'] = pd.to_datetime(post_processed_df[['Year', 'Month', 'Day']])
# Merge data on Date
merged_df = pd.merge(post_processed_df, manual_df, on='Date', suffixes=('_post', '_manual'))
# Create a figure and axis for the plot
fig, ax = plt.subplots(figsize=(12, 8))
# Plot post-QA/QC data using MeteoSaver (Max, Min, Avg) as dotted lines ('--')
ax.plot(merged_df['Date'], merged_df['Max_Temperature_post'], 'o', label='Max (Automatically transcribed)', color='red')
ax.plot(merged_df['Date'], merged_df['Min_Temperature_post'], 'o', label='Min (Automatically transcribed)', color='blue')
ax.plot(merged_df['Date'], merged_df['Avg_Temperature_post'], 'o', label='Avg (Automatically transcribed)', color='orange')
# Fill uncertainty margins around post-QA/QC data
ax.fill_between(merged_df['Date'], merged_df['Max_Temperature_manual'] - uncertainty_margin,
merged_df['Max_Temperature_manual'] + uncertainty_margin, color='red', alpha=0.2)
ax.fill_between(merged_df['Date'], merged_df['Min_Temperature_manual'] - uncertainty_margin,
merged_df['Min_Temperature_manual'] + uncertainty_margin, color='blue', alpha=0.2)
ax.fill_between(merged_df['Date'], merged_df['Avg_Temperature_manual'] - uncertainty_margin,
merged_df['Avg_Temperature_manual'] + uncertainty_margin, color='orange', alpha=0.2)
# Set plot labels and title
ax.set_xlabel('Date', fontsize=23, labelpad=15)
ax.set_ylabel('Temperature (°C)', fontsize=23, labelpad=15)
ax.set_title(f'Daily Max, Min, and Avg Temperatures at station {station}')
# Adjust tick label font size
ax.tick_params(axis='x', labelsize=23) # Set font size for x-axis tick labels
ax.tick_params(axis='y', labelsize=23) # Set font size for y-axis tick labels
# Adjust the y-axis limit to ensure the legend is above the plotted lines
ylim = ax.get_ylim()
y_max = max(ylim)
legend_y_position = y_max + (y_max * 0.1)
ax.set_ylim(ylim[0], legend_y_position)
# Format the date labels to show at intervals of 5 days
# Locator for every 5 days
ax.xaxis.set_major_locator(mdates.DayLocator(interval=5))
plt.xticks(rotation=45) # Rotate x-axis tick labels by 45 degrees
# Add accuracy percentage below the x-axis
plt.figtext(0.70, 0.92, f'Accuracy Percentage: {accuracy_percentage_and_mean_absolute_error[0]:.1f}%', ha='left', fontsize=22, bbox={"facecolor":"orange", "alpha":0.5, "pad":5})
plt.figtext(0.70, 0.87, f'Mean Absolute Error: {accuracy_percentage_and_mean_absolute_error[1]:.1f}', ha='left', fontsize=22, bbox={"facecolor":"orange", "alpha":0.5, "pad":5})
# # LEGEND
# # Plot manually transcribed data as points with 'o' markers
# manual_handle = [
# Line2D([0], [0], marker='o', color='red', label='Max (Automatically transcribed)', linestyle='None'),
# Line2D([0], [0], marker='o', color='blue', label='Min (Automatically transcribed)', linestyle='None'),
# Line2D([0], [0], marker='o', color='orange', label='Avg (Automatically transcribed)', linestyle='None')]
# # Create custom legend entries for automatically transcribed data + uncertainty
# custom_handle = [
# ( Patch(facecolor='red', alpha=0.2)),
# (Patch(facecolor='blue', alpha=0.2)),
# (Patch(facecolor='orange', alpha=0.2))]
# # Combine the manual transcribed and automatically transcribed legend handles
# all_handles = manual_handle + custom_handle
# # Create custom labels
# labels = [
# 'Max (Automatically transcribed)', 'Min (Automatically transcribed)', 'Avg (Automatically transcribed)',
# 'Max (Manually transcribed)', 'Min (Manually transcribed)', 'Avg (Manually transcribed)'
# ]
# # Set legend handles with custom lines for each temperature type
# ax.legend(handles=all_handles, labels=labels, fontsize='small', loc='upper right')
# Add the legend below the plot
# ax.legend(handles=all_handles, labels=labels, loc='upper center', bbox_to_anchor=(0.5, -0.40), ncol=2, fontsize=14)
ax.grid(True)
# # Add accuracy percentage below the x-axis
# plt.figtext(0.40, 0.01, f'Accuracy Percentage: {accuracy_percentage_and_mean_absolute_error[0]:.1f}%', ha='center', fontsize=10, bbox={"facecolor":"orange", "alpha":0.5, "pad":5})
# plt.figtext(0.60, 0.01, f'Mean Absolute Error: {accuracy_percentage_and_mean_absolute_error[1]:.1f}', ha='center', fontsize=10, bbox={"facecolor":"orange", "alpha":0.5, "pad":5})
# Remove the '.xlsx' extension from the post_file to clean the filename
cleaned_post_file_name = os.path.splitext(post_file)[0]
plt.tight_layout()
# Save the plot
plot_filename = os.path.join(output_folder_path, f'temperature_comparison_plot_{cleaned_post_file_name}.jpg')
plt.savefig(plot_filename, format='jpg')
# plt.show()
plt.close()
def validate(manually_transcribed_data_dir_station, postprocessed_data_dir_station, output_folder_path, station, date_column, daily_temperature_columns, header_rows, multi_day_totals, multi_day_averages, excluded_rows, additional_excluded_rows, final_totals_rows, uncertainty_margin):
"""
Validate and compare manually transcribed data with post-processed data, generating accuracy metrics and visualizations.
This function performs validation by comparing manually transcribed meteorological data with automatically transcribed post-processed data.
It matches corresponding files, processes the data, calculates accuracy metrics such as the accuracy percentage
and mean absolute error (MAE), and generates time series plots for visual comparison.
Parameters
----------
manually_transcribed_data_dir_station : str
Directory containing manually transcribed data files. These files should include the suffix '_manually_entered_temperatures'.
postprocessed_data_dir_station : str
Directory containing post-processed data files, named with the suffix '_post_QA_QC'.
output_folder_path : str
Path where the generated plots will be saved.
station : str
Identifier or name of the station (station no.), used for labeling plots and output files.
date_column : str
The column name or letter indicating the date in the input files.
columns_to_check : list of str
List of column names or letters corresponding to maximum, minimum, and average temperatures.
header_rows : int
Number of header rows in the input files.
multi_day_totals : bool
Whether multi-day totals are present in the input files.
multi_day_averages : bool
Whether multi-day averages are present in the input files.
excluded_rows : list of int
Rows to exclude during processing, such as title rows or non-temperature rows.
additional_excluded_rows : list of int
Additional rows to exclude for multi-day totals and averages.
final_totals_rows : list of int
Rows containing final totals to exclude if multi-day totals are not included.
uncertainty_margin : float
Allowable difference between manually transcribed and post-processed values for validation.
Returns
-------
None
This function does not return a value but processes the data, calculates metrics, and generates plots.
"""
manually_transcribed_files = os.listdir(manually_transcribed_data_dir_station)
postprocessed_files = os.listdir(postprocessed_data_dir_station)
manual_files = [f for f in manually_transcribed_files if 'manually_entered' in f]
post_processed_files = [f for f in postprocessed_files if 'post_QA_QC' in f]
# Create a dictionary to map manual files to post-processed files
file_pairs = {}
for manual_file in manual_files:
base_name = manual_file.replace('_manually_entered_temperatures', '').replace('.xlsx', '')
corresponding_post_file = f'{base_name}_post_QA_QC.xlsx'
if corresponding_post_file in post_processed_files:
file_pairs[manual_file] = corresponding_post_file
else:
print(f"Post-processed file not found for {manual_file}")
if not file_pairs:
print("No matching files found.")
return
# Process each file pair
for manual_file, post_file in file_pairs.items():
print(f"Processing pair: {manual_file} and {post_file}")
manual_filepath = os.path.join(manually_transcribed_data_dir_station, manual_file)
print(manual_filepath)
post_processed_filepath = os.path.join(postprocessed_data_dir_station, post_file)
# Load data
# Manually transcribed data
manual_df = select_and_convert_manually_transcribed_data(manual_filepath, date_column, daily_temperature_columns, header_rows, multi_day_totals, multi_day_averages, excluded_rows, additional_excluded_rows, final_totals_rows)
# Automatically transcribed data
post_processed_df = select_and_convert_postprocessed_data(post_processed_filepath, date_column, daily_temperature_columns, header_rows, multi_day_totals, multi_day_averages, excluded_rows, additional_excluded_rows, final_totals_rows)
# # Convert temperature columns to numeric, coerce errors to NaN
# manual_df['Max_Temperature'] = pd.to_numeric(manual_df['Max_Temperature'], errors='coerce')
# manual_df['Min_Temperature'] = pd.to_numeric(manual_df['Max_Temperature'], errors='coerce')
# manual_df['Avg_Temperature'] = pd.to_numeric(manual_df['Max_Temperature'], errors='coerce')
# Convert temperature columns to numeric, coerce errors to NaN for post-processed data
post_processed_df['Max_Temperature'] = pd.to_numeric(post_processed_df['Max_Temperature'], errors='coerce')
post_processed_df['Min_Temperature'] = pd.to_numeric(post_processed_df['Min_Temperature'], errors='coerce')
post_processed_df['Avg_Temperature'] = pd.to_numeric(post_processed_df['Avg_Temperature'], errors='coerce')
# Check
print(manual_df[['Year', 'Month', 'Day']].head())
print(post_processed_df[['Year', 'Month', 'Day']].head())
# Compare the dataframes and get accuracy percentage
accuracy_percentage_and_mean_absolute_error = compare_dataframes(manual_df, post_processed_df, uncertainty_margin)
# Plot the comparison
plot_comparison(manual_df, post_processed_df, output_folder_path, station, post_file, accuracy_percentage_and_mean_absolute_error, uncertainty_margin)