-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweather_data_preprocessing.py
135 lines (99 loc) · 3.48 KB
/
weather_data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import pandas as pd
from pandas import DataFrame
def column_preci_treatment(file_route: str) -> DataFrame:
    r'''Load the station column information that accompanies the
    precipitation data from Direccion Nacional de Meteorología de Uruguay

    When the file has a 'Nombre' column, that column is removed and the
    table is transposed; the first remaining column (its label and its
    values) becomes the header of the result. A file without a 'Nombre'
    column is returned exactly as it was read.

    Receives
    --------
    file_route : str
        route of the ';'-separated .csv file to load

    Returns
    -------
    df : DataFrame
        DataFrame of processed columns
    '''
    raw = pd.read_csv(file_route, sep=';')

    # Nothing to rearrange when the file carries no 'Nombre' field
    if 'Nombre' not in raw.columns:
        return raw

    # Drop 'Nombre', flip rows and columns, and turn the old column labels
    # into an ordinary column so they can serve as part of the header row
    flipped = raw.drop(columns=['Nombre']).T.reset_index()

    # Promote the first row to header, then discard that now-duplicated row
    flipped.columns = flipped.iloc[0]
    return flipped.drop(labels=0, axis=0)
# Month number keyed by the first three letters of the Spanish month name;
# both 'sep' (septiembre) and 'set' (setiembre) are accepted
_MONTH_NUMBERS = {'ene': 1, 'feb': 2, 'mar': 3, 'abr': 4, 'may': 5, 'jun': 6,
                  'jul': 7, 'ago': 8, 'sep': 9, 'set': 9, 'oct': 10,
                  'nov': 11, 'dic': 12}


def _month_codes(labels: list) -> dict:
    r'''Map each distinct month label to its zero-padded month number

    Labels are matched on their first three letters, ignoring case and
    surrounding blanks. If any label is not recognised, every label is
    numbered by order of appearance instead (the previous behaviour).
    '''
    codes = {}
    for label in labels:
        number = _MONTH_NUMBERS.get(str(label).strip().lower()[:3])
        if number is None:
            # Unknown labelling scheme: do not guess, keep the legacy
            # numbering by order of first appearance
            return {v: f'{k:02d}' for k, v in enumerate(labels, start=1)}
        codes[label] = f'{number:02d}'
    return codes


def data_preci_treatment(file_route: str) -> DataFrame:
    r'''Function to preprocess precipitation data from Direccion Nacional de
    Meteorología de Uruguay

    When the file has both 'AÑO' and 'MES' columns they are merged into a
    single 'Fecha' datetime column (first day of the month) and every other
    column is unpivoted into long format. A file lacking either column is
    returned exactly as it was read. Cells equal to 's/dato' are read as NaN.

    Receives
    --------
    file_route : str
        string that represents the file route to load the data from .csv

    Returns
    -------
    df : DataFrame
        DataFrame with columns 'Fecha', 'Nombre Estación' and
        'Precipitacion' (or the raw table when 'AÑO'/'MES' are missing)
    '''
    # Data import, masking the 's/dato' marker as NaN
    df = pd.read_csv(file_route, na_values=['s/dato'], sep=';')

    # Without both date columns there is nothing to reshape
    if 'MES' not in df.columns or 'AÑO' not in df.columns:
        return df

    # From month abbreviation to month number. The number comes from the
    # month name itself, so files that do not start in January still get
    # the right dates; missing months are left as NaN (-> NaT below)
    month_dict = _month_codes(df['MES'].dropna().unique().tolist())
    df['Fecha'] = df['AÑO'].astype(str) + '-' + df['MES'].map(month_dict)

    # 'AÑO' and 'MES' are now redundant; 'Fecha' goes to the first column
    df = df.drop(columns=['AÑO', 'MES'])
    df.insert(0, 'Fecha', df.pop('Fecha'))

    # Wide -> long: one row per (date, station). var_name must be a scalar
    # for ordinary (non-MultiIndex) columns; recent pandas rejects a list
    df = pd.melt(df,
                 id_vars=['Fecha'],
                 var_name='Nombre Estación',
                 value_name='Precipitacion')

    # 'Fecha' column conversion from 'YYYY-MM' string to datetime
    df['Fecha'] = pd.to_datetime(df['Fecha'], format='%Y-%m')
    return df
def null_report(df: DataFrame) -> DataFrame:
    r'''
    Function to generate a null value report

    Receives
    --------
    df : DataFrame
        A dataframe to compute the null values

    Returns
    -------
    df_null_rep : DataFrame
        One row per column of the input df, with the column label in
        'variables' and the percentage (0-100) of null entries in
        'null_values'
    '''
    # isnull() yields one boolean column per input column, so its mean is
    # the null fraction. Working on the whole frame (by position) instead of
    # selecting by label keeps the result numeric when labels are repeated
    null_percentage = df.isnull().mean().mul(100).tolist()

    # Dataframe null values percentage
    df_null_rep = pd.DataFrame({'variables': df.columns,
                                'null_values': null_percentage})
    return df_null_rep