-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathmain.py
57 lines (46 loc) · 1.47 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import pandas as pd
def drop_notes(df):
    """
    Return the DataFrame without its 'notes' column.

    When there is no 'notes' column the input DataFrame is returned as-is;
    otherwise a new DataFrame lacking that column is returned.
    """
    # Guard clause: nothing to remove.
    if 'notes' not in df.columns:
        return df
    return df.drop(columns=['notes'])
def select_high_ratings(df):
    """
    Keep only the rows whose 'ratings' value is 90 or higher.

    When there is no 'ratings' column the input DataFrame is returned as-is.
    """
    # Guard clause: without a 'ratings' column there is nothing to filter on.
    if 'ratings' not in df.columns:
        return df
    high_enough = df['ratings'] >= 90
    return df[high_enough]
def drop_and_one_hot_encode_red_wine(df):
    """
    Replace the 'variety' column with a binary 'Red_Wine' indicator column.

    'Red_Wine' is 1 where 'variety' equals the string 'Red Wine' and 0 for
    every other value (including missing values). The original 'variety'
    column is dropped and 'Red_Wine' is appended as the last column.

    When there is no 'variety' column the input DataFrame is returned as-is.
    The input DataFrame is never modified.
    """
    if 'variety' not in df.columns:
        return df
    # Build the indicator directly rather than via pd.get_dummies: get_dummies
    # emits one column per category (named '<prefix>_<category>') and, with
    # drop_first=True, discards the first sorted category, so it cannot
    # guarantee a single 0/1 'Red_Wine' column.
    # isin() always yields a plain boolean mask, even when values are missing.
    is_red = df['variety'].isin(['Red Wine']).astype(int)
    return df.assign(Red_Wine=is_red).drop(columns=['variety'])
def remove_newlines_carriage_returns(df):
    """
    Replace newlines and carriage returns with spaces in all text columns.

    Every '\\n' and every '\\r' found in a string value becomes a single
    space (so '\\r\\n' becomes two spaces). Columns of object or string dtype
    are processed; values in those columns that are not strings (numbers,
    missing values, ...) are left untouched.

    Returns a new DataFrame; the input DataFrame is not modified.
    """
    # One translation table handles both characters in a single pass.
    table = str.maketrans({'\n': ' ', '\r': ' '})

    def _clean(value):
        # Only real strings are rewritten. Going through the .str accessor
        # would turn non-string entries of a mixed object column into NaN.
        return value.translate(table) if isinstance(value, str) else value

    # Work on a copy so the caller's DataFrame is never mutated.
    result = df.copy()
    for col in result.select_dtypes(include=['object', 'string']).columns:
        original_dtype = result[col].dtype
        # map() may fall back to object dtype; cast back to keep the column's
        # original dtype.
        result[col] = result[col].map(_clean).astype(original_dtype)
    return result
def convert_ratings_to_int(df):
    """
    Convert the 'ratings' column from float to integer.

    Fractional parts are truncated toward zero (95.7 becomes 95). When there
    is no 'ratings' column the input DataFrame is returned as-is; otherwise a
    new DataFrame is returned and the input is left unmodified.

    Raises:
        ValueError: if 'ratings' contains missing or non-finite values, which
            cannot be represented as integers.
    """
    if 'ratings' not in df.columns:
        return df
    # Series has no to_bool() method; astype(int) is the float -> int cast.
    return df.assign(ratings=df['ratings'].astype(int))
# Example usage: read train.csv, apply every transformation in order, and
# write the result to transformed_train.csv.
if __name__ == "__main__":
    # Order matters: rows are filtered before the remaining steps run.
    pipeline = (
        drop_notes,
        select_high_ratings,
        drop_and_one_hot_encode_red_wine,
        remove_newlines_carriage_returns,
        convert_ratings_to_int,
    )
    df = pd.read_csv('train.csv')
    for step in pipeline:
        df = step(df)
    df.to_csv('transformed_train.csv', index=False)