-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge-data.py
executable file
·98 lines (73 loc) · 3.15 KB
/
merge-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import numpy as np
import pandas as pd
# Define input and output file paths
csv_file = "generated_data/turkish_cities.csv"
tsv_file = "generated_data/filtered_geonames.tsv"
output_file = "generated_data/db.tsv"
gps_data_file = "generated_data/gps-data-tsv.tsv"
# Load the CSV and TSV files
csv_df = pd.read_csv(csv_file)
tsv_df = pd.read_csv(tsv_file, sep="\t")
gps_data_df = pd.read_csv(gps_data_file, sep="\t")
# Rename columns in TSV to match CSV
tsv_df.rename(
columns={"state": "state_name", "lat": "latitude", "lon": "longitude"}, inplace=True
)
# Function to check and print rows with latitude and longitude (0.0, 0.0)
def check_zero_lat_lon(df, operation_name):
zero_lat_lon_df = df[(df["latitude"] == 0.0) & (df["longitude"] == 0.0)]
if not zero_lat_lon_df.empty:
print(
f"After {operation_name}, the following places have (0.0, 0.0) latitude and longitude:"
)
print(zero_lat_lon_df)
else:
print(
f"No (0.0, 0.0) latitude and longitude places found after {operation_name}."
)
check_zero_lat_lon(tsv_df, "loading TSV")
check_zero_lat_lon(gps_data_df, "loading GPS data")
def merge_on_name_and_country(df1, df2):
"""df1 overrides df2 if some data exist in both of them"""
# Merge the two data frames on 'name' and 'country_code'
merged_df = pd.merge(
df1,
df2,
on=["name", "country_code"],
how="outer",
)
# Create a new DataFrame with the desired column names
final_df = pd.DataFrame()
# Copy 'name' and 'country_code' columns directly
final_df["name"] = merged_df["name"]
final_df["country_code"] = merged_df["country_code"]
# List of columns to check and merge
columns_to_merge = ["state_name", "latitude", "longitude"]
# Iterate through the columns and choose values from CSV if present, otherwise from TSV
for col in columns_to_merge:
final_df[col] = merged_df[f"{col}_x"].combine_first(merged_df[f"{col}_y"])
# Add the 'alternative_names' column from TSV
final_df["alternative_names"] = merged_df["alternative_names"]
return final_df
final_df = merge_on_name_and_country(
merge_on_name_and_country(csv_df, tsv_df), gps_data_df
)
# Remove duplicates based on 'name', 'country_code', 'state_name', 'latitude', and 'longitude'
final_df.drop_duplicates(
subset=["name", "country_code", "state_name", "latitude", "longitude"], inplace=True
)
# Trim leading and trailing spaces
columns_to_trim = ["name", "country_code", "state_name", "alternative_names"]
for col in columns_to_trim:
final_df[col] = final_df[col].str.strip()
# Replace empty strings in specific columns with NaN
final_df.replace(
{"name": "", "country_code": "", "state_name": ""}, np.nan, inplace=True
)
# Drop rows with NaN values in 'name', 'country_code', or 'state_name'
final_df.dropna(subset=["name", "country_code", "state_name"], inplace=True)
# Sort the DataFrame to make the file look tidy
sorted_df = final_df.sort_values(by=["country_code", "state_name", "name"])
# Save the final dataframe to a TSV file
sorted_df.to_csv(output_file, sep="\t", index=False)
print(f"Merge complete. Output saved to {output_file}")