
Commit

added landmarks to validation set
tzuf committed Jun 11, 2023
1 parent 4c023c7 commit f3990f0
Showing 6 changed files with 2,922 additions and 2,274 deletions.
889 changes: 772 additions & 117 deletions app/notebooks/query_firestore.ipynb

Large diffs are not rendered by default.

26 changes: 12 additions & 14 deletions cabby/evals/utils.py
@@ -18,7 +18,7 @@
import numpy as np
from geopy.distance import great_circle

-from cabby import logger
+from absl import logging

# Object for storing each evaluation tuple parsed from the input file.
EvalDataTuple = collections.namedtuple(
@@ -39,9 +39,7 @@ class Evaluator:
"""Class for evaluating geo models."""

def __init__(self):
-self.eval_logger = logger.create_logger(
-  "rvs_geo_eval.log", "rvs_geo_eval")
-self.eval_logger.info("Starting evaluation.")
+logging.info("Starting evaluation.")

def get_error_distances(self, input_file):
"""Compute error distance in meters between true and predicted coordinates.
@@ -56,11 +54,11 @@ def get_error_distances(self, input_file):
"""
error_distances = []
total_examples = 0
self.eval_logger.info(f"Opening file <= {input_file}")
logging.info(f"Opening file <= {input_file}")
for line in open(input_file):
toks = line.strip().split("\t")
if len(toks) != 7:
-self.eval_logger.warning(
+logging.warning(
"Unexpected line format: [%s]. Skipping", line)
continue
eval_tuple = EvalDataTuple(toks[0], float(toks[1]), float(toks[2]),
@@ -79,9 +77,9 @@ def compute_metrics(self, error_distances):
eval_logger: Logger object.
"""
num_examples = len(error_distances)
self.eval_logger.info(f"Started evaluation with {num_examples} samples")
logging.info(f"Started evaluation with {num_examples} samples")
if num_examples == 0:
-self.eval_logger.error("No examples to be evaluated.")
+logging.error("No examples to be evaluated.")
accuracy = float(
len(np.where(np.array(error_distances) == 0.)[0])) / num_examples

@@ -91,8 +89,8 @@ def compute_metrics(self, error_distances):
accuracy_100m = float(
len(np.where(np.array(error_distances) <= 100.)[0])) / num_examples

-accuracy_1000m = float(
-  len(np.where(np.array(error_distances) <= 1000.)[0])) / num_examples
+accuracy_250m = float(
+  len(np.where(np.array(error_distances) <= 250.)[0])) / num_examples

mean_distance, median_distance, max_error = np.mean(error_distances), np.median(
error_distances), np.max(error_distances)
@@ -104,11 +102,11 @@ def compute_metrics(self, error_distances):
# Normalized AUC by maximum error possible.
norm_auc = auc / (_MAX_LOG_HAVERSINE_DIST * (num_examples - 1))

-self.eval_logger.info(
+logging.info(
"Metrics: \nExact accuracy : [%.2f]\n10 m accuracy : [%.2f]\n100 m accuracy : [%.2f]" +
"\n1000 m accuracy : [%.2f]" + "\nmean error [%.2f], " +
"\n250 m accuracy : [%.2f]" + "\nmean error [%.2f], " +
"\nmedian error [%.2f]\nmax error [%.2f]\n" +
"AUC of error curve [%.2f]", accuracy, accuracy_10m, accuracy_100m, accuracy_1000m,
"AUC of error curve [%.2f]", accuracy, accuracy_10m, accuracy_100m, accuracy_250m,
mean_distance, median_distance, max_error, norm_auc)
-return EvalMetrics(accuracy, accuracy_10m, accuracy_100m, accuracy_1000m,
+return EvalMetrics(accuracy, accuracy_10m, accuracy_100m, accuracy_250m,
mean_distance, median_distance, max_error, norm_auc)
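
For reference, a minimal sketch of what the updated threshold metrics compute, assuming error_distances is a sequence of error values in meters (the AUC normalization is omitted since its construction sits outside this hunk):

import numpy as np

def threshold_accuracies(error_distances):
  """Fraction of examples whose error falls within each radius, in meters."""
  errors = np.array(error_distances, dtype=float)
  n = len(errors)
  return {
      'exact': float(np.sum(errors == 0.)) / n,
      '10m': float(np.sum(errors <= 10.)) / n,
      '100m': float(np.sum(errors <= 100.)) / n,
      '250m': float(np.sum(errors <= 250.)) / n,  # replaces the former 1000 m bucket
  }

# Example: three predictions with 5 m, 120 m, and 900 m error.
print(threshold_accuracies([5., 120., 900.]))
# -> exact 0.0, 10m 0.33, 100m 0.33, 250m 0.67 (rounded)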
27 changes: 12 additions & 15 deletions cabby/model/dataset_item.py
@@ -279,12 +279,15 @@ def __init__(self, region: str, data: pd.DataFrame, s2level: int,
self.graph_embed_start = self.start_cells.apply(
lambda cell: util.get_valid_graph_embed(self.graph_embed_file, str(cell)))

-data['landmarks_cells'] = data.landmarks.apply(
-  lambda l: [gutil.cellid_from_point(x, self.s2level) for x in l])
+if 'landmarks' in data:
+  data['landmarks_cells'] = data.landmarks.apply(
+    lambda l: [gutil.cellid_from_point(x, self.s2level) for x in l])

-self.graph_embed_landmarks = data.landmarks_cells.apply(
-  lambda l: [util.get_valid_graph_embed(
-    self.graph_embed_file, str(cell)) for cell in l])
+  self.graph_embed_landmarks = data.landmarks_cells.apply(
+    lambda l: [util.get_valid_graph_embed(
+      self.graph_embed_file, str(cell)) for cell in l])
+else:
+  self.graph_embed_landmarks = ['0']*data.instructions.shape[0]

self.start_embed_text_input_list = [
str(i).replace(':', f': Start at {str(s)}.') for s, i in zip(
@@ -310,6 +313,7 @@ def __init__(self, region: str, data: pd.DataFrame, s2level: int,
self.landmarks_dist_raw.append('; '.join(landmark_dist_cur))

else:
+logging.info(self.region)
self.landmark_label = ['0']*data.shape[0]
if is_dist:
logging.info(f"Calculating distances between {dist_matrix.shape[0]} cells")
@@ -561,11 +565,9 @@ def set_generation_model(self, data):
self.text_output_tokenized = output_text
self.text_input_tokenized = input_text
return
-try:
-  self.text_output_tokenized = self.text_tokenizer(
-    output_text, truncation=True, padding=True, add_special_tokens=True).input_ids
-except:
-  logging.info(f"????? {output_text}")

+self.text_output_tokenized = self.text_tokenizer(
+  output_text, truncation=True, padding=True, add_special_tokens=True).input_ids

self.text_input_tokenized = self.text_tokenizer(
input_text, truncation=True, padding='max_length', add_special_tokens=True, max_length=200)
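
Removing the try/except means a tokenization failure now raises instead of being logged and silently leaving text_output_tokenized unset. A minimal sketch of the unwrapped batch call, assuming a HuggingFace tokenizer (the t5-small checkpoint here is an assumption, not necessarily the model cabby uses):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")  # assumed checkpoint

output_text = ["Meet at the fountain.", "The goal is near the cafe."]
# Batch-encode with the same flags as set_generation_model above.
ids = tokenizer(
    output_text, truncation=True, padding=True, add_special_tokens=True).input_ids
print(ids)  # list of token-id lists, padded to a common length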
@@ -734,15 +736,10 @@ def __getitem__(self, idx: int):

text_output = torch.tensor(self.text_output_tokenized[idx])

-key = self.data.iloc[idx].key
-
-# print ("!!!!!!!!!! ", key)
-
sample = {'text': text_input, 'cellid': cellid,
'neighbor_cells': neighbor_cells, 'far_cells': far_cells,
'end_point': end_point, 'start_point': start_point, 'label': label, 'prob': prob,
'text_output': text_output, 'graph_embed_start': graph_embed_start,
-'key': key
}

return sample
2 changes: 0 additions & 2 deletions cabby/model/datasets.py
@@ -261,7 +261,6 @@ def load_data(self, data_dir: str, ds_set: str, lines: bool):

ds['start_end'] = ds.apply(self.get_fixed_point_along_route, axis=1)

-ds = shuffle(ds)
ds.reset_index(inplace=True, drop=True)
return ds

@@ -318,7 +317,6 @@ def load_data(self, data_dir: str, lines: bool):
['map', 'id', 'instructions', 'end_point', 'start_point'])
ds.drop(columns_keep, 1, inplace=True)

-ds = shuffle(ds)
ds.reset_index(inplace=True, drop=True)

dataset_size = ds.shape[0]
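
With the shuffle(ds) calls removed, load_data returns rows in on-disk order, which keeps row order reproducible across runs; one plausible motivation, given that this commit adds landmarks to the validation set, is keeping per-row annotations aligned with their examples. A minimal sketch of the resulting deterministic load, assuming a JSON-lines file and a hypothetical path:

import pandas as pd

def load_rows(path: str) -> pd.DataFrame:
  ds = pd.read_json(path, lines=True)  # one example per line
  # No shuffle: row i always corresponds to line i of the input file.
  ds.reset_index(inplace=True, drop=True)
  return ds

# Usage (hypothetical path):
# ds = load_rows("dataset/rvs/ds_valid.json")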
