
Commit

added landmarks to validation set
tzuf committed Jun 11, 2023
1 parent 4c023c7 commit f3990f0
Showing 6 changed files with 2,922 additions and 2,274 deletions.
889 changes: 772 additions & 117 deletions app/notebooks/query_firestore.ipynb

Large diffs are not rendered by default.

26 changes: 12 additions & 14 deletions cabby/evals/utils.py
@@ -18,7 +18,7 @@
import numpy as np
from geopy.distance import great_circle

-from cabby import logger
+from absl import logging

# Object for storing each evaluation tuple parsed from the input file.
EvalDataTuple = collections.namedtuple(
@@ -39,9 +39,7 @@ class Evaluator:
"""Class for evaluating geo models."""

def __init__(self):
-self.eval_logger = logger.create_logger(
-  "rvs_geo_eval.log", "rvs_geo_eval")
-self.eval_logger.info("Starting evaluation.")
+logging.info("Starting evaluation.")

def get_error_distances(self, input_file):
"""Compute error distance in meters between true and predicted coordinates.
@@ -56,11 +54,11 @@ def get_error_distances(self, input_file):
"""
error_distances = []
total_examples = 0
self.eval_logger.info(f"Opening file <= {input_file}")
logging.info(f"Opening file <= {input_file}")
for line in open(input_file):
toks = line.strip().split("\t")
if len(toks) != 7:
-self.eval_logger.warning(
+logging.warning(
"Unexpected line format: [%s]. Skipping", line)
continue
eval_tuple = EvalDataTuple(toks[0], float(toks[1]), float(toks[2]),
@@ -79,9 +77,9 @@ def compute_metrics(self, error_distances):
eval_logger: Logger object.
"""
num_examples = len(error_distances)
self.eval_logger.info(f"Started evaluation with {num_examples} samples")
logging.info(f"Started evaluation with {num_examples} samples")
if num_examples == 0:
-self.eval_logger.error("No examples to be evaluated.")
+logging.error("No examples to be evaluated.")
accuracy = float(
len(np.where(np.array(error_distances) == 0.)[0])) / num_examples

@@ -91,8 +89,8 @@ def compute_metrics(self, error_distances):
accuracy_100m = float(
len(np.where(np.array(error_distances) <= 100.)[0])) / num_examples

-accuracy_1000m = float(
-  len(np.where(np.array(error_distances) <= 1000.)[0])) / num_examples
+accuracy_250m = float(
+  len(np.where(np.array(error_distances) <= 250.)[0])) / num_examples

mean_distance, median_distance, max_error = np.mean(error_distances), np.median(
error_distances), np.max(error_distances)
@@ -104,11 +102,11 @@ def compute_metrics(self, error_distances):
# Normalized AUC by maximum error possible.
norm_auc = auc / (_MAX_LOG_HAVERSINE_DIST * (num_examples - 1))

-self.eval_logger.info(
+logging.info(
"Metrics: \nExact accuracy : [%.2f]\n10 m accuracy : [%.2f]\n100 m accuracy : [%.2f]" +
"\n1000 m accuracy : [%.2f]" + "\nmean error [%.2f], " +
"\n250 m accuracy : [%.2f]" + "\nmean error [%.2f], " +
"\nmedian error [%.2f]\nmax error [%.2f]\n" +
"AUC of error curve [%.2f]", accuracy, accuracy_10m, accuracy_100m, accuracy_1000m,
"AUC of error curve [%.2f]", accuracy, accuracy_10m, accuracy_100m, accuracy_250m,
mean_distance, median_distance, max_error, norm_auc)
-return EvalMetrics(accuracy, accuracy_10m, accuracy_100m, accuracy_1000m,
+return EvalMetrics(accuracy, accuracy_10m, accuracy_100m, accuracy_250m,
mean_distance, median_distance, max_error, norm_auc)
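
For reference, a minimal sketch of what the updated threshold metrics compute, assuming error_distances is a sequence of error values in meters (the AUC normalization is omitted since its construction sits outside this hunk):

import numpy as np

def threshold_accuracies(error_distances):
  """Fraction of examples whose error falls within each radius, in meters."""
  errors = np.array(error_distances, dtype=float)
  n = len(errors)
  return {
      'exact': float(np.sum(errors == 0.)) / n,
      '10m': float(np.sum(errors <= 10.)) / n,
      '100m': float(np.sum(errors <= 100.)) / n,
      '250m': float(np.sum(errors <= 250.)) / n,  # replaces the former 1000 m bucket
  }

# Example: three predictions with 5 m, 120 m, and 900 m error.
print(threshold_accuracies([5., 120., 900.]))
# -> exact 0.0, 10m 0.33, 100m 0.33, 250m 0.67 (rounded)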
27 changes: 12 additions & 15 deletions cabby/model/dataset_item.py
@@ -279,12 +279,15 @@ def __init__(self, region: str, data: pd.DataFrame, s2level: int,
self.graph_embed_start = self.start_cells.apply(
lambda cell: util.get_valid_graph_embed(self.graph_embed_file, str(cell)))

-data['landmarks_cells'] = data.landmarks.apply(
-  lambda l: [gutil.cellid_from_point(x, self.s2level) for x in l])
+if 'landmarks' in data:
+  data['landmarks_cells'] = data.landmarks.apply(
+    lambda l: [gutil.cellid_from_point(x, self.s2level) for x in l])

-self.graph_embed_landmarks = data.landmarks_cells.apply(
-  lambda l: [util.get_valid_graph_embed(
-    self.graph_embed_file, str(cell)) for cell in l])
+  self.graph_embed_landmarks = data.landmarks_cells.apply(
+    lambda l: [util.get_valid_graph_embed(
+      self.graph_embed_file, str(cell)) for cell in l])
+else:
+  self.graph_embed_landmarks = ['0']*data.instructions.shape[0]

self.start_embed_text_input_list = [
str(i).replace(':', f': Start at {str(s)}.') for s, i in zip(
@@ -310,6 +313,7 @@ def __init__(self, region: str, data: pd.DataFrame, s2level: int,
self.landmarks_dist_raw.append('; '.join(landmark_dist_cur))

else:
+logging.info(self.region)
self.landmark_label = ['0']*data.shape[0]
if is_dist:
logging.info(f"Calculating distances between {dist_matrix.shape[0]} cells")
@@ -561,11 +565,9 @@ def set_generation_model(self, data):
self.text_output_tokenized = output_text
self.text_input_tokenized = input_text
return
-try:
-  self.text_output_tokenized = self.text_tokenizer(
-    output_text, truncation=True, padding=True, add_special_tokens=True).input_ids
-except:
-  logging.info(f"????? {output_text}")

+self.text_output_tokenized = self.text_tokenizer(
+  output_text, truncation=True, padding=True, add_special_tokens=True).input_ids

self.text_input_tokenized = self.text_tokenizer(
input_text, truncation=True, padding='max_length', add_special_tokens=True, max_length=200)
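
Removing the try/except means a tokenization failure now raises instead of being logged and silently leaving text_output_tokenized unset. A minimal sketch of the unwrapped batch call, assuming a HuggingFace tokenizer (the t5-small checkpoint here is an assumption, not necessarily the model cabby uses):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")  # assumed checkpoint

output_text = ["Meet at the fountain.", "The goal is near the cafe."]
# Batch-encode with the same flags as set_generation_model above.
ids = tokenizer(
    output_text, truncation=True, padding=True, add_special_tokens=True).input_ids
print(ids)  # list of token-id lists, padded to a common length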
@@ -734,15 +736,10 @@ def __getitem__(self, idx: int):

text_output = torch.tensor(self.text_output_tokenized[idx])

-key = self.data.iloc[idx].key
-
-# print ("!!!!!!!!!! ", key)
-
sample = {'text': text_input, 'cellid': cellid,
'neighbor_cells': neighbor_cells, 'far_cells': far_cells,
'end_point': end_point, 'start_point': start_point, 'label': label, 'prob': prob,
'text_output': text_output, 'graph_embed_start': graph_embed_start,
-'key': key
}

return sample
2 changes: 0 additions & 2 deletions cabby/model/datasets.py
@@ -261,7 +261,6 @@ def load_data(self, data_dir: str, ds_set: str, lines: bool):

ds['start_end'] = ds.apply(self.get_fixed_point_along_route, axis=1)

-ds = shuffle(ds)
ds.reset_index(inplace=True, drop=True)
return ds

@@ -318,7 +317,6 @@ def load_data(self, data_dir: str, lines: bool):
['map', 'id', 'instructions', 'end_point', 'start_point'])
ds.drop(columns_keep, 1, inplace=True)

-ds = shuffle(ds)
ds.reset_index(inplace=True, drop=True)

dataset_size = ds.shape[0]
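
With the shuffle(ds) calls removed, load_data returns rows in on-disk order, which keeps row order reproducible across runs; one plausible motivation, given that this commit adds landmarks to the validation set, is keeping per-row annotations aligned with their examples. A minimal sketch of the resulting deterministic load, assuming a JSON-lines file and a hypothetical path:

import pandas as pd

def load_rows(path: str) -> pd.DataFrame:
  ds = pd.read_json(path, lines=True)  # one example per line
  # No shuffle: row i always corresponds to line i of the input file.
  ds.reset_index(inplace=True, drop=True)
  return ds

# Usage (hypothetical path):
# ds = load_rows("dataset/rvs/ds_valid.json")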
