diff --git a/pytext/metric_reporters/channel.py b/pytext/metric_reporters/channel.py
index bd9ad2eae..ed3f1a49f 100644
--- a/pytext/metric_reporters/channel.py
+++ b/pytext/metric_reporters/channel.py
@@ -5,9 +5,7 @@
 import traceback
 from typing import Tuple
 
-import numpy as np
 import torch
-from numpy import linalg as LA
 from pytext.common.constants import Stage
 from pytext.utils.file_io import PathManager
 from torch.utils.tensorboard import SummaryWriter
@@ -204,8 +202,6 @@ def report(
         meta,
         model,
         optimizer,
-        log_gradient,
-        gradients,
         *args,
     ):
         """
@@ -270,32 +266,33 @@ def stage2prefix(stage: Stage):
                 self.summary_writer.add_scalar(
                     f"optimizer.lr.param_group.{idx}", param_group["lr"], epoch
                 )
-        if log_gradient and gradients:
-            for key in gradients:
-                if len(gradients[key]):
-                    sum_gradient = sum(gradients[key])
-                    avg_gradient = sum_gradient / len(gradients[key])
-                    grad_norms = np.array([LA.norm(g) for g in gradients[key]])
-                    self.log_vector(key + "_avg_gradients", avg_gradient, epoch)
-                    self.log_vector(key + "_sum_gradients", sum_gradient, epoch)
-                    self.log_vector(key + "_l2norm_gradients", grad_norms, epoch)
-
         for key, val in model.named_parameters():
             if val is not None and len(val) > 0 and not (val == 0).all():
                 limit = 9.9e19
+                grad = val.grad
                 val = torch.clamp(val.float(), -limit, limit)
-                self.log_vector(key, val, epoch)
-
-    def log_vector(self, key, val, epoch):
-        if len(val) > 0 and not (val == 0).all():
-            try:
-                self.summary_writer.add_histogram(key, val, epoch)
-            except Exception:
-                print(
-                    f"WARNING: Param {key} " "cannot be sent to Tensorboard",
-                    file=sys.stderr,
-                )
-                traceback.print_exc(file=sys.stderr)
+                try:
+                    self.summary_writer.add_histogram(key, val, epoch)
+                except Exception:
+                    print(
+                        f"WARNING: Param {key} cannot be sent to Tensorboard",
+                        file=sys.stderr,
+                    )
+                    traceback.print_exc(file=sys.stderr)
+
+                if grad is not None and len(grad) > 0 and not (grad == 0).all():
+                    grad = torch.clamp(grad.float(), -limit, limit)
+                    try:
+                        self.summary_writer.add_histogram(
+                            key + "_gradients", grad, epoch
+                        )
+                    except Exception:
+                        print(
+                            f"WARNING: Grad for param {key} "
+                            "cannot be sent to Tensorboard",
+                            file=sys.stderr,
+                        )
+                        traceback.print_exc(file=sys.stderr)
 
     def add_texts(self, tag, metrics):
         """
diff --git a/pytext/metric_reporters/metric_reporter.py b/pytext/metric_reporters/metric_reporter.py
index 8d951db8a..5060b7cb4 100644
--- a/pytext/metric_reporters/metric_reporter.py
+++ b/pytext/metric_reporters/metric_reporter.py
@@ -45,7 +45,6 @@ class MetricReporter(Component):
 
     __COMPONENT_TYPE__ = ComponentType.METRIC_REPORTER
     lower_is_better: bool = False
-    log_gradient: bool = False
 
     class Config(ConfigBase):
         output_path: str = "/tmp/test_out.txt"
@@ -53,8 +52,7 @@
         #: Useful for KD training, column names that used by student but not teacher.
        student_column_names: List[str] = []
 
-    def __init__(self, channels, log_gradient=False, pep_format=False) -> None:
-        self.log_gradient = log_gradient
+    def __init__(self, channels, pep_format=False) -> None:
         self._reset()
         self.channels = channels
         self.pep_format = pep_format
@@ -68,7 +66,6 @@ def _reset(self):
         self.all_scores: List = []
         self.n_batches = 0
         self.batch_size: List = []
-        self.all_gradients: Dict[str, List[List]] = {}
 
     def _reset_realtime(self):
         self.realtime_meters: Dict = {}
@@ -114,16 +111,6 @@ def add_batch_stats(
             self.realtime_meters["tps"].update(context[DatasetFieldName.NUM_TOKENS])
             self.realtime_meters["ups"].update(1)
 
-    def add_gradients(self, model):
-        if self.log_gradient:
-            for key, value in model.named_parameters():
-                grad = value.grad
-                if grad is not None and len(grad) > 0 and not (grad == 0).all():
-                    if key in self.all_gradients:
-                        self.all_gradients[key].append(grad.cpu().numpy())
-                    else:
-                        self.all_gradients[key] = [grad.cpu().numpy()]
-
     def aggregate_preds(self, batch_preds, batch_context=None):
         self.aggregate_data(self.all_preds, batch_preds)
 
@@ -267,8 +254,6 @@ def report_metric(
                     self.get_meta(),
                     model,
                     optimizer,
-                    self.log_gradient,
-                    self.get_gradients(),
                 )
 
         if reset:
@@ -322,9 +307,6 @@ def compare_metric(self, new_metric, old_metric):
             return False
         return (new < old) == self.lower_is_better
 
-    def get_gradients(self):
-        return self.all_gradients
-
 
 class PureLossMetricReporter(MetricReporter):
     lower_is_better = True
diff --git a/pytext/metric_reporters/seq2seq_compositional.py b/pytext/metric_reporters/seq2seq_compositional.py
index b35290c2f..98e836fb9 100644
--- a/pytext/metric_reporters/seq2seq_compositional.py
+++ b/pytext/metric_reporters/seq2seq_compositional.py
@@ -67,8 +67,8 @@ def gen_content(self, metrics, loss, preds, targets, scores, context):
 
 
 class Seq2SeqCompositionalMetricReporter(Seq2SeqMetricReporter):
-    def __init__(self, channels, log_gradient, tensorizers, accept_flat_intents_slots):
-        super().__init__(channels, log_gradient, tensorizers)
+    def __init__(self, channels, tensorizers, accept_flat_intents_slots):
+        super().__init__(channels, tensorizers)
         self.accept_flat_intents_slots = accept_flat_intents_slots
 
     class Config(MetricReporter.Config):
diff --git a/pytext/metric_reporters/seq2seq_metric_reporter.py b/pytext/metric_reporters/seq2seq_metric_reporter.py
index 1714f84f0..c4ddc821d 100644
--- a/pytext/metric_reporters/seq2seq_metric_reporter.py
+++ b/pytext/metric_reporters/seq2seq_metric_reporter.py
@@ -43,8 +43,8 @@ class Seq2SeqMetricReporter(MetricReporter):
     class Config(MetricReporter.Config):
         pass
 
-    def __init__(self, channels, log_gradient, tensorizers):
-        super().__init__(channels, log_gradient)
+    def __init__(self, channels, tensorizers):
+        super().__init__(channels)
         self.tensorizers = tensorizers
 
     def _reset(self):
diff --git a/pytext/metric_reporters/tests/tensorboard_test.py b/pytext/metric_reporters/tests/tensorboard_test.py
index 361594e06..6e73368d4 100644
--- a/pytext/metric_reporters/tests/tensorboard_test.py
+++ b/pytext/metric_reporters/tests/tensorboard_test.py
@@ -47,8 +47,6 @@ def test_report_metrics_with_nan(self):
             meta={},
             model=model,
             optimizer=optimizer,
-            log_gradient=False,
-            gradients={},
         )
 
     def test_report_metrics_to_others(self):
@@ -73,6 +71,4 @@ def test_report_metrics_to_others(self):
             meta={},
             model=model,
             optimizer=optimizer,
-            log_gradient=False,
-            gradients={},
         )
diff --git a/pytext/trainers/trainer.py b/pytext/trainers/trainer.py
index 94b884214..6bbc60177 100644
--- a/pytext/trainers/trainer.py
+++ b/pytext/trainers/trainer.py
@@ -619,9 +619,6 @@ def run_step(
             )
             # update gradients after len(samples) forward & backward
             self.optimizer_step(state)
-            with timing.time("add gradients"):
-                if report_metric and state.stage == Stage.TRAIN:
-                    metric_reporter.add_gradients(state.model)
             self.sparsification_step(state)
 
 
@@ -676,9 +673,6 @@ def run_step(
                 metric_reporter.report_realtime_metric(state.stage)
         # update gradients after #len(samples) forward & backward
         self.optimizer_step(state)
-        with timing.time("add gradients"):
-            if report_metric and state.stage == Stage.TRAIN:
-                metric_reporter.add_gradients(state.model)
        self.sparsification_step(state)
 
     def _prepare_scheduler(self, training_batches, scheduler=None):
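Context for the change above: the metric reporter no longer accumulates gradient copies (add_gradients, get_gradients, and the log_gradient flag are removed); the TensorBoard channel now reads each parameter's .grad directly when metrics are reported. The snippet below is a minimal, hypothetical sketch of that pattern outside PyText, using a toy nn.Linear model, SGD, and a default SummaryWriter; it is illustrative only and not part of the patch.

# Illustrative sketch (not part of the patch): after backward(), param.grad already
# holds the step's gradients, so a reporting step can histogram parameters and
# gradients directly instead of collecting gradient copies per batch.
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter

model = nn.Linear(4, 2)          # toy model, purely for illustration
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
writer = SummaryWriter()

for epoch in range(3):
    optimizer.zero_grad()
    loss = model(torch.randn(8, 4)).sum()
    loss.backward()              # populates param.grad
    optimizer.step()             # param.grad is still set after the step
    for name, param in model.named_parameters():
        if param.grad is not None and not (param.grad == 0).all():
            writer.add_histogram(name, param.detach(), epoch)
            writer.add_histogram(name + "_gradients", param.grad, epoch)
writer.close()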