# token_classification/token_classification.py

# Adapted from the Hugging Face transformers classification scripts.

import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional
from seqeval.metrics import classification_report, accuracy_score

import glob

import datasets
import numpy as np
from datasets import ClassLabel, load_metric
from datasets.io.json import JsonDatasetReader

import transformers
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
from normalizer import normalize

# Maps a data-file extension (without the leading dot) to a pair of
# (reader class used to load files of that type, extra keyword arguments
# passed to that reader's constructor).
EXT2CONFIG = {extension: (JsonDatasetReader, {}) for extension in ("jsonl", "json")}

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


@dataclass
class DataTrainingArguments:
    """Arguments describing the data used for training, evaluation and prediction.

    The input is given either as a `dataset_dir` whose files are picked up by
    their `train` / `validation` / `test` name prefix, or as explicit
    `train_file` / `validation_file` / `test_file` paths.

    Raises:
        ValueError: when both `train_file` and `validation_file` are given and
            the training file is not a json / jsonl file, or the two files do
            not share the same extension.
    """

    dataset_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": "Path to the directory containing the data files (.jsonl). "
            "File datatypes will be identified with their prefix names as follows: "
            "`train`- Training file(s) e.g. `train.jsonl`/ `train_part1.jsonl` etc. "
            "`validation`- Evaluation file(s) e.g. `validation.jsonl`/ `validation_part1.jsonl` etc. "
            "`test`- Test file(s) e.g. `test.jsonl`/ `test_part1.jsonl` etc. "
            "All files must have the same extension."
        },
    )
    max_seq_length: int = field(
        default=512,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
            "value if set."
        },
    )
    # The `*_file` help texts name json / jsonl because those are the only
    # extensions the loader table (`EXT2CONFIG`) has a reader for.
    train_file: Optional[str] = field(
        default=None, metadata={"help": "A json / jsonl file containing the training data."}
    )
    validation_file: Optional[str] = field(
        default=None, metadata={"help": "A json / jsonl file containing the validation data."}
    )
    test_file: Optional[str] = field(
        default=None, metadata={"help": "A json / jsonl file containing the test data."}
    )
    do_normalize: Optional[bool] = field(
        default=True, metadata={"help": "Normalize text before feeding to the model."}
    )
    label_all_tokens: bool = field(
        default=False,
        metadata={
            "help": "Whether to put the label for one word on all tokens of generated by that word or just on the "
            "one (in which case the other tokens will have a padding index)."
        },
    )
    tokens_key: Optional[str] = field(
        default="tokens", metadata={"help": "Key name in the input file corresponding to the tokens."}
    )
    tags_key: Optional[str] = field(
        default="tags", metadata={"help": "Key name in the input file corresponding to the token labels/tags."}
    )

    def __post_init__(self):
        """Check that explicitly given train/validation files use a loadable, matching extension."""
        # The check only applies when both files are supplied.
        if self.train_file is None or self.validation_file is None:
            return

        # Must stay in sync with the keys of the module-level `EXT2CONFIG` table.
        supported_extensions = ("json", "jsonl")

        # Explicit raises (not `assert`) so the validation survives `python -O`.
        train_extension = self.train_file.split(".")[-1]
        if train_extension not in supported_extensions:
            raise ValueError("`train_file` should be a json / jsonl file.")

        validation_extension = self.validation_file.split(".")[-1]
        if validation_extension != train_extension:
            raise ValueError(
                "`validation_file` should have the same extension json / jsonl as `train_file`."
            )


@dataclass
class ModelArguments:
    """Arguments selecting the pretrained model to fine-tune and where to cache it."""

    # No default: a value must be supplied when the dataclass is constructed.
    model_name_or_path: str = field(
        metadata=dict(help="Path to pretrained model or model identifier from huggingface.co/models"),
    )
    # Optional download/cache location; `None` leaves the choice to the library.
    cache_dir: Optional[str] = field(
        default=None,
        metadata=dict(help="Where do you want to store the pretrained models downloaded from huggingface.co"),
    )


# Dataset column that marks, for splits without gold tags, which token positions
# produce a prediction. Labelled splits carry that information in `labels`.
_PREDICTION_MASK_COLUMN = "prediction_mask"


def _file_extension(path):
    """Return the text after the last dot of the file name in `path` ("" if it has no dot)."""
    file_name = os.path.basename(path)
    return file_name.rsplit(".", 1)[-1] if "." in file_name else ""


def _resolve_data_files(data_args):
    """Work out which files to load for each split and which reader extension applies.

    Args:
        data_args: the parsed `DataTrainingArguments`.

    Returns:
        A `(data_files, selected_ext)` pair. `data_files` maps split names
        (`train` / `validation` / `test`) to a list of paths (directory mode)
        or a single path (explicit-file mode); `selected_ext` is a key of
        `EXT2CONFIG`.

    Raises:
        ValueError: if no usable input file is found or its extension has no reader.
    """
    if data_args.dataset_dir is None:
        explicit_files = (
            ("train", data_args.train_file),
            ("validation", data_args.validation_file),
            ("test", data_args.test_file),
        )
        data_files = {split: path for split, path in explicit_files if path is not None}
        if not data_files:
            raise ValueError("No valid input file found.")

        # The first provided file decides which reader is used for all of them.
        selected_ext = _file_extension(next(iter(data_files.values())))
        if selected_ext not in EXT2CONFIG:
            raise ValueError(
                f"Unsupported file extension `{selected_ext}`; expected one of: {', '.join(sorted(EXT2CONFIG))}."
            )
        return data_files, selected_ext

    # Sorted so that the outcome does not depend on the directory listing order.
    candidates = sorted(glob.glob(os.path.join(data_args.dataset_dir, "*")))
    # Only extensions with a registered reader count, so unrelated files in the
    # directory cannot win the vote below.
    extensions = [ext for ext in map(_file_extension, candidates) if ext in EXT2CONFIG]
    if not extensions:
        raise ValueError("The `dataset_dir` doesnt have any valid file.")

    # Most frequent supported extension; ties are broken alphabetically.
    selected_ext = max(sorted(set(extensions)), key=extensions.count)

    data_files = {}
    for search_prefix in ("train", "validation", "test"):
        pattern = os.path.join(data_args.dataset_dir, f"{search_prefix}*.{selected_ext}")
        found_files = sorted(glob.glob(pattern))
        if found_files:
            data_files[search_prefix] = found_files

    if not data_files:
        raise ValueError(
            f"No `train`/`validation`/`test` file with extension `{selected_ext}` found in the `dataset_dir`."
        )
    return data_files, selected_ext


def main():
    """Fine-tune, evaluate and/or run prediction with a token classification model.

    Arguments are parsed from the command line, or from a JSON file when that
    file is the only command-line argument. The data files are loaded, optionally
    normalized, tokenized and aligned with their word-level tags; then, depending
    on `--do_train` / `--do_eval` / `--do_predict`, the model is trained,
    evaluated, and its test predictions are written to `predictions.txt` inside
    the output directory.

    Raises:
        ValueError: on a non-empty output directory without checkpoint, missing
            or unsupported input files, missing token/tag columns, or a
            requested stage whose dataset split is absent.
    """
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # A single `.json` argument is treated as a file holding all the arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Load the raw data with the reader registered for the detected extension.
    data_files, selected_ext = _resolve_data_files(data_args)
    reader_cls, reader_kwargs = EXT2CONFIG[selected_ext]
    raw_datasets = reader_cls(data_files, **reader_kwargs).read()

    # Validate the expected columns and drop everything else.
    wanted_columns = {data_args.tokens_key, data_args.tags_key}
    for data_type, ds in list(raw_datasets.items()):
        if data_args.tokens_key not in ds.features:
            raise ValueError(f"Input files doesnt have the `{data_args.tokens_key}` key")
        # The test split may come without gold tags.
        if data_type != "test" and data_args.tags_key not in ds.features:
            raise ValueError(f"Input files doesnt have the `{data_args.tags_key}` key")

        ignored_columns = sorted(set(ds.column_names) - wanted_columns)
        raw_datasets[data_type] = ds.remove_columns(ignored_columns)

    config = AutoConfig.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # A checkpoint produced by this script is flagged as "finetuned" and already
    # carries the label mapping; otherwise the mapping is built from the training tags.
    task_params = config.task_specific_params
    label_to_id = config.label2id if task_params and task_params.get("finetuned", False) else None
    if label_to_id is None:
        if "train" not in raw_datasets:
            raise ValueError("A train dataset is required to build the label list for a model that is not finetuned.")

        label_list = sorted({tag for tags in raw_datasets["train"][data_args.tags_key] for tag in tags})
        label_to_id = {label: index for index, label in enumerate(label_list)}
        config.label2id = label_to_id
        config.id2label = {index: label for label, index in label_to_id.items()}
        config.task_specific_params = {"finetuned": True}
    else:
        # `label_list` is indexed by predicted class id, so order it by id rather
        # than by whatever order the mapping happens to be stored in.
        label_list = sorted(label_to_id, key=label_to_id.get)

    tokenizer_kwargs = {"add_prefix_space": True} if config.model_type in {"gpt2", "roberta"} else {}
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        **tokenizer_kwargs
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir
    )

    # Padding strategy: either pad everything to `max_length` now, or pad later,
    # dynamically at batch creation, to the max sequence length in each batch.
    padding = "max_length" if data_args.pad_to_max_length else False

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    if data_args.do_normalize:
        def normalize_example(example):
            """Normalize every token, keeping the original when normalization yields an empty string."""
            example[data_args.tokens_key] = [
                normalize(token) or token for token in example[data_args.tokens_key]
            ]
            return example

        raw_datasets = raw_datasets.map(
            normalize_example,
            desc="Running normalization on dataset",
            load_from_cache_file=not data_args.overwrite_cache
        )

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        """Tokenize a batch of pre-split words and align the word-level tags to sub-tokens.

        Special tokens are always ignored; continuation sub-tokens of a word are
        ignored unless `label_all_tokens` is set. Batches with tags get a `labels`
        column (ignored positions hold -100); batches without tags get a boolean
        mask column instead, so their predictions can still be aligned to words.
        """
        tokenized_inputs = tokenizer(
            examples[data_args.tokens_key],
            padding=padding,
            truncation=True,
            max_length=max_seq_length,
            is_split_into_words=True,
        )
        tags_batch = examples[data_args.tags_key] if data_args.tags_key in examples else None

        aligned = []
        for row in range(len(examples[data_args.tokens_key])):
            word_ids = tokenized_inputs.word_ids(batch_index=row)

            keep = []
            previous_word_idx = None
            for word_idx in word_ids:
                keep.append(
                    word_idx is not None
                    and (data_args.label_all_tokens or word_idx != previous_word_idx)
                )
                previous_word_idx = word_idx

            if tags_batch is None:
                aligned.append(keep)
            else:
                tags = tags_batch[row]
                aligned.append(
                    [label_to_id[tags[word_idx]] if kept else -100 for word_idx, kept in zip(word_ids, keep)]
                )

        tokenized_inputs["labels" if tags_batch is not None else _PREDICTION_MASK_COLUMN] = aligned
        return tokenized_inputs

    with training_args.main_process_first(desc="dataset map pre-processing"):
        raw_datasets = raw_datasets.map(
            tokenize_and_align_labels,
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )

    def limit_samples(dataset, max_samples):
        """Return the first `max_samples` rows of `dataset`, or all of it when no smaller cap applies."""
        if max_samples is None:
            return dataset
        # Clamp so that a cap larger than the dataset does not raise.
        return dataset.select(range(min(max_samples, len(dataset))))

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = limit_samples(raw_datasets["train"], data_args.max_train_samples)

    if training_args.do_eval:
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = limit_samples(raw_datasets["validation"], data_args.max_eval_samples)

    if training_args.do_predict or data_args.test_file is not None:
        if "test" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_dataset = limit_samples(raw_datasets["test"], data_args.max_predict_samples)

    # Log a few random samples from the training set:
    if training_args.do_train:
        # Never ask for more samples than the dataset holds.
        for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    def compute_metrics(p: EvalPrediction):
        """Turn raw predictions into flat per-entity-type and overall seqeval metrics."""
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Keep only positions that carry a real label (ignored positions hold -100).
        true_predictions = []
        true_labels = []
        for prediction_row, label_row in zip(predictions, labels):
            kept = [(pred_id, gold_id) for pred_id, gold_id in zip(prediction_row, label_row) if gold_id != -100]
            true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
            true_labels.append([label_list[gold_id] for _, gold_id in kept])

        report = classification_report(
            y_true=true_labels,
            y_pred=true_predictions,
            output_dict=True
        )

        # Flatten the nested report into `<type>_<metric>` keys, without spaces.
        final_results = {}
        for type_name, score in report.items():
            prefix = type_name.replace(" ", "_")
            final_results[f"{prefix}_precision"] = score["precision"]
            final_results[f"{prefix}_recall"] = score["recall"]
            final_results[f"{prefix}_f1"] = score["f1-score"]
            final_results[f"{prefix}_number"] = score["support"]
        final_results["overall_accuracy"] = accuracy_score(y_true=true_labels, y_pred=true_predictions)
        return final_results

    data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # An explicitly requested checkpoint wins over an auto-detected one.
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        # `train_dataset` is already truncated to `max_train_samples` if that was set.
        metrics["train_samples"] = len(train_dataset)

        trainer.save_model()

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate(eval_dataset=eval_dataset)
        # `eval_dataset` is already truncated to `max_eval_samples` if that was set.
        metrics["eval_samples"] = len(eval_dataset)

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")

        predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens). Without gold labels, fall back to
        # the mask stored at tokenization time.
        if labels is not None:
            keep_masks = [[label_id != -100 for label_id in label_row] for label_row in labels]
        else:
            keep_masks = predict_dataset[_PREDICTION_MASK_COLUMN]
        true_predictions = [
            [label_list[pred_id] for pred_id, kept in zip(prediction_row, mask) if kept]
            for prediction_row, mask in zip(predictions, keep_masks)
        ]

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

        # Save predictions, one space-separated line of labels per example.
        output_predictions_file = os.path.join(training_args.output_dir, "predictions.txt")
        if trainer.is_world_process_zero():
            # Explicit encoding: label names are not guaranteed to be ASCII.
            with open(output_predictions_file, "w", encoding="utf-8") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")

    
def _mp_fn(index):
    """Run `main()`; the `index` argument is accepted but not used."""
    # For xla_spawn (TPUs)
    main()


# Run the full pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()