# rep_gen.py

# Copyright (C) 2020 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

from dataset_builder import DatasetBuilder
from model_builder import ModelBuilder
import numpy as np
import os


class RepresentationGenerator:
    """A representation (embeddings) generator for visualization of characters.

    When embeddings and labels are written to files, embeddings are written to
    'out_file'_vec.tsv and labels are written to 'out_file'_meta.tsv, both
    inside 'out_dir'. You can see a 3D visualization of the embeddings in a
    browser using the TensorFlow embedding projector. Go to
    https://projector.tensorflow.org/ and click the 'Load' button on the
    left-hand side. Load 'out_file'_vec.tsv as vectors and
    'out_file'_meta.tsv as metadata to visualize embeddings.

    Initialization:
        >>> rg = RepresentationGenerator(config_path='configs/config.ini',
        ...                              out_dir='embeddings/')

    Configurations are all set in the .ini file. Point to a new config file to
    change configurations (this rebuilds the dataset builder and the model):
        >>> rg.config_path = 'configs/new_config.ini'

    Get representations for all images in a directory (IMPORTANT! Expect all
    images to be generated by VisualizationGenerator in vis_gen. Otherwise,
    make sure file names are in format 'U+XXXX_*' to use the 'char_as_label'
    feature when writing):
        >>> codepoints, embeddings = rg.get_embeddings(img_dir='test_imgs')

    Write codepoints and embeddings to file:
        >>> rg.write_embeddings_from_list(codepoints=codepoints,
        ...                               embeddings=embeddings,
        ...                               out_file='test',
        ...                               char_as_label=True)

    Write labels and embeddings to file directly from an image directory:
        >>> rg.write_embeddings_from_image(img_dir='test_imgs',
        ...                                out_file='test')
    """

    def __init__(self, config_path='configs/sample_config.ini',
                 out_dir="embeddings"):
        """Set up the dataset builder and the encoder from a config file.

        Args:
            config_path: Str, path to config (.ini) file (default
                "configs/sample_config.ini").
            out_dir: Str, relative path of the output directory (default
                "embeddings").

        Raises:
            ValueError: if config_path does not point to an existing file.
            Errors raised by DatasetBuilder or ModelBuilder while reading the
            config propagate unchanged (presumably ValueError for an unknown
            model name or an invalid checkpoint directory; confirm against
            those classes).
        """
        self._dataset_builder = None
        self._model_builder = None
        self._model = None
        # Both assignments go through the property setters below; assigning
        # config_path builds the dataset builder and loads the encoder.
        self.config_path = config_path
        self.out_dir = out_dir

    @property
    def config_path(self):
        """
        Returns:
            self._config_path: Str, path to config file (.ini file).
        """
        return self._config_path

    @config_path.setter
    def config_path(self, config_path):
        """Load config file (.ini file) and get dataset builder and neural
        network model. Set private attributes self._dataset_builder,
        self._model_builder, self._model and self._config_path. Note that this
        function does not parse the config file itself. It simply hands the
        path to the dataset builder and the model builder.

        Args:
            config_path: Str, path to config (.ini) file.

        Raises:
            ValueError: if config_path does not point to an existing file.
        """
        if not os.path.isfile(config_path):
            raise ValueError("Specified config file does not exist.")

        # Get dataset builder based on config file
        self._dataset_builder = DatasetBuilder(config_path=config_path,
                                               one_hot=False)
        # Get model builder and the encoder used to compute embeddings
        self._model_builder = ModelBuilder(config_path=config_path)
        self._model = self._model_builder.get_encoder()

        # Only record the path once everything above succeeded
        self._config_path = config_path

    @property
    def out_dir(self):
        """
        Returns:
            self._out_dir: Str, path to output directory.
        """
        return self._out_dir

    @out_dir.setter
    def out_dir(self, out_dir):
        """
        Args:
             out_dir: Str, relative path of the output directory.
        """
        self._out_dir = out_dir

    def get_embeddings(self, img_dir):
        """For the image files in 'img_dir', return their embeddings.

        Args:
            img_dir: Str, relative path to directory where all character images
                are stored.

        Returns:
            codepoints: List of Str, the image file names with the file
                extension removed. For images generated by vis_gen these are
                in format 'CODEPOINT_FONTNAME[_FONTSTYLE]_ANTIALIAS'.
            embeddings: List of embeddings. Each element is a representation of
                a character.
        """
        # Get dataset with filename as label
        dataset = self._dataset_builder.get_filename_dataset(img_dir)

        # Get unicode code points and their corresponding embeddings
        codepoints = []
        embeddings = []
        print('Generating embeddings...')
        for count, (img, filename) in enumerate(dataset, start=1):
            if count % 100 == 0:
                print("Getting embedding #{}.".format(count))

            # Decode the filename Tensor into a string (presumably a batch of
            # one element; only the first entry is used).
            filename_str = filename.numpy()[0].decode('utf-8')
            # Strip only the extension; a font name or style may itself
            # contain dots, so splitting on the first '.' would truncate it.
            codepoints.append(os.path.splitext(filename_str)[0])

            # Get embedding of the (first) image in this element
            embeddings.append(self._model.predict(img)[0])

        return codepoints, embeddings

    def write_embeddings_from_image(self, img_dir, out_file,
                                    char_as_label=True):
        """Get embeddings and write embeddings and labels to .tsv files. This
        function will write to two .tsv files: 'out_file'_vec.tsv and
        'out_file'_meta.tsv. Entries in 'out_file'_vec.tsv are separated by
        newline. Elements in each embedding are separated by tab. Entries in
        'out_file'_meta.tsv are separated by newline.

        Args:
            img_dir: Str, relative path to directory where all character images
                are stored.
            out_file: Str, name of the output file intended to write to.
            char_as_label: Bool, whether to use the character as label.
                Otherwise, use the image file name without extension.
        """
        # Get model predictions and unicode code points
        codepoints, embeddings = self.get_embeddings(img_dir=img_dir)

        # Write code points to '_meta.tsv' and embeddings to '_vec.tsv'
        self.write_embeddings_from_list(codepoints, embeddings, out_file,
                                        char_as_label)

    @staticmethod
    def _codepoints_to_chars(codepoints):
        """Convert labels starting with 'U+XXXX' to the characters they name.

        Args:
            codepoints: List of Str in format 'U+XXXX' optionally followed by
                '_FONTNAME[_FONTSTYLE]_ANTIALIAS'.

        Returns:
            List of single-character Str, one per input label.
        """
        chars = []
        for label in codepoints:
            # 'CODEPOINT_FONTNAME[_FONTSTYLE]_ANTIALIAS' -> 'CODEPOINT'
            codepoint = label.split('_')[0]
            # 'U+XXXX' -> char
            chars.append(chr(int('0x' + codepoint[2:], 16)))
        return chars

    def write_embeddings_from_list(self, codepoints, embeddings, out_file,
                                   char_as_label=True):
        """Write labels and embeddings to file.

        Embeddings go to 'out_file'_vec.tsv (one tab-separated row per entry)
        and labels go to 'out_file'_meta.tsv (one UTF-8 encoded label per
        line). Both files are created inside self.out_dir, which is created if
        it does not exist.

        Args:
            codepoints: List of Str. When char_as_label is True each element
                must start with 'U+XXXX', optionally followed by
                '_FONTNAME[_FONTSTYLE]_ANTIALIAS'.
            embeddings: List of embeddings (numeric sequences of equal length),
                one per entry of codepoints.
            out_file: Str, name of the output file intended to write to.
            char_as_label: Bool, whether to use character as label. Otherwise,
                the entries of codepoints are written unchanged.

        Raises:
            ValueError: if codepoints and embeddings do not have the same
                number of entries, or if char_as_label is True and an entry
                cannot be converted to a character.
        """
        # Throw exception if codepoint array and embedding array do not have
        # the same number of elements
        if len(codepoints) != len(embeddings):
            raise ValueError('Expect array codepoints and embeddings to have '
                             'the same number of elements.')

        # Change Unicode code point to character if specified. This is done
        # before anything is written so that malformed labels cannot leave a
        # vector file behind without its matching metadata file.
        labels = codepoints
        if char_as_label:
            try:
                labels = self._codepoints_to_chars(codepoints)
            except (AttributeError, TypeError, ValueError, OverflowError):
                print('All entries of codepoints array must be in format: '
                      'CODEPOINT_FONTNAME[_FONTSTYLE]_ANTIALIAS. Example: '
                      'U+4eba_Noto Sans CJK SC_Default.')
                raise

        # Get absolute directory path, create new folder if needed.
        out_dir_abs = os.path.abspath(self.out_dir)
        os.makedirs(out_dir_abs, exist_ok=True)

        # Get absolute path to output files
        out_file_abs = os.path.join(out_dir_abs, out_file)
        out_file_vec_abs = out_file_abs + '_vec.tsv'
        out_file_meta_abs = out_file_abs + '_meta.tsv'

        # Write embeddings to file
        print("Writing embeddings to file {}...".format(out_file_vec_abs))
        np.savetxt(out_file_vec_abs, embeddings, delimiter='\t')
        print('Successfully written to file {}.'.format(out_file_vec_abs))

        # Write labels. Labels may be arbitrary Unicode characters, so the
        # encoding is fixed to UTF-8 instead of the platform default.
        print("Writing labels to file {}...".format(out_file_meta_abs))
        with open(out_file_meta_abs, "w", encoding='utf-8') as f_out:
            f_out.writelines(label + '\n' for label in labels)
        print('Successfully written to file {}.'.format(out_file_meta_abs))