# clip_score

#!/usr/bin/env python -u
# Author: Mehdi Cherti
# Thanks to @lucidrains for the DALL-E PyTorch repo <https://github.com/lucidrains/DALLE-pytorch>
# this code is based on <https://github.com/lucidrains/DALLE-pytorch/blob/main/generate.py>
import joblib
import json
import argparse
from pathlib import Path
from tqdm import tqdm
import random
import numpy as np
# torch

import torch

from einops import repeat

# vision imports

from PIL import Image
from torchvision.utils import make_grid, save_image

# clip
import clip

# dalle related classes and utils

from dalle_pytorch import DiscreteVAE, OpenAIDiscreteVAE, VQGanVAE1024, DALLE
from dalle_pytorch.tokenizer import tokenizer, HugTokenizer, YttmTokenizer, ChineseTokenizer
from loader import TextImageDataset

CLIP_THRESH_DEFAULT = 25

# shape for real:  (1, 1, nb_captions)
# shape for fakes: (num_generate, nb_captions, nb_captions)
# == explanation ==
# - for each real image, we have `nb_captions` different captions
# - for each caption, we generate `num_generate` images
# - for each generated image and each caption, we compute the CLIP score
# - in the shape of fakes, the second dim indexes the caption i an image was generated from, and the third dim indexes the caption j it is scored against. So we also compute the CLIP score across captions (e.g., the CLIP score of an image generated from caption i with caption j)
# - real is reshaped to have the same number of dims as fakes, so that the metrics can be computed on both together
# In summary:
# - fakes[k,i,j] is the CLIP score of the k-th image generated from caption i, scored against caption j (each k is an independent sample)
# - real[:,:,j] is the CLIP score of the real image with caption j

def CLIP_score_real(real, fakes):
    """Return the mean CLIP score of the real image over its captions.

    `fakes` is accepted only so that every metric shares the same
    `(real, fakes)` signature; it is not used here.
    """
    mean_real_score = real.mean()
    return mean_real_score

def CLIP_atleast(real, fakes, th=CLIP_THRESH_DEFAULT):
    """Return the fraction of entries where at least one sample beats the threshold.

    For every position along the remaining axes, checks whether any of the
    samples along axis 0 of `fakes` has a CLIP score strictly greater than
    the threshold, then averages those booleans.

    The threshold given on the command line (`args.clip_thresh`) takes
    precedence over `th`. `real` is unused; it is present only so that all
    metrics share the `(real, fakes)` signature.
    """
    # Compare against None rather than relying on truthiness, so that an
    # explicit `--clip_thresh 0` is honoured instead of falling back to `th`.
    if args.clip_thresh is not None:
        th = args.clip_thresh
    return np.any(fakes > th, axis=0).mean()

def CLIP_score(real, fakes):
    """Return the mean CLIP score over every entry of `fakes`.

    `real` is unused; it is present only so that all metrics share the
    `(real, fakes)` signature.
    """
    mean_fake_score = fakes.mean()
    return mean_fake_score

def CLIP_score_top1(real, fakes):
    """Return the mean of the best CLIP score across samples.

    Takes the maximum of `fakes` along axis 0 (the independent samples) and
    averages what remains. `real` is unused.
    """
    best_per_entry = fakes.max(axis=0)
    return best_per_entry.mean()

def CLIP_score_relative(real, fakes):
    """Return the mean ratio between average fake scores and real scores.

    `fakes` is averaged along axis 0 (keeping that axis so the result lines up
    with `real`), divided elementwise by `real`, and the ratios are averaged.
    """
    mean_over_samples = fakes.mean(axis=0, keepdims=True)
    ratios = mean_over_samples / real
    return ratios.mean()

def CLIP_score_relative_top1(real, fakes):
    """Return the mean ratio between the best fake scores and real scores.

    `fakes` is reduced with a maximum along axis 0 (keeping that axis so the
    result lines up with `real`), divided elementwise by `real`, and the
    ratios are averaged.
    """
    best_over_samples = fakes.max(axis=0, keepdims=True)
    ratios = best_over_samples / real
    return ratios.mean()

# Metric functions evaluated on every example. Each one takes `(real, fakes)`
# and its `__name__` is used as the key under which the score is reported.
metrics = [
    CLIP_score_real,
    CLIP_score,
    CLIP_score_top1,
    CLIP_score_relative,
    CLIP_score_relative_top1,
    CLIP_atleast,
]
# argument parsing

parser = argparse.ArgumentParser()

# --- inputs / outputs ---
parser.add_argument(
    '--dalle_path', type=str, required=True,
    help='path to your trained DALL-E',
)
parser.add_argument(
    '--out_file', type=str, default='clip_score.json',
    help='Output file',
)
parser.add_argument(
    '--image_text_folder', type=str, required=True,
    help='path to your folder of images and text for learning the DALL-E',
)

# --- evaluation settings ---
parser.add_argument(
    '--num_generate', type=int, default=128,
    help='number of images to generate per caption',
)
parser.add_argument(
    '--clip_thresh', type=float, default=CLIP_THRESH_DEFAULT,
    help='CLIP threshold for computing the "atleast" metric',
)
parser.add_argument(
    '--nb_examples', type=int, default=None,
    help='number of real images to consider for computing CLIP score (per worker if horovod is used)',
)
parser.add_argument(
    '--num_captions_per_image', type=int, default=None,
    help='number of captions to retain per real image. if None, ignored. If provided, only select the captions with best CLIP score w.r.t real image',
)
parser.add_argument(
    '--batch_size', type=int, default=4,
    help='batch size',
)
parser.add_argument(
    '--seed', type=int, default=42,
    help='seed',
)
parser.add_argument(
    '--top_k', type=float, default=0.9,
    help='top k filter threshold',
)
parser.add_argument(
    '--outputs_dir', type=str, default='./outputs',
    help='output directory',
)

# --- tokenizer / VAE selection ---
parser.add_argument(
    '--bpe_path', type=str,
    help='path to your huggingface BPE json file',
)
parser.add_argument('--hug', action='store_true')
parser.add_argument('--chinese', action='store_true')
parser.add_argument('--taming', action='store_true')

# --- execution mode ---
parser.add_argument(
    '--horovod', action='store_true',
    help='whether to use horovod for computing the metrics in a distributed manner',
)
parser.add_argument(
    '--dump', action='store_true',
    help='whether to dump all the generated images in the output directory',
)

args = parser.parse_args()

# helper fns

def exists(val):
    """Return True for any value other than None (falsy values still count)."""
    if val is None:
        return False
    return True

# tokenizer

# Override the default tokenizer imported from dalle_pytorch when a BPE file
# or the Chinese tokenizer is requested; otherwise the imported one is kept.
if args.bpe_path is not None:
    if args.hug:
        tokenizer = HugTokenizer(args.bpe_path)
    else:
        tokenizer = YttmTokenizer(args.bpe_path)
elif args.chinese:
    tokenizer = ChineseTokenizer()

# load DALL-E

dalle_path = Path(args.dalle_path)

# Explicit check instead of `assert`: assertions are removed when Python runs
# with -O, which would let a missing checkpoint reach torch.load.
if not dalle_path.exists():
    raise FileNotFoundError('trained DALL-E must exist')

# NOTE: torch.load unpickles the file; only load checkpoints you trust.
load_obj = torch.load(str(dalle_path), map_location='cpu')
dalle_params = load_obj.pop('hparams')
vae_params = load_obj.pop('vae_params')
weights = load_obj.pop('weights')

dalle_params.pop('vae', None)  # cleanup later

# Choose the VAE: the one described in the checkpoint if present, otherwise
# the OpenAI VAE, or the VQGAN one when --taming is given.
if vae_params is not None:
    vae = DiscreteVAE(**vae_params)
elif not args.taming:
    vae = OpenAIDiscreteVAE()
else:
    vae = VQGanVAE1024()

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using", device)
dalle = DALLE(vae=vae, **dalle_params).to(device)

dalle.load_state_dict(weights)

# generate images

image_size = vae.image_size

# Dataset items are unpacked further down as (text, text_str, image).
ds = TextImageDataset(
    args.image_text_folder,
    text_len=dalle.text_seq_len,
    image_size=image_size,
    tokenizer=tokenizer,
    shuffle=False,
    seed=args.seed,
)

# Per-channel mean / std used to normalise generated images before they are
# fed to CLIP, shaped (1, 3, 1, 1) so they broadcast over a batch of images.
clip_mean = torch.Tensor([0.48145466, 0.4578275, 0.40821073]).view(1, 3, 1, 1).to(device)
clip_std = torch.Tensor([0.26862954, 0.26130258, 0.27577711]).view(1, 3, 1, 1).to(device)

clip_model, clip_preprocess = clip.load("ViT-B/32", device=device, jit=False)

# Real images go through CLIP's own preprocessing instead of the dataset default.
ds.image_transform = clip_preprocess

# Per-example score arrays, appended to by the evaluation loop.
real_scores = []
fake_scores = []
if args.horovod:
    import horovod.torch as hvd
    hvd.init()
    print("Using horovod to distribute the score computation")
    print("Number of workers:", hvd.size())
    # Round-robin sharding: this worker keeps the dataset indices whose
    # remainder modulo the number of workers equals its rank.
    indices = np.arange(len(ds))
    indices = indices[(indices % hvd.size()) == hvd.rank()]
    ds = torch.utils.data.Subset(ds, indices)
    # Only rank 0 shows progress and writes results.
    display = (hvd.rank() == 0)
else:
    display = True

# Clamp to the (possibly sharded) dataset size so that asking for more
# examples than are available does not index past the end of `ds`.
nb_examples = min(args.nb_examples, len(ds)) if args.nb_examples else len(ds)
inds = range(nb_examples)
if display:
    inds = tqdm(inds)
for i in inds:
    # One dataset item: tokenized captions, the caption strings, the real image.
    text, text_str, image = ds[i]
    image = image.to(device).unsqueeze(0)
    clip_text = clip.tokenize(text_str).to(device)

    if args.num_captions_per_image:
        # Keep only the captions that CLIP scores highest against the real image.
        with torch.no_grad():
            logits, _ = clip_model(image, clip_text)
            order = torch.argsort(-logits[0]).cpu().numpy()
        order = order[:args.num_captions_per_image]
        text = text[order]
        clip_text = clip_text[order]
        text_str = [text_str[ind] for ind in order]
    nb_captions = len(text_str)

    # Tile the captions so each one is sampled `num_generate` times:
    # (num_generate * nb_captions, text_seq_len). The tiling order
    # (all captions, repeated) is what makes the reshape below valid.
    text = text.to(device).repeat(args.num_generate, 1)

    # Everything below is inference only. Wrapping generation in no_grad is
    # redundant if generate_images already disables gradients, but harmless.
    with torch.no_grad():
        generated = [
            dalle.generate_images(text_chunk, filter_thres=args.top_k)
            for text_chunk in text.split(args.batch_size)
        ]
        # generated images: (num_generate * nb_captions, c, h, w)
        outputs = torch.cat(generated)
        # Resize and normalise the generated images the way CLIP expects,
        # then put the (already preprocessed) real image in front.
        outputs = torch.nn.functional.interpolate(outputs, size=(224, 224), mode='bicubic')
        outputs = (outputs - clip_mean) / clip_std
        outputs = torch.cat((image, outputs), dim=0)
        # logits_per_image: (num_generate * nb_captions + 1, nb_captions)
        logits_per_image, _ = clip_model(outputs, clip_text)
        logits_per_image.clamp_min_(0)

    # (1, 1, nb_captions)
    real_clip_score = logits_per_image[0:1, :].cpu().numpy().reshape((1, 1, nb_captions))
    # (num_generate, nb_captions, nb_captions)
    fake_clip_scores = logits_per_image[1:, :].cpu().numpy().reshape(
        (args.num_generate, nb_captions, nb_captions)
    )

    if args.dump:
        real_image = outputs[0]
        fake_images = outputs[1:].view(
            args.num_generate, nb_captions, outputs.size(1), outputs.size(2), outputs.size(3)
        )
        for ind, text_str_cur in enumerate(text_str):
            real_clip_score_cur = real_clip_score[:, :, ind]
            # Scores of the images generated from this caption against this caption.
            fake_clip_scores_cur = fake_clip_scores[:, ind, ind]
            # Best-scoring generated image first.
            order = np.argsort(-fake_clip_scores_cur)
            fakes_cur = fake_images[order, ind]
            # One directory per caption, named after the (truncated) caption.
            outputs_dir = Path(args.outputs_dir) / text_str_cur.replace(' ', '_')[:(100)]
            outputs_dir.mkdir(parents=True, exist_ok=True)
            save_image(real_image, outputs_dir / 'true.jpg', normalize=True)
            # Distinct names here: the original inner loop rebound the outer
            # loop's `i` and `image`.
            for rank, fake_image in enumerate(fakes_cur):
                save_image(fake_image, outputs_dir / f'{rank}.jpg', normalize=True)
            caption_results = {}
            for metric in metrics:
                caption_results[metric.__name__] = float(
                    metric(real_clip_score_cur, fake_clip_scores_cur)
                )
            with open(outputs_dir / 'metrics.json', "w") as fd:
                json.dump(caption_results, fd)

    real_scores.append(real_clip_score)
    fake_scores.append(fake_clip_scores)
if args.horovod:
    # Synchronisation point before the per-metric allreduce below.
    hvd.join()


def _stack_scores(per_example_scores):
    """Stack per-example score arrays, tolerating differing shapes.

    When every example has the same shape the result is the regular stacked
    ndarray. When shapes differ (e.g. images with different numbers of
    captions), `np.array` on the list would raise, so a 1-D object array
    holding the individual arrays is returned instead.
    """
    shapes = {np.shape(scores) for scores in per_example_scores}
    if len(shapes) <= 1:
        return np.array(per_example_scores)
    ragged = np.empty(len(per_example_scores), dtype=object)
    for idx, scores in enumerate(per_example_scores):
        ragged[idx] = scores
    return ragged


real_scores = _stack_scores(real_scores)
fake_scores = _stack_scores(fake_scores)

results = {}
for metric in metrics:
    # Mean over this worker's examples of the per-example metric value.
    score = float(np.mean([metric(r, f) for r, f in zip(real_scores, fake_scores)]))
    if args.horovod:
        # Combine the per-worker means (presumably averaged by default; confirm
        # against the Horovod version in use).
        score = hvd.allreduce(torch.Tensor([score])).item()
    score = float(score)
    if display:
        results[metric.__name__] = score
        print(metric.__name__, score)

if display:
    dump = {
        'metrics': results,
        'num_generated_images_per_caption': args.num_generate,
        'path': args.dalle_path,
        'image_text_folder': args.image_text_folder,
        'nb_examples': (nb_examples * hvd.size() if args.horovod else nb_examples),
        'top_k': args.top_k,
        'clip_thresh': args.clip_thresh,
        'num_captions_per_image': args.num_captions_per_image,
    }
    with open(args.out_file, "w") as fd:
        json.dump(dump, fd)
    # Raw per-example scores of this (displaying) process only.
    joblib.dump({'real_scores': real_scores, 'fake_scores': fake_scores}, args.out_file+'.pkl')