As proposed in our paper, the "Words of a Thousand Pictures" metric (W1KP) measures perceptual variability for sets of images in text-to-image generation, bootstrapped from existing perceptual distances such as DreamSim.
-
Install PyTorch for your Python 3.10+ environment.
-
Install W1KP: `pip install w1kp`
-
Download the calibration data file (`cdf-xy.pt`) from the repository.
-
You're done!
We recommend starting from the following example:
import asyncio
import torch
from w1kp import StableDiffusionXLImageGenerator, DreamSimDistanceMeasure, query_inverted_cdf
async def amain():
    """Generate SDXL images for one prompt and print their W1KP score.

    Generates ten images for the prompt ``'cat'`` (seeds 0 through 9),
    measures them with the listwise DreamSim distance, maps that distance
    through the CDF data loaded from ``cdf-xy.pt``, and inverts the result
    to obtain the W1KP score. Finally calls ``show()`` on every image and
    prints the score.
    """
    # Generate 10 SDXL images for a prompt
    prompt = 'cat'
    images = []
    image_gen = StableDiffusionXLImageGenerator()

    for seed in range(10):
        ret = await image_gen.generate_image(prompt, seed=seed)
        images.append(ret['image'])

    # Compute and normalize the W1KP score
    dreamsim_l2 = DreamSimDistanceMeasure().to_listwise()

    # NOTE: torch.load relies on pickle; only load a cdf-xy.pt file obtained
    # from a source you trust.
    cdf_x, cdf_y = torch.load('cdf-xy.pt')  # download this data file from the repo

    dist = dreamsim_l2.measure(prompt, images)
    dist = query_inverted_cdf(cdf_x, cdf_y, dist)  # normalize to U[0, 1]
    w1kp_score = 1 - dist  # invert for the W1KP score

    for im in images:
        im.show()

    print(f'The W1KP score for the images are {w1kp_score}')
if __name__ == '__main__':
    # Run the example only when this file is executed as a script.
    asyncio.run(amain())
@inproceedings{tang2024words,
title = "Words Worth a Thousand Pictures: Measuring and Understanding Perceptual Variability in Text-to-Image Generation",
author = "Tang, Raphael and
Zhang, Crystina and
Xu, Lixinyu and
Lu, Yao and
Li, Wenyan and
Stenetorp, Pontus and
Lin, Jimmy and
Ture, Ferhan",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
year = "2024",
url = "https://aclanthology.org/2024.emnlp-main.311",
pages = "5441--5454",
}