Skip to main content

Formatting common types of image datasets

Run in Google Colab

This tutorial demonstrates how to format image data in various popular Python formats before running Cleanlab Studio. Each section of the tutorial covers one specific data format and outlines the steps to create a zip file that Cleanlab Studio can natively process. Here we only show how to produce a properly formatted data file, not how to run Cleanlab Studio on it – for that refer to the image data quickstart tutorial.

Cleanlab Studio can be directly run on image datasets in a ZIP file format, with or without metadata. The application natively supports many other data formats, which are listed in this guide; refer to it instead if your image data are not in one of the formats presented in this tutorial.

This tutorial demonstrates how to convert each of the following Python data formats into a dataset that can be directly processed by Cleanlab Studio: Huggingface Datasets, Torchvision datasets, and Tensorflow datasets.

Below we show a toy example of the folder and file structure that a local dataset should adhere to before zipping the folder for Cleanlab Studio.

|-- <image_dataset>
| |-- <class_0>
| | |-- <image_0>
| | |-- <image_1>
...
| |-- <class_n>
...

The following command produces a ZIP file of the image dataset that can be directly processed by Cleanlab Studio:

zip -r <image_dataset>.zip <image_dataset>

The rest of this tutorial demonstrates how to create such files from various Python datasets. We begin by installing and importing some necessary packages:

%pip install tqdm ipywidgets Pillow datasets tensorflow-datasets tensorflow torch torchvision
import os
from tqdm.auto import tqdm
import io
import zipfile
from PIL import Image

1. Huggingface Datasets

Here, we load the CIFAR10 dataset which consists of 60,000 images across 10 classes. It is one of the most common datasets for image classification.

# Load dataset from the Hub
from datasets import load_dataset, concatenate_datasets

# Returns a DatasetDict holding a separate Dataset for each split (train and test)
cifar10_dict = load_dataset("cifar10")
# Display the available splits and their sizes
cifar10_dict
    DatasetDict({
train: Dataset({
features: ['img', 'label'],
num_rows: 50000
})
test: Dataset({
features: ['img', 'label'],
num_rows: 10000
})
})

To find issues across splits, we concatenate the splits into a single dataset.

# concatenate_datasets is documented to take a list of datasets, so materialize
# the dict view of splits (train, test) into a real list before passing it in
cifar10_hf = concatenate_datasets(list(cifar10_dict.values()))
# Display the merged dataset to confirm it contains the rows of every split
cifar10_hf
    Dataset({
features: ['img', 'label'],
num_rows: 60000
})

View an example from the dataset

# Each record is a dict with a PIL image under 'img' and an integer class id under 'label'
cifar10_hf[0]
    {'img': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32>,
'label': 0}
# Build the lookup from integer label id to class name, using the names
# stored on the dataset's "label" feature (position in the list == label id)
label_str_list = cifar10_hf.features["label"].names
label_mapping = dict(enumerate(label_str_list))
print(label_mapping)
    {0: 'airplane', 1: 'automobile', 2: 'bird', 3: 'cat', 4: 'deer', 5: 'dog', 6: 'frog', 7: 'horse', 8: 'ship', 9: 'truck'}

Method for formatting Huggingface dataset

def format_huggingface_image_dataset(
    dataset, image_key, label_key, label_mapping, filename, save_dir
):
    """Convert a Huggingface dataset to Cleanlab Studio format.

    Every image is re-encoded as PNG and written into a zip archive under
    ``hf_dataset/<class name>/image_<index>.png``, where ``<index>`` is the
    position of the example in ``dataset``.

    dataset: datasets.Dataset
        HuggingFace image dataset
    image_key: str
        column name for image in dataset
    label_key: str
        column name for label in dataset
    label_mapping: Optional[Dict[int, str]]
        label id to label str mapping
        If labels are already strings, set label_mapping to None
    filename: str
        filename for the zip file (the ".zip" extension is appended)
    save_dir: str
        directory to save the zip file; created if it does not exist yet

    """

    def image_data_generator():
        """Generator to yield the path inside the zip file and the PNG bytes of each image."""
        for idx, data in enumerate(dataset):
            image = data[image_key]
            label = data[label_key]
            # Without a mapping the raw label is used directly as the folder name
            class_dir = label_mapping[label] if label_mapping else label

            # Encode in memory so that only the final archive is written to disk
            buf = io.BytesIO()
            image.save(buf, format="PNG")

            yield f"hf_dataset/{class_dir}/image_{idx}.png", buf.getvalue()

    # Opening the archive fails if the target directory is missing, so create it
    # up front (an empty save_dir means the current directory; nothing to create)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    zip_path = os.path.join(save_dir, f"{filename}.zip")

    with zipfile.ZipFile(zip_path, "w") as zf:
        for path, data in tqdm(image_data_generator(), total=len(dataset)):
            zf.writestr(path, data)

    print(f"Saved zip file to: {zip_path}")

Format the dataset and save to a zip file.

# Export the concatenated CIFAR10 dataset to ./cifar10_hf.zip; label_mapping turns
# the integer labels into the class names used as folder names inside the archive
format_huggingface_image_dataset(
dataset=cifar10_hf,
image_key="img",
label_key="label",
label_mapping=label_mapping,
filename="cifar10_hf",
save_dir="./",
)

Now you can upload the file ./cifar10_hf.zip to Cleanlab Studio, using either the Web UI or the Cleanlab Studio API. See the Upload Dataset section for more details.

2. Torchvision datasets

from torchvision.datasets import CIFAR10
from torch.utils.data import ConcatDataset

Here, we again load the CIFAR10 dataset from torchvision.

# Fetch both CIFAR10 splits into the current directory (test split first, then train)
cifar10_test, cifar10_train = (
    CIFAR10(root="./", train=is_train, download=True) for is_train in (False, True)
)
# Chain the two splits into one dataset, train samples first followed by test samples
cifar10_torch = ConcatDataset([cifar10_train, cifar10_test])
# Class names used as folder names; a name's position in the tuple is the
# integer label id it stands for
classes = (
    "plane",
    "car",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
)
# Map label id -> class name; enumerate pairs every name with its position
label_mapping = dict(enumerate(classes))

View an example from the dataset.

# Each sample is an (image, label) tuple: a PIL image followed by an integer class id
cifar10_torch[0]
    (<PIL.Image.Image image mode=RGB size=32x32>, 6)

Method for formatting Torchvision dataset

def format_torchvision_dataset(
    dataset, image_key, label_key, label_mapping, filename, save_dir
):
    """Convert a Torchvision dataset to Cleanlab Studio format.

    Every image is re-encoded as PNG and written into a zip archive under
    ``torch_dataset/<class name>/image_<index>.png``, where ``<index>`` is the
    position of the sample in ``dataset``.

    dataset: torch.utils.data.Dataset
        Torchvision dataset (or a ConcatDataset of several) whose samples
        hold a PIL image and a label
    image_key: int
        position of the image within each sample
        (0 for the usual (image, label) tuples)
    label_key: int
        position of the label within each sample
        (1 for the usual (image, label) tuples)
    label_mapping: Optional[Dict[int, str]]
        label id to label str mapping
        If labels are already strings, set label_mapping to None
    filename: str
        filename for the zip file (the ".zip" extension is appended)
    save_dir: str
        directory to save the zip file; created if it does not exist yet

    """

    def image_data_generator():
        """Generator to yield the path inside the zip file and the PNG bytes of each image."""
        for idx, data in enumerate(dataset):
            image = data[image_key]
            raw_label = data[label_key]
            # Without a mapping the raw label is used directly as the folder name
            label = label_mapping[raw_label] if label_mapping else raw_label

            # Encode in memory so that only the final archive is written to disk
            buf = io.BytesIO()
            image.save(buf, format="PNG")

            yield f"torch_dataset/{label}/image_{idx}.png", buf.getvalue()

    # Opening the archive fails if the target directory is missing, so create it
    # up front (an empty save_dir means the current directory; nothing to create)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    zip_path = os.path.join(save_dir, f"{filename}.zip")

    with zipfile.ZipFile(zip_path, "w") as zf:
        for path, data in tqdm(image_data_generator(), total=len(dataset)):
            zf.writestr(path, data)

    print(f"Saved zip file to: {zip_path}")

Format the dataset and save to a zip file.

# Samples are (image, label) tuples, so the image and label are selected by
# position (0 and 1) rather than by column name
format_torchvision_dataset(
dataset=cifar10_torch,
image_key=0,
label_key=1,
label_mapping=label_mapping,
filename="cifar10_torch",
save_dir="./",
)

Now you can upload the file ./cifar10_torch.zip to Cleanlab Studio, using either the Web UI or the Cleanlab Studio API. See the Upload Dataset section for more details.

3. Tensorflow datasets

import tensorflow_datasets as tfds
import tensorflow as tf
# Load the train and test splits together so the exported archive covers the
# full 60,000-image dataset, matching the Huggingface and Torchvision sections
cifar10_tf, metadata = tfds.load(
    "cifar10", split="train+test", with_info=True, as_supervised=True
)
# construct mapping from id to label str
label_str_list = metadata.features["label"].names
label_mapping = dict(enumerate(label_str_list))
print(label_mapping)
    {0: 'airplane', 1: 'automobile', 2: 'bird', 3: 'cat', 4: 'deer', 5: 'dog', 6: 'frog', 7: 'horse', 8: 'ship', 9: 'truck'}
def format_tensorflow_image_dataset(
    dataset, image_key, label_key, label_mapping, filename, save_dir
):
    """Convert a Tensorflow dataset to Cleanlab Studio format.

    Every image tensor is converted to a PIL image, encoded as PNG and written
    into a zip archive under ``tf_dataset/<class name>/image_<index>.png``,
    where ``<index>`` is the position of the example in ``dataset``.

    dataset: tf.data.Dataset
        Tensorflow dataset
    image_key: int or str
        index (or key) selecting the image tensor within each example
        (0 when the dataset was loaded with as_supervised=True)
    label_key: int or str
        index (or key) selecting the label tensor within each example
        (1 when the dataset was loaded with as_supervised=True)
    label_mapping: Optional[Dict[int, str]]
        label id to label str mapping
        If labels are already strings, set label_mapping to None
    filename: str
        filename for the zip file (the ".zip" extension is appended)
    save_dir: str
        directory to save the zip file; created if it does not exist yet

    """

    def image_data_generator():
        """Generator to yield the path inside the zip file and the PNG bytes of each image."""
        for idx, example in enumerate(dataset):
            image = Image.fromarray(example[image_key].numpy())
            raw_label = example[label_key].numpy()
            if label_mapping:
                label = label_mapping[raw_label]
            elif isinstance(raw_label, bytes):
                # String tensors come back as bytes; decode so the folder name
                # is not rendered as "b'...'"
                label = raw_label.decode("utf-8")
            else:
                label = raw_label

            # Encode in memory so that only the final archive is written to disk
            buf = io.BytesIO()
            image.save(buf, format="PNG")

            yield f"tf_dataset/{label}/image_{idx}.png", buf.getvalue()

    # len() raises TypeError when the dataset size is unknown or infinite; in
    # that case fall back to a progress bar without a total
    try:
        total = len(dataset)
    except TypeError:
        total = None

    # Opening the archive fails if the target directory is missing, so create it
    # up front (an empty save_dir means the current directory; nothing to create)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    zip_path = os.path.join(save_dir, f"{filename}.zip")

    with zipfile.ZipFile(zip_path, "w") as zf:
        for path, data in tqdm(image_data_generator(), total=total):
            zf.writestr(path, data)

    print(f"Saved zip file to: {zip_path}")
# The dataset was loaded with as_supervised=True, so each example is indexed
# positionally: 0 selects the image and 1 selects the label
format_tensorflow_image_dataset(
dataset=cifar10_tf,
image_key=0,
label_key=1,
label_mapping=label_mapping,
filename="cifar10_tf",
save_dir="./",
)

Now you can upload the file ./cifar10_tf.zip to Cleanlab Studio, using either the Web UI or the Cleanlab Studio API. See the Upload Dataset section for more details.