# Source code for kitcar_ml.utils.data.import_label_studio_labels

import json
import os
from argparse import ArgumentParser
from io import BytesIO

import boto3
from PIL import Image
from tqdm import tqdm

from kitcar_ml.utils.data.labeled_dataset import LabeledDataset


def download_image_from_s3(s3, bucket: str, filepath: str, output_path: str) -> None:
    """Fetch an image object from S3 and save it to a local file.

    The complete object body is read into memory, opened with PIL and written
    to ``output_path`` through ``Image.save``.

    Args:
        s3: Client object providing ``get_object(Bucket=..., Key=...)``
            (the caller in this module passes a ``boto3`` S3 client).
        bucket: Name of the bucket holding the image.
        filepath: Key of the image object inside the bucket.
        output_path: Local path the image is written to.
    """
    file_byte_string = s3.get_object(Bucket=bucket, Key=filepath)["Body"].read()
    with Image.open(BytesIO(file_byte_string)) as image:
        image.save(output_path)


def _label_to_pixel_bbox(label):
    """Turn one Label Studio result entry into an integer pixel bounding box.

    The ``x``, ``y``, ``width`` and ``height`` values are interpreted as
    percentages of ``original_width`` / ``original_height``.

    Returns:
        ``(x1, y1, x2, y2)`` rounded to integers, or ``None`` when the entry
        has no rectangle label, or when the rounded box is empty or reaches
        outside the image.
    """
    bbox_values = label["value"]

    # Decide whether this is a rectangle result *before* touching the geometry
    # keys: results without "rectanglelabels" are skipped instead of failing
    # on a missing "width"/"x"/... key. An empty label list is skipped too,
    # because there would be no class name to read.
    if not bbox_values.get("rectanglelabels"):
        return None

    img_width, img_height = label["original_width"], label["original_height"]

    x1 = bbox_values["x"] / 100 * img_width
    y1 = bbox_values["y"] / 100 * img_height
    x2 = x1 + bbox_values["width"] / 100 * img_width
    y2 = y1 + bbox_values["height"] / 100 * img_height
    x1, y1, x2, y2 = (round(val) for val in (x1, y1, x2, y2))

    # The box has to lie completely inside the image ...
    if x1 < 0 or y1 < 0 or x2 > img_width or y2 > img_height:
        return None
    # ... and must have a positive extent in both directions.
    if x2 <= x1 or y2 <= y1:
        return None

    return x1, y1, x2, y2


def import_label_studio_labels(
    annotations_dir: str, output_dir: str, force_download: bool = False
):
    """Convert a Label Studio JSON based Dataset to our LabeledDataset Format.

    Every file found (recursively) below ``annotations_dir`` is parsed as one
    JSON annotation. The image it references is downloaded from S3 into
    ``output_dir`` and each valid rectangle label is appended to the dataset
    as ``[x1, y1, x2, y2, class_id]``. Class ids are handed out in the order
    in which class names are first encountered. The result is written to
    ``labels.yaml`` inside ``output_dir``.

    See: https://doc.kitcar-team.de/kitcar-machine-learning/tutorials/datasets.html

    Args:
        annotations_dir: Directory that is walked for annotation files.
        output_dir: Directory receiving the images and ``labels.yaml``;
            created if it does not exist.
        force_download: Download images even if a file with the same name
            already exists in ``output_dir``.
    """
    os.makedirs(output_dir, exist_ok=True)

    s3 = boto3.client("s3", endpoint_url="https://dvc.kitcar-team.de")

    dataset = LabeledDataset()
    dataset.attributes = ["x1", "y1", "x2", "y2", "class_id"]

    # Collect the files up front so that tqdm knows the total.
    annotation_files = [
        os.path.join(folder, filename)
        for folder, _, filenames in os.walk(annotations_dir)
        for filename in filenames
    ]

    for annotation_file in tqdm(annotation_files, desc="Importing Label Studio Data"):
        # Read as bytes so json detects the encoding itself instead of
        # depending on the platform's default text encoding.
        with open(annotation_file, "rb") as f:
            annotations = json.load(f)

        # The image is referenced as "s3://<bucket>/<key>"
        image_path = annotations["task"]["data"]["image"]
        bucket, file_path = image_path.replace("s3://", "").split("/", 1)
        image_filename = os.path.basename(image_path)

        # Download image only if it does not exist yet or if the user forces it
        local_path = os.path.join(output_dir, image_filename)
        if force_download or not os.path.exists(local_path):
            download_image_from_s3(s3, bucket, file_path, local_path)

        for label in annotations["result"]:
            bbox = _label_to_pixel_bbox(label)
            if bbox is None:
                continue

            # Reuse the id of an already known class, otherwise register it
            class_name = label["value"]["rectanglelabels"][0]
            class_id = next(
                (
                    known_id
                    for known_id, name in dataset.classes.items()
                    if name == class_name
                ),
                None,
            )
            if class_id is None:
                class_id = len(dataset.classes)
                dataset.classes[class_id] = class_name

            # Add Label to dataset
            dataset.append_label(image_filename, [*bbox, class_id])

    # Save dataset yaml
    dataset.save_as_yaml(os.path.join(output_dir, "labels.yaml"))


if __name__ == "__main__":
    # Command line entry point: two mandatory directories plus an optional
    # switch that forces images to be downloaded again.
    cli = ArgumentParser()

    required_dirs = (
        ("--annotations-dir", "The dir containing all annotations."),
        ("--output-dir", "The output dir of the whole dataset."),
    )
    for flag, description in required_dirs:
        cli.add_argument(flag, type=str, required=True, help=description)

    cli.add_argument(
        "--force-download",
        action="store_true",
        help="Download images from s3 even if they are already in output dir",
    )

    options = cli.parse_args()
    import_label_studio_labels(
        options.annotations_dir,
        options.output_dir,
        options.force_download,
    )