import json
import os
from argparse import ArgumentParser
from io import BytesIO
import boto3
from PIL import Image
from tqdm import tqdm
from kitcar_ml.utils.data.labeled_dataset import LabeledDataset
def download_image_from_s3(s3, bucket: str, filepath: str, output_path: str) -> None:
    """Download an image object from S3 and save it to a local file.

    Args:
        s3: S3 client used for the request; only ``get_object`` is called on it.
        bucket: Name of the bucket that holds the image.
        filepath: Key of the image object inside the bucket.
        output_path: Local path the image is written to.
    """
    file_byte_string = s3.get_object(Bucket=bucket, Key=filepath)["Body"].read()
    # The bytes are decoded by PIL and re-saved rather than written verbatim;
    # no explicit format is passed to save(), so PIL picks it from output_path.
    with Image.open(BytesIO(file_byte_string)) as image:
        image.save(output_path)
def _pixel_bbox(label: dict):
    """Convert one rectangle result into rounded pixel corner coordinates.

    The ``x``, ``y``, ``width`` and ``height`` entries of ``label["value"]`` are
    treated as percentages of ``original_width`` / ``original_height``.

    Returns:
        ``(x1, y1, x2, y2)`` as integers, or ``None`` if the box is empty or
        reaches outside of the image.
    """
    img_width, img_height = label["original_width"], label["original_height"]
    bbox_values = label["value"]

    x1 = bbox_values["x"] / 100 * img_width
    y1 = bbox_values["y"] / 100 * img_height
    x2 = x1 + bbox_values["width"] / 100 * img_width
    y2 = y1 + bbox_values["height"] / 100 * img_height
    x1, y1, x2, y2 = (round(val) for val in (x1, y1, x2, y2))

    # The box has to lie completely inside the image ...
    if x1 < 0 or y1 < 0 or x2 > img_width or y2 > img_height:
        return None
    # ... and must cover at least one pixel in each direction.
    if x2 <= x1 or y2 <= y1:
        return None
    return x1, y1, x2, y2


def import_label_studio_labels(
    annotations_dir: str, output_dir: str, force_download: bool = False
):
    """Convert a Label Studio JSON based Dataset to our LabeledDataset Format.

    See: https://doc.kitcar-team.de/kitcar-machine-learning/tutorials/datasets.html

    Every file found below ``annotations_dir`` is parsed as one JSON annotation.
    The image it references is downloaded from S3 into ``output_dir`` and each
    valid rectangle result is stored as ``[x1, y1, x2, y2, class_id]``.
    The dataset is finally written to ``labels.yaml`` inside ``output_dir``.

    Args:
        annotations_dir: Directory that is searched recursively for annotation files.
        output_dir: Directory for the images and ``labels.yaml``; created if missing.
        force_download: Download images even if they already exist in ``output_dir``.
    """
    os.makedirs(output_dir, exist_ok=True)
    s3 = boto3.client("s3", endpoint_url="https://dvc.kitcar-team.de")

    dataset = LabeledDataset()
    dataset.attributes = ["x1", "y1", "x2", "y2", "class_id"]

    annotation_files = [
        os.path.join(folder, filename)
        for folder, _, filenames in os.walk(annotations_dir)
        for filename in filenames
    ]

    for annotation_file in tqdm(annotation_files, desc="Importing Label Studio Data"):
        # Load annotations from file; decode as UTF-8 independent of the locale.
        with open(annotation_file, encoding="utf-8") as f:
            annotations = json.load(f)

        # The image is referenced as s3://<bucket>/<key>
        image_path = annotations["task"]["data"]["image"]
        bucket, file_path = image_path.replace("s3://", "").split("/", 1)
        image_filename = os.path.basename(image_path)
        local_path = os.path.join(output_dir, image_filename)

        # Download image only if it does not exist yet or if the user forces it
        if force_download or not os.path.exists(local_path):
            download_image_from_s3(s3, bucket, file_path, local_path)

        for label in annotations["result"]:
            # Skip results without a rectangle class *before* touching any
            # geometry keys, which such results are not guaranteed to have.
            rectangle_labels = (label.get("value") or {}).get("rectanglelabels")
            if not rectangle_labels:
                continue

            bbox = _pixel_bbox(label)
            if bbox is None:
                continue

            # Reuse the id of an already known class or register a new one
            class_name = rectangle_labels[0]
            class_id = next(
                (key for key, name in dataset.classes.items() if name == class_name),
                None,
            )
            if class_id is None:
                class_id = len(dataset.classes)
                dataset.classes[class_id] = class_name

            # Add Label to dataset
            dataset.append_label(image_filename, [*bbox, class_id])

    # Save dataset yaml
    dataset.save_as_yaml(os.path.join(output_dir, "labels.yaml"))
if __name__ == "__main__":
    # Command line entry point: read both directories and the download flag,
    # then run the import with them.
    cli = ArgumentParser()
    cli.add_argument(
        "--annotations-dir",
        required=True,
        type=str,
        help="The dir containing all annotations.",
    )
    cli.add_argument(
        "--output-dir",
        required=True,
        type=str,
        help="The output dir of the whole dataset.",
    )
    cli.add_argument(
        "--force-download",
        action="store_true",
        help="Download images from s3 even if they are already in output dir",
    )
    options = cli.parse_args()

    import_label_studio_labels(
        annotations_dir=options.annotations_dir,
        output_dir=options.output_dir,
        force_download=options.force_download,
    )