Object segmentation with EfficientSAM and OpenVINO#

This Jupyter notebook can be launched only after a local installation.

GitHub

The Segment Anything Model (SAM) has emerged as a powerful tool for numerous vision applications. A key component that drives its impressive performance for zero-shot transfer and high versatility is a super large Transformer model trained on the extensive, high-quality SA-1B dataset. While the SAM model is beneficial, its huge computation cost limits its use in broader real-world applications. To address this limitation, EfficientSAM was proposed: a lightweight SAM model that delivers decent performance with largely reduced complexity. The idea behind EfficientSAM is based on leveraging masked image pretraining (SAMI), which learns to reconstruct features from the SAM image encoder for effective visual representation learning.

Figure: overview.png (EfficientSAM overview)

More details about the model can be found in the paper, the model web page, and the original repository.

This tutorial shows how to convert and run EfficientSAM using OpenVINO. It also demonstrates how to quantize the model with NNCF.

Table of contents:

Prerequisites#

import platform 

if platform.system() != "Windows":
     %pip install -q "matplotlib>=3.4" 
else:
     %pip install -q "matplotlib>=3.4,<3.7" 
%pip install -q "openvino>=2023.3.0" "nncf>=2.7.0" opencv-python "gradio>=4.13" torch torchvision tqdm --extra-index-url https://download.pytorch.org/whl/cpu
Note: you may need to restart the kernel to use updated packages. 
Note: you may need to restart the kernel to use updated packages.
from pathlib import Path 

repo_dir = Path("EfficientSAM") 

if not repo_dir.exists():
     !git clone https://github.com/yformer/EfficientSAM.git 
%cd $repo_dir
Cloning into 'EfficientSAM'... 
remote: Enumerating objects: 424, done. 
remote: Counting objects: 100% (85/85), done. 
remote: Compressing objects: 100% (33/33), done. 
remote: Total 424 (delta 76), reused 52 (delta 52), pack-reused 339 
Receiving objects: 100% (424/424), 262.14 MiB | 28.43 MiB/s, done.
Resolving deltas: 100% (246/246), done.
/opt/home/k8sworker/ci-ai/cibuilds/ov-notebook/OVNotebookOps-727/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM

Load PyTorch model#

There are several models available in the repository:

  • efficient-sam-vitt - EfficientSAM with a Vision Transformer Tiny (ViT-T) image encoder. The smallest and fastest model in the EfficientSAM family.

  • efficient-sam-vits - EfficientSAM with a Vision Transformer Small (ViT-S) image encoder. A heavier but more accurate model than efficient-sam-vitt.

EfficientSAM provides a unified interface for interacting with the models, which means that all the model conversion and execution steps shown in this notebook are the same for every model. Below, you can select one of them as an example.

from efficient_sam.build_efficient_sam import ( 
    build_efficient_sam_vitt, 
    build_efficient_sam_vits, 
) 
import zipfile 

MODELS_LIST = { 
    "efficient-sam-vitt": build_efficient_sam_vitt, 
    "efficient-sam-vits": build_efficient_sam_vits, 
} 

# Since the EfficientSAM-S checkpoint file is larger than 100 MB, it is stored as a zip file 
with zipfile.ZipFile("weights/efficient_sam_vits.pt.zip", "r") as zip_ref: 
    zip_ref.extractall("weights")

Select one of the supported models:

import ipywidgets as widgets 

model_ids = list(MODELS_LIST) 

model_id = widgets.Dropdown( 
    options=model_ids, 
    value=model_ids[0], 
    description="Model:", 
    disabled=False, 
) 

model_id
Dropdown(description='Model:', options=('efficient-sam-vitt', 'efficient-sam-vits'), value='efficient-sam-vitt…

Build PyTorch model#

pt_model = MODELS_LIST[model_id.value]() 

pt_model.eval();

Run PyTorch model inference#

After selecting and loading the PyTorch model, we can check its results.

Prepare input data#

First, we need to prepare the model input data. The model has three inputs:

  • image tensor - a tensor with the normalized input image.

  • input points - a tensor with user-provided points. These can be specific points on the image (for example, provided by a user click on the screen) or bounding box coordinates given as the top-left and bottom-right corner points.

  • input labels - a tensor that defines the point type for each provided point: 1 - a regular point, 2 - the top-left corner of a bounding box, 3 - the bottom-right corner of a bounding box.
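To make this format concrete, here is a minimal NumPy sketch (the coordinates are illustrative values, not part of the original notebook) of how a point prompt and a box prompt are laid out; the prepare_input helper defined later produces the same shapes:

import numpy as np 

# Point prompt: two foreground clicks, both labeled 1 (regular point). 
point_coords = np.array([[[[580, 350], [650, 350]]]], dtype=np.int64)  # shape [1, 1, 2, 2] 
point_labels = np.array([[[1, 1]]], dtype=np.int64)                    # shape [1, 1, 2] 

# Box prompt: top-left corner labeled 2, bottom-right corner labeled 3. 
box_coords = np.array([[[[100, 120], [400, 380]]]], dtype=np.int64)    # shape [1, 1, 2, 2] 
box_labels = np.array([[[2, 3]]], dtype=np.int64)                      # shape [1, 1, 2]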

from PIL import Image 

image_path = "figs/examples/dogs.jpg" 

image = Image.open(image_path) 
image
../_images/efficient-sam-with-output_11_0.png

Define helpers for input and output processing#

The code below defines helpers for preparing the model input and for postprocessing the inference results, using the input format accepted by the model described above. The model predicts mask logits for each pixel of the image, together with an intersection-over-union score for each region that reflects how well it matches the provided points. We also provide several helper functions for visualizing the results.

import torch 
import matplotlib.pyplot as plt 
import numpy as np 

def prepare_input(input_image, points, labels, torch_tensor=True): 
    img_tensor = np.ascontiguousarray(input_image)[None, ...].astype(np.float32) / 255 
    img_tensor = np.transpose(img_tensor, (0, 3, 1, 2)) 
    pts_sampled = np.reshape(np.ascontiguousarray(points), [1, 1, -1, 2]) 
    pts_labels = np.reshape(np.ascontiguousarray(labels), [1, 1, -1]) 
    if torch_tensor: 
        img_tensor = torch.from_numpy(img_tensor) 
        pts_sampled = torch.from_numpy(pts_sampled) 
        pts_labels = torch.from_numpy(pts_labels) 
    return img_tensor, pts_sampled, pts_labels 

def postprocess_results(predicted_iou, predicted_logits): 
    sorted_ids = np.argsort(-predicted_iou, axis=-1) 
    predicted_iou = np.take_along_axis(predicted_iou, sorted_ids, axis=2) 
    predicted_logits = np.take_along_axis(predicted_logits, sorted_ids[..., None, None], axis=2) 

    return predicted_logits[0, 0, 0, :, :]>= 0 

def show_points(coords, labels, ax, marker_size=375): 
    pos_points = coords[labels == 1] 
    neg_points = coords[labels == 0] 
    ax.scatter( 
        pos_points[:, 0], 
        pos_points[:, 1], 
        color="green", 
        marker="*", 
        s=marker_size, 
        edgecolor="white", 
        linewidth=1.25, 
    ) 
    ax.scatter( 
        neg_points[:, 0], 
        neg_points[:, 1], 
        color="red", 
        marker="*", 
        s=marker_size, 
        edgecolor="white", 
        linewidth=1.25, 
    ) 

def show_box(box, ax): 
    x0, y0 = box[0], box[1] 
    w, h = box[2] - box[0], box[3] - box[1] 
    ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="yellow", facecolor=(0, 0, 0, 0), lw=5)) 

def show_anns(mask, ax): 
    ax.set_autoscale_on(False) 
    img = np.ones((mask.shape[0], mask.shape[1], 4)) 
    img[:, :, 3] = 0 
    # for ann in mask:     # m = ann 
    color_mask = np.concatenate([np.random.random(3), [0.5]]) 
    img[mask] = color_mask 
    ax.imshow(img)

A complete model inference example is shown below.

input_points = [[580, 350], [650, 350]] 
input_labels = [1, 1] 

example_input = prepare_input(image, input_points, input_labels) 

predicted_logits, predicted_iou = pt_model(*example_input) 

predicted_mask = postprocess_results(predicted_iou.detach().numpy(), predicted_logits.detach().numpy())
image = Image.open(image_path) 

plt.figure(figsize=(20, 20)) 
plt.axis("off") 
plt.imshow(image) 
show_points(np.array(input_points), np.array(input_labels), plt.gca()) 
plt.figure(figsize=(20, 20)) 
plt.axis("off") 
plt.imshow(image) 
show_anns(predicted_mask, plt.gca()) 
plt.title(f"PyTorch {model_id.value}", fontsize=18) 
plt.show()
../_images/efficient-sam-with-output_16_0.png ../_images/efficient-sam-with-output_16_1.png

Convert the model to OpenVINO IR format#

OpenVINO supports PyTorch models via conversion to the Intermediate Representation (IR) format using the OpenVINO model transformation API. The openvino.convert_model function accepts an instance of a PyTorch model and example inputs (which help with correct tracing of model operations and shape inference) and returns an openvino.Model object that represents the model in the OpenVINO framework. This openvino.Model is ready to be loaded on a device using ov.Core.compile_model and can also be saved to disk using openvino.save_model.

import openvino as ov 

core = ov.Core() 

ov_model_path = Path(f"{model_id.value}.xml") 

if not ov_model_path.exists(): 
    ov_model = ov.convert_model(pt_model, example_input=example_input) 
    ov.save_model(ov_model, ov_model_path) 
else: 
    ov_model = core.read_model(ov_model_path)
/opt/home/k8sworker/ci-ai/cibuilds/ov-notebook/OVNotebookOps-727/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:220: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect.We can't record the data flow of Python values, so this value will be treated as a constant in the future.This means that the trace might not generalize to other inputs! 
  if ( 
/opt/home/k8sworker/ci-ai/cibuilds/ov-notebook/OVNotebookOps-727/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:241: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect.We can't record the data flow of Python values, so this value will be treated as a constant in the future.This means that the trace might not generalize to other inputs! 
  assert ( 
/opt/home/k8sworker/ci-ai/cibuilds/ov-notebook/OVNotebookOps-727/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:163: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect.We can't record the data flow of Python values, so this value will be treated as a constant in the future.This means that the trace might not generalize to other inputs! 
  size = int(math.sqrt(xy_num)) 
/opt/home/k8sworker/ci-ai/cibuilds/ov-notebook/OVNotebookOps-727/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect.We can't record the data flow of Python values, so this value will be treated as a constant in the future.This means that the trace might not generalize to other inputs! 
  assert size * size == xy_num 
/opt/home/k8sworker/ci-ai/cibuilds/ov-notebook/OVNotebookOps-727/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:166: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect.We can't record the data flow of Python values, so this value will be treated as a constant in the future.This means that the trace might not generalize to other inputs! 
  if size != h or size != w: 
/opt/home/k8sworker/ci-ai/cibuilds/ov-notebook/OVNotebookOps-727/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:251: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect.We can't record the data flow of Python values, so this value will be treated as a constant in the future.This means that the trace might not generalize to other inputs! 
  assert x.shape[2] == num_patches 
/opt/home/k8sworker/ci-ai/cibuilds/ov-notebook/OVNotebookOps-727/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:85: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect.We can't record the data flow of Python values, so this value will be treated as a constant in the future.This means that the trace might not generalize to other inputs! 
  if num_pts > self.decoder_max_num_input_points: 
/opt/home/k8sworker/ci-ai/cibuilds/ov-notebook/OVNotebookOps-727/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:92: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect.We can't record the data flow of Python values, so this value will be treated as a constant in the future.This means that the trace might not generalize to other inputs! 
  elif num_pts < self.decoder_max_num_input_points: 
/opt/home/k8sworker/ci-ai/cibuilds/ov-notebook/OVNotebookOps-727/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:126: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect.We can't record the data flow of Python values, so this value will be treated as a constant in the future.This means that the trace might not generalize to other inputs! 
  if output_w > 0 and output_h > 0:
['batched_images', 'batched_points', 'batched_point_labels']
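Once the IR has been saved to disk, a later session can skip the PyTorch model entirely and work with the file alone. A minimal sketch (the file name is assumed to correspond to ov_model_path for the efficient-sam-vitt variant):

import openvino as ov 

core = ov.Core() 
# Read the previously saved IR and compile it for a device; no PyTorch code is required at this point. 
ov_model = core.read_model("efficient-sam-vitt.xml") 
compiled_model = core.compile_model(ov_model, "AUTO")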

Run OpenVINO model inference#

Select inference device from dropdown list#

device = widgets.Dropdown( 
    options=core.available_devices + ["AUTO"], 
    value="AUTO", 
    description="Device:", 
    disabled=False, 
) 

device
Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')

Compile the OpenVINO model#

compiled_model = core.compile_model(ov_model, device.value)

Run inference and visualize results#

Let's take a look at the OpenVINO model predictions.

example_input = prepare_input(image, input_points, input_labels, torch_tensor=False) 
result = compiled_model(example_input) 

predicted_logits, predicted_iou = result[0], result[1] 
predicted_mask = postprocess_results(predicted_iou, predicted_logits) 

plt.figure(figsize=(20, 20)) 
plt.axis("off") 
plt.imshow(image) 
show_points(np.array(input_points), np.array(input_labels), plt.gca()) 
plt.figure(figsize=(20, 20)) 
plt.axis("off") 
plt.imshow(image) 
show_anns(predicted_mask, plt.gca()) 
plt.title(f"OpenVINO {model_id.value}", fontsize=18) 
plt.show()
../_images/efficient-sam-with-output_24_0.png ../_images/efficient-sam-with-output_24_1.png

Quantization#

NNCF enables post-training quantization by adding quantization layers into the model graph and then using a subset of the training dataset to initialize the parameters of these additional quantization layers. The framework is designed so that modifications to the original training code are minor.

The optimization process contains the following steps:

  1. Create a calibration dataset for quantization.

  2. Run nncf.quantize to obtain a quantized model.

  3. Serialize the INT8 model using the openvino.save_model function.

Note: Quantization is a time- and memory-consuming operation. Running the quantization code below may take a while.

Please select below whether you would like to run EfficientSAM quantization.

to_quantize = widgets.Checkbox( 
    value=True, 
    description="Quantization", 
    disabled=False, 
) 

to_quantize
Checkbox(value=True, description='Quantization')
# Fetch the `skip_kernel_extension` module 
import requests 

r = requests.get( 
    url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", 
) 
open("skip_kernel_extension.py", "w").write(r.text) 

%load_ext skip_kernel_extension

Prepare calibration dataset#

The first step is to prepare a calibration dataset for quantization. We use the coco128 dataset. This dataset is usually used for solving object detection tasks, and its annotations provide box coordinates for the images. In our case, the box coordinates serve as input points for object segmentation. The code below downloads the dataset and creates a DataLoader that prepares inputs for the EfficientSAM model.

%%skip not $to_quantize.value 

from zipfile import ZipFile 

r = requests.get( 
    url='https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py', 
) 

open('notebook_utils.py', 'w').write(r.text) 

from notebook_utils import download_file 

DATA_URL = "https://ultralytics.com/assets/coco128.zip" 
OUT_DIR = Path('.') 

download_file(DATA_URL, directory=OUT_DIR, show_progress=True) 

if not (OUT_DIR / "coco128/images/train2017").exists(): 
    with ZipFile('coco128.zip' , "r") as zip_ref: 
        zip_ref.extractall(OUT_DIR)
coco128.zip: 0%|          | 0.00/6.66M [00:00<?, ?B/s]
%%skip not $to_quantize.value 

import torch.utils.data as data 

class COCOLoader(data.Dataset): 
    def __init__(self, images_path): 
        self.images = list(Path(images_path).iterdir()) 
        self.labels_dir = images_path.parents[1] / 'labels' / images_path.name 

    def get_points(self, image_path, image_width, image_height): 
        file_name = image_path.name.replace('.jpg', '.txt') 
        label_file = self.labels_dir / file_name 
        if not label_file.exists(): 
            x1, x2 = np.random.randint(low=0, high=image_width, size=(2, )) 
            y1, y2 = np.random.randint(low=0, high=image_height, size=(2, )) 
        else: 
            with label_file.open("r") as f: 
                box_line = f.readline() 
            _, x1, y1, x2, y2 = box_line.split() 
            x1 = int(float(x1) * image_width) 
            y1 = int(float(y1) * image_height) 
            x2 = int(float(x2) * image_width) 
            y2 = int(float(y2) * image_height) 
        return [[x1, y1], [x2, y2]] 

    def __getitem__(self, index): 
        image_path = self.images[index] 
        image = Image.open(image_path) 
        image = image.convert('RGB') 
        w, h = image.size 
        points = self.get_points(image_path, w, h) 
        labels = [1, 1] if index % 2 == 0 else [2, 3] 
        batched_images, batched_points, batched_point_labels = prepare_input(image, points, labels, torch_tensor=False) 
        return { 
            'batched_images': np.ascontiguousarray(batched_images)[0], 
            'batched_points': np.ascontiguousarray(batched_points)[0], 
            'batched_point_labels': np.ascontiguousarray(batched_point_labels)[0], 
        } 

    def __len__(self): 
        return len(self.images)
%%skip not $to_quantize.value 

coco_dataset = COCOLoader(OUT_DIR / 'coco128/images/train2017') 
calibration_loader = torch.utils.data.DataLoader(coco_dataset)

Run model quantization#

The nncf.quantize function provides an interface for model quantization. It requires an instance of the OpenVINO model and a quantization dataset. Optionally, additional parameters for the quantization process can be provided (number of samples for quantization, preset, ignored scope, and so on). EfficientSAM contains non-ReLU activation functions, which require asymmetric quantization of activations. For better results, we use the mixed quantization preset. The encoder part of the model is based on the Vision Transformer architecture; to enable special optimizations for this architecture type, model_type must be specified as transformer.
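For reference, the call in the cell below can also be written with the mixed preset spelled out explicitly instead of relying on it being selected automatically for the transformer model type. This is only a sketch of an equivalent invocation (using the same model and calibration_dataset objects defined in the next cell), not an extra step to run:

import nncf 

# Equivalent to the cell below, with the preset made explicit: 
# MIXED uses symmetric quantization for weights and asymmetric quantization for activations. 
quantized_model = nncf.quantize( 
    model, 
    calibration_dataset, 
    preset=nncf.QuantizationPreset.MIXED, 
    model_type=nncf.ModelType.TRANSFORMER, 
    subset_size=128, 
)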

%%skip not $to_quantize.value 

import nncf 

calibration_dataset = nncf.Dataset(calibration_loader) 

model = core.read_model(ov_model_path) 
quantized_model = nncf.quantize(
    model, 
    calibration_dataset, 
    model_type=nncf.parameters.ModelType.TRANSFORMER, 
    subset_size=128
) 
print("model quantization finished")
INFO:nncf:NNCF initialized successfully.Supported frameworks detected: torch, tensorflow, onnx, openvino
2024-07-13 00:20:24.222824: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on.You may see slightly different numerical results due to floating-point round-off errors from different computation orders.To turn them off, set the environment variable TF_ENABLE_ONEDNN_OPTS=0.
2024-07-13 00:20:24.255951: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-13 00:20:24.882804: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
Output()
Output()
INFO:nncf:57 ignored nodes were found by name in the NNCFGraph 
INFO:nncf:88 ignored nodes were found by name in the NNCFGraph
Output()
Output()
model quantization finished

Verify quantized model inference#

%%skip not $to_quantize.value 

compiled_model = core.compile_model(quantized_model, device.value) 

result = compiled_model(example_input) 

predicted_logits, predicted_iou = result[0], result[1] 

predicted_mask = postprocess_results(predicted_iou, predicted_logits) 

plt.figure(figsize=(20, 20)) 
plt.axis("off") 
plt.imshow(image) 
show_points(np.array(input_points), np.array(input_labels), plt.gca()) 
plt.figure(figsize=(20, 20)) 
plt.axis("off") 
plt.imshow(image) 
show_anns(predicted_mask, plt.gca()) 
plt.title(f"OpenVINO INT8 {model_id.value}", fontsize=18) 
plt.show()
../_images/efficient-sam-with-output_35_0.png ../_images/efficient-sam-with-output_35_1.png

Save quantized model to disk#

%%skip not $to_quantize.value 

quantized_model_path = Path(f"{model_id.value}_int8.xml") 
ov.save_model(quantized_model, quantized_model_path)

Compare quantized model size#

%%skip not $to_quantize.value 

fp16_weights = ov_model_path.with_suffix('.bin') 
quantized_weights = quantized_model_path.with_suffix('.bin') 

print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB") 
print(f"Size of INT8 quantized model is {quantized_weights.stat().st_size / 1024 / 1024:.2f} MB") 
print(f"Compression rate for INT8 model: {fp16_weights.stat().st_size / quantized_weights.stat().st_size:.3f}")
Size of FP16 model is 21.50 MB 
Size of INT8 quantized model is 11.08 MB 
Compression rate for INT8 model: 1.941

Compare inference time of the FP16 and INT8 models#

To measure the inference performance of the FP16 and INT8 models, we use benchmark_app.

Note: For the most accurate performance estimation, it is recommended to run benchmark_app in a terminal/command prompt after closing other applications.

!benchmark_app -m $ov_model_path -d $device.value -data_shape "batched_images[1,3,512,512],batched_points[1,1,2,2],batched_point_labels[1,1,2]" -t 15
[Step 1/11] Parsing and validating input arguments 
[ INFO ] Parsing input parameters 
[Step 2/11] Loading OpenVINO Runtime 
[ INFO ] OpenVINO: 
[ INFO ] Build .................................2024.4.0-16028-fe423b97163 
[ INFO ] 
[ INFO ] Device info: 
[ INFO ] AUTO 
[ INFO ] Build .................................2024.4.0-16028-fe423b97163 
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration 
[ WARNING ] Performance hint was not explicitly specified in command line.Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT.
[Step 4/11] Reading model files 
[ INFO ] Loading model files 
[ INFO ] Read model took 29.66 ms 
[ INFO ] Original model I/O parameters: 
[ INFO ] Model inputs: 
[ INFO ] batched_images (node: batched_images) : f32 / [...] / [?,?,?,?] 
[ INFO ] batched_points (node: batched_points) : i64 / [...] / [?,?,?,?] 
[ INFO ] batched_point_labels (node: batched_point_labels) : i64 / [...] / [?,?,?] 
[ INFO ] Model outputs: 
[ INFO ] *NO_NAME* (node: aten::reshape/Reshape_3) : f32 / [...] / [?,?,3,?,?] 
[ INFO ] *NO_NAME* (node: aten::reshape/Reshape_2) : f32 / [...] / [?,?,3] 
[Step 5/11] Resizing model to match image sizes and given batch 
[ INFO ] Model batch size: 1 
[Step 6/11] Configuring input of the model 
[ INFO ] Model inputs: 
[ INFO ] batched_images (node: batched_images) : f32 / [...]/ [?,?,?,?]
[ INFO ] batched_points (node: batched_points) : i64 / [...]/ [?,?,?,?]
[ INFO ] batched_point_labels (node: batched_point_labels) : i64 / [...]/ [?,?,?]
[ INFO ] Model outputs: 
[ INFO ] *NO_NAME* (node: aten::reshape/Reshape_3) : f32 / [...]/ [?,?,3,?,?]
[ INFO ] *NO_NAME* (node: aten::reshape/Reshape_2) : f32 / [...]/ [?,?,3] 
[Step 7/11] Loading the model to the device 
[ INFO ] Compile model took 1312.01 ms 
[Step 8/11] Querying optimal runtime parameters 
[ INFO ] Model: 
[ INFO ]     NETWORK_NAME: Model0 
[ INFO ]     EXECUTION_DEVICES: ['CPU'] 
[ INFO ]     PERFORMANCE_HINT: PerformanceMode.THROUGHPUT 
[ INFO ]     OPTIMAL_NUMBER_OF_INFER_REQUESTS: 6 
[ INFO ]     MULTI_DEVICE_PRIORITIES: CPU 
[ INFO ]     CPU: 
[ INFO ]       AFFINITY: Affinity.CORE 
[ INFO ]       CPU_DENORMALS_OPTIMIZATION: False 
[ INFO ]       CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 
[ INFO ]       DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 
[ INFO ]       ENABLE_CPU_PINNING: True 
[ INFO ]       ENABLE_HYPER_THREADING: True 
[ INFO ]       EXECUTION_DEVICES: ['CPU'] 
[ INFO ]       EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE 
[ INFO ]       INFERENCE_NUM_THREADS: 24 
[ INFO ]       INFERENCE_PRECISION_HINT: <Type: 'float32'> 
[ INFO ]       KV_CACHE_PRECISION: <Type: 'float16'> 
[ INFO ]       LOG_LEVEL: Level.NO 
[ INFO ]       MODEL_DISTRIBUTION_POLICY: set() 
[ INFO ]       NETWORK_NAME: Model0 
[ INFO ]       NUM_STREAMS: 6 
[ INFO ]       OPTIMAL_NUMBER_OF_INFER_REQUESTS: 6 
[ INFO ]       PERFORMANCE_HINT: THROUGHPUT 
[ INFO ]       PERFORMANCE_HINT_NUM_REQUESTS: 0 
[ INFO ]       PERF_COUNT: NO 
[ INFO ]       SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE 
[ INFO ] MODEL_PRIORITY: Priority.MEDIUM 
[ INFO ] LOADED_FROM_CACHE: False 
[ INFO ] PERF_COUNT: False 
[Step 9/11] Creating infer requests and preparing input tensors 
[ WARNING ] No input files were given for input 'batched_images'!.This input will be filled with random values! 
[ WARNING ] No input files were given for input 'batched_points'!. This input will be filled with random values! 
[ WARNING ] No input files were given for input 'batched_point_labels'!. This input will be filled with random values! 
[ INFO ] Fill input 'batched_images' with random values 
[ INFO ] Fill input 'batched_points' with random values 
[ INFO ] Fill input 'batched_point_labels' with random values 
[Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 15000 ms duration) 
[ INFO ] Benchmarking in full mode (inputs filling are included in measurement loop).
[ INFO ] First inference took 666.28 ms 
[Step 11/11] Dumping statistics report 
[ INFO ] Execution Devices:['CPU'] 
[ INFO ] Count: 49 iterations 
[ INFO ] Duration: 15804.58 ms 
[ INFO ] Latency: 
[ INFO ]     Median: 1903.83 ms 
[ INFO ]     Average: 1881.69 ms 
[ INFO ]     Min: 626.71 ms 
[ INFO ]     Max: 1969.71 ms 
[ INFO ] Throughput: 3.10 FPS
if to_quantize.value:
    !benchmark_app -m $quantized_model_path -d $device.value -data_shape "batched_images[1,3,512,512],batched_points[1,1,2,2],batched_point_labels[1,1,2]" -t 15
[Step 1/11] Parsing and validating input arguments 
[ INFO ] Parsing input parameters 
[Step 2/11] Loading OpenVINO Runtime 
[ INFO ] OpenVINO: 
[ INFO ] Build .................................2024.4.0-16028-fe423b97163 
[ INFO ] 
[ INFO ] Device info: 
[ INFO ] AUTO 
[ INFO ] Build .................................2024.4.0-16028-fe423b97163 
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration 
[ WARNING ] Performance hint was not explicitly specified in command line.Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT.
[Step 4/11] Reading model files 
[ INFO ] Loading model files 
[ INFO ] Read model took 43.30 ms 
[ INFO ] Original model I/O parameters: 
[ INFO ] Model inputs: 
[ INFO ] batched_images (node: batched_images) : f32 / [...]/ [?,?,?,?]
[ INFO ] batched_points (node: batched_points) : i64 / [...]/ [?,?,?,?]
[ INFO ] batched_point_labels (node: batched_point_labels) : i64 / [...]/ [?,?,?]
[ INFO ] Model outputs: 
[ INFO ] *NO_NAME* (node: aten::reshape/Reshape_3) : f32 / [...]/ [?,?,3,?,?]
[ INFO ] *NO_NAME* (node: aten::reshape/Reshape_2) : f32 / [...]/ [?,?,3] 
[Step 5/11] Resizing model to match image sizes and given batch 
[ INFO ] Model batch size: 1 
[Step 6/11] Configuring input of the model 
[ INFO ] Model inputs: 
[ INFO ] batched_images (node: batched_images) : f32 / [...]/ [?,?,?,?]
[ INFO ] batched_points (node: batched_points) : i64 / [...]/ [?,?,?,?]
[ INFO ] batched_point_labels (node: batched_point_labels) : i64 / [...]/ [?,?,?]
[ INFO ] Model outputs: 
[ INFO ] *NO_NAME* (node: aten::reshape/Reshape_3) : f32 / [...]/ [?,?,3,?,?]
[ INFO ] *NO_NAME* (node: aten::reshape/Reshape_2) : f32 / [...]/ [?,?,3] 
[Step 7/11] Loading the model to the device 
[ INFO ] Compile model took 1679.82 ms 
[Step 8/11] Querying optimal runtime parameters 
[ INFO ] Model: 
[ INFO ]     NETWORK_NAME: Model0 
[ INFO ]     EXECUTION_DEVICES: ['CPU'] 
[ INFO ]     PERFORMANCE_HINT: PerformanceMode.THROUGHPUT 
[ INFO ]     OPTIMAL_NUMBER_OF_INFER_REQUESTS: 6 
[ INFO ]     MULTI_DEVICE_PRIORITIES: CPU 
[ INFO ]     CPU: 
[ INFO ]       AFFINITY: Affinity.CORE 
[ INFO ]       CPU_DENORMALS_OPTIMIZATION: False 
[ INFO ]       CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 
[ INFO ]       DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 
[ INFO ]       ENABLE_CPU_PINNING: True 
[ INFO ]       ENABLE_HYPER_THREADING: True 
[ INFO ]       EXECUTION_DEVICES: ['CPU'] 
[ INFO ]       EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE 
[ INFO ]       INFERENCE_NUM_THREADS: 24 
[ INFO ]       INFERENCE_PRECISION_HINT: <Type: 'float32'> 
[ INFO ]       KV_CACHE_PRECISION: <Type: 'float16'> 
[ INFO ]       LOG_LEVEL: Level.NO 
[ INFO ]       MODEL_DISTRIBUTION_POLICY: set() 
[ INFO ]       NETWORK_NAME: Model0 
[ INFO ]       NUM_STREAMS: 6 
[ INFO ]       OPTIMAL_NUMBER_OF_INFER_REQUESTS: 6 
[ INFO ]       PERFORMANCE_HINT: THROUGHPUT 
[ INFO ]       PERFORMANCE_HINT_NUM_REQUESTS: 0 
[ INFO ]       PERF_COUNT: NO 
[ INFO ]       SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE 
[ INFO ] MODEL_PRIORITY: Priority.MEDIUM 
[ INFO ] LOADED_FROM_CACHE: False 
[ INFO ] PERF_COUNT: False 
[Step 9/11] Creating infer requests and preparing input tensors 
[ WARNING ] No input files were given for input 'batched_images'!.This input will be filled with random values! 
[ WARNING ] No input files were given for input 'batched_points'!.This input will be filled with random values! 
[ WARNING ] No input files were given for input 'batched_point_labels'!.This input will be filled with random values! 
[ INFO ] Fill input 'batched_images' with random values 
[ INFO ] Fill input 'batched_points' with random values 
[ INFO ] Fill input 'batched_point_labels' with random values 
[Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 15000 ms duration) 
[ INFO ] Benchmarking in full mode (inputs filling are included in measurement loop).
[ INFO ] First inference took 604.97 ms 
[Step 11/11] Dumping statistics report 
[ INFO ] Execution Devices:['CPU'] 
[ INFO ] Count: 55 iterations 
[ INFO ] Duration: 16291.10 ms 
[ INFO ] Latency: 
[ INFO ]     Median: 1758.14 ms 
[ INFO ]     Average: 1740.52 ms 
[ INFO ]     Min: 625.06 ms 
[ INFO ]     Max: 1830.61 ms 
[ INFO ] Throughput: 3.38 FPS

Interactive segmentation demo#

import copy 
import gradio as gr 
import numpy as np 
from PIL import ImageDraw, Image 
import cv2 
import matplotlib.pyplot as plt 

example_images = [ 
    "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/b8083dd5-1ce7-43bf-8b09-a2ebc280c86e", 
    "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/9a90595d-70e7-469b-bdaf-469ef4f56fa2", 
    "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/b626c123-9fa2-4aa6-9929-30565991bf0c", 
] 

examples_dir = Path("examples") 
examples_dir.mkdir(exist_ok=True) 

for img_id, image_url in enumerate(example_images): 
    r = requests.get(image_url) 
    img_path = examples_dir / f"example_{img_id}.jpg" 
    with img_path.open("wb") as f: 
        f.write(r.content) 

def sigmoid(x): 
    return 1 / (1 + np.exp(-x)) 

def clear(): 
    return None, None, [], [] 

def format_results(masks, scores, logits, filter=0): 
    annotations = [] 
    n = len(scores) 
    for i in range(n): 
        annotation = {} 

        mask = masks[i] 
        tmp = np.where(mask != 0) 
        if np.sum(mask) < filter: 
            continue 
        annotation["id"] = i 
        annotation["segmentation"] = mask 
        annotation["bbox"] = [ 
            np.min(tmp[0]), 
            np.min(tmp[1]), 
            np.max(tmp[1]), 
            np.max(tmp[0]), 
        ] 
        annotation["score"] = scores[i] 
        annotation["area"] = annotation["segmentation"].sum() 
        annotations.append(annotation) 
    return annotations 

def point_prompt(masks, points, point_label, target_height, target_width): # numpy 
    h = masks[0]["segmentation"].shape[0] 
    w = masks[0]["segmentation"].shape[1] 
    if h != target_height or w != target_width: 
        points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points] 
    onemask = np.zeros((h, w)) 
    for i, annotation in enumerate(masks): 
        if isinstance(annotation, dict): 
            mask = annotation["segmentation"] 
        else: 
            mask = annotation 
        for i, point in enumerate(points): 
            if point[1] < mask.shape[0] and point[0] < mask.shape[1]: 
                if mask[point[1], point[0]] == 1 and point_label[i] == 1: 
                    onemask += mask 
                if mask[point[1], point[0]] == 1 and point_label[i] == 0: 
                    onemask -= mask 
    onemask = onemask >= 1 
    return onemask, 0 

def show_mask( 
    annotation, 
    ax, 
    random_color=False, 
    bbox=None, 
    retinamask=True, 
    target_height=960, 
    target_width=960, 
): 
    mask_sum = annotation.shape[0] 
    height = annotation.shape[1] 
    weight = annotation.shape[2] 
    # Annotations are sorted by area 
    areas = np.sum(annotation, axis=(1, 2)) 
    sorted_indices = np.argsort(areas)[::1] 
    annotation = annotation[sorted_indices] 

    index = (annotation != 0).argmax(axis=0) 
    if random_color: 
        color = np.random.random((mask_sum, 1, 1, 3)) 
    else: 
        color = np.ones((mask_sum, 1, 1, 3)) * np.array([30 / 255, 144 / 255, 255 / 255]) 
    transparency = np.ones((mask_sum, 1, 1, 1)) * 0.6 
    visual = np.concatenate([color, transparency], axis=-1) 
    mask_image = np.expand_dims(annotation, -1) * visual 

    mask = np.zeros((height, weight, 4)) 

    h_indices, w_indices = np.meshgrid(np.arange(height), np.arange(weight), indexing="ij") 
    indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None)) 

    mask[h_indices, w_indices, :]= mask_image[indices] 
    if bbox is not None: 
        x1, y1, x2, y2 = bbox 
        ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor="b", linewidth=1)) 

    if not retinamask: 
        mask = cv2.resize(mask, (target_width, target_height), interpolation=cv2.INTER_NEAREST) 

    return mask 

def process( 
    annotations, 
    image, 
    scale, 
    better_quality=False, 
    mask_random_color=True, 
    bbox=None, 
    points=None, 
    use_retina=True, 
    withContours=True, 
): 
    if isinstance(annotations[0], dict): 
        annotations = [annotation["segmentation"] for annotation in annotations] 

    original_h = image.height 
    original_w = image.width 
    if better_quality: 
        if isinstance(annotations[0], torch.Tensor): 
            annotations = np.array(annotations) 
        for i, mask in enumerate(annotations): 
            mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8)) 
            annotations[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8)) 
    annotations = np.array(annotations) 
    inner_mask = show_mask( 
        annotations, 
        plt.gca(), 
        random_color=mask_random_color, 
        bbox=bbox, 
        retinamask=use_retina, 
        target_height=original_h, 
        target_width=original_w, 
    ) 

    if isinstance(annotations, torch.Tensor): 
        annotations = annotations.cpu().numpy() 

    if withContours: 
        contour_all = [] 
        temp = np.zeros((original_h, original_w, 1)) 
        for i, mask in enumerate(annotations): 
            if isinstance(mask, dict): 
                mask = mask["segmentation"] 
            annotation = mask.astype(np.uint8) 
            if not use_retina: 
                annotation = cv2.resize( 
                    annotation, 
                    (original_w, original_h), 
                    interpolation=cv2.INTER_NEAREST, 
                ) 
            contours, _ = cv2.findContours(annotation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) 
            for contour in contours: 
                contour_all.append(contour) 
        cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2 // scale) 
        color = np.array([0 / 255, 0 / 255, 255 / 255, 0.9]) 
        contour_mask = temp / 255 * color.reshape(1, 1, -1) 

    image = image.convert("RGBA") 
    overlay_inner = Image.fromarray((inner_mask * 255).astype(np.uint8), "RGBA") 
    image.paste(overlay_inner, (0, 0), overlay_inner) 

    if withContours: 
        overlay_contour = Image.fromarray((contour_mask * 255).astype(np.uint8), "RGBA") 
        image.paste(overlay_contour, (0, 0), overlay_contour) 

    return image 

# Description 
title = "<center><strong><font size='8'>Efficient Segment Anything with OpenVINO and EfficientSAM <font></strong></center>" 

description_p = """# Interactive Instance Segmentation 
                - Point-prompt instruction 
                <ol> 
                <li> Click on the left image (point input), visualizing the point on the right image </li> 
                <li> Click the button of Segment with Point Prompt </li> 
                </ol> 
                - Box-prompt instruction 
                <ol> 
                <li> Click on the left image (one point input), visualizing the point on the right image </li> 
                <li> Click on the left image (another point input), visualizing the point and the box on the right image</li> 
                <li> Click the button of Segment with Box Prompt </li> 
                </ol> 
                """ 

# Examples 
examples = [[img] for img in examples_dir.glob("*.jpg")] 

default_example = examples[0] 

css = "h1 { text-align: center } .about { text-align: justify; padding-left: 10%; padding-right: 10%; }" 

def segment_with_boxs( 
    image, 
    seg_image, 
    global_points, 
    global_point_label, 
    input_size=1024, 
    better_quality=False, 
    withContours=True, 
    use_retina=True, 
    mask_random_color=True, 
): 
    if global_points is None or len(global_points) < 2 or global_points[0] is None: 
        return image, global_points, global_point_label 

    input_size = int(input_size) 
    w, h = image.size 
    scale = input_size / max(w, h) 
    new_w = int(w * scale) 
    new_h = int(h * scale) 
    image = image.resize((new_w, new_h)) 

    scaled_points = np.array([[int(x * scale) for x in point] for point in global_points]) 
    scaled_points = scaled_points[:2] 
    scaled_point_label = np.array(global_point_label)[:2] 

    if scaled_points.size == 0 and scaled_point_label.size == 0: 
        return image, global_points, global_point_label 

    nd_image = np.array(image) 
    img_tensor = nd_image.astype(np.float32) / 255 
    img_tensor = np.transpose(img_tensor, (2, 0, 1)) 

    pts_sampled = np.reshape(scaled_points, [1, 1, -1, 2]) 
    pts_sampled = pts_sampled[:, :, :2, :] 
    pts_labels = np.reshape(np.array([2, 3]), [1, 1, 2]) 

    results = compiled_model([img_tensor[None, ...], pts_sampled, pts_labels]) 
    predicted_logits = results[0] 
    predicted_iou = results[1] 
    all_masks = sigmoid(predicted_logits[0, 0, :, :, :])>= 0.5 
    predicted_iou = predicted_iou[0, 0, ...] 

    max_predicted_iou = -1 
    selected_mask_using_predicted_iou = None 
    selected_predicted_iou = None 

    for m in range(all_masks.shape[0]): 
        curr_predicted_iou = predicted_iou[m] 
        if curr_predicted_iou > max_predicted_iou or selected_mask_using_predicted_iou is None: 
            max_predicted_iou = curr_predicted_iou 
            selected_mask_using_predicted_iou = all_masks[m : m + 1] 
            selected_predicted_iou = predicted_iou[m : m + 1] 

    results = format_results(selected_mask_using_predicted_iou, selected_predicted_iou, predicted_logits, 0) 

    annotations = results[0]["segmentation"] 
    annotations = np.array([annotations]) 
    fig = process( 
        annotations=annotations, 
        image=image, 
        scale=(1024 // input_size), 
        better_quality=better_quality, 
        mask_random_color=mask_random_color, 
        use_retina=use_retina, 
        bbox=scaled_points.reshape([4]), 
        withContours=withContours, 
    ) 

    global_points = [] 
    global_point_label = [] 
    return fig, global_points, global_point_label 

def segment_with_points( 
    image, 
    global_points, 
    global_point_label, 
    input_size=1024, 
    better_quality=False, 
    withContours=True, 
    use_retina=True, mask_random_color=True, 
): 
    input_size = int(input_size) 
    w, h = image.size 
    scale = input_size / max(w, h) 
    new_w = int(w * scale) 
    new_h = int(h * scale) 
    image = image.resize((new_w, new_h)) 

    if global_points is None or len(global_points) < 1 or global_points[0] is None: 
        return image, global_points, global_point_label 
    scaled_points = np.array([[int(x * scale) for x in point] for point in global_points]) 
    scaled_point_label = np.array(global_point_label) 

    if scaled_points.size == 0 and scaled_point_label.size == 0: 
        return image, global_points, global_point_label 

    nd_image = np.array(image) 
    img_tensor = (nd_image).astype(np.float32) / 255 
    img_tensor = np.transpose(img_tensor, (2, 0, 1)) 

    pts_sampled = np.reshape(scaled_points, [1, 1, -1, 2]) 
    pts_labels = np.reshape(np.array(global_point_label), [1, 1, -1]) 

    results = compiled_model([img_tensor[None, ...], pts_sampled, pts_labels]) 
    predicted_logits = results[0] 
    predicted_iou = results[1] 
    all_masks = sigmoid(predicted_logits[0, 0, :, :, :])>= 0.5 
    predicted_iou = predicted_iou[0, 0, ...] 

    results = format_results(all_masks, predicted_iou, predicted_logits, 0) 
    annotations, _ = point_prompt(results, scaled_points, scaled_point_label, new_h, new_w) 
    annotations = np.array([annotations]) 

    fig = process( 
        annotations=annotations, 
        image=image, 
        scale=(1024 // input_size), 
        better_quality=better_quality, 
        mask_random_color=mask_random_color, 
        points=scaled_points, 
        bbox=None, 
        use_retina=use_retina, 
        withContours=withContours, 
    ) 

    global_points = [] 
    global_point_label = [] 
    # return fig, None 
    return fig, global_points, global_point_label 

def get_points_with_draw(image, cond_image, global_points, global_point_label, evt: gr.SelectData): 
    print(global_points) 
    if len(global_points) == 0: 
        image = copy.deepcopy(cond_image) 
    x, y = evt.index[0], evt.index[1] 
    label = "Add Mask" 
    point_radius, point_color = 15, ( 
        (255, 255, 0) 
        if label == "Add Mask" 
        else ( 
            255, 
            0, 
            255, 
        ) 
    ) 
    global_points.append([x, y]) 
    global_point_label.append(1 if label == "Add Mask" else 0) 

    if image is not None: 
        draw = ImageDraw.Draw(image) 

        draw.ellipse( 
            [ 
                (x - point_radius, y - point_radius), 
                (x + point_radius, y + point_radius), 
            ], 
            fill=point_color, 
        ) 

    return image, global_points, global_point_label 

def get_points_with_draw_(image, cond_image, global_points, global_point_label, evt: gr.SelectData): 
    if len(global_points) == 0: 
        image = copy.deepcopy(cond_image) 
    if len(global_points) > 2: 
        return image, global_points, global_point_label 
    x, y = evt.index[0], evt.index[1] 
    label = "Add Mask" 
    point_radius, point_color = 15, ( 
        (255, 255, 0) 
        if label == "Add Mask" 
        else ( 
            255, 
            0, 
            255, 
        ) 
    ) 
    global_points.append([x, y]) 
    global_point_label.append(1 if label == "Add Mask" else 0) 

    if image is not None: 
        draw = ImageDraw.Draw(image) 
        draw.ellipse( 
            [ 
                (x - point_radius, y - point_radius), 
                (x + point_radius, y + point_radius), 
            ], 
            fill=point_color, 
        ) 

    if len(global_points) == 2: 
        x1, y1 = global_points[0] 
        x2, y2 = global_points[1] 
        if x1 < x2 and y1 < y2: 
            draw.rectangle([x1, y1, x2, y2], outline="red", width=5) 
        elif x1 < x2 and y1 >= y2: 
            draw.rectangle([x1, y2, x2, y1], outline="red", width=5) 
            global_points[0][0] = x1 
            global_points[0][1] = y2 
            global_points[1][0] = x2 
            global_points[1][1] = y1 
        elif x1 >= x2 and y1 < y2: 
            draw.rectangle([x2, y1, x1, y2], outline="red", width=5) 
            global_points[0][0] = x2 
            global_points[0][1] = y1 
            global_points[1][0] = x1 
            global_points[1][1] = y2 
        elif x1 >= x2 and y1 >= y2: 
            draw.rectangle([x2, y2, x1, y1], outline="red", width=5) 
            global_points[0][0] = x2 
            global_points[0][1] = y2 
            global_points[1][0] = x1 
            global_points[1][1] = y1 

    return image, global_points, global_point_label 

cond_img_p = gr.Image(label="Input with Point", value=default_example[0], type="pil") 
cond_img_b = gr.Image(label="Input with Box", value=default_example[0], type="pil") 

segm_img_p = gr.Image(label="Segmented Image with Point-Prompt", interactive=False, type="pil") 
segm_img_b = gr.Image(label="Segmented Image with Box-Prompt", interactive=False, type="pil") 

with gr.Blocks(css=css, title="Efficient SAM") as demo: 
    global_points = gr.State([]) 
    global_point_label = gr.State([]) 
    with gr.Row(): 
        with gr.Column(scale=1):
            # Title 
            gr.Markdown(title) 

    with gr.Tab("Point mode"):
        # Images
        with gr.Row(variant="panel"): 
            with gr.Column(scale=1): 
                cond_img_p.render() 

            with gr.Column(scale=1): 
                segm_img_p.render() 

        # Submit and clear 
        # ### 
        with gr.Row(): 
            with gr.Column(): 
                with gr.Column(): 
                    segment_btn_p = gr.Button("Segment with Point Prompt", variant="primary") 
                    clear_btn_p = gr.Button("Clear", variant="secondary") 

                gr.Markdown("Try some of the examples below ⬇️") 
                gr.Examples( 
                    examples=examples, 
                    inputs=[cond_img_p], 
                    examples_per_page=4, 
                ) 

            with gr.Column():
                # Description 
                gr.Markdown(description_p) 

    with gr.Tab("Box mode"):
        # Images 
        with gr.Row(variant="panel"): 
            with gr.Column(scale=1): 
                cond_img_b.render() 

            with gr.Column(scale=1): 
                segm_img_b.render() 

        # Submit and clear 
        with gr.Row(): 
            with gr.Column(): 
                with gr.Column(): 
                    segment_btn_b = gr.Button("Segment with Box Prompt", variant="primary") 
                    clear_btn_b = gr.Button("Clear", variant="secondary") 

                gr.Markdown("Try some of the examples below ⬇️") 
                gr.Examples( 
                    examples=examples, 
                    inputs=[cond_img_b], 
                    examples_per_page=4, 
                ) 

        with gr.Column():
            # Description 
            gr.Markdown(description_p) 

    cond_img_p.select( 
        get_points_with_draw, 
        inputs=[segm_img_p, cond_img_p, global_points, global_point_label], 
        outputs=[segm_img_p, global_points, global_point_label], 
    ) 

    cond_img_b.select( 
        get_points_with_draw_, 
        [segm_img_b, cond_img_b, global_points, global_point_label], 
        [segm_img_b, global_points, global_point_label], 
    ) 

    segment_btn_p.click( 
        segment_with_points, 
        inputs=[cond_img_p, global_points, global_point_label], 
        outputs=[segm_img_p, global_points, global_point_label], 
    ) 

    segment_btn_b.click( 
        segment_with_boxs, 
        inputs=[cond_img_b, segm_img_b, global_points, global_point_label], 
        outputs=[segm_img_b, global_points, global_point_label], 
    ) 

    clear_btn_p.click(clear, outputs=[cond_img_p, segm_img_p, global_points, global_point_label]) 
    clear_btn_b.click(clear, outputs=[cond_img_b, segm_img_b, global_points, global_point_label]) 

demo.queue() 
try: 
    demo.launch(debug=False) 
except Exception: 
    demo.launch(share=True, debug=False) 
# If you want to launch remotely, specify server_name and server_port 
# demo.launch(server_name='your server name', server_port='server port in int') 
# Read more in the docs: https://gradio.app/docs/
Running on local URL: http://127.0.0.1:7860 
To create a public link, set share=True in launch().