feat: show OCR bounding box (#23717)

* feat: ocr bounding box * bounding boxes * pr feedback * pr feedback * allow copy across text boxes * pr feedback
2025-11-19 09:52:40 +07:00 · 2025-11-19 09:52:40 +07:00 · 56e431226f
parent f59417cc77
commit 56e431226f
9 changed files with 293 additions and 5 deletions
--- a/i18n/en.json
+++ b/i18n/en.json
@ -1158,6 +1158,7 @@
  "hide_named_person": "Hide person {name}",
  "hide_password": "Hide password",
  "hide_person": "Hide person",
+  "hide_text_recognition": "Hide text recognition",
  "hide_unnamed_people": "Hide unnamed people",
  "home_page_add_to_album_conflicts": "Added {added} assets to album {album}. {failed} assets are already in the album.",
  "home_page_add_to_album_err_local": "Can not add local assets to albums yet, skipping",
@ -1967,6 +1968,7 @@
  "show_slideshow_transition": "Show slideshow transition",
  "show_supporter_badge": "Supporter badge",
  "show_supporter_badge_description": "Show a supporter badge",
+  "show_text_recognition": "Show text recognition",
  "show_text_search_menu": "Show text search menu",
  "shuffle": "Shuffle",
  "sidebar": "Sidebar",
@ -2037,6 +2039,7 @@
  "tags": "Tags",
  "tap_to_run_job": "Tap to run job",
  "template": "Template",
+  "text_recognition": "Text recognition",
  "theme": "Theme",
  "theme_selection": "Theme selection",
  "theme_selection_description": "Automatically set the theme to light or dark based on your browser's system preference",
--- a/web/src/lib/actions/zoom-image.ts
+++ b/web/src/lib/actions/zoom-image.ts
@ -2,7 +2,7 @@ import { photoZoomState } from '$lib/stores/zoom-image.store';
 import { useZoomImageWheel } from '@zoom-image/svelte';
 import { get } from 'svelte/store';

-export const zoomImageAction = (node: HTMLElement) => {
+export const zoomImageAction = (node: HTMLElement, options?: { disabled?: boolean }) => {
  const { createZoomImage, zoomImageState, setZoomImageState } = useZoomImageWheel();

  createZoomImage(node, {
@ -14,9 +14,32 @@ export const zoomImageAction = (node: HTMLElement) => {
    setZoomImageState(state);
  }

+  // Guard listeners: while `options.disabled` is truthy they call
+  // stopImmediatePropagation(), so listeners that would run after them for
+  // the same event do not fire. When not disabled they do nothing.
+  // NOTE(review): presumably this is what silences the zoom library's own
+  // wheel/pointer listeners on `node` - confirm against @zoom-image internals.
+  const wheelHandler = (event: WheelEvent) => {
+    if (options?.disabled) {
+      event.stopImmediatePropagation();
+    }
+  };
+
+  const pointerDownHandler = (event: PointerEvent) => {
+    if (options?.disabled) {
+      event.stopImmediatePropagation();
+    }
+  };
+
+  // Register in the capture phase so the guards see the event on its way
+  // down to the target, before bubble-phase listeners run.
+  node.addEventListener('wheel', wheelHandler, { capture: true });
+  node.addEventListener('pointerdown', pointerDownHandler, { capture: true });
+
  const unsubscribes = [photoZoomState.subscribe(setZoomImageState), zoomImageState.subscribe(photoZoomState.set)];
+
  return {
+    update(newOptions?: { disabled?: boolean }) {
+      options = newOptions;
+    },
    destroy() {
+      node.removeEventListener('wheel', wheelHandler, { capture: true });
+      node.removeEventListener('pointerdown', pointerDownHandler, { capture: true });
      for (const unsubscribe of unsubscribes) {
        unsubscribe();
      }
--- a/web/src/lib/components/asset-viewer/asset-viewer.svelte
+++ b/web/src/lib/components/asset-viewer/asset-viewer.svelte
@ -13,6 +13,7 @@
  import type { TimelineAsset } from '$lib/managers/timeline-manager/types';
  import { closeEditorCofirm } from '$lib/stores/asset-editor.store';
  import { assetViewingStore } from '$lib/stores/asset-viewing.store';
+  import { ocrManager } from '$lib/stores/ocr.svelte';
  import { alwaysLoadOriginalVideo, isShowDetail } from '$lib/stores/preferences.store';
  import { SlideshowNavigation, SlideshowState, slideshowStore } from '$lib/stores/slideshow.store';
  import { user } from '$lib/stores/user.store';
@ -44,6 +45,7 @@
  import CropArea from './editor/crop-tool/crop-area.svelte';
  import EditorPanel from './editor/editor-panel.svelte';
  import ImagePanoramaViewer from './image-panorama-viewer.svelte';
+  import OcrButton from './ocr-button.svelte';
  import PhotoViewer from './photo-viewer.svelte';
  import SlideshowBar from './slideshow-bar.svelte';
  import VideoViewer from './video-wrapper-viewer.svelte';
@ -392,9 +394,13 @@
      handlePromiseError(activityManager.init(album.id, asset.id));
    }
  });
+
+  let currentAssetId = $derived(asset.id);
  $effect(() => {
-    if (asset.id) {
-      handlePromiseError(handleGetAllAlbums());
+    if (currentAssetId) {
+      untrack(() => handlePromiseError(handleGetAllAlbums()));
+      ocrManager.clear();
+      handlePromiseError(ocrManager.getAssetOcr(currentAssetId));
    }
  });
 </script>
@ -535,6 +541,7 @@
            {playOriginalVideo}
          />
        {/if}
+
        {#if $slideshowState === SlideshowState.None && isShared && ((album && album.isActivityEnabled) || activityManager.commentCount > 0) && !activityManager.isLoading}
          <div class="absolute bottom-0 end-0 mb-20 me-8">
            <ActivityStatus
@ -547,6 +554,12 @@
            />
          </div>
        {/if}
+
+        {#if $slideshowState === SlideshowState.None && asset.type === AssetTypeEnum.Image && !isShowEditor && ocrManager.hasOcrData}
+          <div class="absolute bottom-0 end-0 mb-6 me-6">
+            <OcrButton />
+          </div>
+        {/if}
      {/key}
    {/if}
  </div>
--- a/web/src/lib/components/asset-viewer/detail-panel.svelte
+++ b/web/src/lib/components/asset-viewer/detail-panel.svelte
@ -503,7 +503,7 @@
 {/if}

 {#if albums.length > 0}
-  <section class="px-6 pt-6 dark:text-immich-dark-fg">
+  <section class="px-6 py-6 dark:text-immich-dark-fg">
    <p class="uppercase pb-4 text-sm">{$t('appears_in')}</p>
    {#each albums as album (album.id)}
      <a href={resolve(`${AppRoute.ALBUMS}/${album.id}`)}>
--- a/web/src/lib/components/asset-viewer/ocr-bounding-box.svelte
+++ b/web/src/lib/components/asset-viewer/ocr-bounding-box.svelte
@ -0,0 +1,36 @@
+<script lang="ts">
+  // Renders one OCR result: a highlighted box plus a text layer of the same
+  // size and transform that carries the recognized text.
+  import type { OcrBox } from '$lib/utils/ocr-utils';
+  import { calculateBoundingBoxDimensions } from '$lib/utils/ocr-utils';
+
+  type Props = {
+    // The OCR box to draw; its `points` and `text` are read below.
+    ocrBox: OcrBox;
+  };
+
+  let { ocrBox }: Props = $props();
+
+  // Extent, centre and rotation/skew angles derived from the box corners.
+  const dimensions = $derived(calculateBoundingBoxDimensions(ocrBox.points));
+
+  // CSS transform: move to the (minX, minY) corner, then rotate and skew by
+  // the computed angles (degrees).
+  const transform = $derived(
+    `translate(${dimensions.minX}px, ${dimensions.minY}px) rotate(${dimensions.rotation}deg) skew(${dimensions.skewX}deg, ${dimensions.skewY}deg)`,
+  );
+
+  // Transform origin: the box centre expressed relative to the (minX, minY) corner.
+  const transformOrigin = $derived(
+    `${dimensions.centerX - dimensions.minX}px ${dimensions.centerY - dimensions.minY}px`,
+  );
+</script>
+
+<div class="absolute group left-0 top-0 pointer-events-none">
+  <!-- Bounding box with CSS transforms -->
+  <div
+    class="absolute border-2 border-blue-500 bg-blue-500/10 cursor-pointer pointer-events-auto transition-all group-hover:bg-blue-500/30 group-hover:border-blue-600 group-hover:border-[3px]"
+    style="width: {dimensions.width}px; height: {dimensions.height}px; transform: {transform}; transform-origin: {transformOrigin};"
+  ></div>
+
+  <!-- Text overlay - always rendered but invisible, allows text selection and copy -->
+  <div
+    class="absolute flex items-center justify-center text-transparent text-sm px-2 py-1 pointer-events-auto cursor-text whitespace-pre-wrap wrap-break-word select-text group-hover:text-white group-hover:bg-black/75 group-hover:z-10"
+    style="width: {dimensions.width}px; height: {dimensions.height}px; transform: {transform}; transform-origin: {transformOrigin};"
+  >
+    {ocrBox.text}
+  </div>
+</div>
--- a/web/src/lib/components/asset-viewer/ocr-button.svelte
+++ b/web/src/lib/components/asset-viewer/ocr-button.svelte
@ -0,0 +1,17 @@
+<script lang="ts">
+  import { ocrManager } from '$lib/stores/ocr.svelte';
+  import { IconButton } from '@immich/ui';
+  import { mdiTextRecognition } from '@mdi/js';
+  import { t } from 'svelte-i18n';
+</script>
+
+<!--
+  Toggles the OCR overlay. The class is a real expression on
+  ocrManager.showOverlay so the highlight follows the overlay state; a quoted
+  string containing braces would be passed through verbatim and never evaluated.
+-->
+<IconButton
+  title={ocrManager.showOverlay ? $t('hide_text_recognition') : $t('show_text_recognition')}
+  icon={mdiTextRecognition}
+  class={ocrManager.showOverlay ? 'dark bg-immich-primary text-white' : 'dark'}
+  color="secondary"
+  variant="ghost"
+  shape="round"
+  aria-label={$t('text_recognition')}
+  onclick={() => ocrManager.toggleOcrBoundingBox()}
+/>
--- a/web/src/lib/components/asset-viewer/photo-viewer.svelte
+++ b/web/src/lib/components/asset-viewer/photo-viewer.svelte
@ -2,12 +2,14 @@
  import { shortcuts } from '$lib/actions/shortcut';
  import { zoomImageAction } from '$lib/actions/zoom-image';
  import FaceEditor from '$lib/components/asset-viewer/face-editor/face-editor.svelte';
+  import OcrBoundingBox from '$lib/components/asset-viewer/ocr-bounding-box.svelte';
  import BrokenAsset from '$lib/components/assets/broken-asset.svelte';
  import { assetViewerFadeDuration } from '$lib/constants';
  import { castManager } from '$lib/managers/cast-manager.svelte';
  import type { TimelineAsset } from '$lib/managers/timeline-manager/types';
  import { photoViewerImgElement } from '$lib/stores/assets-store.svelte';
  import { isFaceEditMode } from '$lib/stores/face-edit.svelte';
+  import { ocrManager } from '$lib/stores/ocr.svelte';
  import { boundingBoxesArray } from '$lib/stores/people.store';
  import { alwaysLoadOriginalFile } from '$lib/stores/preferences.store';
  import { SlideshowLook, SlideshowState, slideshowLookCssMapping, slideshowStore } from '$lib/stores/slideshow.store';
@ -15,6 +17,7 @@
  import { getAssetOriginalUrl, getAssetThumbnailUrl, handlePromiseError } from '$lib/utils';
  import { canCopyImageToClipboard, copyImageToClipboard, isWebCompatibleImage } from '$lib/utils/asset-utils';
  import { handleError } from '$lib/utils/handle-error';
+  import { getOcrBoundingBoxes } from '$lib/utils/ocr-utils';
  import { getBoundingBox } from '$lib/utils/people-utils';
  import { cancelImageUrl } from '$lib/utils/sw-messaging';
  import { getAltText } from '$lib/utils/thumbnail-util';
@ -71,6 +74,14 @@
    $boundingBoxesArray = [];
  });

+  let ocrBoxes = $derived(
+    ocrManager.showOverlay && $photoViewerImgElement
+      ? getOcrBoundingBoxes(ocrManager.data, $photoZoomState, $photoViewerImgElement)
+      : [],
+  );
+
+  let isOcrActive = $derived(ocrManager.showOverlay);
+
  const preload = (targetSize: AssetMediaSize | 'original', preloadAssets?: TimelineAsset[]) => {
    for (const preloadAsset of preloadAssets || []) {
      if (preloadAsset.isImage) {
@ -130,9 +141,15 @@
    if ($photoZoomState.currentZoom > 1) {
      return;
    }
+
+    if (ocrManager.showOverlay) {
+      return;
+    }
+
    if (onNextAsset && event.detail.direction === 'left') {
      onNextAsset();
    }
+
    if (onPreviousAsset && event.detail.direction === 'right') {
      onPreviousAsset();
    }
@ -235,7 +252,7 @@
    </div>
  {:else if !imageError}
    <div
-      use:zoomImageAction
+      use:zoomImageAction={{ disabled: isOcrActive }}
      {...useSwipe(onSwipe)}
      class="h-full w-full"
      transition:fade={{ duration: haveFadeTransition ? assetViewerFadeDuration : 0 }}
@ -264,6 +281,10 @@
          style="top: {boundingbox.top}px; left: {boundingbox.left}px; height: {boundingbox.height}px; width: {boundingbox.width}px;"
        ></div>
      {/each}
+
+      {#each ocrBoxes as ocrBox (ocrBox.id)}
+        <OcrBoundingBox {ocrBox} />
+      {/each}
    </div>

    {#if isFaceEditMode.value}
--- a/web/src/lib/stores/ocr.svelte.ts
+++ b/web/src/lib/stores/ocr.svelte.ts
@ -0,0 +1,44 @@
+import { getAssetOcr } from '@immich/sdk';
+
+/**
+ * One OCR result as stored by the OCR manager.
+ * NOTE(review): the consumer in ocr-utils treats (x1, y1)..(x4, y4) as
+ * normalized 0-1 coordinates in the order top-left, top-right, bottom-right,
+ * bottom-left - confirm against the server response.
+ */
+export type OcrBoundingBox = {
+  id: string;
+  assetId: string;
+  x1: number;
+  y1: number;
+  x2: number;
+  y2: number;
+  x3: number;
+  y3: number;
+  x4: number;
+  y4: number;
+  // presumably the detection confidence for the box itself - TODO confirm
+  boxScore: number;
+  // presumably the recognition confidence for `text` - TODO confirm
+  textScore: number;
+  text: string;
+};
+
+/**
+ * Holds the OCR results for the asset currently being viewed and whether the
+ * bounding-box overlay is visible.
+ */
+class OcrManager {
+  #data = $state<OcrBoundingBox[]>([]);
+  // Bumped by every load and every clear(). A response is applied only when
+  // its token is still the latest, so a slow request for a previous asset
+  // cannot overwrite the state belonging to the current one.
+  #requestToken = 0;
+  showOverlay = $state(false);
+  hasOcrData = $state(false);
+
+  /** The OCR boxes most recently loaded (empty after clear()). */
+  get data() {
+    return this.#data;
+  }
+
+  /**
+   * Fetch the OCR boxes for asset `id` and publish them, unless a newer load
+   * or a clear() happened while the request was in flight.
+   * A rejection from the SDK call propagates to the caller.
+   */
+  async getAssetOcr(id: string) {
+    const token = ++this.#requestToken;
+    const data = await getAssetOcr({ id });
+    if (token !== this.#requestToken) {
+      // Superseded while awaiting: drop the stale response.
+      return;
+    }
+    this.#data = data;
+    this.hasOcrData = data.length > 0;
+  }
+
+  /** Drop the current results, hide the overlay and invalidate any in-flight load. */
+  clear() {
+    this.#requestToken++;
+    this.#data = [];
+    this.showOverlay = false;
+    this.hasOcrData = false;
+  }
+
+  /** Flip the visibility of the bounding-box overlay. */
+  toggleOcrBoundingBox() {
+    this.showOverlay = !this.showOverlay;
+  }
+}
+
+export const ocrManager = new OcrManager();
--- a/web/src/lib/utils/ocr-utils.ts
+++ b/web/src/lib/utils/ocr-utils.ts
@ -0,0 +1,131 @@
+import type { OcrBoundingBox } from '$lib/stores/ocr.svelte';
+import type { ZoomImageWheelState } from '@zoom-image/core';
+
+// Size of the image as drawn inside its element box when the aspect ratio is
+// preserved: fill the element's height first, and fall back to filling its
+// width if that would overflow horizontally.
+const getContainedSize = (img: HTMLImageElement): { width: number; height: number } => {
+  const aspectRatio = img.naturalWidth / img.naturalHeight;
+  const widthAtFullHeight = img.height * aspectRatio;
+
+  if (widthAtFullHeight > img.width) {
+    return { width: img.width, height: img.width / aspectRatio };
+  }
+
+  return { width: widthAtFullHeight, height: img.height };
+};
+
+/** An OCR result converted to screen space, ready to be rendered. */
+export interface OcrBox {
+  id: string;
+  // Corner points in pixels; getOcrBoundingBoxes emits four per box.
+  points: { x: number; y: number }[];
+  text: string;
+  // Filled from the source record's `textScore` by getOcrBoundingBoxes.
+  confidence: number;
+}
+
+/** Result of calculateBoundingBoxDimensions for one set of corner points. */
+export interface BoundingBoxDimensions {
+  // Axis-aligned extent of the corner points.
+  minX: number;
+  maxX: number;
+  minY: number;
+  maxY: number;
+  // maxX - minX and maxY - minY.
+  width: number;
+  height: number;
+  // Midpoint of the axis-aligned extent.
+  centerX: number;
+  centerY: number;
+  // Angle of the bottom edge, in degrees.
+  rotation: number;
+  // Differences between opposite edge angles, in degrees.
+  skewX: number;
+  skewY: number;
+}
+
+/**
+ * Derive the axis-aligned extent, centre, rotation and skew of an OCR quadrilateral.
+ * @param points - The four corners, read in the order top-left, top-right, bottom-right, bottom-left
+ * @returns Extent and centre in the units of `points`, plus rotation and skew angles in degrees
+ */
+export const calculateBoundingBoxDimensions = (points: { x: number; y: number }[]): BoundingBoxDimensions => {
+  const RAD_TO_DEG = 180 / Math.PI;
+
+  // Direction, in radians, of the edge running from `from` to `to`
+  const edgeAngle = (from: { x: number; y: number }, to: { x: number; y: number }) =>
+    Math.atan2(to.y - from.y, to.x - from.x);
+
+  const [topLeft, topRight, bottomRight, bottomLeft] = points;
+
+  const xs = points.map((point) => point.x);
+  const ys = points.map((point) => point.y);
+  const minX = Math.min(...xs);
+  const maxX = Math.max(...xs);
+  const minY = Math.min(...ys);
+  const maxY = Math.max(...ys);
+
+  // The bottom edge (bottomLeft to bottomRight) defines the rotation and is
+  // also one side of the skewY comparison.
+  const bottomAngle = edgeAngle(bottomLeft, bottomRight);
+  const topAngle = edgeAngle(topLeft, topRight);
+  const leftAngle = edgeAngle(topLeft, bottomLeft);
+  const rightAngle = edgeAngle(topRight, bottomRight);
+
+  return {
+    minX,
+    maxX,
+    minY,
+    maxY,
+    width: maxX - minX,
+    height: maxY - minY,
+    centerX: (minX + maxX) / 2,
+    centerY: (minY + maxY) / 2,
+    rotation: bottomAngle * RAD_TO_DEG,
+    // SkewX: right edge versus left edge
+    skewX: (rightAngle - leftAngle) * RAD_TO_DEG,
+    // SkewY: bottom edge versus top edge
+    skewY: (bottomAngle - topAngle) * RAD_TO_DEG,
+  };
+};
+
+/**
+ * Map OCR results onto the screen for the current zoom state.
+ * The stored coordinates are normalized (0-1) corners of a possibly rotated
+ * rectangle; each is scaled to the displayed image size, shifted by the
+ * letterbox margin inside the element, and moved by the zoom position.
+ * Returns an empty list when there is no image element or its natural size
+ * is not known yet.
+ */
+export const getOcrBoundingBoxes = (
+  ocrData: OcrBoundingBox[],
+  zoom: ZoomImageWheelState,
+  photoViewer: HTMLImageElement | null,
+): OcrBox[] => {
+  if (photoViewer === null || !photoViewer.naturalWidth || !photoViewer.naturalHeight) {
+    return [];
+  }
+
+  const { clientWidth, clientHeight, naturalWidth: imageWidth, naturalHeight: imageHeight } = photoViewer;
+  const { width, height } = getContainedSize(photoViewer);
+
+  // Per-axis scale from natural pixels to zoomed screen pixels
+  const scaleX = (width / imageWidth) * zoom.currentZoom;
+  const scaleY = (height / imageHeight) * zoom.currentZoom;
+  // Zoomed margin between the element edge and the contained image
+  const marginX = ((clientWidth - width) / 2) * zoom.currentZoom;
+  const marginY = ((clientHeight - height) / 2) * zoom.currentZoom;
+
+  const toScreen = (normalizedX: number, normalizedY: number) => ({
+    x: scaleX * normalizedX * imageWidth + marginX + zoom.currentPositionX,
+    y: scaleY * normalizedY * imageHeight + marginY + zoom.currentPositionY,
+  });
+
+  return ocrData.map((ocr) => ({
+    id: ocr.id,
+    points: [toScreen(ocr.x1, ocr.y1), toScreen(ocr.x2, ocr.y2), toScreen(ocr.x3, ocr.y3), toScreen(ocr.x4, ocr.y4)],
+    text: ocr.text,
+    confidence: ocr.textScore,
+  }));
+};