// immich/web/src/lib/utils/ocr-utils.ts

import type { OcrBoundingBox } from '$lib/stores/ocr.svelte';
import type { ZoomImageWheelState } from '@zoom-image/core';

/**
 * Work out the largest size at which the image fits inside the element's
 * `width` x `height` box while keeping its natural aspect ratio.
 *
 * The image is first sized to the full box height; if that makes it wider than
 * the box, it is sized to the full box width instead.
 *
 * @param img - Image element whose natural and layout dimensions are read
 * @returns The fitted width and height
 */
const getContainedSize = (img: HTMLImageElement): { width: number; height: number } => {
  const { naturalWidth, naturalHeight, width: boxWidth, height: boxHeight } = img;
  const aspect = naturalWidth / naturalHeight;

  // Candidate: fill the box vertically and derive the width from the aspect ratio.
  const widthAtFullHeight = boxHeight * aspect;
  if (widthAtFullHeight > boxWidth) {
    // Too wide for the box: fill it horizontally instead.
    return { width: boxWidth, height: boxWidth / aspect };
  }

  return { width: widthAtFullHeight, height: boxHeight };
};

/**
 * A single OCR text detection, in the form returned by `getOcrBoundingBoxes`.
 */
export interface OcrBox {
  /** Identifier copied from the source OCR record. */
  id: string;
  /**
   * Corner points of the detected text region. `getOcrBoundingBoxes` emits four
   * points, in the order of the source record's (x1, y1) … (x4, y4) pairs.
   */
  points: { x: number; y: number }[];
  /** The recognized text. */
  text: string;
  /** Score for the detection; populated from the source record's `textScore`. */
  confidence: number;
}

/**
 * Geometry derived from a set of corner points by `calculateBoundingBoxDimensions`.
 */
export interface BoundingBoxDimensions {
  /** Smallest x coordinate among the points. */
  minX: number;
  /** Largest x coordinate among the points. */
  maxX: number;
  /** Smallest y coordinate among the points. */
  minY: number;
  /** Largest y coordinate among the points. */
  maxY: number;
  /** Axis-aligned extent along x (`maxX - minX`). */
  width: number;
  /** Axis-aligned extent along y (`maxY - minY`). */
  height: number;
  /** Midpoint between `minX` and `maxX`. */
  centerX: number;
  /** Midpoint between `minY` and `maxY`. */
  centerY: number;
  /** Angle of the bottom edge (bottom-left to bottom-right), in degrees. */
  rotation: number;
  /** Right-edge angle minus left-edge angle, in degrees. */
  skewX: number;
  /** Bottom-edge angle minus top-edge angle, in degrees. */
  skewY: number;
}

/**
 * Calculate bounding box dimensions and properties from OCR points.
 *
 * The axis-aligned extent (min/max, width, height, center) is taken over every
 * supplied point. Rotation and skew are derived from the first four points,
 * which are interpreted in the order top-left, top-right, bottom-right,
 * bottom-left.
 *
 * @param points - Corner points of the bounding box; at least 4 are required
 * @returns Dimensions, rotation, and skew values for the bounding box. All
 *   angles are in degrees; skew values are normalized to the range (-180, 180].
 * @throws TypeError if fewer than 4 points are supplied
 */
export const calculateBoundingBoxDimensions = (points: { x: number; y: number }[]): BoundingBoxDimensions => {
  if (points.length < 4) {
    throw new TypeError(`Expected at least 4 corner points, received ${points.length}`);
  }

  const [topLeft, topRight, bottomRight, bottomLeft] = points;

  const xs = points.map(({ x }) => x);
  const ys = points.map(({ y }) => y);
  const minX = Math.min(...xs);
  const maxX = Math.max(...xs);
  const minY = Math.min(...ys);
  const maxY = Math.max(...ys);
  const width = maxX - minX;
  const height = maxY - minY;
  const centerX = (minX + maxX) / 2;
  const centerY = (minY + maxY) / 2;

  const radiansToDegrees = 180 / Math.PI;

  // Direction of the edge running from `from` to `to`, in radians.
  const edgeAngle = (from: { x: number; y: number }, to: { x: number; y: number }): number =>
    Math.atan2(to.y - from.y, to.x - from.x);

  // Difference between two atan2 angles, in degrees. atan2 jumps from +PI to
  // -PI, so two nearly parallel edges that straddle that jump would otherwise
  // report a difference close to +/-360 instead of close to 0. A single
  // correction is enough because the raw difference lies within [-360, 360].
  const angleDifferenceInDegrees = (angle: number, reference: number): number => {
    const difference = (angle - reference) * radiansToDegrees;
    if (difference > 180) {
      return difference - 360;
    }
    if (difference <= -180) {
      return difference + 360;
    }
    return difference;
  };

  const leftEdgeAngle = edgeAngle(topLeft, bottomLeft);
  const rightEdgeAngle = edgeAngle(topRight, bottomRight);
  const topEdgeAngle = edgeAngle(topLeft, topRight);
  const bottomEdgeAngle = edgeAngle(bottomLeft, bottomRight);

  // Rotation follows the bottom edge (bottomLeft to bottomRight).
  const rotation = bottomEdgeAngle * radiansToDegrees;

  // Skew captures perspective distortion: how far opposite edges are from parallel.
  // SkewX compares the left and right edges, SkewY the top and bottom edges.
  const skewX = angleDifferenceInDegrees(rightEdgeAngle, leftEdgeAngle);
  const skewY = angleDifferenceInDegrees(bottomEdgeAngle, topEdgeAngle);

  return {
    minX,
    maxX,
    minY,
    maxY,
    width,
    height,
    centerX,
    centerY,
    rotation,
    skewX,
    skewY,
  };
};

/**
 * Project OCR detections onto the displayed image.
 *
 * Each OCR record carries the 4 corners of a (possibly rotated) rectangle as
 * normalized (0-1) coordinates. Every corner is scaled to the size the image
 * occupies inside the viewer element, shifted by the margin that centers the
 * image in the element, and then adjusted for the current zoom level and pan
 * position.
 *
 * @param ocrData - OCR records to convert
 * @param zoom - Current zoom level and pan position of the viewer
 * @param photoViewer - The image element being displayed
 * @returns One box per OCR record, or an empty array when the element is
 *   missing or reports a zero/unset natural size
 */
export const getOcrBoundingBoxes = (
  ocrData: OcrBoundingBox[],
  zoom: ZoomImageWheelState,
  photoViewer: HTMLImageElement | null,
): OcrBox[] => {
  if (photoViewer === null || !photoViewer.naturalWidth || !photoViewer.naturalHeight) {
    return [];
  }

  const { clientWidth, clientHeight, naturalWidth, naturalHeight } = photoViewer;
  const contained = getContainedSize(photoViewer);
  const { currentZoom, currentPositionX, currentPositionY } = zoom;

  // Per-axis scale from natural pixels to zoomed display pixels.
  const scaleX = (contained.width / naturalWidth) * currentZoom;
  const scaleY = (contained.height / naturalHeight) * currentZoom;

  // Margin between the element's edge and the contained image, after zoom.
  const marginX = ((clientWidth - contained.width) / 2) * currentZoom;
  const marginY = ((clientHeight - contained.height) / 2) * currentZoom;

  // Convert one normalized corner to its on-screen position.
  const toScreen = (normalizedX: number, normalizedY: number): { x: number; y: number } => ({
    x: scaleX * normalizedX * naturalWidth + marginX + currentPositionX,
    y: scaleY * normalizedY * naturalHeight + marginY + currentPositionY,
  });

  return ocrData.map((ocr) => {
    const points = [
      toScreen(ocr.x1, ocr.y1),
      toScreen(ocr.x2, ocr.y2),
      toScreen(ocr.x3, ocr.y3),
      toScreen(ocr.x4, ocr.y4),
    ];

    return {
      id: ocr.id,
      points,
      text: ocr.text,
      confidence: ocr.textScore,
    };
  });
};