mirror of https://github.com/immich-app/immich.git
feat: show OCR bounding box (#23717)
* feat: ocr bounding box * bounding boxes * pr feedback * pr feedback * allow copy across text boxes * pr feedbackpull/23998/head
parent
f59417cc77
commit
56e431226f
@ -0,0 +1,36 @@
|
||||
<script lang="ts">
|
||||
import type { OcrBox } from '$lib/utils/ocr-utils';
|
||||
import { calculateBoundingBoxDimensions } from '$lib/utils/ocr-utils';
|
||||
|
||||
type Props = {
|
||||
ocrBox: OcrBox;
|
||||
};
|
||||
|
||||
let { ocrBox }: Props = $props();
|
||||
|
||||
const dimensions = $derived(calculateBoundingBoxDimensions(ocrBox.points));
|
||||
|
||||
const transform = $derived(
|
||||
`translate(${dimensions.minX}px, ${dimensions.minY}px) rotate(${dimensions.rotation}deg) skew(${dimensions.skewX}deg, ${dimensions.skewY}deg)`,
|
||||
);
|
||||
|
||||
const transformOrigin = $derived(
|
||||
`${dimensions.centerX - dimensions.minX}px ${dimensions.centerY - dimensions.minY}px`,
|
||||
);
|
||||
</script>
|
||||
|
||||
<div class="absolute group left-0 top-0 pointer-events-none">
|
||||
<!-- Bounding box with CSS transforms -->
|
||||
<div
|
||||
class="absolute border-2 border-blue-500 bg-blue-500/10 cursor-pointer pointer-events-auto transition-all group-hover:bg-blue-500/30 group-hover:border-blue-600 group-hover:border-[3px]"
|
||||
style="width: {dimensions.width}px; height: {dimensions.height}px; transform: {transform}; transform-origin: {transformOrigin};"
|
||||
></div>
|
||||
|
||||
<!-- Text overlay - always rendered but invisible, allows text selection and copy -->
|
||||
<div
|
||||
class="absolute flex items-center justify-center text-transparent text-sm px-2 py-1 pointer-events-auto cursor-text whitespace-pre-wrap wrap-break-word select-text group-hover:text-white group-hover:bg-black/75 group-hover:z-10"
|
||||
style="width: {dimensions.width}px; height: {dimensions.height}px; transform: {transform}; transform-origin: {transformOrigin};"
|
||||
>
|
||||
{ocrBox.text}
|
||||
</div>
|
||||
</div>
|
||||
@ -0,0 +1,17 @@
|
||||
<script lang="ts">
|
||||
import { ocrManager } from '$lib/stores/ocr.svelte';
|
||||
import { IconButton } from '@immich/ui';
|
||||
import { mdiTextRecognition } from '@mdi/js';
|
||||
import { t } from 'svelte-i18n';
|
||||
</script>
|
||||
|
||||
<IconButton
|
||||
title={ocrManager.showOverlay ? $t('hide_text_recognition') : $t('show_text_recognition')}
|
||||
icon={mdiTextRecognition}
|
||||
class={"dark {ocrStore.showOverlay ? 'bg-immich-primary text-white dark' : 'dark'}"}
|
||||
color="secondary"
|
||||
variant="ghost"
|
||||
shape="round"
|
||||
aria-label={$t('text_recognition')}
|
||||
onclick={() => ocrManager.toggleOcrBoundingBox()}
|
||||
/>
|
||||
@ -0,0 +1,44 @@
|
||||
import { getAssetOcr } from '@immich/sdk';
|
||||
|
||||
export type OcrBoundingBox = {
|
||||
id: string;
|
||||
assetId: string;
|
||||
x1: number;
|
||||
y1: number;
|
||||
x2: number;
|
||||
y2: number;
|
||||
x3: number;
|
||||
y3: number;
|
||||
x4: number;
|
||||
y4: number;
|
||||
boxScore: number;
|
||||
textScore: number;
|
||||
text: string;
|
||||
};
|
||||
|
||||
class OcrManager {
|
||||
#data = $state<OcrBoundingBox[]>([]);
|
||||
showOverlay = $state(false);
|
||||
hasOcrData = $state(false);
|
||||
|
||||
get data() {
|
||||
return this.#data;
|
||||
}
|
||||
|
||||
async getAssetOcr(id: string) {
|
||||
this.#data = await getAssetOcr({ id });
|
||||
this.hasOcrData = this.#data.length > 0;
|
||||
}
|
||||
|
||||
clear() {
|
||||
this.#data = [];
|
||||
this.showOverlay = false;
|
||||
this.hasOcrData = false;
|
||||
}
|
||||
|
||||
toggleOcrBoundingBox() {
|
||||
this.showOverlay = !this.showOverlay;
|
||||
}
|
||||
}
|
||||
|
||||
export const ocrManager = new OcrManager();
|
||||
@ -0,0 +1,131 @@
|
||||
import type { OcrBoundingBox } from '$lib/stores/ocr.svelte';
|
||||
import type { ZoomImageWheelState } from '@zoom-image/core';
|
||||
|
||||
const getContainedSize = (img: HTMLImageElement): { width: number; height: number } => {
|
||||
const ratio = img.naturalWidth / img.naturalHeight;
|
||||
let width = img.height * ratio;
|
||||
let height = img.height;
|
||||
if (width > img.width) {
|
||||
width = img.width;
|
||||
height = img.width / ratio;
|
||||
}
|
||||
return { width, height };
|
||||
};
|
||||
|
||||
export interface OcrBox {
|
||||
id: string;
|
||||
points: { x: number; y: number }[];
|
||||
text: string;
|
||||
confidence: number;
|
||||
}
|
||||
|
||||
export interface BoundingBoxDimensions {
|
||||
minX: number;
|
||||
maxX: number;
|
||||
minY: number;
|
||||
maxY: number;
|
||||
width: number;
|
||||
height: number;
|
||||
centerX: number;
|
||||
centerY: number;
|
||||
rotation: number;
|
||||
skewX: number;
|
||||
skewY: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate bounding box dimensions and properties from OCR points
|
||||
* @param points - Array of 4 corner points of the bounding box
|
||||
* @returns Dimensions, rotation, and skew values for the bounding box
|
||||
*/
|
||||
export const calculateBoundingBoxDimensions = (points: { x: number; y: number }[]): BoundingBoxDimensions => {
|
||||
const [topLeft, topRight, bottomRight, bottomLeft] = points;
|
||||
const minX = Math.min(...points.map(({ x }) => x));
|
||||
const maxX = Math.max(...points.map(({ x }) => x));
|
||||
const minY = Math.min(...points.map(({ y }) => y));
|
||||
const maxY = Math.max(...points.map(({ y }) => y));
|
||||
const width = maxX - minX;
|
||||
const height = maxY - minY;
|
||||
const centerX = (minX + maxX) / 2;
|
||||
const centerY = (minY + maxY) / 2;
|
||||
|
||||
// Calculate rotation angle from the bottom edge (bottomLeft to bottomRight)
|
||||
const rotation = Math.atan2(bottomRight.y - bottomLeft.y, bottomRight.x - bottomLeft.x) * (180 / Math.PI);
|
||||
|
||||
// Calculate skew angles to handle perspective distortion
|
||||
// SkewX: compare left and right edges
|
||||
const leftEdgeAngle = Math.atan2(bottomLeft.y - topLeft.y, bottomLeft.x - topLeft.x);
|
||||
const rightEdgeAngle = Math.atan2(bottomRight.y - topRight.y, bottomRight.x - topRight.x);
|
||||
const skewX = (rightEdgeAngle - leftEdgeAngle) * (180 / Math.PI);
|
||||
|
||||
// SkewY: compare top and bottom edges
|
||||
const topEdgeAngle = Math.atan2(topRight.y - topLeft.y, topRight.x - topLeft.x);
|
||||
const bottomEdgeAngle = Math.atan2(bottomRight.y - bottomLeft.y, bottomRight.x - bottomLeft.x);
|
||||
const skewY = (bottomEdgeAngle - topEdgeAngle) * (180 / Math.PI);
|
||||
|
||||
return {
|
||||
minX,
|
||||
maxX,
|
||||
minY,
|
||||
maxY,
|
||||
width,
|
||||
height,
|
||||
centerX,
|
||||
centerY,
|
||||
rotation,
|
||||
skewX,
|
||||
skewY,
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* Convert normalized OCR coordinates to screen coordinates
|
||||
* OCR coordinates are normalized (0-1) and represent the 4 corners of a rotated rectangle
|
||||
*/
|
||||
export const getOcrBoundingBoxes = (
|
||||
ocrData: OcrBoundingBox[],
|
||||
zoom: ZoomImageWheelState,
|
||||
photoViewer: HTMLImageElement | null,
|
||||
): OcrBox[] => {
|
||||
const boxes: OcrBox[] = [];
|
||||
|
||||
if (photoViewer === null || !photoViewer.naturalWidth || !photoViewer.naturalHeight) {
|
||||
return boxes;
|
||||
}
|
||||
|
||||
const clientHeight = photoViewer.clientHeight;
|
||||
const clientWidth = photoViewer.clientWidth;
|
||||
const { width, height } = getContainedSize(photoViewer);
|
||||
|
||||
const imageWidth = photoViewer.naturalWidth;
|
||||
const imageHeight = photoViewer.naturalHeight;
|
||||
|
||||
for (const ocr of ocrData) {
|
||||
// Convert normalized coordinates (0-1) to actual pixel positions
|
||||
// OCR provides 4 corners of a potentially rotated rectangle
|
||||
const points = [
|
||||
{ x: ocr.x1, y: ocr.y1 },
|
||||
{ x: ocr.x2, y: ocr.y2 },
|
||||
{ x: ocr.x3, y: ocr.y3 },
|
||||
{ x: ocr.x4, y: ocr.y4 },
|
||||
].map((point) => ({
|
||||
x:
|
||||
(width / imageWidth) * zoom.currentZoom * point.x * imageWidth +
|
||||
((clientWidth - width) / 2) * zoom.currentZoom +
|
||||
zoom.currentPositionX,
|
||||
y:
|
||||
(height / imageHeight) * zoom.currentZoom * point.y * imageHeight +
|
||||
((clientHeight - height) / 2) * zoom.currentZoom +
|
||||
zoom.currentPositionY,
|
||||
}));
|
||||
|
||||
boxes.push({
|
||||
id: ocr.id,
|
||||
points,
|
||||
text: ocr.text,
|
||||
confidence: ocr.textScore,
|
||||
});
|
||||
}
|
||||
|
||||
return boxes;
|
||||
};
|
||||
Loading…
Reference in New Issue