Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
237 changes: 151 additions & 86 deletions src/tools/ExtractImages.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
* images or a ZIP of all selected images.
*/

import { useState, useCallback } from "react";
import { ImageDown, Loader2 } from "lucide-react";
import { useState, useCallback, useMemo } from "react";
import { CheckSquare, ImageDown, Loader2, X } from "lucide-react";
import { FileDropZone } from "../components/FileDropZone.tsx";
import { categoryAccent, categoryGlow } from "../config/theme.ts";
import type { PDFDocumentProxy } from "pdfjs-dist";
Expand All @@ -30,6 +30,87 @@ interface ExtractedImage {
blob: Blob;
}

/** Shape returned by PDF.js for resolved image objects. */
interface PdfjsImageData {
width: number;
height: number;
data?: Uint8ClampedArray;
kind?: number;
src?: string;
bitmap?: ImageBitmap;
}

/** Fetch a named image object, routing to commonObjs for global ("g_"-prefixed) IDs. */
function fetchNamedImage(
page: Awaited<ReturnType<PDFDocumentProxy["getPage"]>>,
objName: string,
): Promise<PdfjsImageData> {
return new Promise<PdfjsImageData>((resolve, reject) => {
try {
const store = objName.startsWith("g_") ? page.commonObjs : page.objs;
store.get(objName, (obj: unknown) => {
if (obj) resolve(obj as PdfjsImageData);
else reject(new Error("null object"));
});
} catch {
reject(new Error("object not found"));
}
});
}

/** Paint image data onto a canvas, handling ImageBitmap, raw pixels (RGBA/RGB), and src URLs. */
async function paintImageToCanvas(
imgData: PdfjsImageData,
ctx: CanvasRenderingContext2D,
): Promise<boolean> {
if (imgData.bitmap) {
ctx.drawImage(imgData.bitmap, 0, 0);
return true;
}

if (imgData.data) {
const expectedRgba = imgData.width * imgData.height * 4;
if (imgData.data.length === expectedRgba) {
ctx.putImageData(
new ImageData(new Uint8ClampedArray(imgData.data), imgData.width, imgData.height),
0,
0,
);
return true;
}
// RGB (3 bytes per pixel) — expand to RGBA
const expectedRgb = imgData.width * imgData.height * 3;
if (imgData.data.length === expectedRgb) {
const rgba = new Uint8ClampedArray(expectedRgba);
const src = imgData.data;
for (let j = 0, k = 0; j < src.length; j += 3, k += 4) {
rgba[k] = src[j];
rgba[k + 1] = src[j + 1];
rgba[k + 2] = src[j + 2];
rgba[k + 3] = 255;
}
ctx.putImageData(new ImageData(rgba, imgData.width, imgData.height), 0, 0);
return true;
}
return false;
}

if (imgData.src) {
await new Promise<void>((resolve, reject) => {
const img = new window.Image();
img.onload = () => {
ctx.drawImage(img, 0, 0);
resolve();
};
img.onerror = reject;
img.src = imgData.src as string;
});
return true;
}

return false;
}

/**
* Extract all raster images embedded in a PDF document.
*
Expand All @@ -43,87 +124,55 @@ async function extractImagesFromPdf(
): Promise<ExtractedImage[]> {
const images: ExtractedImage[] = [];

// Reuse a single pair of canvases across all images to avoid
// creating/destroying DOM elements per image.
const canvas = document.createElement("canvas");
const ctx = canvas.getContext("2d")!;
const thumbCanvas = document.createElement("canvas");
const tctx = thumbCanvas.getContext("2d")!;

for (let p = 1; p <= pdf.numPages; p++) {
const page = await pdf.getPage(p);
const ops = await page.getOperatorList();

// Track which object names we've already extracted on this page to
// Track which named objects we've already extracted on this page to
// avoid duplicates when the same XObject is painted more than once.
const seen = new Set<string>();
let indexOnPage = 0;

for (let i = 0; i < ops.fnArray.length; i++) {
const fn = ops.fnArray[i];

// OPS.paintImageXObject and OPS.paintInlineImageXObject
if (fn !== pdfjsLib.OPS.paintImageXObject && fn !== pdfjsLib.OPS.paintInlineImageXObject)
let imgData: PdfjsImageData | null = null;

if (fn === pdfjsLib.OPS.paintImageXObject || fn === pdfjsLib.OPS.paintImageXObjectRepeat) {
// Named image reference — look up in the correct object store.
const objName = ops.argsArray[i]?.[0] as string | undefined;
if (!objName || typeof objName !== "string" || seen.has(objName)) continue;
seen.add(objName);
try {
imgData = await fetchNamedImage(page, objName);
} catch {
continue;
}
} else if (fn === pdfjsLib.OPS.paintInlineImageXObject) {
// Inline image — the image data is embedded directly in the args.
const arg = ops.argsArray[i]?.[0];
if (!arg || typeof arg !== "object") continue;
imgData = arg as PdfjsImageData;
} else {
continue;
}

const objName = ops.argsArray[i]?.[0] as string | undefined;
if (!objName || seen.has(objName)) continue;
seen.add(objName);
// Skip tiny images (likely artifacts or masks)
if (!imgData || imgData.width < 10 || imgData.height < 10) continue;

try {
const imgData = await new Promise<{
width: number;
height: number;
data?: Uint8ClampedArray;
kind?: number;
src?: string;
}>((resolve, reject) => {
// PDF.js stores common objects (shared across pages) separately.
// Try page-level objects first, then fall back to the common pool.
try {
page.objs.get(objName, (obj: unknown) => {
if (obj)
resolve(
obj as {
width: number;
height: number;
data?: Uint8ClampedArray;
kind?: number;
src?: string;
},
);
else reject(new Error("null object"));
});
} catch {
reject(new Error("object not found"));
}
});

// Skip tiny images (likely artifacts or masks)
if (imgData.width < 10 || imgData.height < 10) continue;

const canvas = document.createElement("canvas");
canvas.width = imgData.width;
canvas.height = imgData.height;
const ctx = canvas.getContext("2d")!;

if (imgData.data) {
// Raw pixel data — write directly via ImageData
const imageData = new ImageData(
new Uint8ClampedArray(imgData.data),
imgData.width,
imgData.height,
);
ctx.putImageData(imageData, 0, 0);
} else if (imgData.src) {
// JPEG image with a blob/data URL — draw via Image element
await new Promise<void>((resolve, reject) => {
const img = new window.Image();
img.onload = () => {
ctx.drawImage(img, 0, 0);
resolve();
};
img.onerror = reject;
img.src = imgData.src as string;
});
} else {
canvas.width = 0;
canvas.height = 0;
continue;
}

const painted = await paintImageToCanvas(imgData, ctx);
if (!painted) continue;

// Convert to PNG blob
const blob = await new Promise<Blob>((resolve, reject) =>
Expand All @@ -134,19 +183,11 @@ async function extractImagesFromPdf(
const thumbScale = Math.min(1, 200 / Math.max(imgData.width, imgData.height));
const tw = Math.round(imgData.width * thumbScale);
const th = Math.round(imgData.height * thumbScale);
const thumbCanvas = document.createElement("canvas");
thumbCanvas.width = tw;
thumbCanvas.height = th;
const tctx = thumbCanvas.getContext("2d")!;
tctx.drawImage(canvas, 0, 0, tw, th);
const dataUrl = thumbCanvas.toDataURL("image/png");

// Release canvas memory
canvas.width = 0;
canvas.height = 0;
thumbCanvas.width = 0;
thumbCanvas.height = 0;

images.push({
page: p,
indexOnPage,
Expand All @@ -161,9 +202,16 @@ async function extractImagesFromPdf(
}
}

page.cleanup();
onProgress?.(p, pdf.numPages);
}

// Release canvas memory
canvas.width = 0;
canvas.height = 0;
thumbCanvas.width = 0;
thumbCanvas.height = 0;

return images;
}

Expand Down Expand Up @@ -199,7 +247,7 @@ export default function ExtractImages() {
setFile(null);
} else {
setImages(extracted);
setSelected(new Set(extracted.map((_, i) => i)));
setSelected(new Set());
}
} catch (e) {
setError(
Expand All @@ -223,12 +271,14 @@ export default function ExtractImages() {
});
}, []);

const toggleAll = useCallback(() => {
setSelected((prev) =>
prev.size === images.length ? new Set() : new Set(images.map((_, i) => i)),
);
const selectAll = useCallback(() => {
setSelected(new Set(images.map((_, i) => i)));
}, [images]);

const clearAll = useCallback(() => {
setSelected(new Set());
}, []);

const handleDownload = useCallback(async () => {
if (selected.size === 0) return;
setDownloading(true);
Expand Down Expand Up @@ -258,7 +308,10 @@ export default function ExtractImages() {
}
}, [file, images, selected]);

const totalSize = Array.from(selected).reduce((sum, i) => sum + images[i].blob.size, 0);
const totalSize = useMemo(
() => Array.from(selected).reduce((sum, i) => sum + images[i].blob.size, 0),
[selected, images],
);

return (
<div className="space-y-6">
Expand Down Expand Up @@ -310,12 +363,24 @@ export default function ExtractImages() {
<p className="text-sm font-medium text-slate-700 dark:text-dark-text">
Select images to download
</p>
<button
onClick={toggleAll}
className="text-sm text-primary-600 hover:text-primary-700"
>
{selected.size === images.length ? "Deselect all" : "Select all"}
</button>
<div className="flex items-center gap-3">
<button
type="button"
onClick={selectAll}
className="inline-flex items-center gap-1.5 text-sm text-primary-600 hover:text-primary-700 dark:text-primary-400 dark:hover:text-primary-300 transition-colors"
>
<CheckSquare className="w-4 h-4" />
Select all
</button>
<button
type="button"
onClick={clearAll}
className="inline-flex items-center gap-1.5 text-sm text-slate-500 hover:text-slate-700 dark:text-dark-text-muted dark:hover:text-dark-text transition-colors"
>
<X className="w-4 h-4" />
Clear
</button>
</div>
</div>

<div className="grid grid-cols-2 sm:grid-cols-3 md:grid-cols-4 lg:grid-cols-5 gap-3">
Expand Down
Loading