https://huggingface.co/DILHTWD/documentlayoutsegmentation_YOLOv8_ondoclaynet with ONNX weights to be compatible with Transformers.js.
Usage (Transformers.js)
If you haven't already, you can install the Transformers.js JavaScript library from NPM using:
npm i @xenova/transformers
Example: Perform object detection with `Oblix/yolov8x-doclaynet_ONNX`.
import { AutoModel, AutoProcessor, RawImage } from '@xenova/transformers';
// Load the model, explicitly requesting the non-quantized ONNX weights.
const model = await AutoModel.from_pretrained(
  "Oblix/yolov8x-doclaynet_ONNX",
  {
    quantized: false,
  }
);
const processor = await AutoProcessor.from_pretrained("Oblix/yolov8x-doclaynet_ONNX");

// Fetch the sample page image and run it through the processor and the model.
const url = 'https://huggingface.co/DILHTWD/documentlayoutsegmentation_YOLOv8_ondoclaynet/resolve/main/sample1.png';
const rawImage = await RawImage.fromURL(url);
const { pixel_values } = await processor(rawImage);
const output = await model({ images: pixel_values });

// Post-process:
const permuted = output.output0[0].transpose(1, 0);
// `permuted` is a Tensor of shape [ 8400, 15 ]:
// - 8400 potential bounding boxes
// - 15 parameters for each box:
//   - first 4 are coordinates for the bounding boxes (x-center, y-center, width, height)
//   - the remaining 11 are the probabilities for each class

// Example code to format it nicely:
const results = [];
const threshold = 0.5; // Adjust the threshold as needed
// The last two dims of the processed tensor are the height and width the model saw.
const [scaledHeight, scaledWidth] = pixel_values.dims.slice(-2);
for (const [xc, yc, w, h, ...scores] of permuted.tolist()) {
  // Convert center/size to corner coordinates, rescaled from the processed
  // tensor size back to the original image size.
  const x1 = (xc - w / 2) / scaledWidth * rawImage.width;
  const y1 = (yc - h / 2) / scaledHeight * rawImage.height;
  const x2 = (xc + w / 2) / scaledWidth * rawImage.width;
  const y2 = (yc + h / 2) / scaledHeight * rawImage.height;

  // Get best class (index of the first maximum score).
  let argmax = 0;
  for (let i = 1; i < scores.length; i++) {
    if (scores[i] > scores[argmax]) {
      argmax = i;
    }
  }
  const score = scores[argmax];
  if (score < threshold) continue; // Not confident enough

  const label = model.config.id2label[argmax];
  // Key order matches the documented result: x1, y1, x2, y2, score, label, index.
  results.push({
    x1, y1, x2, y2, score, label, index: argmax,
  });
}

// Collapse boxes that overlap an already-kept box by more than `iouThreshold`.
const iouThreshold = 0.5; // Adjust the threshold as needed
const filteredResults = removeDuplicates(results, iouThreshold);
console.log(filteredResults);
/**
 * Greedily drops detections that overlap an already-kept detection.
 *
 * Detections are visited in input order. A detection whose IoU with every kept
 * detection is at most `iouThreshold` is appended. Otherwise it is compared with
 * the kept detection it overlaps most, and replaces that one only when its own
 * score is strictly higher.
 *
 * @param {Array<{x1: number, y1: number, x2: number, y2: number, score: number}>} detections
 *   Candidate boxes; not mutated.
 * @param {number} iouThreshold - IoU above which two boxes count as duplicates.
 * @returns {Array} The kept detections (same object references as the input).
 */
function removeDuplicates(detections, iouThreshold) {
  const kept = [];
  for (const candidate of detections) {
    let overlapsKept = false;
    let bestIndex = -1;
    let bestIoU = 0;
    for (const [index, existing] of kept.entries()) {
      const overlap = calculateIoU(candidate, existing);
      // Negated form on purpose: a NaN overlap must be treated as "no overlap".
      if (!(overlap > iouThreshold)) continue;
      overlapsKept = true;
      if (overlap > bestIoU) {
        bestIoU = overlap;
        bestIndex = index;
      }
    }
    if (!overlapsKept) {
      kept.push(candidate);
      continue;
    }
    if (bestIndex !== -1 && candidate.score > kept[bestIndex].score) {
      kept[bestIndex] = candidate;
    }
  }
  return kept;
}
/**
 * Computes the intersection-over-union of two axis-aligned boxes.
 *
 * @param {{x1: number, y1: number, x2: number, y2: number}} detection1 - First box (corner coordinates).
 * @param {{x1: number, y1: number, x2: number, y2: number}} detection2 - Second box (corner coordinates).
 * @returns {number} Overlap area divided by union area, or 0 when the union
 *   area is not positive (e.g. both boxes have zero area), so callers never
 *   receive NaN from a 0/0 division.
 */
function calculateIoU(detection1, detection2) {
  // Overlap along each axis, clamped to 0 when the boxes do not intersect.
  const xOverlap = Math.max(0, Math.min(detection1.x2, detection2.x2) - Math.max(detection1.x1, detection2.x1));
  const yOverlap = Math.max(0, Math.min(detection1.y2, detection2.y2) - Math.max(detection1.y1, detection2.y1));
  const overlapArea = xOverlap * yOverlap;

  const area1 = (detection1.x2 - detection1.x1) * (detection1.y2 - detection1.y1);
  const area2 = (detection2.x2 - detection2.x1) * (detection2.y2 - detection2.y1);
  const unionArea = area1 + area2 - overlapArea;

  // Degenerate boxes give a zero union; report "no overlap" instead of NaN.
  return unionArea > 0 ? overlapArea / unionArea : 0;
}
Result
[
{
"x1": 54.53195288479328,
"y1": 170.06781649589539,
"x2": 95.52642979323865,
"y2": 186.62115139961244,
"score": 0.8901662826538086,
"label": "Text",
"index": 9
},
{
"x1": 53.96503926515579,
"y1": 195.67131299972536,
"x2": 221.8717828631401,
"y2": 212.6188931465149,
"score": 0.8967247605323792,
"label": "Text",
"index": 9
},
{
"x1": 54.53195288479328,
"y1": 221.1506155014038,
"x2": 98.4759178608656,
"y2": 238.44384784698488,
"score": 0.8795284032821655,
"label": "Text",
"index": 9
},
{
"x1": 55.731045877933504,
"y1": 338.1506155014038,
"x2": 103.58089088201523,
"y2": 355.22782917022704,
"score": 0.9104153513908386,
"label": "Section-header",
"index": 7
},
{
"x1": 54.501348263025285,
"y1": 452.59601612091063,
"x2": 144.76493505835532,
"y2": 469.1547849655152,
"score": 0.9181555509567261,
"label": "Section-header",
"index": 7
},
{
"x1": 54.37510642111301,
"y1": 568.1918724060059,
"x2": 73.67877252995967,
"y2": 584.1619010925293,
"score": 0.899300754070282,
"label": "Section-header",
"index": 7
},
{
"x1": 54.27563991844654,
"y1": 840.2569072723389,
"x2": 70.35437833964825,
"y2": 859.4512378692626,
"score": 0.6805046796798706,
"label": "Section-header",
"index": 7
},
{
"x1": 309.2861147403717,
"y1": 908.7717830657958,
"x2": 373.8879840373993,
"y2": 922.6841892242431,
"score": 0.8969672918319702,
"label": "Page-footer",
"index": 4
},
{
"x1": 311.53335428237915,
"y1": 10.31740515232086,
"x2": 607.2475433349609,
"y2": 33.85392036437988,
"score": 0.9498511552810669,
"label": "Page-header",
"index": 5
},
{
"x1": 56.66784882545471,
"y1": 289.38916368484496,
"x2": 416.7734823703766,
"y2": 306.94164075851444,
"score": 0.856067419052124,
"label": "Text",
"index": 9
},
{
"x1": 56.03344459533691,
"y1": 309.5055012702942,
"x2": 317.7232768535614,
"y2": 325.49175367355343,
"score": 0.8314194083213806,
"label": "Text",
"index": 9
},
{
"x1": 53.00637502670288,
"y1": 429.9619674682617,
"x2": 414.61163306236267,
"y2": 445.95904312133786,
"score": 0.8927980661392212,
"label": "Text",
"index": 9
},
{
"x1": 55.619012689590456,
"y1": 638.6609138488769,
"x2": 384.32462439537045,
"y2": 656.8182655334473,
"score": 0.9029342532157898,
"label": "List-item",
"index": 3
},
{
"x1": 58.06927928924561,
"y1": 794.932172012329,
"x2": 520.523375415802,
"y2": 811.1884700775146,
"score": 0.9037705063819885,
"label": "List-item",
"index": 3
},
{
"x1": 54.25830144882202,
"y1": 76.01902542114259,
"x2": 552.8331304550171,
"y2": 158.67227897644042,
"score": 0.9725438356399536,
"label": "Title",
"index": 10
},
{
"x1": 53.636448097229,
"y1": 244.93504171371458,
"x2": 610.1452471733094,
"y2": 274.8768593788147,
"score": 0.8954038619995117,
"label": "Text",
"index": 9
},
{
"x1": 54.76330833435059,
"y1": 364.74734601974484,
"x2": 625.0439935684204,
"y2": 405.74994478225705,
"score": 0.7930819988250732,
"label": "Text",
"index": 9
},
{
"x1": 55.78299608230591,
"y1": 480.10940895080563,
"x2": 623.4623931884765,
"y2": 556.692225265503,
"score": 0.9482676982879639,
"label": "Text",
"index": 9
},
{
"x1": 52.160629177093504,
"y1": 593.5841983795166,
"x2": 609.7405840873719,
"y2": 635.7749668121338,
"score": 0.9440742135047913,
"label": "Text",
"index": 9
},
{
"x1": 53.12467575073242,
"y1": 654.1885282516479,
"x2": 615.2034725189209,
"y2": 697.286619758606,
"score": 0.9134702086448669,
"label": "List-item",
"index": 3
},
{
"x1": 52.52786092758179,
"y1": 712.9350305557251,
"x2": 622.7321027755737,
"y2": 754.2832815170287,
"score": 0.9259238243103027,
"label": "Text",
"index": 9
},
{
"x1": 56.837522792816166,
"y1": 758.6981185913086,
"x2": 607.179635810852,
"y2": 787.9486541748047,
"score": 0.9015638828277588,
"label": "List-item",
"index": 3
},
{
"x1": 56.57186779975891,
"y1": 810.8556049346925,
"x2": 446.48612236976624,
"y2": 828.0084697723388,
"score": 0.8806689977645874,
"label": "List-item",
"index": 3
}
]
Labels
- Caption
- Footnote
- Formula
- List-item
- Page-footer
- Page-header
- Picture
- Section-header
- Table
- Text
- Title
Downloads last month: 22
Inference API (serverless) does not yet support transformers.js models for this pipeline type.