Since the previous article, I kept playing with different optimizations to improve the speed of the data-oriented design implementation. The idea behind using this approach was to make it easy the use SIMD instructions in the rendering algorithm.
After lots of experimentation I found that, while the SIMD instructions helped increase the speed of the path tracer, this change alone was not entirely capable of reaching a speed of a frame under a second. In the end, I decided to move the sphere intersection code to the Wasm library which allowed the SIMD code to finally shine.
Adding to that, there were plenty of changes in this new update, the following are a list of the modifications that I made to the code:
- Added a custom memory manager to allocate a block of space only once and just use offsets to the data for array references
- Used SIMD instructions to calculate the intersections of four spheres at the same time
- Did an small amount of loop unrolling
- Implemented some other small optimizations like: not using the
unit
vector function inrandomDirection
and removing thescale
operation from thetrace
andshade
functions. This is becauseunit
andscale
create intermediary arrays causing memory pressure. - Removed the triangle intersection function for the time being as only spheres support is implemented in Wasm, this has nothing to do with speed but with code cleanup.
- Since this is already very fast, I changed the code a bit to accumulate all the frames generated and visualize how the scene lighthing convergences.
- Updated the pipeline to not execute all the steps for every frame but return a
render
function.
All the changes made it possible to improve the visualization of the same 8
spheres in the scene from a rendering time of 2273ms
down to 409ms
, or about 4.5x
faster (!!!). Now I wonder how much further the performance can be increased if I implement more changes to the code. Some other optimization opportunities available are:
- Accelerator structures
- Multi-threading
- GPU shaders
Further reading
If you want to learn more about SIMD instructions in Wasm, these links are very good guides on how to use them in the Rust language: Authoring a SIMD enhanced Wasm library with Rust and Rust doc on v128.
Code
I encourage you to play around with the code if your browser allows you; some do not work well with my code editor.
You can experiment with different performance characteristics by changing the scene composition, one easy way is to update the sceneSelector
to have a value of 2
, this will render a scene with only 4
spheres; then remove one, and see the performance difference.
Note: there is no code to disable the rendering loop after you hit Run
, to make it stop you will have to reload the page.
async function loadWasm(wasmFile) {
const {
instance: { exports: wasm },
} = await WebAssembly.instantiateStreaming(fetch(wasmFile), {});
return wasm;
}
function createScene(args) {
const {
vector: { sub, unit, cross },
memory: { allocStaticFloat32Array, set },
sceneSelector,
} = args;
const scene1 = [
{
center: [0, -10002, 0],
radius: 9999.0,
color: [1.0, 1.0, 1.0],
isLight: false,
isSphere: true,
},
{
center: [-10012, 0, 0],
radius: 9999.0,
color: [1, 0, 0],
isLight: false,
isSphere: true,
},
{
center: [10012, 0, 0],
radius: 9999.0,
color: [0, 1, 0],
isLight: false,
isSphere: true,
},
{
center: [0, 0, 10012],
radius: 9999.0,
color: [1, 1, 1],
isLight: false,
isSphere: true,
},
{
center: [0, 10012, 0],
radius: 9999.0,
color: [1, 1, 1],
isLight: true,
isSphere: true,
},
{
center: [-5, 0, 2],
radius: 2.0,
color: [1, 1, 0],
isLight: false,
isSphere: true,
},
{
center: [0, 5, -1],
radius: 4.0,
color: [1, 0, 0],
isLight: false,
isSphere: true,
},
{
center: [8, 5, -1],
radius: 2,
color: [0, 0, 1],
isLight: false,
isSphere: true,
},
];
const scene2 = [
{
center: [0.0, 0.5, 0.0],
radius: 5.0,
color: [0, 0, 1],
isLight: false,
isSphere: true,
},
{
center: [-10, -2, 2],
radius: 2.0,
color: [1, 1, 0],
isLight: false,
isSphere: true,
},
{
center: [0.0, -9000.0 - 5.0, 0.0],
radius: 9000.0,
color: [1.0, 1.0, 1.0],
isLight: false,
isSphere: true,
},
{
center: [0, 10300, 0],
radius: 9999.0,
color: [1, 1, 1],
isLight: true,
isSphere: true,
},
];
const sceneSelected = sceneSelector === 1 ? scene1 : scene2;
const scene = sceneSelected.map((obj, id) => {
if (obj.isTriangle) {
const edge0 = sub(obj.pt1, obj.pt0);
const edge1 = sub(obj.pt2, obj.pt0);
obj.normal = unit(cross(edge0, edge1));
}
obj.id = id;
return obj;
});
const spheres = scene.filter((obj) => obj.isSphere);
const count = spheres.length;
const centerX = allocStaticFloat32Array(count);
const centerY = allocStaticFloat32Array(count);
const centerZ = allocStaticFloat32Array(count);
const radius = allocStaticFloat32Array(count);
const id = [];
spheres.forEach((s, i) => {
set(centerX, i, s.center[0]);
set(centerY, i, s.center[1]);
set(centerZ, i, s.center[2]);
set(radius, i, s.radius);
id[i] = i;
});
const spheresVector = {
centerX,
centerY,
centerZ,
radius,
id,
};
return {
scene,
spheresVector,
};
}
// 256
function createRandomDirectionFunction(args) {
const {
vector: { dot, norm },
} = args;
const randomDirection = (normal) => {
const p = [0, 0, 0];
while (true) {
p[0] = Math.random() - 0.5;
p[1] = Math.random() - 0.5;
p[2] = Math.random() - 0.5;
const n = 1.0 / norm(p);
p[0] *= n;
p[1] *= n;
p[2] *= n;
if (dot(p, normal) >= 0) {
return p;
}
}
};
return {
randomDirection,
};
}
function createMemoryFunctions(args) {
const { wasm } = args;
const totalAvailable = 1024 * 4;
const memPtr = wasm.alloc(totalAvailable);
const memView = new DataView(wasm.memory.buffer, memPtr, totalAvailable);
let staOffset = 0;
let dynOffset = 2048; // half of totalAvailable
function get(A, idx) {
return memView.getFloat32(A + idx * 4 - memPtr, true);
}
function set(A, idx, v) {
memView.setFloat32(A + idx * 4 - memPtr, v, true);
}
const allocFloat32Array = (size) => {
const byteOffset = memPtr + dynOffset;
dynOffset += size * 4;
return byteOffset;
};
const allocStaticFloat32Array = (size) => {
const byteOffset = memPtr + staOffset;
staOffset += size * 4;
return byteOffset;
};
const free = () => (dynOffset = 2048);
return {
memory: {
allocFloat32Array,
allocStaticFloat32Array,
free,
get,
set,
},
};
}
function createSphereIntersectSIMDFunction(args) {
const {
memory: { allocFloat32Array, free },
wasm,
} = args;
const sphereIntersectSIMD = (ray, spheresVector) => {
free();
const len = spheresVector.id.length;
const OUT = allocFloat32Array(len);
wasm.spheres_intersect(
len,
ray.origin[0],
ray.origin[1],
ray.origin[2],
ray.direction[0],
ray.direction[1],
ray.direction[2],
spheresVector.centerX,
spheresVector.centerY,
spheresVector.centerZ,
spheresVector.radius,
OUT,
);
return OUT;
};
return {
sphereIntersectSIMD,
};
}
function createAspectRatioFunction() {
return {
aspectRatio: (width, height) => {
let gcd = width;
let reminder = height;
while (reminder != 0) {
const temp = reminder;
reminder = gcd % reminder;
gcd = temp;
}
return [width / gcd, height / gcd];
},
};
}
function createCamera(args) {
const { width, height, aspectRatio } = args;
const [w, h] = aspectRatio(width, height);
return {
camera: {
leftTop: [-w, h + 1, -50.0],
rightTop: [w, h + 1, -50.0],
leftBottom: [-w, -h + 1, -50.0],
eye: [0.0, 0.0, -65.0],
},
};
}
function createImageGeometry({ width, height }) {
return {
imageGeometry: {
width,
height,
},
};
}
function createVectorFunctions() {
const sub = (A, B) => [A[0] - B[0], A[1] - B[1], A[2] - B[2]];
const add = (A, B) => [A[0] + B[0], A[1] + B[1], A[2] + B[2]];
const mul = (A, B) => [A[0] * B[0], A[1] * B[1], A[2] * B[2]];
const dot = (A, B) => A[0] * B[0] + A[1] * B[1] + A[2] * B[2];
// 135ms
const scale = (A, s) => [A[0] * s, A[1] * s, A[2] * s];
const norm = (A) => Math.sqrt(dot(A, A));
const unit = (A) => scale(A, 1.0 / norm(A));
const abs = (A) => [Math.abs(A[0]), Math.abs(A[1]), Math.abs(A[2])];
const maxDimension = (A) => {
if (A[0] > A[1] && A[0] > A[2]) return 0;
if (A[1] > A[0] && A[1] > A[3]) return 1;
return 2;
};
const permute = (A, i, j, k) => [A[i], A[j], A[k]];
const cross = (A, B) => {
const j = A[1] * B[2] - B[1] * A[2];
const k = A[2] * B[0] - A[0] * B[2];
const l = A[0] * B[1] - A[1] * B[0];
return [j, k, l];
};
const vector = {
sub,
add,
mul,
dot,
scale,
norm,
unit,
abs,
maxDimension,
permute,
cross,
};
return {
vector,
};
}
function calculatePrimaryRays(args) {
const {
camera: { rightTop, leftTop, leftBottom, eye },
imageGeometry: { width, height },
vector: { scale, add, sub, unit },
} = args;
const vdu = scale(sub(rightTop, leftTop), 1.0 / width);
const vdv = scale(sub(leftBottom, leftTop), 1.0 / height);
const primaryRays = [];
for (let y = 0; y < height; y++) {
for (let x = 0; x < width; x++) {
const pixel = y * width + x;
const origin = eye;
const direction = unit(
sub(add(add(scale(vdu, x), scale(vdv, y)), leftTop), origin),
);
primaryRays.push({
pixel,
origin,
direction,
});
}
}
return {
primaryRays,
};
}
function createTraceSIMDFunction(args) {
const {
scene,
spheresVector,
sphereIntersectSIMD,
vector: { norm },
memory: { get },
} = args;
const len = spheresVector.id.length;
const nohit = { hit: false };
const trace = (ray) => {
const distances = sphereIntersectSIMD(ray, spheresVector);
let closestIndex = -1;
let closestDistance = Number.MAX_VALUE;
for (let i = 0; i < len; i++) {
const distance = get(distances, i);
if (distance > 0 && distance < closestDistance) {
closestDistance = distance;
closestIndex = i;
}
}
if (closestIndex === -1) {
return nohit;
}
const sphere = scene[closestIndex];
const point = [];
point[0] = ray.direction[0] * closestDistance + ray.origin[0];
point[1] = ray.direction[1] * closestDistance + ray.origin[1];
point[2] = ray.direction[2] * closestDistance + ray.origin[2];
const normal = [];
normal[0] = point[0] - sphere.center[0];
normal[1] = point[1] - sphere.center[1];
normal[2] = point[2] - sphere.center[2];
const d = 1.0 / norm(normal);
normal[0] *= d;
normal[1] *= d;
normal[2] *= d;
return {
hit: true,
distance: closestDistance,
point,
obj: sphere,
normal,
};
};
return {
trace,
};
}
// 514ms
function createShadingFunction(args) {
const {
vector: { dot },
trace,
randomDirection,
} = args;
const shading = (shadingPoint, pointNormal, depth) => {
const color = [0, 0, 0];
if (depth === 5) {
return color;
}
const origin = shadingPoint;
const direction = randomDirection(pointNormal);
const d = dot(pointNormal, direction);
const ray = { origin, direction };
const tr = trace(ray);
if (!tr.hit) {
return color; //black up to this point;
}
color[0] = tr.obj.color[0] * d;
color[1] = tr.obj.color[1] * d;
color[2] = tr.obj.color[2] * d;
if (!tr.obj.isLight) {
const ncolor = shading(tr.point, tr.normal, depth + 1);
color[0] *= ncolor[0];
color[1] *= ncolor[1];
color[2] *= ncolor[2];
}
return color;
};
return {
shading,
};
}
function createGenerateBitmapFunction(args) {
const {
shading,
vector: { mul },
} = args;
const bitmap = [];
const generateBitmap = (traceResults) => {
let idx = 0;
for (const it of traceResults) {
let r = 0;
let g = 0;
let b = 0;
if (it.hit) {
r = it.obj.color[0];
g = it.obj.color[1];
b = it.obj.color[2];
if (!it.obj.isLight) {
const intensity = shading(it.point, it.normal, 0);
r *= intensity[0];
g *= intensity[1];
b *= intensity[2];
}
}
bitmap[idx] = r;
bitmap[idx + 1] = g;
bitmap[idx + 2] = b;
idx += 3;
}
return bitmap;
};
return { generateBitmap };
}
function createTracePrimaryRaysFunction(args) {
const { trace, primaryRays } = args;
const count = primaryRays.length;
const traceResults = new Array(count);
const tracePrimaryRays = () => {
for (let i = 0; i < count; i++) {
traceResults[i] = trace(primaryRays[i]);
}
return traceResults;
};
return {
tracePrimaryRays,
};
}
function createRenderFunction(args) {
const { tracePrimaryRays, generateBitmap } = args;
const render = () => {
const result = tracePrimaryRays();
return generateBitmap(result);
};
return { render };
}
function pipeline(fns) {
return (args) => {
let acc = { ...args };
for (const fn of fns) {
const result = fn(acc);
acc = { ...acc, ...result };
}
return acc;
};
}
var renderingPipeline = pipeline([
createMemoryFunctions,
createVectorFunctions,
createAspectRatioFunction,
createScene,
createCamera,
createImageGeometry,
createRandomDirectionFunction,
calculatePrimaryRays,
createSphereIntersectSIMDFunction,
createTraceSIMDFunction,
createShadingFunction,
createTracePrimaryRaysFunction,
createGenerateBitmapFunction,
createRenderFunction,
]);
function renderAndPresent(canvas, render, finalBitmap, frameCount, framesAcc) {
const ctx = canvas.getContext('2d');
const width = canvas.width;
const renderStart = performance.now();
const bitmap = render();
for (let i = 0; i < bitmap.length; i += 3) {
const r = bitmap[i] + (framesAcc[i] ?? 0);
const g = bitmap[i + 1] + (framesAcc[i + 1] ?? 0);
const b = bitmap[i + 2] + (framesAcc[i + 2] ?? 0);
framesAcc[i] = r;
framesAcc[i + 1] = g;
framesAcc[i + 2] = b;
finalBitmap[i / 3] =
(255 << 24) |
(((b / frameCount) * 255) << 16) |
(((g / frameCount) * 255) << 8) |
((r / frameCount) * 255);
}
const imageData = new ImageData(
new Uint8ClampedArray(finalBitmap.buffer),
width,
);
ctx.putImageData(imageData, 0, 0);
const elapsed = Math.floor(performance.now() - renderStart);
const elapsedMs = `${elapsed}ms/${Math.round(10000 / elapsed) / 10}fps`;
ctx.font = '20px monospace';
ctx.textBaseline = 'top';
const measure = ctx.measureText(elapsedMs);
ctx.fillStyle = '#000000';
ctx.fillRect(0, 0, measure.width, measure.fontBoundingBoxDescent);
ctx.fillStyle = '#999999';
ctx.fillText(elapsedMs, 0, 0);
}
window.running = false;
(async () => {
const canvas = document.getElementById('canvas-1');
const width = canvas.width;
const height = canvas.height;
console.time('WasmLoad');
const wasm = await loadWasm('wasm/vector_simd.wasm');
console.timeEnd('WasmLoad');
const render = renderingPipeline({
wasm,
width,
height,
sceneSelector: 1,
}).render;
let frameCount = 0;
const framesAcc = [];
window.running = true;
const finalBitmap = new Uint32Array(width * height);
const animation = () => {
frameCount++;
renderAndPresent(canvas, render, finalBitmap, frameCount, framesAcc);
window.running && window.requestAnimationFrame(animation);
};
window.requestAnimationFrame(animation);
})();
Summary
In the end, the biggest performance increase was achieved by moving the critical code to the Wasm library and using SIMD instructions to calculate the sphere intersections.
One thing I want you to know is that the current SIMD support in WebAssembly only allows for vectors of 128
bits. This vector width can be used in different ways, in this case, I used it as four single precision (f32
) values; this allowed me to calculate the intersection of a single ray against four different spheres at the same time.
Interestingly, outside of WebAssembly, there are several SIMD implementations with varying levels of CPU support, one of the newest ones is AVX-512, that can work with up to sixteen single precision values. It will be interesting to see if WebAssembly starts supporting wider SIMD vectors in the future.
Nonetheless, WebAssembly does not need to support them directly as long as the virtual machine it runs on makes the optimizations itself, potentially by means of fusing similar adyacent SIMD operations into wider ones; if this ends up being the case then loop unrolling will be key.
Another interesting tidbit, Safari shows the best performance out of the three major web browsers when working with this path tracer implementation:
- Safari (v17.2.1):
409ms
- Firefox (v122.0):
567ms
- Chrome (v121.0.6167.139):
587ms
Time taken to render a single frame on a MacBook Air M1 with 16GB RAM.
Extras
Below is the code behind the vector_simd.wasm
library. If you want to build it, make sure to use the following Cargo.toml
configuration:
[lib]
crate-type = ["cdylib", "rlib"]
And use this commands for building:
rustup target add wasm32-unknown-unknown
RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --release
Finally, the code:
#![cfg(target_feature = "simd128")]
use core::arch::wasm32::*;
use std::ptr::copy;
macro_rules! unroll {
($i: ident, $size: ident, $block: block) => {
while $i < $size {
if $size - $i >= 8 {
$block
$i += 4;
$block
$i += 4;
} else if $size - $i >= 4 {
$block
$i += 4;
} else {
break;
}
}
}
}
#[no_mangle]
pub unsafe fn alloc(capacity: usize) -> *mut u8 {
let mut memory = Vec::with_capacity(capacity);
let ptr = memory.as_mut_ptr();
std::mem::forget(memory);
ptr
}
#[no_mangle]
pub unsafe fn dealloc(n: usize, ptr: *mut u8) {
let _bytes: Vec<u8> = Vec::from_raw_parts(ptr, n, n);
}
#[no_mangle]
pub unsafe fn get(a: *const f32) -> f32 {
*a
}
#[no_mangle]
pub unsafe fn set(a: *mut f32, value: f32) {
copy(&value, a, 1);
}
#[no_mangle]
pub unsafe fn add(size: usize, a: *const f32, b: *const f32, c: *mut f32) {
let mut i = 0;
unroll!(i, size, {
v128_store(
c.add(i) as *mut v128,
f32x4_add(
v128_load(a.add(i) as *const v128),
v128_load(b.add(i) as *const v128),
),
);
});
while i < size {
let x = *a.add(i);
let y = *b.add(i);
let r = x + y;
set(c.add(i), r);
i += 1;
}
}
#[no_mangle]
pub unsafe fn fill(size: usize, v: f32, a: *mut f32) {
let values = f32x4_splat(v);
let mut i = 0;
unroll!(i, size, {
v128_store(a.add(i) as *mut v128, values);
});
while i < size {
set(a.add(i), v);
i += 1;
}
}
#[no_mangle]
pub unsafe fn sub(size: usize, a: *const f32, b: *const f32, c: *mut f32) {
let mut i = 0;
unroll!(i, size, {
v128_store(
c.add(i) as *mut v128,
f32x4_sub(
v128_load(a.add(i) as *const v128),
v128_load(b.add(i) as *const v128),
),
);
});
while i < size {
let x = *a.add(i);
let y = *b.add(i);
let r = x - y;
set(c.add(i), r);
i += 1;
}
}
#[no_mangle]
pub unsafe fn mul_add(size: usize, a: *const f32, b: *const f32, c: *mut f32) {
let mut i = 0;
unroll!(i, size, {
let va = v128_load(a.add(i) as *const v128);
let vb = v128_load(b.add(i) as *const v128);
let vc = v128_load(c.add(i) as *const v128);
v128_store(c.add(i) as *mut v128, f32x4_add(vc, f32x4_mul(va, vb)));
});
while i < size {
let x = *a.add(i);
let y = *b.add(i);
let z = *c.add(i);
let r = x * y + z;
set(c.add(i), r);
i += 1;
}
}
#[no_mangle]
pub unsafe fn sqrt(size: usize, a: *const f32, b: *mut f32) {
let mut i = 0;
unroll!(i, size, {
v128_store(
b.add(i) as *mut v128,
f32x4_sqrt(v128_load(a.add(i) as *const v128)),
);
});
while i < size {
let x = *a.add(i);
let r = x.sqrt();
set(b.add(i), r);
i += 1;
}
}
#[no_mangle]
pub unsafe fn scale(size: usize, v: f32, a: *mut f32, b: *mut f32) {
let values = f32x4_splat(v);
let mut i = 0;
unroll!(i, size, {
let va = v128_load(a.add(i) as *const v128);
v128_store(b.add(i) as *mut v128, f32x4_mul(va, values));
});
while i < size {
let x = *a.add(i);
let r = x * v;
set(b.add(i), r);
i += 1;
}
}
#[no_mangle]
pub unsafe fn div(size: usize, a: *const f32, b: *const f32, c: *mut f32) {
let mut i = 0;
unroll!(i, size, {
v128_store(
c.add(i) as *mut v128,
f32x4_div(
v128_load(a.add(i) as *const v128),
v128_load(b.add(i) as *const v128),
),
);
});
while i < size {
let x = *a.add(i);
let y = *b.add(i);
let r = x / y;
set(c.add(i), r);
i += 1;
}
}
#[no_mangle]
pub unsafe fn mul(size: usize, a: *const f32, b: *const f32, c: *mut f32) {
let mut i = 0;
unroll!(i, size, {
v128_store(
c.add(i) as *mut v128,
f32x4_mul(
v128_load(a.add(i) as *const v128),
v128_load(b.add(i) as *const v128),
),
);
});
while i < size {
let x = *a.add(i);
let y = *b.add(i);
let r = x * y;
set(c.add(i), r);
i += 1;
}
}
#[no_mangle]
pub unsafe fn min(size: usize, a: *const f32, b: *const f32, c: *mut f32) {
let mut i = 0;
unroll!(i, size, {
v128_store(
c.add(i) as *mut v128,
f32x4_min(
v128_load(a.add(i) as *const v128),
v128_load(b.add(i) as *const v128),
),
);
});
while i < size {
let x = *a.add(i);
let y = *b.add(i);
let r = x.min(y);
set(c.add(i), r);
i += 1;
}
}
static mut MEMORY: [f32; 1024] = [0.0; 1024];
#[no_mangle]
pub unsafe fn spheres_intersect(
size: usize,
rox: f32,
roy: f32,
roz: f32,
dx: f32,
dy: f32,
dz: f32,
spcx: *mut f32,
spcy: *mut f32,
spcz: *mut f32,
spcr: *mut f32,
out: *mut f32,
) {
let mem = MEMORY.as_mut_ptr() as *mut f32;
let mut offset = 0;
let ocx = mem.add(offset);
offset += size;
fill(size, rox, ocx);
let ocy = mem.add(offset);
offset += size;
fill(size, roy, ocy);
let ocz = mem.add(offset);
offset += size;
fill(size, roz, ocz);
sub(size, ocx, spcx, ocx);
sub(size, ocy, spcy, ocy);
sub(size, ocz, spcz, ocz);
let rdx = mem.add(offset);
offset += size;
fill(size, dx, rdx);
let rdy = mem.add(offset);
offset += size;
fill(size, dy, rdy);
let rdz = mem.add(offset);
offset += size;
fill(size, dz, rdz);
let a = mem.add(offset);
offset += size;
fill(size, dx * dx + dy * dy + dz * dz, a);
let b = mem.add(offset);
fill(size, 0.0, b);
mul_add(size, ocx, rdx, b);
mul_add(size, ocy, rdy, b);
mul_add(size, ocz, rdz, b);
let m0 = rdy;
fill(size, -1.0, m0);
mul(size, b, m0, b);
let c = rdz;
fill(size, 0.0, c);
mul_add(size, ocx, ocx, c);
mul_add(size, ocy, ocy, c);
mul_add(size, ocz, ocz, c);
let mr = m0;
mul(size, spcr, spcr, mr);
sub(size, c, mr, c);
let bb = ocx;
mul(size, b, b, bb);
let ac = ocy;
mul(size, a, c, ac);
let dis = ocz;
sub(size, bb, ac, dis);
let mask = mr;
for i in 0..size {
let v = if get(dis.add(i)) > 0.0 { 1.0 } else { 0.0 };
set(mask.add(i), v);
}
mul(size, dis, mask, dis);
let e = dis;
sqrt(size, dis, e);
let t1 = bb;
sub(size, b, e, t1);
let t2 = ac;
add(size, b, e, t2);
div(size, t1, a, t1);
div(size, t2, a, t2);
min(size, t1, t2, t1);
mul(size, t1, mask, out);
}