In the previous entry, I was able to get a huge speedup for the path tracing implementation using SIMD instructions plus some optimizations to memory usage, then when closing I mentioned that one of the next steps was to use multi-threading.
Multi-threading means spinning several threads to execute operations in parallel using as many cores as there are available in the CPU. In theory, this promises good speedups with low effort, in reality, things are not as straightfoward.
JavaScript is a single threaded language, but offers ways to scape it using WebWorkers. A WebWorker is an interface supported by web browsers that allow you to run background tasks and communicate back and forth using a messaging interface.
Using this facility, I spent some time updating the path tracer to move the rendering code into a WebWorker and add a synchronization mechanism to trigger rendering sections of an image that then get stitched together to form the final product. The end result is a faster rendering time but not quite what I expected.
This is the code
function wasmWorker() {
function createScene(args) {
const {
vector: { sub, unit, cross },
memory: { allocStaticFloat32Array, set },
sceneSelector,
} = args;
const scene1 = [
{
center: [0, -10002, 0],
radius: 9999.0,
color: [1.0, 1.0, 1.0],
isLight: false,
isSphere: true,
},
{
center: [-10012, 0, 0],
radius: 9999.0,
color: [1, 0, 0],
isLight: false,
isSphere: true,
},
{
center: [10012, 0, 0],
radius: 9999.0,
color: [0, 1, 0],
isLight: false,
isSphere: true,
},
{
center: [0, 0, 10012],
radius: 9999.0,
color: [1, 1, 1],
isLight: false,
isSphere: true,
},
{
center: [0, 10012, 0],
radius: 9999.0,
color: [1, 1, 1],
isLight: true,
isSphere: true,
},
{
center: [-5, 0, 2],
radius: 2.0,
color: [1, 1, 0],
isLight: false,
isSphere: true,
},
{
center: [0, 5, -1],
radius: 4.0,
color: [1, 0, 0],
isLight: false,
isSphere: true,
},
{
center: [8, 5, -1],
radius: 2,
color: [0, 0, 1],
isLight: false,
isSphere: true,
},
];
const scene2 = [
{
center: [0.0, 0.5, 0.0],
radius: 5.0,
color: [0, 0, 1],
isLight: false,
isSphere: true,
},
{
center: [-10, -2, 2],
radius: 2.0,
color: [1, 1, 0],
isLight: false,
isSphere: true,
},
{
center: [0.0, -9000.0 - 5.0, 0.0],
radius: 9000.0,
color: [1.0, 1.0, 1.0],
isLight: false,
isSphere: true,
},
{
center: [0, 10300, 0],
radius: 9999.0,
color: [1, 1, 1],
isLight: true,
isSphere: true,
},
];
const sceneSelected = sceneSelector === 1 ? scene1 : scene2;
const scene = sceneSelected.map((obj, id) => {
if (obj.isTriangle) {
const edge0 = sub(obj.pt1, obj.pt0);
const edge1 = sub(obj.pt2, obj.pt0);
obj.normal = unit(cross(edge0, edge1));
}
obj.id = id;
return obj;
});
const spheres = scene.filter((obj) => obj.isSphere);
const count = spheres.length;
const centerX = allocStaticFloat32Array(count);
const centerY = allocStaticFloat32Array(count);
const centerZ = allocStaticFloat32Array(count);
const radius = allocStaticFloat32Array(count);
const id = [];
spheres.forEach((s, i) => {
set(centerX, i, s.center[0]);
set(centerY, i, s.center[1]);
set(centerZ, i, s.center[2]);
set(radius, i, s.radius);
id[i] = i;
});
const spheresVector = {
centerX,
centerY,
centerZ,
radius,
id,
};
return {
scene,
spheresVector,
};
}
// 256
function createRandomDirectionFunction(args) {
const {
vector: { dot, norm },
} = args;
const randomDirection = (normal) => {
const p = [0, 0, 0];
while (true) {
p[0] = Math.random() - 0.5;
p[1] = Math.random() - 0.5;
p[2] = Math.random() - 0.5;
const n = 1.0 / norm(p);
p[0] *= n;
p[1] *= n;
p[2] *= n;
if (dot(p, normal) >= 0) {
return p;
}
}
};
return {
randomDirection,
};
}
function createMemoryFunctions(args) {
const { wasm } = args;
const totalAvailable = 1024 * 4;
const memPtr = wasm.alloc(totalAvailable);
const memView = new DataView(wasm.memory.buffer, memPtr, totalAvailable);
let staOffset = 0;
let dynOffset = 2048; // half of totalAvailable
function get(A, idx) {
return memView.getFloat32(A + idx * 4 - memPtr, true);
}
function set(A, idx, v) {
memView.setFloat32(A + idx * 4 - memPtr, v, true);
}
const allocFloat32Array = (size) => {
const byteOffset = memPtr + dynOffset;
dynOffset += size * 4;
return byteOffset;
};
const allocStaticFloat32Array = (size) => {
const byteOffset = memPtr + staOffset;
staOffset += size * 4;
return byteOffset;
};
const free = () => (dynOffset = 2048);
return {
memory: {
allocFloat32Array,
allocStaticFloat32Array,
free,
get,
set,
},
};
}
function createSphereIntersectSIMDFunction(args) {
const {
memory: { allocFloat32Array, free },
wasm,
} = args;
const sphereIntersectSIMD = (ray, spheresVector) => {
free();
const len = spheresVector.id.length;
const OUT = allocFloat32Array(len);
wasm.spheres_intersect(
len,
ray.origin[0],
ray.origin[1],
ray.origin[2],
ray.direction[0],
ray.direction[1],
ray.direction[2],
spheresVector.centerX,
spheresVector.centerY,
spheresVector.centerZ,
spheresVector.radius,
OUT,
);
return OUT;
};
return {
sphereIntersectSIMD,
};
}
function createAspectRatioFunction() {
return {
aspectRatio: (width, height) => {
let gcd = width;
let reminder = height;
while (reminder != 0) {
const temp = reminder;
reminder = gcd % reminder;
gcd = temp;
}
return [width / gcd, height / gcd];
},
};
}
function createCamera(args) {
const { width, height, aspectRatio } = args;
const [w, h] = aspectRatio(width, height);
return {
camera: {
leftTop: [-w, h + 1, -50.0],
rightTop: [w, h + 1, -50.0],
leftBottom: [-w, -h + 1, -50.0],
eye: [0.0, 0.0, -65.0],
},
};
}
function createImageGeometry({ width, height }) {
return {
imageGeometry: {
width,
height,
},
};
}
function createVectorFunctions() {
const sub = (A, B) => [A[0] - B[0], A[1] - B[1], A[2] - B[2]];
const add = (A, B) => [A[0] + B[0], A[1] + B[1], A[2] + B[2]];
const mul = (A, B) => [A[0] * B[0], A[1] * B[1], A[2] * B[2]];
const dot = (A, B) => A[0] * B[0] + A[1] * B[1] + A[2] * B[2];
// 135ms
const scale = (A, s) => [A[0] * s, A[1] * s, A[2] * s];
const norm = (A) => Math.sqrt(dot(A, A));
const unit = (A) => scale(A, 1.0 / norm(A));
const abs = (A) => [Math.abs(A[0]), Math.abs(A[1]), Math.abs(A[2])];
const maxDimension = (A) => {
if (A[0] > A[1] && A[0] > A[2]) return 0;
if (A[1] > A[0] && A[1] > A[3]) return 1;
return 2;
};
const permute = (A, i, j, k) => [A[i], A[j], A[k]];
const cross = (A, B) => {
const j = A[1] * B[2] - B[1] * A[2];
const k = A[2] * B[0] - A[0] * B[2];
const l = A[0] * B[1] - A[1] * B[0];
return [j, k, l];
};
const vector = {
sub,
add,
mul,
dot,
scale,
norm,
unit,
abs,
maxDimension,
permute,
cross,
};
return {
vector,
};
}
function calculatePrimaryRays(args) {
const {
camera: { rightTop, leftTop, leftBottom, eye },
imageGeometry: { width, height },
vector: { scale, add, sub, unit },
} = args;
const vdu = scale(sub(rightTop, leftTop), 1.0 / width);
const vdv = scale(sub(leftBottom, leftTop), 1.0 / height);
const primaryRays = [];
for (let y = 0; y < height; y++) {
for (let x = 0; x < width; x++) {
const pixel = y * width + x;
const origin = eye;
const direction = unit(sub(add(add(scale(vdu, x), scale(vdv, y)), leftTop), origin));
primaryRays[pixel] = {
pixel,
origin,
direction,
};
}
}
return {
primaryRays,
};
}
function createTraceSIMDFunction(args) {
const {
scene,
spheresVector,
sphereIntersectSIMD,
vector: { norm },
memory: { get },
} = args;
const len = spheresVector.id.length;
const nohit = { hit: false };
const trace = (ray) => {
const distances = sphereIntersectSIMD(ray, spheresVector);
let closestIndex = -1;
let closestDistance = Number.MAX_VALUE;
for (let i = 0; i < len; i++) {
const distance = get(distances, i);
if (distance > 0 && distance < closestDistance) {
closestDistance = distance;
closestIndex = i;
}
}
if (closestIndex === -1) {
return nohit;
}
const sphere = scene[closestIndex];
const point = [];
point[0] = ray.direction[0] * closestDistance + ray.origin[0];
point[1] = ray.direction[1] * closestDistance + ray.origin[1];
point[2] = ray.direction[2] * closestDistance + ray.origin[2];
const normal = [];
normal[0] = point[0] - sphere.center[0];
normal[1] = point[1] - sphere.center[1];
normal[2] = point[2] - sphere.center[2];
const d = 1.0 / norm(normal);
normal[0] *= d;
normal[1] *= d;
normal[2] *= d;
return {
hit: true,
distance: closestDistance,
point,
obj: sphere,
normal,
};
};
return {
trace,
};
}
function createTracePrimaryRaysFunction(args) {
const { trace, primaryRays, width } = args;
const traceResults = [];
const tracePrimaryRays = (section, tileSize) => {
let idx = 0;
const startPixel = section;
const endPixel = section + width * tileSize;
for (let i = startPixel; i < endPixel; i++) {
const ray = primaryRays[i];
traceResults[idx++] = trace(ray);
}
return traceResults;
};
return {
tracePrimaryRays,
};
}
function createGenerateBitmapFunction(args) {
const {
shading,
vector: { mul },
} = args;
const generateBitmap = (traceResults, section, bbp) => {
let idx = section * 4 * 3;
for (const it of traceResults) {
let pixel = [0, 0, 0];
if (it.hit) {
pixel = it.obj.color;
if (!it.obj.isLight) {
const intensity = shading(it.point, it.normal, 0);
pixel = mul(pixel, intensity);
}
}
bbp.setFloat32(idx, pixel[0], true);
bbp.setFloat32(idx + 4, pixel[1], true);
bbp.setFloat32(idx + 8, pixel[2], true);
idx += 12;
}
};
return { generateBitmap };
}
function createRenderFunction(args) {
const { tracePrimaryRays, generateBitmap, width, height } = args;
const totalPixels = width * height;
const render = (tileSize, bitmap, syncBuffer) => {
const sync = new Uint32Array(syncBuffer);
const sectionSize = tileSize * width;
let section = Atomics.add(sync, 0, sectionSize);
const bpp = new DataView(bitmap, 0, bitmap.length);
while (section < totalPixels) {
const traceResults = tracePrimaryRays(section, tileSize);
generateBitmap(traceResults, section, bpp);
section = Atomics.add(sync, 0, sectionSize);
}
};
return { render };
}
function pipeline(fns) {
return (args) => {
let acc = { ...args };
for (const fn of fns) {
const result = fn(acc);
acc = { ...acc, ...result };
}
return acc;
};
}
function createShadingFunction(args) {
const {
vector: { dot },
trace,
randomDirection,
} = args;
const shading = (shadingPoint, pointNormal, depth) => {
const color = [0, 0, 0];
if (depth === 5) {
return color;
}
const origin = [];
origin[0] = pointNormal[0] * 0.1 + shadingPoint[0];
origin[1] = pointNormal[1] * 0.1 + shadingPoint[1];
origin[2] = pointNormal[2] * 0.1 + shadingPoint[2];
const direction = randomDirection(pointNormal);
const d = dot(pointNormal, direction);
const ray = { origin, direction };
const tr = trace(ray);
if (!tr.hit) {
return color; //black up to this point;
}
color[0] = tr.obj.color[0] * d;
color[1] = tr.obj.color[0] * d;
color[2] = tr.obj.color[0] * d;
if (!tr.obj.isLight) {
const ncolor = shading(tr.point, tr.normal, depth + 1);
color[0] *= ncolor[0];
color[1] *= ncolor[1];
color[2] *= ncolor[2];
}
return color;
};
return {
shading,
};
}
var renderingPipeline = pipeline([
createMemoryFunctions,
createVectorFunctions,
createAspectRatioFunction,
createScene,
createCamera,
createImageGeometry,
createRandomDirectionFunction,
calculatePrimaryRays,
createSphereIntersectSIMDFunction,
createTraceSIMDFunction,
createShadingFunction,
createTracePrimaryRaysFunction,
createGenerateBitmapFunction,
createRenderFunction,
]);
async function loadWasm(wasmFile) {
const {
instance: { exports: wasm },
} = await WebAssembly.instantiate(wasmFile, {});
return wasm;
}
let render = null;
async function init(wasmFile, width, height) {
const wasm = await loadWasm(wasmFile);
render = renderingPipeline({
wasm,
width,
height,
sceneSelector: 1,
}).render;
}
this.onmessage = async (msg) => {
const { data } = msg;
if (data.operation === 'init') {
const { wasmFile, width, height } = data;
await init(wasmFile, width, height);
this.postMessage(0);
} else {
render(data.tileSize, data.bitmap, data.syncBuffer);
this.postMessage(1);
}
};
}
class WorkersPool {
#blobUrl;
poolSize;
maxPoolSize;
#pool = [];
results = [];
constructor(poolSize, maxPoolSize, workerFunction) {
const wasmWorkerString = `(${workerFunction.toString()})()`;
const blob = new Blob([wasmWorkerString], {
type: 'application/javascript',
});
this.#blobUrl = URL.createObjectURL(blob);
this.poolSize = poolSize;
this.maxPoolSize = maxPoolSize;
}
init(initPayload) {
this.#pool = [];
this.poolSize = Math.max(1, this.poolSize);
return new Promise((resolve) => {
let workersDone = 0;
for (let i = 0; i < this.maxPoolSize; i++) {
const worker = new Worker(this.#blobUrl);
this.#pool.push(worker);
worker.onmessage = () => {
if (++workersDone === this.maxPoolSize) {
resolve();
}
};
worker.postMessage(initPayload);
}
});
}
resize(newSize) {
this.poolSize = Math.max(1, newSize);
}
async process(payload) {
return new Promise((resolve) => {
let currentJob = 0;
for (let i = 0; i < this.poolSize; i++) {
let wrk = this.#pool[i];
wrk.onmessage = async (msg) => {
if (++currentJob === this.poolSize) {
resolve();
}
};
wrk.postMessage(payload);
}
});
}
}
const workersPool = new WorkersPool(1, navigator.hardwareConcurrency, wasmWorker);
async function renderAndPresent(canvas, frameCount, framesAcc, syncBuffer, sharedBuffer, finalBitmap) {
const ctx = canvas.getContext('2d');
const width = canvas.width;
const renderStart = performance.now();
const bitmap = new Float32Array(sharedBuffer);
const sync = new Uint32Array(syncBuffer);
const tileSize = 8;
sync[0] = 0;
await workersPool.process({ tileSize, bitmap: sharedBuffer, syncBuffer });
for (let i = 0; i < bitmap.length; i += 3) {
const r = bitmap[i] + (framesAcc[i] ?? 0);
const g = bitmap[i + 1] + (framesAcc[i + 1] ?? 0);
const b = bitmap[i + 2] + (framesAcc[i + 2] ?? 0);
framesAcc[i] = r;
framesAcc[i + 1] = g;
framesAcc[i + 2] = b;
finalBitmap[i / 3] =
(255 << 24) | (((b / frameCount) * 255) << 16) | (((g / frameCount) * 255) << 8) | ((r / frameCount) * 255);
}
const imageData = new ImageData(new Uint8ClampedArray(finalBitmap.buffer), width);
ctx.putImageData(imageData, 0, 0);
const elapsed = Math.floor(performance.now() - renderStart);
const elapsedMs = `${elapsed}ms|${(Math.round(10000 / elapsed) / 10).toFixed(1)}fps|${workersPool.poolSize}/${workersPool.maxPoolSize}threads(${window.adjustPool ? '?' : '!'})`;
ctx.font = '20px monospace';
ctx.textBaseline = 'top';
const measure = ctx.measureText(elapsedMs);
ctx.fillStyle = '#000000';
ctx.fillRect(0, 0, measure.width, measure.fontBoundingBoxDescent);
ctx.fillStyle = '#999999';
ctx.fillText(elapsedMs, 0, 0);
}
window.running = false;
(async () => {
const canvas = document.getElementById('canvas-1');
const width = canvas.width;
const height = canvas.height;
const wasmFile = await (await fetch('wasm/vector_simd.wasm')).arrayBuffer();
await workersPool.init({ operation: 'init', wasmFile, width, height });
let frameCount = 0;
const framesAcc = new Array(width * height * 3);
const sharedBuffer = new SharedArrayBuffer(width * height * 3 * 4);
const syncBuffer = new SharedArrayBuffer(4);
const finalBitmap = new Uint32Array(width * height);
window.running = true;
/* START: Auto adjust poolSize */
const renderTimes = {};
let renderStart = performance.now();
window.adjustPool = true;
/* END: Auto adjust poolSize */
const animation = async () => {
frameCount++;
await renderAndPresent(canvas, frameCount, framesAcc, syncBuffer, sharedBuffer, finalBitmap);
/* START: Auto adjust poolSize */
renderTimes[workersPool.poolSize] += performance.now()
if (window.adjustPool && frameCount % 20 === 0) {
renderTimes[workersPool.poolSize] = performance.now() - renderStart;
if (workersPool.poolSize < workersPool.maxPoolSize) {
await workersPool.resize(workersPool.poolSize + 1);
renderStart = performance.now();
} else {
window.adjustPool = false;
let fastest = Number.MAX_VALUE;
let poolSize = 0;
for (const [size, time] of Object.entries(renderTimes)) {
if (time < fastest) {
fastest = time;
poolSize = size;
}
}
await workersPool.resize(poolSize);
}
}
/* END: Auto adjust poolSize */
window.running && window.requestAnimationFrame(animation);
};
window.requestAnimationFrame(animation);
})();
Implementation details
Tiles but not exactly
Initially I implemented this as square tiles, but the overhead of the message passing interface made the implementation much slower than single threaded. To alleviate this, I implemented full width sections with 8 pixels
of height, which requires less synchronization and fewer messages sent.
While on later developments I think I can go back to square tiles again, I left it like this for the time being.
WorkersPool
The available WebWorkers are managed through a WorkersPool
class that handles the initialization of the WebWorkers and triggers the start the rendering workflow.
When the WorkersPool
is instantiated, there are two parameters passed that control the size of the pool: poolSize
and maxPoolSize
. Neither value is used right away, but until the pool is initialized or the processing is started.
On initialization, the WorkersPool
will create maxPoolSize
number of WebWorkers and initalize them, they will remain in memory, waiting for the signal to start the rendering job.
When the process
function is called, WorkersPool
will signal to poolSize
number of threads to pick the next available section to render and start working on it until no more sections are left.
Since the total number of WebWorkers are available on initialization, it is easier to change the amount of workers to use on a particular frame, and as such, I implemented a method to find the best poolSize
configuration. This code will simply go from 1
thread up until maxPoolSize
and record all the rendering times on each step (signaled by (?)
), after all the configurations have been tested, it scans for the number of threads that gave the best performances and indicates to WorkersPool
to use this configuration from that point onwards (indicated by (!)
).
Message Passing
The default mechanism for comunicating the main JavaScript thread and the spanwed WebWorkers is using a message passing interface that offers a postMessage function and an onmessage event handler (docs).
This mechanism works well to isolate both threads and comunicate between them without data races, the problem is that, to do so, parameters are either copied or moved which makes for slow transactions. In the case of a fast rendering job, it incurrs on a hefty performance penalty.
SharedArrayBuffer
Fortunately, JavaScript offers a less involved communication mechanism using SharedArrayBuffer, which can be read and written by various WebWorkers and the main thread simultaneously, which also means that data races can occur if you are not careful.
With this facility, I was able to deprecate the use of the message passing interface to sync rendering jobs and their results. Now, all WebWorkers write to separate sections of the shared memory.
But for using SharedArrayBuffer
, there is one big caveat to be aware of: you can only use it inside a secure context. To achieve this, you need your webserver to emit two headers:
Cross-Origin-Opener-Policy: same-origin
Cross-Origin-Embedder-Policy: require-corp
And then, if you are not testing from local (either through localhost
or 127.0.0.1
), you need to serve your resources from https
or wss
, docs.
Atomics
Atomics are a facility offered by JavaScript to synchronize the access to shared resources in a multi-threaded environment. They can be used to access a memory location for reading or writing with the known safety that no other updates will happen while the operation is being executed.
Using them, I was able to pass a memory object containing the next available section for processing, enabling the implementation of a sort of job queue where WebWorkers can take the next un-processed portion of the image to work on.
This sort of work stealing strategy allows for two advantages
- Threads working on difficult parts of the scene will not block other threads from other sections.
- In case of heterogeneous CPU architectures, like big.LITTLE, slower cores can still help during rendering at their own pace without slowing the faster cores work.
What went wrong
Sadly, not everything went as smooth as I expected and the main problem this implementation faces is the JavaScript garbage collector, which on certain browsers (Firefox and Chrome), causes big performance drops when executing major swipes. Interestingly, this is not a problem in Safari, so Apple devices are unnafected by this.
Also, different browsers do not advertise the same value for navigator.hardwareConcurrency
on the same CPU, for instance on a Ryzen 3950X, Chrome returns 32
while Firefox gives 16
, which changes the program behavior if using that as a guide on threads configuration.
But anyway, usually after two or three threads, the speedup dramatically decreases, indicating a bottleneck is somewhere that I have not identified yet, my guess is that there is too much pressure in the memory subsystem, and a fix will require removing as many allocations as possible in the code's hot path.
Summary
I started writing this entry on a defeating tone since the initial implementation was very slow, until I refactored to use SharedArrayBuffer
and Atomics
, which improved things dramatically.
Still, the bottleneck from the memory allocator and garbage collector is stopping the implementation from using all the available computational resources for rendering. I might spend some time on this later, and move all the code to WebAssembly where memory management is more deterministic using Rust.
The next step will be implement a good accelerator structure to speedup the intersection tests, but to really experience its advantages, there needs to be several objects in the scene, and there is no easier way to increase the scene complexity than building everything out of triangles.
So next chapter in this series will bring back triangles using WebAssembly SIMD instructions.