gpu: add opencl support for macos

This allows using opencl as an alternative to opengl compute shaders on macos, which does not support compute shaders. Now, macos can finally use the extended draw distance feature of the gpu plugin. This also includes code for using opencl with Windows and Linux if we want to enable that in the future. A copy of the existing compute shaders have been checked in and ported to opencl, keeping support for opengl compute shaders on Windows and Linux. Co-authored-by: Paul Norton <napkinorton@gmail.com>
2021-02-10 21:01:53 -05:00
parent 26f26308ab
commit 13efaa6a0c
9 changed files with 1445 additions and 175 deletions
--- a/runelite-client/pom.xml
+++ b/runelite-client/pom.xml
@@ -177,6 +177,25 @@
 			<version>2.4.0-rc-20210117</version>
 			<scope>runtime</scope>
 		</dependency>
+		<dependency>
+			<groupId>net.runelite.jocl</groupId>
+			<artifactId>jocl</artifactId>
+			<version>1.0</version>
+		</dependency>
+		<dependency>
+			<groupId>net.runelite.jocl</groupId>
+			<artifactId>jocl</artifactId>
+			<version>1.0</version>
+			<classifier>macos-x64</classifier>
+			<scope>runtime</scope>
+		</dependency>
+		<dependency>
+			<groupId>net.runelite.jocl</groupId>
+			<artifactId>jocl</artifactId>
+			<version>1.0</version>
+			<classifier>macos-arm64</classifier>
+			<scope>runtime</scope>
+		</dependency>
 		<dependency>
 			<groupId>net.runelite</groupId>
 			<artifactId>archive-patcher</artifactId>
--- a/runelite-client/src/main/java/net/runelite/client/plugins/gpu/GLBuffer.java
+++ b/runelite-client/src/main/java/net/runelite/client/plugins/gpu/GLBuffer.java
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021, Adam <Adam@sigterm.info>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+package net.runelite.client.plugins.gpu;
+
+import org.jocl.Pointer;
+import org.jocl.cl_mem;
+
+class GLBuffer
+{
+	int glBufferId = -1;
+	int size = -1;
+	cl_mem cl_mem;
+
+	Pointer ptr()
+	{
+		return cl_mem != null ? Pointer.to(cl_mem) : null;
+	}
+}
--- a/runelite-client/src/main/java/net/runelite/client/plugins/gpu/GpuPlugin.java
+++ b/runelite-client/src/main/java/net/runelite/client/plugins/gpu/GpuPlugin.java
@@ -29,6 +29,11 @@ import com.google.inject.Provides;
 import com.jogamp.nativewindow.awt.AWTGraphicsConfiguration;
 import com.jogamp.nativewindow.awt.JAWTWindow;
 import com.jogamp.opengl.GL;
+import static com.jogamp.opengl.GL.GL_ARRAY_BUFFER;
+import static com.jogamp.opengl.GL.GL_DYNAMIC_DRAW;
+import static com.jogamp.opengl.GL2ES2.GL_STREAM_DRAW;
+import static com.jogamp.opengl.GL2ES3.GL_STATIC_COPY;
+import static com.jogamp.opengl.GL2ES3.GL_UNIFORM_BUFFER;
 import com.jogamp.opengl.GL4;
 import com.jogamp.opengl.GLCapabilities;
 import com.jogamp.opengl.GLContext;
@@ -45,6 +50,7 @@ import java.awt.Image;
 import java.awt.geom.AffineTransform;
 import java.awt.image.BufferedImage;
 import java.awt.image.DataBufferInt;
+import java.nio.Buffer;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.FloatBuffer;
@@ -93,6 +99,10 @@ import net.runelite.client.plugins.gpu.config.UIScalingMode;
 import net.runelite.client.plugins.gpu.template.Template;
 import net.runelite.client.ui.DrawManager;
 import net.runelite.client.util.OSType;
+import org.jocl.CL;
+import static org.jocl.CL.CL_MEM_READ_ONLY;
+import static org.jocl.CL.CL_MEM_WRITE_ONLY;
+import static org.jocl.CL.clCreateFromGLBuffer;

@PluginDescriptor(
 	name = "GPU",
@@ -105,8 +115,8 @@ import net.runelite.client.util.OSType;
 public class GpuPlugin extends Plugin implements DrawCallbacks
 {
 	// This is the maximum number of triangles the compute shaders support
-	private static final int MAX_TRIANGLE = 4096;
-	private static final int SMALL_TRIANGLE_COUNT = 512;
+	static final int MAX_TRIANGLE = 4096;
+	static final int SMALL_TRIANGLE_COUNT = 512;
 	private static final int FLAG_SCENE_BUFFER = Integer.MIN_VALUE;
 	private static final int DEFAULT_DISTANCE = 25;
 	static final int MAX_DISTANCE = 90;
@@ -115,6 +125,9 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 	@Inject
 	private Client client;

+	@Inject
+	private OpenCLManager openCLManager;
+
 	@Inject
 	private ClientThread clientThread;

@@ -133,7 +146,14 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 	@Inject
 	private PluginManager pluginManager;

-	private boolean useComputeShaders;
+	enum ComputeMode
+	{
+		NONE,
+		OPENGL,
+		OPENCL
+	}
+
+	private ComputeMode computeMode = ComputeMode.NONE;

 	private Canvas canvas;
 	private JAWTWindow jawtWindow;
@@ -182,23 +202,22 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 	private int texSceneHandle;
 	private int rboSceneHandle;

-	// scene vertex buffer id
-	private int bufferId;
-	// scene uv buffer id
-	private int uvBufferId;
+	// scene vertex buffer
+	private final GLBuffer sceneVertexBuffer = new GLBuffer();
+	// scene uv buffer
+	private final GLBuffer sceneUvBuffer = new GLBuffer();

-	private int tmpBufferId; // temporary scene vertex buffer
-	private int tmpUvBufferId; // temporary scene uv buffer
-	private int tmpModelBufferId; // scene model buffer, large
-	private int tmpModelBufferSmallId; // scene model buffer, small
-	private int tmpModelBufferUnorderedId;
-	private int tmpOutBufferId; // target vertex buffer for compute shaders
-	private int tmpOutUvBufferId; // target uv buffer for compute shaders
+	private final GLBuffer tmpVertexBuffer = new GLBuffer(); // temporary scene vertex buffer
+	private final GLBuffer tmpUvBuffer = new GLBuffer(); // temporary scene uv buffer
+	private final GLBuffer tmpModelBufferLarge = new GLBuffer(); // scene model buffer, large
+	private final GLBuffer tmpModelBufferSmall = new GLBuffer(); // scene model buffer, small
+	private final GLBuffer tmpModelBufferUnordered = new GLBuffer(); // scene model buffer, unordered
+	private final GLBuffer tmpOutBuffer = new GLBuffer(); // target vertex buffer for compute shaders
+	private final GLBuffer tmpOutUvBuffer = new GLBuffer(); // target uv buffer for compute shaders

 	private int textureArrayId;

-	private int uniformBufferId;
-	private final IntBuffer uniformBuffer = GpuIntBuffer.allocateDirect(5 + 3 + 2048 * 4);
+	private final GLBuffer uniformBuffer = new GLBuffer();
 	private final float[] textureOffsets = new float[128];

 	private GpuIntBuffer vertexBuffer;
@@ -278,7 +297,6 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 		{
 			try
 			{
-				bufferId = uvBufferId = uniformBufferId = tmpBufferId = tmpUvBufferId = tmpModelBufferId = tmpModelBufferSmallId = tmpModelBufferUnorderedId = tmpOutBufferId = tmpOutUvBufferId = -1;
 				texSceneHandle = fboSceneHandle = rboSceneHandle = -1; // AA FBO
 				unorderedModels = smallModels = largeModels = 0;
 				drawingModel = false;
@@ -290,8 +308,9 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 					return false;
 				}

-				// OSX supports up to OpenGL 4.1, however 4.3 is required for compute shaders
-				useComputeShaders = config.useComputeShaders() && OSType.getOSType() != OSType.MacOS;
+				computeMode = config.useComputeShaders()
+					? (OSType.getOSType() == OSType.MacOS ? ComputeMode.OPENCL : ComputeMode.OPENGL)
+					: ComputeMode.NONE;

 				canvas.setIgnoreRepaint(true);

@@ -397,7 +416,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks

 				if (client.getGameState() == GameState.LOGGED_IN)
 				{
-					uploadScene();
+					invokeOnMainThread(this::uploadScene);
 				}
 			}
 			catch (Throwable e)
@@ -433,6 +452,8 @@ public class GpuPlugin extends Plugin implements DrawCallbacks

 			invokeOnMainThread(() ->
 			{
+				openCLManager.cleanup();
+
 				if (gl != null)
 				{
 					if (textureArrayId != -1)
@@ -441,11 +462,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 						textureArrayId = -1;
 					}

-					if (uniformBufferId != -1)
-					{
-						glDeleteBuffer(gl, uniformBufferId);
-						uniformBufferId = -1;
-					}
+					destroyGlBuffer(uniformBuffer);

 					shutdownBuffers();
 					shutdownInterfaceTexture();
@@ -519,12 +536,16 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 		glProgram = PROGRAM.compile(gl, template);
 		glUiProgram = UI_PROGRAM.compile(gl, template);

-		if (useComputeShaders)
+		if (computeMode == ComputeMode.OPENGL)
 		{
 			glComputeProgram = COMPUTE_PROGRAM.compile(gl, template);
 			glSmallComputeProgram = SMALL_COMPUTE_PROGRAM.compile(gl, template);
 			glUnorderedComputeProgram = UNORDERED_COMPUTE_PROGRAM.compile(gl, template);
 		}
+		else if (computeMode == ComputeMode.OPENCL)
+		{
+			openCLManager.init(gl);
+		}

 		initUniforms();
 	}
@@ -593,8 +614,8 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 			-1f, 1f, 0.0f, 0.0f, 0f  // top left
 		});
 		vboUiBuf.rewind();
-		gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vboUiHandle);
-		gl.glBufferData(gl.GL_ARRAY_BUFFER, vboUiBuf.capacity() * Float.BYTES, vboUiBuf, gl.GL_STATIC_DRAW);
+		gl.glBindBuffer(GL_ARRAY_BUFFER, vboUiHandle);
+		gl.glBufferData(GL_ARRAY_BUFFER, vboUiBuf.capacity() * Float.BYTES, vboUiBuf, gl.GL_STATIC_DRAW);

 		// position attribute
 		gl.glVertexAttribPointer(0, 3, gl.GL_FLOAT, false, 5 * Float.BYTES, 0);
@@ -605,7 +626,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 		gl.glEnableVertexAttribArray(1);

 		// unbind VBO
-		gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0);
+		gl.glBindBuffer(GL_ARRAY_BUFFER, 0);
 	}

 	private void shutdownVao()
@@ -622,71 +643,49 @@ public class GpuPlugin extends Plugin implements DrawCallbacks

 	private void initBuffers()
 	{
-		bufferId = glGenBuffers(gl);
-		uvBufferId = glGenBuffers(gl);
-		tmpBufferId = glGenBuffers(gl);
-		tmpUvBufferId = glGenBuffers(gl);
-		tmpModelBufferId = glGenBuffers(gl);
-		tmpModelBufferSmallId = glGenBuffers(gl);
-		tmpModelBufferUnorderedId = glGenBuffers(gl);
-		tmpOutBufferId = glGenBuffers(gl);
-		tmpOutUvBufferId = glGenBuffers(gl);
+		initGlBuffer(sceneVertexBuffer);
+		initGlBuffer(sceneUvBuffer);
+		initGlBuffer(tmpVertexBuffer);
+		initGlBuffer(tmpUvBuffer);
+		initGlBuffer(tmpModelBufferLarge);
+		initGlBuffer(tmpModelBufferSmall);
+		initGlBuffer(tmpModelBufferUnordered);
+		initGlBuffer(tmpOutBuffer);
+		initGlBuffer(tmpOutUvBuffer);
+	}
+
+	private void initGlBuffer(GLBuffer glBuffer)
+	{
+		glBuffer.glBufferId = glGenBuffers(gl);
 	}

 	private void shutdownBuffers()
 	{
-		if (bufferId != -1)
-		{
-			glDeleteBuffer(gl, bufferId);
-			bufferId = -1;
-		}
+		destroyGlBuffer(sceneVertexBuffer);
+		destroyGlBuffer(sceneUvBuffer);

-		if (uvBufferId != -1)
-		{
-			glDeleteBuffer(gl, uvBufferId);
-			uvBufferId = -1;
-		}
+		destroyGlBuffer(tmpVertexBuffer);
+		destroyGlBuffer(tmpUvBuffer);
+		destroyGlBuffer(tmpModelBufferLarge);
+		destroyGlBuffer(tmpModelBufferSmall);
+		destroyGlBuffer(tmpModelBufferUnordered);
+		destroyGlBuffer(tmpOutBuffer);
+		destroyGlBuffer(tmpOutUvBuffer);
+	}

-		if (tmpBufferId != -1)
+	private void destroyGlBuffer(GLBuffer glBuffer)
+	{
+		if (glBuffer.glBufferId != -1)
 		{
-			glDeleteBuffer(gl, tmpBufferId);
-			tmpBufferId = -1;
+			glDeleteBuffer(gl, glBuffer.glBufferId);
+			glBuffer.glBufferId = -1;
 		}
+		glBuffer.size = -1;

-		if (tmpUvBufferId != -1)
+		if (glBuffer.cl_mem != null)
 		{
-			glDeleteBuffer(gl, tmpUvBufferId);
-			tmpUvBufferId = -1;
-		}
-
-		if (tmpModelBufferId != -1)
-		{
-			glDeleteBuffer(gl, tmpModelBufferId);
-			tmpModelBufferId = -1;
-		}
-
-		if (tmpModelBufferSmallId != -1)
-		{
-			glDeleteBuffer(gl, tmpModelBufferSmallId);
-			tmpModelBufferSmallId = -1;
-		}
-
-		if (tmpModelBufferUnorderedId != -1)
-		{
-			glDeleteBuffer(gl, tmpModelBufferUnorderedId);
-			tmpModelBufferUnorderedId = -1;
-		}
-
-		if (tmpOutBufferId != -1)
-		{
-			glDeleteBuffer(gl, tmpOutBufferId);
-			tmpOutBufferId = -1;
-		}
-
-		if (tmpOutUvBufferId != -1)
-		{
-			glDeleteBuffer(gl, tmpOutUvBufferId);
-			tmpOutUvBufferId = -1;
+			CL.clReleaseMemObject(glBuffer.cl_mem);
+			glBuffer.cl_mem = null;
 		}
 	}

@@ -709,21 +708,21 @@ public class GpuPlugin extends Plugin implements DrawCallbacks

 	private void initUniformBuffer()
 	{
-		uniformBufferId = glGenBuffers(gl);
-		gl.glBindBuffer(gl.GL_UNIFORM_BUFFER, uniformBufferId);
-		uniformBuffer.clear();
-		uniformBuffer.put(new int[8]);
+		initGlBuffer(uniformBuffer);
+
+		IntBuffer uniformBuf = GpuIntBuffer.allocateDirect(8 + 2048 * 4);
+		uniformBuf.put(new int[8]); // uniform block
 		final int[] pad = new int[2];
 		for (int i = 0; i < 2048; i++)
 		{
-			uniformBuffer.put(Perspective.SINE[i]);
-			uniformBuffer.put(Perspective.COSINE[i]);
-			uniformBuffer.put(pad);
+			uniformBuf.put(Perspective.SINE[i]);
+			uniformBuf.put(Perspective.COSINE[i]);
+			uniformBuf.put(pad); // ivec2 alignment in std140 is 16 bytes
 		}
-		uniformBuffer.flip();
+		uniformBuf.flip();

-		gl.glBufferData(gl.GL_UNIFORM_BUFFER, uniformBuffer.limit() * Integer.BYTES, uniformBuffer, gl.GL_DYNAMIC_DRAW);
-		gl.glBindBuffer(gl.GL_UNIFORM_BUFFER, 0);
+		updateBuffer(uniformBuffer, GL_UNIFORM_BUFFER, uniformBuf.limit() * Integer.BYTES, uniformBuf, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY);
+		gl.glBindBuffer(GL_UNIFORM_BUFFER, 0);
 	}

 	private void initAAFbo(int width, int height, int aaSamples)
@@ -785,9 +784,11 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 		invokeOnMainThread(() ->
 		{
 			// UBO. Only the first 32 bytes get modified here, the rest is the constant sin/cos table.
-			gl.glBindBuffer(gl.GL_UNIFORM_BUFFER, uniformBufferId);
-			uniformBuffer.clear();
-			uniformBuffer
+			// We can reuse the vertex buffer since it isn't used yet.
+			vertexBuffer.clear();
+			vertexBuffer.ensureCapacity(32);
+			IntBuffer uniformBuf = vertexBuffer.getBuffer();
+			uniformBuf
 				.put(yaw)
 				.put(pitch)
 				.put(client.getCenterX())
@@ -796,12 +797,14 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 				.put(cameraX)
 				.put(cameraY)
 				.put(cameraZ);
-			uniformBuffer.flip();
+			uniformBuf.flip();

-			gl.glBufferSubData(gl.GL_UNIFORM_BUFFER, 0, uniformBuffer.limit() * Integer.BYTES, uniformBuffer);
-			gl.glBindBuffer(gl.GL_UNIFORM_BUFFER, 0);
+			gl.glBindBuffer(GL_UNIFORM_BUFFER, uniformBuffer.glBufferId);
+			gl.glBufferSubData(GL_UNIFORM_BUFFER, 0, uniformBuf.limit() * Integer.BYTES, uniformBuf);
+			gl.glBindBuffer(GL_UNIFORM_BUFFER, 0);

-			gl.glBindBufferBase(gl.GL_UNIFORM_BUFFER, 0, uniformBufferId);
+			gl.glBindBufferBase(GL_UNIFORM_BUFFER, 0, uniformBuffer.glBufferId);
+			uniformBuf.clear();
 		});
 	}

@@ -813,7 +816,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks

 	private void postDraw()
 	{
-		if (!useComputeShaders)
+		if (computeMode == ComputeMode.NONE)
 		{
 			// Upload buffers
 			vertexBuffer.flip();
@@ -822,12 +825,8 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 			IntBuffer vertexBuffer = this.vertexBuffer.getBuffer();
 			FloatBuffer uvBuffer = this.uvBuffer.getBuffer();

-			gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpBufferId);
-			gl.glBufferData(gl.GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, gl.GL_DYNAMIC_DRAW);
-
-			gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpUvBufferId);
-			gl.glBufferData(gl.GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, gl.GL_DYNAMIC_DRAW);
-
+			updateBuffer(tmpVertexBuffer, GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, GL_DYNAMIC_DRAW, 0L);
+			updateBuffer(tmpUvBuffer, GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, GL_DYNAMIC_DRAW, 0L);
 			return;
 		}

@@ -844,79 +843,91 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 		IntBuffer modelBufferSmall = this.modelBufferSmall.getBuffer();
 		IntBuffer modelBufferUnordered = this.modelBufferUnordered.getBuffer();

-		gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpBufferId);
-		gl.glBufferData(gl.GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, gl.GL_DYNAMIC_DRAW);
+		// temp buffers
+		updateBuffer(tmpVertexBuffer, GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY);
+		updateBuffer(tmpUvBuffer, GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY);

-		gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpUvBufferId);
-		gl.glBufferData(gl.GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, gl.GL_DYNAMIC_DRAW);
-
-		gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpModelBufferId);
-		gl.glBufferData(gl.GL_ARRAY_BUFFER, modelBuffer.limit() * Integer.BYTES, modelBuffer, gl.GL_DYNAMIC_DRAW);
-
-		gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpModelBufferSmallId);
-		gl.glBufferData(gl.GL_ARRAY_BUFFER, modelBufferSmall.limit() * Integer.BYTES, modelBufferSmall, gl.GL_DYNAMIC_DRAW);
-
-		gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpModelBufferUnorderedId);
-		gl.glBufferData(gl.GL_ARRAY_BUFFER, modelBufferUnordered.limit() * Integer.BYTES, modelBufferUnordered, gl.GL_DYNAMIC_DRAW);
+		// model buffers
+		updateBuffer(tmpModelBufferLarge, GL_ARRAY_BUFFER, modelBuffer.limit() * Integer.BYTES, modelBuffer, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY);
+		updateBuffer(tmpModelBufferSmall, GL_ARRAY_BUFFER, modelBufferSmall.limit() * Integer.BYTES, modelBufferSmall, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY);
+		updateBuffer(tmpModelBufferUnordered, GL_ARRAY_BUFFER, modelBufferUnordered.limit() * Integer.BYTES, modelBufferUnordered, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY);

 		// Output buffers
-		gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpOutBufferId);
-		gl.glBufferData(gl.GL_ARRAY_BUFFER,
+		updateBuffer(tmpOutBuffer,
+			GL_ARRAY_BUFFER,
 			targetBufferOffset * 16, // each vertex is an ivec4, which is 16 bytes
 			null,
-			gl.GL_STREAM_DRAW);
-
-		gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpOutUvBufferId);
-		gl.glBufferData(gl.GL_ARRAY_BUFFER,
-			targetBufferOffset * 16,
+			GL_STREAM_DRAW,
+			CL_MEM_WRITE_ONLY);
+		updateBuffer(tmpOutUvBuffer,
+			GL_ARRAY_BUFFER,
+			targetBufferOffset * 16, // each vertex is an ivec4, which is 16 bytes
 			null,
-			gl.GL_STREAM_DRAW);
+			GL_STREAM_DRAW,
+			CL_MEM_WRITE_ONLY);

-		// Bind UBO to compute programs
-		gl.glUniformBlockBinding(glSmallComputeProgram, uniBlockSmall, 0);
-		gl.glUniformBlockBinding(glComputeProgram, uniBlockLarge, 0);
+		if (computeMode == ComputeMode.OPENCL)
+		{
+			// The docs for clEnqueueAcquireGLObjects say all pending GL operations must be completed before calling
+			// clEnqueueAcquireGLObjects, and recommends calling glFinish() as the only portable way to do that.
+			// However no issues have been observed from not calling it, and so will leave disabled for now.
+			// gl.glFinish();
+
+			openCLManager.compute(
+				unorderedModels, smallModels, largeModels,
+				sceneVertexBuffer, sceneUvBuffer,
+				tmpVertexBuffer, tmpUvBuffer,
+				tmpModelBufferUnordered, tmpModelBufferSmall, tmpModelBufferLarge,
+				tmpOutBuffer, tmpOutUvBuffer,
+				uniformBuffer);
+			return;
+		}

 		/*
 		 * Compute is split into three separate programs: 'unordered', 'small', and 'large'
 		 * to save on GPU resources. Small will sort <= 512 faces, large will do <= 4096.
 		 */

+		// Bind UBO to compute programs
+		gl.glUniformBlockBinding(glSmallComputeProgram, uniBlockSmall, 0);
+		gl.glUniformBlockBinding(glComputeProgram, uniBlockLarge, 0);
+
 		// unordered
 		gl.glUseProgram(glUnorderedComputeProgram);

-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferUnorderedId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, this.bufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpBufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, this.uvBufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferUnordered.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, sceneVertexBuffer.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpVertexBuffer.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBuffer.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBuffer.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, sceneUvBuffer.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBuffer.glBufferId);

 		gl.glDispatchCompute(unorderedModels, 1, 1);

 		// small
 		gl.glUseProgram(glSmallComputeProgram);

-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferSmallId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, this.bufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpBufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, this.uvBufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferSmall.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, sceneVertexBuffer.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpVertexBuffer.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBuffer.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBuffer.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, sceneUvBuffer.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBuffer.glBufferId);

 		gl.glDispatchCompute(smallModels, 1, 1);

 		// large
 		gl.glUseProgram(glComputeProgram);

-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, this.bufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpBufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, this.uvBufferId);
-		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferLarge.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, sceneVertexBuffer.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpVertexBuffer.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBuffer.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBuffer.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, sceneUvBuffer.glBufferId);
+		gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBuffer.glBufferId);

 		gl.glDispatchCompute(largeModels, 1, 1);
 	}
@@ -926,7 +937,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 							SceneTilePaint paint, int tileZ, int tileX, int tileY,
 							int zoom, int centerX, int centerY)
 	{
-		if (!useComputeShaders)
+		if (computeMode == ComputeMode.NONE)
 		{
 			targetBufferOffset += sceneUploader.upload(paint,
 				tileZ, tileX, tileY,
@@ -963,7 +974,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 							SceneTileModel model, int tileZ, int tileX, int tileY,
 							int zoom, int centerX, int centerY)
 	{
-		if (!useComputeShaders)
+		if (computeMode == ComputeMode.NONE)
 		{
 			targetBufferOffset += sceneUploader.upload(model,
 				tileX, tileY,
@@ -1131,7 +1142,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks

 				// Ceil the sizes because even if the size is 599.1 we want to treat it as size 600 (i.e. render to the x=599 pixel).
 				renderViewportHeight = (int) Math.ceil(scaleFactorY * (renderViewportHeight)) + padding * 2;
-				renderViewportWidth  = (int) Math.ceil(scaleFactorX * (renderViewportWidth )) + padding * 2;
+				renderViewportWidth  = (int) Math.ceil(scaleFactorX * (renderViewportWidth ))  + padding * 2;

 				// Floor the offsets because even if the offset is 4.9, we want to render to the x=4 pixel anyway.
 				renderHeightOff      = (int) Math.floor(scaleFactorY * (renderHeightOff)) - padding;
@@ -1195,27 +1206,36 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 			gl.glBindVertexArray(vaoHandle);

 			int vertexBuffer, uvBuffer;
-			if (useComputeShaders)
+			if (computeMode != ComputeMode.NONE)
 			{
-				// Before reading the SSBOs written to from postDrawScene() we must insert a barrier
-				gl.glMemoryBarrier(gl.GL_SHADER_STORAGE_BARRIER_BIT);
+				if (computeMode == ComputeMode.OPENGL)
+				{
+					// Before reading the SSBOs written to from postDrawScene() we must insert a barrier
+					gl.glMemoryBarrier(gl.GL_SHADER_STORAGE_BARRIER_BIT);
+				}
+				else
+				{
+					// Wait for the command queue to finish, so that we know the compute is done
+					openCLManager.finish();
+				}
+
 				// Draw using the output buffer of the compute
-				vertexBuffer = tmpOutBufferId;
-				uvBuffer = tmpOutUvBufferId;
+				vertexBuffer = tmpOutBuffer.glBufferId;
+				uvBuffer = tmpOutUvBuffer.glBufferId;
 			}
 			else
 			{
 				// Only use the temporary buffers, which will contain the full scene
-				vertexBuffer = tmpBufferId;
-				uvBuffer = tmpUvBufferId;
+				vertexBuffer = tmpVertexBuffer.glBufferId;
+				uvBuffer = tmpUvBuffer.glBufferId;
 			}

 			gl.glEnableVertexAttribArray(0);
-			gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vertexBuffer);
+			gl.glBindBuffer(GL_ARRAY_BUFFER, vertexBuffer);
 			gl.glVertexAttribIPointer(0, 4, gl.GL_INT, 0, 0);

 			gl.glEnableVertexAttribArray(1);
-			gl.glBindBuffer(gl.GL_ARRAY_BUFFER, uvBuffer);
+			gl.glBindBuffer(GL_ARRAY_BUFFER, uvBuffer);
 			gl.glVertexAttribPointer(1, 4, gl.GL_FLOAT, false, 0, 0);

 			gl.glDrawArrays(gl.GL_TRIANGLES, 0, targetBufferOffset);
@@ -1400,12 +1420,12 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 	@Subscribe
 	public void onGameStateChanged(GameStateChanged gameStateChanged)
 	{
-		if (!useComputeShaders || gameStateChanged.getGameState() != GameState.LOGGED_IN)
+		if (computeMode == ComputeMode.NONE || gameStateChanged.getGameState() != GameState.LOGGED_IN)
 		{
 			return;
 		}

-		uploadScene();
+		invokeOnMainThread(this::uploadScene);
 	}

 	private void uploadScene()
@@ -1421,13 +1441,10 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 		IntBuffer vertexBuffer = this.vertexBuffer.getBuffer();
 		FloatBuffer uvBuffer = this.uvBuffer.getBuffer();

-		gl.glBindBuffer(gl.GL_ARRAY_BUFFER, bufferId);
-		gl.glBufferData(gl.GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, gl.GL_STATIC_COPY);
+		updateBuffer(sceneVertexBuffer, GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, GL_STATIC_COPY, CL_MEM_READ_ONLY);
+		updateBuffer(sceneUvBuffer, GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, GL_STATIC_COPY, CL_MEM_READ_ONLY);

-		gl.glBindBuffer(gl.GL_ARRAY_BUFFER, uvBufferId);
-		gl.glBufferData(gl.GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, gl.GL_STATIC_COPY);
-
-		gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0);
+		gl.glBindBuffer(GL_ARRAY_BUFFER, 0);

 		vertexBuffer.clear();
 		uvBuffer.clear();
@@ -1492,7 +1509,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 	@Override
 	public void draw(Renderable renderable, int orientation, int pitchSin, int pitchCos, int yawSin, int yawCos, int x, int y, int z, long hash)
 	{
-		if (!useComputeShaders)
+		if (computeMode == ComputeMode.NONE)
 		{
 			Model model = renderable instanceof Model ? (Model) renderable : renderable.getModel();
 			if (model != null)
@@ -1673,7 +1690,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks

 	private int getDrawDistance()
 	{
-		final int limit = useComputeShaders ? MAX_DISTANCE : DEFAULT_DISTANCE;
+		final int limit = computeMode != ComputeMode.NONE ? MAX_DISTANCE : DEFAULT_DISTANCE;
 		return Ints.constrainToRange(config.drawDistance(), 0, limit);
 	}

@@ -1688,4 +1705,36 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
 			runnable.run();
 		}
 	}
+
+	private void updateBuffer(GLBuffer glBuffer, int target, int size, Buffer data, int usage, long clFlags)
+	{
+		gl.glBindBuffer(target, glBuffer.glBufferId);
+		if (size > glBuffer.size)
+		{
+			log.trace("Buffer resize: {} {} -> {}", glBuffer, glBuffer.size, size);
+
+			glBuffer.size = size;
+			gl.glBufferData(target, size, data, usage);
+
+			if (computeMode == ComputeMode.OPENCL)
+			{
+				if (glBuffer.cl_mem != null)
+				{
+					CL.clReleaseMemObject(glBuffer.cl_mem);
+				}
+				if (size == 0)
+				{
+					glBuffer.cl_mem = null;
+				}
+				else
+				{
+					glBuffer.cl_mem = clCreateFromGLBuffer(openCLManager.context, clFlags, glBuffer.glBufferId, null);
+				}
+			}
+		}
+		else if (data != null)
+		{
+			gl.glBufferSubData(target, 0, size, data);
+		}
+	}
 }
--- a/runelite-client/src/main/java/net/runelite/client/plugins/gpu/OpenCLManager.java
+++ b/runelite-client/src/main/java/net/runelite/client/plugins/gpu/OpenCLManager.java
@@ -0,0 +1,521 @@
+/*
+ * Copyright (c) 2021, Adam <Adam@sigterm.info>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+package net.runelite.client.plugins.gpu;
+
+import com.google.common.base.Charsets;
+import com.jogamp.nativewindow.NativeSurface;
+import com.jogamp.opengl.GL4;
+import com.jogamp.opengl.GLContext;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Objects;
+import javax.inject.Singleton;
+import jogamp.opengl.GLContextImpl;
+import jogamp.opengl.GLDrawableImpl;
+import jogamp.opengl.egl.EGLContext;
+import jogamp.opengl.macosx.cgl.CGL;
+import jogamp.opengl.windows.wgl.WindowsWGLContext;
+import jogamp.opengl.x11.glx.X11GLXContext;
+import lombok.extern.slf4j.Slf4j;
+import net.runelite.client.plugins.gpu.template.Template;
+import net.runelite.client.util.OSType;
+import org.jocl.CL;
+import static org.jocl.CL.*;
+import org.jocl.CLException;
+import org.jocl.Pointer;
+import org.jocl.Sizeof;
+import org.jocl.cl_command_queue;
+import org.jocl.cl_context;
+import org.jocl.cl_context_properties;
+import org.jocl.cl_device_id;
+import org.jocl.cl_event;
+import org.jocl.cl_kernel;
+import org.jocl.cl_mem;
+import org.jocl.cl_platform_id;
+import org.jocl.cl_program;
+
+@Singleton
+@Slf4j
+class OpenCLManager
+{
+	private static final String GL_SHARING_PLATFORM_EXT = "cl_khr_gl_sharing";
+
+	private static final String KERNEL_NAME_UNORDERED = "computeUnordered";
+	private static final String KERNEL_NAME_LARGE = "computeLarge";
+
+	private static final int MIN_WORK_GROUP_SIZE = 256;
+	private static final int SMALL_SIZE = GpuPlugin.SMALL_TRIANGLE_COUNT;
+	private static final int LARGE_SIZE = GpuPlugin.MAX_TRIANGLE;
+	//  struct shared_data {
+	//      int totalNum[12];
+	//      int totalDistance[12];
+	//      int totalMappedNum[18];
+	//      int min10;
+	//      int dfs[0];
+	//  };
+	private static final int SHARED_SIZE = 12 + 12 + 18 + 1; // in ints
+
+	// The number of faces each worker processes in the two kernels
+	private int largeFaceCount;
+	private int smallFaceCount;
+
+	private cl_platform_id platform;
+	private cl_device_id device;
+	cl_context context;
+	private cl_command_queue commandQueue;
+
+	private cl_program programUnordered;
+	private cl_program programSmall;
+	private cl_program programLarge;
+
+	private cl_kernel kernelUnordered;
+	private cl_kernel kernelSmall;
+	private cl_kernel kernelLarge;
+
+	void init(GL4 gl)
+	{
+		CL.setExceptionsEnabled(true);
+
+		switch (OSType.getOSType())
+		{
+			case Windows:
+			case Linux:
+				initPlatform();
+				initDevice();
+				initContext(gl);
+				break;
+			case MacOS:
+				initMacOS(gl);
+				break;
+			default:
+				throw new RuntimeException("Unsupported OS Type " + OSType.getOSType().name());
+		}
+		ensureMinWorkGroupSize();
+		initQueue();
+		compilePrograms();
+	}
+
+	void cleanup()
+	{
+		if (programUnordered != null)
+		{
+			CL.clReleaseProgram(programUnordered);
+			programUnordered = null;
+		}
+
+		if (programSmall != null)
+		{
+			CL.clReleaseProgram(programSmall);
+			programSmall = null;
+		}
+
+		if (programLarge != null)
+		{
+			CL.clReleaseProgram(programLarge);
+			programLarge = null;
+		}
+
+		if (kernelUnordered != null)
+		{
+			CL.clReleaseKernel(kernelUnordered);
+			kernelUnordered = null;
+		}
+
+		if (kernelSmall != null)
+		{
+			CL.clReleaseKernel(kernelSmall);
+			kernelSmall = null;
+		}
+
+		if (kernelLarge != null)
+		{
+			CL.clReleaseKernel(kernelLarge);
+			kernelLarge = null;
+		}
+
+		if (commandQueue != null)
+		{
+			CL.clReleaseCommandQueue(commandQueue);
+			commandQueue = null;
+		}
+
+		if (context != null)
+		{
+			CL.clReleaseContext(context);
+			context = null;
+		}
+
+		if (device != null)
+		{
+			CL.clReleaseDevice(device);
+			device = null;
+		}
+	}
+
+	private String logPlatformInfo(cl_platform_id platform, int param)
+	{
+		long[] size = new long[1];
+		clGetPlatformInfo(platform, param, 0, null, size);
+
+		byte[] buffer = new byte[(int) size[0]];
+		clGetPlatformInfo(platform, param, buffer.length, Pointer.to(buffer), null);
+		String platformInfo = new String(buffer, Charsets.UTF_8);
+		log.debug("Platform: {}, {}", stringFor_cl_platform_info(param), platformInfo);
+		return platformInfo;
+	}
+
+	private void logBuildInfo(cl_program program, int param)
+	{
+		long[] size = new long[1];
+		clGetProgramBuildInfo(program, device, param, 0, null, size);
+
+		ByteBuffer buffer = ByteBuffer.allocateDirect((int) size[0]);
+		clGetProgramBuildInfo(program, device, param, buffer.limit(), Pointer.toBuffer(buffer), null);
+
+		switch (param)
+		{
+			case CL_PROGRAM_BUILD_STATUS:
+				log.debug("Build status: {}, {}", stringFor_cl_program_build_info(param), stringFor_cl_build_status(buffer.getInt()));
+				break;
+			case CL_PROGRAM_BINARY_TYPE:
+				log.debug("Binary type: {}, {}", stringFor_cl_program_build_info(param), stringFor_cl_program_binary_type(buffer.getInt()));
+				break;
+			case CL_PROGRAM_BUILD_LOG:
+				String buildLog = StandardCharsets.US_ASCII.decode(buffer).toString();
+				log.trace("Build log: {}, {}", stringFor_cl_program_build_info(param), buildLog);
+				break;
+			case CL_PROGRAM_BUILD_OPTIONS:
+				String message = StandardCharsets.US_ASCII.decode(buffer).toString();
+				log.debug("Build options: {}, {}", stringFor_cl_program_build_info(param), message);
+				break;
+			default:
+				throw new IllegalArgumentException();
+		}
+	}
+
+	private void initPlatform()
+	{
+		int[] platformCount = new int[1];
+		clGetPlatformIDs(0, null, platformCount);
+		if (platformCount[0] == 0)
+		{
+			throw new RuntimeException("No compute platforms found");
+		}
+
+		cl_platform_id[] platforms = new cl_platform_id[platformCount[0]];
+		clGetPlatformIDs(platforms.length, platforms, null);
+
+		for (cl_platform_id platform : platforms)
+		{
+			log.debug("Found cl_platform_id {}", platform);
+			logPlatformInfo(platform, CL_PLATFORM_PROFILE);
+			logPlatformInfo(platform, CL_PLATFORM_VERSION);
+			logPlatformInfo(platform, CL_PLATFORM_NAME);
+			logPlatformInfo(platform, CL_PLATFORM_VENDOR);
+			String[] extensions = logPlatformInfo(platform, CL_PLATFORM_EXTENSIONS).split(" ");
+			if (Arrays.stream(extensions).noneMatch(s -> s.equals(GL_SHARING_PLATFORM_EXT)))
+			{
+				throw new RuntimeException("Platform does not support OpenGL buffer sharing");
+			}
+		}
+
+		platform = platforms[0];
+		log.debug("Selected cl_platform_id {}", platform);
+	}
+
+	private void initDevice()
+	{
+		int[] deviceCount = new int[1];
+		clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, null, deviceCount);
+		if (deviceCount[0] == 0)
+		{
+			throw new RuntimeException("No compute devices found");
+		}
+
+		cl_device_id[] devices = new cl_device_id[(int) deviceCount[0]];
+		clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, devices.length, devices, null);
+
+		for (cl_device_id device : devices)
+		{
+			long[] size = new long[1];
+			clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, null, size);
+
+			byte[] devInfoBuf = new byte[(int) size[0]];
+			clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, devInfoBuf.length, Pointer.to(devInfoBuf), null);
+
+			log.debug("Found cl_device_id: {}", device);
+			log.debug("Device extensions: {}", new String(devInfoBuf, Charsets.UTF_8));
+		}
+
+		device = devices[0];
+		log.debug("Selected cl_device_id {}", device);
+	}
+
+	private void initContext(GL4 gl)
+	{
+		// set computation platform
+		cl_context_properties contextProps = new cl_context_properties();
+		contextProps.addProperty(CL_CONTEXT_PLATFORM, platform);
+
+		// pull gl context
+		GLContext glContext = gl.getContext();
+		log.debug("Got GLContext of type {}", glContext.getClass().getSimpleName());
+		if (!glContext.isCurrent())
+		{
+			throw new RuntimeException("Can't create OpenCL context from inactive GL Context");
+		}
+
+		// get correct props based on os
+		long glContextHandle = glContext.getHandle();
+		GLContextImpl glContextImpl = (GLContextImpl) glContext;
+		GLDrawableImpl glDrawableImpl = glContextImpl.getDrawableImpl();
+		NativeSurface nativeSurface = glDrawableImpl.getNativeSurface();
+
+		if (glContext instanceof X11GLXContext)
+		{
+			long displayHandle = nativeSurface.getDisplayHandle();
+			contextProps.addProperty(CL_GL_CONTEXT_KHR, glContextHandle);
+			contextProps.addProperty(CL_GLX_DISPLAY_KHR, displayHandle);
+		}
+		else if (glContext instanceof WindowsWGLContext)
+		{
+			long surfaceHandle = nativeSurface.getSurfaceHandle();
+			contextProps.addProperty(CL_GL_CONTEXT_KHR, glContextHandle);
+			contextProps.addProperty(CL_WGL_HDC_KHR, surfaceHandle);
+		}
+		else if (glContext instanceof EGLContext)
+		{
+			long displayHandle = nativeSurface.getDisplayHandle();
+			contextProps.addProperty(CL_GL_CONTEXT_KHR, glContextHandle);
+			contextProps.addProperty(CL_EGL_DISPLAY_KHR, displayHandle);
+		}
+
+		log.debug("Creating context with props: {}", contextProps);
+		context = clCreateContext(contextProps, 1, new cl_device_id[]{device}, null, null, null);
+		log.debug("Created compute context {}", context);
+	}
+
+	private void initMacOS(GL4 gl)
+	{
+		// get sharegroup from gl context
+		GLContext glContext = gl.getContext();
+		if (!glContext.isCurrent())
+		{
+			throw new RuntimeException("Can't create context from inactive GL");
+		}
+		long cglContext = CGL.CGLGetCurrentContext();
+		long cglShareGroup = CGL.CGLGetShareGroup(cglContext);
+
+		// build context props
+		cl_context_properties contextProps = new cl_context_properties();
+		contextProps.addProperty(CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, cglShareGroup);
+
+		// ask macos to make the context for us
+		log.debug("Creating context with props: {}", contextProps);
+		context = clCreateContext(contextProps, 0, null, null, null, null);
+
+		// pull the compute device out of the provided context
+		device = new cl_device_id();
+		clGetGLContextInfoAPPLE(context, cglContext, CL_CGL_DEVICE_FOR_CURRENT_VIRTUAL_SCREEN_APPLE, Sizeof.cl_device_id, Pointer.to(device), null);
+
+		log.debug("Got macOS CLGL compute device {}", device);
+	}
+
+	private void ensureMinWorkGroupSize()
+	{
+		long[] maxWorkGroupSize = new long[1];
+		clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, Sizeof.size_t, Pointer.to(maxWorkGroupSize), null);
+		log.debug("Device CL_DEVICE_MAX_WORK_GROUP_SIZE: {}", maxWorkGroupSize[0]);
+
+		if (maxWorkGroupSize[0] < MIN_WORK_GROUP_SIZE)
+		{
+			throw new RuntimeException("Compute device does not support min work group size " + MIN_WORK_GROUP_SIZE);
+		}
+
+		// Largest power of 2 less than or equal to maxWorkGroupSize
+		int groupSize = 0x80000000 >>> Integer.numberOfLeadingZeros((int) maxWorkGroupSize[0]);
+		largeFaceCount = LARGE_SIZE / (Math.min(groupSize, LARGE_SIZE));
+		smallFaceCount = SMALL_SIZE / (Math.min(groupSize, SMALL_SIZE));
+
+		log.debug("Face counts: small: {}, large: {}", smallFaceCount, largeFaceCount);
+	}
+
+	private void initQueue()
+	{
+		long[] l = new long[1];
+		clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, Sizeof.cl_long, Pointer.to(l), null);
+
+		commandQueue = clCreateCommandQueue(context, device, l[0] & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, null);
+		log.debug("Created command_queue {}, properties {}", commandQueue, l[0] & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
+	}
+
+	private cl_program compileProgram(String programSource)
+	{
+		log.trace("Compiling program:\n {}", programSource);
+		cl_program program = clCreateProgramWithSource(context, 1, new String[]{programSource}, null, null);
+
+		try
+		{
+			clBuildProgram(program, 0, null, null, null, null);
+		}
+		catch (CLException e)
+		{
+			logBuildInfo(program, CL_PROGRAM_BUILD_LOG);
+			throw e;
+		}
+
+		logBuildInfo(program, CL_PROGRAM_BUILD_STATUS);
+		logBuildInfo(program, CL_PROGRAM_BINARY_TYPE);
+		logBuildInfo(program, CL_PROGRAM_BUILD_OPTIONS);
+		logBuildInfo(program, CL_PROGRAM_BUILD_LOG);
+		return program;
+	}
+
+	private cl_kernel getKernel(cl_program program, String kernelName)
+	{
+		cl_kernel kernel = clCreateKernel(program, kernelName, null);
+		log.debug("Loaded kernel {} for program {}", kernelName, program);
+		return kernel;
+	}
+
+	private void compilePrograms()
+	{
+		Template templateSmall = new Template()
+			.addInclude(OpenCLManager.class)
+			.add(key -> key.equals("FACE_COUNT") ? ("#define FACE_COUNT " + smallFaceCount) : null);
+		Template templateLarge = new Template()
+			.addInclude(OpenCLManager.class)
+			.add(key -> key.equals("FACE_COUNT") ? ("#define FACE_COUNT " + largeFaceCount) : null);
+
+		String unordered = new Template()
+			.addInclude(OpenCLManager.class)
+			.load("comp_unordered.cl");
+		String small = templateSmall.load("comp.cl");
+		String large = templateLarge.load("comp.cl");
+
+		programUnordered = compileProgram(unordered);
+		programSmall = compileProgram(small);
+		programLarge = compileProgram(large);
+
+		kernelUnordered = getKernel(programUnordered, KERNEL_NAME_UNORDERED);
+		kernelSmall = getKernel(programSmall, KERNEL_NAME_LARGE);
+		kernelLarge = getKernel(programLarge, KERNEL_NAME_LARGE);
+	}
+
+	void compute(int unorderedModels, int smallModels, int largeModels,
+		GLBuffer sceneVertexBuffer,
+		GLBuffer sceneUvBuffer,
+		GLBuffer vertexBuffer,
+		GLBuffer uvBuffer,
+		GLBuffer unorderedBuffer,
+		GLBuffer smallBuffer,
+		GLBuffer largeBuffer,
+		GLBuffer outVertexBuffer,
+		GLBuffer outUvBuffer,
+		GLBuffer uniformBuffer
+	)
+	{
+		cl_mem[] glBuffersAll = {
+			sceneVertexBuffer.cl_mem,
+			sceneUvBuffer.cl_mem,
+			unorderedBuffer.cl_mem,
+			smallBuffer.cl_mem,
+			largeBuffer.cl_mem,
+			vertexBuffer.cl_mem,
+			uvBuffer.cl_mem,
+			outVertexBuffer.cl_mem,
+			outUvBuffer.cl_mem,
+			uniformBuffer.cl_mem,
+		};
+		cl_mem[] glBuffers = Arrays.stream(glBuffersAll)
+			.filter(Objects::nonNull)
+			.toArray(cl_mem[]::new);
+
+		cl_event acquireGLBuffers = new cl_event();
+		clEnqueueAcquireGLObjects(commandQueue, glBuffers.length, glBuffers, 0, null, acquireGLBuffers);
+
+		cl_event[] computeEvents = {
+			new cl_event(),
+			new cl_event(),
+			new cl_event()
+		};
+		int numComputeEvents = 0;
+
+		if (unorderedModels > 0)
+		{
+			clSetKernelArg(kernelUnordered, 0, Sizeof.cl_mem, unorderedBuffer.ptr());
+			clSetKernelArg(kernelUnordered, 1, Sizeof.cl_mem, sceneVertexBuffer.ptr());
+			clSetKernelArg(kernelUnordered, 2, Sizeof.cl_mem, vertexBuffer.ptr());
+			clSetKernelArg(kernelUnordered, 3, Sizeof.cl_mem, sceneUvBuffer.ptr());
+			clSetKernelArg(kernelUnordered, 4, Sizeof.cl_mem, uvBuffer.ptr());
+			clSetKernelArg(kernelUnordered, 5, Sizeof.cl_mem, outVertexBuffer.ptr());
+			clSetKernelArg(kernelUnordered, 6, Sizeof.cl_mem, outUvBuffer.ptr());
+
+			// queue compute call after acquireGLBuffers
+			clEnqueueNDRangeKernel(commandQueue, kernelUnordered, 1, null,
+				new long[]{unorderedModels * 6L}, new long[]{6}, 1, new cl_event[]{acquireGLBuffers}, computeEvents[numComputeEvents++]);
+		}
+
+		if (smallModels > 0)
+		{
+			clSetKernelArg(kernelSmall, 0, (SHARED_SIZE + SMALL_SIZE) * Integer.BYTES, null);
+			clSetKernelArg(kernelSmall, 1, Sizeof.cl_mem, smallBuffer.ptr());
+			clSetKernelArg(kernelSmall, 2, Sizeof.cl_mem, sceneVertexBuffer.ptr());
+			clSetKernelArg(kernelSmall, 3, Sizeof.cl_mem, vertexBuffer.ptr());
+			clSetKernelArg(kernelSmall, 4, Sizeof.cl_mem, sceneUvBuffer.ptr());
+			clSetKernelArg(kernelSmall, 5, Sizeof.cl_mem, uvBuffer.ptr());
+			clSetKernelArg(kernelSmall, 6, Sizeof.cl_mem, outVertexBuffer.ptr());
+			clSetKernelArg(kernelSmall, 7, Sizeof.cl_mem, outUvBuffer.ptr());
+			clSetKernelArg(kernelSmall, 8, Sizeof.cl_mem, uniformBuffer.ptr());
+
+			clEnqueueNDRangeKernel(commandQueue, kernelSmall, 1, null,
+				new long[]{smallModels * (SMALL_SIZE / smallFaceCount)}, new long[]{SMALL_SIZE / smallFaceCount}, 1, new cl_event[]{acquireGLBuffers}, computeEvents[numComputeEvents++]);
+		}
+
+		if (largeModels > 0)
+		{
+			clSetKernelArg(kernelLarge, 0, (SHARED_SIZE + LARGE_SIZE) * Integer.BYTES, null);
+			clSetKernelArg(kernelLarge, 1, Sizeof.cl_mem, largeBuffer.ptr());
+			clSetKernelArg(kernelLarge, 2, Sizeof.cl_mem, sceneVertexBuffer.ptr());
+			clSetKernelArg(kernelLarge, 3, Sizeof.cl_mem, vertexBuffer.ptr());
+			clSetKernelArg(kernelLarge, 4, Sizeof.cl_mem, sceneUvBuffer.ptr());
+			clSetKernelArg(kernelLarge, 5, Sizeof.cl_mem, uvBuffer.ptr());
+			clSetKernelArg(kernelLarge, 6, Sizeof.cl_mem, outVertexBuffer.ptr());
+			clSetKernelArg(kernelLarge, 7, Sizeof.cl_mem, outUvBuffer.ptr());
+			clSetKernelArg(kernelLarge, 8, Sizeof.cl_mem, uniformBuffer.ptr());
+
+			clEnqueueNDRangeKernel(commandQueue, kernelLarge, 1, null,
+				new long[]{(long) largeModels * (LARGE_SIZE / largeFaceCount)}, new long[]{LARGE_SIZE / largeFaceCount}, 1, new cl_event[]{acquireGLBuffers}, computeEvents[numComputeEvents++]);
+		}
+
+		clEnqueueReleaseGLObjects(commandQueue, glBuffers.length, glBuffers, numComputeEvents, computeEvents, null);
+	}
+
+	void finish()
+	{
+		clFinish(commandQueue);
+	}
+}
--- a/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/cl_types.cl
+++ b/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/cl_types.cl
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021, Adam <Adam@sigterm.info>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+struct uniform {
+  int cameraYaw;
+  int cameraPitch;
+  int centerX;
+  int centerY;
+  int zoom;
+  int cameraX;
+  int cameraY;
+  int cameraZ;
+  int4 sinCosTable[2048];
+};
+
+struct shared_data {
+  int totalNum[12]; // number of faces with a given priority
+  int totalDistance[12]; // sum of distances to faces of a given priority
+  int totalMappedNum[18]; // number of faces with a given adjusted priority
+  int min10; // minimum distance to a face of priority 10
+  int dfs[0]; // packed face id and distance, size 512 for small, 4096 for large
+};
+
+struct modelinfo {
+  int offset;   // offset into buffer
+  int uvOffset; // offset into uv buffer
+  int size;     // length in faces
+  int idx;      // write idx in target buffer
+  int flags;    // radius, orientation
+  int x;        // scene position x
+  int y;        // scene position y
+  int z;        // scene position z
+};
--- a/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/common.cl
+++ b/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/common.cl
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2021, Adam <Adam@sigterm.info>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define PI 3.1415926535897932384626433832795f
+#define UNIT PI / 1024.0f
+
+float3 toScreen(int4 vertex, int cameraYaw, int cameraPitch, int centerX, int centerY, int zoom) {
+  float yawSin = sin(cameraYaw * UNIT);
+  float yawCos = cos(cameraYaw * UNIT);
+
+  float pitchSin = sin(cameraPitch * UNIT);
+  float pitchCos = cos(cameraPitch * UNIT);
+
+  float rotatedX = (vertex.z * yawSin) + (vertex.x * yawCos);
+  float rotatedZ = (vertex.z * yawCos) - (vertex.x * yawSin);
+
+  float var13 = (vertex.y * pitchCos) - (rotatedZ * pitchSin);
+  float var12 = (vertex.y * pitchSin) + (rotatedZ * pitchCos);
+
+  float x = rotatedX * zoom / var12 + centerX;
+  float y = var13 * zoom / var12 + centerY;
+  float z = -var12; // in OpenGL depth is negative
+
+  return (float3) (x, y, z);
+}
+
+/*
+ * Rotate a vertex by a given orientation in JAU
+ */
+int4 rotate_vertex(__constant struct uniform *uni, int4 vertex, int orientation) {
+  int4 sinCos = uni->sinCosTable[orientation];
+  int s = sinCos.x;
+  int c = sinCos.y;
+  int x = vertex.z * s + vertex.x * c >> 16;
+  int z = vertex.z * c - vertex.x * s >> 16;
+  return (int4)(x, vertex.y, z, vertex.w);
+}
+
+/*
+ * Calculate the distance to a vertex given the camera angle
+ */
+int vertex_distance(int4 vertex, int cameraYaw, int cameraPitch) {
+  int yawSin = (int)(65536.0f * sin(cameraYaw * UNIT));
+  int yawCos = (int)(65536.0f * cos(cameraYaw * UNIT));
+
+  int pitchSin = (int)(65536.0f * sin(cameraPitch * UNIT));
+  int pitchCos = (int)(65536.0f * cos(cameraPitch * UNIT));
+
+  int j = vertex.z * yawCos - vertex.x * yawSin >> 16;
+  int l = vertex.y * pitchSin + j * pitchCos >> 16;
+
+  return l;
+}
+
+/*
+ * Calculate the distance to a face
+ */
+int face_distance(int4 vA, int4 vB, int4 vC, int cameraYaw, int cameraPitch) {
+  int dvA = vertex_distance(vA, cameraYaw, cameraPitch);
+  int dvB = vertex_distance(vB, cameraYaw, cameraPitch);
+  int dvC = vertex_distance(vC, cameraYaw, cameraPitch);
+  int faceDistance = (dvA + dvB + dvC) / 3;
+  return faceDistance;
+}
+
+/*
+ * Test if a face is visible (not backward facing)
+ */
+bool face_visible(__constant struct uniform *uni, int4 vA, int4 vB, int4 vC, int4 position) {
+  // Move model to scene location, and account for camera offset
+  int4 cameraPos = (int4)(uni->cameraX, uni->cameraY, uni->cameraZ, 0);
+  vA += position - cameraPos;
+  vB += position - cameraPos;
+  vC += position - cameraPos;
+
+  float3 sA = toScreen(vA, uni->cameraYaw, uni->cameraPitch, uni->centerX, uni->centerY, uni->zoom);
+  float3 sB = toScreen(vB, uni->cameraYaw, uni->cameraPitch, uni->centerX, uni->centerY, uni->zoom);
+  float3 sC = toScreen(vC, uni->cameraYaw, uni->cameraPitch, uni->centerX, uni->centerY, uni->zoom);
+
+  return (sA.x - sB.x) * (sC.y - sB.y) - (sC.x - sB.x) * (sA.y - sB.y) > 0;
+}
+
--- a/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/comp.cl
+++ b/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/comp.cl
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2021, Adam <Adam@sigterm.info>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include FACE_COUNT
+
+#include cl_types.cl
+#include to_screen.cl
+#include common.cl
+#include priority_render.cl
+
+__kernel
+__attribute__((work_group_size_hint(256, 1, 1)))
+void computeLarge(
+  __local struct shared_data *shared,
+  __global const struct modelinfo *ol,
+  __global const int4 *vb,
+  __global const int4 *tempvb,
+  __global const float4 *uv,
+  __global const float4 *tempuv,
+  __global int4 *vout,
+  __global float4 *uvout,
+  __constant struct uniform *uni) {
+
+  size_t groupId = get_group_id(0);
+  size_t localId = get_local_id(0) * FACE_COUNT;
+  struct modelinfo minfo = ol[groupId];
+  int4 pos = (int4)(minfo.x, minfo.y, minfo.z, 0);
+
+  if (localId == 0) {
+    shared->min10 = 1600;
+    for (int i = 0; i < 12; ++i) {
+      shared->totalNum[i] = 0;
+      shared->totalDistance[i] = 0;
+    }
+    for (int i = 0; i < 18; ++i) {
+      shared->totalMappedNum[i] = 0;
+    }
+  }
+
+  int prio[FACE_COUNT];
+  int dis[FACE_COUNT];
+  int4 v1[FACE_COUNT];
+  int4 v2[FACE_COUNT];
+  int4 v3[FACE_COUNT];
+
+  for (int i = 0; i < FACE_COUNT; i++) {
+    get_face(shared, uni, vb, tempvb, localId + i, minfo, uni->cameraYaw, uni->cameraPitch, &prio[i], &dis[i], &v1[i], &v2[i], &v3[i]);
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int i = 0; i < FACE_COUNT; i++) {
+    add_face_prio_distance(shared, uni, localId + i, minfo, v1[i], v2[i], v3[i], prio[i], dis[i], pos);
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  int prioAdj[FACE_COUNT];
+  int idx[FACE_COUNT];
+  for (int i = 0; i < FACE_COUNT; i++) {
+    idx[i] = map_face_priority(shared, localId + i, minfo, prio[i], dis[i], &prioAdj[i]);
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int i = 0; i < FACE_COUNT; i++) {
+    insert_dfs(shared, localId + i, minfo, prioAdj[i], dis[i], idx[i]);
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int i = 0; i < FACE_COUNT; i++) {
+    sort_and_insert(shared, uv, tempuv, vout, uvout, localId + i, minfo, prioAdj[i], dis[i], v1[i], v2[i], v3[i]);
+  }
+}
--- a/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/comp_unordered.cl
+++ b/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/comp_unordered.cl
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2021, Adam <Adam@sigterm.info>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include cl_types.cl
+
+__kernel
+__attribute__((reqd_work_group_size(6, 1, 1)))
+void computeUnordered(__global const struct modelinfo *ol,
+                      __global const int4 *vb,
+                      __global const int4 *tempvb,
+                      __global const float4 *uv,
+                      __global const float4 *tempuv,
+                      __global int4 *vout,
+                      __global float4 *uvout) {
+  size_t groupId = get_group_id(0);
+  size_t localId = get_local_id(0);
+  struct modelinfo minfo = ol[groupId];
+
+  int offset = minfo.offset;
+  int size = minfo.size;
+  int outOffset = minfo.idx;
+  int uvOffset = minfo.uvOffset;
+  int flags = minfo.flags;
+  int4 pos = (int4)(minfo.x, minfo.y, minfo.z, 0);
+
+  if (localId >= size) {
+    return;
+  }
+
+  uint ssboOffset = localId;
+  int4 thisA, thisB, thisC;
+
+  // Grab triangle vertices from the correct buffer
+  if (flags < 0) {
+    thisA = vb[offset + ssboOffset * 3];
+    thisB = vb[offset + ssboOffset * 3 + 1];
+    thisC = vb[offset + ssboOffset * 3 + 2];
+  } else {
+    thisA = tempvb[offset + ssboOffset * 3];
+    thisB = tempvb[offset + ssboOffset * 3 + 1];
+    thisC = tempvb[offset + ssboOffset * 3 + 2];
+  }
+
+  uint myOffset = localId;
+
+  // position vertices in scene and write to out buffer
+  vout[outOffset + myOffset * 3]     = pos + thisA;
+  vout[outOffset + myOffset * 3 + 1] = pos + thisB;
+  vout[outOffset + myOffset * 3 + 2] = pos + thisC;
+
+  if (uvOffset < 0) {
+    uvout[outOffset + myOffset * 3]     = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
+    uvout[outOffset + myOffset * 3 + 1] = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
+    uvout[outOffset + myOffset * 3 + 2] = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
+  } else if (flags >= 0) {
+    uvout[outOffset + myOffset * 3]     = tempuv[uvOffset + localId * 3];
+    uvout[outOffset + myOffset * 3 + 1] = tempuv[uvOffset + localId * 3 + 1];
+    uvout[outOffset + myOffset * 3 + 2] = tempuv[uvOffset + localId * 3 + 2];
+  } else {
+    uvout[outOffset + myOffset * 3]     = uv[uvOffset + localId * 3];
+    uvout[outOffset + myOffset * 3 + 1] = uv[uvOffset + localId * 3 + 1];
+    uvout[outOffset + myOffset * 3 + 2] = uv[uvOffset + localId * 3 + 2];
+  }
+}
--- a/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/priority_render.cl
+++ b/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/priority_render.cl
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2021, Adam <Adam@sigterm.info>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Calculate adjusted priority for a face with a given priority, distance, and
+// model global min10 and face distance averages. This allows positioning faces
+// with priorities 10/11 into the correct 'slots' resulting in 18 possible
+// adjusted priorities
+int priority_map(int p, int distance, int _min10, int avg1, int avg2, int avg3) {
+  // (10, 11)  0  1  2  (10, 11)  3  4  (10, 11)  5  6  7  8  9  (10, 11)
+  //   0   1   2  3  4    5   6   7  8    9  10  11 12 13 14 15   16  17
+  switch (p) {
+    case 0: return 2;
+    case 1: return 3;
+    case 2: return 4;
+    case 3: return 7;
+    case 4: return 8;
+    case 5: return 11;
+    case 6: return 12;
+    case 7: return 13;
+    case 8: return 14;
+    case 9: return 15;
+    case 10:
+      if (distance > avg1) {
+        return 0;
+      } else if (distance > avg2) {
+        return 5;
+      } else if (distance > avg3) {
+        return 9;
+      } else {
+        return 16;
+      }
+    case 11:
+      if (distance > avg1 && _min10 > avg1) {
+        return 1;
+      } else if (distance > avg2 && (_min10 > avg1 || _min10 > avg2)) {
+        return 6;
+      } else if (distance > avg3 && (_min10 > avg1 || _min10 > avg2 || _min10 > avg3)) {
+        return 10;
+      } else {
+        return 17;
+      }
+    default:
+      return -1;
+  }
+}
+
+// calculate the number of faces with a lower adjusted priority than
+// the given adjusted priority
+int count_prio_offset(__local struct shared_data *shared, int priority) {
+  int total = 0;
+  switch (priority) {
+    case 17:
+      total += shared->totalMappedNum[16];
+    case 16:
+      total += shared->totalMappedNum[15];
+    case 15:
+      total += shared->totalMappedNum[14];
+    case 14:
+      total += shared->totalMappedNum[13];
+    case 13:
+      total += shared->totalMappedNum[12];
+    case 12:
+      total += shared->totalMappedNum[11];
+    case 11:
+      total += shared->totalMappedNum[10];
+    case 10:
+      total += shared->totalMappedNum[9];
+    case 9:
+      total += shared->totalMappedNum[8];
+    case 8:
+      total += shared->totalMappedNum[7];
+    case 7:
+      total += shared->totalMappedNum[6];
+    case 6:
+      total += shared->totalMappedNum[5];
+    case 5:
+      total += shared->totalMappedNum[4];
+    case 4:
+      total += shared->totalMappedNum[3];
+    case 3:
+      total += shared->totalMappedNum[2];
+    case 2:
+      total += shared->totalMappedNum[1];
+    case 1:
+      total += shared->totalMappedNum[0];
+    case 0:
+      return total;
+  }
+}
+
+void get_face(
+  __local struct shared_data *shared,
+  __constant struct uniform *uni,
+  __global const int4 *vb,
+  __global const int4 *tempvb,
+  uint localId, struct modelinfo minfo, int cameraYaw, int cameraPitch,
+  /* out */ int *prio, int *dis, int4 *o1, int4 *o2, int4 *o3) {
+  int size = minfo.size;
+  int offset = minfo.offset;
+  int flags = minfo.flags;
+  uint ssboOffset;
+
+  if (localId < size) {
+    ssboOffset = localId;
+  } else {
+    ssboOffset = 0;
+  }
+
+  int4 thisA;
+  int4 thisB;
+  int4 thisC;
+
+  // Grab triangle vertices from the correct buffer
+  if (flags < 0) {
+    thisA = vb[offset + ssboOffset * 3];
+    thisB = vb[offset + ssboOffset * 3 + 1];
+    thisC = vb[offset + ssboOffset * 3 + 2];
+  } else {
+    thisA = tempvb[offset + ssboOffset * 3];
+    thisB = tempvb[offset + ssboOffset * 3 + 1];
+    thisC = tempvb[offset + ssboOffset * 3 + 2];
+  }
+
+  if (localId < size) {
+    int radius = (flags & 0x7fffffff) >> 12;
+    int orientation = flags & 0x7ff;
+
+    // rotate for model orientation
+    int4 thisrvA = rotate_vertex(uni, thisA, orientation);
+    int4 thisrvB = rotate_vertex(uni, thisB, orientation);
+    int4 thisrvC = rotate_vertex(uni, thisC, orientation);
+
+    // calculate distance to face
+    int thisPriority = (thisA.w >> 16) & 0xff;// all vertices on the face have the same priority
+    int thisDistance;
+    if (radius == 0) {
+      thisDistance = 0;
+    } else {
+      thisDistance = face_distance(thisrvA, thisrvB, thisrvC, cameraYaw, cameraPitch) + radius;
+    }
+
+    *o1 = thisrvA;
+    *o2 = thisrvB;
+    *o3 = thisrvC;
+
+    *prio = thisPriority;
+    *dis = thisDistance;
+  } else {
+    *o1 = (int4)(0, 0, 0, 0);
+    *o2 = (int4)(0, 0, 0, 0);
+    *o3 = (int4)(0, 0, 0, 0);
+    *prio = 0;
+    *dis = 0;
+  }
+}
+
+void add_face_prio_distance(
+  __local struct shared_data *shared,
+  __constant struct uniform *uni,
+  uint localId, struct modelinfo minfo, int4 thisrvA, int4 thisrvB, int4 thisrvC, int thisPriority, int thisDistance, int4 pos) {
+  if (localId < minfo.size) {
+    // if the face is not culled, it is calculated into priority distance averages
+    if (face_visible(uni, thisrvA, thisrvB, thisrvC, pos)) {
+      atomic_add(&shared->totalNum[thisPriority], 1);
+      atomic_add(&shared->totalDistance[thisPriority], thisDistance);
+
+      // calculate minimum distance to any face of priority 10 for positioning the 11 faces later
+      if (thisPriority == 10) {
+        atomic_min(&shared->min10, thisDistance);
+      }
+    }
+  }
+}
+
+int map_face_priority(__local struct shared_data *shared, uint localId, struct modelinfo minfo, int thisPriority, int thisDistance, int *prio) {
+  int size = minfo.size;
+
+  // Compute average distances for 0/2, 3/4, and 6/8
+
+  if (localId < size) {
+    int avg1 = 0;
+    int avg2 = 0;
+    int avg3 = 0;
+
+    if (shared->totalNum[1] > 0 || shared->totalNum[2] > 0) {
+      avg1 = (shared->totalDistance[1] + shared->totalDistance[2]) / (shared->totalNum[1] + shared->totalNum[2]);
+    }
+
+    if (shared->totalNum[3] > 0 || shared->totalNum[4] > 0) {
+      avg2 = (shared->totalDistance[3] + shared->totalDistance[4]) / (shared->totalNum[3] + shared->totalNum[4]);
+    }
+
+    if (shared->totalNum[6] > 0 || shared->totalNum[8] > 0) {
+      avg3 = (shared->totalDistance[6] + shared->totalDistance[8]) / (shared->totalNum[6] + shared->totalNum[8]);
+    }
+
+    int adjPrio = priority_map(thisPriority, thisDistance, shared->min10, avg1, avg2, avg3);
+    int prioIdx = atomic_add(&shared->totalMappedNum[adjPrio], 1);
+
+    *prio = adjPrio;
+    return prioIdx;
+  }
+
+  *prio = 0;
+  return 0;
+}
+
+void insert_dfs(__local struct shared_data *shared, uint localId, struct modelinfo minfo, int adjPrio, int distance, int prioIdx) {
+  int size = minfo.size;
+
+  if (localId < size) {
+    // calculate base offset into dfs based on number of faces with a lower priority
+    int baseOff = count_prio_offset(shared, adjPrio);
+    // store into face array offset array by unique index
+    shared->dfs[baseOff + prioIdx] = ((int) localId << 16) | distance;
+  }
+}
+
+void sort_and_insert(
+  __local struct shared_data *shared,
+  __global const float4 *uv,
+  __global const float4 *tempuv,
+  __global int4 *vout,
+  __global float4 *uvout,
+  uint localId, struct modelinfo minfo, int thisPriority, int thisDistance, int4 thisrvA, int4 thisrvB, int4 thisrvC) {
+  /* compute face distance */
+  int size = minfo.size;
+
+  if (localId < size) {
+    int outOffset = minfo.idx;
+    int uvOffset = minfo.uvOffset;
+    int flags = minfo.flags;
+    int4 pos = (int4)(minfo.x, minfo.y, minfo.z, 0);
+
+    const int priorityOffset = count_prio_offset(shared, thisPriority);
+    const int numOfPriority = shared->totalMappedNum[thisPriority];
+    int start = priorityOffset; // index of first face with this priority
+    int end = priorityOffset + numOfPriority; // index of last face with this priority
+    int myOffset = priorityOffset;
+
+    // we only have to order faces against others of the same priority
+    // calculate position this face will be in
+    for (int i = start; i < end; ++i) {
+      int d1 = shared->dfs[i];
+      int theirId = d1 >> 16;
+      int theirDistance = d1 & 0xffff;
+
+      // the closest faces draw last, so have the highest index
+      // if two faces have the same distance, the one with the
+      // higher id draws last
+      if ((theirDistance > thisDistance)
+        || (theirDistance == thisDistance && theirId < localId)) {
+        ++myOffset;
+      }
+    }
+
+    // position vertices in scene and write to out buffer
+    vout[outOffset + myOffset * 3]     = pos + thisrvA;
+    vout[outOffset + myOffset * 3 + 1] = pos + thisrvB;
+    vout[outOffset + myOffset * 3 + 2] = pos + thisrvC;
+
+    if (uvOffset < 0) {
+      uvout[outOffset + myOffset * 3]     = (float4)(0, 0, 0, 0);
+      uvout[outOffset + myOffset * 3 + 1] = (float4)(0, 0, 0, 0);
+      uvout[outOffset + myOffset * 3 + 2] = (float4)(0, 0, 0, 0);
+    } else if (flags >= 0) {
+      uvout[outOffset + myOffset * 3]     = tempuv[uvOffset + localId * 3];
+      uvout[outOffset + myOffset * 3 + 1] = tempuv[uvOffset + localId * 3 + 1];
+      uvout[outOffset + myOffset * 3 + 2] = tempuv[uvOffset + localId * 3 + 2];
+    } else {
+      uvout[outOffset + myOffset * 3]     = uv[uvOffset + localId * 3];
+      uvout[outOffset + myOffset * 3 + 1] = uv[uvOffset + localId * 3 + 1];
+      uvout[outOffset + myOffset * 3 + 2] = uv[uvOffset + localId * 3 + 2];
+    }
+  }
+}