diff --git a/runelite-client/pom.xml b/runelite-client/pom.xml index 10000e6125..48eb8825ce 100644 --- a/runelite-client/pom.xml +++ b/runelite-client/pom.xml @@ -177,6 +177,25 @@ 2.4.0-rc-20210117 runtime + + net.runelite.jocl + jocl + 1.0 + + + net.runelite.jocl + jocl + 1.0 + macos-x64 + runtime + + + net.runelite.jocl + jocl + 1.0 + macos-arm64 + runtime + net.runelite archive-patcher diff --git a/runelite-client/src/main/java/net/runelite/client/plugins/gpu/GLBuffer.java b/runelite-client/src/main/java/net/runelite/client/plugins/gpu/GLBuffer.java new file mode 100644 index 0000000000..cd3b7288ed --- /dev/null +++ b/runelite-client/src/main/java/net/runelite/client/plugins/gpu/GLBuffer.java @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021, Adam + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +package net.runelite.client.plugins.gpu; + +import org.jocl.Pointer; +import org.jocl.cl_mem; + +class GLBuffer +{ + int glBufferId = -1; + int size = -1; + cl_mem cl_mem; + + Pointer ptr() + { + return cl_mem != null ? Pointer.to(cl_mem) : null; + } +} diff --git a/runelite-client/src/main/java/net/runelite/client/plugins/gpu/GpuPlugin.java b/runelite-client/src/main/java/net/runelite/client/plugins/gpu/GpuPlugin.java index a6be4e5872..8ba9db5a67 100644 --- a/runelite-client/src/main/java/net/runelite/client/plugins/gpu/GpuPlugin.java +++ b/runelite-client/src/main/java/net/runelite/client/plugins/gpu/GpuPlugin.java @@ -29,6 +29,11 @@ import com.google.inject.Provides; import com.jogamp.nativewindow.awt.AWTGraphicsConfiguration; import com.jogamp.nativewindow.awt.JAWTWindow; import com.jogamp.opengl.GL; +import static com.jogamp.opengl.GL.GL_ARRAY_BUFFER; +import static com.jogamp.opengl.GL.GL_DYNAMIC_DRAW; +import static com.jogamp.opengl.GL2ES2.GL_STREAM_DRAW; +import static com.jogamp.opengl.GL2ES3.GL_STATIC_COPY; +import static com.jogamp.opengl.GL2ES3.GL_UNIFORM_BUFFER; import com.jogamp.opengl.GL4; import com.jogamp.opengl.GLCapabilities; import com.jogamp.opengl.GLContext; @@ -45,6 +50,7 @@ import java.awt.Image; import java.awt.geom.AffineTransform; import java.awt.image.BufferedImage; import java.awt.image.DataBufferInt; +import java.nio.Buffer; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.FloatBuffer; @@ -93,6 +99,10 @@ import net.runelite.client.plugins.gpu.config.UIScalingMode; import net.runelite.client.plugins.gpu.template.Template; import net.runelite.client.ui.DrawManager; import net.runelite.client.util.OSType; +import org.jocl.CL; +import static org.jocl.CL.CL_MEM_READ_ONLY; +import static org.jocl.CL.CL_MEM_WRITE_ONLY; +import static org.jocl.CL.clCreateFromGLBuffer; @PluginDescriptor( name = "GPU", @@ -105,8 +115,8 @@ import net.runelite.client.util.OSType; public class GpuPlugin extends Plugin implements DrawCallbacks { // This is the maximum number of triangles the compute shaders support - private static final int MAX_TRIANGLE = 4096; - private static final int SMALL_TRIANGLE_COUNT = 512; + static final int MAX_TRIANGLE = 4096; + static final int SMALL_TRIANGLE_COUNT = 512; private static final int FLAG_SCENE_BUFFER = Integer.MIN_VALUE; private static final int DEFAULT_DISTANCE = 25; static final int MAX_DISTANCE = 90; @@ -115,6 +125,9 @@ public class GpuPlugin extends Plugin implements DrawCallbacks @Inject private Client client; + @Inject + private OpenCLManager openCLManager; + @Inject private ClientThread clientThread; @@ -133,7 +146,14 @@ public class GpuPlugin extends Plugin implements DrawCallbacks @Inject private PluginManager pluginManager; - private boolean useComputeShaders; + enum ComputeMode + { + NONE, + OPENGL, + OPENCL + } + + private ComputeMode computeMode = ComputeMode.NONE; private Canvas canvas; private JAWTWindow jawtWindow; @@ -182,23 +202,22 @@ public class GpuPlugin extends Plugin implements DrawCallbacks private int texSceneHandle; private int rboSceneHandle; - // scene vertex buffer id - private int bufferId; - // scene uv buffer id - private int uvBufferId; + // scene vertex buffer + private final GLBuffer sceneVertexBuffer = new GLBuffer(); + // scene uv buffer + private final GLBuffer sceneUvBuffer = new GLBuffer(); - private int tmpBufferId; // temporary scene vertex buffer - private int tmpUvBufferId; // temporary scene uv buffer - private int tmpModelBufferId; // scene model buffer, large - private int tmpModelBufferSmallId; // scene model buffer, small - private int tmpModelBufferUnorderedId; - private int tmpOutBufferId; // target vertex buffer for compute shaders - private int tmpOutUvBufferId; // target uv buffer for compute shaders + private final GLBuffer tmpVertexBuffer = new GLBuffer(); // temporary scene vertex buffer + private final GLBuffer tmpUvBuffer = new GLBuffer(); // temporary scene uv buffer + private final GLBuffer tmpModelBufferLarge = new GLBuffer(); // scene model buffer, large + private final GLBuffer tmpModelBufferSmall = new GLBuffer(); // scene model buffer, small + private final GLBuffer tmpModelBufferUnordered = new GLBuffer(); // scene model buffer, unordered + private final GLBuffer tmpOutBuffer = new GLBuffer(); // target vertex buffer for compute shaders + private final GLBuffer tmpOutUvBuffer = new GLBuffer(); // target uv buffer for compute shaders private int textureArrayId; - private int uniformBufferId; - private final IntBuffer uniformBuffer = GpuIntBuffer.allocateDirect(5 + 3 + 2048 * 4); + private final GLBuffer uniformBuffer = new GLBuffer(); private final float[] textureOffsets = new float[128]; private GpuIntBuffer vertexBuffer; @@ -278,7 +297,6 @@ public class GpuPlugin extends Plugin implements DrawCallbacks { try { - bufferId = uvBufferId = uniformBufferId = tmpBufferId = tmpUvBufferId = tmpModelBufferId = tmpModelBufferSmallId = tmpModelBufferUnorderedId = tmpOutBufferId = tmpOutUvBufferId = -1; texSceneHandle = fboSceneHandle = rboSceneHandle = -1; // AA FBO unorderedModels = smallModels = largeModels = 0; drawingModel = false; @@ -290,8 +308,9 @@ public class GpuPlugin extends Plugin implements DrawCallbacks return false; } - // OSX supports up to OpenGL 4.1, however 4.3 is required for compute shaders - useComputeShaders = config.useComputeShaders() && OSType.getOSType() != OSType.MacOS; + computeMode = config.useComputeShaders() + ? (OSType.getOSType() == OSType.MacOS ? ComputeMode.OPENCL : ComputeMode.OPENGL) + : ComputeMode.NONE; canvas.setIgnoreRepaint(true); @@ -397,7 +416,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks if (client.getGameState() == GameState.LOGGED_IN) { - uploadScene(); + invokeOnMainThread(this::uploadScene); } } catch (Throwable e) @@ -433,6 +452,8 @@ public class GpuPlugin extends Plugin implements DrawCallbacks invokeOnMainThread(() -> { + openCLManager.cleanup(); + if (gl != null) { if (textureArrayId != -1) @@ -441,11 +462,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks textureArrayId = -1; } - if (uniformBufferId != -1) - { - glDeleteBuffer(gl, uniformBufferId); - uniformBufferId = -1; - } + destroyGlBuffer(uniformBuffer); shutdownBuffers(); shutdownInterfaceTexture(); @@ -519,12 +536,16 @@ public class GpuPlugin extends Plugin implements DrawCallbacks glProgram = PROGRAM.compile(gl, template); glUiProgram = UI_PROGRAM.compile(gl, template); - if (useComputeShaders) + if (computeMode == ComputeMode.OPENGL) { glComputeProgram = COMPUTE_PROGRAM.compile(gl, template); glSmallComputeProgram = SMALL_COMPUTE_PROGRAM.compile(gl, template); glUnorderedComputeProgram = UNORDERED_COMPUTE_PROGRAM.compile(gl, template); } + else if (computeMode == ComputeMode.OPENCL) + { + openCLManager.init(gl); + } initUniforms(); } @@ -593,8 +614,8 @@ public class GpuPlugin extends Plugin implements DrawCallbacks -1f, 1f, 0.0f, 0.0f, 0f // top left }); vboUiBuf.rewind(); - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vboUiHandle); - gl.glBufferData(gl.GL_ARRAY_BUFFER, vboUiBuf.capacity() * Float.BYTES, vboUiBuf, gl.GL_STATIC_DRAW); + gl.glBindBuffer(GL_ARRAY_BUFFER, vboUiHandle); + gl.glBufferData(GL_ARRAY_BUFFER, vboUiBuf.capacity() * Float.BYTES, vboUiBuf, gl.GL_STATIC_DRAW); // position attribute gl.glVertexAttribPointer(0, 3, gl.GL_FLOAT, false, 5 * Float.BYTES, 0); @@ -605,7 +626,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks gl.glEnableVertexAttribArray(1); // unbind VBO - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0); + gl.glBindBuffer(GL_ARRAY_BUFFER, 0); } private void shutdownVao() @@ -622,71 +643,49 @@ public class GpuPlugin extends Plugin implements DrawCallbacks private void initBuffers() { - bufferId = glGenBuffers(gl); - uvBufferId = glGenBuffers(gl); - tmpBufferId = glGenBuffers(gl); - tmpUvBufferId = glGenBuffers(gl); - tmpModelBufferId = glGenBuffers(gl); - tmpModelBufferSmallId = glGenBuffers(gl); - tmpModelBufferUnorderedId = glGenBuffers(gl); - tmpOutBufferId = glGenBuffers(gl); - tmpOutUvBufferId = glGenBuffers(gl); + initGlBuffer(sceneVertexBuffer); + initGlBuffer(sceneUvBuffer); + initGlBuffer(tmpVertexBuffer); + initGlBuffer(tmpUvBuffer); + initGlBuffer(tmpModelBufferLarge); + initGlBuffer(tmpModelBufferSmall); + initGlBuffer(tmpModelBufferUnordered); + initGlBuffer(tmpOutBuffer); + initGlBuffer(tmpOutUvBuffer); + } + + private void initGlBuffer(GLBuffer glBuffer) + { + glBuffer.glBufferId = glGenBuffers(gl); } private void shutdownBuffers() { - if (bufferId != -1) - { - glDeleteBuffer(gl, bufferId); - bufferId = -1; - } + destroyGlBuffer(sceneVertexBuffer); + destroyGlBuffer(sceneUvBuffer); - if (uvBufferId != -1) - { - glDeleteBuffer(gl, uvBufferId); - uvBufferId = -1; - } + destroyGlBuffer(tmpVertexBuffer); + destroyGlBuffer(tmpUvBuffer); + destroyGlBuffer(tmpModelBufferLarge); + destroyGlBuffer(tmpModelBufferSmall); + destroyGlBuffer(tmpModelBufferUnordered); + destroyGlBuffer(tmpOutBuffer); + destroyGlBuffer(tmpOutUvBuffer); + } - if (tmpBufferId != -1) + private void destroyGlBuffer(GLBuffer glBuffer) + { + if (glBuffer.glBufferId != -1) { - glDeleteBuffer(gl, tmpBufferId); - tmpBufferId = -1; + glDeleteBuffer(gl, glBuffer.glBufferId); + glBuffer.glBufferId = -1; } + glBuffer.size = -1; - if (tmpUvBufferId != -1) + if (glBuffer.cl_mem != null) { - glDeleteBuffer(gl, tmpUvBufferId); - tmpUvBufferId = -1; - } - - if (tmpModelBufferId != -1) - { - glDeleteBuffer(gl, tmpModelBufferId); - tmpModelBufferId = -1; - } - - if (tmpModelBufferSmallId != -1) - { - glDeleteBuffer(gl, tmpModelBufferSmallId); - tmpModelBufferSmallId = -1; - } - - if (tmpModelBufferUnorderedId != -1) - { - glDeleteBuffer(gl, tmpModelBufferUnorderedId); - tmpModelBufferUnorderedId = -1; - } - - if (tmpOutBufferId != -1) - { - glDeleteBuffer(gl, tmpOutBufferId); - tmpOutBufferId = -1; - } - - if (tmpOutUvBufferId != -1) - { - glDeleteBuffer(gl, tmpOutUvBufferId); - tmpOutUvBufferId = -1; + CL.clReleaseMemObject(glBuffer.cl_mem); + glBuffer.cl_mem = null; } } @@ -709,21 +708,21 @@ public class GpuPlugin extends Plugin implements DrawCallbacks private void initUniformBuffer() { - uniformBufferId = glGenBuffers(gl); - gl.glBindBuffer(gl.GL_UNIFORM_BUFFER, uniformBufferId); - uniformBuffer.clear(); - uniformBuffer.put(new int[8]); + initGlBuffer(uniformBuffer); + + IntBuffer uniformBuf = GpuIntBuffer.allocateDirect(8 + 2048 * 4); + uniformBuf.put(new int[8]); // uniform block final int[] pad = new int[2]; for (int i = 0; i < 2048; i++) { - uniformBuffer.put(Perspective.SINE[i]); - uniformBuffer.put(Perspective.COSINE[i]); - uniformBuffer.put(pad); + uniformBuf.put(Perspective.SINE[i]); + uniformBuf.put(Perspective.COSINE[i]); + uniformBuf.put(pad); // ivec2 alignment in std140 is 16 bytes } - uniformBuffer.flip(); + uniformBuf.flip(); - gl.glBufferData(gl.GL_UNIFORM_BUFFER, uniformBuffer.limit() * Integer.BYTES, uniformBuffer, gl.GL_DYNAMIC_DRAW); - gl.glBindBuffer(gl.GL_UNIFORM_BUFFER, 0); + updateBuffer(uniformBuffer, GL_UNIFORM_BUFFER, uniformBuf.limit() * Integer.BYTES, uniformBuf, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY); + gl.glBindBuffer(GL_UNIFORM_BUFFER, 0); } private void initAAFbo(int width, int height, int aaSamples) @@ -785,9 +784,11 @@ public class GpuPlugin extends Plugin implements DrawCallbacks invokeOnMainThread(() -> { // UBO. Only the first 32 bytes get modified here, the rest is the constant sin/cos table. - gl.glBindBuffer(gl.GL_UNIFORM_BUFFER, uniformBufferId); - uniformBuffer.clear(); - uniformBuffer + // We can reuse the vertex buffer since it isn't used yet. + vertexBuffer.clear(); + vertexBuffer.ensureCapacity(32); + IntBuffer uniformBuf = vertexBuffer.getBuffer(); + uniformBuf .put(yaw) .put(pitch) .put(client.getCenterX()) @@ -796,12 +797,14 @@ public class GpuPlugin extends Plugin implements DrawCallbacks .put(cameraX) .put(cameraY) .put(cameraZ); - uniformBuffer.flip(); + uniformBuf.flip(); - gl.glBufferSubData(gl.GL_UNIFORM_BUFFER, 0, uniformBuffer.limit() * Integer.BYTES, uniformBuffer); - gl.glBindBuffer(gl.GL_UNIFORM_BUFFER, 0); + gl.glBindBuffer(GL_UNIFORM_BUFFER, uniformBuffer.glBufferId); + gl.glBufferSubData(GL_UNIFORM_BUFFER, 0, uniformBuf.limit() * Integer.BYTES, uniformBuf); + gl.glBindBuffer(GL_UNIFORM_BUFFER, 0); - gl.glBindBufferBase(gl.GL_UNIFORM_BUFFER, 0, uniformBufferId); + gl.glBindBufferBase(GL_UNIFORM_BUFFER, 0, uniformBuffer.glBufferId); + uniformBuf.clear(); }); } @@ -813,7 +816,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks private void postDraw() { - if (!useComputeShaders) + if (computeMode == ComputeMode.NONE) { // Upload buffers vertexBuffer.flip(); @@ -822,12 +825,8 @@ public class GpuPlugin extends Plugin implements DrawCallbacks IntBuffer vertexBuffer = this.vertexBuffer.getBuffer(); FloatBuffer uvBuffer = this.uvBuffer.getBuffer(); - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpBufferId); - gl.glBufferData(gl.GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, gl.GL_DYNAMIC_DRAW); - - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpUvBufferId); - gl.glBufferData(gl.GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, gl.GL_DYNAMIC_DRAW); - + updateBuffer(tmpVertexBuffer, GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, GL_DYNAMIC_DRAW, 0L); + updateBuffer(tmpUvBuffer, GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, GL_DYNAMIC_DRAW, 0L); return; } @@ -844,79 +843,91 @@ public class GpuPlugin extends Plugin implements DrawCallbacks IntBuffer modelBufferSmall = this.modelBufferSmall.getBuffer(); IntBuffer modelBufferUnordered = this.modelBufferUnordered.getBuffer(); - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpBufferId); - gl.glBufferData(gl.GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, gl.GL_DYNAMIC_DRAW); + // temp buffers + updateBuffer(tmpVertexBuffer, GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY); + updateBuffer(tmpUvBuffer, GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY); - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpUvBufferId); - gl.glBufferData(gl.GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, gl.GL_DYNAMIC_DRAW); - - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpModelBufferId); - gl.glBufferData(gl.GL_ARRAY_BUFFER, modelBuffer.limit() * Integer.BYTES, modelBuffer, gl.GL_DYNAMIC_DRAW); - - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpModelBufferSmallId); - gl.glBufferData(gl.GL_ARRAY_BUFFER, modelBufferSmall.limit() * Integer.BYTES, modelBufferSmall, gl.GL_DYNAMIC_DRAW); - - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpModelBufferUnorderedId); - gl.glBufferData(gl.GL_ARRAY_BUFFER, modelBufferUnordered.limit() * Integer.BYTES, modelBufferUnordered, gl.GL_DYNAMIC_DRAW); + // model buffers + updateBuffer(tmpModelBufferLarge, GL_ARRAY_BUFFER, modelBuffer.limit() * Integer.BYTES, modelBuffer, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY); + updateBuffer(tmpModelBufferSmall, GL_ARRAY_BUFFER, modelBufferSmall.limit() * Integer.BYTES, modelBufferSmall, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY); + updateBuffer(tmpModelBufferUnordered, GL_ARRAY_BUFFER, modelBufferUnordered.limit() * Integer.BYTES, modelBufferUnordered, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY); // Output buffers - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpOutBufferId); - gl.glBufferData(gl.GL_ARRAY_BUFFER, + updateBuffer(tmpOutBuffer, + GL_ARRAY_BUFFER, targetBufferOffset * 16, // each vertex is an ivec4, which is 16 bytes null, - gl.GL_STREAM_DRAW); - - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpOutUvBufferId); - gl.glBufferData(gl.GL_ARRAY_BUFFER, - targetBufferOffset * 16, + GL_STREAM_DRAW, + CL_MEM_WRITE_ONLY); + updateBuffer(tmpOutUvBuffer, + GL_ARRAY_BUFFER, + targetBufferOffset * 16, // each vertex is an ivec4, which is 16 bytes null, - gl.GL_STREAM_DRAW); + GL_STREAM_DRAW, + CL_MEM_WRITE_ONLY); - // Bind UBO to compute programs - gl.glUniformBlockBinding(glSmallComputeProgram, uniBlockSmall, 0); - gl.glUniformBlockBinding(glComputeProgram, uniBlockLarge, 0); + if (computeMode == ComputeMode.OPENCL) + { + // The docs for clEnqueueAcquireGLObjects say all pending GL operations must be completed before calling + // clEnqueueAcquireGLObjects, and recommends calling glFinish() as the only portable way to do that. + // However no issues have been observed from not calling it, and so will leave disabled for now. + // gl.glFinish(); + + openCLManager.compute( + unorderedModels, smallModels, largeModels, + sceneVertexBuffer, sceneUvBuffer, + tmpVertexBuffer, tmpUvBuffer, + tmpModelBufferUnordered, tmpModelBufferSmall, tmpModelBufferLarge, + tmpOutBuffer, tmpOutUvBuffer, + uniformBuffer); + return; + } /* * Compute is split into three separate programs: 'unordered', 'small', and 'large' * to save on GPU resources. Small will sort <= 512 faces, large will do <= 4096. */ + // Bind UBO to compute programs + gl.glUniformBlockBinding(glSmallComputeProgram, uniBlockSmall, 0); + gl.glUniformBlockBinding(glComputeProgram, uniBlockLarge, 0); + // unordered gl.glUseProgram(glUnorderedComputeProgram); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferUnorderedId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, this.bufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpBufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, this.uvBufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferUnordered.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, sceneVertexBuffer.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpVertexBuffer.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBuffer.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBuffer.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, sceneUvBuffer.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBuffer.glBufferId); gl.glDispatchCompute(unorderedModels, 1, 1); // small gl.glUseProgram(glSmallComputeProgram); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferSmallId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, this.bufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpBufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, this.uvBufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferSmall.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, sceneVertexBuffer.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpVertexBuffer.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBuffer.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBuffer.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, sceneUvBuffer.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBuffer.glBufferId); gl.glDispatchCompute(smallModels, 1, 1); // large gl.glUseProgram(glComputeProgram); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, this.bufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpBufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, this.uvBufferId); - gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferLarge.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, sceneVertexBuffer.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpVertexBuffer.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBuffer.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBuffer.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, sceneUvBuffer.glBufferId); + gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBuffer.glBufferId); gl.glDispatchCompute(largeModels, 1, 1); } @@ -926,7 +937,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks SceneTilePaint paint, int tileZ, int tileX, int tileY, int zoom, int centerX, int centerY) { - if (!useComputeShaders) + if (computeMode == ComputeMode.NONE) { targetBufferOffset += sceneUploader.upload(paint, tileZ, tileX, tileY, @@ -963,7 +974,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks SceneTileModel model, int tileZ, int tileX, int tileY, int zoom, int centerX, int centerY) { - if (!useComputeShaders) + if (computeMode == ComputeMode.NONE) { targetBufferOffset += sceneUploader.upload(model, tileX, tileY, @@ -1131,7 +1142,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks // Ceil the sizes because even if the size is 599.1 we want to treat it as size 600 (i.e. render to the x=599 pixel). renderViewportHeight = (int) Math.ceil(scaleFactorY * (renderViewportHeight)) + padding * 2; - renderViewportWidth = (int) Math.ceil(scaleFactorX * (renderViewportWidth )) + padding * 2; + renderViewportWidth = (int) Math.ceil(scaleFactorX * (renderViewportWidth )) + padding * 2; // Floor the offsets because even if the offset is 4.9, we want to render to the x=4 pixel anyway. renderHeightOff = (int) Math.floor(scaleFactorY * (renderHeightOff)) - padding; @@ -1195,27 +1206,36 @@ public class GpuPlugin extends Plugin implements DrawCallbacks gl.glBindVertexArray(vaoHandle); int vertexBuffer, uvBuffer; - if (useComputeShaders) + if (computeMode != ComputeMode.NONE) { - // Before reading the SSBOs written to from postDrawScene() we must insert a barrier - gl.glMemoryBarrier(gl.GL_SHADER_STORAGE_BARRIER_BIT); + if (computeMode == ComputeMode.OPENGL) + { + // Before reading the SSBOs written to from postDrawScene() we must insert a barrier + gl.glMemoryBarrier(gl.GL_SHADER_STORAGE_BARRIER_BIT); + } + else + { + // Wait for the command queue to finish, so that we know the compute is done + openCLManager.finish(); + } + // Draw using the output buffer of the compute - vertexBuffer = tmpOutBufferId; - uvBuffer = tmpOutUvBufferId; + vertexBuffer = tmpOutBuffer.glBufferId; + uvBuffer = tmpOutUvBuffer.glBufferId; } else { // Only use the temporary buffers, which will contain the full scene - vertexBuffer = tmpBufferId; - uvBuffer = tmpUvBufferId; + vertexBuffer = tmpVertexBuffer.glBufferId; + uvBuffer = tmpUvBuffer.glBufferId; } gl.glEnableVertexAttribArray(0); - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vertexBuffer); + gl.glBindBuffer(GL_ARRAY_BUFFER, vertexBuffer); gl.glVertexAttribIPointer(0, 4, gl.GL_INT, 0, 0); gl.glEnableVertexAttribArray(1); - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, uvBuffer); + gl.glBindBuffer(GL_ARRAY_BUFFER, uvBuffer); gl.glVertexAttribPointer(1, 4, gl.GL_FLOAT, false, 0, 0); gl.glDrawArrays(gl.GL_TRIANGLES, 0, targetBufferOffset); @@ -1400,12 +1420,12 @@ public class GpuPlugin extends Plugin implements DrawCallbacks @Subscribe public void onGameStateChanged(GameStateChanged gameStateChanged) { - if (!useComputeShaders || gameStateChanged.getGameState() != GameState.LOGGED_IN) + if (computeMode == ComputeMode.NONE || gameStateChanged.getGameState() != GameState.LOGGED_IN) { return; } - uploadScene(); + invokeOnMainThread(this::uploadScene); } private void uploadScene() @@ -1421,13 +1441,10 @@ public class GpuPlugin extends Plugin implements DrawCallbacks IntBuffer vertexBuffer = this.vertexBuffer.getBuffer(); FloatBuffer uvBuffer = this.uvBuffer.getBuffer(); - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, bufferId); - gl.glBufferData(gl.GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, gl.GL_STATIC_COPY); + updateBuffer(sceneVertexBuffer, GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, GL_STATIC_COPY, CL_MEM_READ_ONLY); + updateBuffer(sceneUvBuffer, GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, GL_STATIC_COPY, CL_MEM_READ_ONLY); - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, uvBufferId); - gl.glBufferData(gl.GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, gl.GL_STATIC_COPY); - - gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0); + gl.glBindBuffer(GL_ARRAY_BUFFER, 0); vertexBuffer.clear(); uvBuffer.clear(); @@ -1492,7 +1509,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks @Override public void draw(Renderable renderable, int orientation, int pitchSin, int pitchCos, int yawSin, int yawCos, int x, int y, int z, long hash) { - if (!useComputeShaders) + if (computeMode == ComputeMode.NONE) { Model model = renderable instanceof Model ? (Model) renderable : renderable.getModel(); if (model != null) @@ -1673,7 +1690,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks private int getDrawDistance() { - final int limit = useComputeShaders ? MAX_DISTANCE : DEFAULT_DISTANCE; + final int limit = computeMode != ComputeMode.NONE ? MAX_DISTANCE : DEFAULT_DISTANCE; return Ints.constrainToRange(config.drawDistance(), 0, limit); } @@ -1688,4 +1705,36 @@ public class GpuPlugin extends Plugin implements DrawCallbacks runnable.run(); } } + + private void updateBuffer(GLBuffer glBuffer, int target, int size, Buffer data, int usage, long clFlags) + { + gl.glBindBuffer(target, glBuffer.glBufferId); + if (size > glBuffer.size) + { + log.trace("Buffer resize: {} {} -> {}", glBuffer, glBuffer.size, size); + + glBuffer.size = size; + gl.glBufferData(target, size, data, usage); + + if (computeMode == ComputeMode.OPENCL) + { + if (glBuffer.cl_mem != null) + { + CL.clReleaseMemObject(glBuffer.cl_mem); + } + if (size == 0) + { + glBuffer.cl_mem = null; + } + else + { + glBuffer.cl_mem = clCreateFromGLBuffer(openCLManager.context, clFlags, glBuffer.glBufferId, null); + } + } + } + else if (data != null) + { + gl.glBufferSubData(target, 0, size, data); + } + } } diff --git a/runelite-client/src/main/java/net/runelite/client/plugins/gpu/OpenCLManager.java b/runelite-client/src/main/java/net/runelite/client/plugins/gpu/OpenCLManager.java new file mode 100644 index 0000000000..77c2f9b575 --- /dev/null +++ b/runelite-client/src/main/java/net/runelite/client/plugins/gpu/OpenCLManager.java @@ -0,0 +1,521 @@ +/* + * Copyright (c) 2021, Adam + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +package net.runelite.client.plugins.gpu; + +import com.google.common.base.Charsets; +import com.jogamp.nativewindow.NativeSurface; +import com.jogamp.opengl.GL4; +import com.jogamp.opengl.GLContext; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Objects; +import javax.inject.Singleton; +import jogamp.opengl.GLContextImpl; +import jogamp.opengl.GLDrawableImpl; +import jogamp.opengl.egl.EGLContext; +import jogamp.opengl.macosx.cgl.CGL; +import jogamp.opengl.windows.wgl.WindowsWGLContext; +import jogamp.opengl.x11.glx.X11GLXContext; +import lombok.extern.slf4j.Slf4j; +import net.runelite.client.plugins.gpu.template.Template; +import net.runelite.client.util.OSType; +import org.jocl.CL; +import static org.jocl.CL.*; +import org.jocl.CLException; +import org.jocl.Pointer; +import org.jocl.Sizeof; +import org.jocl.cl_command_queue; +import org.jocl.cl_context; +import org.jocl.cl_context_properties; +import org.jocl.cl_device_id; +import org.jocl.cl_event; +import org.jocl.cl_kernel; +import org.jocl.cl_mem; +import org.jocl.cl_platform_id; +import org.jocl.cl_program; + +@Singleton +@Slf4j +class OpenCLManager +{ + private static final String GL_SHARING_PLATFORM_EXT = "cl_khr_gl_sharing"; + + private static final String KERNEL_NAME_UNORDERED = "computeUnordered"; + private static final String KERNEL_NAME_LARGE = "computeLarge"; + + private static final int MIN_WORK_GROUP_SIZE = 256; + private static final int SMALL_SIZE = GpuPlugin.SMALL_TRIANGLE_COUNT; + private static final int LARGE_SIZE = GpuPlugin.MAX_TRIANGLE; + // struct shared_data { + // int totalNum[12]; + // int totalDistance[12]; + // int totalMappedNum[18]; + // int min10; + // int dfs[0]; + // }; + private static final int SHARED_SIZE = 12 + 12 + 18 + 1; // in ints + + // The number of faces each worker processes in the two kernels + private int largeFaceCount; + private int smallFaceCount; + + private cl_platform_id platform; + private cl_device_id device; + cl_context context; + private cl_command_queue commandQueue; + + private cl_program programUnordered; + private cl_program programSmall; + private cl_program programLarge; + + private cl_kernel kernelUnordered; + private cl_kernel kernelSmall; + private cl_kernel kernelLarge; + + void init(GL4 gl) + { + CL.setExceptionsEnabled(true); + + switch (OSType.getOSType()) + { + case Windows: + case Linux: + initPlatform(); + initDevice(); + initContext(gl); + break; + case MacOS: + initMacOS(gl); + break; + default: + throw new RuntimeException("Unsupported OS Type " + OSType.getOSType().name()); + } + ensureMinWorkGroupSize(); + initQueue(); + compilePrograms(); + } + + void cleanup() + { + if (programUnordered != null) + { + CL.clReleaseProgram(programUnordered); + programUnordered = null; + } + + if (programSmall != null) + { + CL.clReleaseProgram(programSmall); + programSmall = null; + } + + if (programLarge != null) + { + CL.clReleaseProgram(programLarge); + programLarge = null; + } + + if (kernelUnordered != null) + { + CL.clReleaseKernel(kernelUnordered); + kernelUnordered = null; + } + + if (kernelSmall != null) + { + CL.clReleaseKernel(kernelSmall); + kernelSmall = null; + } + + if (kernelLarge != null) + { + CL.clReleaseKernel(kernelLarge); + kernelLarge = null; + } + + if (commandQueue != null) + { + CL.clReleaseCommandQueue(commandQueue); + commandQueue = null; + } + + if (context != null) + { + CL.clReleaseContext(context); + context = null; + } + + if (device != null) + { + CL.clReleaseDevice(device); + device = null; + } + } + + private String logPlatformInfo(cl_platform_id platform, int param) + { + long[] size = new long[1]; + clGetPlatformInfo(platform, param, 0, null, size); + + byte[] buffer = new byte[(int) size[0]]; + clGetPlatformInfo(platform, param, buffer.length, Pointer.to(buffer), null); + String platformInfo = new String(buffer, Charsets.UTF_8); + log.debug("Platform: {}, {}", stringFor_cl_platform_info(param), platformInfo); + return platformInfo; + } + + private void logBuildInfo(cl_program program, int param) + { + long[] size = new long[1]; + clGetProgramBuildInfo(program, device, param, 0, null, size); + + ByteBuffer buffer = ByteBuffer.allocateDirect((int) size[0]); + clGetProgramBuildInfo(program, device, param, buffer.limit(), Pointer.toBuffer(buffer), null); + + switch (param) + { + case CL_PROGRAM_BUILD_STATUS: + log.debug("Build status: {}, {}", stringFor_cl_program_build_info(param), stringFor_cl_build_status(buffer.getInt())); + break; + case CL_PROGRAM_BINARY_TYPE: + log.debug("Binary type: {}, {}", stringFor_cl_program_build_info(param), stringFor_cl_program_binary_type(buffer.getInt())); + break; + case CL_PROGRAM_BUILD_LOG: + String buildLog = StandardCharsets.US_ASCII.decode(buffer).toString(); + log.trace("Build log: {}, {}", stringFor_cl_program_build_info(param), buildLog); + break; + case CL_PROGRAM_BUILD_OPTIONS: + String message = StandardCharsets.US_ASCII.decode(buffer).toString(); + log.debug("Build options: {}, {}", stringFor_cl_program_build_info(param), message); + break; + default: + throw new IllegalArgumentException(); + } + } + + private void initPlatform() + { + int[] platformCount = new int[1]; + clGetPlatformIDs(0, null, platformCount); + if (platformCount[0] == 0) + { + throw new RuntimeException("No compute platforms found"); + } + + cl_platform_id[] platforms = new cl_platform_id[platformCount[0]]; + clGetPlatformIDs(platforms.length, platforms, null); + + for (cl_platform_id platform : platforms) + { + log.debug("Found cl_platform_id {}", platform); + logPlatformInfo(platform, CL_PLATFORM_PROFILE); + logPlatformInfo(platform, CL_PLATFORM_VERSION); + logPlatformInfo(platform, CL_PLATFORM_NAME); + logPlatformInfo(platform, CL_PLATFORM_VENDOR); + String[] extensions = logPlatformInfo(platform, CL_PLATFORM_EXTENSIONS).split(" "); + if (Arrays.stream(extensions).noneMatch(s -> s.equals(GL_SHARING_PLATFORM_EXT))) + { + throw new RuntimeException("Platform does not support OpenGL buffer sharing"); + } + } + + platform = platforms[0]; + log.debug("Selected cl_platform_id {}", platform); + } + + private void initDevice() + { + int[] deviceCount = new int[1]; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, null, deviceCount); + if (deviceCount[0] == 0) + { + throw new RuntimeException("No compute devices found"); + } + + cl_device_id[] devices = new cl_device_id[(int) deviceCount[0]]; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, devices.length, devices, null); + + for (cl_device_id device : devices) + { + long[] size = new long[1]; + clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, null, size); + + byte[] devInfoBuf = new byte[(int) size[0]]; + clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, devInfoBuf.length, Pointer.to(devInfoBuf), null); + + log.debug("Found cl_device_id: {}", device); + log.debug("Device extensions: {}", new String(devInfoBuf, Charsets.UTF_8)); + } + + device = devices[0]; + log.debug("Selected cl_device_id {}", device); + } + + private void initContext(GL4 gl) + { + // set computation platform + cl_context_properties contextProps = new cl_context_properties(); + contextProps.addProperty(CL_CONTEXT_PLATFORM, platform); + + // pull gl context + GLContext glContext = gl.getContext(); + log.debug("Got GLContext of type {}", glContext.getClass().getSimpleName()); + if (!glContext.isCurrent()) + { + throw new RuntimeException("Can't create OpenCL context from inactive GL Context"); + } + + // get correct props based on os + long glContextHandle = glContext.getHandle(); + GLContextImpl glContextImpl = (GLContextImpl) glContext; + GLDrawableImpl glDrawableImpl = glContextImpl.getDrawableImpl(); + NativeSurface nativeSurface = glDrawableImpl.getNativeSurface(); + + if (glContext instanceof X11GLXContext) + { + long displayHandle = nativeSurface.getDisplayHandle(); + contextProps.addProperty(CL_GL_CONTEXT_KHR, glContextHandle); + contextProps.addProperty(CL_GLX_DISPLAY_KHR, displayHandle); + } + else if (glContext instanceof WindowsWGLContext) + { + long surfaceHandle = nativeSurface.getSurfaceHandle(); + contextProps.addProperty(CL_GL_CONTEXT_KHR, glContextHandle); + contextProps.addProperty(CL_WGL_HDC_KHR, surfaceHandle); + } + else if (glContext instanceof EGLContext) + { + long displayHandle = nativeSurface.getDisplayHandle(); + contextProps.addProperty(CL_GL_CONTEXT_KHR, glContextHandle); + contextProps.addProperty(CL_EGL_DISPLAY_KHR, displayHandle); + } + + log.debug("Creating context with props: {}", contextProps); + context = clCreateContext(contextProps, 1, new cl_device_id[]{device}, null, null, null); + log.debug("Created compute context {}", context); + } + + private void initMacOS(GL4 gl) + { + // get sharegroup from gl context + GLContext glContext = gl.getContext(); + if (!glContext.isCurrent()) + { + throw new RuntimeException("Can't create context from inactive GL"); + } + long cglContext = CGL.CGLGetCurrentContext(); + long cglShareGroup = CGL.CGLGetShareGroup(cglContext); + + // build context props + cl_context_properties contextProps = new cl_context_properties(); + contextProps.addProperty(CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, cglShareGroup); + + // ask macos to make the context for us + log.debug("Creating context with props: {}", contextProps); + context = clCreateContext(contextProps, 0, null, null, null, null); + + // pull the compute device out of the provided context + device = new cl_device_id(); + clGetGLContextInfoAPPLE(context, cglContext, CL_CGL_DEVICE_FOR_CURRENT_VIRTUAL_SCREEN_APPLE, Sizeof.cl_device_id, Pointer.to(device), null); + + log.debug("Got macOS CLGL compute device {}", device); + } + + private void ensureMinWorkGroupSize() + { + long[] maxWorkGroupSize = new long[1]; + clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, Sizeof.size_t, Pointer.to(maxWorkGroupSize), null); + log.debug("Device CL_DEVICE_MAX_WORK_GROUP_SIZE: {}", maxWorkGroupSize[0]); + + if (maxWorkGroupSize[0] < MIN_WORK_GROUP_SIZE) + { + throw new RuntimeException("Compute device does not support min work group size " + MIN_WORK_GROUP_SIZE); + } + + // Largest power of 2 less than or equal to maxWorkGroupSize + int groupSize = 0x80000000 >>> Integer.numberOfLeadingZeros((int) maxWorkGroupSize[0]); + largeFaceCount = LARGE_SIZE / (Math.min(groupSize, LARGE_SIZE)); + smallFaceCount = SMALL_SIZE / (Math.min(groupSize, SMALL_SIZE)); + + log.debug("Face counts: small: {}, large: {}", smallFaceCount, largeFaceCount); + } + + private void initQueue() + { + long[] l = new long[1]; + clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, Sizeof.cl_long, Pointer.to(l), null); + + commandQueue = clCreateCommandQueue(context, device, l[0] & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, null); + log.debug("Created command_queue {}, properties {}", commandQueue, l[0] & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE); + } + + private cl_program compileProgram(String programSource) + { + log.trace("Compiling program:\n {}", programSource); + cl_program program = clCreateProgramWithSource(context, 1, new String[]{programSource}, null, null); + + try + { + clBuildProgram(program, 0, null, null, null, null); + } + catch (CLException e) + { + logBuildInfo(program, CL_PROGRAM_BUILD_LOG); + throw e; + } + + logBuildInfo(program, CL_PROGRAM_BUILD_STATUS); + logBuildInfo(program, CL_PROGRAM_BINARY_TYPE); + logBuildInfo(program, CL_PROGRAM_BUILD_OPTIONS); + logBuildInfo(program, CL_PROGRAM_BUILD_LOG); + return program; + } + + private cl_kernel getKernel(cl_program program, String kernelName) + { + cl_kernel kernel = clCreateKernel(program, kernelName, null); + log.debug("Loaded kernel {} for program {}", kernelName, program); + return kernel; + } + + private void compilePrograms() + { + Template templateSmall = new Template() + .addInclude(OpenCLManager.class) + .add(key -> key.equals("FACE_COUNT") ? ("#define FACE_COUNT " + smallFaceCount) : null); + Template templateLarge = new Template() + .addInclude(OpenCLManager.class) + .add(key -> key.equals("FACE_COUNT") ? ("#define FACE_COUNT " + largeFaceCount) : null); + + String unordered = new Template() + .addInclude(OpenCLManager.class) + .load("comp_unordered.cl"); + String small = templateSmall.load("comp.cl"); + String large = templateLarge.load("comp.cl"); + + programUnordered = compileProgram(unordered); + programSmall = compileProgram(small); + programLarge = compileProgram(large); + + kernelUnordered = getKernel(programUnordered, KERNEL_NAME_UNORDERED); + kernelSmall = getKernel(programSmall, KERNEL_NAME_LARGE); + kernelLarge = getKernel(programLarge, KERNEL_NAME_LARGE); + } + + void compute(int unorderedModels, int smallModels, int largeModels, + GLBuffer sceneVertexBuffer, + GLBuffer sceneUvBuffer, + GLBuffer vertexBuffer, + GLBuffer uvBuffer, + GLBuffer unorderedBuffer, + GLBuffer smallBuffer, + GLBuffer largeBuffer, + GLBuffer outVertexBuffer, + GLBuffer outUvBuffer, + GLBuffer uniformBuffer + ) + { + cl_mem[] glBuffersAll = { + sceneVertexBuffer.cl_mem, + sceneUvBuffer.cl_mem, + unorderedBuffer.cl_mem, + smallBuffer.cl_mem, + largeBuffer.cl_mem, + vertexBuffer.cl_mem, + uvBuffer.cl_mem, + outVertexBuffer.cl_mem, + outUvBuffer.cl_mem, + uniformBuffer.cl_mem, + }; + cl_mem[] glBuffers = Arrays.stream(glBuffersAll) + .filter(Objects::nonNull) + .toArray(cl_mem[]::new); + + cl_event acquireGLBuffers = new cl_event(); + clEnqueueAcquireGLObjects(commandQueue, glBuffers.length, glBuffers, 0, null, acquireGLBuffers); + + cl_event[] computeEvents = { + new cl_event(), + new cl_event(), + new cl_event() + }; + int numComputeEvents = 0; + + if (unorderedModels > 0) + { + clSetKernelArg(kernelUnordered, 0, Sizeof.cl_mem, unorderedBuffer.ptr()); + clSetKernelArg(kernelUnordered, 1, Sizeof.cl_mem, sceneVertexBuffer.ptr()); + clSetKernelArg(kernelUnordered, 2, Sizeof.cl_mem, vertexBuffer.ptr()); + clSetKernelArg(kernelUnordered, 3, Sizeof.cl_mem, sceneUvBuffer.ptr()); + clSetKernelArg(kernelUnordered, 4, Sizeof.cl_mem, uvBuffer.ptr()); + clSetKernelArg(kernelUnordered, 5, Sizeof.cl_mem, outVertexBuffer.ptr()); + clSetKernelArg(kernelUnordered, 6, Sizeof.cl_mem, outUvBuffer.ptr()); + + // queue compute call after acquireGLBuffers + clEnqueueNDRangeKernel(commandQueue, kernelUnordered, 1, null, + new long[]{unorderedModels * 6L}, new long[]{6}, 1, new cl_event[]{acquireGLBuffers}, computeEvents[numComputeEvents++]); + } + + if (smallModels > 0) + { + clSetKernelArg(kernelSmall, 0, (SHARED_SIZE + SMALL_SIZE) * Integer.BYTES, null); + clSetKernelArg(kernelSmall, 1, Sizeof.cl_mem, smallBuffer.ptr()); + clSetKernelArg(kernelSmall, 2, Sizeof.cl_mem, sceneVertexBuffer.ptr()); + clSetKernelArg(kernelSmall, 3, Sizeof.cl_mem, vertexBuffer.ptr()); + clSetKernelArg(kernelSmall, 4, Sizeof.cl_mem, sceneUvBuffer.ptr()); + clSetKernelArg(kernelSmall, 5, Sizeof.cl_mem, uvBuffer.ptr()); + clSetKernelArg(kernelSmall, 6, Sizeof.cl_mem, outVertexBuffer.ptr()); + clSetKernelArg(kernelSmall, 7, Sizeof.cl_mem, outUvBuffer.ptr()); + clSetKernelArg(kernelSmall, 8, Sizeof.cl_mem, uniformBuffer.ptr()); + + clEnqueueNDRangeKernel(commandQueue, kernelSmall, 1, null, + new long[]{smallModels * (SMALL_SIZE / smallFaceCount)}, new long[]{SMALL_SIZE / smallFaceCount}, 1, new cl_event[]{acquireGLBuffers}, computeEvents[numComputeEvents++]); + } + + if (largeModels > 0) + { + clSetKernelArg(kernelLarge, 0, (SHARED_SIZE + LARGE_SIZE) * Integer.BYTES, null); + clSetKernelArg(kernelLarge, 1, Sizeof.cl_mem, largeBuffer.ptr()); + clSetKernelArg(kernelLarge, 2, Sizeof.cl_mem, sceneVertexBuffer.ptr()); + clSetKernelArg(kernelLarge, 3, Sizeof.cl_mem, vertexBuffer.ptr()); + clSetKernelArg(kernelLarge, 4, Sizeof.cl_mem, sceneUvBuffer.ptr()); + clSetKernelArg(kernelLarge, 5, Sizeof.cl_mem, uvBuffer.ptr()); + clSetKernelArg(kernelLarge, 6, Sizeof.cl_mem, outVertexBuffer.ptr()); + clSetKernelArg(kernelLarge, 7, Sizeof.cl_mem, outUvBuffer.ptr()); + clSetKernelArg(kernelLarge, 8, Sizeof.cl_mem, uniformBuffer.ptr()); + + clEnqueueNDRangeKernel(commandQueue, kernelLarge, 1, null, + new long[]{(long) largeModels * (LARGE_SIZE / largeFaceCount)}, new long[]{LARGE_SIZE / largeFaceCount}, 1, new cl_event[]{acquireGLBuffers}, computeEvents[numComputeEvents++]); + } + + clEnqueueReleaseGLObjects(commandQueue, glBuffers.length, glBuffers, numComputeEvents, computeEvents, null); + } + + void finish() + { + clFinish(commandQueue); + } +} \ No newline at end of file diff --git a/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/cl_types.cl b/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/cl_types.cl new file mode 100644 index 0000000000..7f41acab49 --- /dev/null +++ b/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/cl_types.cl @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021, Adam + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +struct uniform { + int cameraYaw; + int cameraPitch; + int centerX; + int centerY; + int zoom; + int cameraX; + int cameraY; + int cameraZ; + int4 sinCosTable[2048]; +}; + +struct shared_data { + int totalNum[12]; // number of faces with a given priority + int totalDistance[12]; // sum of distances to faces of a given priority + int totalMappedNum[18]; // number of faces with a given adjusted priority + int min10; // minimum distance to a face of priority 10 + int dfs[0]; // packed face id and distance, size 512 for small, 4096 for large +}; + +struct modelinfo { + int offset; // offset into buffer + int uvOffset; // offset into uv buffer + int size; // length in faces + int idx; // write idx in target buffer + int flags; // radius, orientation + int x; // scene position x + int y; // scene position y + int z; // scene position z +}; diff --git a/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/common.cl b/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/common.cl new file mode 100644 index 0000000000..f499599cd2 --- /dev/null +++ b/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/common.cl @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2021, Adam + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#define PI 3.1415926535897932384626433832795f +#define UNIT PI / 1024.0f + +float3 toScreen(int4 vertex, int cameraYaw, int cameraPitch, int centerX, int centerY, int zoom) { + float yawSin = sin(cameraYaw * UNIT); + float yawCos = cos(cameraYaw * UNIT); + + float pitchSin = sin(cameraPitch * UNIT); + float pitchCos = cos(cameraPitch * UNIT); + + float rotatedX = (vertex.z * yawSin) + (vertex.x * yawCos); + float rotatedZ = (vertex.z * yawCos) - (vertex.x * yawSin); + + float var13 = (vertex.y * pitchCos) - (rotatedZ * pitchSin); + float var12 = (vertex.y * pitchSin) + (rotatedZ * pitchCos); + + float x = rotatedX * zoom / var12 + centerX; + float y = var13 * zoom / var12 + centerY; + float z = -var12; // in OpenGL depth is negative + + return (float3) (x, y, z); +} + +/* + * Rotate a vertex by a given orientation in JAU + */ +int4 rotate_vertex(__constant struct uniform *uni, int4 vertex, int orientation) { + int4 sinCos = uni->sinCosTable[orientation]; + int s = sinCos.x; + int c = sinCos.y; + int x = vertex.z * s + vertex.x * c >> 16; + int z = vertex.z * c - vertex.x * s >> 16; + return (int4)(x, vertex.y, z, vertex.w); +} + +/* + * Calculate the distance to a vertex given the camera angle + */ +int vertex_distance(int4 vertex, int cameraYaw, int cameraPitch) { + int yawSin = (int)(65536.0f * sin(cameraYaw * UNIT)); + int yawCos = (int)(65536.0f * cos(cameraYaw * UNIT)); + + int pitchSin = (int)(65536.0f * sin(cameraPitch * UNIT)); + int pitchCos = (int)(65536.0f * cos(cameraPitch * UNIT)); + + int j = vertex.z * yawCos - vertex.x * yawSin >> 16; + int l = vertex.y * pitchSin + j * pitchCos >> 16; + + return l; +} + +/* + * Calculate the distance to a face + */ +int face_distance(int4 vA, int4 vB, int4 vC, int cameraYaw, int cameraPitch) { + int dvA = vertex_distance(vA, cameraYaw, cameraPitch); + int dvB = vertex_distance(vB, cameraYaw, cameraPitch); + int dvC = vertex_distance(vC, cameraYaw, cameraPitch); + int faceDistance = (dvA + dvB + dvC) / 3; + return faceDistance; +} + +/* + * Test if a face is visible (not backward facing) + */ +bool face_visible(__constant struct uniform *uni, int4 vA, int4 vB, int4 vC, int4 position) { + // Move model to scene location, and account for camera offset + int4 cameraPos = (int4)(uni->cameraX, uni->cameraY, uni->cameraZ, 0); + vA += position - cameraPos; + vB += position - cameraPos; + vC += position - cameraPos; + + float3 sA = toScreen(vA, uni->cameraYaw, uni->cameraPitch, uni->centerX, uni->centerY, uni->zoom); + float3 sB = toScreen(vB, uni->cameraYaw, uni->cameraPitch, uni->centerX, uni->centerY, uni->zoom); + float3 sC = toScreen(vC, uni->cameraYaw, uni->cameraPitch, uni->centerX, uni->centerY, uni->zoom); + + return (sA.x - sB.x) * (sC.y - sB.y) - (sC.x - sB.x) * (sA.y - sB.y) > 0; +} + diff --git a/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/comp.cl b/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/comp.cl new file mode 100644 index 0000000000..1212123f25 --- /dev/null +++ b/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/comp.cl @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2021, Adam + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include FACE_COUNT + +#include cl_types.cl +#include to_screen.cl +#include common.cl +#include priority_render.cl + +__kernel +__attribute__((work_group_size_hint(256, 1, 1))) +void computeLarge( + __local struct shared_data *shared, + __global const struct modelinfo *ol, + __global const int4 *vb, + __global const int4 *tempvb, + __global const float4 *uv, + __global const float4 *tempuv, + __global int4 *vout, + __global float4 *uvout, + __constant struct uniform *uni) { + + size_t groupId = get_group_id(0); + size_t localId = get_local_id(0) * FACE_COUNT; + struct modelinfo minfo = ol[groupId]; + int4 pos = (int4)(minfo.x, minfo.y, minfo.z, 0); + + if (localId == 0) { + shared->min10 = 1600; + for (int i = 0; i < 12; ++i) { + shared->totalNum[i] = 0; + shared->totalDistance[i] = 0; + } + for (int i = 0; i < 18; ++i) { + shared->totalMappedNum[i] = 0; + } + } + + int prio[FACE_COUNT]; + int dis[FACE_COUNT]; + int4 v1[FACE_COUNT]; + int4 v2[FACE_COUNT]; + int4 v3[FACE_COUNT]; + + for (int i = 0; i < FACE_COUNT; i++) { + get_face(shared, uni, vb, tempvb, localId + i, minfo, uni->cameraYaw, uni->cameraPitch, &prio[i], &dis[i], &v1[i], &v2[i], &v3[i]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int i = 0; i < FACE_COUNT; i++) { + add_face_prio_distance(shared, uni, localId + i, minfo, v1[i], v2[i], v3[i], prio[i], dis[i], pos); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + int prioAdj[FACE_COUNT]; + int idx[FACE_COUNT]; + for (int i = 0; i < FACE_COUNT; i++) { + idx[i] = map_face_priority(shared, localId + i, minfo, prio[i], dis[i], &prioAdj[i]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int i = 0; i < FACE_COUNT; i++) { + insert_dfs(shared, localId + i, minfo, prioAdj[i], dis[i], idx[i]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int i = 0; i < FACE_COUNT; i++) { + sort_and_insert(shared, uv, tempuv, vout, uvout, localId + i, minfo, prioAdj[i], dis[i], v1[i], v2[i], v3[i]); + } +} diff --git a/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/comp_unordered.cl b/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/comp_unordered.cl new file mode 100644 index 0000000000..436f9a7d72 --- /dev/null +++ b/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/comp_unordered.cl @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2021, Adam + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include cl_types.cl + +__kernel +__attribute__((reqd_work_group_size(6, 1, 1))) +void computeUnordered(__global const struct modelinfo *ol, + __global const int4 *vb, + __global const int4 *tempvb, + __global const float4 *uv, + __global const float4 *tempuv, + __global int4 *vout, + __global float4 *uvout) { + size_t groupId = get_group_id(0); + size_t localId = get_local_id(0); + struct modelinfo minfo = ol[groupId]; + + int offset = minfo.offset; + int size = minfo.size; + int outOffset = minfo.idx; + int uvOffset = minfo.uvOffset; + int flags = minfo.flags; + int4 pos = (int4)(minfo.x, minfo.y, minfo.z, 0); + + if (localId >= size) { + return; + } + + uint ssboOffset = localId; + int4 thisA, thisB, thisC; + + // Grab triangle vertices from the correct buffer + if (flags < 0) { + thisA = vb[offset + ssboOffset * 3]; + thisB = vb[offset + ssboOffset * 3 + 1]; + thisC = vb[offset + ssboOffset * 3 + 2]; + } else { + thisA = tempvb[offset + ssboOffset * 3]; + thisB = tempvb[offset + ssboOffset * 3 + 1]; + thisC = tempvb[offset + ssboOffset * 3 + 2]; + } + + uint myOffset = localId; + + // position vertices in scene and write to out buffer + vout[outOffset + myOffset * 3] = pos + thisA; + vout[outOffset + myOffset * 3 + 1] = pos + thisB; + vout[outOffset + myOffset * 3 + 2] = pos + thisC; + + if (uvOffset < 0) { + uvout[outOffset + myOffset * 3] = (float4)(0.0f, 0.0f, 0.0f, 0.0f); + uvout[outOffset + myOffset * 3 + 1] = (float4)(0.0f, 0.0f, 0.0f, 0.0f); + uvout[outOffset + myOffset * 3 + 2] = (float4)(0.0f, 0.0f, 0.0f, 0.0f); + } else if (flags >= 0) { + uvout[outOffset + myOffset * 3] = tempuv[uvOffset + localId * 3]; + uvout[outOffset + myOffset * 3 + 1] = tempuv[uvOffset + localId * 3 + 1]; + uvout[outOffset + myOffset * 3 + 2] = tempuv[uvOffset + localId * 3 + 2]; + } else { + uvout[outOffset + myOffset * 3] = uv[uvOffset + localId * 3]; + uvout[outOffset + myOffset * 3 + 1] = uv[uvOffset + localId * 3 + 1]; + uvout[outOffset + myOffset * 3 + 2] = uv[uvOffset + localId * 3 + 2]; + } +} diff --git a/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/priority_render.cl b/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/priority_render.cl new file mode 100644 index 0000000000..6f1a04470c --- /dev/null +++ b/runelite-client/src/main/resources/net/runelite/client/plugins/gpu/priority_render.cl @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2021, Adam + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// Calculate adjusted priority for a face with a given priority, distance, and +// model global min10 and face distance averages. This allows positioning faces +// with priorities 10/11 into the correct 'slots' resulting in 18 possible +// adjusted priorities +int priority_map(int p, int distance, int _min10, int avg1, int avg2, int avg3) { + // (10, 11) 0 1 2 (10, 11) 3 4 (10, 11) 5 6 7 8 9 (10, 11) + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 + switch (p) { + case 0: return 2; + case 1: return 3; + case 2: return 4; + case 3: return 7; + case 4: return 8; + case 5: return 11; + case 6: return 12; + case 7: return 13; + case 8: return 14; + case 9: return 15; + case 10: + if (distance > avg1) { + return 0; + } else if (distance > avg2) { + return 5; + } else if (distance > avg3) { + return 9; + } else { + return 16; + } + case 11: + if (distance > avg1 && _min10 > avg1) { + return 1; + } else if (distance > avg2 && (_min10 > avg1 || _min10 > avg2)) { + return 6; + } else if (distance > avg3 && (_min10 > avg1 || _min10 > avg2 || _min10 > avg3)) { + return 10; + } else { + return 17; + } + default: + return -1; + } +} + +// calculate the number of faces with a lower adjusted priority than +// the given adjusted priority +int count_prio_offset(__local struct shared_data *shared, int priority) { + int total = 0; + switch (priority) { + case 17: + total += shared->totalMappedNum[16]; + case 16: + total += shared->totalMappedNum[15]; + case 15: + total += shared->totalMappedNum[14]; + case 14: + total += shared->totalMappedNum[13]; + case 13: + total += shared->totalMappedNum[12]; + case 12: + total += shared->totalMappedNum[11]; + case 11: + total += shared->totalMappedNum[10]; + case 10: + total += shared->totalMappedNum[9]; + case 9: + total += shared->totalMappedNum[8]; + case 8: + total += shared->totalMappedNum[7]; + case 7: + total += shared->totalMappedNum[6]; + case 6: + total += shared->totalMappedNum[5]; + case 5: + total += shared->totalMappedNum[4]; + case 4: + total += shared->totalMappedNum[3]; + case 3: + total += shared->totalMappedNum[2]; + case 2: + total += shared->totalMappedNum[1]; + case 1: + total += shared->totalMappedNum[0]; + case 0: + return total; + } +} + +void get_face( + __local struct shared_data *shared, + __constant struct uniform *uni, + __global const int4 *vb, + __global const int4 *tempvb, + uint localId, struct modelinfo minfo, int cameraYaw, int cameraPitch, + /* out */ int *prio, int *dis, int4 *o1, int4 *o2, int4 *o3) { + int size = minfo.size; + int offset = minfo.offset; + int flags = minfo.flags; + uint ssboOffset; + + if (localId < size) { + ssboOffset = localId; + } else { + ssboOffset = 0; + } + + int4 thisA; + int4 thisB; + int4 thisC; + + // Grab triangle vertices from the correct buffer + if (flags < 0) { + thisA = vb[offset + ssboOffset * 3]; + thisB = vb[offset + ssboOffset * 3 + 1]; + thisC = vb[offset + ssboOffset * 3 + 2]; + } else { + thisA = tempvb[offset + ssboOffset * 3]; + thisB = tempvb[offset + ssboOffset * 3 + 1]; + thisC = tempvb[offset + ssboOffset * 3 + 2]; + } + + if (localId < size) { + int radius = (flags & 0x7fffffff) >> 12; + int orientation = flags & 0x7ff; + + // rotate for model orientation + int4 thisrvA = rotate_vertex(uni, thisA, orientation); + int4 thisrvB = rotate_vertex(uni, thisB, orientation); + int4 thisrvC = rotate_vertex(uni, thisC, orientation); + + // calculate distance to face + int thisPriority = (thisA.w >> 16) & 0xff;// all vertices on the face have the same priority + int thisDistance; + if (radius == 0) { + thisDistance = 0; + } else { + thisDistance = face_distance(thisrvA, thisrvB, thisrvC, cameraYaw, cameraPitch) + radius; + } + + *o1 = thisrvA; + *o2 = thisrvB; + *o3 = thisrvC; + + *prio = thisPriority; + *dis = thisDistance; + } else { + *o1 = (int4)(0, 0, 0, 0); + *o2 = (int4)(0, 0, 0, 0); + *o3 = (int4)(0, 0, 0, 0); + *prio = 0; + *dis = 0; + } +} + +void add_face_prio_distance( + __local struct shared_data *shared, + __constant struct uniform *uni, + uint localId, struct modelinfo minfo, int4 thisrvA, int4 thisrvB, int4 thisrvC, int thisPriority, int thisDistance, int4 pos) { + if (localId < minfo.size) { + // if the face is not culled, it is calculated into priority distance averages + if (face_visible(uni, thisrvA, thisrvB, thisrvC, pos)) { + atomic_add(&shared->totalNum[thisPriority], 1); + atomic_add(&shared->totalDistance[thisPriority], thisDistance); + + // calculate minimum distance to any face of priority 10 for positioning the 11 faces later + if (thisPriority == 10) { + atomic_min(&shared->min10, thisDistance); + } + } + } +} + +int map_face_priority(__local struct shared_data *shared, uint localId, struct modelinfo minfo, int thisPriority, int thisDistance, int *prio) { + int size = minfo.size; + + // Compute average distances for 0/2, 3/4, and 6/8 + + if (localId < size) { + int avg1 = 0; + int avg2 = 0; + int avg3 = 0; + + if (shared->totalNum[1] > 0 || shared->totalNum[2] > 0) { + avg1 = (shared->totalDistance[1] + shared->totalDistance[2]) / (shared->totalNum[1] + shared->totalNum[2]); + } + + if (shared->totalNum[3] > 0 || shared->totalNum[4] > 0) { + avg2 = (shared->totalDistance[3] + shared->totalDistance[4]) / (shared->totalNum[3] + shared->totalNum[4]); + } + + if (shared->totalNum[6] > 0 || shared->totalNum[8] > 0) { + avg3 = (shared->totalDistance[6] + shared->totalDistance[8]) / (shared->totalNum[6] + shared->totalNum[8]); + } + + int adjPrio = priority_map(thisPriority, thisDistance, shared->min10, avg1, avg2, avg3); + int prioIdx = atomic_add(&shared->totalMappedNum[adjPrio], 1); + + *prio = adjPrio; + return prioIdx; + } + + *prio = 0; + return 0; +} + +void insert_dfs(__local struct shared_data *shared, uint localId, struct modelinfo minfo, int adjPrio, int distance, int prioIdx) { + int size = minfo.size; + + if (localId < size) { + // calculate base offset into dfs based on number of faces with a lower priority + int baseOff = count_prio_offset(shared, adjPrio); + // store into face array offset array by unique index + shared->dfs[baseOff + prioIdx] = ((int) localId << 16) | distance; + } +} + +void sort_and_insert( + __local struct shared_data *shared, + __global const float4 *uv, + __global const float4 *tempuv, + __global int4 *vout, + __global float4 *uvout, + uint localId, struct modelinfo minfo, int thisPriority, int thisDistance, int4 thisrvA, int4 thisrvB, int4 thisrvC) { + /* compute face distance */ + int size = minfo.size; + + if (localId < size) { + int outOffset = minfo.idx; + int uvOffset = minfo.uvOffset; + int flags = minfo.flags; + int4 pos = (int4)(minfo.x, minfo.y, minfo.z, 0); + + const int priorityOffset = count_prio_offset(shared, thisPriority); + const int numOfPriority = shared->totalMappedNum[thisPriority]; + int start = priorityOffset; // index of first face with this priority + int end = priorityOffset + numOfPriority; // index of last face with this priority + int myOffset = priorityOffset; + + // we only have to order faces against others of the same priority + // calculate position this face will be in + for (int i = start; i < end; ++i) { + int d1 = shared->dfs[i]; + int theirId = d1 >> 16; + int theirDistance = d1 & 0xffff; + + // the closest faces draw last, so have the highest index + // if two faces have the same distance, the one with the + // higher id draws last + if ((theirDistance > thisDistance) + || (theirDistance == thisDistance && theirId < localId)) { + ++myOffset; + } + } + + // position vertices in scene and write to out buffer + vout[outOffset + myOffset * 3] = pos + thisrvA; + vout[outOffset + myOffset * 3 + 1] = pos + thisrvB; + vout[outOffset + myOffset * 3 + 2] = pos + thisrvC; + + if (uvOffset < 0) { + uvout[outOffset + myOffset * 3] = (float4)(0, 0, 0, 0); + uvout[outOffset + myOffset * 3 + 1] = (float4)(0, 0, 0, 0); + uvout[outOffset + myOffset * 3 + 2] = (float4)(0, 0, 0, 0); + } else if (flags >= 0) { + uvout[outOffset + myOffset * 3] = tempuv[uvOffset + localId * 3]; + uvout[outOffset + myOffset * 3 + 1] = tempuv[uvOffset + localId * 3 + 1]; + uvout[outOffset + myOffset * 3 + 2] = tempuv[uvOffset + localId * 3 + 2]; + } else { + uvout[outOffset + myOffset * 3] = uv[uvOffset + localId * 3]; + uvout[outOffset + myOffset * 3 + 1] = uv[uvOffset + localId * 3 + 1]; + uvout[outOffset + myOffset * 3 + 2] = uv[uvOffset + localId * 3 + 2]; + } + } +}