gpu: add opencl support for macos

This allows using opencl as an alternative to opengl compute shaders on
macos, which does not support compute shaders. Now, macos can finally
use the extended draw distance feature of the gpu plugin.

This also includes code for using opencl with Windows and Linux if we
want to enable that in the future. A copy of the existing compute
shaders have been checked in and ported to opencl, keeping support for
opengl compute shaders on Windows and Linux.

Co-authored-by: Paul Norton <napkinorton@gmail.com>
This commit is contained in:
Adam
2021-02-10 21:01:53 -05:00
parent 26f26308ab
commit 13efaa6a0c
9 changed files with 1445 additions and 175 deletions

View File

@@ -177,6 +177,25 @@
<version>2.4.0-rc-20210117</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>net.runelite.jocl</groupId>
<artifactId>jocl</artifactId>
<version>1.0</version>
</dependency>
<dependency>
<groupId>net.runelite.jocl</groupId>
<artifactId>jocl</artifactId>
<version>1.0</version>
<classifier>macos-x64</classifier>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>net.runelite.jocl</groupId>
<artifactId>jocl</artifactId>
<version>1.0</version>
<classifier>macos-arm64</classifier>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>net.runelite</groupId>
<artifactId>archive-patcher</artifactId>

View File

@@ -0,0 +1,40 @@
/*
* Copyright (c) 2021, Adam <Adam@sigterm.info>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package net.runelite.client.plugins.gpu;
import org.jocl.Pointer;
import org.jocl.cl_mem;
class GLBuffer
{
int glBufferId = -1;
int size = -1;
cl_mem cl_mem;
Pointer ptr()
{
return cl_mem != null ? Pointer.to(cl_mem) : null;
}
}

View File

@@ -29,6 +29,11 @@ import com.google.inject.Provides;
import com.jogamp.nativewindow.awt.AWTGraphicsConfiguration;
import com.jogamp.nativewindow.awt.JAWTWindow;
import com.jogamp.opengl.GL;
import static com.jogamp.opengl.GL.GL_ARRAY_BUFFER;
import static com.jogamp.opengl.GL.GL_DYNAMIC_DRAW;
import static com.jogamp.opengl.GL2ES2.GL_STREAM_DRAW;
import static com.jogamp.opengl.GL2ES3.GL_STATIC_COPY;
import static com.jogamp.opengl.GL2ES3.GL_UNIFORM_BUFFER;
import com.jogamp.opengl.GL4;
import com.jogamp.opengl.GLCapabilities;
import com.jogamp.opengl.GLContext;
@@ -45,6 +50,7 @@ import java.awt.Image;
import java.awt.geom.AffineTransform;
import java.awt.image.BufferedImage;
import java.awt.image.DataBufferInt;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.FloatBuffer;
@@ -93,6 +99,10 @@ import net.runelite.client.plugins.gpu.config.UIScalingMode;
import net.runelite.client.plugins.gpu.template.Template;
import net.runelite.client.ui.DrawManager;
import net.runelite.client.util.OSType;
import org.jocl.CL;
import static org.jocl.CL.CL_MEM_READ_ONLY;
import static org.jocl.CL.CL_MEM_WRITE_ONLY;
import static org.jocl.CL.clCreateFromGLBuffer;
@PluginDescriptor(
name = "GPU",
@@ -105,8 +115,8 @@ import net.runelite.client.util.OSType;
public class GpuPlugin extends Plugin implements DrawCallbacks
{
// This is the maximum number of triangles the compute shaders support
private static final int MAX_TRIANGLE = 4096;
private static final int SMALL_TRIANGLE_COUNT = 512;
static final int MAX_TRIANGLE = 4096;
static final int SMALL_TRIANGLE_COUNT = 512;
private static final int FLAG_SCENE_BUFFER = Integer.MIN_VALUE;
private static final int DEFAULT_DISTANCE = 25;
static final int MAX_DISTANCE = 90;
@@ -115,6 +125,9 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
@Inject
private Client client;
@Inject
private OpenCLManager openCLManager;
@Inject
private ClientThread clientThread;
@@ -133,7 +146,14 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
@Inject
private PluginManager pluginManager;
private boolean useComputeShaders;
enum ComputeMode
{
NONE,
OPENGL,
OPENCL
}
private ComputeMode computeMode = ComputeMode.NONE;
private Canvas canvas;
private JAWTWindow jawtWindow;
@@ -182,23 +202,22 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
private int texSceneHandle;
private int rboSceneHandle;
// scene vertex buffer id
private int bufferId;
// scene uv buffer id
private int uvBufferId;
// scene vertex buffer
private final GLBuffer sceneVertexBuffer = new GLBuffer();
// scene uv buffer
private final GLBuffer sceneUvBuffer = new GLBuffer();
private int tmpBufferId; // temporary scene vertex buffer
private int tmpUvBufferId; // temporary scene uv buffer
private int tmpModelBufferId; // scene model buffer, large
private int tmpModelBufferSmallId; // scene model buffer, small
private int tmpModelBufferUnorderedId;
private int tmpOutBufferId; // target vertex buffer for compute shaders
private int tmpOutUvBufferId; // target uv buffer for compute shaders
private final GLBuffer tmpVertexBuffer = new GLBuffer(); // temporary scene vertex buffer
private final GLBuffer tmpUvBuffer = new GLBuffer(); // temporary scene uv buffer
private final GLBuffer tmpModelBufferLarge = new GLBuffer(); // scene model buffer, large
private final GLBuffer tmpModelBufferSmall = new GLBuffer(); // scene model buffer, small
private final GLBuffer tmpModelBufferUnordered = new GLBuffer(); // scene model buffer, unordered
private final GLBuffer tmpOutBuffer = new GLBuffer(); // target vertex buffer for compute shaders
private final GLBuffer tmpOutUvBuffer = new GLBuffer(); // target uv buffer for compute shaders
private int textureArrayId;
private int uniformBufferId;
private final IntBuffer uniformBuffer = GpuIntBuffer.allocateDirect(5 + 3 + 2048 * 4);
private final GLBuffer uniformBuffer = new GLBuffer();
private final float[] textureOffsets = new float[128];
private GpuIntBuffer vertexBuffer;
@@ -278,7 +297,6 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
{
try
{
bufferId = uvBufferId = uniformBufferId = tmpBufferId = tmpUvBufferId = tmpModelBufferId = tmpModelBufferSmallId = tmpModelBufferUnorderedId = tmpOutBufferId = tmpOutUvBufferId = -1;
texSceneHandle = fboSceneHandle = rboSceneHandle = -1; // AA FBO
unorderedModels = smallModels = largeModels = 0;
drawingModel = false;
@@ -290,8 +308,9 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
return false;
}
// OSX supports up to OpenGL 4.1, however 4.3 is required for compute shaders
useComputeShaders = config.useComputeShaders() && OSType.getOSType() != OSType.MacOS;
computeMode = config.useComputeShaders()
? (OSType.getOSType() == OSType.MacOS ? ComputeMode.OPENCL : ComputeMode.OPENGL)
: ComputeMode.NONE;
canvas.setIgnoreRepaint(true);
@@ -397,7 +416,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
if (client.getGameState() == GameState.LOGGED_IN)
{
uploadScene();
invokeOnMainThread(this::uploadScene);
}
}
catch (Throwable e)
@@ -433,6 +452,8 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
invokeOnMainThread(() ->
{
openCLManager.cleanup();
if (gl != null)
{
if (textureArrayId != -1)
@@ -441,11 +462,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
textureArrayId = -1;
}
if (uniformBufferId != -1)
{
glDeleteBuffer(gl, uniformBufferId);
uniformBufferId = -1;
}
destroyGlBuffer(uniformBuffer);
shutdownBuffers();
shutdownInterfaceTexture();
@@ -519,12 +536,16 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
glProgram = PROGRAM.compile(gl, template);
glUiProgram = UI_PROGRAM.compile(gl, template);
if (useComputeShaders)
if (computeMode == ComputeMode.OPENGL)
{
glComputeProgram = COMPUTE_PROGRAM.compile(gl, template);
glSmallComputeProgram = SMALL_COMPUTE_PROGRAM.compile(gl, template);
glUnorderedComputeProgram = UNORDERED_COMPUTE_PROGRAM.compile(gl, template);
}
else if (computeMode == ComputeMode.OPENCL)
{
openCLManager.init(gl);
}
initUniforms();
}
@@ -593,8 +614,8 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
-1f, 1f, 0.0f, 0.0f, 0f // top left
});
vboUiBuf.rewind();
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vboUiHandle);
gl.glBufferData(gl.GL_ARRAY_BUFFER, vboUiBuf.capacity() * Float.BYTES, vboUiBuf, gl.GL_STATIC_DRAW);
gl.glBindBuffer(GL_ARRAY_BUFFER, vboUiHandle);
gl.glBufferData(GL_ARRAY_BUFFER, vboUiBuf.capacity() * Float.BYTES, vboUiBuf, gl.GL_STATIC_DRAW);
// position attribute
gl.glVertexAttribPointer(0, 3, gl.GL_FLOAT, false, 5 * Float.BYTES, 0);
@@ -605,7 +626,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
gl.glEnableVertexAttribArray(1);
// unbind VBO
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0);
gl.glBindBuffer(GL_ARRAY_BUFFER, 0);
}
private void shutdownVao()
@@ -622,71 +643,49 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
private void initBuffers()
{
bufferId = glGenBuffers(gl);
uvBufferId = glGenBuffers(gl);
tmpBufferId = glGenBuffers(gl);
tmpUvBufferId = glGenBuffers(gl);
tmpModelBufferId = glGenBuffers(gl);
tmpModelBufferSmallId = glGenBuffers(gl);
tmpModelBufferUnorderedId = glGenBuffers(gl);
tmpOutBufferId = glGenBuffers(gl);
tmpOutUvBufferId = glGenBuffers(gl);
initGlBuffer(sceneVertexBuffer);
initGlBuffer(sceneUvBuffer);
initGlBuffer(tmpVertexBuffer);
initGlBuffer(tmpUvBuffer);
initGlBuffer(tmpModelBufferLarge);
initGlBuffer(tmpModelBufferSmall);
initGlBuffer(tmpModelBufferUnordered);
initGlBuffer(tmpOutBuffer);
initGlBuffer(tmpOutUvBuffer);
}
private void initGlBuffer(GLBuffer glBuffer)
{
glBuffer.glBufferId = glGenBuffers(gl);
}
private void shutdownBuffers()
{
if (bufferId != -1)
{
glDeleteBuffer(gl, bufferId);
bufferId = -1;
}
destroyGlBuffer(sceneVertexBuffer);
destroyGlBuffer(sceneUvBuffer);
if (uvBufferId != -1)
{
glDeleteBuffer(gl, uvBufferId);
uvBufferId = -1;
}
destroyGlBuffer(tmpVertexBuffer);
destroyGlBuffer(tmpUvBuffer);
destroyGlBuffer(tmpModelBufferLarge);
destroyGlBuffer(tmpModelBufferSmall);
destroyGlBuffer(tmpModelBufferUnordered);
destroyGlBuffer(tmpOutBuffer);
destroyGlBuffer(tmpOutUvBuffer);
}
if (tmpBufferId != -1)
private void destroyGlBuffer(GLBuffer glBuffer)
{
if (glBuffer.glBufferId != -1)
{
glDeleteBuffer(gl, tmpBufferId);
tmpBufferId = -1;
glDeleteBuffer(gl, glBuffer.glBufferId);
glBuffer.glBufferId = -1;
}
glBuffer.size = -1;
if (tmpUvBufferId != -1)
if (glBuffer.cl_mem != null)
{
glDeleteBuffer(gl, tmpUvBufferId);
tmpUvBufferId = -1;
}
if (tmpModelBufferId != -1)
{
glDeleteBuffer(gl, tmpModelBufferId);
tmpModelBufferId = -1;
}
if (tmpModelBufferSmallId != -1)
{
glDeleteBuffer(gl, tmpModelBufferSmallId);
tmpModelBufferSmallId = -1;
}
if (tmpModelBufferUnorderedId != -1)
{
glDeleteBuffer(gl, tmpModelBufferUnorderedId);
tmpModelBufferUnorderedId = -1;
}
if (tmpOutBufferId != -1)
{
glDeleteBuffer(gl, tmpOutBufferId);
tmpOutBufferId = -1;
}
if (tmpOutUvBufferId != -1)
{
glDeleteBuffer(gl, tmpOutUvBufferId);
tmpOutUvBufferId = -1;
CL.clReleaseMemObject(glBuffer.cl_mem);
glBuffer.cl_mem = null;
}
}
@@ -709,21 +708,21 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
private void initUniformBuffer()
{
uniformBufferId = glGenBuffers(gl);
gl.glBindBuffer(gl.GL_UNIFORM_BUFFER, uniformBufferId);
uniformBuffer.clear();
uniformBuffer.put(new int[8]);
initGlBuffer(uniformBuffer);
IntBuffer uniformBuf = GpuIntBuffer.allocateDirect(8 + 2048 * 4);
uniformBuf.put(new int[8]); // uniform block
final int[] pad = new int[2];
for (int i = 0; i < 2048; i++)
{
uniformBuffer.put(Perspective.SINE[i]);
uniformBuffer.put(Perspective.COSINE[i]);
uniformBuffer.put(pad);
uniformBuf.put(Perspective.SINE[i]);
uniformBuf.put(Perspective.COSINE[i]);
uniformBuf.put(pad); // ivec2 alignment in std140 is 16 bytes
}
uniformBuffer.flip();
uniformBuf.flip();
gl.glBufferData(gl.GL_UNIFORM_BUFFER, uniformBuffer.limit() * Integer.BYTES, uniformBuffer, gl.GL_DYNAMIC_DRAW);
gl.glBindBuffer(gl.GL_UNIFORM_BUFFER, 0);
updateBuffer(uniformBuffer, GL_UNIFORM_BUFFER, uniformBuf.limit() * Integer.BYTES, uniformBuf, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY);
gl.glBindBuffer(GL_UNIFORM_BUFFER, 0);
}
private void initAAFbo(int width, int height, int aaSamples)
@@ -785,9 +784,11 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
invokeOnMainThread(() ->
{
// UBO. Only the first 32 bytes get modified here, the rest is the constant sin/cos table.
gl.glBindBuffer(gl.GL_UNIFORM_BUFFER, uniformBufferId);
uniformBuffer.clear();
uniformBuffer
// We can reuse the vertex buffer since it isn't used yet.
vertexBuffer.clear();
vertexBuffer.ensureCapacity(32);
IntBuffer uniformBuf = vertexBuffer.getBuffer();
uniformBuf
.put(yaw)
.put(pitch)
.put(client.getCenterX())
@@ -796,12 +797,14 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
.put(cameraX)
.put(cameraY)
.put(cameraZ);
uniformBuffer.flip();
uniformBuf.flip();
gl.glBufferSubData(gl.GL_UNIFORM_BUFFER, 0, uniformBuffer.limit() * Integer.BYTES, uniformBuffer);
gl.glBindBuffer(gl.GL_UNIFORM_BUFFER, 0);
gl.glBindBuffer(GL_UNIFORM_BUFFER, uniformBuffer.glBufferId);
gl.glBufferSubData(GL_UNIFORM_BUFFER, 0, uniformBuf.limit() * Integer.BYTES, uniformBuf);
gl.glBindBuffer(GL_UNIFORM_BUFFER, 0);
gl.glBindBufferBase(gl.GL_UNIFORM_BUFFER, 0, uniformBufferId);
gl.glBindBufferBase(GL_UNIFORM_BUFFER, 0, uniformBuffer.glBufferId);
uniformBuf.clear();
});
}
@@ -813,7 +816,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
private void postDraw()
{
if (!useComputeShaders)
if (computeMode == ComputeMode.NONE)
{
// Upload buffers
vertexBuffer.flip();
@@ -822,12 +825,8 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
IntBuffer vertexBuffer = this.vertexBuffer.getBuffer();
FloatBuffer uvBuffer = this.uvBuffer.getBuffer();
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpBufferId);
gl.glBufferData(gl.GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, gl.GL_DYNAMIC_DRAW);
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpUvBufferId);
gl.glBufferData(gl.GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, gl.GL_DYNAMIC_DRAW);
updateBuffer(tmpVertexBuffer, GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, GL_DYNAMIC_DRAW, 0L);
updateBuffer(tmpUvBuffer, GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, GL_DYNAMIC_DRAW, 0L);
return;
}
@@ -844,79 +843,91 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
IntBuffer modelBufferSmall = this.modelBufferSmall.getBuffer();
IntBuffer modelBufferUnordered = this.modelBufferUnordered.getBuffer();
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpBufferId);
gl.glBufferData(gl.GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, gl.GL_DYNAMIC_DRAW);
// temp buffers
updateBuffer(tmpVertexBuffer, GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY);
updateBuffer(tmpUvBuffer, GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY);
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpUvBufferId);
gl.glBufferData(gl.GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, gl.GL_DYNAMIC_DRAW);
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpModelBufferId);
gl.glBufferData(gl.GL_ARRAY_BUFFER, modelBuffer.limit() * Integer.BYTES, modelBuffer, gl.GL_DYNAMIC_DRAW);
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpModelBufferSmallId);
gl.glBufferData(gl.GL_ARRAY_BUFFER, modelBufferSmall.limit() * Integer.BYTES, modelBufferSmall, gl.GL_DYNAMIC_DRAW);
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpModelBufferUnorderedId);
gl.glBufferData(gl.GL_ARRAY_BUFFER, modelBufferUnordered.limit() * Integer.BYTES, modelBufferUnordered, gl.GL_DYNAMIC_DRAW);
// model buffers
updateBuffer(tmpModelBufferLarge, GL_ARRAY_BUFFER, modelBuffer.limit() * Integer.BYTES, modelBuffer, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY);
updateBuffer(tmpModelBufferSmall, GL_ARRAY_BUFFER, modelBufferSmall.limit() * Integer.BYTES, modelBufferSmall, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY);
updateBuffer(tmpModelBufferUnordered, GL_ARRAY_BUFFER, modelBufferUnordered.limit() * Integer.BYTES, modelBufferUnordered, GL_DYNAMIC_DRAW, CL_MEM_READ_ONLY);
// Output buffers
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpOutBufferId);
gl.glBufferData(gl.GL_ARRAY_BUFFER,
updateBuffer(tmpOutBuffer,
GL_ARRAY_BUFFER,
targetBufferOffset * 16, // each vertex is an ivec4, which is 16 bytes
null,
gl.GL_STREAM_DRAW);
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, tmpOutUvBufferId);
gl.glBufferData(gl.GL_ARRAY_BUFFER,
targetBufferOffset * 16,
GL_STREAM_DRAW,
CL_MEM_WRITE_ONLY);
updateBuffer(tmpOutUvBuffer,
GL_ARRAY_BUFFER,
targetBufferOffset * 16, // each vertex is an ivec4, which is 16 bytes
null,
gl.GL_STREAM_DRAW);
GL_STREAM_DRAW,
CL_MEM_WRITE_ONLY);
// Bind UBO to compute programs
gl.glUniformBlockBinding(glSmallComputeProgram, uniBlockSmall, 0);
gl.glUniformBlockBinding(glComputeProgram, uniBlockLarge, 0);
if (computeMode == ComputeMode.OPENCL)
{
// The docs for clEnqueueAcquireGLObjects say all pending GL operations must be completed before calling
// clEnqueueAcquireGLObjects, and recommends calling glFinish() as the only portable way to do that.
// However no issues have been observed from not calling it, and so will leave disabled for now.
// gl.glFinish();
openCLManager.compute(
unorderedModels, smallModels, largeModels,
sceneVertexBuffer, sceneUvBuffer,
tmpVertexBuffer, tmpUvBuffer,
tmpModelBufferUnordered, tmpModelBufferSmall, tmpModelBufferLarge,
tmpOutBuffer, tmpOutUvBuffer,
uniformBuffer);
return;
}
/*
* Compute is split into three separate programs: 'unordered', 'small', and 'large'
* to save on GPU resources. Small will sort <= 512 faces, large will do <= 4096.
*/
// Bind UBO to compute programs
gl.glUniformBlockBinding(glSmallComputeProgram, uniBlockSmall, 0);
gl.glUniformBlockBinding(glComputeProgram, uniBlockLarge, 0);
// unordered
gl.glUseProgram(glUnorderedComputeProgram);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferUnorderedId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, this.bufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, this.uvBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferUnordered.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, sceneVertexBuffer.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpVertexBuffer.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBuffer.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBuffer.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, sceneUvBuffer.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBuffer.glBufferId);
gl.glDispatchCompute(unorderedModels, 1, 1);
// small
gl.glUseProgram(glSmallComputeProgram);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferSmallId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, this.bufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, this.uvBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferSmall.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, sceneVertexBuffer.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpVertexBuffer.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBuffer.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBuffer.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, sceneUvBuffer.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBuffer.glBufferId);
gl.glDispatchCompute(smallModels, 1, 1);
// large
gl.glUseProgram(glComputeProgram);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, this.bufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, this.uvBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 0, tmpModelBufferLarge.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 1, sceneVertexBuffer.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 2, tmpVertexBuffer.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 3, tmpOutBuffer.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 4, tmpOutUvBuffer.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 5, sceneUvBuffer.glBufferId);
gl.glBindBufferBase(gl.GL_SHADER_STORAGE_BUFFER, 6, tmpUvBuffer.glBufferId);
gl.glDispatchCompute(largeModels, 1, 1);
}
@@ -926,7 +937,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
SceneTilePaint paint, int tileZ, int tileX, int tileY,
int zoom, int centerX, int centerY)
{
if (!useComputeShaders)
if (computeMode == ComputeMode.NONE)
{
targetBufferOffset += sceneUploader.upload(paint,
tileZ, tileX, tileY,
@@ -963,7 +974,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
SceneTileModel model, int tileZ, int tileX, int tileY,
int zoom, int centerX, int centerY)
{
if (!useComputeShaders)
if (computeMode == ComputeMode.NONE)
{
targetBufferOffset += sceneUploader.upload(model,
tileX, tileY,
@@ -1131,7 +1142,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
// Ceil the sizes because even if the size is 599.1 we want to treat it as size 600 (i.e. render to the x=599 pixel).
renderViewportHeight = (int) Math.ceil(scaleFactorY * (renderViewportHeight)) + padding * 2;
renderViewportWidth = (int) Math.ceil(scaleFactorX * (renderViewportWidth )) + padding * 2;
renderViewportWidth = (int) Math.ceil(scaleFactorX * (renderViewportWidth )) + padding * 2;
// Floor the offsets because even if the offset is 4.9, we want to render to the x=4 pixel anyway.
renderHeightOff = (int) Math.floor(scaleFactorY * (renderHeightOff)) - padding;
@@ -1195,27 +1206,36 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
gl.glBindVertexArray(vaoHandle);
int vertexBuffer, uvBuffer;
if (useComputeShaders)
if (computeMode != ComputeMode.NONE)
{
// Before reading the SSBOs written to from postDrawScene() we must insert a barrier
gl.glMemoryBarrier(gl.GL_SHADER_STORAGE_BARRIER_BIT);
if (computeMode == ComputeMode.OPENGL)
{
// Before reading the SSBOs written to from postDrawScene() we must insert a barrier
gl.glMemoryBarrier(gl.GL_SHADER_STORAGE_BARRIER_BIT);
}
else
{
// Wait for the command queue to finish, so that we know the compute is done
openCLManager.finish();
}
// Draw using the output buffer of the compute
vertexBuffer = tmpOutBufferId;
uvBuffer = tmpOutUvBufferId;
vertexBuffer = tmpOutBuffer.glBufferId;
uvBuffer = tmpOutUvBuffer.glBufferId;
}
else
{
// Only use the temporary buffers, which will contain the full scene
vertexBuffer = tmpBufferId;
uvBuffer = tmpUvBufferId;
vertexBuffer = tmpVertexBuffer.glBufferId;
uvBuffer = tmpUvBuffer.glBufferId;
}
gl.glEnableVertexAttribArray(0);
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vertexBuffer);
gl.glBindBuffer(GL_ARRAY_BUFFER, vertexBuffer);
gl.glVertexAttribIPointer(0, 4, gl.GL_INT, 0, 0);
gl.glEnableVertexAttribArray(1);
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, uvBuffer);
gl.glBindBuffer(GL_ARRAY_BUFFER, uvBuffer);
gl.glVertexAttribPointer(1, 4, gl.GL_FLOAT, false, 0, 0);
gl.glDrawArrays(gl.GL_TRIANGLES, 0, targetBufferOffset);
@@ -1400,12 +1420,12 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
@Subscribe
public void onGameStateChanged(GameStateChanged gameStateChanged)
{
if (!useComputeShaders || gameStateChanged.getGameState() != GameState.LOGGED_IN)
if (computeMode == ComputeMode.NONE || gameStateChanged.getGameState() != GameState.LOGGED_IN)
{
return;
}
uploadScene();
invokeOnMainThread(this::uploadScene);
}
private void uploadScene()
@@ -1421,13 +1441,10 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
IntBuffer vertexBuffer = this.vertexBuffer.getBuffer();
FloatBuffer uvBuffer = this.uvBuffer.getBuffer();
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, bufferId);
gl.glBufferData(gl.GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, gl.GL_STATIC_COPY);
updateBuffer(sceneVertexBuffer, GL_ARRAY_BUFFER, vertexBuffer.limit() * Integer.BYTES, vertexBuffer, GL_STATIC_COPY, CL_MEM_READ_ONLY);
updateBuffer(sceneUvBuffer, GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, GL_STATIC_COPY, CL_MEM_READ_ONLY);
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, uvBufferId);
gl.glBufferData(gl.GL_ARRAY_BUFFER, uvBuffer.limit() * Float.BYTES, uvBuffer, gl.GL_STATIC_COPY);
gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0);
gl.glBindBuffer(GL_ARRAY_BUFFER, 0);
vertexBuffer.clear();
uvBuffer.clear();
@@ -1492,7 +1509,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
@Override
public void draw(Renderable renderable, int orientation, int pitchSin, int pitchCos, int yawSin, int yawCos, int x, int y, int z, long hash)
{
if (!useComputeShaders)
if (computeMode == ComputeMode.NONE)
{
Model model = renderable instanceof Model ? (Model) renderable : renderable.getModel();
if (model != null)
@@ -1673,7 +1690,7 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
private int getDrawDistance()
{
final int limit = useComputeShaders ? MAX_DISTANCE : DEFAULT_DISTANCE;
final int limit = computeMode != ComputeMode.NONE ? MAX_DISTANCE : DEFAULT_DISTANCE;
return Ints.constrainToRange(config.drawDistance(), 0, limit);
}
@@ -1688,4 +1705,36 @@ public class GpuPlugin extends Plugin implements DrawCallbacks
runnable.run();
}
}
private void updateBuffer(GLBuffer glBuffer, int target, int size, Buffer data, int usage, long clFlags)
{
gl.glBindBuffer(target, glBuffer.glBufferId);
if (size > glBuffer.size)
{
log.trace("Buffer resize: {} {} -> {}", glBuffer, glBuffer.size, size);
glBuffer.size = size;
gl.glBufferData(target, size, data, usage);
if (computeMode == ComputeMode.OPENCL)
{
if (glBuffer.cl_mem != null)
{
CL.clReleaseMemObject(glBuffer.cl_mem);
}
if (size == 0)
{
glBuffer.cl_mem = null;
}
else
{
glBuffer.cl_mem = clCreateFromGLBuffer(openCLManager.context, clFlags, glBuffer.glBufferId, null);
}
}
}
else if (data != null)
{
gl.glBufferSubData(target, 0, size, data);
}
}
}

View File

@@ -0,0 +1,521 @@
/*
* Copyright (c) 2021, Adam <Adam@sigterm.info>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package net.runelite.client.plugins.gpu;
import com.google.common.base.Charsets;
import com.jogamp.nativewindow.NativeSurface;
import com.jogamp.opengl.GL4;
import com.jogamp.opengl.GLContext;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Objects;
import javax.inject.Singleton;
import jogamp.opengl.GLContextImpl;
import jogamp.opengl.GLDrawableImpl;
import jogamp.opengl.egl.EGLContext;
import jogamp.opengl.macosx.cgl.CGL;
import jogamp.opengl.windows.wgl.WindowsWGLContext;
import jogamp.opengl.x11.glx.X11GLXContext;
import lombok.extern.slf4j.Slf4j;
import net.runelite.client.plugins.gpu.template.Template;
import net.runelite.client.util.OSType;
import org.jocl.CL;
import static org.jocl.CL.*;
import org.jocl.CLException;
import org.jocl.Pointer;
import org.jocl.Sizeof;
import org.jocl.cl_command_queue;
import org.jocl.cl_context;
import org.jocl.cl_context_properties;
import org.jocl.cl_device_id;
import org.jocl.cl_event;
import org.jocl.cl_kernel;
import org.jocl.cl_mem;
import org.jocl.cl_platform_id;
import org.jocl.cl_program;
@Singleton
@Slf4j
class OpenCLManager
{
private static final String GL_SHARING_PLATFORM_EXT = "cl_khr_gl_sharing";
private static final String KERNEL_NAME_UNORDERED = "computeUnordered";
private static final String KERNEL_NAME_LARGE = "computeLarge";
private static final int MIN_WORK_GROUP_SIZE = 256;
private static final int SMALL_SIZE = GpuPlugin.SMALL_TRIANGLE_COUNT;
private static final int LARGE_SIZE = GpuPlugin.MAX_TRIANGLE;
// struct shared_data {
// int totalNum[12];
// int totalDistance[12];
// int totalMappedNum[18];
// int min10;
// int dfs[0];
// };
private static final int SHARED_SIZE = 12 + 12 + 18 + 1; // in ints
// The number of faces each worker processes in the two kernels
private int largeFaceCount;
private int smallFaceCount;
private cl_platform_id platform;
private cl_device_id device;
cl_context context;
private cl_command_queue commandQueue;
private cl_program programUnordered;
private cl_program programSmall;
private cl_program programLarge;
private cl_kernel kernelUnordered;
private cl_kernel kernelSmall;
private cl_kernel kernelLarge;
void init(GL4 gl)
{
CL.setExceptionsEnabled(true);
switch (OSType.getOSType())
{
case Windows:
case Linux:
initPlatform();
initDevice();
initContext(gl);
break;
case MacOS:
initMacOS(gl);
break;
default:
throw new RuntimeException("Unsupported OS Type " + OSType.getOSType().name());
}
ensureMinWorkGroupSize();
initQueue();
compilePrograms();
}
void cleanup()
{
if (programUnordered != null)
{
CL.clReleaseProgram(programUnordered);
programUnordered = null;
}
if (programSmall != null)
{
CL.clReleaseProgram(programSmall);
programSmall = null;
}
if (programLarge != null)
{
CL.clReleaseProgram(programLarge);
programLarge = null;
}
if (kernelUnordered != null)
{
CL.clReleaseKernel(kernelUnordered);
kernelUnordered = null;
}
if (kernelSmall != null)
{
CL.clReleaseKernel(kernelSmall);
kernelSmall = null;
}
if (kernelLarge != null)
{
CL.clReleaseKernel(kernelLarge);
kernelLarge = null;
}
if (commandQueue != null)
{
CL.clReleaseCommandQueue(commandQueue);
commandQueue = null;
}
if (context != null)
{
CL.clReleaseContext(context);
context = null;
}
if (device != null)
{
CL.clReleaseDevice(device);
device = null;
}
}
private String logPlatformInfo(cl_platform_id platform, int param)
{
long[] size = new long[1];
clGetPlatformInfo(platform, param, 0, null, size);
byte[] buffer = new byte[(int) size[0]];
clGetPlatformInfo(platform, param, buffer.length, Pointer.to(buffer), null);
String platformInfo = new String(buffer, Charsets.UTF_8);
log.debug("Platform: {}, {}", stringFor_cl_platform_info(param), platformInfo);
return platformInfo;
}
private void logBuildInfo(cl_program program, int param)
{
long[] size = new long[1];
clGetProgramBuildInfo(program, device, param, 0, null, size);
ByteBuffer buffer = ByteBuffer.allocateDirect((int) size[0]);
clGetProgramBuildInfo(program, device, param, buffer.limit(), Pointer.toBuffer(buffer), null);
switch (param)
{
case CL_PROGRAM_BUILD_STATUS:
log.debug("Build status: {}, {}", stringFor_cl_program_build_info(param), stringFor_cl_build_status(buffer.getInt()));
break;
case CL_PROGRAM_BINARY_TYPE:
log.debug("Binary type: {}, {}", stringFor_cl_program_build_info(param), stringFor_cl_program_binary_type(buffer.getInt()));
break;
case CL_PROGRAM_BUILD_LOG:
String buildLog = StandardCharsets.US_ASCII.decode(buffer).toString();
log.trace("Build log: {}, {}", stringFor_cl_program_build_info(param), buildLog);
break;
case CL_PROGRAM_BUILD_OPTIONS:
String message = StandardCharsets.US_ASCII.decode(buffer).toString();
log.debug("Build options: {}, {}", stringFor_cl_program_build_info(param), message);
break;
default:
throw new IllegalArgumentException();
}
}
private void initPlatform()
{
int[] platformCount = new int[1];
clGetPlatformIDs(0, null, platformCount);
if (platformCount[0] == 0)
{
throw new RuntimeException("No compute platforms found");
}
cl_platform_id[] platforms = new cl_platform_id[platformCount[0]];
clGetPlatformIDs(platforms.length, platforms, null);
for (cl_platform_id platform : platforms)
{
log.debug("Found cl_platform_id {}", platform);
logPlatformInfo(platform, CL_PLATFORM_PROFILE);
logPlatformInfo(platform, CL_PLATFORM_VERSION);
logPlatformInfo(platform, CL_PLATFORM_NAME);
logPlatformInfo(platform, CL_PLATFORM_VENDOR);
String[] extensions = logPlatformInfo(platform, CL_PLATFORM_EXTENSIONS).split(" ");
if (Arrays.stream(extensions).noneMatch(s -> s.equals(GL_SHARING_PLATFORM_EXT)))
{
throw new RuntimeException("Platform does not support OpenGL buffer sharing");
}
}
platform = platforms[0];
log.debug("Selected cl_platform_id {}", platform);
}
private void initDevice()
{
int[] deviceCount = new int[1];
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, null, deviceCount);
if (deviceCount[0] == 0)
{
throw new RuntimeException("No compute devices found");
}
cl_device_id[] devices = new cl_device_id[(int) deviceCount[0]];
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, devices.length, devices, null);
for (cl_device_id device : devices)
{
long[] size = new long[1];
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, null, size);
byte[] devInfoBuf = new byte[(int) size[0]];
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, devInfoBuf.length, Pointer.to(devInfoBuf), null);
log.debug("Found cl_device_id: {}", device);
log.debug("Device extensions: {}", new String(devInfoBuf, Charsets.UTF_8));
}
device = devices[0];
log.debug("Selected cl_device_id {}", device);
}
private void initContext(GL4 gl)
{
// set computation platform
cl_context_properties contextProps = new cl_context_properties();
contextProps.addProperty(CL_CONTEXT_PLATFORM, platform);
// pull gl context
GLContext glContext = gl.getContext();
log.debug("Got GLContext of type {}", glContext.getClass().getSimpleName());
if (!glContext.isCurrent())
{
throw new RuntimeException("Can't create OpenCL context from inactive GL Context");
}
// get correct props based on os
long glContextHandle = glContext.getHandle();
GLContextImpl glContextImpl = (GLContextImpl) glContext;
GLDrawableImpl glDrawableImpl = glContextImpl.getDrawableImpl();
NativeSurface nativeSurface = glDrawableImpl.getNativeSurface();
if (glContext instanceof X11GLXContext)
{
long displayHandle = nativeSurface.getDisplayHandle();
contextProps.addProperty(CL_GL_CONTEXT_KHR, glContextHandle);
contextProps.addProperty(CL_GLX_DISPLAY_KHR, displayHandle);
}
else if (glContext instanceof WindowsWGLContext)
{
long surfaceHandle = nativeSurface.getSurfaceHandle();
contextProps.addProperty(CL_GL_CONTEXT_KHR, glContextHandle);
contextProps.addProperty(CL_WGL_HDC_KHR, surfaceHandle);
}
else if (glContext instanceof EGLContext)
{
long displayHandle = nativeSurface.getDisplayHandle();
contextProps.addProperty(CL_GL_CONTEXT_KHR, glContextHandle);
contextProps.addProperty(CL_EGL_DISPLAY_KHR, displayHandle);
}
log.debug("Creating context with props: {}", contextProps);
context = clCreateContext(contextProps, 1, new cl_device_id[]{device}, null, null, null);
log.debug("Created compute context {}", context);
}
private void initMacOS(GL4 gl)
{
// get sharegroup from gl context
GLContext glContext = gl.getContext();
if (!glContext.isCurrent())
{
throw new RuntimeException("Can't create context from inactive GL");
}
long cglContext = CGL.CGLGetCurrentContext();
long cglShareGroup = CGL.CGLGetShareGroup(cglContext);
// build context props
cl_context_properties contextProps = new cl_context_properties();
contextProps.addProperty(CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, cglShareGroup);
// ask macos to make the context for us
log.debug("Creating context with props: {}", contextProps);
context = clCreateContext(contextProps, 0, null, null, null, null);
// pull the compute device out of the provided context
device = new cl_device_id();
clGetGLContextInfoAPPLE(context, cglContext, CL_CGL_DEVICE_FOR_CURRENT_VIRTUAL_SCREEN_APPLE, Sizeof.cl_device_id, Pointer.to(device), null);
log.debug("Got macOS CLGL compute device {}", device);
}
private void ensureMinWorkGroupSize()
{
long[] maxWorkGroupSize = new long[1];
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, Sizeof.size_t, Pointer.to(maxWorkGroupSize), null);
log.debug("Device CL_DEVICE_MAX_WORK_GROUP_SIZE: {}", maxWorkGroupSize[0]);
if (maxWorkGroupSize[0] < MIN_WORK_GROUP_SIZE)
{
throw new RuntimeException("Compute device does not support min work group size " + MIN_WORK_GROUP_SIZE);
}
// Largest power of 2 less than or equal to maxWorkGroupSize
int groupSize = 0x80000000 >>> Integer.numberOfLeadingZeros((int) maxWorkGroupSize[0]);
largeFaceCount = LARGE_SIZE / (Math.min(groupSize, LARGE_SIZE));
smallFaceCount = SMALL_SIZE / (Math.min(groupSize, SMALL_SIZE));
log.debug("Face counts: small: {}, large: {}", smallFaceCount, largeFaceCount);
}
private void initQueue()
{
long[] l = new long[1];
clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, Sizeof.cl_long, Pointer.to(l), null);
commandQueue = clCreateCommandQueue(context, device, l[0] & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, null);
log.debug("Created command_queue {}, properties {}", commandQueue, l[0] & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
}
private cl_program compileProgram(String programSource)
{
log.trace("Compiling program:\n {}", programSource);
cl_program program = clCreateProgramWithSource(context, 1, new String[]{programSource}, null, null);
try
{
clBuildProgram(program, 0, null, null, null, null);
}
catch (CLException e)
{
logBuildInfo(program, CL_PROGRAM_BUILD_LOG);
throw e;
}
logBuildInfo(program, CL_PROGRAM_BUILD_STATUS);
logBuildInfo(program, CL_PROGRAM_BINARY_TYPE);
logBuildInfo(program, CL_PROGRAM_BUILD_OPTIONS);
logBuildInfo(program, CL_PROGRAM_BUILD_LOG);
return program;
}
private cl_kernel getKernel(cl_program program, String kernelName)
{
cl_kernel kernel = clCreateKernel(program, kernelName, null);
log.debug("Loaded kernel {} for program {}", kernelName, program);
return kernel;
}
private void compilePrograms()
{
Template templateSmall = new Template()
.addInclude(OpenCLManager.class)
.add(key -> key.equals("FACE_COUNT") ? ("#define FACE_COUNT " + smallFaceCount) : null);
Template templateLarge = new Template()
.addInclude(OpenCLManager.class)
.add(key -> key.equals("FACE_COUNT") ? ("#define FACE_COUNT " + largeFaceCount) : null);
String unordered = new Template()
.addInclude(OpenCLManager.class)
.load("comp_unordered.cl");
String small = templateSmall.load("comp.cl");
String large = templateLarge.load("comp.cl");
programUnordered = compileProgram(unordered);
programSmall = compileProgram(small);
programLarge = compileProgram(large);
kernelUnordered = getKernel(programUnordered, KERNEL_NAME_UNORDERED);
kernelSmall = getKernel(programSmall, KERNEL_NAME_LARGE);
kernelLarge = getKernel(programLarge, KERNEL_NAME_LARGE);
}
void compute(int unorderedModels, int smallModels, int largeModels,
GLBuffer sceneVertexBuffer,
GLBuffer sceneUvBuffer,
GLBuffer vertexBuffer,
GLBuffer uvBuffer,
GLBuffer unorderedBuffer,
GLBuffer smallBuffer,
GLBuffer largeBuffer,
GLBuffer outVertexBuffer,
GLBuffer outUvBuffer,
GLBuffer uniformBuffer
)
{
cl_mem[] glBuffersAll = {
sceneVertexBuffer.cl_mem,
sceneUvBuffer.cl_mem,
unorderedBuffer.cl_mem,
smallBuffer.cl_mem,
largeBuffer.cl_mem,
vertexBuffer.cl_mem,
uvBuffer.cl_mem,
outVertexBuffer.cl_mem,
outUvBuffer.cl_mem,
uniformBuffer.cl_mem,
};
cl_mem[] glBuffers = Arrays.stream(glBuffersAll)
.filter(Objects::nonNull)
.toArray(cl_mem[]::new);
cl_event acquireGLBuffers = new cl_event();
clEnqueueAcquireGLObjects(commandQueue, glBuffers.length, glBuffers, 0, null, acquireGLBuffers);
cl_event[] computeEvents = {
new cl_event(),
new cl_event(),
new cl_event()
};
int numComputeEvents = 0;
if (unorderedModels > 0)
{
clSetKernelArg(kernelUnordered, 0, Sizeof.cl_mem, unorderedBuffer.ptr());
clSetKernelArg(kernelUnordered, 1, Sizeof.cl_mem, sceneVertexBuffer.ptr());
clSetKernelArg(kernelUnordered, 2, Sizeof.cl_mem, vertexBuffer.ptr());
clSetKernelArg(kernelUnordered, 3, Sizeof.cl_mem, sceneUvBuffer.ptr());
clSetKernelArg(kernelUnordered, 4, Sizeof.cl_mem, uvBuffer.ptr());
clSetKernelArg(kernelUnordered, 5, Sizeof.cl_mem, outVertexBuffer.ptr());
clSetKernelArg(kernelUnordered, 6, Sizeof.cl_mem, outUvBuffer.ptr());
// queue compute call after acquireGLBuffers
clEnqueueNDRangeKernel(commandQueue, kernelUnordered, 1, null,
new long[]{unorderedModels * 6L}, new long[]{6}, 1, new cl_event[]{acquireGLBuffers}, computeEvents[numComputeEvents++]);
}
if (smallModels > 0)
{
clSetKernelArg(kernelSmall, 0, (SHARED_SIZE + SMALL_SIZE) * Integer.BYTES, null);
clSetKernelArg(kernelSmall, 1, Sizeof.cl_mem, smallBuffer.ptr());
clSetKernelArg(kernelSmall, 2, Sizeof.cl_mem, sceneVertexBuffer.ptr());
clSetKernelArg(kernelSmall, 3, Sizeof.cl_mem, vertexBuffer.ptr());
clSetKernelArg(kernelSmall, 4, Sizeof.cl_mem, sceneUvBuffer.ptr());
clSetKernelArg(kernelSmall, 5, Sizeof.cl_mem, uvBuffer.ptr());
clSetKernelArg(kernelSmall, 6, Sizeof.cl_mem, outVertexBuffer.ptr());
clSetKernelArg(kernelSmall, 7, Sizeof.cl_mem, outUvBuffer.ptr());
clSetKernelArg(kernelSmall, 8, Sizeof.cl_mem, uniformBuffer.ptr());
clEnqueueNDRangeKernel(commandQueue, kernelSmall, 1, null,
new long[]{smallModels * (SMALL_SIZE / smallFaceCount)}, new long[]{SMALL_SIZE / smallFaceCount}, 1, new cl_event[]{acquireGLBuffers}, computeEvents[numComputeEvents++]);
}
if (largeModels > 0)
{
clSetKernelArg(kernelLarge, 0, (SHARED_SIZE + LARGE_SIZE) * Integer.BYTES, null);
clSetKernelArg(kernelLarge, 1, Sizeof.cl_mem, largeBuffer.ptr());
clSetKernelArg(kernelLarge, 2, Sizeof.cl_mem, sceneVertexBuffer.ptr());
clSetKernelArg(kernelLarge, 3, Sizeof.cl_mem, vertexBuffer.ptr());
clSetKernelArg(kernelLarge, 4, Sizeof.cl_mem, sceneUvBuffer.ptr());
clSetKernelArg(kernelLarge, 5, Sizeof.cl_mem, uvBuffer.ptr());
clSetKernelArg(kernelLarge, 6, Sizeof.cl_mem, outVertexBuffer.ptr());
clSetKernelArg(kernelLarge, 7, Sizeof.cl_mem, outUvBuffer.ptr());
clSetKernelArg(kernelLarge, 8, Sizeof.cl_mem, uniformBuffer.ptr());
clEnqueueNDRangeKernel(commandQueue, kernelLarge, 1, null,
new long[]{(long) largeModels * (LARGE_SIZE / largeFaceCount)}, new long[]{LARGE_SIZE / largeFaceCount}, 1, new cl_event[]{acquireGLBuffers}, computeEvents[numComputeEvents++]);
}
clEnqueueReleaseGLObjects(commandQueue, glBuffers.length, glBuffers, numComputeEvents, computeEvents, null);
}
void finish()
{
clFinish(commandQueue);
}
}

View File

@@ -0,0 +1,55 @@
/*
* Copyright (c) 2021, Adam <Adam@sigterm.info>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
struct uniform {
int cameraYaw;
int cameraPitch;
int centerX;
int centerY;
int zoom;
int cameraX;
int cameraY;
int cameraZ;
int4 sinCosTable[2048];
};
struct shared_data {
int totalNum[12]; // number of faces with a given priority
int totalDistance[12]; // sum of distances to faces of a given priority
int totalMappedNum[18]; // number of faces with a given adjusted priority
int min10; // minimum distance to a face of priority 10
int dfs[0]; // packed face id and distance, size 512 for small, 4096 for large
};
struct modelinfo {
int offset; // offset into buffer
int uvOffset; // offset into uv buffer
int size; // length in faces
int idx; // write idx in target buffer
int flags; // radius, orientation
int x; // scene position x
int y; // scene position y
int z; // scene position z
};

View File

@@ -0,0 +1,104 @@
/*
* Copyright (c) 2021, Adam <Adam@sigterm.info>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define PI 3.1415926535897932384626433832795f
#define UNIT PI / 1024.0f
float3 toScreen(int4 vertex, int cameraYaw, int cameraPitch, int centerX, int centerY, int zoom) {
float yawSin = sin(cameraYaw * UNIT);
float yawCos = cos(cameraYaw * UNIT);
float pitchSin = sin(cameraPitch * UNIT);
float pitchCos = cos(cameraPitch * UNIT);
float rotatedX = (vertex.z * yawSin) + (vertex.x * yawCos);
float rotatedZ = (vertex.z * yawCos) - (vertex.x * yawSin);
float var13 = (vertex.y * pitchCos) - (rotatedZ * pitchSin);
float var12 = (vertex.y * pitchSin) + (rotatedZ * pitchCos);
float x = rotatedX * zoom / var12 + centerX;
float y = var13 * zoom / var12 + centerY;
float z = -var12; // in OpenGL depth is negative
return (float3) (x, y, z);
}
/*
* Rotate a vertex by a given orientation in JAU
*/
int4 rotate_vertex(__constant struct uniform *uni, int4 vertex, int orientation) {
int4 sinCos = uni->sinCosTable[orientation];
int s = sinCos.x;
int c = sinCos.y;
int x = vertex.z * s + vertex.x * c >> 16;
int z = vertex.z * c - vertex.x * s >> 16;
return (int4)(x, vertex.y, z, vertex.w);
}
/*
* Calculate the distance to a vertex given the camera angle
*/
int vertex_distance(int4 vertex, int cameraYaw, int cameraPitch) {
int yawSin = (int)(65536.0f * sin(cameraYaw * UNIT));
int yawCos = (int)(65536.0f * cos(cameraYaw * UNIT));
int pitchSin = (int)(65536.0f * sin(cameraPitch * UNIT));
int pitchCos = (int)(65536.0f * cos(cameraPitch * UNIT));
int j = vertex.z * yawCos - vertex.x * yawSin >> 16;
int l = vertex.y * pitchSin + j * pitchCos >> 16;
return l;
}
/*
* Calculate the distance to a face
*/
int face_distance(int4 vA, int4 vB, int4 vC, int cameraYaw, int cameraPitch) {
int dvA = vertex_distance(vA, cameraYaw, cameraPitch);
int dvB = vertex_distance(vB, cameraYaw, cameraPitch);
int dvC = vertex_distance(vC, cameraYaw, cameraPitch);
int faceDistance = (dvA + dvB + dvC) / 3;
return faceDistance;
}
/*
* Test if a face is visible (not backward facing)
*/
bool face_visible(__constant struct uniform *uni, int4 vA, int4 vB, int4 vC, int4 position) {
// Move model to scene location, and account for camera offset
int4 cameraPos = (int4)(uni->cameraX, uni->cameraY, uni->cameraZ, 0);
vA += position - cameraPos;
vB += position - cameraPos;
vC += position - cameraPos;
float3 sA = toScreen(vA, uni->cameraYaw, uni->cameraPitch, uni->centerX, uni->centerY, uni->zoom);
float3 sB = toScreen(vB, uni->cameraYaw, uni->cameraPitch, uni->centerX, uni->centerY, uni->zoom);
float3 sC = toScreen(vC, uni->cameraYaw, uni->cameraPitch, uni->centerX, uni->centerY, uni->zoom);
return (sA.x - sB.x) * (sC.y - sB.y) - (sC.x - sB.x) * (sA.y - sB.y) > 0;
}

View File

@@ -0,0 +1,97 @@
/*
* Copyright (c) 2021, Adam <Adam@sigterm.info>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include FACE_COUNT
#include cl_types.cl
#include to_screen.cl
#include common.cl
#include priority_render.cl
__kernel
__attribute__((work_group_size_hint(256, 1, 1)))
void computeLarge(
__local struct shared_data *shared,
__global const struct modelinfo *ol,
__global const int4 *vb,
__global const int4 *tempvb,
__global const float4 *uv,
__global const float4 *tempuv,
__global int4 *vout,
__global float4 *uvout,
__constant struct uniform *uni) {
size_t groupId = get_group_id(0);
size_t localId = get_local_id(0) * FACE_COUNT;
struct modelinfo minfo = ol[groupId];
int4 pos = (int4)(minfo.x, minfo.y, minfo.z, 0);
if (localId == 0) {
shared->min10 = 1600;
for (int i = 0; i < 12; ++i) {
shared->totalNum[i] = 0;
shared->totalDistance[i] = 0;
}
for (int i = 0; i < 18; ++i) {
shared->totalMappedNum[i] = 0;
}
}
int prio[FACE_COUNT];
int dis[FACE_COUNT];
int4 v1[FACE_COUNT];
int4 v2[FACE_COUNT];
int4 v3[FACE_COUNT];
for (int i = 0; i < FACE_COUNT; i++) {
get_face(shared, uni, vb, tempvb, localId + i, minfo, uni->cameraYaw, uni->cameraPitch, &prio[i], &dis[i], &v1[i], &v2[i], &v3[i]);
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = 0; i < FACE_COUNT; i++) {
add_face_prio_distance(shared, uni, localId + i, minfo, v1[i], v2[i], v3[i], prio[i], dis[i], pos);
}
barrier(CLK_LOCAL_MEM_FENCE);
int prioAdj[FACE_COUNT];
int idx[FACE_COUNT];
for (int i = 0; i < FACE_COUNT; i++) {
idx[i] = map_face_priority(shared, localId + i, minfo, prio[i], dis[i], &prioAdj[i]);
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = 0; i < FACE_COUNT; i++) {
insert_dfs(shared, localId + i, minfo, prioAdj[i], dis[i], idx[i]);
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = 0; i < FACE_COUNT; i++) {
sort_and_insert(shared, uv, tempuv, vout, uvout, localId + i, minfo, prioAdj[i], dis[i], v1[i], v2[i], v3[i]);
}
}

View File

@@ -0,0 +1,87 @@
/*
* Copyright (c) 2021, Adam <Adam@sigterm.info>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include cl_types.cl
__kernel
__attribute__((reqd_work_group_size(6, 1, 1)))
void computeUnordered(__global const struct modelinfo *ol,
__global const int4 *vb,
__global const int4 *tempvb,
__global const float4 *uv,
__global const float4 *tempuv,
__global int4 *vout,
__global float4 *uvout) {
size_t groupId = get_group_id(0);
size_t localId = get_local_id(0);
struct modelinfo minfo = ol[groupId];
int offset = minfo.offset;
int size = minfo.size;
int outOffset = minfo.idx;
int uvOffset = minfo.uvOffset;
int flags = minfo.flags;
int4 pos = (int4)(minfo.x, minfo.y, minfo.z, 0);
if (localId >= size) {
return;
}
uint ssboOffset = localId;
int4 thisA, thisB, thisC;
// Grab triangle vertices from the correct buffer
if (flags < 0) {
thisA = vb[offset + ssboOffset * 3];
thisB = vb[offset + ssboOffset * 3 + 1];
thisC = vb[offset + ssboOffset * 3 + 2];
} else {
thisA = tempvb[offset + ssboOffset * 3];
thisB = tempvb[offset + ssboOffset * 3 + 1];
thisC = tempvb[offset + ssboOffset * 3 + 2];
}
uint myOffset = localId;
// position vertices in scene and write to out buffer
vout[outOffset + myOffset * 3] = pos + thisA;
vout[outOffset + myOffset * 3 + 1] = pos + thisB;
vout[outOffset + myOffset * 3 + 2] = pos + thisC;
if (uvOffset < 0) {
uvout[outOffset + myOffset * 3] = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
uvout[outOffset + myOffset * 3 + 1] = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
uvout[outOffset + myOffset * 3 + 2] = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
} else if (flags >= 0) {
uvout[outOffset + myOffset * 3] = tempuv[uvOffset + localId * 3];
uvout[outOffset + myOffset * 3 + 1] = tempuv[uvOffset + localId * 3 + 1];
uvout[outOffset + myOffset * 3 + 2] = tempuv[uvOffset + localId * 3 + 2];
} else {
uvout[outOffset + myOffset * 3] = uv[uvOffset + localId * 3];
uvout[outOffset + myOffset * 3 + 1] = uv[uvOffset + localId * 3 + 1];
uvout[outOffset + myOffset * 3 + 2] = uv[uvOffset + localId * 3 + 2];
}
}

View File

@@ -0,0 +1,298 @@
/*
* Copyright (c) 2021, Adam <Adam@sigterm.info>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Calculate adjusted priority for a face with a given priority, distance, and
// model global min10 and face distance averages. This allows positioning faces
// with priorities 10/11 into the correct 'slots' resulting in 18 possible
// adjusted priorities
int priority_map(int p, int distance, int _min10, int avg1, int avg2, int avg3) {
// (10, 11) 0 1 2 (10, 11) 3 4 (10, 11) 5 6 7 8 9 (10, 11)
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
switch (p) {
case 0: return 2;
case 1: return 3;
case 2: return 4;
case 3: return 7;
case 4: return 8;
case 5: return 11;
case 6: return 12;
case 7: return 13;
case 8: return 14;
case 9: return 15;
case 10:
if (distance > avg1) {
return 0;
} else if (distance > avg2) {
return 5;
} else if (distance > avg3) {
return 9;
} else {
return 16;
}
case 11:
if (distance > avg1 && _min10 > avg1) {
return 1;
} else if (distance > avg2 && (_min10 > avg1 || _min10 > avg2)) {
return 6;
} else if (distance > avg3 && (_min10 > avg1 || _min10 > avg2 || _min10 > avg3)) {
return 10;
} else {
return 17;
}
default:
return -1;
}
}
// calculate the number of faces with a lower adjusted priority than
// the given adjusted priority
int count_prio_offset(__local struct shared_data *shared, int priority) {
int total = 0;
switch (priority) {
case 17:
total += shared->totalMappedNum[16];
case 16:
total += shared->totalMappedNum[15];
case 15:
total += shared->totalMappedNum[14];
case 14:
total += shared->totalMappedNum[13];
case 13:
total += shared->totalMappedNum[12];
case 12:
total += shared->totalMappedNum[11];
case 11:
total += shared->totalMappedNum[10];
case 10:
total += shared->totalMappedNum[9];
case 9:
total += shared->totalMappedNum[8];
case 8:
total += shared->totalMappedNum[7];
case 7:
total += shared->totalMappedNum[6];
case 6:
total += shared->totalMappedNum[5];
case 5:
total += shared->totalMappedNum[4];
case 4:
total += shared->totalMappedNum[3];
case 3:
total += shared->totalMappedNum[2];
case 2:
total += shared->totalMappedNum[1];
case 1:
total += shared->totalMappedNum[0];
case 0:
return total;
}
}
void get_face(
__local struct shared_data *shared,
__constant struct uniform *uni,
__global const int4 *vb,
__global const int4 *tempvb,
uint localId, struct modelinfo minfo, int cameraYaw, int cameraPitch,
/* out */ int *prio, int *dis, int4 *o1, int4 *o2, int4 *o3) {
int size = minfo.size;
int offset = minfo.offset;
int flags = minfo.flags;
uint ssboOffset;
if (localId < size) {
ssboOffset = localId;
} else {
ssboOffset = 0;
}
int4 thisA;
int4 thisB;
int4 thisC;
// Grab triangle vertices from the correct buffer
if (flags < 0) {
thisA = vb[offset + ssboOffset * 3];
thisB = vb[offset + ssboOffset * 3 + 1];
thisC = vb[offset + ssboOffset * 3 + 2];
} else {
thisA = tempvb[offset + ssboOffset * 3];
thisB = tempvb[offset + ssboOffset * 3 + 1];
thisC = tempvb[offset + ssboOffset * 3 + 2];
}
if (localId < size) {
int radius = (flags & 0x7fffffff) >> 12;
int orientation = flags & 0x7ff;
// rotate for model orientation
int4 thisrvA = rotate_vertex(uni, thisA, orientation);
int4 thisrvB = rotate_vertex(uni, thisB, orientation);
int4 thisrvC = rotate_vertex(uni, thisC, orientation);
// calculate distance to face
int thisPriority = (thisA.w >> 16) & 0xff;// all vertices on the face have the same priority
int thisDistance;
if (radius == 0) {
thisDistance = 0;
} else {
thisDistance = face_distance(thisrvA, thisrvB, thisrvC, cameraYaw, cameraPitch) + radius;
}
*o1 = thisrvA;
*o2 = thisrvB;
*o3 = thisrvC;
*prio = thisPriority;
*dis = thisDistance;
} else {
*o1 = (int4)(0, 0, 0, 0);
*o2 = (int4)(0, 0, 0, 0);
*o3 = (int4)(0, 0, 0, 0);
*prio = 0;
*dis = 0;
}
}
void add_face_prio_distance(
__local struct shared_data *shared,
__constant struct uniform *uni,
uint localId, struct modelinfo minfo, int4 thisrvA, int4 thisrvB, int4 thisrvC, int thisPriority, int thisDistance, int4 pos) {
if (localId < minfo.size) {
// if the face is not culled, it is calculated into priority distance averages
if (face_visible(uni, thisrvA, thisrvB, thisrvC, pos)) {
atomic_add(&shared->totalNum[thisPriority], 1);
atomic_add(&shared->totalDistance[thisPriority], thisDistance);
// calculate minimum distance to any face of priority 10 for positioning the 11 faces later
if (thisPriority == 10) {
atomic_min(&shared->min10, thisDistance);
}
}
}
}
int map_face_priority(__local struct shared_data *shared, uint localId, struct modelinfo minfo, int thisPriority, int thisDistance, int *prio) {
int size = minfo.size;
// Compute average distances for 0/2, 3/4, and 6/8
if (localId < size) {
int avg1 = 0;
int avg2 = 0;
int avg3 = 0;
if (shared->totalNum[1] > 0 || shared->totalNum[2] > 0) {
avg1 = (shared->totalDistance[1] + shared->totalDistance[2]) / (shared->totalNum[1] + shared->totalNum[2]);
}
if (shared->totalNum[3] > 0 || shared->totalNum[4] > 0) {
avg2 = (shared->totalDistance[3] + shared->totalDistance[4]) / (shared->totalNum[3] + shared->totalNum[4]);
}
if (shared->totalNum[6] > 0 || shared->totalNum[8] > 0) {
avg3 = (shared->totalDistance[6] + shared->totalDistance[8]) / (shared->totalNum[6] + shared->totalNum[8]);
}
int adjPrio = priority_map(thisPriority, thisDistance, shared->min10, avg1, avg2, avg3);
int prioIdx = atomic_add(&shared->totalMappedNum[adjPrio], 1);
*prio = adjPrio;
return prioIdx;
}
*prio = 0;
return 0;
}
void insert_dfs(__local struct shared_data *shared, uint localId, struct modelinfo minfo, int adjPrio, int distance, int prioIdx) {
int size = minfo.size;
if (localId < size) {
// calculate base offset into dfs based on number of faces with a lower priority
int baseOff = count_prio_offset(shared, adjPrio);
// store into face array offset array by unique index
shared->dfs[baseOff + prioIdx] = ((int) localId << 16) | distance;
}
}
void sort_and_insert(
__local struct shared_data *shared,
__global const float4 *uv,
__global const float4 *tempuv,
__global int4 *vout,
__global float4 *uvout,
uint localId, struct modelinfo minfo, int thisPriority, int thisDistance, int4 thisrvA, int4 thisrvB, int4 thisrvC) {
/* compute face distance */
int size = minfo.size;
if (localId < size) {
int outOffset = minfo.idx;
int uvOffset = minfo.uvOffset;
int flags = minfo.flags;
int4 pos = (int4)(minfo.x, minfo.y, minfo.z, 0);
const int priorityOffset = count_prio_offset(shared, thisPriority);
const int numOfPriority = shared->totalMappedNum[thisPriority];
int start = priorityOffset; // index of first face with this priority
int end = priorityOffset + numOfPriority; // index of last face with this priority
int myOffset = priorityOffset;
// we only have to order faces against others of the same priority
// calculate position this face will be in
for (int i = start; i < end; ++i) {
int d1 = shared->dfs[i];
int theirId = d1 >> 16;
int theirDistance = d1 & 0xffff;
// the closest faces draw last, so have the highest index
// if two faces have the same distance, the one with the
// higher id draws last
if ((theirDistance > thisDistance)
|| (theirDistance == thisDistance && theirId < localId)) {
++myOffset;
}
}
// position vertices in scene and write to out buffer
vout[outOffset + myOffset * 3] = pos + thisrvA;
vout[outOffset + myOffset * 3 + 1] = pos + thisrvB;
vout[outOffset + myOffset * 3 + 2] = pos + thisrvC;
if (uvOffset < 0) {
uvout[outOffset + myOffset * 3] = (float4)(0, 0, 0, 0);
uvout[outOffset + myOffset * 3 + 1] = (float4)(0, 0, 0, 0);
uvout[outOffset + myOffset * 3 + 2] = (float4)(0, 0, 0, 0);
} else if (flags >= 0) {
uvout[outOffset + myOffset * 3] = tempuv[uvOffset + localId * 3];
uvout[outOffset + myOffset * 3 + 1] = tempuv[uvOffset + localId * 3 + 1];
uvout[outOffset + myOffset * 3 + 2] = tempuv[uvOffset + localId * 3 + 2];
} else {
uvout[outOffset + myOffset * 3] = uv[uvOffset + localId * 3];
uvout[outOffset + myOffset * 3 + 1] = uv[uvOffset + localId * 3 + 1];
uvout[outOffset + myOffset * 3 + 2] = uv[uvOffset + localId * 3 + 2];
}
}
}