// // Copyright 2016 Pixar // // Licensed under the Apache License, Version 2.0 (the "Apache License") // with the following modification; you may not use this file except in // compliance with the Apache License and the following modification to it: // Section 6. Trademarks. is deleted and replaced with: // // 6. Trademarks. This License does not grant permission to use the trade // names, trademarks, service marks, or product names of the Licensor // and its affiliates, except as required to comply with Section 4(c) of // the License and to reproduce the content of the NOTICE file. // // You may obtain a copy of the Apache License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the Apache License with the above modification is // distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the Apache License for the specific // language governing permissions and limitations under the Apache License. // #include "pxr/imaging/glf/glew.h" #include "pxr/imaging/glf/contextCaps.h" #include "pxr/imaging/hdSt/bufferArrayRangeGL.h" #include "pxr/imaging/hdSt/commandBuffer.h" #include "pxr/imaging/hdSt/cullingShaderKey.h" #include "pxr/imaging/hdSt/debugCodes.h" #include "pxr/imaging/hdSt/drawItemInstance.h" #include "pxr/imaging/hdSt/geometricShader.h" #include "pxr/imaging/hdSt/glslProgram.h" #include "pxr/imaging/hdSt/indirectDrawBatch.h" #include "pxr/imaging/hdSt/renderPassState.h" #include "pxr/imaging/hdSt/resourceRegistry.h" #include "pxr/imaging/hdSt/shaderCode.h" #include "pxr/imaging/hdSt/shaderKey.h" #include "pxr/imaging/hd/binding.h" #include "pxr/imaging/hd/debugCodes.h" #include "pxr/imaging/hd/perfLog.h" #include "pxr/imaging/hd/tokens.h" #include "pxr/imaging/glf/diagnostic.h" #include "pxr/imaging/hio/glslfx.h" #include "pxr/base/tf/diagnostic.h" #include "pxr/base/tf/envSetting.h" #include "pxr/base/tf/getenv.h" #include "pxr/base/tf/iterator.h" #include "pxr/base/tf/staticTokens.h" #include #include PXR_NAMESPACE_OPEN_SCOPE TF_DEFINE_PRIVATE_TOKENS( _tokens, (dispatchBuffer) (drawCommandIndex) (drawIndirect) (drawIndirectCull) (drawIndirectResult) (instanceCountInput) (ulocDrawCommandNumUints) (ulocResetPass) (ulocCullMatrix) (ulocDrawRangeNDC) ); static const GLuint64 HD_CULL_RESULT_TIMEOUT_NS = 5e9; // XXX how long to wait? TF_DEFINE_ENV_SETTING(HD_ENABLE_GPU_FRUSTUM_CULLING, true, "Enable GPU frustum culling"); TF_DEFINE_ENV_SETTING(HD_ENABLE_GPU_COUNT_VISIBLE_INSTANCES, false, "Enable GPU frustum culling visible count query"); TF_DEFINE_ENV_SETTING(HD_ENABLE_GPU_INSTANCE_FRUSTUM_CULLING, true, "Enable GPU per-instance frustum culling"); HdSt_IndirectDrawBatch::HdSt_IndirectDrawBatch( HdStDrawItemInstance * drawItemInstance) : HdSt_DrawBatch(drawItemInstance) , _drawCommandBufferDirty(false) , _bufferArraysHash(0) , _numVisibleItems(0) , _numTotalVertices(0) , _numTotalElements(0) /* The following two values are set before draw by * SetEnableTinyPrimCulling(). */ , _useTinyPrimCulling(false) , _dirtyCullingProgram(false) /* The following four values are initialized in _Init(). */ , _useDrawArrays(false) , _useInstancing(false) , _useGpuCulling(false) , _useGpuInstanceCulling(false) , _instanceCountOffset(0) , _cullInstanceCountOffset(0) , _cullResultSync(0) { _Init(drawItemInstance); } /*virtual*/ void HdSt_IndirectDrawBatch::_Init(HdStDrawItemInstance * drawItemInstance) { HdSt_DrawBatch::_Init(drawItemInstance); drawItemInstance->SetBatchIndex(0); drawItemInstance->SetBatch(this); // remember buffer arrays version for dispatch buffer updating HdStDrawItem const* drawItem = drawItemInstance->GetDrawItem(); _bufferArraysHash = drawItem->GetBufferArraysHash(); // determine gpu culling program by the first drawitem _useDrawArrays = !drawItem->GetTopologyRange(); _useInstancing = static_cast(drawItem->GetInstanceIndexRange()); _useGpuCulling = IsEnabledGPUFrustumCulling(); // note: _useInstancing condition is not necessary. it can be removed // if we decide always to use instance culling. _useGpuInstanceCulling = _useInstancing && _useGpuCulling && IsEnabledGPUInstanceFrustumCulling(); if (_useGpuCulling) { _cullingProgram.Initialize( _useDrawArrays, _useGpuInstanceCulling, _bufferArraysHash); } } HdSt_IndirectDrawBatch::_CullingProgram & HdSt_IndirectDrawBatch::_GetCullingProgram( HdStResourceRegistrySharedPtr const &resourceRegistry) { if (!_cullingProgram.GetGLSLProgram() || _dirtyCullingProgram) { // create a culling shader key HdSt_CullingShaderKey shaderKey(_useGpuInstanceCulling, _useTinyPrimCulling, IsEnabledGPUCountVisibleInstances()); // sharing the culling geometric shader for the same configuration. HdSt_GeometricShaderSharedPtr cullShader = HdSt_GeometricShader::Create(shaderKey, resourceRegistry); _cullingProgram.SetGeometricShader(cullShader); _cullingProgram.CompileShader(_drawItemInstances.front()->GetDrawItem(), /*indirect=*/true, resourceRegistry); _dirtyCullingProgram = false; } return _cullingProgram; } HdSt_IndirectDrawBatch::~HdSt_IndirectDrawBatch() { } void HdSt_IndirectDrawBatch::SetEnableTinyPrimCulling(bool tinyPrimCulling) { if (_useTinyPrimCulling != tinyPrimCulling) { _useTinyPrimCulling = tinyPrimCulling; _dirtyCullingProgram = true; } } /* static */ bool HdSt_IndirectDrawBatch::IsEnabledGPUFrustumCulling() { GlfContextCaps const &caps = GlfContextCaps::GetInstance(); // GPU frustum culling requires SSBO of bindless buffer static bool isEnabledGPUFrustumCulling = TfGetEnvSetting(HD_ENABLE_GPU_FRUSTUM_CULLING) && (caps.shaderStorageBufferEnabled || caps.bindlessBufferEnabled); return isEnabledGPUFrustumCulling && !TfDebug::IsEnabled(HDST_DISABLE_FRUSTUM_CULLING); } /* static */ bool HdSt_IndirectDrawBatch::IsEnabledGPUCountVisibleInstances() { static bool isEnabledGPUCountVisibleInstances = TfGetEnvSetting(HD_ENABLE_GPU_COUNT_VISIBLE_INSTANCES); return isEnabledGPUCountVisibleInstances; } /* static */ bool HdSt_IndirectDrawBatch::IsEnabledGPUInstanceFrustumCulling() { GlfContextCaps const &caps = GlfContextCaps::GetInstance(); // GPU instance frustum culling requires SSBO of bindless buffer static bool isEnabledGPUInstanceFrustumCulling = TfGetEnvSetting(HD_ENABLE_GPU_INSTANCE_FRUSTUM_CULLING) && (caps.shaderStorageBufferEnabled || caps.bindlessBufferEnabled); return isEnabledGPUInstanceFrustumCulling; } static int _GetElementOffset(HdStBufferArrayRangeGLSharedPtr const& range) { return range? range->GetElementOffset() : 0; } void HdSt_IndirectDrawBatch::_CompileBatch( HdStResourceRegistrySharedPtr const &resourceRegistry) { HD_TRACE_FUNCTION(); HF_MALLOC_TAG_FUNCTION(); int drawCount = _drawItemInstances.size(); if (_drawItemInstances.empty()) return; // drawcommand is configured as one of followings: // // DrawArrays + non-instance culling : 14 integers (+ numInstanceLevels) struct _DrawArraysCommand { GLuint count; GLuint instanceCount; GLuint first; GLuint baseInstance; // XXX: This is just padding to avoid configuration changes during // transform feedback, which are not accounted for during shader // caching. We should find a better solution. GLuint __reserved_0; GLuint modelDC; GLuint constantDC; GLuint elementDC; GLuint primitiveDC; GLuint fvarDC; GLuint instanceIndexDC; GLuint shaderDC; GLuint vertexDC; GLuint topologyVisibilityDC; }; // DrawArrays + Instance culling : 17 integers (+ numInstanceLevels) struct _DrawArraysInstanceCullCommand { GLuint count; GLuint instanceCount; GLuint first; GLuint baseInstance; GLuint cullCount; GLuint cullInstanceCount; GLuint cullFirstVertex; GLuint cullBaseInstance; GLuint modelDC; GLuint constantDC; GLuint elementDC; GLuint primitiveDC; GLuint fvarDC; GLuint instanceIndexDC; GLuint shaderDC; GLuint vertexDC; GLuint topologyVisibilityDC; }; // DrawElements + non-instance culling : 14 integers (+ numInstanceLevels) struct _DrawElementsCommand { GLuint count; GLuint instanceCount; GLuint first; GLuint baseVertex; GLuint baseInstance; GLuint modelDC; GLuint constantDC; GLuint elementDC; GLuint primitiveDC; GLuint fvarDC; GLuint instanceIndexDC; GLuint shaderDC; GLuint vertexDC; GLuint topologyVisibilityDC; }; // DrawElements + Instance culling : 18 integers (+ numInstanceLevels) struct _DrawElementsInstanceCullCommand { GLuint count; GLuint instanceCount; GLuint first; GLuint baseVertex; GLuint baseInstance; GLuint cullCount; GLuint cullInstanceCount; GLuint cullFirstVertex; GLuint cullBaseInstance; GLuint modelDC; GLuint constantDC; GLuint elementDC; GLuint primitiveDC; GLuint fvarDC; GLuint instanceIndexDC; GLuint shaderDC; GLuint vertexDC; GLuint topologyVisibilityDC; }; // Count the number of visible items. We may actually draw fewer // items than this when GPU frustum culling is active _numVisibleItems = 0; // elements to be drawn (early out for empty batch) _numTotalElements = 0; _numTotalVertices = 0; size_t instancerNumLevels = _drawItemInstances[0]->GetDrawItem()->GetInstancePrimvarNumLevels(); // how many integers in the dispatch struct int commandNumUints = _useDrawArrays ? (_useGpuInstanceCulling ? sizeof(_DrawArraysInstanceCullCommand)/sizeof(GLuint) : sizeof(_DrawArraysCommand)/sizeof(GLuint)) : (_useGpuInstanceCulling ? sizeof(_DrawElementsInstanceCullCommand)/sizeof(GLuint) : sizeof(_DrawElementsCommand)/sizeof(GLuint)); // followed by instanceDC[numlevels] commandNumUints += instancerNumLevels; TF_DEBUG(HD_MDI).Msg("\nCompile MDI Batch\n"); TF_DEBUG(HD_MDI).Msg(" - num uints: %d\n", commandNumUints); TF_DEBUG(HD_MDI).Msg(" - useDrawArrays: %d\n", _useDrawArrays); TF_DEBUG(HD_MDI).Msg(" - useGpuInstanceCulling: %d\n", _useGpuInstanceCulling); size_t numDrawItemInstances = _drawItemInstances.size(); TF_DEBUG(HD_MDI).Msg(" - num draw items: %zu\n", numDrawItemInstances); // Note: GL specifies baseVertex as 'int' and other as 'uint' in // drawcommand struct, but we never set negative baseVertex in our // usecases for bufferArray so we use uint for all fields here. _drawCommandBuffer.resize(numDrawItemInstances * commandNumUints); std::vector::iterator cmdIt = _drawCommandBuffer.begin(); TF_DEBUG(HD_MDI).Msg(" - Processing Items:\n"); for (size_t item = 0; item < numDrawItemInstances; ++item) { HdStDrawItemInstance const * instance = _drawItemInstances[item]; HdStDrawItem const * drawItem = _drawItemInstances[item]->GetDrawItem(); // // index buffer data // HdBufferArrayRangeSharedPtr const & indexBar_ = drawItem->GetTopologyRange(); HdStBufferArrayRangeGLSharedPtr indexBar = std::static_pointer_cast(indexBar_); // // topology visiibility buffer data // HdBufferArrayRangeSharedPtr const & topVisBar_ = drawItem->GetTopologyVisibilityRange(); HdStBufferArrayRangeGLSharedPtr topVisBar = std::static_pointer_cast(topVisBar_); // // element (per-face) buffer data // HdBufferArrayRangeSharedPtr const & elementBar_ = drawItem->GetElementPrimvarRange(); HdStBufferArrayRangeGLSharedPtr elementBar = std::static_pointer_cast(elementBar_); // // vertex attrib buffer data // HdBufferArrayRangeSharedPtr const & vertexBar_ = drawItem->GetVertexPrimvarRange(); HdStBufferArrayRangeGLSharedPtr vertexBar = std::static_pointer_cast(vertexBar_); // // constant buffer data // HdBufferArrayRangeSharedPtr const & constantBar_ = drawItem->GetConstantPrimvarRange(); HdStBufferArrayRangeGLSharedPtr constantBar = std::static_pointer_cast(constantBar_); // // face varying buffer data // HdBufferArrayRangeSharedPtr const & fvarBar_ = drawItem->GetFaceVaryingPrimvarRange(); HdStBufferArrayRangeGLSharedPtr fvarBar = std::static_pointer_cast(fvarBar_); // // instance buffer data // int instanceIndexWidth = instancerNumLevels + 1; std::vector instanceBars(instancerNumLevels); for (size_t i = 0; i < instancerNumLevels; ++i) { HdBufferArrayRangeSharedPtr const & ins_ = drawItem->GetInstancePrimvarRange(i); HdStBufferArrayRangeGLSharedPtr ins = std::static_pointer_cast(ins_); instanceBars[i] = ins; } // // instance indices // HdBufferArrayRangeSharedPtr const & instanceIndexBar_ = drawItem->GetInstanceIndexRange(); HdStBufferArrayRangeGLSharedPtr instanceIndexBar = std::static_pointer_cast(instanceIndexBar_); // // shader parameter // HdBufferArrayRangeSharedPtr const & shaderBar_ = drawItem->GetMaterialShader()->GetShaderData(); HdStBufferArrayRangeGLSharedPtr shaderBar = std::static_pointer_cast(shaderBar_); // 3 for triangles, 4 for quads, n for patches GLuint numIndicesPerPrimitive = drawItem->GetGeometricShader()->GetPrimitiveIndexSize(); // // Get parameters from our buffer range objects to // allow drawing to access the correct elements from // aggregated buffers. // GLuint numElements = indexBar ? indexBar->GetNumElements() : 0; GLuint vertexOffset = 0; GLuint vertexCount = 0; if (vertexBar) { vertexOffset = vertexBar->GetElementOffset(); vertexCount = vertexBar->GetNumElements(); } // if delegate fails to get vertex primvars, it could be empty. // skip the drawitem to prevent drawing uninitialized vertices. if (vertexCount == 0) numElements = 0; GLuint baseInstance = (GLuint)item; // drawing coordinates. GLuint modelDC = 0; // reserved for future extension GLuint constantDC = _GetElementOffset(constantBar); GLuint vertexDC = vertexOffset; GLuint topologyVisibilityDC = _GetElementOffset(topVisBar); GLuint elementDC = _GetElementOffset(elementBar); GLuint primitiveDC = _GetElementOffset(indexBar); GLuint fvarDC = _GetElementOffset(fvarBar); GLuint instanceIndexDC = _GetElementOffset(instanceIndexBar); GLuint shaderDC = _GetElementOffset(shaderBar); GLuint indicesCount = numElements * numIndicesPerPrimitive; // It's possible to have instanceIndexBar which is empty, and no instancePrimvars. // in that case instanceCount should be 0, instead of 1, otherwise // frustum culling shader writes the result out to out-of-bound buffer. // this is covered by testHdDrawBatching/EmptyDrawBatchTest GLuint instanceCount = instanceIndexBar ? instanceIndexBar->GetNumElements()/instanceIndexWidth : 1; if (!instance->IsVisible()) instanceCount = 0; GLuint firstIndex = indexBar ? indexBar->GetElementOffset() * numIndicesPerPrimitive : 0; if (_useDrawArrays) { if (_useGpuInstanceCulling) { *cmdIt++ = vertexCount; *cmdIt++ = instanceCount; *cmdIt++ = vertexOffset; *cmdIt++ = baseInstance; *cmdIt++ = 1; /* cullCount (always 1) */ *cmdIt++ = instanceCount; /* cullInstanceCount */ *cmdIt++ = 0; /* cullFirstVertex (not used)*/ *cmdIt++ = baseInstance; /* cullBaseInstance */ *cmdIt++ = modelDC; *cmdIt++ = constantDC; *cmdIt++ = elementDC; *cmdIt++ = primitiveDC; *cmdIt++ = fvarDC; *cmdIt++ = instanceIndexDC; *cmdIt++ = shaderDC; *cmdIt++ = vertexDC; *cmdIt++ = topologyVisibilityDC; } else { *cmdIt++ = vertexCount; *cmdIt++ = instanceCount; *cmdIt++ = vertexOffset; *cmdIt++ = baseInstance; cmdIt++; // __reserved_0 *cmdIt++ = modelDC; *cmdIt++ = constantDC; *cmdIt++ = elementDC; *cmdIt++ = primitiveDC; *cmdIt++ = fvarDC; *cmdIt++ = instanceIndexDC; *cmdIt++ = shaderDC; *cmdIt++ = vertexDC; *cmdIt++ = topologyVisibilityDC; } } else { if (_useGpuInstanceCulling) { *cmdIt++ = indicesCount; *cmdIt++ = instanceCount; *cmdIt++ = firstIndex; *cmdIt++ = vertexOffset; *cmdIt++ = baseInstance; *cmdIt++ = 1; /* cullCount (always 1) */ *cmdIt++ = instanceCount; /* cullInstanceCount */ *cmdIt++ = 0; /* cullFirstVertex (not used)*/ *cmdIt++ = baseInstance; /* cullBaseInstance */ *cmdIt++ = modelDC; *cmdIt++ = constantDC; *cmdIt++ = elementDC; *cmdIt++ = primitiveDC; *cmdIt++ = fvarDC; *cmdIt++ = instanceIndexDC; *cmdIt++ = shaderDC; *cmdIt++ = vertexDC; *cmdIt++ = topologyVisibilityDC; } else { *cmdIt++ = indicesCount; *cmdIt++ = instanceCount; *cmdIt++ = firstIndex; *cmdIt++ = vertexOffset; *cmdIt++ = baseInstance; *cmdIt++ = modelDC; *cmdIt++ = constantDC; *cmdIt++ = elementDC; *cmdIt++ = primitiveDC; *cmdIt++ = fvarDC; *cmdIt++ = instanceIndexDC; *cmdIt++ = shaderDC; *cmdIt++ = vertexDC; *cmdIt++ = topologyVisibilityDC; } } for (size_t i = 0; i < instancerNumLevels; ++i) { GLuint instanceDC = _GetElementOffset(instanceBars[i]); *cmdIt++ = instanceDC; } if (TfDebug::IsEnabled(HD_MDI)) { std::vector::iterator cmdIt2 = cmdIt - commandNumUints; std::cout << " - "; while (cmdIt2 != cmdIt) { std::cout << *cmdIt2 << " "; cmdIt2++; } std::cout << std::endl; } _numVisibleItems += instanceCount; _numTotalElements += numElements; _numTotalVertices += vertexCount; } TF_DEBUG(HD_MDI).Msg(" - Num Visible: %zu\n", _numVisibleItems); TF_DEBUG(HD_MDI).Msg(" - Total Elements: %zu\n", _numTotalElements); TF_DEBUG(HD_MDI).Msg(" - Total Verts: %zu\n", _numTotalVertices); // make sure we filled all TF_VERIFY(cmdIt == _drawCommandBuffer.end()); // allocate draw dispatch buffer _dispatchBuffer = resourceRegistry->RegisterDispatchBuffer(_tokens->drawIndirect, drawCount, commandNumUints); // define binding views if (_useDrawArrays) { if (_useGpuInstanceCulling) { // draw indirect command _dispatchBuffer->AddBufferResourceView( HdTokens->drawDispatch, {HdTypeInt32, 1}, offsetof(_DrawArraysInstanceCullCommand, count)); // drawing coords 0 _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoord0, {HdTypeInt32Vec4, 1}, offsetof(_DrawArraysInstanceCullCommand, modelDC)); // drawing coords 1 _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoord1, {HdTypeInt32Vec4, 1}, offsetof(_DrawArraysInstanceCullCommand, fvarDC)); // drawing coords 2 _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoord2, {HdTypeInt32, 1}, offsetof(_DrawArraysInstanceCullCommand, topologyVisibilityDC)); // instance drawing coords if (instancerNumLevels > 0) { _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoordI, {HdTypeInt32, instancerNumLevels}, sizeof(_DrawArraysInstanceCullCommand)); } } else { // draw indirect command _dispatchBuffer->AddBufferResourceView( HdTokens->drawDispatch, {HdTypeInt32, 1}, offsetof(_DrawArraysCommand, count)); // drawing coords 0 _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoord0, {HdTypeInt32Vec4, 1}, offsetof(_DrawArraysCommand, modelDC)); // drawing coords 1 _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoord1, {HdTypeInt32Vec4, 1}, offsetof(_DrawArraysCommand, fvarDC)); // drawing coords 2 _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoord2, {HdTypeInt32, 1}, offsetof(_DrawArraysCommand, topologyVisibilityDC)); // instance drawing coords if (instancerNumLevels > 0) { _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoordI, {HdTypeInt32, instancerNumLevels}, sizeof(_DrawArraysCommand)); } } } else { if (_useGpuInstanceCulling) { // draw indirect command _dispatchBuffer->AddBufferResourceView( HdTokens->drawDispatch, {HdTypeInt32, 1}, offsetof(_DrawElementsInstanceCullCommand, count)); // drawing coords 0 _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoord0, {HdTypeInt32Vec4, 1}, offsetof(_DrawElementsInstanceCullCommand, modelDC)); // drawing coords 1 _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoord1, {HdTypeInt32Vec4, 1}, offsetof(_DrawElementsInstanceCullCommand, fvarDC)); // drawing coords 2 _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoord2, {HdTypeInt32, 1}, offsetof(_DrawElementsInstanceCullCommand, topologyVisibilityDC)); // instance drawing coords if (instancerNumLevels > 0) { _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoordI, {HdTypeInt32, instancerNumLevels}, sizeof(_DrawElementsInstanceCullCommand)); } } else { // draw indirect command _dispatchBuffer->AddBufferResourceView( HdTokens->drawDispatch, {HdTypeInt32, 1}, offsetof(_DrawElementsCommand, count)); // drawing coords 0 _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoord0, {HdTypeInt32Vec4, 1}, offsetof(_DrawElementsCommand, modelDC)); // drawing coords 1 _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoord1, {HdTypeInt32Vec4, 1}, offsetof(_DrawElementsCommand, fvarDC)); // drawing coords 2 _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoord2, {HdTypeInt32, 1}, offsetof(_DrawElementsCommand, topologyVisibilityDC)); // instance drawing coords if (instancerNumLevels > 0) { _dispatchBuffer->AddBufferResourceView( HdTokens->drawingCoordI, {HdTypeInt32, instancerNumLevels}, sizeof(_DrawElementsCommand)); } } } // copy data _dispatchBuffer->CopyData(_drawCommandBuffer); if (_useGpuCulling) { // Make a duplicate of the draw dispatch buffer to use as an input // for GPU frustum culling (a single buffer cannot be bound for // both reading and xform feedback). We use only the instanceCount // and drawingCoord parameters, but it is simplest to just make // a copy. _dispatchBufferCullInput = resourceRegistry->RegisterDispatchBuffer( _tokens->drawIndirectCull, drawCount, commandNumUints); // define binding views // // READ THIS CAREFULLY whenever you try to add/remove/shuffle // the drawing coordinate struct. // // We use vec2 as a type of drawingCoord1 for GPU culling: // // DrawingCoord1 is defined as 4 integers struct: // GLuint fvarDC; // GLuint instanceIndexDC; // GLuint shaderDC; // GLuint vertexDC; // // And CodeGen generates GetInstanceIndexCoord() as // // int GetInstanceIndexCoord() { return GetDrawingCoord1().y; } // // So the instanceIndex coord must be the second element. // That is why we need to add, at minimum, vec2 for drawingCoord1. // // We don't add a vec4, since we prefer smaller number of attributes // to be processed in the vertex input assembler, which in general gives // better performance especially in older hardware. In this case we // can't skip fvarDC without changing CodeGen logic, but we can // skip shaderDC and vertexDC for culling. // // XXX: Reorder members of drawingCoord0 and drawingCoord1 in CodeGen, // so we can minimize the vertex attributes fetched during culling. // // Since drawingCoord2 contains only topological visibility, we skip it // for the culling pass. // if (_useDrawArrays) { if (_useGpuInstanceCulling) { // cull indirect command _dispatchBufferCullInput->AddBufferResourceView( HdTokens->drawDispatch, {HdTypeInt32, 1}, offsetof(_DrawArraysInstanceCullCommand, cullCount)); // cull drawing coord 0 _dispatchBufferCullInput->AddBufferResourceView( HdTokens->drawingCoord0, {HdTypeInt32Vec4, 1}, offsetof(_DrawArraysInstanceCullCommand, modelDC)); // cull drawing coord 1 _dispatchBufferCullInput->AddBufferResourceView( // see the comment above HdTokens->drawingCoord1, {HdTypeInt32Vec2, 1}, offsetof(_DrawArraysInstanceCullCommand, fvarDC)); // cull instance drawing coord if (instancerNumLevels > 0) { _dispatchBufferCullInput->AddBufferResourceView( HdTokens->drawingCoordI, {HdTypeInt32, instancerNumLevels}, sizeof(_DrawArraysInstanceCullCommand)); } // cull draw index _dispatchBufferCullInput->AddBufferResourceView( _tokens->drawCommandIndex, {HdTypeInt32, 1}, offsetof(_DrawArraysInstanceCullCommand, baseInstance)); } else { // cull indirect command _dispatchBufferCullInput->AddBufferResourceView( HdTokens->drawDispatch, {HdTypeInt32, 1}, offsetof(_DrawArraysCommand, count)); // cull drawing coord 0 _dispatchBufferCullInput->AddBufferResourceView( HdTokens->drawingCoord0, {HdTypeInt32Vec4, 1}, offsetof(_DrawArraysCommand, modelDC)); // cull draw index _dispatchBufferCullInput->AddBufferResourceView( _tokens->drawCommandIndex, {HdTypeInt32, 1}, offsetof(_DrawArraysCommand, baseInstance)); // cull instance count input _dispatchBufferCullInput->AddBufferResourceView( _tokens->instanceCountInput, {HdTypeInt32, 1}, offsetof(_DrawArraysCommand, instanceCount)); } } else { if (_useGpuInstanceCulling) { // cull indirect command _dispatchBufferCullInput->AddBufferResourceView( HdTokens->drawDispatch, {HdTypeInt32, 1}, offsetof(_DrawElementsInstanceCullCommand, cullCount)); // cull drawing coord 0 _dispatchBufferCullInput->AddBufferResourceView( HdTokens->drawingCoord0, {HdTypeInt32Vec4, 1}, offsetof(_DrawElementsInstanceCullCommand, modelDC)); // cull drawing coord 1 _dispatchBufferCullInput->AddBufferResourceView( // see the comment above HdTokens->drawingCoord1, {HdTypeInt32Vec2, 1}, offsetof(_DrawElementsInstanceCullCommand, fvarDC)); // cull instance drawing coord if (instancerNumLevels > 0) { _dispatchBufferCullInput->AddBufferResourceView( HdTokens->drawingCoordI, {HdTypeInt32, instancerNumLevels}, sizeof(_DrawElementsInstanceCullCommand)); } // cull draw index _dispatchBufferCullInput->AddBufferResourceView( _tokens->drawCommandIndex, {HdTypeInt32, 1}, offsetof(_DrawElementsInstanceCullCommand, baseInstance)); } else { // cull indirect command _dispatchBufferCullInput->AddBufferResourceView( HdTokens->drawDispatch, {HdTypeInt32, 1}, offsetof(_DrawElementsCommand, count)); // cull drawing coord 0 _dispatchBufferCullInput->AddBufferResourceView( HdTokens->drawingCoord0, {HdTypeInt32Vec4, 1}, offsetof(_DrawElementsCommand, modelDC)); // cull draw index _dispatchBufferCullInput->AddBufferResourceView( _tokens->drawCommandIndex, {HdTypeInt32, 1}, offsetof(_DrawElementsCommand, baseInstance)); // cull instance count input _dispatchBufferCullInput->AddBufferResourceView( _tokens->instanceCountInput, {HdTypeInt32, 1}, offsetof(_DrawElementsCommand, instanceCount)); } } // copy data _dispatchBufferCullInput->CopyData(_drawCommandBuffer); } // cache the location of instanceCount, to be used at // DrawItemInstanceChanged(). if (_useDrawArrays) { if (_useGpuInstanceCulling) { _instanceCountOffset = offsetof(_DrawArraysInstanceCullCommand, instanceCount)/sizeof(GLuint); _cullInstanceCountOffset = offsetof(_DrawArraysInstanceCullCommand, cullInstanceCount)/sizeof(GLuint); } else { _instanceCountOffset = _cullInstanceCountOffset = offsetof(_DrawArraysCommand, instanceCount)/sizeof(GLuint); } } else { if (_useGpuInstanceCulling) { _instanceCountOffset = offsetof(_DrawElementsInstanceCullCommand, instanceCount)/sizeof(GLuint); _cullInstanceCountOffset = offsetof(_DrawElementsInstanceCullCommand, cullInstanceCount)/sizeof(GLuint); } else { _instanceCountOffset = _cullInstanceCountOffset = offsetof(_DrawElementsCommand, instanceCount)/sizeof(GLuint); } } } bool HdSt_IndirectDrawBatch::Validate(bool deepValidation) { if (!TF_VERIFY(!_drawItemInstances.empty())) return false; // check the hash to see they've been reallocated/migrated or not. // note that we just need to compare the hash of the first item, // since drawitems are aggregated and ensure that they are sharing // same buffer arrays. HdStDrawItem const* batchItem = _drawItemInstances.front()->GetDrawItem(); size_t bufferArraysHash = batchItem->GetBufferArraysHash(); if (_bufferArraysHash != bufferArraysHash) { _bufferArraysHash = bufferArraysHash; _dispatchBuffer.reset(); return false; } // Deep validation is needed when a drawItem changes its buffer spec, // surface shader or geometric shader. if (deepValidation) { // look through all draw items to be still compatible size_t numDrawItemInstances = _drawItemInstances.size(); for (size_t item = 0; item < numDrawItemInstances; ++item) { HdStDrawItem const * drawItem = _drawItemInstances[item]->GetDrawItem(); if (!TF_VERIFY(drawItem->GetGeometricShader())) { return false; } if (!_IsAggregated(batchItem, drawItem)) { return false; } } } return true; } void HdSt_IndirectDrawBatch::_ValidateCompatibility( HdStBufferArrayRangeGLSharedPtr const& constantBar, HdStBufferArrayRangeGLSharedPtr const& indexBar, HdStBufferArrayRangeGLSharedPtr const& topologyVisibilityBar, HdStBufferArrayRangeGLSharedPtr const& elementBar, HdStBufferArrayRangeGLSharedPtr const& fvarBar, HdStBufferArrayRangeGLSharedPtr const& vertexBar, int instancerNumLevels, HdStBufferArrayRangeGLSharedPtr const& instanceIndexBar, std::vector const& instanceBars) const { HdStDrawItem const* failed = nullptr; for (HdStDrawItemInstance const* itemInstance : _drawItemInstances) { HdStDrawItem const* itm = itemInstance->GetDrawItem(); if (constantBar && !TF_VERIFY(constantBar ->IsAggregatedWith(itm->GetConstantPrimvarRange()))) { failed = itm; break; } if (indexBar && !TF_VERIFY(indexBar ->IsAggregatedWith(itm->GetTopologyRange()))) { failed = itm; break; } if (topologyVisibilityBar && !TF_VERIFY(topologyVisibilityBar ->IsAggregatedWith(itm->GetTopologyVisibilityRange()))) { failed = itm; break; } if (elementBar && !TF_VERIFY(elementBar ->IsAggregatedWith(itm->GetElementPrimvarRange()))) { failed = itm; break; } if (fvarBar && !TF_VERIFY(fvarBar ->IsAggregatedWith(itm->GetFaceVaryingPrimvarRange()))) { failed = itm; break; } if (vertexBar && !TF_VERIFY(vertexBar ->IsAggregatedWith(itm->GetVertexPrimvarRange()))) { failed = itm; break; } if (!TF_VERIFY(instancerNumLevels == itm->GetInstancePrimvarNumLevels())) { failed = itm; break; } if (instanceIndexBar && !TF_VERIFY(instanceIndexBar ->IsAggregatedWith(itm->GetInstanceIndexRange()))) { failed = itm; break; } if (!TF_VERIFY(instancerNumLevels == (int)instanceBars.size())) { failed = itm; break; } std::vector itmInstanceBars( instancerNumLevels); if (instanceIndexBar) { for (int i = 0; i < instancerNumLevels; ++i) { if (itmInstanceBars[i] && !TF_VERIFY(itmInstanceBars[i] ->IsAggregatedWith(itm->GetInstancePrimvarRange(i)), "%d", i)) { failed = itm; break; } } } } if (failed) { std::cout << failed->GetRprimID() << std::endl; } } void HdSt_IndirectDrawBatch::PrepareDraw( HdStRenderPassStateSharedPtr const &renderPassState, HdStResourceRegistrySharedPtr const &resourceRegistry) { HD_TRACE_FUNCTION(); if (!glBindBuffer) return; // glew initialized GLF_GROUP_FUNCTION(); // // compile // if (!_dispatchBuffer) { _CompileBatch(resourceRegistry); } // there is no non-zero draw items. if (( _useDrawArrays && _numTotalVertices == 0) || (!_useDrawArrays && _numTotalElements == 0)) return; HdStDrawItem const* batchItem = _drawItemInstances.front()->GetDrawItem(); // Bypass freezeCulling if the command buffer is dirty. bool freezeCulling = TfDebug::IsEnabled(HD_FREEZE_CULL_FRUSTUM) && !_drawCommandBufferDirty; bool gpuCulling = _useGpuCulling; if (gpuCulling && !_useGpuInstanceCulling) { // disable GPU culling when instancing enabled and // not using instance culling. if (batchItem->GetInstanceIndexRange()) gpuCulling = false; } // Do we have to update our dispatch buffer because drawitem instance // data has changed? // On the first time through, after batches have just been compiled, // the flag will be false because the resource registry will have already // uploaded the buffer. if (_drawCommandBufferDirty) { _dispatchBuffer->CopyData(_drawCommandBuffer); if (gpuCulling) { _dispatchBufferCullInput->CopyData(_drawCommandBuffer); } _drawCommandBufferDirty = false; } // // cull // if (gpuCulling && !freezeCulling) { if (_useGpuInstanceCulling) { _GPUFrustumInstanceCulling( batchItem, renderPassState, resourceRegistry); } else { _GPUFrustumNonInstanceCulling( batchItem, renderPassState, resourceRegistry); } } if (TfDebug::IsEnabled(HD_DRAWITEM_DRAWN)) { void const *bufferData = NULL; // instanceCount is a second entry of drawcommand for both // DrawArraysIndirect and DrawElementsIndirect. const void *instanceCountOffset = (const void*) (_dispatchBuffer->GetResource(HdTokens->drawDispatch)->GetOffset() + sizeof(GLuint)); const int dispatchBufferStride = _dispatchBuffer->GetEntireResource()->GetStride(); GlfContextCaps const &caps = GlfContextCaps::GetInstance(); if (gpuCulling) { HgiBufferHandle const& buffer = _dispatchBuffer->GetEntireResource()->GetId(); if (caps.directStateAccessEnabled) { bufferData = glMapNamedBuffer( buffer->GetRawResource(), GL_READ_ONLY); } else { glBindBuffer(GL_ARRAY_BUFFER, buffer->GetRawResource()); bufferData = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY); glBindBuffer(GL_ARRAY_BUFFER, 0); } } for (size_t item=0; item<_drawItemInstances.size(); ++item) { HdStDrawItemInstance const * drawItemInstance = _drawItemInstances[item]; if(!drawItemInstance->IsVisible()) { continue; } HdStDrawItem const * drawItem = drawItemInstance->GetDrawItem(); if (gpuCulling) { GLint const *instanceCount = (GLint const *)( (ptrdiff_t)(bufferData) + (ptrdiff_t)(instanceCountOffset) + item*dispatchBufferStride); bool isVisible = (*instanceCount > 0); if (!isVisible) { continue; } } std::stringstream ss; ss << *drawItem; TF_DEBUG(HD_DRAWITEM_DRAWN).Msg("PREP DRAW: \n%s\n", ss.str().c_str()); } if (gpuCulling) { HgiBufferHandle const& buffer = _dispatchBuffer->GetEntireResource()->GetId(); if (caps.directStateAccessEnabled) { glUnmapNamedBuffer(buffer->GetRawResource()); } else { glBindBuffer(GL_ARRAY_BUFFER, buffer->GetRawResource()); glUnmapBuffer(GL_ARRAY_BUFFER); glBindBuffer(GL_ARRAY_BUFFER, 0); } } } if (gpuCulling && !freezeCulling) { if (IsEnabledGPUCountVisibleInstances()) { _EndGPUCountVisibleInstances(_cullResultSync, &_numVisibleItems); glDeleteSync(_cullResultSync); _cullResultSync = 0; } } } void HdSt_IndirectDrawBatch::ExecuteDraw( HdStRenderPassStateSharedPtr const &renderPassState, HdStResourceRegistrySharedPtr const &resourceRegistry) { HD_TRACE_FUNCTION(); if (!glBindBuffer) return; // glew initialized if (!TF_VERIFY(!_drawItemInstances.empty())) return; HdStDrawItem const* batchItem = _drawItemInstances.front()->GetDrawItem(); if (!TF_VERIFY(batchItem)) return; if (!TF_VERIFY(_dispatchBuffer)) return; // there is no non-zero draw items. if (( _useDrawArrays && _numTotalVertices == 0) || (!_useDrawArrays && _numTotalElements == 0)) return; GLF_GROUP_FUNCTION(); // // draw // // bind program _DrawingProgram & program = _GetDrawingProgram(renderPassState, /*indirect=*/true, resourceRegistry); HdStGLSLProgramSharedPtr const &glslProgram = program.GetGLSLProgram(); if (!TF_VERIFY(glslProgram)) return; if (!TF_VERIFY(glslProgram->Validate())) return; GLuint programId = glslProgram->GetProgram()->GetRawResource(); TF_VERIFY(programId); GlfDebugLabelProgram(programId, "DrawingProgram"); glUseProgram(programId); const HdSt_ResourceBinder &binder = program.GetBinder(); const HdStShaderCodeSharedPtrVector &shaders = program.GetComposedShaders(); // XXX: for surfaces shader, we need to iterate all drawItems to // make textures resident, instead of just the first batchItem TF_FOR_ALL(it, shaders) { (*it)->BindResources(programId, binder, *renderPassState); } // constant buffer bind HdBufferArrayRangeSharedPtr constantBar_ = batchItem->GetConstantPrimvarRange(); HdStBufferArrayRangeGLSharedPtr constantBar = std::static_pointer_cast(constantBar_); binder.BindConstantBuffer(constantBar); // index buffer bind HdBufferArrayRangeSharedPtr indexBar_ = batchItem->GetTopologyRange(); HdStBufferArrayRangeGLSharedPtr indexBar = std::static_pointer_cast(indexBar_); binder.BindBufferArray(indexBar); // topology visibility buffer bind HdBufferArrayRangeSharedPtr topVisBar_ = batchItem->GetTopologyVisibilityRange(); HdStBufferArrayRangeGLSharedPtr topVisBar = std::static_pointer_cast(topVisBar_); binder.BindInterleavedBuffer(topVisBar, HdTokens->topologyVisibility); // element buffer bind HdBufferArrayRangeSharedPtr elementBar_ = batchItem->GetElementPrimvarRange(); HdStBufferArrayRangeGLSharedPtr elementBar = std::static_pointer_cast(elementBar_); binder.BindBufferArray(elementBar); // fvar buffer bind HdBufferArrayRangeSharedPtr fvarBar_ = batchItem->GetFaceVaryingPrimvarRange(); HdStBufferArrayRangeGLSharedPtr fvarBar = std::static_pointer_cast(fvarBar_); binder.BindBufferArray(fvarBar); // vertex buffer bind HdBufferArrayRangeSharedPtr vertexBar_ = batchItem->GetVertexPrimvarRange(); HdStBufferArrayRangeGLSharedPtr vertexBar = std::static_pointer_cast(vertexBar_); binder.BindBufferArray(vertexBar); // instance buffer bind int instancerNumLevels = batchItem->GetInstancePrimvarNumLevels(); std::vector instanceBars(instancerNumLevels); // instance index indirection HdBufferArrayRangeSharedPtr instanceIndexBar_ = batchItem->GetInstanceIndexRange(); HdStBufferArrayRangeGLSharedPtr instanceIndexBar = std::static_pointer_cast(instanceIndexBar_); if (instanceIndexBar) { // note that while instanceIndexBar is mandatory for instancing but // instanceBar can technically be empty (it doesn't make sense though) // testHdInstance --noprimvars covers that case. for (int i = 0; i < instancerNumLevels; ++i) { HdBufferArrayRangeSharedPtr ins_ = batchItem->GetInstancePrimvarRange(i); HdStBufferArrayRangeGLSharedPtr ins = std::static_pointer_cast(ins_); instanceBars[i] = ins; binder.BindInstanceBufferArray(instanceBars[i], i); } binder.BindBufferArray(instanceIndexBar); } if (false && ARCH_UNLIKELY(TfDebug::IsEnabled(HD_SAFE_MODE))) { _ValidateCompatibility(constantBar, indexBar, topVisBar, elementBar, fvarBar, vertexBar, instancerNumLevels, instanceIndexBar, instanceBars); } // shader buffer bind HdStBufferArrayRangeGLSharedPtr shaderBar; TF_FOR_ALL(shader, shaders) { HdBufferArrayRangeSharedPtr shaderBar_ = (*shader)->GetShaderData(); shaderBar = std::static_pointer_cast(shaderBar_); if (shaderBar) { binder.BindBuffer(HdTokens->materialParams, shaderBar->GetResource()); } } // drawindirect command, drawing coord, instanceIndexBase bind HdStBufferArrayRangeGLSharedPtr dispatchBar = _dispatchBuffer->GetBufferArrayRange(); binder.BindBufferArray(dispatchBar); // update geometric shader states program.GetGeometricShader()->BindResources( programId, binder, *renderPassState); GLuint batchCount = _dispatchBuffer->GetCount(); TF_DEBUG(HD_DRAWITEM_DRAWN).Msg("DRAW (indirect): %d\n", batchCount); if (_useDrawArrays) { TF_DEBUG(HD_MDI).Msg("MDI Drawing Arrays:\n" " - primitive mode: %d\n" " - indirect: %d\n" " - drawCount: %d\n" " - stride: %zu\n", program.GetGeometricShader()->GetPrimitiveMode(), 0, batchCount, _dispatchBuffer->GetCommandNumUints()*sizeof(GLuint)); glMultiDrawArraysIndirect( program.GetGeometricShader()->GetPrimitiveMode(), 0, // draw command always starts with 0 batchCount, _dispatchBuffer->GetCommandNumUints()*sizeof(GLuint)); } else { TF_DEBUG(HD_MDI).Msg("MDI Drawing Elements:\n" " - primitive mode: %d\n" " - buffer type: GL_UNSIGNED_INT\n" " - indirect: %d\n" " - drawCount: %d\n" " - stride: %zu\n", program.GetGeometricShader()->GetPrimitiveMode(), 0, batchCount, _dispatchBuffer->GetCommandNumUints()*sizeof(GLuint)); glMultiDrawElementsIndirect( program.GetGeometricShader()->GetPrimitiveMode(), GL_UNSIGNED_INT, 0, // draw command always starts with 0 batchCount, _dispatchBuffer->GetCommandNumUints()*sizeof(GLuint)); } HD_PERF_COUNTER_INCR(HdPerfTokens->drawCalls); HD_PERF_COUNTER_ADD(HdTokens->itemsDrawn, _numVisibleItems); // // cleanup // binder.UnbindConstantBuffer(constantBar); binder.UnbindInterleavedBuffer(topVisBar, HdTokens->topologyVisibility); binder.UnbindBufferArray(elementBar); binder.UnbindBufferArray(fvarBar); binder.UnbindBufferArray(indexBar); binder.UnbindBufferArray(vertexBar); binder.UnbindBufferArray(dispatchBar); if(shaderBar) { binder.UnbindBuffer(HdTokens->materialParams, shaderBar->GetResource()); } if (instanceIndexBar) { for (int i = 0; i < instancerNumLevels; ++i) { binder.UnbindInstanceBufferArray(instanceBars[i], i); } binder.UnbindBufferArray(instanceIndexBar); } TF_FOR_ALL(it, shaders) { (*it)->UnbindResources(programId, binder, *renderPassState); } program.GetGeometricShader()->UnbindResources(programId, binder, *renderPassState); glUseProgram(0); } void HdSt_IndirectDrawBatch::_GPUFrustumInstanceCulling( HdStDrawItem const *batchItem, HdStRenderPassStateSharedPtr const &renderPassState, HdStResourceRegistrySharedPtr const &resourceRegistry) { HdBufferArrayRangeSharedPtr constantBar_ = batchItem->GetConstantPrimvarRange(); HdStBufferArrayRangeGLSharedPtr constantBar = std::static_pointer_cast(constantBar_); int instancerNumLevels = batchItem->GetInstancePrimvarNumLevels(); std::vector instanceBars(instancerNumLevels); for (int i = 0; i < instancerNumLevels; ++i) { HdBufferArrayRangeSharedPtr ins_ = batchItem->GetInstancePrimvarRange(i); HdStBufferArrayRangeGLSharedPtr ins = std::static_pointer_cast(ins_); instanceBars[i] = ins; } HdBufferArrayRangeSharedPtr instanceIndexBar_ = batchItem->GetInstanceIndexRange(); HdStBufferArrayRangeGLSharedPtr instanceIndexBar = std::static_pointer_cast(instanceIndexBar_); HdStBufferArrayRangeGLSharedPtr cullDispatchBar = _dispatchBufferCullInput->GetBufferArrayRange(); _CullingProgram cullingProgram = _GetCullingProgram(resourceRegistry); HdStGLSLProgramSharedPtr const & glslProgram = cullingProgram.GetGLSLProgram(); if (!TF_VERIFY(glslProgram)) return; if (!TF_VERIFY(glslProgram->Validate())) return; // We perform frustum culling on the GPU with the rasterizer disabled, // stomping the instanceCount of each drawing command in the // dispatch buffer to 0 for primitives that are culled, skipping // over other elements. const HdSt_ResourceBinder &binder = cullingProgram.GetBinder(); GLuint programId = glslProgram->GetProgram()->GetRawResource(); glUseProgram(programId); // bind buffers binder.BindConstantBuffer(constantBar); // bind per-drawitem attribute (drawingCoord, instanceCount, drawCommand) binder.BindBufferArray(cullDispatchBar); if (instanceIndexBar) { int instancerNumLevels = batchItem->GetInstancePrimvarNumLevels(); for (int i = 0; i < instancerNumLevels; ++i) { binder.BindInstanceBufferArray(instanceBars[i], i); } binder.BindBufferArray(instanceIndexBar); } if (IsEnabledGPUCountVisibleInstances()) { _BeginGPUCountVisibleInstances(resourceRegistry); } // bind destination buffer (using entire buffer bind to start from offset=0) binder.BindBuffer(_tokens->dispatchBuffer, _dispatchBuffer->GetEntireResource()); // set cull parameters unsigned int drawCommandNumUints = _dispatchBuffer->GetCommandNumUints(); GfMatrix4f cullMatrix(renderPassState->GetCullMatrix()); GfVec2f drawRangeNDC(renderPassState->GetDrawingRangeNDC()); binder.BindUniformui(_tokens->ulocDrawCommandNumUints, 1, &drawCommandNumUints); binder.BindUniformf(_tokens->ulocCullMatrix, 16, cullMatrix.GetArray()); if (_useTinyPrimCulling) { binder.BindUniformf(_tokens->ulocDrawRangeNDC, 2, drawRangeNDC.GetArray()); } // run culling shader bool validProgram = true; // XXX: should we cache cull command offset? HdStBufferResourceGLSharedPtr cullCommandBuffer = _dispatchBufferCullInput->GetResource(HdTokens->drawDispatch); if (!TF_VERIFY(cullCommandBuffer)) { validProgram = false; } if (validProgram) { glEnable(GL_RASTERIZER_DISCARD); int resetPass = 1; binder.BindUniformi(_tokens->ulocResetPass, 1, &resetPass); glMultiDrawArraysIndirect( GL_POINTS, reinterpret_cast( static_cast(cullCommandBuffer->GetOffset())), _dispatchBufferCullInput->GetCount(), cullCommandBuffer->GetStride()); // dispatch buffer is bound via SSBO // (see _CullingProgram::_GetCustomBindings) glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); resetPass = 0; binder.BindUniformi(_tokens->ulocResetPass, 1, &resetPass); glMultiDrawArraysIndirect( GL_POINTS, reinterpret_cast( static_cast(cullCommandBuffer->GetOffset())), _dispatchBufferCullInput->GetCount(), cullCommandBuffer->GetStride()); glDisable(GL_RASTERIZER_DISCARD); } // Reset all vertex attribs and their divisors. Note that the drawing // program has different bindings from the culling program does // in general, even though most of buffers will likely be assigned // with same attrib divisors again. binder.UnbindConstantBuffer(constantBar); binder.UnbindBufferArray(cullDispatchBar); if (instanceIndexBar) { int instancerNumLevels = batchItem->GetInstancePrimvarNumLevels(); for (int i = 0; i < instancerNumLevels; ++i) { binder.UnbindInstanceBufferArray(instanceBars[i], i); } binder.UnbindBufferArray(instanceIndexBar); } // unbind destination dispatch buffer binder.UnbindBuffer(_tokens->dispatchBuffer, _dispatchBuffer->GetEntireResource()); // make sure the culling results (instanceIndices and instanceCount) // are synchronized for the next drawing. glMemoryBarrier( GL_COMMAND_BARRIER_BIT | // instanceCount for MDI GL_SHADER_STORAGE_BARRIER_BIT | // instanceCount for shader GL_UNIFORM_BARRIER_BIT); // instanceIndices // a fence has to be added after the memory barrier. if (IsEnabledGPUCountVisibleInstances()) { _cullResultSync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); } else { _cullResultSync = 0; } } void HdSt_IndirectDrawBatch::_GPUFrustumNonInstanceCulling( HdStDrawItem const *batchItem, HdStRenderPassStateSharedPtr const &renderPassState, HdStResourceRegistrySharedPtr const &resourceRegistry) { HdBufferArrayRangeSharedPtr constantBar_ = batchItem->GetConstantPrimvarRange(); HdStBufferArrayRangeGLSharedPtr constantBar = std::static_pointer_cast(constantBar_); HdStBufferArrayRangeGLSharedPtr cullDispatchBar = _dispatchBufferCullInput->GetBufferArrayRange(); _CullingProgram &cullingProgram = _GetCullingProgram(resourceRegistry); HdStGLSLProgramSharedPtr const & glslProgram = cullingProgram.GetGLSLProgram(); if (!TF_VERIFY(glslProgram)) return; if (!TF_VERIFY(glslProgram->Validate())) return; // We perform frustum culling on the GPU with the rasterizer disabled, // stomping the instanceCount of each drawing command in the // dispatch buffer to 0 for primitives that are culled, skipping // over other elements. GLuint programId = glslProgram->GetProgram()->GetRawResource(); glUseProgram(programId); const HdSt_ResourceBinder &binder = cullingProgram.GetBinder(); // bind constant binder.BindConstantBuffer(constantBar); // bind drawing coord, instance count binder.BindBufferArray(cullDispatchBar); if (IsEnabledGPUCountVisibleInstances()) { _BeginGPUCountVisibleInstances(resourceRegistry); } // set cull parameters unsigned int drawCommandNumUints = _dispatchBuffer->GetCommandNumUints(); GfMatrix4f cullMatrix(renderPassState->GetCullMatrix()); GfVec2f drawRangeNDC(renderPassState->GetDrawingRangeNDC()); binder.BindUniformf(_tokens->ulocCullMatrix, 16, cullMatrix.GetArray()); binder.BindUniformui(_tokens->ulocDrawCommandNumUints, 1, &drawCommandNumUints); if (_useTinyPrimCulling) { binder.BindUniformf(_tokens->ulocDrawRangeNDC, 2, drawRangeNDC.GetArray()); } // bind destination buffer (using entire buffer bind to start from offset=0) binder.BindBuffer(_tokens->dispatchBuffer, _dispatchBuffer->GetEntireResource()); glEnable(GL_RASTERIZER_DISCARD); glDrawArrays(GL_POINTS, 0, _dispatchBufferCullInput->GetCount()); glDisable(GL_RASTERIZER_DISCARD); // unbind destination dispatch buffer binder.UnbindBuffer(_tokens->dispatchBuffer, _dispatchBuffer->GetEntireResource()); // make sure the culling results (instanceCount) // are synchronized for the next drawing. glMemoryBarrier( GL_COMMAND_BARRIER_BIT | // instanceCount for MDI GL_SHADER_STORAGE_BARRIER_BIT // instanceCount for shader ); // a fence has to be added after the memory barrier. if (IsEnabledGPUCountVisibleInstances()) { _cullResultSync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); } else { _cullResultSync = 0; } // unbind all binder.UnbindConstantBuffer(constantBar); binder.UnbindBufferArray(cullDispatchBar); glUseProgram(0); } void HdSt_IndirectDrawBatch::DrawItemInstanceChanged(HdStDrawItemInstance const* instance) { // We need to check the visibility and update if needed if (_dispatchBuffer) { size_t batchIndex = instance->GetBatchIndex(); int commandNumUints = _dispatchBuffer->GetCommandNumUints(); int numLevels = instance->GetDrawItem()->GetInstancePrimvarNumLevels(); int instanceIndexWidth = numLevels + 1; // When non-instance culling is being used, cullcommand points the same // location as drawcommands. Then we update the same place twice, it // might be better than branching. std::vector::iterator instanceCountIt = _drawCommandBuffer.begin() + batchIndex * commandNumUints + _instanceCountOffset; std::vector::iterator cullInstanceCountIt = _drawCommandBuffer.begin() + batchIndex * commandNumUints + _cullInstanceCountOffset; HdBufferArrayRangeSharedPtr const &instanceIndexBar_ = instance->GetDrawItem()->GetInstanceIndexRange(); HdStBufferArrayRangeGLSharedPtr instanceIndexBar = std::static_pointer_cast(instanceIndexBar_); int newInstanceCount = instanceIndexBar ? instanceIndexBar->GetNumElements() : 1; newInstanceCount = instance->IsVisible() ? (newInstanceCount/std::max(1, instanceIndexWidth)) : 0; TF_DEBUG(HD_MDI).Msg("\nInstance Count changed: %d -> %d\n", *instanceCountIt, newInstanceCount); // Update instance count and overall count of visible items. if (static_cast(newInstanceCount) != (*instanceCountIt)) { _numVisibleItems += (newInstanceCount - (*instanceCountIt)); *instanceCountIt = newInstanceCount; *cullInstanceCountIt = newInstanceCount; _drawCommandBufferDirty = true; } } } void HdSt_IndirectDrawBatch::_BeginGPUCountVisibleInstances( HdStResourceRegistrySharedPtr const &resourceRegistry) { if (!_resultBuffer) { _resultBuffer = resourceRegistry->RegisterPersistentBuffer( _tokens->drawIndirectResult, sizeof(GLint), 0); } // Reset visible item count int32_t count = 0; glNamedBufferSubData(_resultBuffer->GetBuffer()->GetRawResource(), 0, sizeof(count), &count); // XXX: temporarily hack during refactoring. // we'd like to use the same API as other buffers. int binding = _cullingProgram.GetBinder().GetBinding( _tokens->drawIndirectResult).GetLocation(); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, binding, _resultBuffer->GetBuffer()->GetRawResource()); } void HdSt_IndirectDrawBatch::_EndGPUCountVisibleInstances(GLsync resultSync, size_t * result) { GLenum status = glClientWaitSync(resultSync, GL_SYNC_FLUSH_COMMANDS_BIT, HD_CULL_RESULT_TIMEOUT_NS); if (status != GL_ALREADY_SIGNALED && status != GL_CONDITION_SATISFIED) { // We could loop, but we don't expect to timeout. TF_RUNTIME_ERROR("Unexpected ClientWaitSync timeout"); *result = 0; return; } // Return visible item count int32_t count = 0; glGetNamedBufferSubData(_resultBuffer->GetBuffer()->GetRawResource(), 0, sizeof(count), &count); *result = count; // XXX: temporarily hack during refactoring. // we'd like to use the same API as other buffers. int binding = _cullingProgram.GetBinder().GetBinding( _tokens->drawIndirectResult).GetLocation(); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, binding, 0); } void HdSt_IndirectDrawBatch::_CullingProgram::Initialize( bool useDrawArrays, bool useInstanceCulling, size_t bufferArrayHash) { if (useDrawArrays != _useDrawArrays || useInstanceCulling != _useInstanceCulling || bufferArrayHash != _bufferArrayHash) { // reset shader Reset(); } _useDrawArrays = useDrawArrays; _useInstanceCulling = useInstanceCulling; _bufferArrayHash = bufferArrayHash; } /* virtual */ void HdSt_IndirectDrawBatch::_CullingProgram::_GetCustomBindings( HdBindingRequestVector *customBindings, bool *enableInstanceDraw) const { if (!TF_VERIFY(enableInstanceDraw) || !TF_VERIFY(customBindings)) return; customBindings->push_back(HdBindingRequest(HdBinding::SSBO, _tokens->drawIndirectResult)); customBindings->push_back(HdBindingRequest(HdBinding::SSBO, _tokens->dispatchBuffer)); customBindings->push_back(HdBindingRequest(HdBinding::UNIFORM, _tokens->ulocDrawRangeNDC)); customBindings->push_back(HdBindingRequest(HdBinding::UNIFORM, _tokens->ulocCullMatrix)); if (_useInstanceCulling) { customBindings->push_back( HdBindingRequest(HdBinding::DRAW_INDEX_INSTANCE, _tokens->drawCommandIndex)); customBindings->push_back( HdBindingRequest(HdBinding::UNIFORM, _tokens->ulocDrawCommandNumUints)); customBindings->push_back( HdBindingRequest(HdBinding::UNIFORM, _tokens->ulocResetPass)); } else { // non-instance culling customBindings->push_back( HdBindingRequest(HdBinding::DRAW_INDEX, _tokens->drawCommandIndex)); customBindings->push_back( HdBindingRequest(HdBinding::DRAW_INDEX, _tokens->instanceCountInput)); customBindings->push_back( HdBindingRequest(HdBinding::UNIFORM, _tokens->ulocDrawCommandNumUints)); } // set instanceDraw true if instanceCulling is enabled. // this value will be used to determine if glVertexAttribDivisor needs to // be enabled or not. *enableInstanceDraw = _useInstanceCulling; } PXR_NAMESPACE_CLOSE_SCOPE