using System.Collections;
using System;
using System.Collections.Generic;
using UnityEngine;
using UnityEditor;
using Unity.Collections;
using Unity.Mathematics;
using UnityEngine.Rendering;
using Unity.Burst;
using Unity.Jobs;
using static UnityEngine.Mesh;
using UnityEngine.Assertions;
using static SLZ.CustomStaticBatching.PackedChannel;
namespace SLZ.CustomStaticBatching
/// <summary>
/// Bundles one renderer's mesh with the scene components it belongs to, so the batching
/// code can index everything it needs from a single array element.
/// </summary>
public struct RendererData
// Mesh drawn by the renderer; its vertexCount, subMeshCount and indexFormat drive binning and combining.
public Mesh mesh;
// NOTE(review): presumably the MeshFilter that owns the mesh; it is not read in the visible part of this file.
public MeshFilter meshFilter;
// Renderer component; its bounds and lightmap index/scale-offset values are read when combining.
public MeshRenderer meshRenderer;
// Transform whose localToWorldMatrix / worldToLocalMatrix are handed to the jobs and the compute shader.
public Transform rendererTransform;
public class SBCombineMeshList
// Backing field for the combine settings. This file reads allow32bitIdx, maxCombined32Idx and
// serializedVtxFormats from it.
CombineRendererSettings crs;
/// <summary>Gets or sets the combine settings used by the binning and layout methods.</summary>
public CombineRendererSettings settings { get => crs; set => crs = value; }
/// <summary>Compute shader used by ComputeCopyMeshes to copy vertex data into the combined vertex buffer.</summary>
public ComputeShader transferVtxBufferCompute;
/// <summary>
/// Creates a combiner that copies vertices with the supplied compute shader and starts
/// from the settings produced by <c>new CombineRendererSettings(true)</c>.
/// </summary>
/// <param name="transferVtxComputeShader">Compute shader stored in <see cref="transferVtxBufferCompute"/>.</param>
public SBCombineMeshList(ComputeShader transferVtxComputeShader)
{
    this.transferVtxBufferCompute = transferVtxComputeShader;
    this.crs = new CombineRendererSettings(true);
}
/// <summary>
/// Takes an array of pre-sorted renderers and bins them into chunks of fewer than 65535 vertices.
/// This is necessary for using ushort index buffers, and assumes all meshes in the list
/// are using 16 bit index buffers already.
/// </summary>
/// <param name="sortedRenderers">Sorted list of renderer data.</param>
/// <param name="renderersLength">Number of valid renderers in <paramref name="sortedRenderers"/>, starting from 0.</param>
/// <param name="renderer2Mesh">Not used by this method; kept so existing callers keep compiling.</param>
/// <param name="renderer2CMeshIdx">Output array mapping each renderer to the index of its bin in <paramref name="cMeshIdxRange"/>.</param>
/// <param name="cMeshIdxRange">Output list of [start, end) renderer index ranges, one per combined mesh.</param>
public void GetCombinedMeshBins16(RendererData[] sortedRenderers, int renderersLength, int[] renderer2Mesh, out ushort[] renderer2CMeshIdx, out List<int2> cMeshIdxRange)
{
    renderer2CMeshIdx = new ushort[renderersLength];
    cMeshIdxRange = new List<int2>();
    if (renderersLength == 0) return;

    int vertexCount = 0;
    ushort currentMeshIdx = 0;   // always equals cMeshIdxRange.Count, i.e. the index of the bin being filled
    int meshGroupBeginIdx = 0;

    for (int rIdx = 0; rIdx < renderersLength; rIdx++)
    {
        int meshVertexCount = sortedRenderers[rIdx].mesh.vertexCount;
        vertexCount += meshVertexCount;

        // Close the current bin once this renderer would push it to the 16-bit limit. The
        // "rIdx > meshGroupBeginIdx" guard stops an empty (n, n) range from being emitted when a
        // single mesh reaches the limit on its own as the first renderer of a bin.
        if (vertexCount >= 0xffff && rIdx > meshGroupBeginIdx)
        {
            cMeshIdxRange.Add(new int2(meshGroupBeginIdx, rIdx));
            currentMeshIdx++;
            meshGroupBeginIdx = rIdx;
            vertexCount = meshVertexCount;
        }
        renderer2CMeshIdx[rIdx] = currentMeshIdx;
    }

    // The loop always leaves one unfinished, non-empty bin behind (meshGroupBeginIdx < renderersLength),
    // even when it holds only the last renderer, so it must always be emitted. Otherwise
    // renderer2CMeshIdx would reference a bin that does not exist.
    cMeshIdxRange.Add(new int2(meshGroupBeginIdx, renderersLength));
}
/// <summary>
/// Takes an array of pre-sorted renderers and bins them into chunks of fewer than 65535 vertices until it
/// hits the first mesh that needs a 32 bit index buffer. After that point it bins into chunks of at most
/// <c>settings.maxCombined32Idx</c> vertices. Assumes the renderer array has been sorted such that
/// 32 bit index meshes are all at the end of the array.
/// </summary>
/// <remarks>
/// When <c>settings.allow32bitIdx</c> is false the 32-bit renderers are not placed in any range and their
/// <paramref name="renderer2CMeshIdx"/> entries keep the default value 0; callers must not treat those
/// entries as valid.
/// </remarks>
/// <param name="sortedRenderers">Sorted list of renderer data.</param>
/// <param name="renderersLength">Range of valid renderers in the sortedRenderers array, starting from 0.</param>
/// <param name="renderer2CMeshIdx">Output array that maps each item in the renderer list to the index of the combined mesh it will be a part of.</param>
/// <param name="cMeshIdxRange">Output list of [start, end) index ranges in the sorted renderer list, where each item represents a group of renderers that will be combined.</param>
/// <param name="largeIdxBinStart">Output index in cMeshIdxRange where 32-bit index meshes start.</param>
public void GetCombinedMeshBins(
    RendererData[] sortedRenderers,
    int renderersLength,
    out ushort[] renderer2CMeshIdx,
    out List<int2> cMeshIdxRange,
    out int largeIdxBinStart)
{
    renderer2CMeshIdx = new ushort[renderersLength];
    cMeshIdxRange = new List<int2>();
    if (renderersLength == 0)
    {
        largeIdxBinStart = 0;
        return;
    }

    int vertexCount = 0;
    ushort currentMeshIdx = 0;   // always equals cMeshIdxRange.Count, i.e. the index of the bin being filled
    int meshGroupBeginIdx = 0;

    // Phase 1: 16-bit meshes, binned into groups containing fewer than 2^16 vertices.
    int rIdx = 0;
    for (; rIdx < renderersLength; rIdx++)
    {
        Mesh m = sortedRenderers[rIdx].mesh;
        if (m.indexFormat == IndexFormat.UInt32)
        {
            break;
        }
        int meshVertexCount = m.vertexCount;
        vertexCount += meshVertexCount;
        // Never close an empty bin (only possible on the very first renderer).
        if (vertexCount >= 0xffff && rIdx > meshGroupBeginIdx)
        {
            cMeshIdxRange.Add(new int2(meshGroupBeginIdx, rIdx));
            currentMeshIdx++;
            meshGroupBeginIdx = rIdx;
            vertexCount = meshVertexCount;
        }
        renderer2CMeshIdx[rIdx] = currentMeshIdx;
    }

    // If the loop ended in the middle of filling a bin, close it. Any non-empty pending bin counts,
    // including one holding a single renderer; skipping it would merge that 16-bit renderer into
    // the first 32-bit bin below.
    if (meshGroupBeginIdx < rIdx)
    {
        cMeshIdxRange.Add(new int2(meshGroupBeginIdx, rIdx));
        currentMeshIdx++;
    }
    meshGroupBeginIdx = rIdx;
    vertexCount = 0;
    largeIdxBinStart = cMeshIdxRange.Count;

    // Phase 2: 32-bit meshes, capped at an arbitrary maximum so combined meshes stay manageable.
    int max32Vtx = crs.maxCombined32Idx;
    if (crs.allow32bitIdx && rIdx < renderersLength)
    {
        for (; rIdx < renderersLength; rIdx++)
        {
            Mesh m = sortedRenderers[rIdx].mesh;
            int meshVertexCount = m.vertexCount;
            vertexCount += meshVertexCount;
            // A single mesh larger than the cap still gets a bin of its own instead of
            // producing an empty range.
            if (vertexCount > max32Vtx && rIdx > meshGroupBeginIdx)
            {
                cMeshIdxRange.Add(new int2(meshGroupBeginIdx, rIdx));
                currentMeshIdx++;
                meshGroupBeginIdx = rIdx;
                vertexCount = meshVertexCount;
            }
            renderer2CMeshIdx[rIdx] = currentMeshIdx;
        }
        // At least one 32-bit renderer was visited, so a non-empty bin is always pending here.
        cMeshIdxRange.Add(new int2(meshGroupBeginIdx, renderersLength));
    }
}
/// <summary>
/// Get a list of unique meshes for the given renderers, and a mapping from the index of the renderer
/// to the index of the unique mesh. For small lists of meshes where doing a job would take longer.
/// </summary>
/// <param name="renderers">Array of renderer structs from which to generate the list of unique meshes.</param>
/// <param name="meshList">Output list of unique meshes, in order of first appearance.</param>
/// <param name="renderer2Mesh">Array that maps each index of the renderer array to an index in the unique mesh list.</param>
public static void SerialGetUniqueMeshes(RendererData[] renderers, out List<Mesh> meshList, out int[] renderer2Mesh)
{
    int rendererCount = renderers.Length;
    meshList = new List<Mesh>(rendererCount);
    renderer2Mesh = new int[rendererCount];

    // Mesh -> position in meshList, so repeated meshes resolve in O(1).
    Dictionary<Mesh, int> meshListIndex = new Dictionary<Mesh, int>(rendererCount);
    for (int i = 0; i < rendererCount; i++)
    {
        Mesh m = renderers[i].mesh;
        if (!meshListIndex.TryGetValue(m, out int index))
        {
            // First time this mesh is seen: it takes the next free slot. The mesh must be appended
            // here, otherwise meshList.Count never grows and every mesh would map to index 0.
            index = meshList.Count;
            meshListIndex.Add(m, index);
            meshList.Add(m);
        }
        renderer2Mesh[i] = index;
    }
}
/// <summary>
/// Get a list of unique meshes for the given renderers, and a mapping from the index of the renderer
/// to the index of the unique mesh. Does the serial version, but also generates the MeshDataArray
/// needed for jobs.
/// </summary>
/// <param name="renderers">Array of renderer structs from which to generate the list of unique meshes.</param>
/// <param name="meshList">Output list of unique meshes.</param>
/// <param name="meshDataArray">Output array of read-only mesh data structs for use by the jobs system. The caller owns it and must dispose it.</param>
/// <param name="renderer2Mesh">Array that maps each index of the renderer array to an index in the unique mesh list.</param>
public static void ParallelGetUniqueMeshes(RendererData[] renderers, out List<Mesh> meshList, out Mesh.MeshDataArray meshDataArray, out int[] renderer2Mesh)
{
    SerialGetUniqueMeshes(renderers, out meshList, out renderer2Mesh);

    // Acquire the snapshot exactly once. Acquiring it a second time and overwriting the first result
    // leaks a MeshDataArray that nobody can dispose any more.
    // NOTE(review): the overwritten acquisition used MeshUtility.AcquireReadOnlyMeshData (UnityEditor).
    // If that variant is required for the targeted Unity version, swap this single call for it
    // rather than calling both.
    meshDataArray = Mesh.AcquireReadOnlyMeshData(meshList);
}
/// <summary>
/// Collects, on the calling thread, the packed description of every vertex channel of every mesh in
/// <paramref name="meshList"/>, and flags meshes that cannot be merged: meshes without a position
/// attribute, meshes with more than two vertex streams, and meshes holding an attribute whose format
/// has no entry in the lookup table below (anything other than Float32, Float16, SNorm8, UNorm8).
/// </summary>
/// <param name="meshList">List of meshes to get the channel information of.</param>
/// <param name="meshChannels">Output array of packed channel information (Allocator.Temp). Element <c>NUM_VTX_CHANNELS * m + c</c> describes channel c of mesh m.</param>
/// <param name="invalidMeshes">Output array of flags (Allocator.Temp), one per mesh. A value of 1 means the mesh can't be combined.</param>
public static void SerialGetMeshLayout(List<Mesh> meshList, out NativeArray<PackedChannel> meshChannels, out NativeArray<byte> invalidMeshes)
{
    int meshCount = meshList.Count;
    meshChannels = new NativeArray<PackedChannel>(NUM_VTX_CHANNELS * meshCount, Allocator.Temp);
    invalidMeshes = new NativeArray<byte>(meshCount, Allocator.Temp);

    // Unity format -> packed format. 16 slots cover the 12 VertexAttributeFormat values (0..11) with
    // headroom; a new enum value beyond 15 would index out of range.
    Span<byte> formatLookup = stackalloc byte[16];
    for (int slot = 0; slot < 16; slot++)
    {
        formatLookup[slot] = (byte)VtxFormats.Invalid;
    }
    formatLookup[(int)VertexAttributeFormat.Float32] = (byte)VtxFormats.Float32;
    formatLookup[(int)VertexAttributeFormat.Float16] = (byte)VtxFormats.Float16;
    formatLookup[(int)VertexAttributeFormat.SNorm8] = (byte)VtxFormats.SNorm8;
    formatLookup[(int)VertexAttributeFormat.UNorm8] = (byte)VtxFormats.UNorm8;

    for (int meshIdx = 0; meshIdx < meshCount; meshIdx++)
    {
        Mesh mesh = meshList[meshIdx];
        int firstSlot = NUM_VTX_CHANNELS * meshIdx;

        // Only two vertex streams are supported for now, and a position is mandatory.
        bool rejected = !mesh.HasVertexAttribute(VertexAttribute.Position) || mesh.vertexBufferCount > 2;

        for (int channel = 0; channel < NUM_VTX_CHANNELS; channel++)
        {
            VertexAttribute attribute = (VertexAttribute)channel;
            if (!mesh.HasVertexAttribute(attribute))
            {
                // Absent channels keep the zeroed entry the NativeArray was created with.
                continue;
            }

            byte packedFormat = formatLookup[(int)mesh.GetVertexAttributeFormat(attribute)];
            if (packedFormat == (int)VtxFormats.Invalid)
            {
                rejected = true;
            }
            meshChannels[firstSlot + channel] = new PackedChannel
            {
                dimension = (byte)mesh.GetVertexAttributeDimension(attribute),
                format = packedFormat,
                offset = (byte)mesh.GetVertexAttributeOffset(attribute),
                stream = (byte)mesh.GetVertexAttributeStream(attribute)
            };
        }

        invalidMeshes[meshIdx] = rejected ? (byte)1 : (byte)0;
    }
}
/// <summary>
/// Gets packed channel information for each of the possible channels of the vertex struct for each mesh
/// in the input array, doing so using parallel jobs. Also flags meshes that cannot be merged: meshes
/// without a position, with more than 2 vertex streams, or with attribute formats that can't losslessly
/// be converted to floating point (like integer formats).
/// </summary>
/// <param name="meshDataArray">Array of mesh data to get the channel information of.</param>
/// <param name="meshChannels">Output array of packed channel information (Allocator.Persistent; the caller must dispose it). The index of each element divided by NUM_VTX_CHANNELS is the index of the mesh it corresponds to.</param>
/// <param name="invalidMeshes">Output array of flags (Allocator.TempJob; the caller must dispose it), one per mesh. If the value is 1, the mesh has incompatible channel formats and can't be combined.</param>
public static void ParallelGetMeshLayout(Mesh.MeshDataArray meshDataArray, out NativeArray<PackedChannel> meshChannels, out NativeArray<byte> invalidMeshes)
{
    meshChannels = new NativeArray<PackedChannel>(NUM_VTX_CHANNELS * meshDataArray.Length, Allocator.Persistent);
    invalidMeshes = new NativeArray<byte>(meshDataArray.Length, Allocator.TempJob);

    // Currently there are 12 VertexAttributeFormat enum values from 0 to 11. If this ever changes, this could break!
    NativeArray<byte> vtxFmtLUT = new NativeArray<byte>(16, Allocator.TempJob);
    for (int i = 0; i < 16; i++) vtxFmtLUT[i] = (byte)VtxFormats.Invalid; // every format not listed below is invalid
    vtxFmtLUT[(int)VertexAttributeFormat.Float32] = (byte)VtxFormats.Float32;
    vtxFmtLUT[(int)VertexAttributeFormat.Float16] = (byte)VtxFormats.Float16;
    vtxFmtLUT[(int)VertexAttributeFormat.SNorm8] = (byte)VtxFormats.SNorm8;
    vtxFmtLUT[(int)VertexAttributeFormat.UNorm8] = (byte)VtxFormats.UNorm8;

    GetMeshLayoutJob getLayout = new GetMeshLayoutJob { _meshChannels = meshChannels, _invalidMeshes = invalidMeshes, _vtxFmtLUT = vtxFmtLUT, _meshData = meshDataArray };
    JobHandle layoutHandle = getLayout.Schedule(meshDataArray.Length, 16);

    // The out arrays are handed straight back to the caller, so the job has to be finished before
    // returning. The lookup table is only needed while the job runs and would leak otherwise.
    layoutHandle.Complete();
    vtxFmtLUT.Dispose();
}
/// <summary>
/// Gets the vertex struct layout of an array of meshes, and populates an array of flags that indicate
/// if a mesh has no position, an incompatible vertex attribute format, or more than 2 vertex streams.
/// </summary>
// NOTE(review): Unity.Burst is imported by this file; confirm whether this job is meant to carry [BurstCompile].
struct GetMeshLayoutJob : IJobParallelFor
{
    // Iteration i writes NUM_VTX_CHANNELS consecutive elements starting at NUM_VTX_CHANNELS * i, which
    // is outside the [i, i] window the parallel-for safety system allows by default. The ranges of
    // different iterations never overlap, so lifting the restriction is safe.
    [Unity.Collections.LowLevel.Unsafe.NativeDisableParallelForRestriction]
    public NativeArray<PackedChannel> _meshChannels;
    // Written only at index i.
    public NativeArray<byte> _invalidMeshes;
    // Unity format -> packed format lookup; never written by the job.
    [ReadOnly] public NativeArray<byte> _vtxFmtLUT;
    [ReadOnly] public Mesh.MeshDataArray _meshData;

    public void Execute(int i)
    {
        int baseIdx = NUM_VTX_CHANNELS * i;
        MeshData data = _meshData[i];
        bool meshIsInvalid = !data.HasVertexAttribute(VertexAttribute.Position) || data.vertexBufferCount > 2; // Only support 2 streams for now
        for (int channel = 0; channel < NUM_VTX_CHANNELS; channel++)
        {
            VertexAttribute attribute = (VertexAttribute)channel;
            bool hasAttribute = data.HasVertexAttribute(attribute);
            if (hasAttribute)
            {
                byte channelFormat = _vtxFmtLUT[(int)data.GetVertexAttributeFormat(attribute)];
                meshIsInvalid = meshIsInvalid || (channelFormat == (int)VtxFormats.Invalid);
                _meshChannels[baseIdx + channel] = new PackedChannel
                {
                    dimension = (byte)data.GetVertexAttributeDimension(attribute),
                    format = channelFormat,
                    offset = (byte)data.GetVertexAttributeOffset(attribute),
                    stream = (byte)data.GetVertexAttributeStream(attribute)
                };
            }
        }
        _invalidMeshes[i] = meshIsInvalid ? (byte)1 : (byte)0;
    }
}
/// <summary>
/// Cleans the renderer array of renderers whose mesh can't be combined with other meshes, moving all
/// the valid renderers to the front of the array (relative order preserved). Does not resize the
/// array; the returned count should be used in place of <c>renderers.Length</c>.
/// </summary>
/// <param name="invalidMeshes">One flag per unique mesh; non-zero marks a mesh that can't be combined.</param>
/// <param name="renderers">Renderer array, compacted in place.</param>
/// <param name="renderer2Mesh">Renderer-to-unique-mesh mapping, compacted in lock-step with <paramref name="renderers"/>.</param>
/// <returns>The number of valid renderers in the array.</returns>
public int CleanInvalidRenderers(NativeArray<byte> invalidMeshes, RendererData[] renderers, int[] renderer2Mesh)
{
    int numRenderers = renderers.Length;
    int p = 0; // write cursor: next free slot at the front of the arrays
    for (int i = 0; i < numRenderers; i++)
    {
        if (invalidMeshes[renderer2Mesh[i]] == 0)
        {
            renderers[p] = renderers[i];
            renderer2Mesh[p] = renderer2Mesh[i];
            // Advance the cursor. Without this every valid renderer overwrites slot 0 and the
            // method always reports zero valid renderers.
            p++;
        }
    }
    return p;
}
/// <summary>
/// Gets the sign of the scale of each renderer. Used to determine if the winding order of a renderer
/// needs to be flipped in the combined mesh, and to set the sign of the tangent's 4th component for
/// each vertex.
/// </summary>
/// <param name="rd">Renderers to evaluate.</param>
/// <returns>
/// One byte per renderer (Allocator.TempJob, owned by the caller): 1 when the determinant of the upper
/// 3x3 of the local-to-world matrix is positive, 0 otherwise (mirrored or degenerate transform).
/// </returns>
public NativeArray<byte> GetRendererScaleSign(RendererData[] rd)
{
    NativeArray<byte> scaleSign = new NativeArray<byte>(rd.Length, Allocator.TempJob);
    NativeArray<float3x3> object2World = new NativeArray<float3x3>(rd.Length, Allocator.TempJob);
    for (int i = 0; i < rd.Length; i++)
    {
        // Translation does not affect handedness, so only the rotation/scale part is kept.
        object2World[i] = (float3x3)((float4x4)rd[i].rendererTransform.localToWorldMatrix);
    }

    GetRendererNegativeScale scaleJob = new GetRendererNegativeScale() { isNegativeScale = scaleSign, object2World = object2World };
    JobHandle scaleJh = scaleJob.Schedule(rd.Length, 16);

    // The result is returned immediately, so the job must be finished first. The matrix staging
    // array is only needed while the job runs and would leak if left undisposed.
    scaleJh.Complete();
    object2World.Dispose();
    return scaleSign;
}
/// <summary>
/// Parallel job that classifies each 3x3 object-to-world matrix by the sign of its determinant.
/// </summary>
struct GetRendererNegativeScale : IJobParallelFor
{
    // Output flag per matrix. Despite the field name, 1 is stored when the determinant is POSITIVE
    // and 0 when it is zero or negative.
    public NativeArray<byte> isNegativeScale;
    // Input: rotation/scale part of each renderer's local-to-world matrix.
    public NativeArray<float3x3> object2World;

    public void Execute(int i)
    {
        bool positiveDeterminant = math.determinant(object2World[i]) > 0;
        byte flag = 0;
        if (positiveDeterminant)
        {
            flag = 1;
        }
        isNegativeScale[i] = flag;
    }
}
// Groups the per-unique-mesh outputs produced by the unique-mesh and layout passes.
// NOTE(review): this struct is not referenced in the visible part of the file; confirm it is still used.
struct UniqueMeshData
// Maps each renderer index to an index in the unique mesh list.
public int[] renderer2Mesh;
// Packed channel descriptions, NUM_VTX_CHANNELS consecutive entries per unique mesh.
public NativeArray meshChannels;
// One flag per unique mesh; 1 marks a mesh that cannot be combined.
public NativeArray invalidMeshes;
/// <summary>
/// Computes the vertex layout of the combined mesh built from renderers [startIdx, endIdx): for every
/// channel the widest format and largest dimension found among the contributing unique meshes, capped
/// by <c>settings.serializedVtxFormats</c>, followed by the byte offset of each channel inside its
/// vertex stream.
/// </summary>
/// <param name="renderers">Sorted renderer array.</param>
/// <param name="meshChannels">Packed channel descriptions, NUM_VTX_CHANNELS entries per unique mesh.</param>
/// <param name="renderer2Mesh">Maps each renderer index to its unique mesh index.</param>
/// <param name="startIdx">First renderer of the range (inclusive).</param>
/// <param name="endIdx">End of the range (exclusive).</param>
/// <returns>NUM_VTX_CHANNELS packed channels (Allocator.TempJob); the caller must dispose the array.</returns>
public NativeArray<PackedChannel> GetCombinedMeshLayout(
    RendererData[] renderers,
    ref NativeArray<PackedChannel> meshChannels,
    int[] renderer2Mesh,
    int startIdx, int endIdx)
{
    // Create the list of unique meshes used by this range of renderers.
    int combinedCount = endIdx - startIdx;
    List<int> meshIndex = new List<int>(combinedCount);
    HashSet<int> uniqueMeshSet = new HashSet<int>(combinedCount);
    bool isLightmapped = false;
    bool isDynamicLightmapped = false;
    for (int i = startIdx; i < endIdx; i++)
    {
        int index = renderer2Mesh[i];
        // HashSet.Add returns true only the first time an index is seen. Each unique mesh must be
        // recorded here, otherwise the layout loop below has nothing to merge.
        if (uniqueMeshSet.Add(index))
        {
            meshIndex.Add(index);
        }

        // Determine if the combined mesh will be lightmapped.
        // Sometimes, people will use UV0 as the lightmap UV. This doesn't work with static batching as the lightmap scale/offset
        // gets baked into the lightmap UV, and UV0 is normally compressed to 16 bit which isn't enough for lightmaps.
        // Therefore, forcibly add UV1 to lightmapped combined meshes even if none of the input meshes have it.
        // This is checked per renderer because two renderers sharing a mesh may differ in lightmap use.
        MeshRenderer mr = renderers[i].meshRenderer;
        isLightmapped = isLightmapped || (GameObjectUtility.AreStaticEditorFlagsSet(mr.gameObject, StaticEditorFlags.ContributeGI) && mr.receiveGI == ReceiveGI.Lightmaps);
        // Index 0 is the first lightmap and is valid; only negative values and the reserved values
        // from 0xFFFE upwards mean "no lightmap".
        isLightmapped = isLightmapped || (mr.lightmapIndex < 0xFFFE && mr.lightmapIndex >= 0);
        isDynamicLightmapped = isDynamicLightmapped || (mr.realtimeLightmapIndex < 0xFFFE && mr.realtimeLightmapIndex >= 0);
    }

    int meshIdxCount = meshIndex.Count;
    NativeArray<PackedChannel> combinedFormat = new NativeArray<PackedChannel>(NUM_VTX_CHANNELS, Allocator.TempJob);

    // Dimension granularity per packed format (indexed by format value): 8-bit formats are rounded up
    // to 4 components and 16-bit formats to 2, so every attribute occupies a multiple of 4 bytes.
    Span<int> minTypeLUT = stackalloc int[] { 1, 4, 4, 2, 1 };
    // Per-channel switch that moves a channel to the second stream. Nothing sets it yet, so every
    // channel currently lands in stream 0.
    Span<bool> useAltStream = stackalloc bool[NUM_VTX_CHANNELS];
    useAltStream.Clear(); // stackalloc contents are not guaranteed to be zeroed
    int altStreamFlag = 1 << 24;

    for (int mesh = 0; mesh < meshIdxCount; mesh++)
    {
        int meshPtr = meshIndex[mesh] * NUM_VTX_CHANNELS;
        for (int channel = 0; channel < NUM_VTX_CHANNELS; channel++)
        {
            PackedChannel a = combinedFormat[channel];
            PackedChannel b = meshChannels[meshPtr + channel];
            int largestFmt = math.max((int)a.format, (int)b.format);
            largestFmt = math.min(largestFmt, (int)crs.serializedVtxFormats[channel]);
            int maxDim = math.max((int)a.dimension, (int)b.dimension);
            int roundDim = minTypeLUT[largestFmt];
            maxDim = ((maxDim + roundDim - 1) / roundDim) * roundDim;
            int stream = useAltStream[channel] ? altStreamFlag : 0;
            // packedData layout as used here: dimension in bits 0-7, format in bits 8-15, stream from bit 24.
            combinedFormat[channel] = new PackedChannel { packedData = (uint)(maxDim | (largestFmt << 8) | stream) };
        }
    }

    // Add a lightmap UV1 if one or more of the input renderers are either static or dynamic lightmapped but none of the inputs have UV1's
    if ((isLightmapped || isDynamicLightmapped) && combinedFormat[5].dimension == 0)
    {
        combinedFormat[5] = new PackedChannel() { dimension = 2, format = crs.serializedVtxFormats[5], stream = combinedFormat[5].stream };
    }
    // Add a dynamic lightmap UV2 if there are dynamic lightmapped renderers in the input, but none of the inputs have UV2's
    if (isDynamicLightmapped && combinedFormat[6].dimension == 0)
    {
        combinedFormat[6] = new PackedChannel() { dimension = 2, format = crs.serializedVtxFormats[6], stream = combinedFormat[6].stream };
    }

    // Assign byte offsets (bits 16-23) by accumulating attribute sizes separately per stream.
    uint cumulativeOffset = 0;
    uint cumulativeOffset2 = 0;
    var vtxFmtToBytes = PackedChannel.VtxFmtToBytes;
    for (int channel = 0; channel < NUM_VTX_CHANNELS; channel++)
    {
        if (useAltStream[channel])
        {
            combinedFormat[channel] = new PackedChannel { packedData = combinedFormat[channel].packedData | (cumulativeOffset2 << 16) };
            cumulativeOffset2 = (cumulativeOffset2 + (uint)vtxFmtToBytes[combinedFormat[channel].format] * combinedFormat[channel].dimension);
        }
        else
        {
            combinedFormat[channel] = new PackedChannel { packedData = combinedFormat[channel].packedData | (cumulativeOffset << 16) };
            cumulativeOffset = (cumulativeOffset + (uint)vtxFmtToBytes[combinedFormat[channel].format] * combinedFormat[channel].dimension);
        }
    }
    return combinedFormat;
}
// Per-renderer bookkeeping for one combined mesh. The three arrays are parallel, one entry per
// renderer that contributes to the combined mesh.
struct CombinedMeshSmInfo
// Index of the renderer in the sorted renderer array.
public int[] rendererIdx;
// Index of the renderer's first submesh within the combined mesh's submesh list.
public int[] submeshStart;
// Number of submeshes the renderer contributes.
public int[] submeshCount;
/// <summary>
/// Converts a packed channel table into the Unity vertex attribute descriptors for the channels that
/// are actually present (dimension != 0), in channel order.
/// </summary>
/// <param name="packedChannels">NUM_VTX_CHANNELS packed channels, indexed by <see cref="VertexAttribute"/> value.</param>
/// <returns>One descriptor per present channel.</returns>
public static VertexAttributeDescriptor[] VtxAttrDescFromPacked(NativeArray<PackedChannel> packedChannels)
{
    var descriptors = new List<VertexAttributeDescriptor>(NUM_VTX_CHANNELS);
    var unityFormats = PackedChannel.ToUnityFormatLUT; // packed format value -> VertexAttributeFormat
    for (int channel = 0; channel < NUM_VTX_CHANNELS; channel++)
    {
        PackedChannel packed = packedChannels[channel];
        if (packed.dimension == 0)
        {
            continue; // channel absent from the layout
        }
        descriptors.Add(new VertexAttributeDescriptor((VertexAttribute)channel, unityFormats[packed.format], packed.dimension, packed.stream));
    }
    return descriptors.ToArray();
}
/// <summary>
/// Generates a Unity Mesh for a list of renderers. Generates the combined index buffer, sets the submesh descriptors,
/// calculates the worldspace bounds of each submesh, and flips the winding order for negatively scaled meshes.
/// The vertex buffer is only sized here (SetVertexBufferParams); no vertex data is written by this method.
/// </summary>
/// <param name="rd">Sorted array of renderers</param>
/// <param name="uniqueMeshList">List of unique meshes used by the renderers</param>
/// <param name="rendererRange">Range of indices in rd that will be combined in the output mesh</param>
/// <param name="renderer2Mesh">A mapping from each index of rd to a mesh in uniqueMeshList</param>
/// <param name="packedChannels">An array of dimension NUM_VTX_CHANNELS (currently 12) that contains a description of the vertex attribute format and dimension of each possible channel in the output mesh</param>
/// <param name="rendererScaleSign">An array of bytes that indicates if the renderer of the corresponding index has been scaled negatively. If so, the winding order of the indices in the combined mesh must be reversed. As read below, a value of 0 triggers the flip.</param>
/// <param name="highPidxBuffer">NOTE(review): never read; the local highPIdxBuffer (different casing) derived from T is used instead.</param>
/// <returns>The combined mesh with index buffer, submeshes and bounds set.</returns>
// NOTE(review): this text has lost its generic arguments and single-token lines. As written,
// rendererCount, smPointer, meshPointer and smPointer2 are never incremented, none of the scheduled
// jobs is completed before its output is read, and no TempJob NativeArray is disposed. Verify all of
// these against the original file before relying on this method.
public Mesh GetCombinedMeshObject(RendererData[] rd, MeshDataArray uniqueMeshList, int2 rendererRange, int[] renderer2Mesh, ref NativeArray packedChannels, ref NativeArray rendererScaleSign, bool highPidxBuffer)
where T : unmanaged
Assert.IsTrue(typeof(T) == typeof(int) || typeof(T) == typeof(ushort), "GetCombinedMeshObject can only use ushort and int types!");
// Are we using a high-precision index buffer?
bool highPIdxBuffer = typeof(T) == typeof(int);
// Get the total number of vertices, submeshes, and valid renderers that make up this combined mesh
int vertexCount = 0;
int submeshCount = 0;
int rendererCount = 0;
int[] validRendererIdx = new int[rendererRange.y - rendererRange.x];
// Iterate once over the range of renderers, counting the total verticies and submeshes.
for (int i = rendererRange.x; i < rendererRange.y; i++)
// tempMesh is assigned but never used.
Mesh tempMesh = rd[i].mesh;
vertexCount += rd[i].mesh.vertexCount;
submeshCount += rd[i].mesh.subMeshCount;
validRendererIdx[rendererCount] = i;
// Build the vertex attribute descriptors for every channel present in the combined layout
// (same logic as VtxAttrDescFromPacked).
List vertexAttributes = new List(NUM_VTX_CHANNELS);
ReadOnlySpan formatLUT = PackedChannel.ToUnityFormatLUT;
for (int i = 0; i < NUM_VTX_CHANNELS; i++)
if (packedChannels[i].dimension != 0)
vertexAttributes.Add(new VertexAttributeDescriptor((VertexAttribute)i, formatLUT[packedChannels[i].format], packedChannels[i].dimension, packedChannels[i].stream));
Mesh combinedMesh = new Mesh();
combinedMesh.SetVertexBufferParams(vertexCount, vertexAttributes.ToArray());
SubMeshDescriptor[] subMeshDescriptors = new SubMeshDescriptor[submeshCount];
// Parallel arrays: for each contributing renderer, its index in rd and its submesh span in the combined mesh.
CombinedMeshSmInfo combinedSmInfo = new CombinedMeshSmInfo()
rendererIdx = new int[rendererCount],
submeshStart = new int[rendererCount],
submeshCount = new int[rendererCount],
// Running cursors: renderer slot, submesh slot, index count so far, vertex count so far.
int meshPointer = 0;
int smPointer = 0;
int idxCount = 0;
int vtxPointer = 0;
// Iterate again over the renderers this time getting the submesh descriptors of all the meshes,
// calculating the sum of their index counts, and calculating the union of all their bounds
bool initializeBounds = true;
Bounds totalBounds = new Bounds();
NativeArray submeshBounds = new NativeArray(submeshCount, Allocator.TempJob);
NativeArray rendererObject2World = new NativeArray(rendererCount, Allocator.TempJob);
NativeArray submesh2Renderer = new NativeArray(submeshCount, Allocator.TempJob);
// Keep track of the most indices a submesh has, so when it comes time to copy the indices we can make a nativearray of exactly the right size as a staging buffer
int maxSmIdxCount = 0;
for (int i = 0; i < rendererCount; i++)
int rIdx = validRendererIdx[i];
int meshIdx = renderer2Mesh[rIdx];
int smCount = rd[rIdx].mesh.subMeshCount;
int firstSubMesh = smPointer;
Bounds bounds = rd[rIdx].meshRenderer.bounds;
rendererObject2World[i] = rd[rIdx].rendererTransform.localToWorldMatrix;
// NOTE(review): only the first-renderer branch is visible here; the step that grows totalBounds
// for later renderers is presumably among the lost lines.
if (initializeBounds)
totalBounds = bounds;
initializeBounds = false;
for (int sm = 0; sm < smCount; sm++)
SubMeshDescriptor smd = rd[rIdx].mesh.GetSubMesh(sm);
// Re-base the submesh into the combined buffers: vertices shift by vtxPointer, indices start at idxCount.
SubMeshDescriptor smd2 = new SubMeshDescriptor()
baseVertex = 0,
firstVertex = smd.firstVertex + vtxPointer,
bounds = smd.bounds,
indexCount = smd.indexCount,
indexStart = idxCount,
vertexCount = smd.vertexCount,
topology = smd.topology,
// The renderer slot is stored as ushort, so it wraps if a combined mesh holds more than 65535 renderers.
submesh2Renderer[smPointer] = (ushort)i;
submeshBounds[smPointer] = smd.bounds;
//Debug.Log("Submesh " + smPointer + " index start: " + smd2.indexStart + " bounds: " + smd2.bounds);
maxSmIdxCount = math.max(maxSmIdxCount, smd.indexCount);
idxCount += smd.indexCount;
subMeshDescriptors[smPointer] = smd2;
combinedSmInfo.rendererIdx[meshPointer] = rIdx;
combinedSmInfo.submeshStart[meshPointer] = firstSubMesh;
combinedSmInfo.submeshCount[meshPointer] = smPointer - firstSubMesh;
vtxPointer += rd[rIdx].mesh.vertexCount;
// Set the total size of the index buffer of the combined mesh, and set the total bounds.
combinedMesh.SetIndexBufferParams(idxCount, highPIdxBuffer ? IndexFormat.UInt32 : IndexFormat.UInt16);
combinedMesh.bounds = totalBounds;
// Transform the bounds of each submesh from its local object space to world space
TransformSubmeshBounds transformSubmeshBounds = new TransformSubmeshBounds() { bounds = submeshBounds, obj2World = rendererObject2World, obj2WorldIdx = submesh2Renderer };
JobHandle transformBoundsHandle = transformSubmeshBounds.Schedule(submeshCount, 16);
// Recreate each submesh descriptor struct with the new worldspace bounds
// NOTE(review): submeshBounds is read right below; the job handle must be completed first.
for (int i = 0; i < submeshCount; i++)
subMeshDescriptors[i] = new SubMeshDescriptor()
baseVertex = subMeshDescriptors[i].baseVertex,
firstVertex = subMeshDescriptors[i].firstVertex,
bounds = submeshBounds[i],
indexCount = subMeshDescriptors[i].indexCount,
indexStart = subMeshDescriptors[i].indexStart,
vertexCount = subMeshDescriptors[i].vertexCount,
topology = subMeshDescriptors[i].topology,
// Create the combined mesh's index buffer, and populate it with the indices of all the input meshes' submeshes
NativeArray indexBuffer = new NativeArray(idxCount, Allocator.TempJob);
// Per submesh: (index start, index count, vertex offset to add, indices per primitive to flip; 0 = no flip).
NativeArray indexStartCountOffsetFlip = new NativeArray(submeshCount, Allocator.TempJob);
int idxPointer = 0;
int smPointer2 = 0;
// indices is allocated but never used.
List indices = new List();
// Indices per primitive, indexed by MeshTopology value; topologies left at 0 are never flipped.
// NOTE(review): the table has 5 entries; verify that every MeshTopology value (Points in particular)
// is below 5, otherwise the lookup further down indexes out of range.
Span topologyCount = stackalloc int[5] { 0, 0, 0, 0, 0 };
topologyCount[(int)MeshTopology.Triangles] = 3;
topologyCount[(int)MeshTopology.Quads] = 4;
for (int i = 0; i < rendererCount; i++)
int rendererIdx = combinedSmInfo.rendererIdx[i];
int meshIdx = renderer2Mesh[combinedSmInfo.rendererIdx[i]];
MeshData tmesh = uniqueMeshList[meshIdx];
int firstSubMesh = combinedSmInfo.submeshStart[i];
int totalIdxCount = 0;
for (int sm = 0; sm < combinedSmInfo.submeshCount[i]; sm++)
int totalSm = firstSubMesh + sm;
int numIdx = (int)subMeshDescriptors[totalSm].indexCount;
// Copy the submesh's indices straight into its slice of the combined index buffer. The two
// statement pairs below are presumably the 32-bit and 16-bit branches of an if/else.
if (highPIdxBuffer)
NativeArray idxAlias = NativeArraySubArray.GetSubArrayAlias(indexBuffer, idxPointer, numIdx);
tmesh.GetIndices(idxAlias, sm);
NativeArray idxAlias = NativeArraySubArray.GetSubArrayAlias(indexBuffer, idxPointer, numIdx);
tmesh.GetIndices(idxAlias, sm);
//NativeArray.Copy(CSBListExt.GetInternalArray(indices), 0, indexBuffer, idxPointer, numIdx);
// sign is 1 when the renderer's scale-sign byte is 0, so only those renderers keep a non-zero
// primitive size and get their winding flipped by the job.
int sign = rendererScaleSign[rendererIdx] == 0 ? 1 : 0;
int topo = topologyCount[(int)subMeshDescriptors[smPointer2].topology];
topo *= sign;
// NOTE(review): the vertex offset is taken from the renderer's FIRST submesh (firstVertex + vtxPointer).
// It equals the mesh's base vertex only if that submesh's firstVertex is 0 - confirm.
indexStartCountOffsetFlip[smPointer2] = new int4(idxPointer, numIdx, subMeshDescriptors[firstSubMesh].firstVertex, topo);
idxPointer += numIdx;
totalIdxCount += numIdx;
// Offset the indices for each submesh in the combined mesh's index buffer by the total index count of the preceeding submeshes
// The two blocks below are presumably the 32-bit and 16-bit branches of an if/else; each schedules
// one job per submesh.
if (highPIdxBuffer)
NativeArray intBuffer = indexBuffer.Reinterpret(sizeof(int));
GenericInt32 intMath = new GenericInt32();
OffsetFlipIndexBuffer offsetIdxJob = new OffsetFlipIndexBuffer {
indices = intBuffer, indexStartCountOffset = indexStartCountOffsetFlip, converter = intMath };
JobHandle jobHandle = offsetIdxJob.Schedule(submeshCount, 1);
NativeArray shortBuffer = indexBuffer.Reinterpret(sizeof(ushort));
GenericInt16 shortMath = new GenericInt16();
OffsetFlipIndexBuffer offsetIdxJob = new OffsetFlipIndexBuffer {
indices = shortBuffer, indexStartCountOffset = indexStartCountOffsetFlip, converter = shortMath };
JobHandle jobHandle = offsetIdxJob.Schedule(submeshCount, 1);
// Upload the finished index buffer and submesh table; bounds were set explicitly above, so
// Unity is told not to recalculate them.
combinedMesh.SetIndexBufferData(indexBuffer, 0, 0, idxCount, MeshUpdateFlags.DontRecalculateBounds);
combinedMesh.SetSubMeshes(subMeshDescriptors, MeshUpdateFlags.DontRecalculateBounds);
return combinedMesh;
/// <summary>
/// Job that adds a per-submesh vertex offset to every index of each submesh in the combined index
/// buffer. Also flips the order of the indices of each primitive if the submesh was scaled negatively.
/// One job iteration processes one submesh.
/// </summary>
/// <typeparam name="T">Type of the index buffer, assumed to be int or ushort.</typeparam>
/// <typeparam name="TConverter">Struct implementing the IGenericInt interface for T, to provide the method adding an int and T.</typeparam>
public struct OffsetFlipIndexBuffer<T, TConverter> : IJobParallelFor
    where T : unmanaged
    where TConverter : IGenericInt<T>
{
    // Iteration i rewrites the whole index range of submesh i, which lies outside the [i, i] window
    // the parallel-for safety system allows by default. Submesh ranges never overlap, so lifting the
    // restriction is safe.
    [Unity.Collections.LowLevel.Unsafe.NativeDisableParallelForRestriction]
    public NativeArray<T> indices;
    // Per submesh: x = first index, y = index count, z = offset added to every index,
    // w = indices per primitive to reverse (3 = triangles, 4 = quads, 0 = offset only).
    [ReadOnly] public NativeArray<int4> indexStartCountOffset;
    public TConverter converter;

    /// <summary>Schedules this job; thin wrapper over the IJobParallelFor Schedule extension.</summary>
    public JobHandle ISchedule(int arrayLength, int innerLoopBatchCount)
    {
        return this.Schedule(arrayLength, innerLoopBatchCount);
    }

    public void Execute(int i)
    {
        int4 idxDat = indexStartCountOffset[i];
        int idxStart = idxDat.x;
        int idxEnd = idxStart + idxDat.y;
        int offset = idxDat.z;
        int primitiveCount = idxDat.w;

        if (primitiveCount == 0)
        {
            // No flip requested: just re-base the indices.
            for (int idx = idxStart; idx < idxEnd; idx++)
            {
                indices[idx] = converter.Add(indices[idx], offset);
            }
        }
        else if (primitiveCount == 3)
        {
            // Triangles: re-base, then swap the first and last index to reverse the winding.
            for (int idx = idxStart; idx < idxEnd; idx += 3)
            {
                indices[idx] = converter.Add(indices[idx], offset);
                indices[idx + 1] = converter.Add(indices[idx + 1], offset);
                indices[idx + 2] = converter.Add(indices[idx + 2], offset);
                T temp = indices[idx];
                indices[idx] = indices[idx + 2];
                indices[idx + 2] = temp;
            }
        }
        else if (primitiveCount == 4)
        {
            // Quads: re-base, then reverse all four indices (0<->3, 1<->2).
            for (int idx = idxStart; idx < idxEnd; idx += 4)
            {
                indices[idx] = converter.Add(indices[idx], offset);
                indices[idx + 1] = converter.Add(indices[idx + 1], offset);
                indices[idx + 2] = converter.Add(indices[idx + 2], offset);
                indices[idx + 3] = converter.Add(indices[idx + 3], offset);
                T temp = indices[idx];
                indices[idx] = indices[idx + 3];
                indices[idx + 3] = temp;
                temp = indices[idx + 1];
                indices[idx + 1] = indices[idx + 2];
                indices[idx + 2] = temp;
            }
        }
    }
}
/// <summary>
/// Job that converts each submesh's object-space bounding box into a world-space axis-aligned box,
/// using the object-to-world matrix of the renderer the submesh belongs to.
/// </summary>
public struct TransformSubmeshBounds : IJobParallelFor
{
    // In: object-space bounds per submesh. Out: world-space bounds. Accessed only at index i.
    public NativeArray<Bounds> bounds;
    // Object-to-world matrix per renderer. Read at an indirect index, which the parallel-for safety
    // system only permits on read-only containers.
    // NOTE(review): element type assumed to be float4x4 (the code reads it into a float4x4) - confirm against the caller.
    [ReadOnly] public NativeArray<float4x4> obj2World;
    // For each submesh, the index into obj2World of its renderer.
    [ReadOnly] public NativeArray<ushort> obj2WorldIdx;

    public void Execute(int i)
    {
        float4x4 T = obj2World[obj2WorldIdx[i]];

        // The center is a point: transform it with translation (w = 1).
        float4 center = new float4((float3)bounds[i].center, 1);
        center = math.mul(T, center);

        // The extents are half-sizes: multiplying by the component-wise absolute value of the
        // rotation/scale part gives the half-size of the tightest axis-aligned box around the
        // transformed box.
        float3 extents = bounds[i].extents;
        float3x3 T2 = new float3x3(math.abs(T.c0.xyz), math.abs(T.c1.xyz), math.abs(T.c2.xyz));
        extents = math.mul(T2, math.abs(extents));

        // The Bounds constructor takes the full size, hence 2 * extents.
        bounds[i] = new Bounds(center.xyz, 2 * extents);
    }
}
// Shader property IDs for the vertex-transfer compute shader: the two constant buffers and the
// input/output vertex buffers.
static int propMeshInBuffer = Shader.PropertyToID("MeshInBuffer");
static int propMeshOutBuffer = Shader.PropertyToID("MeshOutBuffer");
static int propVertIn = Shader.PropertyToID("vertIn");
static int propVertOut = Shader.PropertyToID("vertOut");
// Size in bytes of the per-input-mesh constant buffer: the 176-byte struct below followed by the
// packed channels of the input mesh (uploaded separately at element offset 44, i.e. byte 176).
const int meshInBufferSize = 224;
// CPU mirror of the first 176 bytes of the "MeshInBuffer" constant buffer. The trailing comments give
// the running byte size after each field; field order must match the shader-side declaration.
struct MeshInBuffer
public Matrix4x4 ObjectToWorld; // 4x4x4 = 64 bytes
public Matrix4x4 WorldToObject; // 128
public float4 lightmapScaleOffset; // 144
public float4 dynLightmapScaleOffset; // 160
// x = byte offset of this mesh's vertices in the combined buffer, y = input vertex stride, z = tangent sign (+1 / -1), w = unused (0)
public int4 offset_strideIn_TanSign; // 176
// + float 4x3 (48) = 224 bytes
// Size in bytes of the output constant buffer: one int4 (combined stride in x) followed by the
// combined packed channels uploaded at element offset 4.
const int meshOutBufferSize = 64;
// Carries the state of one in-flight GPU -> CPU readback of a combined vertex buffer.
internal struct AsyncMeshReadbackData
// The asynchronous readback request.
public AsyncGPUReadbackRequest request;
// GPU buffer being read back.
public GraphicsBuffer gpuBuffer;
// CPU-side destination of the readback; its contents are uploaded to the mesh by FinishMeshReadback.
public NativeArray cpuBuffer;
// Uploads the entire cpuBuffer into vertex stream 0 of combinedMesh, skipping index validation,
// bounds recalculation, mesh-user notification and bone-bounds reset.
// NOTE(review): nothing here waits for `request` or disposes gpuBuffer / cpuBuffer; presumably the caller does - confirm.
public void FinishMeshReadback(Mesh combinedMesh)
combinedMesh.SetVertexBufferData(cpuBuffer, 0, 0, cpuBuffer.Length, 0, MeshUpdateFlags.DontValidateIndices | MeshUpdateFlags.DontRecalculateBounds | MeshUpdateFlags.DontNotifyMeshUsers | MeshUpdateFlags.DontResetBoneBounds);
/// <summary>
/// GPU path for filling a combined mesh's vertex stream 0. For every renderer in
/// [rendererRange.x, rendererRange.y) this records, into a single CommandBuffer, the upload of
/// that renderer's per-mesh settings and a dispatch of kernel 0 of transferVtxBufferCompute that
/// reads the source mesh's vertex buffer and writes into one raw GraphicsBuffer sized for the
/// combined mesh. It then requests an asynchronous readback of that buffer into a native array.
/// </summary>
/// <param name="meshPackedChannels">Channel descriptors for all source meshes, NUM_VTX_CHANNELS consecutive entries per mesh (indexed as NUM_VTX_CHANNELS * meshIdx + channel).</param>
/// <param name="combinedPackedChannels">Channel descriptors of the combined mesh; entries 5 and 6 are tested for the lightmap and dynamic lightmap UVs.</param>
/// <param name="rendererScaleSign">Per-renderer value; a value greater than 0 yields a tangent sign of 1, anything else -1.</param>
/// <param name="combinedMesh">Mesh whose vertex count and stream-0 stride define the size and layout of the output buffer.</param>
/// <param name="rd">Renderer data; the transform matrices and the renderer's lightmap scale/offsets are read from it.</param>
/// <param name="rendererRange">Half-open range (x inclusive, y exclusive) of renderer indices to copy.</param>
/// <param name="renderer2Mesh">Maps a renderer index to an index into meshList / meshPackedChannels.</param>
/// <param name="meshList">Source meshes.</param>
/// <returns>The readback request together with the GPU buffer being read and the Allocator.Persistent CPU array receiving it. Nothing here disposes either buffer, so that is presumably left to the caller — confirm.</returns>
internal AsyncMeshReadbackData ComputeCopyMeshes(ref NativeArray meshPackedChannels, ref NativeArray combinedPackedChannels, ref NativeArray rendererScaleSign, Mesh combinedMesh,
RendererData[] rd, int2 rendererRange, int[] renderer2Mesh, List meshList)
// Figure out what lightmaps are potentially present in the combined mesh.
// If either UV1 or UV2 are in the combined mesh, but not in an input mesh,
// then we need to instruct the shader to copy the previous UV channel with a
// dimension > 0 to that channel
bool hasLightmap = combinedPackedChannels[5].dimension > 0;
bool hasDynLightmap = combinedPackedChannels[6].dimension > 0;
ComputeShader meshCopy = transferVtxBufferCompute;
// Constant buffers for the per-source-mesh and the combined-mesh settings. Both are created with
// 4-byte elements, so all start indices used with them below are in units of 4 bytes.
ComputeBuffer meshInSettings = new ComputeBuffer(meshInBufferSize / 4, 4, ComputeBufferType.Constant);
ComputeBuffer meshOutSettings = new ComputeBuffer(meshOutBufferSize / 4, 4, ComputeBufferType.Constant);
Span meshOutBuffer = stackalloc int4[1];
int combinedStride = combinedMesh.GetVertexBufferStride(0);
// Only the x component (output stride in bytes) carries data; the rest is zero.
meshOutBuffer[0] = new int4(combinedStride, 0, 0, 0);
// Raw output buffer with one element of combinedStride bytes per combined-mesh vertex.
GraphicsBuffer combinedMeshBuffer = new GraphicsBuffer(GraphicsBuffer.Target.Raw, combinedMesh.vertexCount, combinedStride);
CommandBuffer cmd = new CommandBuffer();
cmd.SetComputeBufferParam(meshCopy, 0, propVertOut, combinedMeshBuffer);
// Output settings layout: the int4 above, then the combined channel descriptors starting at
// buffer element 4 (byte 16, i.e. directly after one int4).
CSBBufferExt.CmdSetFromSpan(cmd, meshOutSettings, meshOutBuffer, 0, 0, 1);
cmd.SetBufferData(meshOutSettings, combinedPackedChannels, 0, 4, NUM_VTX_CHANNELS);
cmd.SetComputeConstantBufferParam(meshCopy, propMeshOutBuffer, meshOutSettings, 0, meshOutBufferSize);
cmd.SetComputeConstantBufferParam(meshCopy, propMeshInBuffer, meshInSettings, 0, meshInBufferSize);
// Running byte offset into combinedMeshBuffer at which the next mesh's vertices are written.
int combinedMeshCopyIndex = 0;
int numMeshesCopied = 0;
// One slot per renderer in the range, holding the source vertex buffers fetched in the loop.
GraphicsBuffer[] meshBuffers = new GraphicsBuffer[rendererRange.y - rendererRange.x];
Span meshInBuffer = stackalloc MeshInBuffer[1];
// Scratch copy of one mesh's channel descriptors, used when lightmap channels must be redirected.
Span meshPackedChannels2 = stackalloc PackedChannel[NUM_VTX_CHANNELS];
for (int renderer = rendererRange.x; renderer < rendererRange.y; renderer++)
int meshIdx = renderer2Mesh[renderer];
int stride = meshList[meshIdx].GetVertexBufferStride(0);
//Debug.Log("single Mesh Stride = " + stride);
int tanSign = rendererScaleSign[renderer] > 0 ? 1 : -1;
meshInBuffer[0] = new MeshInBuffer
ObjectToWorld = rd[renderer].rendererTransform.localToWorldMatrix,
WorldToObject = rd[renderer].rendererTransform.worldToLocalMatrix,
lightmapScaleOffset = new float4(rd[renderer].meshRenderer.lightmapScaleOffset),
dynLightmapScaleOffset = new float4(rd[renderer].meshRenderer.realtimeLightmapScaleOffset),
// x: byte offset of this mesh in the combined buffer, y: source stride, z: tangent sign.
offset_strideIn_TanSign = new int4(combinedMeshCopyIndex, stride, tanSign, 0)
// Advance the output offset past this mesh's vertices (combined stride * vertex count bytes).
combinedMeshCopyIndex += combinedStride * meshList[meshIdx].vertexCount;
CSBBufferExt.CmdSetFromSpan(cmd, meshInSettings, meshInBuffer, 0, 0, 1);
// Handle lightmapped meshes where uv0 is being used as the lightmap UV. Set UV1's packed data to be UV0's so it just copies UV0 to UV1
// Also handle the dynamic lightmap. If UV2 is missing and there's no UV2 in the output, it's using UV1
bool missingLM = hasLightmap && meshPackedChannels[NUM_VTX_CHANNELS * meshIdx + 5].dimension == 0;
bool missingDynLM = hasDynLightmap && meshPackedChannels[NUM_VTX_CHANNELS * meshIdx + 6].dimension == 0;
if (missingLM || missingDynLM)
// NOTE(review): the literal 12 presumably equals NUM_VTX_CHANNELS (the scratch span has that length) — confirm.
CSBNativeArraySpanExt.Copy(meshPackedChannels, NUM_VTX_CHANNELS * meshIdx, meshPackedChannels2, 0, 12);
if (missingLM)
meshPackedChannels2[5] = meshPackedChannels2[4];
if (missingDynLM)
// Falls back to UV0's descriptor when the (possibly just redirected) UV1 slot is still empty.
meshPackedChannels2[6] = meshPackedChannels2[5].dimension == 0 ? meshPackedChannels2[4] : meshPackedChannels2[5];
// Channel descriptors go to element 44 of the input settings buffer. 44 presumably is the size of
// MeshInBuffer in 4-byte units (two 4x4 matrices, two float4 and one int4 would be 176 bytes) — confirm.
// NOTE(review): as shown here the patched upload and the unpatched upload follow each other; unless
// an `else` separates them in the full file the second write overwrites the first — confirm.
CSBBufferExt.CmdSetFromSpan(cmd, meshInSettings, meshPackedChannels2, 0, 44, NUM_VTX_CHANNELS);
cmd.SetBufferData(meshInSettings, meshPackedChannels, NUM_VTX_CHANNELS * meshIdx, 44, NUM_VTX_CHANNELS);
Mesh singleMesh = meshList[meshIdx];
// Add the Raw target to the mesh's vertex buffer before fetching stream 0 and binding it as the kernel input.
singleMesh.vertexBufferTarget |= GraphicsBuffer.Target.Raw;
// NOTE(review): no increment of numMeshesCopied is visible in this view; without one every
// iteration would reuse meshBuffers[0] and the loop after this one would never run — confirm.
meshBuffers[numMeshesCopied] = singleMesh.GetVertexBuffer(0);
cmd.SetComputeBufferParam(meshCopy, 0, propVertIn, meshBuffers[numMeshesCopied]);
// One thread group per 32 vertices, rounded up (presumably the kernel's group size is 32 — confirm against the shader).
cmd.DispatchCompute(meshCopy, 0, (meshList[meshIdx].vertexCount + 31) / 32, 1, 1);
// NOTE(review): the body of this loop is not visible here (presumably it releases meshBuffers[i]),
// nor is any execution or disposal of cmd and the two settings buffers — confirm against the full file.
for (int i = 0; i < numMeshesCopied; i++)
// Size of the whole combined stream 0 in bytes; the readback below copies the GPU buffer into it.
int numBytes = combinedMesh.GetVertexBufferStride(0) * combinedMesh.vertexCount;
NativeArray bufferBytes = new NativeArray(numBytes, Allocator.Persistent);
AsyncGPUReadbackRequest request = AsyncGPUReadback.RequestIntoNativeArray(ref bufferBytes, combinedMeshBuffer);
#if UNITY_2022_2_OR_NEWER
// Only set when compiling against Unity 2022.2 or newer.
request.forcePlayerLoopUpdate = true;
AsyncMeshReadbackData readbackInfo = new AsyncMeshReadbackData() { request = request, gpuBuffer = combinedMeshBuffer, cpuBuffer = bufferBytes };
return readbackInfo;
/// <summary>
/// CPU (job) path for filling a combined mesh's vertex streams 0 and 1. For every renderer in
/// [rendererRange.x, rendererRange.y) a TransferVtxBuffer job is set up to read the source mesh's
/// vertex data from meshList and write it into two native arrays sized for the combined mesh;
/// those arrays are then uploaded to combinedMesh with SetVertexBufferData.
/// </summary>
/// <param name="meshPackedChannels">Channel descriptors for all source meshes, NUM_VTX_CHANNELS consecutive entries per mesh.</param>
/// <param name="combinedPackedChannels">Channel descriptors of the combined mesh; entries 5 and 6 are tested for the lightmap and dynamic lightmap UVs.</param>
/// <param name="rendererScaleSign">Per-renderer value; a value greater than 0 yields a tangent sign of 1, anything else -1.</param>
/// <param name="combinedMesh">Destination mesh; its vertex count and the strides of streams 0 and 1 size the output arrays.</param>
/// <param name="rd">Renderer data; the transform matrices and the renderer's lightmap scale/offsets are read from it.</param>
/// <param name="rendererRange">Half-open range (x inclusive, y exclusive) of renderer indices to copy.</param>
/// <param name="renderer2Mesh">Maps a renderer index to an index into meshList / meshPackedChannels.</param>
/// <param name="meshList">Mesh data of the source meshes.</param>
internal void JobCopyMeshes(ref NativeArray meshPackedChannels, ref NativeArray combinedPackedChannels, ref NativeArray rendererScaleSign, Mesh combinedMesh,
RendererData[] rd, int2 rendererRange, int[] renderer2Mesh, MeshDataArray meshList)
// Figure out what lightmaps are potentially present in the combined mesh.
// If either UV1 or UV2 are in the combined mesh, but not in an input mesh,
// then we need to instruct the job to copy the previous UV channel with a
// dimension > 0 to that channel
bool hasLightmap = combinedPackedChannels[5].dimension > 0;
bool hasDynLightmap = combinedPackedChannels[6].dimension > 0;
int combinedStride = combinedMesh.GetVertexBufferStride(0);
int combinedStride2 = combinedMesh.GetVertexBufferStride(1);
// Output storage for the two combined vertex streams, sized as stride * vertex count.
// NOTE(review): these are Allocator.TempJob allocations and no Dispose is visible in this view — confirm.
NativeArray combinedMeshVert = new NativeArray(combinedStride * combinedMesh.vertexCount, Allocator.TempJob);
NativeArray combinedMeshVert2 = new NativeArray(combinedStride2 * combinedMesh.vertexCount, Allocator.TempJob);
// x: output stride of stream 0, y: output stride of stream 1.
int4 strideOut = new int4(combinedStride, combinedStride2, 0, 0);
// Running offsets into the two output arrays at which the next mesh's vertices are written.
int combinedMeshCopyIndex = 0;
int combinedMeshCopyIndex2 = 0;
int numMeshesCopied = 0;
// Lookup table handed to the job; presumably bytes per component indexed by vertex format — confirm.
FixedList32Bytes formatToBytes = new FixedList32Bytes() { 1, 1, 1, 2, 4 };
// Scratch copy of one mesh's channel descriptors, patched below when lightmap channels must be redirected.
NativeArray meshPackedChannels2 = new NativeArray(NUM_VTX_CHANNELS, Allocator.TempJob);
for (int renderer = rendererRange.x; renderer < rendererRange.y; renderer++)
int meshIdx = renderer2Mesh[renderer];
int stride = meshList[meshIdx].GetVertexBufferStride(0);
int stride2 = meshList[meshIdx].GetVertexBufferStride(1);
bool hasSecondBuffer = stride2 > 0;
if (!hasSecondBuffer) stride2 = 1; // max of 1, so we can store the sign of the tangent in this value even if there is no second buffer
//Debug.Log("single Mesh Stride = " + stride);
int tanSign = rendererScaleSign[renderer] > 0 ? 1 : -1;
// Work on a private copy of this mesh's descriptors so the shared array is never modified.
NativeArray.Copy(meshPackedChannels, NUM_VTX_CHANNELS * meshIdx, meshPackedChannels2, 0, NUM_VTX_CHANNELS);
// Every channel offset is asserted to be a multiple of 4 bytes.
for (int i = 0; i < NUM_VTX_CHANNELS; i++)
Debug.Assert(meshPackedChannels2[i].offset % 4 == 0, "offset not aligned on 4 bytes, failure!");
// Handle lightmapped meshes where uv0 is being used as the lightmap UV. Set UV1's packed data to be UV0's so it just copies UV0 to UV1
// Also handle the dynamic lightmap. If UV2 is missing and there's no UV2 in the output, it's using UV1
bool missingLM = hasLightmap && meshPackedChannels2[5].dimension == 0;
bool missingDynLM = hasDynLightmap && meshPackedChannels2[6].dimension == 0;
if (missingLM || missingDynLM)
if (missingLM)
meshPackedChannels2[5] = meshPackedChannels2[4];
if (missingDynLM)
// Falls back to UV0's descriptor when the (possibly just redirected) UV1 slot is still empty.
meshPackedChannels2[6] = meshPackedChannels2[5].dimension == 0 ? meshPackedChannels2[4] : meshPackedChannels2[5];
NativeArray inVert = meshList[meshIdx].GetVertexData(0);
TransferVtxBuffer vtxJob = new TransferVtxBuffer
vertIn = inVert,
// Without a second stream the first stream's data is passed again so the field is always assigned.
vertIn2 = hasSecondBuffer ? meshList[meshIdx].GetVertexData(1) : inVert,
vertOut = combinedMeshVert,
vertOut2 = combinedMeshVert2,
ObjectToWorld = rd[renderer].rendererTransform.localToWorldMatrix,
WorldToObject = rd[renderer].rendererTransform.worldToLocalMatrix,
lightmapScaleOffset = new float4(rd[renderer].meshRenderer.lightmapScaleOffset),
dynLightmapScaleOffset = new float4(rd[renderer].meshRenderer.realtimeLightmapScaleOffset),
// x: output offset stream 0, y: input stride stream 0, z: output offset stream 1,
// w: input stride of stream 1 multiplied by the tangent sign (hence the minimum of 1 above).
offset_strideIn_offset2_strideIn2 = new int4(combinedMeshCopyIndex, stride, combinedMeshCopyIndex2, stride2 * tanSign),
inPackedChannelInfo = meshPackedChannels2,
strideOut = strideOut,
outPackedChannelInfo = combinedPackedChannels,
formatToBytes = formatToBytes,
int vertexCount = meshList[meshIdx].vertexCount;
// Advance both output offsets past this mesh's vertices.
combinedMeshCopyIndex += combinedStride * vertexCount;
combinedMeshCopyIndex2 += combinedStride2 * vertexCount;
// Scheduled with vertexCount as the first argument and 16 as the second (presumably one iteration
// per vertex in batches of 16 — confirm against the TransferVtxBuffer job type).
// NOTE(review): no vtxJobHandle.Complete() is visible in this view. The job reads the shared scratch
// array meshPackedChannels2 and the outputs are uploaded below, so completion is required — confirm.
JobHandle vtxJobHandle = vtxJob.Schedule(vertexCount, 16);
//vtxJob.vertIn.Dispose(); // Can cause an error? Somehow, the array is already disposed of sometimes. Anyways, I don't need to dispose of this as its just a view into the mesh's buffer.
// Upload stream 0 in full; the flags skip index validation, bounds recalculation, mesh-user
// notification and bone-bounds reset.
combinedMesh.SetVertexBufferData(combinedMeshVert, 0, 0, combinedMeshVert.Length, 0, MeshUpdateFlags.DontValidateIndices | MeshUpdateFlags.DontRecalculateBounds | MeshUpdateFlags.DontNotifyMeshUsers | MeshUpdateFlags.DontResetBoneBounds);
// Stream 1 is only uploaded when the combined mesh actually has a second stream.
if (combinedStride2 > 0)
combinedMesh.SetVertexBufferData(combinedMeshVert2, 0, 0, combinedMeshVert2.Length, 1, MeshUpdateFlags.DontValidateIndices | MeshUpdateFlags.DontRecalculateBounds | MeshUpdateFlags.DontNotifyMeshUsers | MeshUpdateFlags.DontResetBoneBounds);
//for (int uvIdx = 4; uvIdx < 12; uvIdx++)
// if (combinedPackedChannels[uvIdx].dimension > 0)
// {
// combinedMesh.RecalculateUVDistributionMetric(uvIdx - 4);
// }
/// <summary>
/// Builds a human-readable listing of the twelve vertex channels described by the consecutive
/// entries of <paramref name="packedChannels"/> that begin at <paramref name="startIdx"/>.
/// Channels whose dimension is 0 are left out of the listing.
/// </summary>
/// <param name="packedChannels">Array of packed channel descriptors; entries startIdx through startIdx + 11 are read.</param>
/// <param name="startIdx">Index of the position channel's entry; the remaining channels follow in order.</param>
/// <returns>One "label: value" line per present channel, or an empty string when none is present.</returns>
public static string VtxStructToString(NativeArray<PackedChannel> packedChannels, int startIdx)
{
    // Labels in slot order. Only the first one carries the leading newline that opens the listing.
    string[] channelLabels =
    {
        "\n Position", " Normal", " Tangent", " Color",
        " UV0", " UV1", " UV2", " UV3",
        " UV4", " UV5", " UV6", " UV7",
    };

    string listing = "";
    for (int slot = 0; slot < channelLabels.Length; slot++)
    {
        PackedChannel channel = packedChannels[startIdx + slot];
        if (channel.dimension == 0)
        {
            continue;
        }

        listing += channelLabels[slot] + ": " + channel.ToString() + "\n";
    }

    return listing;
}