Sunday, December 15, 2013

HLSL instruction count optimization

I found a simple way to print out the number of instructions in a compiled shader, which gives a much more direct number to optimize against than the total framerate.  Instruction count isn't the whole story about how fast a shader will run, but it's one thing to look at.

Includes:
#define OUTPUT_SHADER_INSTRUCTION_COUNT
#ifdef OUTPUT_SHADER_INSTRUCTION_COUNT
#define INITGUID
#include "D3D11Shader.h"
#include "D3Dcompiler.h"
#include "GHDebugMessage.h"
#endif
After loading the shader buffer:
#ifdef OUTPUT_SHADER_INSTRUCTION_COUNT
ID3D11ShaderReflection* pReflector = NULL;
D3DReflect(fileBuf, fileLen, IID_ID3D11ShaderReflection, (void**)&pReflector);
D3D11_SHADER_DESC shaderDesc;
pReflector->GetDesc(&shaderDesc);
GHDebugMessage::outputString("Shader %s instruction count %d", shaderName, shaderDesc.InstructionCount);
#endif
We have a very unoptimized experimental shader.  I ran some instruction count measurements on some simple changes.

Starting instruction count: 272
Ending instruction count: 235
Instructions saved: 37

Adding lerp:

Shader sbterrainpixel.cso instruction count 270
(saves 2 instructions)

old:
float4 color;
color.w = 1.0;
color.xyz = (cliffColor.xyz*blendColor.y) + (groundColor.xyz*blendColor.x);
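new (matches the old code only when blendColor.y == 1.0 - blendColor.x; color.w is now the interpolated alpha rather than 1.0):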
float4 color = lerp(cliffColor, groundColor, blendColor.x);

Swizzle:

No difference in instruction count with or without the swizzle; both of these lines compile to the same count:
color.xyz *= blendColor.z;
color *= blendColor.z;

Vector Ops:

Shader sbterrainpixel.cso instruction count 262
(saves 1 instruction per function call)

old:
float2 offsetUV = float2(offsetShadProj.x / offsetShadProj.w, 1.0f - offsetShadProj.y / offsetShadProj.w);
return tex.Sample(samp, offsetUV).x;

new:
float2 offsetUV = offsetShadProj.xy / offsetShadProj.w;
offsetUV.y = 1.0 - offsetUV.y;
return tex.Sample(samp, offsetUV).x;

More Vector Ops:

Shader sbterrainpixel.cso instruction count 246
(saves 4 instructions per function call)

old:
const float xPixelOffset = 0.0015;
const float yPixelOffset = 0.0015;
float4 offsetShadProj = shadowPos + float4(offSet.x * xPixelOffset,
offSet.y * yPixelOffset, 0.0, 0.0);

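new (note that the offset constant also changed from 0.0015 to 0.002 here, so the output is not identical to the old code):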
const float2 pixelOffset = float2(0.002, 0.002);
float2 multOffset = offSet.xy * pixelOffset;
float4 offsetShadProj = shadowPos + float4(multOffset.xy, 0.0, 0.0);

Again Vector Ops:

Shader sbterrainpixel.cso instruction count 245
(saves 1 instruction)

old:
float4 trailColor = TrailTexture.Sample(TrailTextureSampler, float2(input.trailPos.x / input.trailPos.w, 1.0 - input.trailPos.y / input.trailPos.w));

new:
float2 trailUV = input.trailPos.xy / input.trailPos.w;
trailUV.y = 1.0 - trailUV.y;
float4 trailColor = TrailTexture.Sample(TrailTextureSampler, trailUV);

Bad Code:

Shader sbterrainpixel.cso instruction count 239
(saves 6 instructions)

old:
float shadInBorder = saturate(step(0.95, shadCenter.x) + step(0.95, shadCenter.y) +
step(shadCenter.x, 0.05) + step(shadCenter.y, 0.05));
// todo: more efficient.
color.xyz -= (color.xyz*(1.0-shadTot) * (1.0-shadInBorder));

new:
// if we're on the border, come up with a value of at least 1.
float shadInBorder = step(0.95, shadCenter.x) + step(0.95, shadCenter.y) +
step(shadCenter.x, 0.05) + step(shadCenter.y, 0.05);
// multiply color by the shadow value, unless we are on the border.
color.xyz *= saturate(shadTot + shadInBorder);

More Bad Code:

Shader sbterrainpixel.cso instruction count 235
(saves 4 instructions)

old:
float shadInWideBorder = saturate(step(0.95, wideShadowUV.x) + step(0.95, wideShadowUV.y) + step(wideShadowUV.x, 0.05) + step(wideShadowUV.y, 0.05));
color.xyz -= (color.xyz*(1.0 - wideshadTot)) * shadInBorder * (1.0-shadInWideBorder);

new:
// apply wide shadow if we are not in the wide border and are in the short border.
float shadInWideBorder = step(0.95, wideShadowUV.x) + step(0.95, wideShadowUV.y) + step(wideShadowUV.x, 0.05) + step(wideShadowUV.y, 0.05);
color.xyz *= saturate(wideshadTot + shadInWideBorder + step(shadInBorder, 0.9));