Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // Pixel UberShader for 0 texgens
- // Integer dot product of two 3-component vectors.
- int idot(int3 x, int3 y)
- {
-     return x.x * y.x + x.y * y.y + x.z * y.z;
- }
- // Integer dot product of two 4-component vectors.
- int idot(int4 x, int4 y)
- {
-     return x.x * y.x + x.y * y.y + x.z * y.z + x.w * y.w;
- }
- // Round-to-nearest float -> int conversions, one overload per vector width.
- int iround(float x) {
-     float nearest = round(x);
-     return int(nearest);
- }
- int2 iround(float2 x) {
-     float2 nearest = round(x);
-     return int2(nearest);
- }
- int3 iround(float3 x) {
-     float3 nearest = round(x);
-     return int3(nearest);
- }
- int4 iround(float4 x) {
-     float4 nearest = round(x);
-     return int4(nearest);
- }
- SamplerState samp[8] : register(s0); // eight sampler states, bound starting at register s0
- Texture2DArray Tex[8] : register(t0); // eight texture arrays, bound starting at register t0; sampleTexture pairs Tex[i] with samp[i]
- cbuffer PSBlock : register(b0) { // pixel-shader constant buffer, bound at register b0
- int4 color[4]; // initial values of the four TEV registers (main copies them into State.Reg)
- int4 k[4]; // not referenced in this listing
- int4 alphaRef; // .r and .g are the two reference values used by the alpha test
- float4 texdim[8]; // not referenced in this listing
- int4 czbias[2]; // depth texture: [0] is dotted with TexColor, [1].w is the fixed bias
- int4 cindscale[2]; // not referenced in this listing
- int4 cindmtx[6]; // not referenced in this listing
- int4 cfogcolor; // .rgb is the color blended in by the fog step
- int4 cfogi; // fog integers: .y is B and .w is B_SHF in the perspective ze formula
- float4 cfogf[2]; // fog floats: [0].xyz feed the range adjustment; [1].x scales ze, [1].z is subtracted from ze
- float4 czslope; // not referenced in this listing
- float2 cefbscale; // not referenced in this listing
- uint bpmem_genmode; // bits 10..13: index of the last TEV stage to run
- uint bpmem_alphaTest; // 0 skips the alpha test; bits 16..18 and 19..21 pick the two compares, bits 22..23 the combine logic
- uint bpmem_fogParam3; // bit 20: 0 = perspective, 1 = orthographic; bits 21..23: fog function (0 = fog off)
- uint bpmem_fogRangeBase; // bit 10 enables the fog range (x) adjustment
- uint bpmem_dstalpha; // when non-zero, its low 8 bits replace the alpha written to ocol0
- uint bpmem_ztex_op; // 0 = depth texture off; 1 = add zCoord to the texture depth; other values use the texture depth alone
- bool bpmem_late_ztest; // not referenced in this listing
- bool bpmem_rgba6_format; // true: ocol0.rgb is quantised to 6 bits per channel
- bool bpmem_dither; // true: apply the 2x2 dither before fog
- bool bpmem_bounding_box; // not referenced in this listing
- uint4 bpmem_pack1[16]; // per-stage words: .xy combiners, .z tevind, .w iref (see the accessor macros below)
- uint4 bpmem_pack2[8]; // .x tevorder, .y tevksel (see the accessor macros below)
- int4 konstLookup[32]; // table indexed by the 5-bit selectors in getKonstColor
- };
- #define bpmem_combiners(i) (bpmem_pack1[(i)].xy) // .x = color combiner word, .y = alpha combiner word for stage i
- #define bpmem_tevind(i) (bpmem_pack1[(i)].z) // not used in this listing
- #define bpmem_iref(i) (bpmem_pack1[(i)].w) // not used in this listing
- #define bpmem_tevorder(i) (bpmem_pack2[(i)].x) // one word per stage pair; main indexes it with stage >> 1
- #define bpmem_tevksel(i) (bpmem_pack2[(i)].y) // swap-table bits (Swizzle) and konst selectors (getKonstColor)
- struct VS_OUTPUT { // vertex-stage output layout; the struct itself is not referenced elsewhere in this listing
- float4 pos : POSITION;
- float4 colors_0 : COLOR0; // same semantic as main's colors_0 input
- float4 colors_1 : COLOR1; // same semantic as main's colors_1 input
- float4 clipPos : TEXCOORD0;
- float clipDist0 : SV_ClipDistance0;
- float clipDist1 : SV_ClipDistance1;
- };
- // Returns the `size`-bit field of `val` that starts at bit `off`.
- // Hand-written stand-in for the bitfieldExtract built-in, which is only
- // available in OpenGL 4.0+ and ES 3.1+. Microsoft's HLSL compiler turns this
- // pattern into a bitfield extract instruction on its own.
- uint bitfieldExtract(uint val, int off, int size) {
-     uint shifted = uint(val >> off);
-     uint low_bits = uint((1 << size) - 1);
-     return shifted & low_bits;
- }
- // Samples layer 0 of texture `sampler_num` at `uv` and returns the texel as
- // integers, each channel scaled by 255 and rounded.
- //   sampler_num - index 0..7 selecting the Tex[i] / samp[i] pair
- //   uv          - texture coordinate passed straight to Sample()
- // Returns int4(0, 0, 0, 0) for any index outside 0..7, so every control path
- // returns a defined value.
- int4 sampleTexture(uint sampler_num, float2 uv) {
-     // This is messy, but DirectX, OpenGL 3.3 and OpenGL ES 3.0 don't support dynamic indexing of the sampler array.
-     // With any luck the shader compiler will optimise this if the hardware supports dynamic indexing.
-     switch(sampler_num) {
-     case 0u: return iround(Tex[0].Sample(samp[0], float3(uv, 0.0)) * 255.0);
-     case 1u: return iround(Tex[1].Sample(samp[1], float3(uv, 0.0)) * 255.0);
-     case 2u: return iround(Tex[2].Sample(samp[2], float3(uv, 0.0)) * 255.0);
-     case 3u: return iround(Tex[3].Sample(samp[3], float3(uv, 0.0)) * 255.0);
-     case 4u: return iround(Tex[4].Sample(samp[4], float3(uv, 0.0)) * 255.0);
-     case 5u: return iround(Tex[5].Sample(samp[5], float3(uv, 0.0)) * 255.0);
-     case 6u: return iround(Tex[6].Sample(samp[6], float3(uv, 0.0)) * 255.0);
-     case 7u: return iround(Tex[7].Sample(samp[7], float3(uv, 0.0)) * 255.0);
-     default: return int4(0, 0, 0, 0); // out-of-range index: no texture to sample
-     }
- }
- // Color channel swapping: builds a new color whose r/g/b/a each pick one
- // channel of `color`, using 2-bit selectors from swap-table entry `s`.
- // Entry s occupies two tevksel words: 2*s holds the r and g selectors,
- // 2*s + 1 holds the b and a selectors.
- int4 Swizzle(uint s, int4 color) {
-     uint sel_rg = bpmem_tevksel(s * 2u);
-     uint sel_ba = bpmem_tevksel(s * 2u + 1u);
-     return int4(color[bitfieldExtract(sel_rg, 0, 2)],
-                 color[bitfieldExtract(sel_rg, 2, 2)],
-                 color[bitfieldExtract(sel_ba, 0, 2)],
-                 color[bitfieldExtract(sel_ba, 2, 2)]);
- }
- // Applies a wrap mode to a coordinate.
- //   mode 0u      (ITW_OFF)           - coordinate is returned unchanged
- //   mode 1u..5u  (ITW_256 to ITW_16) - coordinate is masked with 0xfffe >> mode
- //   mode >= 6u   (ITW_0)             - result is always 0
- int Wrap(int coord, uint mode) {
-     if (mode >= 6u)
-         return 0;
-     if (mode != 0u)
-         return coord & (0xfffe >> mode);
-     return coord;
- }
- // TEV linear interpolation for a single channel: blends A toward B by C, then
- // applies the bias, the add/subtract of D and the output scale.
- //   bias  - 1u adds 128 to D, 2u subtracts 128 from D, anything else leaves D alone
- //   op    - true yields D - lerp, false yields D + lerp
- //   alpha - true when evaluating the alpha channel; only affects when rounding is added
- //   shift - 0u..2u is a left shift applied inside the lerp; 3u halves the final result
- int tevLerp(int A, int B, int C, int D, uint bias, bool op, bool alpha, uint shift) {
-     bool halve = (shift == 3u);
-     // Stretch C from 0..255 to 0..256
-     C = C + (C >> 7);
-     if (bias == 1u)
-         D = D + 128;
-     else if (bias == 2u)
-         D = D - 128;
-     int blended = (A << 8) + (B - A) * C;
-     if (!halve) {
-         // The scale is folded into the lerp (and D) for improved precision
-         blended = blended << shift;
-         D = D << shift;
-     }
-     // Rounding term; it is only added when "halving" and "alpha" agree
-     if (halve == alpha)
-         blended += (op ? 127 : 128);
-     int magnitude = blended >> 8;
-     int result;
-     if (op)
-         result = D - magnitude; // Subtract
-     else
-         result = D + magnitude; // Add
-     // The divide by 2 is the one scale still applied after the lerp
-     if (halve)
-         result = result >> 1;
-     return result;
- }
- // Three-channel form of tevLerp: blends A toward B by C per component, then
- // applies the bias, the add/subtract of D and the output scale.
- //   bias  - 1u adds 128 to D, 2u subtracts 128 from D, anything else leaves D alone
- //   op    - true yields D - lerp, false yields D + lerp
- //   alpha - only affects when the rounding term is added
- //   shift - 0u..2u is a left shift applied inside the lerp; 3u halves the final result
- int3 tevLerp3(int3 A, int3 B, int3 C, int3 D, uint bias, bool op, bool alpha, uint shift) {
-     bool halve = (shift == 3u);
-     // Stretch C from 0..255 to 0..256
-     C = C + (C >> 7);
-     if (bias == 1u)
-         D = D + 128;
-     else if (bias == 2u)
-         D = D - 128;
-     int3 blended = (A << 8) + (B - A) * C;
-     if (!halve) {
-         // The scale is folded into the lerp (and D) for improved precision
-         blended = blended << shift;
-         D = D << shift;
-     }
-     // Rounding term; it is only added when "halving" and "alpha" agree
-     if (halve == alpha)
-         blended = blended + (op ? 127 : 128);
-     int3 magnitude = blended >> 8;
-     int3 result;
-     if (op)
-         result = D - magnitude; // Subtract
-     else
-         result = D + magnitude; // Add
-     // The divide by 2 is the one scale still applied after the lerp
-     if (halve)
-         result = result >> 1;
-     return result;
- }
- // TEV compare-mode operations 0-5, which the color and alpha combiners share.
- // Even ops are "greater than", odd ops are "equal"; ops 0/1 look at R only,
- // 2/3 at G:R as a 16-bit value, 4/5 at B:G:R as a 24-bit value.
- // Any other op returns false.
- bool tevCompare(uint op, int3 color_A, int3 color_B) {
-     bool r_equal = (color_A.r == color_B.r);
-     bool g_equal = (color_A.g == color_B.g);
-     bool b_equal = (color_A.b == color_B.b);
-     if (op == 0u) // TEVCMP_R8_GT
-         return (color_A.r > color_B.r);
-     if (op == 1u) // TEVCMP_R8_EQ
-         return r_equal;
-     if (op == 2u) { // TEVCMP_GR16_GT
-         int a16 = (color_A.r | (color_A.g << 8));
-         int b16 = (color_B.r | (color_B.g << 8));
-         return a16 > b16;
-     }
-     if (op == 3u) // TEVCMP_GR16_EQ
-         return (r_equal && g_equal);
-     if (op == 4u) { // TEVCMP_BGR24_GT
-         int a24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16));
-         int b24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16));
-         return a24 > b24;
-     }
-     if (op == 5u) // TEVCMP_BGR24_EQ
-         return (r_equal && g_equal && b_equal);
-     return false;
- }
- // Helper function for Alpha Test: evaluates `a <compare> b`.
- //   compare - 0 NEVER, 1 LESS, 2 EQUAL, 3 LEQUAL, 4 GREATER, 5 NEQUAL,
- //             6 GEQUAL, 7 ALWAYS
- // Callers pass a 3-bit field, so 0..7 is the full range. ALWAYS is also the
- // default so the function returns a defined value on every control path.
- bool alphaCompare(int a, int b, uint compare) {
-     switch (compare) {
-     case 0u: // NEVER
-         return false;
-     case 1u: // LESS
-         return a < b;
-     case 2u: // EQUAL
-         return a == b;
-     case 3u: // LEQUAL
-         return a <= b;
-     case 4u: // GREATER
-         return a > b;
-     case 5u: // NEQUAL
-         return a != b;
-     case 6u: // GEQUAL
-         return a >= b;
-     case 7u: // ALWAYS
-     default: // out-of-range selector: treat as ALWAYS
-         return true;
-     }
- }
- struct State { // TEV state carried across stages
- int4 Reg[4]; // [0] = prev, [1] = c0, [2] = c1, [3] = c2
- int4 TexColor; // texture color input; main sets it to zero and nothing in this listing writes it again
- int AlphaBump; // value returned by getRasColor for ras 5 and 6; main sets it to zero
- };
- struct StageState { // per-stage configuration words, filled in by main at the top of each loop iteration
- uint stage; // stage index
- uint order; // tevorder word for this stage (already shifted right by 12 for odd stages)
- uint cc; // color combiner word
- uint ac; // alpha combiner word; its low 2 bits also select the swap table in getRasColor
- };
- int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1); // forward declaration; defined after main
- int4 getKonstColor(State s, StageState ss); // forward declaration; defined after main
- // Resolves a 4-bit color-combiner input selector to an rgb value.
- //   s, ss              - current TEV state and stage configuration
- //   colors_0, colors_1 - lighting channel colors, forwarded to getRasColor
- //   index              - input selector (0..15)
- // "Zero" doubles as the default so that every control path returns a value,
- // matching getTevReg, which also has a default.
- int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) {
-     switch (index) {
-     case 0u: // prev.rgb
-         return s.Reg[0].rgb;
-     case 1u: // prev.aaa
-         return s.Reg[0].aaa;
-     case 2u: // c0.rgb
-         return s.Reg[1].rgb;
-     case 3u: // c0.aaa
-         return s.Reg[1].aaa;
-     case 4u: // c1.rgb
-         return s.Reg[2].rgb;
-     case 5u: // c1.aaa
-         return s.Reg[2].aaa;
-     case 6u: // c2.rgb
-         return s.Reg[3].rgb;
-     case 7u: // c2.aaa
-         return s.Reg[3].aaa;
-     case 8u: // tex.rgb
-         return s.TexColor.rgb;
-     case 9u: // tex.aaa
-         return s.TexColor.aaa;
-     case 10u: // ras.rgb
-         return getRasColor(s, ss, colors_0, colors_1).rgb;
-     case 11u: // ras.aaa
-         return getRasColor(s, ss, colors_0, colors_1).aaa;
-     case 12u: // One
-         return int3(255, 255, 255);
-     case 13u: // Half
-         return int3(128, 128, 128);
-     case 14u: // konst.rgb
-         return getKonstColor(s, ss).rgb;
-     case 15u: // Zero
-     default: // out-of-range selector: treat as Zero
-         return int3(0, 0, 0);
-     }
- }
- // Resolves a 3-bit alpha-combiner input selector to an alpha value.
- //   s, ss              - current TEV state and stage configuration
- //   colors_0, colors_1 - lighting channel colors, forwarded to getRasColor
- //   index              - input selector (0..7)
- // "Zero" doubles as the default so that every control path returns a value,
- // matching getTevReg, which also has a default.
- int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint index) {
-     switch (index) {
-     case 0u: // prev.a
-         return s.Reg[0].a;
-     case 1u: // c0.a
-         return s.Reg[1].a;
-     case 2u: // c1.a
-         return s.Reg[2].a;
-     case 3u: // c2.a
-         return s.Reg[3].a;
-     case 4u: // tex.a
-         return s.TexColor.a;
-     case 5u: // ras.a
-         return getRasColor(s, ss, colors_0, colors_1).a;
-     case 6u: // konst.a
-         return getKonstColor(s, ss).a;
-     case 7u: // Zero
-     default: // out-of-range selector: treat as Zero
-         return 0;
-     }
- }
- // Reads one of the four TEV registers: 1 = c0, 2 = c1, 3 = c2.
- // Index 0 and every other value read prev.
- int4 getTevReg(in State s, uint index) {
-     if (index == 1u) // c0
-         return s.Reg[1];
-     if (index == 2u) // c1
-         return s.Reg[2];
-     if (index == 3u) // c2
-         return s.Reg[3];
-     return s.Reg[0]; // prev
- }
- // Writes `color` into the rgb part of TEV register `index`
- // (0 = prev, 1 = c0, 2 = c1, 3 = c2). Other index values write nothing.
- void setRegColor(inout State s, uint index, int3 color) {
-     if (index == 0u)      // prev
-         s.Reg[0].rgb = color;
-     else if (index == 1u) // c0
-         s.Reg[1].rgb = color;
-     else if (index == 2u) // c1
-         s.Reg[2].rgb = color;
-     else if (index == 3u) // c2
-         s.Reg[3].rgb = color;
- }
- // Writes `alpha` into the a part of TEV register `index`
- // (0 = prev, 1 = c0, 2 = c1, 3 = c2). Other index values write nothing.
- void setRegAlpha(inout State s, uint index, int alpha) {
-     if (index == 0u)      // prev
-         s.Reg[0].a = alpha;
-     else if (index == 1u) // c0
-         s.Reg[1].a = alpha;
-     else if (index == 2u) // c1
-         s.Reg[2].a = alpha;
-     else if (index == 3u) // c2
-         s.Reg[3].a = alpha;
- }
- // Pixel shader entry point.
- // Runs every enabled TEV stage, then applies the depth texture, alpha test,
- // dithering and fog, and finally writes both render targets.
- //   ocol0              - final color; alpha is 6-bit quantised TEV alpha, or the
- //                        constant from bpmem_dstalpha when that is non-zero
- //   ocol1              - rgb is zero; .a carries the 8-bit TEV alpha for dual
- //                        source blending
- //   rawpos             - SV_Position; .xy drive the dither pattern and the fog
- //                        range adjustment, .z drives the depth value
- //   colors_0, colors_1 - interpolated lighting channel colors
- //   clipPos, clipDist0, clipDist1 - declared but not read in this function
- // May discard the pixel when the alpha test fails.
- void main(
-     out float4 ocol0 : SV_Target0,
-     out float4 ocol1 : SV_Target1,
-     in float4 rawpos : SV_Position,
-     in float4 colors_0 : COLOR0,
-     in float4 colors_1 : COLOR1,
-     in float4 clipPos : TEXCOORD0,
-     in float clipDist0 : SV_ClipDistance0,
-     in float clipDist1 : SV_ClipDistance1
- ) {
-     // Nothing below samples a texture, so TexColor and AlphaBump keep these
-     // zero values for the whole shader; only the four registers change.
-     State s;
-     s.TexColor = int4(0, 0, 0, 0);
-     s.AlphaBump = 0;
-     s.Reg[0] = color[0];
-     s.Reg[1] = color[1];
-     s.Reg[2] = color[2];
-     s.Reg[3] = color[3];
-     // Index of the last stage to run; the loop bound below is inclusive.
-     uint num_stages = bitfieldExtract(bpmem_genmode, 10, 4);
-     // Main tev loop
-     [loop]
-     for(uint stage = 0u; stage <= num_stages; stage++)
-     {
-         StageState ss;
-         ss.stage = stage;
-         ss.cc = bpmem_combiners(stage).x;
-         ss.ac = bpmem_combiners(stage).y;
-         // Two stages share one tevorder word; odd stages use the bits from 12 upwards.
-         ss.order = bpmem_tevorder(stage>>1);
-         if ((stage & 1u) == 1u)
-             ss.order = ss.order >> 12;
-         // This is the Meat of TEV
-         {
-             // Color Combiner
-             uint color_a = bitfieldExtract(ss.cc, 12, 4);
-             uint color_b = bitfieldExtract(ss.cc, 8, 4);
-             uint color_c = bitfieldExtract(ss.cc, 4, 4);
-             uint color_d = bitfieldExtract(ss.cc, 0, 4);
-             uint color_bias = bitfieldExtract(ss.cc, 16, 2);
-             bool color_op = bool(bitfieldExtract(ss.cc, 18, 1));
-             bool color_clamp = bool(bitfieldExtract(ss.cc, 19, 1));
-             uint color_shift = bitfieldExtract(ss.cc, 20, 2);
-             uint color_dest = bitfieldExtract(ss.cc, 22, 2);
-             // In compare mode (bias == 3) the shift and op bits together form the compare op.
-             uint color_compare_op = color_shift << 1 | uint(color_op);
-             // A, B and C are truncated to 8 bits; D keeps its full range.
-             int3 color_A = selectColorInput(s, ss, colors_0, colors_1, color_a) & int3(255, 255, 255);
-             int3 color_B = selectColorInput(s, ss, colors_0, colors_1, color_b) & int3(255, 255, 255);
-             int3 color_C = selectColorInput(s, ss, colors_0, colors_1, color_c) & int3(255, 255, 255);
-             int3 color_D = selectColorInput(s, ss, colors_0, colors_1, color_d); // 10 bits + sign
-             int3 color;
-             if(color_bias != 3u) { // Normal mode
-                 color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, color_shift);
-             } else { // Compare mode
-                 // op 6 and 7 do a select per color channel
-                 if (color_compare_op == 6u) {
-                     // TEVCMP_RGB8_GT
-                     color.r = (color_A.r > color_B.r) ? color_C.r : 0;
-                     color.g = (color_A.g > color_B.g) ? color_C.g : 0;
-                     color.b = (color_A.b > color_B.b) ? color_C.b : 0;
-                 } else if (color_compare_op == 7u) {
-                     // TEVCMP_RGB8_EQ
-                     color.r = (color_A.r == color_B.r) ? color_C.r : 0;
-                     color.g = (color_A.g == color_B.g) ? color_C.g : 0;
-                     color.b = (color_A.b == color_B.b) ? color_C.b : 0;
-                 } else {
-                     // The remaining ops do one compare which selects all 3 channels
-                     color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, 0);
-                 }
-                 color = color_D + color;
-             }
-             // Clamp result
-             if (color_clamp)
-                 color = clamp(color, 0, 255);
-             else
-                 color = clamp(color, -1024, 1023);
-             // Write result to the correct input register of the next stage
-             setRegColor(s, color_dest, color);
-             // Alpha Combiner
-             uint alpha_a = bitfieldExtract(ss.ac, 13, 3);
-             uint alpha_b = bitfieldExtract(ss.ac, 10, 3);
-             uint alpha_c = bitfieldExtract(ss.ac, 7, 3);
-             uint alpha_d = bitfieldExtract(ss.ac, 4, 3);
-             uint alpha_bias = bitfieldExtract(ss.ac, 16, 2);
-             bool alpha_op = bool(bitfieldExtract(ss.ac, 18, 1));
-             bool alpha_clamp = bool(bitfieldExtract(ss.ac, 19, 1));
-             uint alpha_shift = bitfieldExtract(ss.ac, 20, 2);
-             uint alpha_dest = bitfieldExtract(ss.ac, 22, 2);
-             uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op);
-             // Zero-initialised so both are defined on every path, including the
-             // one that skips the lookups below.
-             int alpha_A = 0;
-             int alpha_B = 0;
-             if (alpha_bias != 3u || alpha_compare_op > 5u) {
-                 // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5
-                 alpha_A = selectAlphaInput(s, ss, colors_0, colors_1, alpha_a) & 255;
-                 alpha_B = selectAlphaInput(s, ss, colors_0, colors_1, alpha_b) & 255;
-             }
-             int alpha_C = selectAlphaInput(s, ss, colors_0, colors_1, alpha_c) & 255;
-             int alpha_D = selectAlphaInput(s, ss, colors_0, colors_1, alpha_d); // 10 bits + sign
-             int alpha;
-             if(alpha_bias != 3u) { // Normal mode
-                 alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, true, alpha_shift);
-             } else { // Compare mode
-                 if (alpha_compare_op == 6u) {
-                     // TEVCMP_A8_GT
-                     alpha = (alpha_A > alpha_B) ? alpha_C : 0;
-                 } else if (alpha_compare_op == 7u) {
-                     // TEVCMP_A8_EQ
-                     alpha = (alpha_A == alpha_B) ? alpha_C : 0;
-                 } else {
-                     // All remaining alpha compare ops actually compare the color channels
-                     alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0;
-                 }
-                 alpha = alpha_D + alpha;
-             }
-             // Clamp result
-             if (alpha_clamp)
-                 alpha = clamp(alpha, 0, 255);
-             else
-                 alpha = clamp(alpha, -1024, 1023);
-             // Write result to the correct input register of the next stage
-             setRegAlpha(s, alpha_dest, alpha);
-         }
-     } // Main tev loop
-     // The output is whatever the last stage wrote: rgb from its color
-     // destination register, alpha from its alpha destination register.
-     int4 TevResult;
-     TevResult.xyz = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).x, 22, 2)).xyz;
-     TevResult.w = getTevReg(s, bitfieldExtract(bpmem_combiners(num_stages).y, 22, 2)).w;
-     TevResult &= 255;
-     // 24-bit integer depth, inverted from rawpos.z
-     int zCoord = int((1.0 - rawpos.z) * 16777216.0);
-     zCoord = clamp(zCoord, 0, 0xFFFFFF);
-     // Depth Texture
-     if (bpmem_ztex_op != 0u) {
-         int ztex = int(czbias[1].w); // fixed bias
-         // Whatever texture was in our last stage, it's now our depth texture
-         ztex += idot(s.TexColor.xyzw, czbias[0].xyzw);
-         ztex += (bpmem_ztex_op == 1u) ? zCoord : 0;
-         zCoord = ztex & 0xFFFFFF;
-     }
-     // Alpha Test
-     if (bpmem_alphaTest != 0u) {
-         bool comp0 = alphaCompare(TevResult.a, alphaRef.r, bitfieldExtract(bpmem_alphaTest, 16, 3));
-         bool comp1 = alphaCompare(TevResult.a, alphaRef.g, bitfieldExtract(bpmem_alphaTest, 19, 3));
-         // These if statements are written weirdly to work around Intel and Qualcomm bugs with handling booleans.
-         switch (bitfieldExtract(bpmem_alphaTest, 22, 2)) {
-         case 0u: // AND
-             if (comp0 && comp1) break; else discard; break;
-         case 1u: // OR
-             if (comp0 || comp1) break; else discard; break;
-         case 2u: // XOR
-             if (comp0 != comp1) break; else discard; break;
-         case 3u: // XNOR
-             if (comp0 == comp1) break; else discard; break;
-         }
-     }
-     if (bpmem_dither) {
-         // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering
-         // Here the matrix is encoded into the two factor constants
-         int2 dither = int2(rawpos.xy) & 1;
-         TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2);
-     }
-     // Fog
-     uint fog_function = bitfieldExtract(bpmem_fogParam3, 21, 3);
-     if (fog_function != 0u) {
-         // TODO: This all needs to be converted from float to fixed point
-         float ze;
-         if (bitfieldExtract(bpmem_fogParam3, 20, 1) == 0u) {
-             // perspective
-             // ze = A/(B - (Zs >> B_SHF))
-             ze = (cfogf[1].x * 16777216.0) / float(cfogi.y - (zCoord >> cfogi.w));
-         } else {
-             // orthographic
-             // ze = a*Zs (here, no B_SHF)
-             ze = cfogf[1].x * float(zCoord) / 16777216.0;
-         }
-         if (bool(bitfieldExtract(bpmem_fogRangeBase, 10, 1))) {
-             // x_adjust = sqrt((x-center)^2 + k^2)/k
-             // ze *= x_adjust
-             // TODO Instead of this theoretical calculation, we should use the
-             // coefficient table given in the fog range BP registers!
-             float x_adjust = (2.0 * (rawpos.x / cfogf[0].y)) - 1.0 - cfogf[0].x;
-             x_adjust = sqrt(x_adjust * x_adjust + cfogf[0].z * cfogf[0].z) / cfogf[0].z;
-             ze *= x_adjust;
-         }
-         float fog = clamp(ze - cfogf[1].z, 0.0, 1.0);
-         // Functions 1..3 use the clamped value as-is; 4..7 reshape it.
-         if (fog_function > 3u) {
-             switch (fog_function) {
-             case 4u:
-                 fog = 1.0 - exp2(-8.0 * fog);
-                 break;
-             case 5u:
-                 fog = 1.0 - exp2(-8.0 * fog * fog);
-                 break;
-             case 6u:
-                 fog = exp2(-8.0 * (1.0 - fog));
-                 break;
-             case 7u:
-                 fog = 1.0 - fog;
-                 fog = exp2(-8.0 * fog * fog);
-                 break;
-             }
-         }
-         // Blend toward the fog color with an 8-bit fixed-point weight
-         int ifog = iround(fog * 256.0);
-         TevResult.rgb = (TevResult.rgb * (256 - ifog) + cfogcolor.rgb * ifog) >> 8;
-     }
-     if (bpmem_rgba6_format)
-         ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0;
-     else
-         ocol0.rgb = float3(TevResult.rgb) / 255.0;
-     // Output alpha is always written as a 6-bit value
-     if (bpmem_dstalpha != 0u)
-         ocol0.a = float(bitfieldExtract(bpmem_dstalpha, 0, 8) >> 2) / 63.0;
-     else
-         ocol0.a = float(TevResult.a >> 2) / 63.0;
-     // Dest alpha override (dual source blending)
-     // Colors will be blended against the alpha from ocol1 and
-     // the alpha from ocol0 will be written to the framebuffer.
-     ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0);
- }
- // Returns the rasterized color input for the current stage, chosen by the
- // 3-bit field at bits 7..9 of ss.order:
- //   0 or 1 - lighting channel 0 / 1, scaled by 255, rounded, then swizzled
- //            through the swap table picked by the low 2 bits of ss.ac
- //   5      - alpha bump, replicated to all four channels
- //   6      - normalized alpha bump, replicated to all four channels
- //   other  - zero
- int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) {
-     // Select Ras for stage
-     uint ras = bitfieldExtract(ss.order, 7, 3);
-     int4 result = int4(0, 0, 0, 0);
-     if (ras < 2u) { // Lighting Channel 0 or 1
-         float4 channel = colors_1;
-         if (ras == 0u)
-             channel = colors_0;
-         int4 color = iround(channel * 255.0);
-         uint swap = bitfieldExtract(ss.ac, 0, 2);
-         result = Swizzle(swap, color);
-     } else if (ras == 5u) { // Alpha Bump
-         int bump = s.AlphaBump;
-         result = int4(bump, bump, bump, bump);
-     } else if (ras == 6u) { // Normalized Alpha Bump
-         int normalized = s.AlphaBump | (s.AlphaBump >> 5);
-         result = int4(normalized, normalized, normalized, normalized);
-     }
-     return result;
- }
- // Returns the konst color for the current stage: rgb and alpha are looked up
- // separately in konstLookup through two 5-bit selectors.
- // One tevksel word serves two stages. Even stages take the rgb selector from
- // bits 4..8 and the alpha selector from bits 9..13; odd stages use bits
- // 14..18 and 19..23.
- int4 getKonstColor(State s, StageState ss) {
-     // Select Konst for stage
-     // TODO: a switch case might be better here than a dynamically indexed uniform lookup
-     uint tevksel = bpmem_tevksel(ss.stage>>1);
-     bool odd_stage = ((ss.stage & 1u) != 0u);
-     int rgb_offset = odd_stage ? 14 : 4;
-     int alpha_offset = odd_stage ? 19 : 9;
-     uint rgb_select = bitfieldExtract(tevksel, rgb_offset, 5);
-     uint alpha_select = bitfieldExtract(tevksel, alpha_offset, 5);
-     return int4(konstLookup[rgb_select].rgb, konstLookup[alpha_select].a);
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement