Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <windows.h>
- #include <D3D10.h>
- #include <D3DX10.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <stdarg.h>
- #include <algorithm>
- // "Does your GPU have pixel alignment requirements for quads?" tester
- // Win32 only, D3D10+ only because it's a quick hack.
- //
- // The test works like this:
- // We fill the screen with triangles laid out in a grid, with a spacing of 2 pixels
- // in each direction. Each triangle has the vertex coordinates (in pixels):
- // (x0, y0)
- // (x0, y0 + 2.5)
- // (x0 + 2.5, y0)
- // which means each triangle covers exactly 3 pixels. They end up looking like this:
- // *
- // * *
- // where the lower-left star is at the base vertex position (x0, y0). This is all set
- // up in the vertex shader. We then try displacing the triangles by one pixel each in
- // the x and y directions (as well as both at once). If the GPU can produce rasterized
- // quads with arbitrary alignment, this shouldn't make any significant difference. If,
- // however, the GPU tiles the render target with "quad footprints" and will rasterize
- // any quads touched by a triangle, shifting our test triangles by 1 pixel in either x
- // or y will cause 2 quads (instead of 1) to be rasterized per triangle; the case where
- // we shift by 1 pixel in both x and y is even worse, since the triangle now covers 3
- // quads (each of which have exactly one pixel lit).
- //
- // The PS just does random busy work (counting primes under a specified threshold) in
- // a deliberately inefficient way to make pixels take long to shade so we get long
- // enough frame times. The main point here is to just keep the ALUs busy and make sure
- // that other potential limiting factors (such as memory bandwidth) are out of the
- // equation. WorkFactor (set in the code below) controls how much work is done per pixel.
- // You have to set this up properly so the driver doesn't think the GPU is hung working
- // on the DrawPrimitive and does a reset. :) Choosing WorkFactor so the first test takes
- // about 130ms worked fine for me.
- //
- // So, for unconstrained quad alignment we expect equal performance in all 4 cases, whereas
- // for even-x/y alignment we expect one of the cases (probably the non-shifted one) to be
- // fastest, the two cases that hit 2 quads each to be roughly twice as slow, and the last
- // case (which hits 3 quads per triangle) to take roughly 3x as long as the fast case.
- //
- // Test results:
- //
- // AMD Radeon HD 5770 (WorkFactor = 100) - Evergreen series
- // xOffset=0 yOffset=0: 191.7 ms
- // xOffset=1 yOffset=0: 383.9 ms
- // xOffset=0 yOffset=1: 384.3 ms
- // xOffset=1 yOffset=1: 588.5 ms
- // -> nearly perfectly linear - even alignment required.
- //
- // AMD Radeon HD 6900 (WorkFactor = 100) - Northern islands series
- // xOffset=0 yOffset=0: 137.7 ms
- // xOffset=1 yOffset=0: 206.9 ms
- // xOffset=0 yOffset=1: 208.0 ms
- // xOffset=1 yOffset=1: 301.8 ms
- // -> likely to have even-alignment requirement.
- //
- // NVidia GeForce 8800 GTX (WorkFactor = 50) - G80 architecture
- // xOffset=0 yOffset=0: 127.6 ms
- // xOffset=1 yOffset=0: 157.0 ms
- // xOffset=0 yOffset=1: 158.5 ms
- // xOffset=1 yOffset=1: 252.8 ms
- // -> we definitely get some slowdown here, but not as much as expected. The ratios are
- // fairly consistent between different work factors, so this isn't just some constant overhead
- // that's distorting the relative frame times. I don't have detailed knowledge of how the
- // GF 8x00 series handles rasterization and quad dispatch internally, so I can't say for sure
- // what's going on here.
- //
- // NVidia GeForce GTX 465 (WorkFactor = 100) - Fermi architecture
- // xOffset=0 yOffset=0: 71.8ms
- // xOffset=1 yOffset=0: 139.3ms
- // xOffset=0 yOffset=1: 138.3ms
- // xOffset=1 yOffset=1: 207.9ms
- // -> This one's crystal clear: alignment required. Whatever non-shader bottlenecks affected this
- // test on previous NV architectures also appear to have been eliminated.
- // Shader code
- static const char shaderCode[] =
- "cbuffer cbAll : register(cb0) {\n"
- " float4 pixelToNDC;\n"
- " uint width, upperBound;\n"
- "};\n"
- "struct PSIn {\n"
- " float4 Pos : SV_POSITION;\n"
- "};\n"
- "PSIn VS(uint i : SV_VertexID)\n"
- "{\n"
- " PSIn o;\n"
- " uint iTri = i / 3;\n"
- " uint iVertInTri = i % 3;\n"
- " float2 v;\n"
- " v.x = (float) (iTri % width)*2;\n"
- " v.y = (float) (iTri / width)*2;\n"
- " v.x += (iVertInTri == 2) ? 2.5f : 0.0f;\n"
- " v.y += (iVertInTri == 1) ? 2.5f : 0.0f;\n"
- " o.Pos.xy = v * pixelToNDC.xy + pixelToNDC.zw;\n"
- " o.Pos.z = 0.5f;\n"
- " o.Pos.w = 1.0f;\n"
- " return o;\n"
- "}\n"
- "float4 PS(PSIn x) : SV_Target\n"
- "{\n"
- " uint nPrimes = 0;\n"
- " for (uint i=2; i < upperBound; i++)\n"
- " {\n"
- " uint nFactors = 0;\n"
- " for (uint j=2; j <= i; j++)\n"
- " if (i % j == 0) nFactors++;\n"
- " nPrimes += (nFactors == 1) ? 1 : 0;\n"
- " }\n"
- " return (nPrimes != 0) ? float4(1,1,1,1) : float4(0,0,0,0);\n"
- "}\n"
- "technique10 Render {\n"
- " pass P0 {\n"
- " SetVertexShader(CompileShader(vs_4_0, VS()));\n"
- " SetGeometryShader(NULL);\n"
- " SetPixelShader(CompileShader(ps_4_0, PS()));\n"
- " }\n"
- "}\n";
- // ---- App code.
- #pragma comment(lib, "d3d10.lib")
- #pragma comment(lib, "d3dx10.lib")
- static const int WIDTH = 512, HEIGHT = 512;
- static HWND hWnd;
- static IDXGISwapChain *swapChain;
- static ID3D10Device *device;
- static ID3D10RenderTargetView *renderTargetView;
- static ID3D10Effect *effect;
- static void errorExit(const char *fmt, ...)
- {
- char buffer[2048];
- va_list arg;
- va_start(arg, fmt);
- vsprintf_s(buffer, fmt, arg);
- va_end(arg);
- MessageBoxA(hWnd, buffer, "Error", MB_ICONERROR | MB_OK);
- exit(1);
- }
- static void check(HRESULT hr)
- {
- if (!FAILED(hr))
- return;
- errorExit("D3D error code %08x\n", hr);
- }
- static LRESULT CALLBACK windowProc(HWND hWnd, UINT msg, WPARAM wparam, LPARAM lparam)
- {
- switch (msg)
- {
- case WM_DESTROY:
- PostQuitMessage(0);
- break;
- case WM_CHAR:
- if (wparam == 27) // escape
- DestroyWindow(hWnd);
- return 0;
- }
- return DefWindowProc(hWnd, msg, wparam, lparam);
- }
- static void createWindow(HINSTANCE hInst)
- {
- WNDCLASS wc = { 0, windowProc, 0, 0, hInst, 0, LoadCursor(0, IDC_ARROW), (HBRUSH) GetStockObject(WHITE_BRUSH), NULL, TEXT("quadtest") };
- if (!RegisterClass(&wc))
- errorExit("Couldn't register class.");
- RECT r = { 0, 0, WIDTH, HEIGHT };
- AdjustWindowRect(&r, WS_OVERLAPPEDWINDOW, FALSE);
- hWnd = CreateWindow(TEXT("quadtest"), TEXT("quadtest"), WS_OVERLAPPEDWINDOW | WS_VISIBLE, CW_USEDEFAULT, CW_USEDEFAULT,
- r.right - r.left, r.bottom - r.top, NULL, NULL, hInst, NULL);
- if (!hWnd)
- errorExit("Error creating window.");
- }
- static void initD3D()
- {
- DXGI_SWAP_CHAIN_DESC sd = {
- {
- WIDTH, HEIGHT, { 60, 1 }, DXGI_FORMAT_R8G8B8A8_UNORM,
- DXGI_MODE_SCANLINE_ORDER_UNSPECIFIED, DXGI_MODE_SCALING_UNSPECIFIED
- },
- { 1, 0 },
- DXGI_USAGE_RENDER_TARGET_OUTPUT,
- 1,
- hWnd,
- TRUE,
- DXGI_SWAP_EFFECT_DISCARD,
- 0
- };
- check(D3D10CreateDeviceAndSwapChain(NULL, D3D10_DRIVER_TYPE_HARDWARE, NULL, 0, D3D10_SDK_VERSION,
- &sd, &swapChain, &device));
- // Create a render target view
- ID3D10Texture2D *buffer;
- check(swapChain->GetBuffer(0, __uuidof(ID3D10Texture2D), (void **)&buffer));
- check(device->CreateRenderTargetView(buffer, NULL, &renderTargetView));
- buffer->Release();
- // Compile the shaders
- ID3D10Blob *errors;
- HRESULT hr = D3DX10CreateEffectFromMemory(shaderCode, strlen(shaderCode), "shader.fx", NULL,
- NULL, "fx_4_0", 0, 0, device, NULL, NULL, &effect, &errors, NULL);
- if (FAILED(hr))
- errorExit("Effect compilation error: %s", errors->GetBufferPointer());
- if (errors)
- errors->Release();
- // Initialize the viewport
- D3D10_VIEWPORT vp = { 0, 0, WIDTH, HEIGHT, 0.0f, 1.0f };
- device->RSSetViewports(1, &vp);
- }
- static void deinitD3D()
- {
- effect->Release();
- renderTargetView->Release();
- device->Release();
- swapChain->Release();
- }
- static double frame(char *desc, int test, int nFrameInTest)
- {
- static const float clearColor[4] = { 0, 0, 0, 0 };
- device->ClearRenderTargetView(renderTargetView, clearColor);
- device->OMSetRenderTargets(1, &renderTargetView, NULL);
- // Prepare for rendering
- int WidthQuads = WIDTH/2 - 1;
- int HeightQuads = HEIGHT/2 - 1;
- int WorkFactor = 100;
- int xOffset = (test & 1), yOffset = (test >> 1) & 1;
- // During startup, don't do any significant work.
- if (test == -1)
- {
- strcpy_s(desc, 256, "Warmup");
- WidthQuads = HeightQuads = 1;
- WorkFactor = 3;
- xOffset = yOffset = 0;
- }
- else
- sprintf_s(desc, 256, "xOffset=%d yOffset=%d", xOffset, yOffset);
- float pixelToNDC[4];
- pixelToNDC[0] = 2.0f / WIDTH;
- pixelToNDC[1] = 2.0f / HEIGHT;
- pixelToNDC[2] = -1.0f + xOffset * pixelToNDC[0];
- pixelToNDC[3] = -1.0f + yOffset * pixelToNDC[1];
- effect->GetVariableByName("pixelToNDC")->AsVector()->SetFloatVector(pixelToNDC);
- effect->GetVariableByName("width")->AsScalar()->SetInt(WidthQuads);
- effect->GetVariableByName("upperBound")->AsScalar()->SetInt(WorkFactor);
- // Actually render
- device->IASetPrimitiveTopology(D3D10_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
- effect->GetTechniqueByIndex(0)->GetPassByIndex(0)->Apply(0);
- device->Draw(WidthQuads * HeightQuads * 3, 0);
- // Present
- swapChain->Present(0, 0);
- // Stats
- static LARGE_INTEGER lastFrame, freq;
- LARGE_INTEGER now;
- double msTime = 0.0;
- QueryPerformanceCounter(&now);
- if (nFrameInTest == 0)
- QueryPerformanceFrequency(&freq);
- else
- msTime = 1000.0 * (now.QuadPart - lastFrame.QuadPart) / freq.QuadPart;
- lastFrame = now;
- return msTime;
- }
- int CALLBACK WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nCmdShow)
- {
- createWindow(hInstance);
- initD3D();
- static const int startTestFrame = 50;
- static const int warmupFramesPerTest = 3;
- static const int measureFramesPerTest = 7;
- static const int numTests = 4;
- static const int totalFramesPerTest = warmupFramesPerTest + measureFramesPerTest;
- double tempTimes[measureFramesPerTest];
- double testResult[numTests];
- char testDesc[numTests][256];
- char textBuffer[1024];
- int frameCounter = 0;
- for (;;)
- {
- MSG msg;
- while (PeekMessage(&msg, 0, 0, 0, PM_REMOVE))
- {
- if (msg.message == WM_QUIT)
- goto Done;
- TranslateMessage(&msg);
- DispatchMessage(&msg);
- }
- if (frameCounter < startTestFrame)
- {
- frame(textBuffer, -1, frameCounter);
- Sleep(10);
- }
- else
- {
- int testFrame = frameCounter - startTestFrame;
- int iTest = testFrame / totalFramesPerTest;
- int frameInTest = testFrame % totalFramesPerTest;
- if (iTest >= numTests)
- {
- // Format results
- char *p = textBuffer;
- char *pEnd = textBuffer + sizeof(textBuffer) / sizeof(*textBuffer);
- for (int i=0; i < numTests; i++)
- p += sprintf_s(p, pEnd - p, "%s: %.1f ms\n", testDesc[i], testResult[i]);
- p += sprintf_s(p, pEnd-p, "\nPress Ctrl+C to copy to clipboard!");
- // Present them in a message box
- MessageBoxA(hWnd, textBuffer, "Test results", MB_ICONINFORMATION | MB_OK);
- DestroyWindow(hWnd);
- }
- else
- {
- double time = frame(testDesc[iTest], iTest, frameInTest);
- if (frameInTest >= warmupFramesPerTest)
- tempTimes[frameInTest - warmupFramesPerTest] = time;
- if (frameInTest == totalFramesPerTest - 1)
- {
- // Find and record the median
- std::nth_element(tempTimes, tempTimes + (measureFramesPerTest/2), tempTimes + measureFramesPerTest);
- testResult[iTest] = tempTimes[measureFramesPerTest/2];
- }
- }
- }
- frameCounter++;
- }
- Done:
- deinitD3D();
- }
Add Comment
Please, Sign In to add comment