Guest User

Untitled

a guest
Jan 21st, 2018
178
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 12.16 KB | None | 0 0
  1. #include <windows.h>
  2. #include <D3D10.h>
  3. #include <D3DX10.h>
  4. #include <stdio.h>
  5. #include <stdlib.h>
  6. #include <stdarg.h>
  7. #include <algorithm>
  8.  
  9. // "Does your GPU have pixel alignment requirements for quads?" tester
  10. // Win32 only, D3D10+ only because it's a quick hack.
  11. //
  12. // The test works like this:
  13. // We fill the screen with triangles laid out in a grid, with a spacing of 2 pixels
  14. // in each direction. Each triangle has the vertex coordinates (in pixels):
  15. // (x0, y0)
  16. // (x0, y0 + 2.5)
  17. // (x0 + 2.5, y0)
  18. // which means each triangle covers exactly 3 pixels. They end up looking like this:
  19. // *
  20. // * *
  21. // where the lower-left star is at the base vertex position (x0, y0). This is all set
  22. // up in the vertex shader. We then try displacing the triangles by one pixel each in
  23. // the x and y directions (as well as both at once). If the GPU can produce rasterized
  24. // quads with arbitrary alignment, this shouldn't make any significant difference. If,
  25. // however, the GPU tiles the render target with "quad footprints" and will rasterize
  26. // any quads touched by a triangle, shifting our test triangles by 1 pixel in either x
  27. // or y will cause 2 quads (instead of 1) to be rasterized per triangle; the case where
  28. // we shift by 1 pixel in both x and y is even worse, since the triangle now covers 3
  29. // quads (each of which has exactly one pixel lit).
  30. //
  31. // The PS just does random busy work (counting primes under a specified threshold) in
  32. // a deliberately inefficient way to make pixels take long to shade so we get long
  33. // enough frame times. The main point here is to just keep the ALUs busy and make sure
  34. // that other potential limiting factors (such as memory bandwidth) are out of the
  35. // equation. WorkFactor (set in the code below) controls how much work is done per pixel.
  36. // You have to set this up properly so the driver doesn't think the GPU is hung working
  37. // on the DrawPrimitive and does a reset. :) Choosing WorkFactor so the first test takes
  38. // about 130ms worked fine for me.
  39. //
  40. // So, for unconstrained quad alignment we expect equal performance in all 4 cases, whereas
  41. // for even-x/y alignment we expect one of the cases (probably the non-shifted one) to be
  42. // fastest, the two cases that hit 2 quads each to be roughly twice as slow, and the last
  43. // case (which hits 3 quads per triangle) to take roughly 3x as long as the fast case.
  44. //
  45. // Test results:
  46. //
  47. // AMD Radeon HD 5770 (WorkFactor = 100) - Evergreen series
  48. // xOffset=0 yOffset=0: 191.7 ms
  49. // xOffset=1 yOffset=0: 383.9 ms
  50. // xOffset=0 yOffset=1: 384.3 ms
  51. // xOffset=1 yOffset=1: 588.5 ms
  52. // -> nearly perfectly linear - even alignment required.
  53. //
  54. // AMD Radeon HD 6900 (WorkFactor = 100) - Northern islands series
  55. // xOffset=0 yOffset=0: 137.7 ms
  56. // xOffset=1 yOffset=0: 206.9 ms
  57. // xOffset=0 yOffset=1: 208.0 ms
  58. // xOffset=1 yOffset=1: 301.8 ms
  59. // -> likely to have even-alignment requirement.
  60. //
  61. // NVidia GeForce 8800 GTX (WorkFactor = 50) - G80 architecture
  62. // xOffset=0 yOffset=0: 127.6 ms
  63. // xOffset=1 yOffset=0: 157.0 ms
  64. // xOffset=0 yOffset=1: 158.5 ms
  65. // xOffset=1 yOffset=1: 252.8 ms
  66. // -> we definitely get some slowdown here, but not as much as expected. The ratios are
  67. // fairly consistent between different work factors, so this isn't just some constant overhead
  68. // that's distorting the relative frame times. I don't have detailed knowledge of how the
  69. // GF 8x00 series handles rasterization and quad dispatch internally, so I can't say for sure
  70. // what's going on here.
  71. //
  72. // NVidia GeForce GTX 465 (WorkFactor = 100) - Fermi architecture
  73. // xOffset=0 yOffset=0: 71.8ms
  74. // xOffset=1 yOffset=0: 139.3ms
  75. // xOffset=0 yOffset=1: 138.3ms
  76. // xOffset=1 yOffset=1: 207.9ms
  77. // -> This one's crystal clear: alignment required. And whatever other bottlenecks besides shaders
  78. // seemed to exist for this test in previous NV architectures seem to have been eliminated.
  79.  
  80. // Shader code
  // Effect (.fx) source, compiled at startup by initD3D().
  //  - cbAll:  pixelToNDC (xy = scale, zw = offset applied to pixel coordinates),
  //            width (triangles per grid row) and upperBound (per-pixel work).
  //  - VS:     derives the vertex position purely from SV_VertexID: triangle
  //            index -> grid cell (2 pixel spacing), vertex 1 is pushed 2.5
  //            pixels in y and vertex 2 is pushed 2.5 pixels in x.
  //  - PS:     busy work; counts primes below upperBound by trial division.
  // Kept as one raw string literal; the text between the delimiters is the
  // exact shader source, so do not re-indent it.
  static const char shaderCode[] = R"fx(cbuffer cbAll : register(cb0) {
 float4 pixelToNDC;
 uint width, upperBound;
};
struct PSIn {
 float4 Pos : SV_POSITION;
};
PSIn VS(uint i : SV_VertexID)
{
 PSIn o;
 uint iTri = i / 3;
 uint iVertInTri = i % 3;
 float2 v;
 v.x = (float) (iTri % width)*2;
 v.y = (float) (iTri / width)*2;
 v.x += (iVertInTri == 2) ? 2.5f : 0.0f;
 v.y += (iVertInTri == 1) ? 2.5f : 0.0f;
 o.Pos.xy = v * pixelToNDC.xy + pixelToNDC.zw;
 o.Pos.z = 0.5f;
 o.Pos.w = 1.0f;
 return o;
}
float4 PS(PSIn x) : SV_Target
{
 uint nPrimes = 0;
 for (uint i=2; i < upperBound; i++)
 {
 uint nFactors = 0;
 for (uint j=2; j <= i; j++)
 if (i % j == 0) nFactors++;
 nPrimes += (nFactors == 1) ? 1 : 0;
 }
 return (nPrimes != 0) ? float4(1,1,1,1) : float4(0,0,0,0);
}
technique10 Render {
 pass P0 {
 SetVertexShader(CompileShader(vs_4_0, VS()));
 SetGeometryShader(NULL);
 SetPixelShader(CompileShader(ps_4_0, PS()));
 }
}
)fx";
  123.  
  124. // ---- App code.
  125.  
  126. #pragma comment(lib, "d3d10.lib")
  127. #pragma comment(lib, "d3dx10.lib")
  128.  
  129. static const int WIDTH = 512, HEIGHT = 512;
  130. static HWND hWnd;
  131.  
  132. static IDXGISwapChain *swapChain;
  133. static ID3D10Device *device;
  134. static ID3D10RenderTargetView *renderTargetView;
  135.  
  136. static ID3D10Effect *effect;
  137.  
  138. static void errorExit(const char *fmt, ...)
  139. {
  140. char buffer[2048];
  141. va_list arg;
  142.  
  143. va_start(arg, fmt);
  144. vsprintf_s(buffer, fmt, arg);
  145. va_end(arg);
  146.  
  147. MessageBoxA(hWnd, buffer, "Error", MB_ICONERROR | MB_OK);
  148. exit(1);
  149. }
  150.  
  151. static void check(HRESULT hr)
  152. {
  153. if (!FAILED(hr))
  154. return;
  155.  
  156. errorExit("D3D error code %08x\n", hr);
  157. }
  158.  
  159. static LRESULT CALLBACK windowProc(HWND hWnd, UINT msg, WPARAM wparam, LPARAM lparam)
  160. {
  161. switch (msg)
  162. {
  163. case WM_DESTROY:
  164. PostQuitMessage(0);
  165. break;
  166.  
  167. case WM_CHAR:
  168. if (wparam == 27) // escape
  169. DestroyWindow(hWnd);
  170. return 0;
  171. }
  172.  
  173. return DefWindowProc(hWnd, msg, wparam, lparam);
  174. }
  175.  
  176. static void createWindow(HINSTANCE hInst)
  177. {
  178. WNDCLASS wc = { 0, windowProc, 0, 0, hInst, 0, LoadCursor(0, IDC_ARROW), (HBRUSH) GetStockObject(WHITE_BRUSH), NULL, TEXT("quadtest") };
  179. if (!RegisterClass(&wc))
  180. errorExit("Couldn't register class.");
  181.  
  182. RECT r = { 0, 0, WIDTH, HEIGHT };
  183. AdjustWindowRect(&r, WS_OVERLAPPEDWINDOW, FALSE);
  184. hWnd = CreateWindow(TEXT("quadtest"), TEXT("quadtest"), WS_OVERLAPPEDWINDOW | WS_VISIBLE, CW_USEDEFAULT, CW_USEDEFAULT,
  185. r.right - r.left, r.bottom - r.top, NULL, NULL, hInst, NULL);
  186. if (!hWnd)
  187. errorExit("Error creating window.");
  188. }
  189.  
  190. static void initD3D()
  191. {
  192. DXGI_SWAP_CHAIN_DESC sd = {
  193. {
  194. WIDTH, HEIGHT, { 60, 1 }, DXGI_FORMAT_R8G8B8A8_UNORM,
  195. DXGI_MODE_SCANLINE_ORDER_UNSPECIFIED, DXGI_MODE_SCALING_UNSPECIFIED
  196. },
  197. { 1, 0 },
  198. DXGI_USAGE_RENDER_TARGET_OUTPUT,
  199. 1,
  200. hWnd,
  201. TRUE,
  202. DXGI_SWAP_EFFECT_DISCARD,
  203. 0
  204. };
  205.  
  206. check(D3D10CreateDeviceAndSwapChain(NULL, D3D10_DRIVER_TYPE_HARDWARE, NULL, 0, D3D10_SDK_VERSION,
  207. &sd, &swapChain, &device));
  208.  
  209. // Create a render target view
  210. ID3D10Texture2D *buffer;
  211. check(swapChain->GetBuffer(0, __uuidof(ID3D10Texture2D), (void **)&buffer));
  212. check(device->CreateRenderTargetView(buffer, NULL, &renderTargetView));
  213. buffer->Release();
  214.  
  215. // Compile the shaders
  216. ID3D10Blob *errors;
  217. HRESULT hr = D3DX10CreateEffectFromMemory(shaderCode, strlen(shaderCode), "shader.fx", NULL,
  218. NULL, "fx_4_0", 0, 0, device, NULL, NULL, &effect, &errors, NULL);
  219. if (FAILED(hr))
  220. errorExit("Effect compilation error: %s", errors->GetBufferPointer());
  221.  
  222. if (errors)
  223. errors->Release();
  224.  
  225. // Initialize the viewport
  226. D3D10_VIEWPORT vp = { 0, 0, WIDTH, HEIGHT, 0.0f, 1.0f };
  227. device->RSSetViewports(1, &vp);
  228. }
  229.  
  230. static void deinitD3D()
  231. {
  232. effect->Release();
  233. renderTargetView->Release();
  234. device->Release();
  235. swapChain->Release();
  236. }
  237.  
  238. static double frame(char *desc, int test, int nFrameInTest)
  239. {
  240. static const float clearColor[4] = { 0, 0, 0, 0 };
  241. device->ClearRenderTargetView(renderTargetView, clearColor);
  242. device->OMSetRenderTargets(1, &renderTargetView, NULL);
  243.  
  244. // Prepare for rendering
  245. int WidthQuads = WIDTH/2 - 1;
  246. int HeightQuads = HEIGHT/2 - 1;
  247. int WorkFactor = 100;
  248. int xOffset = (test & 1), yOffset = (test >> 1) & 1;
  249.  
  250. // During startup, don't do any significant work.
  251. if (test == -1)
  252. {
  253. strcpy_s(desc, 256, "Warmup");
  254. WidthQuads = HeightQuads = 1;
  255. WorkFactor = 3;
  256. xOffset = yOffset = 0;
  257. }
  258. else
  259. sprintf_s(desc, 256, "xOffset=%d yOffset=%d", xOffset, yOffset);
  260.  
  261. float pixelToNDC[4];
  262. pixelToNDC[0] = 2.0f / WIDTH;
  263. pixelToNDC[1] = 2.0f / HEIGHT;
  264. pixelToNDC[2] = -1.0f + xOffset * pixelToNDC[0];
  265. pixelToNDC[3] = -1.0f + yOffset * pixelToNDC[1];
  266. effect->GetVariableByName("pixelToNDC")->AsVector()->SetFloatVector(pixelToNDC);
  267. effect->GetVariableByName("width")->AsScalar()->SetInt(WidthQuads);
  268. effect->GetVariableByName("upperBound")->AsScalar()->SetInt(WorkFactor);
  269.  
  270. // Actually render
  271. device->IASetPrimitiveTopology(D3D10_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
  272. effect->GetTechniqueByIndex(0)->GetPassByIndex(0)->Apply(0);
  273. device->Draw(WidthQuads * HeightQuads * 3, 0);
  274.  
  275. // Present
  276. swapChain->Present(0, 0);
  277.  
  278. // Stats
  279. static LARGE_INTEGER lastFrame, freq;
  280. LARGE_INTEGER now;
  281. double msTime = 0.0;
  282.  
  283. QueryPerformanceCounter(&now);
  284. if (nFrameInTest == 0)
  285. QueryPerformanceFrequency(&freq);
  286. else
  287. msTime = 1000.0 * (now.QuadPart - lastFrame.QuadPart) / freq.QuadPart;
  288.  
  289. lastFrame = now;
  290. return msTime;
  291. }
  292.  
  293. int CALLBACK WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nCmdShow)
  294. {
  295. createWindow(hInstance);
  296. initD3D();
  297.  
  298. static const int startTestFrame = 50;
  299. static const int warmupFramesPerTest = 3;
  300. static const int measureFramesPerTest = 7;
  301. static const int numTests = 4;
  302.  
  303. static const int totalFramesPerTest = warmupFramesPerTest + measureFramesPerTest;
  304. double tempTimes[measureFramesPerTest];
  305. double testResult[numTests];
  306. char testDesc[numTests][256];
  307. char textBuffer[1024];
  308. int frameCounter = 0;
  309.  
  310. for (;;)
  311. {
  312. MSG msg;
  313. while (PeekMessage(&msg, 0, 0, 0, PM_REMOVE))
  314. {
  315. if (msg.message == WM_QUIT)
  316. goto Done;
  317.  
  318. TranslateMessage(&msg);
  319. DispatchMessage(&msg);
  320. }
  321.  
  322. if (frameCounter < startTestFrame)
  323. {
  324. frame(textBuffer, -1, frameCounter);
  325. Sleep(10);
  326. }
  327. else
  328. {
  329. int testFrame = frameCounter - startTestFrame;
  330. int iTest = testFrame / totalFramesPerTest;
  331. int frameInTest = testFrame % totalFramesPerTest;
  332.  
  333. if (iTest >= numTests)
  334. {
  335. // Format results
  336. char *p = textBuffer;
  337. char *pEnd = textBuffer + sizeof(textBuffer) / sizeof(*textBuffer);
  338. for (int i=0; i < numTests; i++)
  339. p += sprintf_s(p, pEnd - p, "%s: %.1f ms\n", testDesc[i], testResult[i]);
  340.  
  341. p += sprintf_s(p, pEnd-p, "\nPress Ctrl+C to copy to clipboard!");
  342.  
  343. // Present them in a message box
  344. MessageBoxA(hWnd, textBuffer, "Test results", MB_ICONINFORMATION | MB_OK);
  345. DestroyWindow(hWnd);
  346. }
  347. else
  348. {
  349. double time = frame(testDesc[iTest], iTest, frameInTest);
  350. if (frameInTest >= warmupFramesPerTest)
  351. tempTimes[frameInTest - warmupFramesPerTest] = time;
  352.  
  353. if (frameInTest == totalFramesPerTest - 1)
  354. {
  355. // Find and record the median
  356. std::nth_element(tempTimes, tempTimes + (measureFramesPerTest/2), tempTimes + measureFramesPerTest);
  357. testResult[iTest] = tempTimes[measureFramesPerTest/2];
  358. }
  359. }
  360. }
  361.  
  362. frameCounter++;
  363. }
  364.  
  365. Done:
  366. deinitD3D();
  367. }
Add Comment
Please, Sign In to add comment