SHARE
TWEET

asdasdasd

a guest Jan 22nd, 2020 80 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. {
  2.  "cells": [
  3.   {
  4.    "cell_type": "code",
  5.    "execution_count": 7,
  6.    "metadata": {},
  7.    "outputs": [
  8.     {
  9.      "name": "stdout",
  10.      "output_type": "stream",
  11.      "text": [
  12.       "\u001b[32m\u001b[1mActivating\u001b[22m\u001b[39m environment at `~/Project.toml`\n"
  13.      ]
  14.     }
  15.    ],
  16.    "source": [
  17.     "] activate ../"
  18.    ]
  19.   },
  20.   {
  21.    "cell_type": "markdown",
  22.    "metadata": {},
  23.    "source": [
  24.     "## Problem"
  25.    ]
  26.   },
  27.   {
  28.    "cell_type": "markdown",
  29.    "metadata": {},
  30.    "source": [
  31.     "Optimize the following function."
  32.    ]
  33.   },
  34.   {
  35.    "cell_type": "code",
  36.    "execution_count": 10,
  37.    "metadata": {},
  38.    "outputs": [
  39.     {
  40.      "data": {
  41.       "text/plain": [
  42.        "work (generic function with 1 method)"
  43.       ]
  44.      },
  45.      "execution_count": 10,
  46.      "metadata": {},
  47.      "output_type": "execute_result"
  48.     }
  49.    ],
  50.    "source": [
  51.     "function work(mat, s, v, N)  # baseline kernel: overwrites mat[i,j] for i,j in 1:N using s and v\n",
  52.     "    val = 0.0  # placeholder value; reassigned before every use below\n",
  53.     "    for i in 1:N\n",
  54.     "        for j in 1:N  # inner loop runs over the second index j\n",
  55.     "            val = mod(v[i],256);  # depends only on i, yet recomputed for every j\n",
  56.     "            mat[i,j] = s[i,j]*(sin(val)*sin(val)-cos(val)*cos(val));  # s[i,j] * (sin^2(val) - cos^2(val))\n",
  57.     "        end\n",
  58.     "    end;\n",
  59.     "end"
  60.    ]
  61.   },
  62.   {
  63.    "cell_type": "code",
  64.    "execution_count": 11,
  65.    "metadata": {},
  66.    "outputs": [
  67.     {
  68.      "data": {
  69.       "text/plain": [
  70.        "\u001b[32m\u001b[1mTest Passed\u001b[22m\u001b[39m"
  71.       ]
  72.      },
  73.      "execution_count": 11,
  74.      "metadata": {},
  75.      "output_type": "execute_result"
  76.     }
  77.    ],
  78.    "source": [
  79.     "using Test\n",
  80.     "x = rand()\n",
  81.     "@test 1-2*cos(x)*cos(x) ≈ sin(x)*sin(x)-cos(x)*cos(x)\n",
  82.     "@test -cos(2*x) ≈ sin(x)*sin(x)-cos(x)*cos(x)"
  83.    ]
  84.   },
  85.   {
  86.    "cell_type": "code",
  87.    "execution_count": 12,
  88.    "metadata": {},
  89.    "outputs": [
  90.     {
  91.      "data": {
  92.       "text/plain": [
  93.        "opt1 (generic function with 1 method)"
  94.       ]
  95.      },
  96.      "execution_count": 12,
  97.      "metadata": {},
  98.      "output_type": "execute_result"
  99.     }
  100.    ],
  101.    "source": [
  102.     "# Hoist the row factor out of the inner loop and use sin^2(x) - cos^2(x) == -cos(2x).\n",
  103.     "function opt1(mat, s, v, N)\n",
  104.     "    @inbounds for row in 1:N\n",
  105.     "        # the factor depends on the row only, so evaluate it once per row\n",
  106.     "        reduced = mod(v[row], 256)\n",
  107.     "        factor = -cos(2 * reduced)\n",
  108.     "        for col in 1:N\n",
  109.     "            mat[row, col] = s[row, col] * factor\n",
  110.     "        end\n",
  111.     "    end\n",
  112.     "    return mat  # same array that was passed in, now overwritten\n",
  113.     "end"
  114.    ]
  115.   },
  116.   {
  117.    "cell_type": "code",
  118.    "execution_count": 13,
  119.    "metadata": {},
  120.    "outputs": [
  121.     {
  122.      "data": {
  123.       "text/plain": [
  124.        "opt12 (generic function with 1 method)"
  125.       ]
  126.      },
  127.      "execution_count": 13,
  128.      "metadata": {},
  129.      "output_type": "execute_result"
  130.     }
  131.    ],
  132.    "source": [
  133.     "# Precompute the per-row factor -cos(2*mod(v[i],256)) into a table, then scale s row by row.\n",
  134.     "function opt12(mat, s, v, N)\n",
  135.     "    factors = Vector{Float64}(undef, length(v))\n",
  136.     "    @inbounds for k in eachindex(factors)\n",
  137.     "        factors[k] = -cos(2 * mod(v[k], 256))\n",
  138.     "    end\n",
  139.     "    # outer loop over the first index (rows), inner loop over the second (columns)\n",
  140.     "    @inbounds for row in 1:N\n",
  141.     "        for col in 1:N\n",
  142.     "            mat[row, col] = s[row, col] * factors[row]\n",
  143.     "        end\n",
  144.     "    end\n",
  145.     "    return mat\n",
  146.     "end"
  147.    ]
  148.   },
  149.   {
  150.    "cell_type": "code",
  151.    "execution_count": 14,
  152.    "metadata": {},
  153.    "outputs": [
  154.     {
  155.      "data": {
  156.       "text/plain": [
  157.        "opt2 (generic function with 1 method)"
  158.       ]
  159.      },
  160.      "execution_count": 14,
  161.      "metadata": {},
  162.      "output_type": "execute_result"
  163.     }
  164.    ],
  165.    "source": [
  166.     "# Reordered loops: i (first index) is the inner loop, so consecutive writes are adjacent for a column-major Array.\n",
  167.     "function opt2(mat, s, v, N)\n",
  168.     "    @inbounds for j in 1:N\n",
  169.     "        for i in 1:N\n",
  170.     "            # still evaluated once per element; sin^2(x) - cos^2(x) == -cos(2x)\n",
  171.     "            val = -cos(2*mod(v[i],256))\n",
  172.     "            mat[i,j] = s[i,j]*val\n",
  173.     "        end\n",
  174.     "    end\n",
  175.     "    return mat  # return the filled matrix, like opt1 and opt12\n",
  176.     "end"
  177.    ]
  178.   },
  179.   {
  180.    "cell_type": "code",
  181.    "execution_count": 15,
  182.    "metadata": {},
  183.    "outputs": [
  184.     {
  185.      "data": {
  186.       "text/plain": [
  187.        "opt22 (generic function with 1 method)"
  188.       ]
  189.      },
  190.      "execution_count": 15,
  191.      "metadata": {},
  192.      "output_type": "execute_result"
  193.     }
  194.    ],
  195.    "source": [
  196.     "# Reordered loops + precomputed table: val[i] = -cos(2*mod(v[i],256)) is computed once, then reused for every column.\n",
  197.     "function opt22(mat, s, v, N)\n",
  198.     "    val = Vector{Float64}(undef, length(v))\n",
  199.     "    @inbounds for i in eachindex(val)\n",
  200.     "        val[i] = -cos(2*mod(v[i],256))\n",
  201.     "    end\n",
  202.     "    @inbounds for j in 1:N\n",
  203.     "        for i in 1:N  # first index innermost\n",
  204.     "            mat[i,j] = s[i,j]*val[i]\n",
  205.     "        end\n",
  206.     "    end\n",
  207.     "    return mat  # return the filled matrix, like opt1 and opt12\n",
  208.     "end"
  209.    ]
  210.   },
  211.   {
  212.    "cell_type": "markdown",
  213.    "metadata": {},
  214.    "source": [
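  215.     "Further possible optimizations: loop blocking (tiling), and tabulating `val` in a lookup table (for integer `v`, `mod(v[i],256)` takes only 256 distinct values)."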
  216.    ]
  217.   },
  218.   {
  219.    "cell_type": "markdown",
  220.    "metadata": {},
  221.    "source": [
  222.     "## Single thread benchmarks"
  223.    ]
  224.   },
  225.   {
  226.    "cell_type": "code",
  227.    "execution_count": 22,
  228.    "metadata": {},
  229.    "outputs": [],
  230.    "source": [
  231.     "using BenchmarkTools\n",
  232.     "N = 4000\n",
  233.     "mat = zeros(N,N)\n",
  234.     "s = rand(N,N)\n",
  235.     "v = rand(Int, N);"
  236.    ]
  237.   },
  238.   {
  239.    "cell_type": "code",
  240.    "execution_count": 24,
  241.    "metadata": {},
  242.    "outputs": [
  243.     {
  244.      "name": "stdout",
  245.      "output_type": "stream",
  246.      "text": [
  247.       "Performance: 8.832933179745996 MIt/s\n"
  248.      ]
  249.     }
  250.    ],
  251.    "source": [
  252.     "runtime = @belapsed work($mat, $s, $v, $N);\n",
  253.     "perf = N*N*1e-6/runtime # MIt/s\n",
  254.     "println(\"Performance: $perf MIt/s\")"
  255.    ]
  256.   },
  257.   {
  258.    "cell_type": "code",
  259.    "execution_count": 25,
  260.    "metadata": {},
  261.    "outputs": [
  262.     {
  263.      "name": "stdout",
  264.      "output_type": "stream",
  265.      "text": [
  266.       "Performance: 18.954256315496306 MIt/s\n"
  267.      ]
  268.     }
  269.    ],
  270.    "source": [
  271.     "runtime = @belapsed opt1($mat, $s, $v, $N);\n",
  272.     "perf = N*N*1e-6/runtime # MIt/s\n",
  273.     "println(\"Performance: $perf MIt/s\")"
  274.    ]
  275.   },
  276.   {
  277.    "cell_type": "code",
  278.    "execution_count": 26,
  279.    "metadata": {},
  280.    "outputs": [
  281.     {
  282.      "name": "stdout",
  283.      "output_type": "stream",
  284.      "text": [
  285.       "Performance: 18.667771913881896 MIt/s\n"
  286.      ]
  287.     }
  288.    ],
  289.    "source": [
  290.     "runtime = @belapsed opt12($mat, $s, $v, $N);\n",
  291.     "perf = N*N*1e-6/runtime # MIt/s\n",
  292.     "println(\"Performance: $perf MIt/s\")"
  293.    ]
  294.   },
  295.   {
  296.    "cell_type": "code",
  297.    "execution_count": 27,
  298.    "metadata": {},
  299.    "outputs": [
  300.     {
  301.      "name": "stdout",
  302.      "output_type": "stream",
  303.      "text": [
  304.       "Performance: 39.56193729495743 MIt/s\n"
  305.      ]
  306.     }
  307.    ],
  308.    "source": [
  309.     "runtime = @belapsed opt2($mat, $s, $v, $N);\n",
  310.     "perf = N*N*1e-6/runtime # MIt/s\n",
  311.     "println(\"Performance: $perf MIt/s\")"
  312.    ]
  313.   },
  314.   {
  315.    "cell_type": "code",
  316.    "execution_count": 28,
  317.    "metadata": {},
  318.    "outputs": [
  319.     {
  320.      "name": "stdout",
  321.      "output_type": "stream",
  322.      "text": [
  323.       "Performance: 641.2379740326675 MIt/s\n"
  324.      ]
  325.     }
  326.    ],
  327.    "source": [
  328.     "runtime = @belapsed opt22($mat, $s, $v, $N);\n",
  329.     "perf = N*N*1e-6/runtime # MIt/s\n",
  330.     "println(\"Performance: $perf MIt/s\")"
  331.    ]
  332.   },
  333.   {
  334.    "cell_type": "code",
  335.    "execution_count": 29,
  336.    "metadata": {},
  337.    "outputs": [
  338.     {
  339.      "data": {
  340.       "text/plain": [
  341.        "52.957615277130884"
  342.       ]
  343.      },
  344.      "execution_count": 29,
  345.      "metadata": {},
  346.      "output_type": "execute_result"
  347.     }
  348.    ],
  349.    "source": [
  350.     "641.24/8.83  # speedup of opt22 over the baseline work, using the MIt/s values printed above"
  351.    ]
  352.   },
  353.   {
  354.    "cell_type": "markdown",
  355.    "metadata": {},
  356.    "source": [
  357.     "## Multi-threading"
  358.    ]
  359.   },
  360.   {
  361.    "cell_type": "code",
  362.    "execution_count": 34,
  363.    "metadata": {},
  364.    "outputs": [
  365.     {
  366.      "data": {
  367.       "text/plain": [
  368.        "8"
  369.       ]
  370.      },
  371.      "execution_count": 34,
  372.      "metadata": {},
  373.      "output_type": "execute_result"
  374.     }
  375.    ],
  376.    "source": [
  377.     "using Hwloc\n",
  378.     "Hwloc.num_physical_cores()"
  379.    ]
  380.   },
  381.   {
  382.    "cell_type": "code",
  383.    "execution_count": 35,
  384.    "metadata": {},
  385.    "outputs": [
  386.     {
  387.      "data": {
  388.       "text/plain": [
  389.        "8"
  390.       ]
  391.      },
  392.      "execution_count": 35,
  393.      "metadata": {},
  394.      "output_type": "execute_result"
  395.     }
  396.    ],
  397.    "source": [
  398.     "Base.Threads.nthreads()"
  399.    ]
  400.   },
  401.   {
  402.    "cell_type": "code",
  403.    "execution_count": null,
  404.    "metadata": {},
  405.    "outputs": [],
  406.    "source": [
  407.     "function opt22_threaded(mat, s, v, N)  # opt22 with the loop over j split across threads; returns mat\n",
  408.     "    val = Vector{Float64}(undef, length(v))\n",
  409.     "    @inbounds for i in eachindex(val)\n",
  410.     "        val[i] = -cos(2*mod(v[i],256));\n",
  411.     "    end\n",
  412.     "    # @threads wraps the loop body in a closure, so @inbounds has to sit inside it to take effect\n",
  413.     "    Threads.@threads for j in 1:N\n",
  414.     "        @inbounds for i in 1:N\n",
  415.     "            mat[i,j] = s[i,j]*val[i];\n",
  416.     "        end\n",
  417.     "    end;\n",
  418.     "    mat\n",
  419.     "end"
  420.    ]
  421.   },
  422.   {
  423.    "cell_type": "code",
  424.    "execution_count": 30,
  425.    "metadata": {},
  426.    "outputs": [
  427.     {
  428.      "name": "stdout",
  429.      "output_type": "stream",
  430.      "text": [
  431.       "Performance: 1710.8674621986506 MIt/s\n"
  432.      ]
  433.     }
  434.    ],
  435.    "source": [
  436.     "runtime = @belapsed opt22_threaded($mat, $s, $v, $N);\n",
  437.     "perf = N*N*1e-6/runtime # MIt/s\n",
  438.     "println(\"Performance: $perf MIt/s\")\n",
  439.     "# Performance (1 thread):  678.7915812874131 MIt/s\n",
  440.     "# Performance (4 threads): 1172.5993721170087 MIt/s"
  441.    ]
  442.   },
  443.   {
  444.    "cell_type": "markdown",
  445.    "metadata": {},
  446.    "source": [
  447.     "## Maximal performance?"
  448.    ]
  449.   },
  450.   {
  451.    "cell_type": "markdown",
  452.    "metadata": {},
  453.    "source": [
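  454.     "Upper bound from a simple \"roofline model\": assuming the kernel is memory-bound, the best achievable rate is the memory bandwidth divided by the bytes moved per iteration."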
  455.    ]
  456.   },
  457.   {
  458.    "cell_type": "code",
  459.    "execution_count": 84,
  460.    "metadata": {},
  461.    "outputs": [
  462.     {
  463.      "name": "stdout",
  464.      "output_type": "stream",
  465.      "text": [
  466.       "Memory bounded performance: 2.08 GIt/s\n"
  467.      ]
  468.     }
  469.    ],
  470.    "source": [
  471.     "bs = 50 # [GB/s] max memory bandwidth (only an estimate for the unknown CPU model)\n",
  472.     "traffic = 24 # [B/iter] in each iteration we have: LOAD s, LOAD + STORE mat, each contributing 8 B\n",
  473.     "println(\"Memory bounded performance: \", round(bs/traffic, digits=2), \" GIt/s\")"
  474.    ]
  475.   }
  476.  ],
  477.  "metadata": {
  478.   "kernelspec": {
  479.    "display_name": "Julia 1.3.1",
  480.    "language": "julia",
  481.    "name": "julia-1.3"
  482.   },
  483.   "language_info": {
  484.    "file_extension": ".jl",
  485.    "mimetype": "application/julia",
  486.    "name": "julia",
  487.    "version": "1.3.1"
  488.   }
  489.  },
  490.  "nbformat": 4,
  491.  "nbformat_minor": 4
  492. }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Top