Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[32m\u001b[1mActivating\u001b[22m\u001b[39m environment at `~/Project.toml`\n"
- ]
- }
- ],
- "source": [
- "] activate ../"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Problem"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Optimize the following function."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "work (generic function with 1 method)"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Baseline kernel (the function to optimize). For every i, j in 1:N it stores\n",
- "#     mat[i,j] = s[i,j] * (sin(val)^2 - cos(val)^2)   with   val = mod(v[i], 256).\n",
- "# `mat` is overwritten in place; the call itself evaluates to `nothing`, because the\n",
- "# last expression of the body is the `for` loop.\n",
- "function work(mat, s, v, N)\n",
- " val = 0.0\n",
- " for i in 1:N\n",
- " for j in 1:N\n",
- " val = mod(v[i],256); # depends only on i, yet it is re-evaluated for every j\n",
- " mat[i,j] = s[i,j]*(sin(val)*sin(val)-cos(val)*cos(val)); # as written: sin and cos each appear twice per element\n",
- " end\n",
- " end;\n",
- "end"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "\u001b[32m\u001b[1mTest Passed\u001b[22m\u001b[39m"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "using Test\n",
- "# Numerical spot check, at one random x, of the identities used by the analytical optimization:\n",
- "#     sin(x)^2 - cos(x)^2  ==  1 - 2*cos(x)^2  ==  -cos(2x)\n",
- "x = rand()\n",
- "@test sin(x)*sin(x)-cos(x)*cos(x) ≈ 1-2*cos(x)*cos(x)\n",
- "@test sin(x)*sin(x)-cos(x)*cos(x) ≈ -cos(2*x)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "opt1 (generic function with 1 method)"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Step 1: hoist the per-row factor out of the inner loop and replace\n",
- "# sin(x)^2 - cos(x)^2 by the equivalent -cos(2x).\n",
- "# Writes mat[i,j] = s[i,j] * factor(i) for i, j in 1:N and returns `mat`.\n",
- "function opt1(mat, s, v, N)\n",
- "    @inbounds for row in 1:N\n",
- "        # one cosine per row instead of trigonometric calls per element\n",
- "        factor = -cos(2 * mod(v[row], 256))\n",
- "        for col in 1:N\n",
- "            mat[row, col] = s[row, col] * factor\n",
- "        end\n",
- "    end\n",
- "    return mat\n",
- "end"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "opt12 (generic function with 1 method)"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Step 1 variant: like opt1 (hoisting + analytical identity), but all per-row factors\n",
- "# are first tabulated in a freshly allocated Float64 vector.\n",
- "# Writes mat[i,j] = s[i,j] * factors[i] for i, j in 1:N and returns `mat`.\n",
- "function opt12(mat, s, v, N)\n",
- "    # one entry per element of v: -cos(2x) with x = mod(v[k], 256)\n",
- "    factors = Float64[-cos(2 * mod(x, 256)) for x in v]\n",
- "\n",
- "    # same traversal order as opt1: row index outermost, column index innermost\n",
- "    @inbounds for row in 1:N, col in 1:N\n",
- "        mat[row, col] = s[row, col] * factors[row]\n",
- "    end\n",
- "    return mat\n",
- "end"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "opt2 (generic function with 1 method)"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Step 2: reordered loops. The inner loop now runs over the first index i, so for a\n",
- "# column-major Julia `Array` consecutive iterations touch adjacent memory.\n",
- "# The factor -cos(2*mod(v[i],256)) is still evaluated once per element here.\n",
- "# Writes mat[i,j] = s[i,j] * factor(i) for i, j in 1:N and returns `mat`\n",
- "# (same return convention as opt1 / opt12).\n",
- "# `@inbounds` disables bounds checks: the caller must pass an N that fits mat, s and v.\n",
- "function opt2(mat, s, v, N)\n",
- "    @inbounds for j in 1:N\n",
- "        for i in 1:N\n",
- "            factor = -cos(2 * mod(v[i], 256))\n",
- "            mat[i, j] = s[i, j] * factor\n",
- "        end\n",
- "    end\n",
- "    return mat\n",
- "end"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "opt22 (generic function with 1 method)"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Step 2 variant: reordered loops plus a precomputed factor table.\n",
- "# The factors -cos(2*mod(v[k],256)) are computed once per entry of v; the main loop\n",
- "# then only loads s, multiplies, and stores into mat, with the first index innermost.\n",
- "# Writes mat[i,j] = s[i,j] * factors[i] for i, j in 1:N and returns `mat`\n",
- "# (same return convention as opt1 / opt12).\n",
- "# `@inbounds` disables bounds checks: the caller must pass an N that fits mat, s and v.\n",
- "function opt22(mat, s, v, N)\n",
- "    factors = Vector{Float64}(undef, length(v))\n",
- "    @inbounds for k in eachindex(factors)\n",
- "        factors[k] = -cos(2 * mod(v[k], 256))\n",
- "    end\n",
- "\n",
- "    @inbounds for j in 1:N\n",
- "        for i in 1:N\n",
- "            mat[i, j] = s[i, j] * factors[i]\n",
- "        end\n",
- "    end\n",
- "    return mat\n",
- "end"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Possible further optimizations: cache blocking, or precomputing a lookup table for the values of `val`..."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Single thread benchmarks"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [],
- "source": [
- "using BenchmarkTools\n",
- "# Benchmark inputs used by the timing cells of this notebook.\n",
- "N = 4000 # problem size: every kernel loops over i, j in 1:N\n",
- "mat = zeros(N,N) # output buffer, overwritten in place by each kernel (N*N Float64 values = 128 MB)\n",
- "s = rand(N,N) # input matrix of random Float64 values\n",
- "v = rand(Int, N); # N random Int values; the kernels reduce them with mod(., 256)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Performance: 8.832933179745996 MIt/s\n"
- ]
- }
- ],
- "source": [
- "# Throughput of the baseline `work`: N*N element updates divided by the time (in seconds)\n",
- "# returned by @belapsed, scaled to millions of iterations per second.\n",
- "# The `$` prefixes interpolate the global variables into the benchmark expression.\n",
- "runtime = @belapsed work($mat, $s, $v, $N);\n",
- "perf = N*N*1e-6/runtime # MIt/s\n",
- "println(\"Performance: $perf MIt/s\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Performance: 18.954256315496306 MIt/s\n"
- ]
- }
- ],
- "source": [
- "# Throughput of `opt1` (hoisted factor + analytical identity), measured like the baseline:\n",
- "# N*N element updates per @belapsed time, in millions of iterations per second.\n",
- "runtime = @belapsed opt1($mat, $s, $v, $N);\n",
- "perf = N*N*1e-6/runtime # MIt/s\n",
- "println(\"Performance: $perf MIt/s\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Performance: 18.667771913881896 MIt/s\n"
- ]
- }
- ],
- "source": [
- "# Throughput of `opt12` (opt1 plus a precomputed factor vector):\n",
- "# N*N element updates per @belapsed time, in millions of iterations per second.\n",
- "runtime = @belapsed opt12($mat, $s, $v, $N);\n",
- "perf = N*N*1e-6/runtime # MIt/s\n",
- "println(\"Performance: $perf MIt/s\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Performance: 39.56193729495743 MIt/s\n"
- ]
- }
- ],
- "source": [
- "# Throughput of `opt2` (reordered loops, factor still computed per element):\n",
- "# N*N element updates per @belapsed time, in millions of iterations per second.\n",
- "runtime = @belapsed opt2($mat, $s, $v, $N);\n",
- "perf = N*N*1e-6/runtime # MIt/s\n",
- "println(\"Performance: $perf MIt/s\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Performance: 641.2379740326675 MIt/s\n"
- ]
- }
- ],
- "source": [
- "# Throughput of `opt22` (reordered loops + precomputed factor vector):\n",
- "# N*N element updates per @belapsed time, in millions of iterations per second.\n",
- "runtime = @belapsed opt22($mat, $s, $v, $N);\n",
- "perf = N*N*1e-6/runtime # MIt/s\n",
- "println(\"Performance: $perf MIt/s\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "52.957615277130884"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Speed-up estimate: a throughput of about 641 MIt/s (the opt22 result printed above)\n",
- "# divided by a reference throughput, both typed in by hand.\n",
- "# NOTE(review): 21.47 MIt/s does not match the baseline printed above (about 8.83 MIt/s),\n",
- "# and the stored output (about 52.96) is not 641/21.47 (about 29.86). Re-run the\n",
- "# benchmarks and confirm which numbers were intended.\n",
- "641/21.47"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Multi-threading"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "8"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "using Hwloc\n",
- "Hwloc.num_physical_cores()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "8"
- ]
- },
- "execution_count": 35,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Base.Threads.nthreads()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Multi-threaded version of opt22: the columns j of `mat` are distributed over the\n",
- "# available Julia threads. Each iteration j writes only column j of `mat`, so\n",
- "# different iterations never write to the same element.\n",
- "# Writes mat[i,j] = s[i,j] * factors[i] for i, j in 1:N and returns `mat`.\n",
- "# `@inbounds` disables bounds checks: the caller must pass an N that fits mat, s and v.\n",
- "function opt22_threaded(mat, s, v, N)\n",
- "    # serial precomputation of the per-row factors -cos(2x), x = mod(v[k], 256)\n",
- "    factors = Vector{Float64}(undef, length(v))\n",
- "    @inbounds for k in eachindex(factors)\n",
- "        factors[k] = -cos(2 * mod(v[k], 256))\n",
- "    end\n",
- "\n",
- "    # `@inbounds` sits inside the threaded loop: `Threads.@threads` wraps the loop body\n",
- "    # in a closure, and an `@inbounds` written in front of the macro does not reach it.\n",
- "    Threads.@threads for j in 1:N\n",
- "        @inbounds for i in 1:N\n",
- "            mat[i, j] = s[i, j] * factors[i]\n",
- "        end\n",
- "    end\n",
- "    return mat\n",
- "end"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Performance: 1710.8674621986506 MIt/s\n"
- ]
- }
- ],
- "source": [
- "# Throughput of `opt22_threaded`: N*N element updates per @belapsed time,\n",
- "# in millions of iterations per second.\n",
- "runtime = @belapsed opt22_threaded($mat, $s, $v, $N);\n",
- "perf = N*N*1e-6/runtime # MIt/s\n",
- "println(\"Performance: $perf MIt/s\")\n",
- "# Results noted by hand (presumably from earlier runs with other thread counts):\n",
- "# Performance (1 thread): 678.7915812874131 MIt/s\n",
- "# Performance (4 threads): 1172.5993721170087 MIt/s"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Maximal performance?"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\"Roofline model\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 84,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Memory bounded performance: 2.08 GIt/s\n"
- ]
- }
- ],
- "source": [
- "# Memory-bound upper limit on the iteration rate:\n",
- "#     (bandwidth in GB/s) / (bytes moved per iteration) = iterations per second in GIt/s.\n",
- "bs = 50 # [GB/s] max memory bandwidth (only an estimate for the unknown CPU model)\n",
- "traffic = 24 # [B/iter] in each iteration we have: LOAD s, LOAD + STORE mat, each contributing 8 B\n",
- "println(\"Memory bounded performance: \", round(bs/traffic, digits=2), \" GIt/s\")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Julia 1.3.1",
- "language": "julia",
- "name": "julia-1.3"
- },
- "language_info": {
- "file_extension": ".jl",
- "mimetype": "application/julia",
- "name": "julia",
- "version": "1.3.1"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement