pico-8 cartridge // http://www.pico-8.com version 42 __lua__ -- prof: cpu cycle counter v1.4 -- BY PANCELOR --[[------------------------ use this cart to precisely measure code execution time -------------------------------- ★ overview ★ -------------------------------- | tab 0 | usage guide | | tab 1 | (internals) | | tab 2 | your code here | -------------------------------- ----------------------- -- ★ usage guide ★ -- ----------------------- 웃: i have two code snippets; which one is faster? 🐱: edit the last tab with your snippets, then run the cart. it will tell you precisely how much cpu it takes to run each snippet. the results are also copied to your clipboard. 웃: what do the numbers mean? 🐱: the cpu cost is reported as lua and system cycle counts. look up stat(1) and stat(2) for more info. if you're not sure, just look at the first number. lower is faster (better) 웃: why "{locals={9}}" in the example? 🐱: accessing local variables is faster than global vars. so if your test involves local variables, simulate this by passing them in: prof(function(a) sqrt(a) end,{ locals={9} }) /!\ /!\ /!\ /!\ local values from outside the current scope are also slower to access! example: global = 4 local outer = 4 prof(function(x) local _ = x --fast end,function(x) local _ = outer --slow end,function(x) local _ = global --slow end,{ locals={4} }) /!\ /!\ /!\ /!\ 웃: can i do "prof(myfunc)"? 🐱: no, this sometimes gives wrong results! always use inline functions: prof(function() --code for myfunc here end) as an example, "prof(sin)" reports "-2" -- wrong! but "prof(function()sin()end)" correctly reports "4" (see the technical notes at the start of the next tab for a brief explanation. 
technically, "prof(myfunc)" will work if myfunc was made by the user, but you will risk confusing yourself) --------------- ★ method 2 ★ --------------- this cart is based on code by samhocevar: https://www.lexaloffle.com/bbs/?pid=60198#p if you do this method, be very careful with local/global vars. it's very easy to accidentally measure the wrong thing. here's an example of how to measure cycles (ignoring this cart and using the old method) function _init() local a=11.2 -- locals local n=1024 flip() local tot1,sys1=stat(1),stat(2) for i=1,n do end --calibrate local tot2,sys2=stat(1),stat(2) for i=1,n do local _=sqrt(a) end --measure local tot3,sys3=stat(1),stat(2) function cyc(t0,t1,t2) return ((t2-t1)-(t1-t0))*128/n*256/stat(8)*256 end local lua = cyc(tot1-sys1,tot2-sys2,tot3-sys3) local sys = cyc(sys1,sys2,sys3) print(lua.."+"..sys.."="..(lua+sys).." (lua+sys)") end run this once, see the results, then change the "measure" line to some other code you want to measure. note: wrapping the code inside "_init()" is required, otherwise builtin functions like "sin" will be measured wrong. (the reason is explained at the start of the next tab) --------------- ★ method 3 ★ --------------- another way to measure cpu cost is to run something like this: function _draw() cls(1) local x=9 for i=1,1000 do local a=sqrt(x) --snippet1 -- local b=x^0.5 --snippet2 end end while running, press ctrl-p to see the performance monitor. the middle number shows how much of cpu is being used, as a fraction. (0.60 = 60% used) now, change the comments on the two code snippets inside _draw() and re-run. compare the new result with the old to determine which snippet is faster. note: every loop iteration costs an additional 2 cycles, so the ratio of the two fractions will not match the ratio of the execution time of the snippets. but this method can quickly tell you which snippet is faster. 
]]
-->8
--[[ profiler.lua
more info: https://www.lexaloffle.com/bbs/?tid=46117

usage:
  prof(function()
    memcpy(0,0x200,64)
  end,function()
    poke4(0,peek4(0x200,16))
  end)

passing locals:
  prof(
    function(a,b)
      local c=(a+1)*(b+1)-1
    end,
    function(a,b)
      local c=a*b+a+b
    end,
    {locals={3,5}}
  )

getting global/local variables exactly right
is very tricky; you should always use inline
functions like above; if you try e.g. prof(sin)
the results will be wrong.

# minutiae / notes to self:
---------------------------

doing this at top-level is awkward:
  for _=1,n do end -- calibrate
  for _=1,n do sin() end -- measure
b/c sin is secretly local at top-level,
so it gives a misleading result (3 cycles).
do it inside _init instead for a
more representative result (4 cycles).

## separate issue:
------------------

if you call prof(sin), it gives the wrong result (-2 cycles)
because it's comparing sin() against noop() (not truly nothing).
but we want the noop() there for normal inline prof() calls,
to avoid measuring the cost of the indirection
(calling func() from inside prof() is irrelevant to
how cpu-expensive func()'s body is)
]]

-- prof(fn1,fn2,...,fnN,[opts])
--
-- measures every function with prof_one()
-- and builds a report with one line per
-- function, in the order given:
--   <total> (<lua> lua, <sys> sys)
-- the report is sent to the clipboard with
-- printh(msg,"@clip"), then the screen is
-- cleared and the report is passed to stop().
--
-- opts is recognised as a trailing table
-- argument; it is passed on to prof_one().
-- opts.locals: values to pass
-- opts.name: text label
-- opts.n: number of iterations
function prof(...)
  local funcs={...}
  -- a table in the last slot is the options table, not a
  -- function to measure; deli() takes it out of funcs
  local opts=type(funcs[#funcs])=="table" and deli(funcs) or {}

  -- build output string
  local msg=""
  local function log(s)
    msg..=s.."\n"
  end

  if opts.name then
    log("prof: "..opts.name)
  end
  for fn in all(funcs) do
    local dat=prof_one(fn,opts)
    -- right-align the total in a 3-character column.
    -- this pads instead of slicing with sub(..,-3): slicing
    -- keeps only the last 3 characters, so a total such as
    -- 1234 would be reported as 234
    local total=tostr(dat.total)
    while #total<3 do
      total=" "..total
    end
    log(total
      .." ("
      ..dat.lua
      .." lua, "
      ..dat.sys
      .." sys)")
  end

  -- copy to clipboard
  printh(msg,"@clip")

  -- print + pause
  cls()
  stop(msg)
end

-- prof_one(func,[opts])
--
-- calls an empty function n times, then func n times,
-- sampling stat(1) and stat(2) around both loops, and
-- converts the extra time func needed into cpu cycles
-- per call.
--
-- opts.n: how many times to call func (default 0x200)
-- opts.locals: values unpacked as arguments (default {})
--
-- returns a table:
--   total: cycles derived from the stat(1) samples
--   sys:   cycles derived from the stat(2) samples
--   lua:   total-sys
--
-- raises (via assert):
--   "n is too small" when the m round-trip check fails
--   "must use inline functions ..." when func measured
--     faster than the empty function
--   "code is too large or slow ..." when the difference
--     is above the threshold computed in cycles()
function prof_one(func, opts)
  opts = opts or {}
  local n = opts.n or 0x200 --how many times to call func
  local locals = opts.locals or {} --locals to pass func

  -- we want to type
  --   local m = 0x80_0000/n
  -- but 8MHz is too large to fit in a pico-8 number,
  -- so we do (0x80_0000>>16)/(n>>16) instead
  -- (n is always an integer, so n>>16 won't lose any bits)
  local m = 0x80/(n>>16)
  assert(0x80/m << 16 == n, "n is too small") -- make sure m didn't overflow
  local fps = stat(8)

  -- given three timestamps (pre-calibration, middle, post-measurement),
  --   calculate how many more CPU cycles func() took compared to noop()
  -- derivation:
  --   T := ((t2-t1)-(t1-t0))/n (frames)
  --     this is the extra time for each func call, compared to noop
  --     this is measured in #-of-frames -- it will be a small fraction for most ops
  --   F := 1/30 (seconds/frame) (or 1/60 if this test is running at 60fps)
  --     this is just the framerate that the tests run at, not the framerate of your game
  --   M := 256*256*128 = 0x80_0000 = 8MHz (cycles/second)
  --     (PICO-8 runs at 8MHz; see https://www.lexaloffle.com/dl/docs/pico-8_manual.html#CPU)
  --   cycles := T frames * F seconds/frame * M cycles/second
  -- optimization / working around pico-8's fixed point numbers:
  --   T2 := T*n = (t2-t1)-(t1-t0)
  --   M2 := M/n = (M>>16)/(n>>16) := m (e.g. when n is 0x1000, m is 0x800)
  --   cycles := T2*M2*F
  local function cycles(t0,t1,t2)
    local diff = (t2-t1)-(t1-t0)
    local e1 = "must use inline functions -- see usage guide"
    assert(0<=diff,e1)
    -- largest diff that can be multiplied by (m/fps) and
    -- still stay at or below 0x7fff.ffff
    local thresh = 0x7fff.ffff/(m/fps)
    local e2 = "code is too large or slow -- try profiling manually with stat(1)"
    assert(diff<=thresh,e2)
    return diff*(m/fps)
  end

  -- note: the statements below are the measurement itself;
  -- keep their order and the local/non-local status of
  -- noop and func exactly as they are
  local noop = function() end -- this must be local, because func is local
  flip() --avoid flipping mid-measurement
  local atot,asys=stat(1),stat(2)
  for _=1,n do noop(unpack(locals)) end -- calibrate
  local btot,bsys=stat(1),stat(2)
  for _=1,n do func(unpack(locals)) end -- measure
  local ctot,csys=stat(1),stat(2)

  -- gather results
  local tot=cycles(atot,btot,ctot)
  local sys=cycles(asys,bsys,csys)
  return {
    lua=tot-sys,
    sys=sys,
    total=tot,
  }
end
-->8
-- your code here

--edit me:
prof(function(dx,dy)
  local d = max(abs(dx),abs(dy))
  local n = min(abs(dx),abs(dy)) / d
  return sqrt(n*n + 1) * d
end,function(dx, dy)
  local d,n=abs(dx),abs(dy)
  if (d