From b919eb824d2f2a37db6044d10fce85dda4370a78 Mon Sep 17 00:00:00 2001 From: Kistaro Windrider Date: Sun, 8 Jun 2025 15:08:58 -0700 Subject: [PATCH] testing vector magnitude calculation --- magnitest.p8 | 96 ++++++++++ profiling.p8 | 496 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 592 insertions(+) create mode 100644 magnitest.p8 create mode 100644 profiling.p8 diff --git a/magnitest.p8 b/magnitest.p8 new file mode 100644 index 0000000..86b6475 --- /dev/null +++ b/magnitest.p8 @@ -0,0 +1,96 @@ +pico-8 cartridge // http://www.pico-8.com +version 42 +__lua__ +--magnitest.p8 +--accuracy and perf tests for +--vector calculation logic + +-- profiler says 67 cycles +-- 39 lua, 28 system +function bbs28999alg(dx, dy) + local d = max(abs(dx),abs(dy)) + local n = min(abs(dx),abs(dy)) / d + return sqrt(n*n + 1) * d +end + +-- profiler says 56 cycles +-- 24 lua, 32 system +function bbs28999algopt(dx, dy) + local d,n=abs(dx),abs(dy) + if (d8 +function _init() + gdx,gdy=0,0 + acc=1 + magnitude = 1 + tresult=0 + bresult=0 + delta=0 +end + +function _update60() + if (btn(0)) gdx -= acc + if (btn(1)) gdx += acc + if (btn(2)) gdy -= acc + if (btn(3)) gdy += acc + if (btn(4) and btn(5)) gdx,gdy=0,0 + + if btn() == 0 or btn(4) then --no input, or button 4: reset accel + acc = 1 + elseif btn(5) then + acc *= 1.1 + else + acc *= 1.02 + end + + local a,b=abs(gdx),abs(gdy) + if (b>a) a,b=b,a + magnitude = 1 + while magnitude < a and magnitude < 16384 do --cap: 16384*2 wraps negative, then 0 (endless loop) + magnitude *= 2 + end + + bresult=bbs28999algopt(gdx,gdy) + tresult=trigalg(gdx,gdy) + delta=tresult-bresult +end + +cols = { + 6,7, + [4]=10, + [8]=9, + [16]=4, + [32]=14, + [64]=15, + [128]=11, + [256]=12, + [512]=13, + [1024]=8, +} + +function _draw() + camera(-63,-63) + line(0,0,gdx/magnitude,gdy/magnitude,cols[magnitude] or 2) + rectfill(0,0,gdx/magnitude,0,5) + rectfill(0,0,0,gdy/magnitude,5) + print("x: "..tostr(gdx).." 
y: "..tostr(gdy).."\nbase: "..tostr(bresult).."\ntrig: "..tostr(tresult).."\n\ndiff: "..tostr(delta),-63,-63) +end + +__gfx__ +00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +00700700000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +00077000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +00077000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +00700700000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 diff --git a/profiling.p8 b/profiling.p8 new file mode 100644 index 0000000..5d5bd6e --- /dev/null +++ b/profiling.p8 @@ -0,0 +1,496 @@ +pico-8 cartridge // http://www.pico-8.com +version 42 +__lua__ +-- prof: cpu cycle counter v1.4 +-- BY PANCELOR +--[[------------------------ + + +use this cart to precisely +measure code execution time + +-------------------------------- + ★ overview ★ +-------------------------------- +| tab 0 | usage guide | +| tab 1 | (internals) | +| tab 2 | your code here | +-------------------------------- + + +----------------------- +-- ★ usage guide ★ -- +----------------------- + + +웃: i have two code snippets; + which one is faster? + +🐱: edit the last tab with your + snippets, then run the cart. + it will tell you precisely + how much cpu it takes to + run each snippet. + + the results are also copied + to your clipboard. + + + +웃: what do the numbers mean? + +🐱: the cpu cost is reported + as lua and system cycle + counts. look up stat(1) + and stat(2) for more info. + + if you're not sure, just + look at the first number. 
+ lower is faster (better) + + + +웃: why "{locals={9}}" + in the example? + +🐱: accessing local variables + is faster than global vars. + + so if your test involves + local variables, simulate + this by passing them in: + + prof(function(a) + sqrt(a) + end,{ locals={9} }) + + /!\ /!\ /!\ /!\ + local values from outside + the current scope are also + slower to access! example: + + global = 4 + local outer = 4 + prof(function(x) + local _ = x --fast + end,function(x) + local _ = outer --slow + end,function(x) + local _ = global --slow + end,{ locals={4} }) + /!\ /!\ /!\ /!\ + + + +웃: can i do "prof(myfunc)"? + +🐱: no, this sometimes gives + wrong results! always use + inline functions: + + prof(function() + --code for myfunc here + end) + + as an example, "prof(sin)" + reports "-2" -- wrong! but + "prof(function()sin()end)" + correctly reports "4" + + (see the technical notes at + the start of the next tab + for a brief explanation. + technically, "prof(myfunc)" + will work if myfunc was made + by the user, but you will + risk confusing yourself) + + + +--------------- + ★ method 2 ★ +--------------- + + + +this cart is based on +code by samhocevar: +https://www.lexaloffle.com/bbs/?pid=60198#p + +if you do this method, be very +careful with local/global vars. +it's very easy to accidentally +measure the wrong thing. + +here's an example of how to +measure cycles (ignoring this +cart and using the old method) + + function _init() + local a=11.2 -- locals + + local n=1024 + flip() + local tot1,sys1=stat(1),stat(2) + for i=1,n do end --calibrate + local tot2,sys2=stat(1),stat(2) + for i=1,n do local _=sqrt(a) end --measure + local tot3,sys3=stat(1),stat(2) + + function cyc(t0,t1,t2) return ((t2-t1)-(t1-t0))*128/n*256/stat(8)*256 end + local lua = cyc(tot1-sys1,tot2-sys2,tot3-sys3) + local sys = cyc(sys1,sys2,sys3) + print(lua.."+"..sys.."="..(lua+sys).." 
(lua+sys)") + end + +run this once, see the results, +then change the "measure" line +to some other code you want +to measure. + +note: wrapping the code inside +"_init()" is required, otherwise +builtin functions like "sin" +will be measured wrong. +(the reason is explained at +the start of the next tab) + + + +--------------- + ★ method 3 ★ +--------------- + + + +another way to measure cpu cost +is to run something like this: + + function _draw() + cls(1) + local x=9 + for i=1,1000 do + local a=sqrt(x) --snippet1 + -- local b=x^0.5 --snippet2 + end + end + +while running, press ctrl-p to +see the performance monitor. +the middle number shows how much +of cpu is being used, as a +fraction. (0.60 = 60% used) + +now, change the comments on the +two code snippets inside _draw() +and re-run. compare the new +result with the old to determine +which snippet is faster. + +note: every loop iteration costs +an additional 2 cycles, so the +ratio of the two fractions will +not match the ratio of the +execution time of the snippets. +but this method can quickly tell +you which snippet is faster. + + + +]] + +-->8 +--[[ profiler.lua +more info: https://www.lexaloffle.com/bbs/?tid=46117 + +usage: + prof(function() + memcpy(0,0x200,64) + end,function() + poke4(0,peek4(0x200,16)) + end) + +passing locals: + prof( + function(a,b) + local c=(a+1)*(b+1)-1 + end, + function(a,b) + local c=a*b+a+b + end, + {locals={3,5}} + ) + +getting global/local variables exactly right +is very tricky; you should always use inline +functions like above; if you try e.g. prof(sin) +the results will be wrong. + + +# minutiae / notes to self: +--------------------------- +doing this at top-level is awkward: + for _=1,n do end -- calibrate + for _=1,n do sin() end -- measure +b/c sin is secretly local at top-level, +so it gives a misleading result (3 cycles). +do it inside _init instead for a +more representative result (4 cycles). 
+ +## separate issue: +------------------ +if you call prof(sin), it gives the wrong result (-2 cycles) because +it's comparing sin() against noop() (not truly nothing). +but we want the noop() there for normal inline prof() calls, +to avoid measuring the cost of the indirection +(calling func() from inside prof() is irrelevant to +how cpu-expensive func()'s body is) +]] + +-- prof(fn1,fn2,...,fnN,[opts]) +-- +-- opts.locals: values to pass +-- opts.name: text label +-- opts.n: number of iterations +function prof(...) + local funcs={...} + local opts=type(funcs[#funcs])=="table" and deli(funcs) or {} + + -- build output string + local msg="" + local function log(s) + msg..=s.."\n" + end + + if opts.name then + log("prof: "..opts.name) + end + for fn in all(funcs) do + local dat=prof_one(fn,opts) + log(sub(" "..dat.total,-3) + .." (" + ..dat.lua + .." lua, " + ..dat.sys + .." sys)") + end + + -- copy to clipboard + printh(msg,"@clip") + -- print + pause + cls() + stop(msg) +end + +function prof_one(func, opts) + opts = opts or {} + local n = opts.n or 0x200 --how many times to call func + local locals = opts.locals or {} --locals to pass func + + -- we want to type + -- local m = 0x80_0000/n + -- but 8MHz is too large to fit in a pico-8 number, + -- so we do (0x80_0000>>16)/(n>>16) instead + -- (n is always an integer, so n>>16 won't lose any bits) + local m = 0x80/(n>>16) + assert(0x80/m << 16 == n, "n is too small") -- make sure m didn't overflow + local fps = stat(8) + + -- given three timestamps (pre-calibration, middle, post-measurement), + -- calculate how many more CPU cycles func() took compared to noop() + -- derivation: + -- T := ((t2-t1)-(t1-t0))/n (frames) + -- this is the extra time for each func call, compared to noop + -- this is measured in #-of-frames -- it will be a small fraction for most ops + -- F := 1/30 (seconds/frame) (or 1/60 if this test is running at 60fps) + -- this is just the framerate that the tests run at, not the framerate of your 
game + -- M := 256*256*128 = 0x80_0000 = 8MHz (cycles/second) + -- (PICO-8 runs at 8MHz; see https://www.lexaloffle.com/dl/docs/pico-8_manual.html#CPU) + -- cycles := T frames * F seconds/frame * M cycles/second + -- optimization / working around pico-8's fixed point numbers: + -- T2 := T*n = (t2-t1)-(t1-t0) + -- M2 := M/n = (M>>16)/(n>>16) := m (e.g. when n is 0x1000, m is 0x800) + -- cycles := T2*M2*F + local function cycles(t0,t1,t2) + local diff = (t2-t1)-(t1-t0) + local e1 = "must use inline functions -- see usage guide" + assert(0<=diff,e1) + local thresh = 0x7fff.ffff/(m/fps) + local e2 = "code is too large or slow -- try profiling manually with stat(1)" + assert(diff<=thresh,e2) + return diff*(m/fps) + end + + local noop = function() end -- this must be local, because func is local + flip() --avoid flipping mid-measurement + local atot,asys=stat(1),stat(2) + for _=1,n do noop(unpack(locals)) end -- calibrate + local btot,bsys=stat(1),stat(2) + for _=1,n do func(unpack(locals)) end -- measure + local ctot,csys=stat(1),stat(2) + + -- gather results + local tot=cycles(atot,btot,ctot) + local sys=cycles(asys,bsys,csys) + return { + lua=tot-sys, + sys=sys, + total=tot, + } +end + +-->8 +-- your code here + +--edit me: +prof(function(dx,dy) + local d = max(abs(dx),abs(dy)) + local n = min(abs(dx),abs(dy)) / d + return sqrt(n*n + 1) * d +end,function(dx, dy) + local d,n=abs(dx),abs(dy) + if (d