F# Performance Tweaking
Jomo Fisher—I’ve enjoyed reading the blog series on F# and game development here. Joh had posted an example of F# of using RK4 and showed some performance numbers comparing F# to C++ and OCaml. Don Syme had been reading too and asked if anyone on the F# team wanted to take a stab to see if they could optimize Joh’s code any further. I thought it would be fun. It turns out there were a few small changes that make a significant performance improvement without sacrificing immutability or other nice functionalness.
The example code in this article was compiled with the 1.9.6.2 version of the F# compiler.
Joh probably already accounted for this, but it’s worth pointing out that there is a huge performance difference between Debug and Release builds for this example. On my machine I could measure a 3x performance difference between the two.
Considering just Release builds, I was able to get another 2-3x improvement by making several simple changes:
- Change a few functions to return struct instead of tuple. For this, I created a ps (e.g. Position-Speed) struct and a sa (e.g. Speed-Acceleration) struct.
- I manually lifted a few computations out of the outer similar loop. I made (euler_implicit_opt accel_opt) and (runge_kutta_4_opt (adapt_opt accel_opt))into values.
- I changed the simulate function to use OptimizedClosures.FastFunc3.
This last part especially needs some explaining. Closure optimization can be used when a function will be called many times. It allows F# to do a bit of expensive work up front that will later be amortized across the many calls to the function. Below is the simulate function using optimized closures. I’ve bolded the affected lines.
let inline simulate_opt intg_func t0 t_fin delta pos0 speed0 =
let oc = OptimizedClosures.FastFunc3<_,_,_,_>.Adapt(intg_func)
let rec repeat t0 pos0 speed0 =
if t0 > t_fin then (pos0, speed0)
else
let t1 = t0 + delta
let ps:ps<_> = oc.Invoke(pos0,speed0,delta)
repeat t1 ps.pos ps.speed
repeat t0 pos0 speed0
Unfortunately, this is not something that you can stick everywhere and just get a benefit (otherwise, the compiler could just inject it for you). You should definitely profile before and after. Here are my final performance numbers:
Optimized---
Euler single: 2.529253
Euler multi: 1.328133
RK4 single: 2.383238
RK4 multi: 1.241124
Baseline---
Euler single: 7.672767
Euler multi: 3.922392
RK4 single: 4.563456
RK4 multi: 2.450245
Here’s the entire code. The optimized functions end in _opt. Thanks Joh for an awesome diversion!
#light
open System
let run_time_func (func: unit -> 'a) =
let time0 = System.DateTime.Now
let res = func ()
let diff0 = System.DateTime.Now - time0
(diff0, res)
let map_parallel func items =
let tasks =
seq {
for i in items -> async {
return (func i)
}
}
Async.Run (Async.Parallel tasks)
let gravity =
-9.81
let inline spring pos =
let k = 100.0 in
-k * pos
let inline drag k pos speed = -k * speed
type ps<'num> = struct
val pos:'num
val speed:'num
new(pos,speed) = {pos=pos;speed=speed}
end
type sa<'num> = struct
val speed:'num
val accel:'num
new(speed,accel) = {speed=speed;accel=accel}
end
let inline euler_implicit accel_func pos speed delta =
let speed2 = speed + delta * accel_func pos speed in
pos + delta * speed2, speed2
let inline euler_implicit_opt accel_func pos speed delta =
let speed2 = speed + delta * accel_func pos speed in
ps(pos + delta * speed2, speed2)
let inline adapt (f: 'num -> 'num -> 'num) =
fun pos speed ->
let accel = f pos speed in
speed, accel
let inline adapt_opt (f: 'num -> 'num -> 'num) =
fun pos speed ->
let accel = f pos speed in
sa(speed, accel)
let inline runge_kutta_4 deriv_func pos speed delta =
let half_delta = 0.5 * delta
let s1, a1 = deriv_func pos speed
let s2, a2 = deriv_func (pos + half_delta * s1) (speed + half_delta * a1)
let s3, a3 = deriv_func (pos + half_delta * s2) (speed + half_delta * a2)
let s4, a4 = deriv_func (pos + delta * s3) (speed + delta * a3)
let pos1 = pos + delta/6.0*(s1 + 2.0*s2 + 2.0*s3 + s4)
let speed1 = speed + delta/6.0*(a1 + 2.0*a2 + 2.0*a3 + a4) in
pos1, speed1
let inline runge_kutta_4_opt deriv_func pos speed delta =
let half_delta = 0.5 * delta
let sa1:sa<_> = deriv_func pos speed
let sa2 = deriv_func (pos + half_delta * sa1.speed) (speed + half_delta * sa1.accel)
let sa3 = deriv_func (pos + half_delta * sa2.speed) (speed + half_delta * sa2.accel)
let sa4 = deriv_func (pos + delta * sa3.speed) (speed + delta * sa3.speed)
let pos1 = pos + delta/6.0*(sa1.speed + 2.0*sa2.speed + 2.0*sa3.speed + sa4.speed)
let speed1 = speed + delta/6.0*(sa1.accel + 2.0*sa2.accel + 2.0*sa3.accel + sa4.accel) in
ps(pos1, speed1)
let inline simulate intg_func t0 t_fin delta pos0 speed0 =
let rec repeat t0 pos0 speed0 =
if t0 > t_fin then (pos0, speed0)
else
let t1 = t0 + delta
let pos1, speed1 = intg_func pos0 speed0 delta
repeat t1 pos1 speed1
repeat t0 pos0 speed0
let inline simulate_opt intg_func t0 t_fin delta pos0 speed0 =
let oc = OptimizedClosures.FastFunc3<_,_,_,_>.Adapt(intg_func)
let rec repeat t0 pos0 speed0 =
if t0 > t_fin then (pos0, speed0)
else
let t1 = t0 + delta
let ps:ps<_> = oc.Invoke(pos0,speed0,delta)
repeat t1 ps.pos ps.speed
repeat t0 pos0 speed0
let initial_states = [ for s in 1..1000 -> (float s * 1.0, 0.0) ]
let t0 = 0.0
let t_fin = 1000.0
let delta = 0.025
let accel pos speed = (drag 1.0) pos speed + spring pos + gravity in
let accel_opt pos speed = (drag 1.0) pos speed + spring pos + gravity in
let ef = (euler_implicit_opt accel_opt)
let rk4f = (runge_kutta_4_opt (adapt_opt accel_opt))
// Changes
// - Change euler_implicit_opt and runge_kutta_4_opt to return a struct (ps)
// - Manually lift (euler_implicit_opt accel) and (runge_kutta_4_opt (adapt accel)) out of loop
// - Change adapt_opt to return struct (sa)
// - Use optimized closure in simulate_opt
printfn "Optimized---"
for i in 0..0 do
let (run_time, res) = run_time_func (fun () -> initial_states |> List.map(fun (x,y) -> simulate_opt ef t0 t_fin (0.25*delta) x y))
printfn "Euler single: %f" run_time.TotalSeconds;
let (run_time, res) = run_time_func (fun () -> initial_states |> map_parallel(fun (x,y) -> simulate_opt ef t0 t_fin (0.25*delta) x y))
printfn "Euler multi: %f" run_time.TotalSeconds;
let (run_time, res) = run_time_func (fun () -> initial_states |> List.map(fun (x,y) -> simulate_opt rk4f t0 t_fin delta x y))
printfn "RK4 single: %f" run_time.TotalSeconds;
let (run_time, res) = run_time_func (fun () -> initial_states |> map_parallel(fun (x,y) -> simulate_opt rk4f t0 t_fin delta x y))
printfn "RK4 multi: %f" run_time.TotalSeconds;
printfn "Baseline---"
for i in 0..0 do
let (run_time, res) = run_time_func (fun () -> initial_states |> List.map(fun (x,y) -> simulate (euler_implicit accel) t0 t_fin (0.25*delta) x y))
printfn "Euler single: %f" run_time.TotalSeconds;
let (run_time, res) = run_time_func (fun () -> initial_states |> map_parallel(fun (x,y) -> simulate (euler_implicit accel) t0 t_fin (0.25*delta) x y))
printfn "Euler multi: %f" run_time.TotalSeconds;
let (run_time, res) = run_time_func (fun () -> initial_states |> List.map(fun (x,y) -> simulate (runge_kutta_4 (adapt accel)) t0 t_fin delta x y))
printfn "RK4 single: %f" run_time.TotalSeconds;
let (run_time, res) = run_time_func (fun () -> initial_states |> map_parallel(fun (x,y) -> simulate (runge_kutta_4 (adapt accel)) t0 t_fin delta x y))
printfn "RK4 multi: %f" run_time.TotalSeconds;
done;
T his posting is provided "AS IS" with no warranties, and confers no rights.
Comments
Anonymous
September 16, 2008
What?! You have a physical constant and no units of measure?! :)Anonymous
September 16, 2008
Units of measure would fit nicely in this code. But, I don't know which version of the compiler Joh is using and they weren't available until 1.9.6.0. They won't affect performance anyway.Anonymous
September 16, 2008
I managed to get another 20% off the RK4 time by singling out the delta/6.0 calculation in runge_kutta_4_opt.Anonymous
September 17, 2008
I'm pleased to see my experiments are attracting attention :) I had somehow missed the availability of structs which made me wonder how fast 3d computation could be achieved in F#. Good catch, brilsmurf. I would not have thought it would make such an impact.Anonymous
September 17, 2008
This post is entirely devoted to shameless plugs and ‘Me Too’ blogging.   The Stack Overflow public