Shootout/Nbody

From HaskellWiki

This is a ShootoutEntry for the N-body benchmark.

Each program should model the orbits of Jovian planets, using the same simple symplectic-integrator - see the Java program.

Correct output N = 1000 is

-0.169075164

-0.169087605

Benchmarks

Finally, we get some decent performance. Timing on Linux/P4, N=1,000,000


Author Time Flags
current 8.561 -O2 -optc-O3 -optc-ffast-math
current-fexcess-precision 6.335 -O2 -fexcess-precision -optc-O3 -optc-ffast-math
chris 4.515 -O2
chris 2.547 -O2 -fexcess-precision
chris 2.363 -O2 -fexcess-precision -optc-O -optc-ffast-math
C 1.174
chris+dons 1.112 -O2 -fexcess-precision -optc-O -optc-ffast-math
stuarray 1.073 -O2 -fexcess-precision -optc-O -optc-ffast-math -funbox-strict-fields
stuarray2 0.912 -O3 -fexcess-precision -optc-O -optc-ffast-math -funbox-strict-fields
C 0.518 -O2

Again with N=5,000,000 (as used in shootout):

Author Time Flags
current 42.685 -O2 -optc-O3 -optc-ffast-math
chris 11.753 -O2 -fexcess-precision -optc-O -optc-ffast-math
C 5.786
chris+dons 5.498 -O2 -fexcess-precision -optc-O -optc-ffast-math
stuarray 5.290 -O2 -fexcess-precision -optc-O -optc-ffast-math -funbox-strict-fields
stuarray2 4.489 -O2 -fexcess-precision -optc-O -optc-ffast-math -funbox-strict-fields
C 2.555 -O2

Hooray!

Need to add my smaller/faster tweak to Joel's entry. -- ChrisKuklewicz

On x86/Linux gcc produces weird numerical errors with the new IOUArray entries (chris and chris+dons), when using the -optc-Ox flags. It cannot be used above -optc-O1

chris ERROR -O2 -optc-O3 -optc-ffast-math
chris ERROR -O2 -optc-O2 -fexcess-precision -optc-ffast-math

Architecture differences

This is very CPU architecture dependent, which means a powerbook G4 is not seeing the same results as linux/x86.

Compiling both with -fglasgow-exts -fexcess-precision -O3 -optc-O3 -optc-ffast-math on a G4:

   $ time ./chris 5000000; time ./dons+chris 5000000
   -0.169075164
   -0.169083134
   real    0m27.794s
   user    0m21.176s
   sys     0m0.268s
   -0.169075164
   -0.169083134
   real    0m25.985s
   user    0m22.357s
   sys     0m0.236s

On linux/x86 {{{dons+chris}}} beats {{{chris}}} by a factor of two and on a G4 {{{dons+chris}}} is a few percent slower. Not too surprising, since the floating point hardware differs. The performanace for the this benchmark (AMD Sempron, Inten P4) is untunable on a G4.

So Don: you can submit this when you are happy with the tweaking since I am blind to x86 performance here. Can you try passing architecture switches to -optc such as -mtune? -- ChrisKuklewicz

Conservative extension

Replace hardcoded sizeof double with Storable instance sizeOf.

{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Contributed by Olof Kraigher and Don Stewart.
--
-- To be compiled with:
--
--  -O2 -fglasgow-exts -funbox-strict-fields -fbang-patterns -optc-O
--
-- Don't enable -optc-mfpmath=sse -optc-msse2, this triggers a gcc bug on x86
--
-- TODO, rewrite in ST.
--

import Foreign
import Foreign.Storable
import Foreign.Marshal.Alloc
import Data.IORef
import Control.Monad
import System
import Text.Printf

main = do
    n <- getArgs >>= readIO.head
    initialize
    offset_momentum
    energy 0 planets >>= printf "%.9f\n"
    replicateM_ n (advance planets)
    energy 0 planets >>= printf "%.9f\n"

offset_momentum = do
    m <- foldr (.+.) (Vec 0 0 0)
             `fmap` (mapM momentum
                   . take (nbodies - 1)
                   . iterate next $ next planets)

    setVec (vel planets) $ (-1/solar_mass) *. m
  where
    momentum p = liftM2 (*.) (mass p) (getVec (vel p))

energy :: Double -> Ptr Double -> IO Double
energy !e p
    | p == end = return e
    | otherwise      = do
        p1 <- getVec (pos p)
        v1 <- getVec (vel p)
        m1 <- mass p
        e  <- energy2 p1 m1 e p2
        energy (e + 0.5 * m1 * magnitude2 v1) p2
    where p2 = next p

energy2 :: Vector3 -> Double -> Double -> Ptr Double -> IO Double
energy2 p1 m1 !e p
    | p  == end = return e
    | otherwise = do
        p2 <- getVec (pos p)
        v2 <- getVec (vel p)
        m2 <- mass p
        let distance = sqrt . magnitude2 $ p1 .-. p2
        energy2 p1 m1 (e - m1 * m2 / distance) (next p)

advance :: Ptr Double -> IO ()
advance p1 = when (p1 /= end) $ do
    pos1 <- getVec (pos p1)
    m1   <- mass p1
    go pos1 m1 p2
    advance  p2
 where
    vel1 = vel p1
    p2   = next p1

    go pos1 m1 p2
         | p2 /= end = do
                pos2 <- getVec (pos p2)
                m2   <- mass p2
                let vel2       = vel p2
                    difference = pos1 .-. pos2
                    distance2  = magnitude2 difference
                    distance   = sqrt distance2
                    magnitude  = delta_t / (distance2 * distance)
                    mass_magn  = magnitude *. difference
                vel1 -= m2 *. mass_magn
                vel2 += m1 *. mass_magn
                go pos1 m1 (next p2)

          | otherwise = do
                v1 <- getVec vel1
                p1 += delta_t *. v1

------------------------------------------------------------------------

planets :: Ptr Double
planets = unsafePerformIO $ mallocBytes (7 * nbodies * sizeOf (undefined :: Double))

nbodies :: Int
nbodies = 5

solar_mass, delta_t, days_per_year :: Double
days_per_year = 365.24
delta_t       = 0.01
solar_mass    = 4 * pi ** 2

initialize = mapM_ newPlanet planets
  where
   dp = days_per_year
   planets =
    [0, 0, 0,
     0, 0, 0,
     1 * solar_mass,
     4.84143144246472090e+00,        (-1.16032004402742839e+00), (-1.03622044471123109e-01),
     1.66007664274403694e-03*dp,     7.69901118419740425e-03*dp, (-6.90460016972063023e-05)*dp,
     9.54791938424326609e-04 * solar_mass,

     8.34336671824457987e+00,        4.12479856412430479e+00,    (-4.03523417114321381e-01),
     (-2.76742510726862411e-03)*dp,  4.99852801234917238e-03*dp, 2.30417297573763929e-05*dp,
     2.85885980666130812e-04 * solar_mass,

     1.28943695621391310e+01,        (-1.51111514016986312e+01), (-2.23307578892655734e-01),
     2.96460137564761618e-03*dp,     2.37847173959480950e-03*dp, (-2.96589568540237556e-05)*dp,
     4.36624404335156298e-05 * solar_mass,

     1.53796971148509165e+01,        (-2.59193146099879641e+01), 1.79258772950371181e-01,
     2.68067772490389322e-03*dp,     1.62824170038242295e-03*dp, (-9.51592254519715870e-05)*dp,
     5.15138902046611451e-05 * solar_mass
    ]

------------------------------------------------------------------------
-- Support for 3 dimensional mutable vectors

cursor :: IORef (Ptr Double)
cursor = unsafePerformIO $ newIORef planets

data Vector3 = Vec !Double !Double !Double

end :: Ptr Double
end = inc planets $! nbodies * 7

next  :: Ptr Double -> Ptr Double
next p = inc p 7

inc :: Ptr Double -> Int -> Ptr Double
inc ptr n = plusPtr ptr (n * sizeOf (undefined :: Double))

newPlanet :: Double -> IO ()
newPlanet d = do
    ptr <- readIORef cursor
    pokeElemOff ptr 0 d
    writeIORef cursor $! inc ptr 1

pos :: Ptr Double -> Ptr Double
pos ptr = ptr

vel :: Ptr Double -> Ptr Double
vel ptr = inc ptr 3

mass :: Ptr Double -> IO Double
mass ptr = peekElemOff ptr 6

------------------------------------------------------------------------

(Vec x y z) .+. (Vec u v w) = Vec (x+u) (y+v) (z+w)

(Vec x y z) .-. (Vec u v w) = Vec (x-u) (y-v) (z-w)

k *. (Vec x y z) = Vec (k*x) (k*y) (k*z) -- allocates

magnitude2 (Vec x y z) = x*x + y*y + z*z

------------------------------------------------------------------------

getVec p = do
    x <- peek p
    y <- peekElemOff p 1
    z <- peekElemOff p 2
    return $! Vec x y z

setVec p (Vec x y z)= do
    poke        p   x
    pokeElemOff p 1 y
    pokeElemOff p 2 z

infix 4 +=
infix 4 -=

v1 += (Vec u v w) = do
    x <- peek v1;          poke        v1   (x+u)
    y <- peekElemOff v1 1; pokeElemOff v1 1 (y+v)
    z <- peekElemOff v1 2; pokeElemOff v1 2 (z+w)

v1 -= (Vec u v w)  = do
    x <- peek v1;          poke        v1   (x-u)
    y <- peekElemOff v1 1; pokeElemOff v1 1 (y-v)
    z <- peekElemOff v1 2; pokeElemOff v1 2 (z-w)


Current entry

Improves on the one below, shorter and at tad faster. I removed the unnecessary inc function and it runs a bit faster. // Olof

{-# OPTIONS -fexcess-precision #-}
-- 
-- The Computer Language Shootout
-- http://shootout.alioth.debian.org/
--        
-- Contributed by Olof Kraigher, with help from Don Stewart.
--     
-- Compile with:
--
--  -funbox-strict-fields -fglasgow-exts -fbang-patterns -O3
--      -optc-O3 -optc-mfpmath=sse -optc-msse2 -optc-march=pentium4 
-- 

import Foreign
import Foreign.Storable
import Foreign.Marshal.Alloc
import Data.IORef
import Control.Monad
import System
import Text.Printf

main = do
    n <- getArgs >>= readIO.head
    initialize
    offset_momentum
    energy 0 planets >>= printf "%.9f\n"
    replicateM_ n (advance planets)
    energy 0 planets >>= printf "%.9f\n"

offset_momentum = do
    m <- foldr (.+.) (Vec 0 0 0)
             `fmap` (mapM momentum
                   . take (nbodies - 1)
                   . iterate next $ next planets)

    setVec (vel planets) $ (-1/solar_mass) *. m
  where
    momentum !p = liftM2 (*.) (mass p) (getVec (vel p))

energy :: Double -> Ptr Double -> IO Double
energy !e !p
    | p == end = return e
    | otherwise      = do
        p1 <- getVec (pos p)
        v1 <- getVec (vel p)
        m1 <- mass p
        e  <- energy2 p1 m1 e p2
        energy (e + 0.5 * m1 * magnitude2 v1) p2
    where p2 = next p

energy2 !p1 !m1 !e !p
    | p  == end = return e
    | otherwise = do
        p2 <- getVec (pos p)
        v2 <- getVec (vel p)
        m2 <- mass p
        let distance = sqrt . magnitude2 $ p1 .-. p2
        energy2 p1 m1 (e - m1 * m2 / distance) (next p)

advance :: Ptr Double -> IO ()
advance !p1 = when (p1 /= end) $ do
    pos1 <- getVec $ pos p1
    m1   <- mass p1
    let go !p2
            | p2 /= end = do
                pos2 <- getVec (pos p2)
                m2   <- mass p2
                let vel2       = vel p2
                    difference = pos1 .-. pos2
                    distance2  = magnitude2 difference
                    distance   = sqrt distance2
                    magnitude  = delta_t / (distance2 * distance)
                    mass_magn  = magnitude *. difference
                vel1 -= m2 *. mass_magn
                vel2 += m1 *. mass_magn
                go (next p2)

            | otherwise = do
                v1 <- getVec vel1
                p1 += delta_t *. v1
    go p2
    advance  p2
  where
    vel1 = vel p1
    p2   = next p1

------------------------------------------------------------------------

planets :: Ptr Double
planets = unsafePerformIO $ mallocBytes (7 * nbodies * 8) -- sizeOf double = 8 

nbodies :: Int
nbodies = 5

solar_mass, delta_t, days_per_year :: Double
days_per_year = 365.24
solar_mass    = 4 * pi ** 2;
delta_t       = 0.01

initialize = mapM_ newPlanet planets
  where
   dp = days_per_year
   planets =
    [0, 0, 0,
     0, 0, 0,
     1 * solar_mass,
     4.84143144246472090e+00,        (-1.16032004402742839e+00), (-1.03622044471123109e-01),
     1.66007664274403694e-03*dp,     7.69901118419740425e-03*dp, (-6.90460016972063023e-05)*dp,
     9.54791938424326609e-04 * solar_mass,

     8.34336671824457987e+00,        4.12479856412430479e+00,    (-4.03523417114321381e-01),
     (-2.76742510726862411e-03)*dp,  4.99852801234917238e-03*dp, 2.30417297573763929e-05*dp,
     2.85885980666130812e-04 * solar_mass,

     1.28943695621391310e+01,        (-1.51111514016986312e+01), (-2.23307578892655734e-01),
     2.96460137564761618e-03*dp,     2.37847173959480950e-03*dp, (-2.96589568540237556e-05)*dp,
     4.36624404335156298e-05 * solar_mass,

     1.53796971148509165e+01,        (-2.59193146099879641e+01), 1.79258772950371181e-01,
     2.68067772490389322e-03*dp,     1.62824170038242295e-03*dp, (-9.51592254519715870e-05)*dp,
     5.15138902046611451e-05 * solar_mass
    ]

------------------------------------------------------------------------
-- Support for 3 dimensional mutable vectors

data Vector3 = Vec !Double !Double !Double

end :: Ptr Double
end = plusPtr planets $ nbodies * 7 * 8

next  :: Ptr Double -> Ptr Double
next p = plusPtr p 56

cursor :: IORef (Ptr Double)
cursor = unsafePerformIO $ newIORef planets

newPlanet :: Double -> IO ()
newPlanet !d = do
    ptr <- readIORef cursor
    pokeElemOff ptr 0 d
    writeIORef cursor (plusPtr ptr 8)

pos :: Ptr Double -> Ptr Double
pos ptr = ptr

vel :: Ptr Double -> Ptr Double
vel ptr = plusPtr ptr 24

mass :: Ptr Double -> IO Double
mass ptr = peekElemOff ptr 6

------------------------------------------------------------------------

(Vec x y z) .+. (Vec u v w) = Vec (x+u) (y+v) (z+w)

(Vec x y z) .-. (Vec u v w) = Vec (x-u) (y-v) (z-w)

k *. (Vec x y z) = Vec (k*x) (k*y) (k*z) -- allocates

magnitude2 (Vec x y z) = x*x + y*y + z*z

------------------------------------------------------------------------

getVec !p = liftM3 Vec (peek p) (f 1) (f 2)
    where f = peekElemOff p

setVec p (Vec x y z)= do
    poke        p   x
    pokeElemOff p 1 y
    pokeElemOff p 2 z

infix 4 +=
infix 4 -=

v1 += (Vec u v w) = do
    x <- peek v1;          poke        v1   (x+u)
    y <- peekElemOff v1 1; pokeElemOff v1 1 (y+v)
    z <- peekElemOff v1 2; pokeElemOff v1 2 (z+w)

v1 -= (Vec u v w)  = do
    x <- peek v1;          poke        v1   (x-u)
    y <- peekElemOff v1 1; pokeElemOff v1 1 (y-v)
    z <- peekElemOff v1 2; pokeElemOff v1 2 (z-w)

Proposed entry

A fast and relatively nice entry that runs 1.5x the fastest c++ entry.

{-# OPTIONS_GHC -fbang-patterns -fexcess-precision#-}

{-
    The Computer Language Shootout
       http://shootout.alioth.debian.org/
       
    Contributed by Olof Kraigher, with help from Don Stewart.
    
    Compile with:
        -fglasgow-exts -fbang-patterns -fexcess-precision -O3 -optc-O2 -optc-mfpmath=sse -optc-msse2 -optc-march=pentium4
-}

import Foreign.Ptr; import Foreign.Storable;    import Foreign.Marshal.Alloc;   import Foreign(unsafePerformIO);
import Data.IORef;  import Control.Monad;       import System;                  import Text.Printf;

data Vector3 = Vec !Double !Double !Double

nr_of_planets :: Int
nr_of_planets = 5

delta_t :: Double
delta_t = 0.01

days_per_year :: Double
days_per_year = 365.24

solar_mass :: Double
solar_mass = 4 * pi ** 2;

planets :: Ptr Double
planets = unsafePerformIO $ mallocBytes (7 * nr_of_planets * 8) -- sizeOf double = 8 

end :: Ptr Double
end = inc planets $ nr_of_planets * 7

cursor :: IORef (Ptr Double)
cursor = unsafePerformIO $ newIORef planets

{-# INLINE inc #-}
inc :: Ptr Double -> Int -> Ptr Double
inc ptr n = plusPtr ptr $ n * 8

{-# INLINE new #-}
new :: Double -> IO ()
new !d = do
    ptr <- readIORef cursor
    pokeElemOff ptr 0 d
    writeIORef cursor (inc ptr 1)

{-# INLINE getVec #-}
getVec :: Ptr Double -> IO Vector3
getVec ptr = 
    let p = peekElemOff ptr in
    liftM3 Vec (p 0) (p 1) (p 2)

{-# INLINE setVec #-}
setVec :: Ptr Double -> Vector3 -> IO ()
setVec ptr (Vec x y z)= do
    pokeElemOff ptr 0 x
    pokeElemOff ptr 1 y
    pokeElemOff ptr 2 z

{-# INLINE pos #-}    
pos :: Ptr Double -> Ptr Double
pos ptr = ptr

{-# INLINE vel #-}
vel :: Ptr Double -> Ptr Double
vel ptr = inc ptr 3

{-# INLINE mass #-}
mass :: Ptr Double -> IO Double
mass ptr = peekElemOff ptr 6

{-# INLINE (.+.) #-}
(.+.) :: Vector3 -> Vector3 -> Vector3
(Vec x y z) .+. (Vec u v w) = Vec (x+u) (y+v) (z+w)

{-# INLINE (.-.) #-}
(.-.) :: Vector3 -> Vector3 -> Vector3
(Vec x y z) .-. (Vec u v w) = Vec (x-u) (y-v) (z-w)

{-# INLINE (*.) #-}
(*.) :: Double -> Vector3 -> Vector3
k *. (Vec x y z) = Vec (k*x) (k*y) (k*z)

{-# INLINE magnitude2 #-}
magnitude2 :: Vector3 -> Double
magnitude2 (Vec x y z) = x*x + y*y + z*z

infix 4 +=
infix 4 -=

{-# INLINE (+=) #-}
(+=) :: Ptr Double -> Vector3 -> IO ()
v1 += v2 = do 
    v1' <- getVec v1
    setVec v1 $ v1' .+. v2

{-# INLINE (-=) #-}
(-=) :: Ptr Double -> Vector3 -> IO ()
v1 -= v2 = do 
    v1' <- getVec v1
    setVec v1 $ v1' .-. v2

initialize :: IO ()
initialize =
    let dp = days_per_year in
    mapM_ new
        [   0, 0, 0,
            0, 0, 0,
            1 * solar_mass, 
            4.84143144246472090e+00,        (-1.16032004402742839e+00), (-1.03622044471123109e-01),
            1.66007664274403694e-03*dp,     7.69901118419740425e-03*dp, (-6.90460016972063023e-05)*dp,
            9.54791938424326609e-04 * solar_mass,
        
            8.34336671824457987e+00,        4.12479856412430479e+00,    (-4.03523417114321381e-01),
            (-2.76742510726862411e-03)*dp,  4.99852801234917238e-03*dp, 2.30417297573763929e-05*dp,
            2.85885980666130812e-04 * solar_mass,
        
            1.28943695621391310e+01,        (-1.51111514016986312e+01), (-2.23307578892655734e-01),
            2.96460137564761618e-03*dp,     2.37847173959480950e-03*dp, (-2.96589568540237556e-05)*dp,
            4.36624404335156298e-05 * solar_mass,
        
            1.53796971148509165e+01,        (-2.59193146099879641e+01), 1.79258772950371181e-01,
            2.68067772490389322e-03*dp,     1.62824170038242295e-03*dp, (-9.51592254519715870e-05)*dp,
            5.15138902046611451e-05 * solar_mass
        ]


advance :: IO ()
advance = do
    let advance' !planet1
            | planet1 /= end = do
                p1 <- getVec $ pos planet1
                m1  <- mass planet1
                let vel1 = vel planet1
                
                let advance'' !planet2
                        | planet2 /= end = do
                            p2 <- getVec $ pos planet2
                            m2 <- mass planet2
                            let vel2 = vel planet2
                            let difference = p1 .-. p2
                            let distance2 = magnitude2 difference
                            let distance = sqrt distance2
                            let magnitude = delta_t / (distance2 * distance)
                            let mass_magn = magnitude *. difference
                            vel1 -= (m2 *. mass_magn)
                            vel2 += (m1 *. mass_magn)
                            advance'' (inc planet2 7)
                            
                        | otherwise = do
                            v1 <- getVec vel1
                            planet1 += delta_t *. v1
                
                let planet1' = (inc planet1 7)
                advance'' planet1'
                advance' planet1'
                
            | otherwise = return ()
                
    advance' planets

energy :: IO Double
energy = do
    e <- newIORef 0
    
    let energy' !planet1
            | planet1 /= end = do
                p1 <- getVec $ pos planet1
                v1 <- getVec $ vel planet1
                m1 <- mass planet1
    
                let energy'' !planet2
                        | planet2 /= end = do
                            p2 <- getVec $ pos planet2
                            v2 <- getVec $ vel planet2
                            m2 <- mass planet2
                            let distance = sqrt $ magnitude2 $ p1 .-. p2
                            modifyIORef e (\x -> x - m1 * m2 / distance)
                            energy'' (inc planet2 7)

                        | otherwise = do
                            modifyIORef e (+ 0.5 * m1 * (magnitude2 v1))
                
                let planet1' = (inc planet1 7)
                energy'' planet1'
                energy' planet1'
    
            | otherwise = return ()
                
    energy' planets
    readIORef e

offset_momentum :: IO ()
offset_momentum = do
    let momentum' !planet = do
        v <- getVec $ vel planet
        m <- mass planet
        return $ m *. v
    
    momentum <- return.(foldl (.+.) (Vec 0 0 0)) =<< 
        (mapM momentum' $ take (nr_of_planets - 1) $ iterate (\x -> inc x 7) (inc planets 7))

    setVec (vel planets) $ (-1/solar_mass) *. momentum

main = do
    n <- getArgs >>= readIO.head
    initialize
    offset_momentum
    energy >>= printf "%.9f\n"
    replicateM_ n advance
    energy >>= printf "%.9f\n"

Proposed entry

Complete rewrite, attention paid down to the asm level on the inner loops. Very much faster.

Note that in GHC -fexcess-precision is ignored on the command line!

Submitted

{-# OPTIONS -fexcess-precision #-}
--
-- The Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Contributed by Don Stewart
--
-- Compile with:
--  -O -fglasgow-exts -fbang-patterns -optc-O3 -optc-march=pentium4
--
import System
import System.IO.Unsafe
import Text.Printf
import Foreign.Marshal.Array
import Foreign
import Control.Monad
import Bits

import GHC.Exts

------------------------------------------------------------------------

main = do
    n <- getArgs >>= readIO . head
    offset_momentum
    printf "%.9f\n" =<< energy
    replicateM_ n advance
    printf "%.9f\n" =<< energy

------------------------------------------------------------------------

type Bodies = Ptr Double

nbodies = 5

bodies :: Bodies
bodies = unsafePerformIO . newArray . concat $
   [planet 1 0 0 0 0 0 0                -- sun
   ,planet 9.54791938424326609e-04      -- jupiter
        4.84143144246472090e+00  (-1.16032004402742839e+00) (-1.03622044471123109e-01)
      ( 1.66007664274403694e-03) ( 7.69901118419740425e-03) (-6.90460016972063023e-05)
   ,planet 2.85885980666130812e-04      -- saturn
        8.34336671824457987e+00    4.12479856412430479e+00  (-4.03523417114321381e-01)
      (-2.76742510726862411e-03) ( 4.99852801234917238e-03) ( 2.30417297573763929e-05)
   ,planet 4.36624404335156298e-05      -- uranus
        1.28943695621391310e+01  (-1.51111514016986312e+01) (-2.23307578892655734e-01)
      ( 2.96460137564761618e-03) ( 2.37847173959480950e-03) (-2.96589568540237556e-05)
   ,planet 5.15138902046611451e-05      -- neptune
        1.53796971148509165e+01  (-2.59193146099879641e+01)   1.79258772950371181e-01
      ( 2.68067772490389322e-03) ( 1.62824170038242295e-03) (-9.51592254519715870e-05)]
  where
    planet m x y z vx vy vz =
        [m*solar_mass,x, y, z, vx*days_per_year, vy*days_per_year, vz*days_per_year, 0]
    -- n.b widht of 8 doubles to make bitwise math nice

-- field access
(W# body) `at` (W# field) = I#
    (word2Int# (field `or#` (body `uncheckedShiftL#` 3#))) -- sizeof Double

put !body !field  = pokeElemOff bodies (body `at` field)
get !body !field  = peekElemOff bodies (body `at` field)

--
-- field offsets in flattened array
--
mass = 0
x    = 1
y    = 2
z    = 3
vx   = 4
vy   = 5
vz   = 6
solar_mass    = 4 * pi * pi
days_per_year = 365.24

------------------------------------------------------------------------
-- Offset momentum

offset_momentum :: IO ()
offset_momentum = do
    (px,py,pz) <- go 0 0 0 0
    put 0 vx (- px / solar_mass)
    put 0 vy (- py / solar_mass)
    put 0 vz (- pz / solar_mass)
  where
    go !i !px !py !pz
        | i >= nbodies = return (px,py,pz)
        | otherwise    = do
            imass <- look mass
            ivx   <- look vx
            ivy   <- look vy
            ivz   <- look vz
            go (i+1) (px + ivx * imass)
                     (py + ivy * imass)
                     (pz + ivz * imass)

        where look = get i

------------------------------------------------------------------------
-- Energy

energy = goI 0 0

goI :: Word -> Double -> IO Double
goI !i !e
    | i >= nbodies = return e
    | otherwise    = do
       im   <- look mass
       ix   <- look x      -- cache the body offset calculation?
       iy   <- look y
       iz   <- look z
       ivx  <- look vx
       ivy  <- look vy
       ivz  <- look vz
       e'   <- goJ ix iy iz im (i+1) (e + 0.5 * im * (ivx * ivx + ivy * ivy + ivz * ivz))
       goI (i+1) e'
    where
       look = get i

goJ !ix !iy !iz !im !j !e
    | j >= nbodies = return e
    | otherwise    = do

        b2m  <- look mass
        b2x  <- look x
        b2y  <- look y
        b2z  <- look z
        let dx   = ix - b2x
            dy   = iy - b2y
            dz   = iz - b2z
            distance = sqrt $! dx * dx + dy * dy + dz * dz
        goJ ix iy iz im (j+1) (e - ((im * b2m) / distance))
    where
        look = get j

------------------------------------------------------------------------
-- Advance

advance :: IO ()
advance = advanceI 0 >> update

advanceI i@(W# ii) = when (i < nbodies) $ do
    bm  <- lookI mass
    bx  <- lookI x      -- cache the body offset calculation!
    by  <- lookI y
    bz  <- lookI z
    bvx <- lookI vx              -- float these out to an accumultor
    bvy <- lookI vy
    bvz <- lookI vz
    advanceJ off bm bx by bz bvx bvy bvz (i+1)
    advanceI (i+1)
  where
    off   = W# (ii `uncheckedShiftL#` 3#)
    lookI x = peekElemOff bodies (fromIntegral $ x .|. off)
    setI  = put i

advanceJ !i_off !bm !bx !by !bz !bvx !bvy !bvz !j@(W# jj)
     | j >= nbodies = do
        let setI x = pokeElemOff bodies (fromIntegral $ x .|. i_off)
        setI vx bvx
        setI vy bvy
        setI vz bvz

     | otherwise    = do

        b2mass <- lookJ mass
        b2x    <- lookJ x
        b2y    <- lookJ y
        b2z    <- lookJ z
        b2vx   <- lookJ vx
        b2vy   <- lookJ vy
        b2vz   <- lookJ vz

        let dx = bx - b2x
            dy = by - b2y
            dz = bz - b2z
            distance = sqrt (dx * dx + dy * dy + dz * dz)
            mag      = 0.01 / (distance * distance * distance)
            massmag  = bm     * mag

        setJ vx (b2vx + dx * massmag)
        setJ vy (b2vy + dy * massmag)
        setJ vz (b2vz + dz * massmag)

        let mass2mag = - b2mass * mag

        advanceJ i_off bm bx by bz (bvx + dx * mass2mag)
                               (bvy + dy * mass2mag)
                               (bvz + dz * mass2mag) (j+1)
   where
    off   = W# (jj `uncheckedShiftL#` 3#)
    lookJ x = peekElemOff bodies (fromIntegral $ x .|. off)
    setJ  x = pokeElemOff bodies (fromIntegral $ x .|. off)

update = forM_ [0..nbodies-1] $ \i -> do
    let look  = get i
        set   = put i
    bx  <- look x
    by  <- look y
    bz  <- look z
    bvx <- look vx
    bvy <- look vy
    bvz <- look vz
    set x (bx + 0.01 * bvx)
    set y (by + 0.01 * bvy)
    set z (bz + 0.01 * bvz)

old entry

Modifies the ghc flags. Submitted.

{-# OPTIONS -O -fglasgow-exts -fbang-patterns -funbox-strict-fields -fexcess-precision -optc-O -optc-march=pentium4 #-}
--
-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Contributed by Christoph Bauer
-- Written in Haskell by Chris Kuklewicz, further tweaks by Don Stewart
--
-- -O2 -optc-O -fglasgow-exts -fexcess-precision -optc-ffast-math
--
-- -optc-O3 cannot be used, as at least one version of gcc miscompiles this program
--

import System
import System.IO.Unsafe
import Monad
import Bits
import List
import Data.Array.IO
import Data.Array.Base  (unsafeRead,unsafeWrite)
import Text.Printf
import GHC.Base
import GHC.Int

default (Int)

main = do n <- getArgs >>= readIO . head
          offsetMomentum
          energy >>= printf "%.9f\n"
          advance n
          energy >>= printf "%.9f\n"

-- Offsets for each field
x = 0; y = 1; z = 2; vx= 3; vy= 4; vz= 5; m = 6

type Bodies = IOUArray Int Double

b :: Bodies = unsafePerformIO $ newListArray (0,pred (length bodiesData)) (bodiesData)
{-# NOINLINE b #-}

set = unsafeWrite b
{-# INLINE set #-}

resetB = zipWithM_ set [0..] bodiesData

-- sun jupiter saturn uranus neptune
-- sun starts at center at rest
bodiesData =  concat [ mkB 1 0 0 0 0 0 0
   ,mkB 9.54791938424326609e-04
        4.84143144246472090e+00  (-1.16032004402742839e+00) (-1.03622044471123109e-01)
      ( 1.66007664274403694e-03) ( 7.69901118419740425e-03) (-6.90460016972063023e-05)
   ,mkB 2.85885980666130812e-04
        8.34336671824457987e+00    4.12479856412430479e+00  (-4.03523417114321381e-01)
      (-2.76742510726862411e-03) ( 4.99852801234917238e-03) ( 2.30417297573763929e-05)
   ,mkB 4.36624404335156298e-05
        1.28943695621391310e+01  (-1.51111514016986312e+01) (-2.23307578892655734e-01)
      ( 2.96460137564761618e-03) ( 2.37847173959480950e-03) (-2.96589568540237556e-05)
   ,mkB 5.15138902046611451e-05
        1.53796971148509165e+01  (-2.59193146099879641e+01)   1.79258772950371181e-01
      ( 2.68067772490389322e-03) ( 1.62824170038242295e-03) (-9.51592254519715870e-05)]

mkB m x y z vx vy vz =
    [x, y, z, vx*days_per_year,vy*days_per_year, vz*days_per_year, m*solar_mass, 0]

solar_mass    = 4 * pi * pi
days_per_year = 365.24
nbodies       = 4 -- that is 0 to 4

mass :: Int -> IO Double
mass i = unsafeRead b (massOffset .|. (shiftl i 3)) where massOffset = 6

-- Give the sun a small velocity so the total momentum of all bodies totals to zero
offsetMomentum :: IO ()
offsetMomentum = do sm <- mass 0
                    let act i = mass i >>= \m -> addScaled 3 (-m/sm) (3 .|. (shiftl i 3))
                    mapM_ act [1..nbodies]

-- Total all kineticE and potentialE
energy = loop 0 0
  where loop !i !e
            | i > nbodies = return e
            | otherwise   = do ke <- kineticE i
                               (loop' (i+1) i $! (e+ke)) >>= loop (i+1)
        loop' !j !i !e
            | j > nbodies = return e
            | otherwise   = do pe <- potentialE i j
                               loop' (j+1) i $! (e + pe)

kineticE !i = do
    m <- mass i
    vx <- unsafeRead b (f vx)
    vy <- unsafeRead b (f vy)
    vz <- unsafeRead b (f vz)
    return $! 0.5 * m * (vx*vx + vy*vy + vz*vz)
  where f = (.|. (shiftl i 3))

potentialE !i !j = do
    m1 <- mass i
    m2 <- mass j
    dx <- liftM2 (-) (unsafeRead b (f x)) (unsafeRead b (g x))
    dy <- liftM2 (-) (unsafeRead b (f y)) (unsafeRead b (g y))
    dz <- liftM2 (-) (unsafeRead b (f z)) (unsafeRead b (g z))
    return $! ((-1)*m1*m2/sqrt (dx*dx + dy*dy + dz*dz))
  where
    f = (.|.) (shiftl j 3)
    g = (.|.) (shiftl i 3)

addScaled !i !a !j = do
    set i1 =<< liftM2 scale (unsafeRead b i1) (unsafeRead b j1)
    set i2 =<< liftM2 scale (unsafeRead b i2) (unsafeRead b j2)
    set i3 =<< liftM2 scale (unsafeRead b i3) (unsafeRead b j3)
  where scale old new = old + a * new
        i1 = i; i2 = succ i1; i3 = succ i2;
        j1 = j; j2 = succ j1; j3 = succ j2;

addScaled3 !i !a !jx !jy !jz = do
    set i1 =<< liftM (scale jx) (unsafeRead b i1)
    set i2 =<< liftM (scale jy) (unsafeRead b i2)
    set i3 =<< liftM (scale jz) (unsafeRead b i3)
  where scale new old = a * new + old
        i1 = i; i2 = succ i1; i3 = succ i2;

-- This is the main code. Essentially all the time is spent here
advance :: Int -> IO ()
advance n = when (n > 0) $ updateVel 0 >> advance (pred n)

  where updateVel i = when (i <= nbodies) $ do
            im  <- unsafeRead b (f m)
            ix  <- unsafeRead b (f x)
            iy  <- unsafeRead b (f y)
            iz  <- unsafeRead b (f z)
            ivx <- unsafeRead b (f vx)
            ivy <- unsafeRead b (f vy)
            ivz <- unsafeRead b (f vz)

            let updateVel' :: Double -> Double -> Double -> Int -> IO ()
                updateVel' !ivx !ivy !ivz !j =
                  if j > nbodies then do
                    unsafeWrite b (f vx) ivx
                    unsafeWrite b (f vy) ivy
                    unsafeWrite b (f vz) ivz
                  else do
                    let g = (.|. shiftl j 3)
                    jm <- unsafeRead b (g m)
                    dx <- liftM (ix-) (unsafeRead b (g x))
                    dy <- liftM (iy-) (unsafeRead b (g y))
                    dz <- liftM (iz-) (unsafeRead b (g z))
                    let distance = sqrt (dx*dx+dy*dy+dz*dz)
                        mag = 0.01 / (distance * distance * distance)
                    addScaled3 (3 .|. (shiftl j 3)) ( im*mag) dx dy dz
                    let a = -jm*mag
                        ivx' = ivx+a*dx
                        ivy' = ivy+a*dy
                        ivz' = ivz+a*dz
                    updateVel' ivx' ivy' ivz' $! (j+1)


            updateVel' ivx ivy ivz $! (i+1)
            addScaled (shiftl i 3) 0.01 (3 .|. (shiftl i 3))
            updateVel (i+1)

          where
            f = (.|. shift i 3)

shiftl = shiftL -- (I# i) (I# j) = I# (word2Int# (uncheckedShiftL# (int2Word# i) j))

Old entry

On a powerbook G4 (GHC 6.4.1, gcc 4.0.1) this is 1.7x time faster than dons+chris. 12.5 seconds versus 21.2 seconds for N=5,000,000 --ChrisKuklewicz Unboxing the strict fields gives a 3x speedup.

--
-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Written by Joel Koerwer.
-- Uses 7 STUArrays for the state.
-- Edited for length by ChrisKuklewicz
--
-- -O3 -optc-O3 -fexcess-precision -funbox-strict-fields -optc-ffast-math
--
-- -funbox-strict-fields is critical
--
-- -optc-ffast-math doesn't speed things up for me.
-- Well, neither does -optc-O3, but that may just be here.
--
import Control.Monad    (liftM2,liftM3,liftM4)
import Control.Monad.ST (ST, runST)
import Data.Array.ST    (STUArray, newListArray)
import Data.Array.Base  (unsafeRead, unsafeWrite)
import System           (getArgs)
import Text.Printf      (printf)

(!) = unsafeRead :: STUArray s Int Double -> Int -> ST s Double 
writeArray = unsafeWrite :: STUArray s Int Double -> Int -> Double -> ST s () 
size :: Int = 5
dt :: Double = 0.01

data PhaseSpace s = PS {ms,rxs,rys,rzs,vxs,vys,vzs :: !(STUArray s Int Double)}

main :: IO ()
main = do n <- getArgs >>= readIO . head
          printf "%.9f\n" $ runST (do state <- initialState
                                      offsetMomentum state
                                      energy state)
          printf "%.9f\n" $ runST (do state <- initialState
                                      offsetMomentum state
                                      energy state
                                      advance n state
                                      energy state)

advance :: Int -> PhaseSpace s -> ST s ()
advance 0 _   = return ()
advance n sys = do kick sys
                   drift sys
                   advance (n-1) sys

readAll (PS m x y z vx vy vz) i = do (x,y,z) <- liftM3 (,,) (x!i)  (y!i) (z!i)
                                     (vx,vy,vz) <- liftM3 (,,) (vx!i) (vy!i) (vz!i)
                                     return (x,y,z,vx,vy,vz)
{-# INLINE readAll #-}

kick ps@(PS m rx ry rz vx vy vz) = outer 0 where
    outer i | i >= size = return ()
            | otherwise = inner (i+1) >> outer (i+1) where
                inner j | j >= size = return ()
                        | otherwise = do
                            (mi,mj) <- liftM2 (,) (m!i) (m!j)
                            (rxi,ryi,rzi,vxi,vyi,vzi) <- readAll ps i
                            (rxj,ryj,rzj,vxj,vyj,vzj) <- readAll ps j
                            let (dx,dy,dz) = (rxi-rxj, ryi-ryj, rzi-rzj)
                                dist2 = dx*dx + dy*dy + dz*dz
                                mag = dt / (dist2 * sqrt dist2)
                            writeArray vx i (vxi - dx*mj*mag)
                            writeArray vy i (vyi - dy*mj*mag)
                            writeArray vz i (vzi - dz*mj*mag)
                            writeArray vx j (vxj + dx*mi*mag)
                            writeArray vy j (vyj + dy*mi*mag)
                            writeArray vz j (vzj + dz*mi*mag)
                            inner (j+1)

drift ps@(PS _ rxs rys rzs _ _ _) = loop 0 where
    loop i | i >= size = return ()
           | otherwise = do (x,y,z,vx,vy,vz) <- readAll ps i
                            writeArray rxs i (x + dt*vx)
                            writeArray rys i (y + dt*vy)
                            writeArray rzs i (z + dt*vz)
                            loop (i+1)

energy sys = liftM2 (+) (kineticEnergy sys) (potentialEnergy sys)

kineticEnergy (PS ms _ _ _ vxs vys vzs) = loop 0 0 where
    loop i accum | i >= size = return (0.5 * accum)
                 | otherwise = do
                        (m,vx,vy,vz) <- liftM4 (,,,) (ms!i) (vxs!i) (vys!i) (vzs!i)
                        let v2 = vx*vx + vy*vy + vz*vz
                        loop (i+1) $! (accum + m*v2)

potentialEnergy (PS ms rxs rys rzs _ _ _) = outer 0 0 where
    outer i a | i >= size = return a
              | otherwise = inner i (i+1) a >>= outer (i+1)
    inner i j a | j >= size = return a
                | otherwise = do
                    (mi,xi,yi,zi) <- liftM4 (,,,) (ms!i) (rxs!i) (rys!i) (rzs!i)
                    (mj,xj,yj,zj) <- liftM4 (,,,) (ms!j) (rxs!j) (rys!j) (rzs!j)
                    let (dx,dy,dz) = (xi-xj, yi-yj, zi-zj)
                        dist = sqrt (dx*dx + dy*dy + dz*dz)
                    inner i (j+1) $! (a - mi*mj/dist)

offsetMomentum (PS ms rxs rys rzs vxs vys vzs) = do
    (px,py,pz) <- calcMomentum 0 (0,0,0)
    (mSun,vSunx,vSuny,vSunz) <- liftM4 (,,,) (ms!0) (vxs!0) (vys!0) (vzs!0)
    writeArray vxs 0 (vSunx - px/mSun)
    writeArray vys 0 (vSuny - py/mSun)
    writeArray vzs 0 (vSunz - pz/mSun)
  where calcMomentum i (px,py,pz) | i >= size = return (px,py,pz)
                                  | otherwise = do
                                        (m,vx,vy,vz) <- liftM4 (,,,) (ms!i) (vxs!i) (vys!i) (vzs!i)
                                        calcMomentum (i+1) $! (px+vx*m,py+vy*m,pz+vz*m)

initialState = do m <- mkSTUArray masses
                  x <- mkSTUArray positionXs
                  y <- mkSTUArray positionYs
                  z <- mkSTUArray positionZs
                  vx <- mkSTUArray velocityXs
                  vy <- mkSTUArray velocityYs
                  vz <- mkSTUArray velocityZs
                  return (PS m x y z vx vy vz)

mkSTUArray = newListArray (0,size-1) :: [Double] -> ST s (STUArray s Int Double)

masses = map (4*pi*pi*) [1, 9.54791938424326609e-04, 2.85885980666130812e-04, 4.36624404335156298e-05, 5.15138902046611451e-05]

positionXs = [0, 4.84143144246472090e+00, 8.34336671824457987e+00, 1.28943695621391310e+01, 1.53796971148509165e+01]

positionYs = [0, (-1.16032004402742839e+00), 4.12479856412430479e+00, (-1.51111514016986312e+01), (-2.59193146099879641e+01)]

positionZs = [0, (-1.03622044471123109e-01), (-4.03523417114321381e-01), (-2.23307578892655734e-01), 1.79258772950371181e-01]

velocityXs = map (365.24*) [0, 1.66007664274403694e-03, (-2.76742510726862411e-03), 2.96460137564761618e-03, 2.68067772490389322e-03]

velocityYs = map (365.24*) [0, 7.69901118419740425e-03, 4.99852801234917238e-03, 2.37847173959480950e-03, 1.62824170038242295e-03]

velocityZs = map (365.24*) [0, (-6.90460016972063023e-05), 2.30417297573763929e-05, (-2.96589568540237556e-05), (-9.51592254519715870e-05)]

An STUArray Version (Joel Koerwer)

This one is slightly faster than chris+dons on my machine. The "proposed entry" has ben submitted, so I pushed it and its associated benchmarks down.

I get a speed up from the STUArray versus the dons+chris version on a Powerbook G4. Don+chris is 4.38 s user time, Joel's is 3.74 s user time (17% faster). This makes sense to me, since it does not need to shiftL and .|. the indices. I was cleaning it up without making it slower and I accidentally made it faster (I think by hoisting dt and size). -- ChrisKuklewicz

--
-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Written by Joel Koerwer.
-- Uses 7 STUArrays for the state.
--
-- Performance is very similar to the IOUArray version of
-- Chris Kuklewicz and Don Stewart. This one is slightly faster
-- on my machine at n=5,000,000 (15 s vs 17 s).
--
-- -O3 -optc-O3 -fexcess-precision -funbox-strict-fields
--
-- -optc-ffast-math doesn't speed things up for me.
-- Well, neither does -optc-O3, but that may just be here.
--


import Control.Monad.ST (ST, runST)
import Data.Array.ST    (STUArray, newListArray)
import Data.Array.Base  (unsafeRead, unsafeWrite)

import System      (getArgs)
import Text.Printf (printf)

readArray :: STUArray s Int Double -> Int -> ST s Double
readArray = unsafeRead

writeArray :: STUArray s Int Double -> Int -> Double -> ST s ()
writeArray = unsafeWrite

data PhaseSpace s = PS {size :: !Int,
                        ms,rxs,rys,rzs,vxs,vys,vzs :: !(STUArray s Int Double)}

advance :: Int -> Double -> PhaseSpace s -> ST s ()
advance 0 _  _   = return ()
advance n dt sys = do
    kick dt sys
    drift dt sys
    advance (n-1) dt sys

kick :: Double -> PhaseSpace s -> ST s ()
kick dt (PS size m rx ry rz vx vy vz) = outer 0
    where
    outer i | i >= size = return ()
            | otherwise = inner (i+1) >> outer (i+1)
                where
                inner j | j >= size = return ()
                        | otherwise = do
                            mi <- readArray m i
                            mj <- readArray m j
                            rxi <- readArray rx i
                            rxj <- readArray rx j
                            ryi <- readArray ry i
                            ryj <- readArray ry j
                            rzi <- readArray rz i
                            rzj <- readArray rz j
                            vxi <- readArray vx i
                            vxj <- readArray vx j
                            vyi <- readArray vy i
                            vyj <- readArray vy j
                            vzi <- readArray vz i
                            vzj <- readArray vz j
                            let dx = rxi-rxj
                                dy = ryi-ryj
                                dz = rzi-rzj
                                dist2 = dx*dx + dy*dy + dz*dz
                                dist = sqrt dist2
                                mag = dt / (dist*dist2)
                            writeArray vx i (vxi - dx*mj*mag)
                            writeArray vy i (vyi - dy*mj*mag)
                            writeArray vz i (vzi - dz*mj*mag)
                            writeArray vx j (vxj + dx*mi*mag)
                            writeArray vy j (vyj + dy*mi*mag)
                            writeArray vz j (vzj + dz*mi*mag)
                            inner (j+1)

drift :: Double -> PhaseSpace s -> ST s ()
drift dt (PS size _ rxs rys rzs vxs vys vzs) = loop 0
    where
    loop i | i >= size = return ()
           | otherwise = do
                x <- readArray rxs i
                y <- readArray rys i
                z <- readArray rzs i
                vx <- readArray vxs i
                vy <- readArray vys i
                vz <- readArray vzs i
                writeArray rxs i (x + dt*vx)
                writeArray rys i (y + dt*vy)
                writeArray rzs i (z + dt*vz)
                loop (i+1)

energy :: PhaseSpace s -> ST s Double
energy sys = do t <- kineticEnergy sys
                u <- potentialEnergy sys
                return (t+u)

kineticEnergy :: PhaseSpace s -> ST s Double
kineticEnergy (PS size ms _ _ _ vxs vys vzs) = loop 0 0
    where
    loop i accum | i >= size = return (0.5 * accum)
                 | otherwise = do
                        m <- readArray ms i
                        vx <- readArray vxs i
                        vy <- readArray vys i
                        vz <- readArray vzs i
                        let v2 = vx*vx + vy*vy + vz*vz
                        loop (i+1) (accum + m*v2)

potentialEnergy :: PhaseSpace s -> ST s Double
potentialEnergy (PS size ms rxs rys rzs _ _ _) = outer 0 0
    where
    outer i a | i >= size = return a
              | otherwise = inner i (i+1) a >>= outer (i+1)

    inner i j a | j >= size = return a
                | otherwise = do
                    mi <- readArray ms i
                    mj <- readArray ms j
                    xi <- readArray rxs i
                    xj <- readArray rxs j
                    yi <- readArray rys i
                    yj <- readArray rys j
                    zi <- readArray rzs i
                    zj <- readArray rzs j
                    let dx = xi-xj
                        dy = yi-yj
                        dz = zi-zj
                        dist = sqrt (dx*dx + dy*dy + dz*dz)
                    inner i (j+1) (a - mi*mj/dist)


offsetMomentum :: PhaseSpace s -> ST s ()
offsetMomentum (PS size ms rxs rys rzs vxs vys vzs) = do
    (px,py,pz) <- calcMomentum 0 (0,0,0)
    mSun <- readArray ms 0
    vSunx <- readArray vxs 0
    vSuny <- readArray vys 0
    vSunz <- readArray vzs 0
    writeArray vxs 0 (vSunx - px/mSun)
    writeArray vys 0 (vSuny - py/mSun)
    writeArray vzs 0 (vSunz - pz/mSun)
        where
        calcMomentum i (px,py,pz) | i >= size = return (px,py,pz)
                                  | otherwise = do
                                        m <- readArray ms i
                                        vx <- readArray vxs i
                                        vy <- readArray vys i
                                        vz <- readArray vzs i
                                        let px' = px + vx*m
                                            py' = py + vy*m
                                            pz' = pz + vz*m
                                        calcMomentum (i+1) (px',py',pz')

main = do
    n <- getArgs >>= readIO . head
    printf "%.9f\n" $ runST (do state <- initialState
                                offsetMomentum state
                                energy state)

    printf "%.9f\n" $ runST (do state <- initialState
                                offsetMomentum state
                                advance n 0.01 state
                                energy state)

initialState :: ST s (PhaseSpace s)
initialState = do
        m <- mkSTUArray masses
        x <- mkSTUArray positionXs
        y <- mkSTUArray positionYs
        z <- mkSTUArray positionZs
        vx <- mkSTUArray velocityXs
        vy <- mkSTUArray velocityYs
        vz <- mkSTUArray velocityZs
        return (PS 5 m x y z vx vy vz)

mkSTUArray :: [Double] -> ST s (STUArray s Int Double)
mkSTUArray = newListArray (0,4)

masses = map (4*pi*pi*) [1, 9.54791938424326609e-04, 2.85885980666130812e-04, 4.36624404335156298e-05, 5.15138902046611451e-05]

positionXs = [0, 4.84143144246472090e+00, 8.34336671824457987e+00, 1.28943695621391310e+01, 1.53796971148509165e+01]

positionYs = [0, (-1.16032004402742839e+00), 4.12479856412430479e+00, (-1.51111514016986312e+01), (-2.59193146099879641e+01)]

positionZs = [0, (-1.03622044471123109e-01), (-4.03523417114321381e-01), (-2.23307578892655734e-01), 1.79258772950371181e-01]

velocityXs = map (365.24*) [0, 1.66007664274403694e-03, (-2.76742510726862411e-03), 2.96460137564761618e-03, 2.68067772490389322e-03]

velocityYs = map (365.24*) [0, 7.69901118419740425e-03, 4.99852801234917238e-03, 2.37847173959480950e-03, 1.62824170038242295e-03]

velocityZs = map (365.24*) [0, (-6.90460016972063023e-05), 2.30417297573763929e-05, (-2.96589568540237556e-05), (-9.51592254519715870e-05)]


Proposed Entry (dons+chris)

A carefully tuned version of Chris' breakthrough entry. Runs faster than unoptimised gcc C. Be sure not to use -optc-O3 on linux, or badness may ensue. The OPTIONS pragma passes -O to gcc last, so it will override any other -O args to gcc passed on the cmd line.

{-# OPTIONS -optc-O #-}
--
-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Contributed by Christoph Bauer
-- Written in Haskell by Chris Kuklewicz, further tweaks by Don Stewart
--
-- -O2 -optc-O -fglasgow-exts -fexcess-precision -optc-ffast-math
--
-- -optc-O3 cannot be used, as at least one version of gcc miscompiles this program
--

import System
import System.IO.Unsafe
import Monad
import Data.Bits
import Data.List
import Data.Array.IO
import Data.Array.Base(unsafeRead,unsafeWrite)
import Text.Printf

default (Int)

main = do n <- getArgs >>= readIO . head
          offsetMomentum
          energy >>= printf "%.9f\n"
          advance n
          energy >>= printf "%.9f\n"

-- Offsets for each field
x = 0; y = 1; z = 2; vx= 3; vy= 4; vz= 5; m = 6

type Bodies = IOUArray Int Double

b :: Bodies = unsafePerformIO $ newListArray (0,pred (length bodiesData)) (bodiesData)
{-# NOINLINE b #-}

set = unsafeWrite b
{-# INLINE set #-}

resetB = zipWithM_ set [0..] bodiesData

-- sun jupiter saturn uranus neptune
-- sun starts at center at rest
bodiesData =  concat [ mkB 1 0 0 0 0 0 0
   ,mkB 9.54791938424326609e-04
        4.84143144246472090e+00  (-1.16032004402742839e+00) (-1.03622044471123109e-01)
      ( 1.66007664274403694e-03) ( 7.69901118419740425e-03) (-6.90460016972063023e-05)
   ,mkB 2.85885980666130812e-04
        8.34336671824457987e+00    4.12479856412430479e+00  (-4.03523417114321381e-01)
      (-2.76742510726862411e-03) ( 4.99852801234917238e-03) ( 2.30417297573763929e-05)
   ,mkB 4.36624404335156298e-05
        1.28943695621391310e+01  (-1.51111514016986312e+01) (-2.23307578892655734e-01)
      ( 2.96460137564761618e-03) ( 2.37847173959480950e-03) (-2.96589568540237556e-05)
   ,mkB 5.15138902046611451e-05
        1.53796971148509165e+01  (-2.59193146099879641e+01)   1.79258772950371181e-01
      ( 2.68067772490389322e-03) ( 1.62824170038242295e-03) (-9.51592254519715870e-05)]

mkB m x y z vx vy vz = 
    [x, y, z, vx*days_per_year,vy*days_per_year, vz*days_per_year, m*solar_mass, 0]

solar_mass    = 4 * pi * pi
days_per_year = 365.24
nbodies       = 4 -- that is 0 to 4

mass i = unsafeRead b (massOffset .|. (shiftL i 3)) where massOffset = 6

-- Give the sun a small velocity so the total momentum of all bodies totals to zero
offsetMomentum :: IO ()
offsetMomentum = do sm <- mass 0
                    let act i = mass i >>= \m -> addScaled 3 (-m/sm) (3 .|. (shiftL i 3))
                    mapM_ act [1..nbodies]

-- Total all kineticE and potentialE
energy = loop 0 0
  where loop i e | i > nbodies = return e
                 | otherwise   = do ke <- kineticE i
                                    (loop' (i+1) i $! (e+ke)) >>= loop (i+1)
        loop' j i e | j > nbodies = return e
                    | otherwise   = do pe <- potentialE i j
                                       loop' (j+1) i $! (e + pe)

kineticE i = let i' = (.|. (shiftL i 3))
             in do m <- mass i
                   vx <- unsafeRead b (i' vx)
                   vy <- unsafeRead b (i' vy)
                   vz <- unsafeRead b (i' vz)
                   return $! 0.5 * m * (vx*vx + vy*vy + vz*vz)

potentialE i j = do
    m1 <- mass i
    m2 <- mass j
    let (i', j') = ((.|. (shiftL i 3)), (.|. (shiftL j 3)))
    dx <- liftM2 (-) (unsafeRead b (i' x)) (unsafeRead b (j' x))
    dy <- liftM2 (-) (unsafeRead b (i' y)) (unsafeRead b (j' y))
    dz <- liftM2 (-) (unsafeRead b (i' z)) (unsafeRead b (j' z))
    return $! ((-1)*m1*m2/sqrt (dx*dx + dy*dy + dz*dz))

addScaled i a j | i `seq` a `seq` j `seq` False = undefined -- stricitfy
addScaled i a j = do set i1 =<< liftM2 scale (unsafeRead b i1) (unsafeRead b j1)
                     set i2 =<< liftM2 scale (unsafeRead b i2) (unsafeRead b j2)
                     set i3 =<< liftM2 scale (unsafeRead b i3) (unsafeRead b j3)
    where scale old new = old + a * new
          i1 = i; i2 = succ i1; i3 = succ i2;
          j1 = j; j2 = succ j1; j3 = succ j2;

addScaled3 i a jx jy jz | i `seq` a `seq` jx `seq` jy `seq` jz `seq` False = undefined
addScaled3 i a jx jy jz = do set i1 =<< liftM (scale jx) (unsafeRead b i1)
                             set i2 =<< liftM (scale jy) (unsafeRead b i2)
                             set i3 =<< liftM (scale jz) (unsafeRead b i3)
    where scale new old = a * new + old
          i1 = i; i2 = succ i1; i3 = succ i2;

-- This is the main code. Essentially all the time is spent here
advance n = when (n > 0) $ updateVel 0 >> advance (pred n)

  where updateVel i = when (i <= nbodies) $ do
            let i' = (.|. shift i 3)
            im  <- unsafeRead b (i' m)
            ix  <- unsafeRead b (i' x)
            iy  <- unsafeRead b (i' y)
            iz  <- unsafeRead b (i' z)
            ivx <- unsafeRead b (i' vx)
            ivy <- unsafeRead b (i' vy)
            ivz <- unsafeRead b (i' vz)

            let updateVel' ivx ivy ivz j =  ivx `seq` ivy `seq` ivz `seq`
                  if j > nbodies then do
                    unsafeWrite b (i' vx) ivx
                    unsafeWrite b (i' vy) ivy
                    unsafeWrite b (i' vz) ivz
                  else do
                    let j' = (.|. shiftL j 3)
                    jm <- unsafeRead b (j' m)
                    dx <- liftM (ix-) (unsafeRead b (j' x))
                    dy <- liftM (iy-) (unsafeRead b (j' y))
                    dz <- liftM (iz-) (unsafeRead b (j' z))
                    let distance = sqrt (dx*dx+dy*dy+dz*dz)
                        mag = 0.01 / (distance * distance * distance)
                    addScaled3 (3 .|. (shiftL j 3)) ( im*mag) dx dy dz
                    let a = -jm*mag
                        ivx' = ivx+a*dx
                        ivy' = ivy+a*dy
                        ivz' = ivz+a*dz
                    updateVel' ivx' ivy' ivz' $! (j+1)

            updateVel' ivx ivy ivz $! (i+1)
            addScaled (shiftL i 3) 0.01 (3 .|. (shiftL i 3))
            updateVel (i+1)

Improved entry (chris)

I made it go faster! By a factor of 2.5, which is not closing much of the gap. But any speed improvement is a breakthrough at this point. Hopefully someone else can see why it is faster and speed it up further. I am just tweaking things at random now.

All the n-body data (7 doubles: mass x y z vx vy vz) is put into a single IOUArray Int Double. Each body is offset by 8 indices, so there is a padding of 1 double to get an alignment of 8. This let me calculate the offset of each double with {{{shiftL _ 3}}} and . The array itself is a global variable.

--
-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Contributed by Christoph Bauer
-- Rewritten in Haskell from C by Don Stewart
-- Rewritten again by Chris Kuklewicz
--
-- -O2 -optc-O3 -fglasgow-exts -fexcess-precision -optc-ffast-math
--
-- This code actually runs faster then the previous entry!
--
import Debug.Trace

import System
import System.IO.Unsafe
import Monad
import Data.Bits
import Data.List
import Data.Array.MArray
import Data.Array.IO
import Data.Array.Base(unsafeRead,unsafeWrite)
import Text.Printf
import Data.IORef

default(Int)

main = do
    args <- getArgs
    let n = if null args then 1000000 else read ( head args )
    offsetMomentum
    energy >>= printf "%.9f\n"
    advance n
    energy >>= printf "%.9f\n" 

-- Offsets for each field
x = 0
y = 1
z = 2
vx= 3
vy= 4
vz= 5
m = 6

type Bodies = IOUArray Int Double

{-# NOINLINE b #-}
b :: Bodies = unsafePerformIO $ newListArray (0,pred (length bodiesData)) (bodiesData)
get = unsafeRead b
set = unsafeWrite b
resetB = zipWithM_ set [0..] bodiesData

-- sun jupiter saturn uranus neptune
-- sun starts at center at rest
bodiesData =  concat [ mkB 1 0 0 0 0 0 0
   ,mkB 9.54791938424326609e-04 
        4.84143144246472090e+00  (-1.16032004402742839e+00) (-1.03622044471123109e-01)
      ( 1.66007664274403694e-03) ( 7.69901118419740425e-03) (-6.90460016972063023e-05)
   ,mkB 2.85885980666130812e-04
        8.34336671824457987e+00    4.12479856412430479e+00  (-4.03523417114321381e-01)
      (-2.76742510726862411e-03) ( 4.99852801234917238e-03) ( 2.30417297573763929e-05)
   ,mkB 4.36624404335156298e-05
        1.28943695621391310e+01  (-1.51111514016986312e+01) (-2.23307578892655734e-01)
      ( 2.96460137564761618e-03) ( 2.37847173959480950e-03) (-2.96589568540237556e-05)
   ,mkB 5.15138902046611451e-05
        1.53796971148509165e+01  (-2.59193146099879641e+01)   1.79258772950371181e-01
      ( 2.68067772490389322e-03) ( 1.62824170038242295e-03) (-9.51592254519715870e-05)]

mkB m x y z vx vy vz = [x, y, z, vx*days_per_year, 
  vy*days_per_year, vz*days_per_year, m*solar_mass, 0]

solar_mass    = 4 * pi * pi
days_per_year = 365.24
nbodies = 4 -- that is 0 to 4

sh i = shiftL i 3 -- multiply by 8
pos i = sh i
vel i = velOffset .|. sh i where velOffset = 3
mass i = get (massOffset .|. sh i) where massOffset = 6

-- Give the sun a small velocity so the total momentum of all bodies totals to zero
offsetMomentum :: IO ()
offsetMomentum = do  sm <- mass 0
                     let sv = vel 0
                         act i = mass i >>= \m -> addScaled sv (-m/sm) (vel i)
                     mapM_ act [1..nbodies]

-- Total all kineticE and potentialE
energy :: IO Double
energy = loop 0 0
  where loop i e | i > nbodies = return e
                 | otherwise   = do ke <- kineticE i
                                    (loop' (i+1) i $! (e+ke)) >>= loop (i+1) 
        loop' j i e | j > nbodies = return e
                    | otherwise   = do pe <- potentialE i j
                                       loop' (j+1) i $! (e + pe)

kineticE i = let i' = (.|. sh i)
             in do m <- mass i
                   vx <- get (i' vx)
                   vy <- get (i' vy)
                   vz <- get (i' vz)
                   return $! 0.5 * m * (vx*vx + vy*vy + vz*vz)

potentialE i j = do
  m1 <- mass i
  m2 <- mass j
  let i' = (.|. (sh i))
      j' = (.|. (sh j))
  dx <- liftM2 (-) (get (i' x)) (get (j' x))
  dy <- liftM2 (-) (get (i' y)) (get (j' y))
  dz <- liftM2 (-) (get (i' z)) (get (j' z))
  return $! ((-1)*m1*m2/sqrt (dx*dx + dy*dy + dz*dz))

dt = 0.01

addScaled i a j = let scale old new = old + a * new
                      i1=i; i2=succ i1; i3=succ i2;
                      j1=j; j2=succ j1; j3=succ j2;
                  in do set i1 =<< liftM2 scale (get i1) (get j1)
                        set i2 =<< liftM2 scale (get i2) (get j2)
                        set i3 =<< liftM2 scale (get i3) (get j3)

addScaled3 i a jx jy jz = do
  let scale new old = a * new + old
      i1=i; i2=succ i1; i3=succ i2;
  set i1 =<< liftM (scale jx) (get i1)
  set i2 =<< liftM (scale jy) (get i2)
  set i3 =<< liftM (scale jz) (get i3)

-- This is the main code. Essentially all the time is spent here
advance n = when (n>0) $ do
  let {-# NOINLINE updateVel #-}
      updateVel i = when (i <= nbodies) $ do 
        let i' = (.|. sh i)
        im <- get (i' m)
        ix <- get (i' x)
        iy <- get (i' y)
        iz <- get (i' z)
        ivx <- get (i' vx)
        ivy <- get (i' vy)
        ivz <- get (i' vz)
        let {-# INLINE updateVel' #-}
            updateVel' ivx ivy ivz j =  ivx `seq` ivy `seq` ivz `seq` 
              if j > nbodies then do
                set (i' vx) ivx
                set (i' vy) ivy
                set (i' vz) ivz
              else do
                let j' = (.|. sh j)
                jm <- get (j' m)
                dx <- liftM (ix-) (get (j' x))
                dy <- liftM (iy-) (get (j' y))
                dz <- liftM (iz-) (get (j' z))
                let distance = sqrt (dx*dx+dy*dy+dz*dz)
                    mag = dt / (distance * distance * distance)
                addScaled3 (vel j) ( im*mag) dx dy dz
                let a = -jm*mag
                    ivx' = ivx+a*dx
                    ivy' = ivy+a*dy
                    ivz' = ivz+a*dz
                updateVel' ivx' ivy' ivz' $! (j+1)
        updateVel' ivx ivy ivz $! (i+1)
        addScaled (pos i) dt (vel i)
        updateVel (i+1)
  updateVel 0
  advance (pred n)

Old attempt by ChrisKuklewicz

I heavily transformed Don's code. This now runs almost exactly as fast as Einar's code on my machine. So I am niether happy nor sad.

Note that this creates the {{{data MutVec = V ...}}} and {{{data Body = B}}} all at the beginning, puts them into an array, and never constructs any new ones. From them on it just looks up {{{IORef Double}}}'s and works on those.

I cannot find any Haskell idioms for making this run 10x faster, but other languages manage to do it, such as OCaml.

Would peeking and poking unboxed Doubles be the way to go?

--
-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Contributed by Christoph Bauer
-- Rewritten in Haskell from C by Don Stewart
-- Rewritten again by Chris Kuklewicz
--
-- -O2  -optc-O3 -funbox-strict-fields -fglasgow-exts -fexcess-precision -fasm
--
-- Runs at the same speed as the Haskell#3 open mutable records code
-- by Einar.
--
-- Advance is where ALL the work is done and where any optimization must go.
--
import System
import Control.Monad
import Data.List
import Data.Array.Array
import Text.Printf
import Data.IORef

default()

type Bodies = Array Int Body
data MutVec = V { x, y, z :: !(IORef Double) }
data Body = B { mass :: !Double, pos,vel :: !MutVec } 

newV x y z =  x `seq` y `seq` z `seq` do liftM3 V (newIORef x) (newIORef y) (newIORef z)

mkB m x y z vx vy vz = liftM2 (B (m*solar_mass)) (newV x y z) 
  (newV (vx*days_per_year) (vy*days_per_year) (vz*days_per_year))

{-# INLINE mIORef #-}
mIORef r f = readIORef r >>= ( ($!) (writeIORef r)  ) . f 
{-# INLINE minus #-}
minus x = (\y -> y-x)
{-# INLINE do2 #-}
do2 fm m1 m2 = do x1 <- m1; x2 <- m2; fm x1 x2

{-# INLINE addScaled #-}
addScaled v1@(V x1 y1 z1) a v2@(V x2 y2 z2) = do
  (mIORef x1).(+).(*a) =<< readIORef x2
  (mIORef y1).(+).(*a) =<< readIORef y2
  (mIORef z1).(+).(*a) =<< readIORef z2

{-# INLINE setDiff #-}
setDiff d@(V dx dy dz) v1@(V x1 y1 z1) v2@(V x2 y2 z2) = do
  do2 (\a b->writeIORef dx $! (a-b)) (readIORef x1)  (readIORef x2)
  do2 (\a b->writeIORef dy $! (a-b)) (readIORef y1)  (readIORef y2)
  do2 (\a b->writeIORef dz $! (a-b)) (readIORef z1)  (readIORef z2)
  return d

{-# INLINE kineticE #-}
kineticE mass (V x y z) = do vx <- readIORef x ;  vy <- readIORef y ;  vz <- readIORef z
                             return $! 0.5 * mass * (vx*vx + vy*vy + vz*vz)

{-# INLINE magV #-}
magV (V x y z) = do
  liftM3 (\a b c->sqrt (a*a+b*b+c*c)) (readIORef x) (readIORef y) (readIORef z) >>= (return $!)

{-# INLINE potentialE #-}
potentialE mass1 (V x1 y1 z1) mass2 (V x2 y2 z2) = do
  dx <- liftM2 (-) (readIORef x1) (readIORef x2)
  dy <- liftM2 (-) (readIORef y1) (readIORef y2)
  dz <- liftM2 (-) (readIORef z1) (readIORef z2)
  return $! ((-1)*mass1*mass2/sqrt (dx*dx + dy*dy + dz*dz))

mkBodies :: IO Bodies
mkBodies = liftM (listArray (0,nbodies-1)) $ sequence bodiesData

-- sun jupiter saturn uranus neptune
bodiesData = [ mkB 1 0 0 0 0 0 0
  ,mkB 9.54791938424326609e-04 
       4.84143144246472090e+00  (-1.16032004402742839e+00) (-1.03622044471123109e-01)
     ( 1.66007664274403694e-03) ( 7.69901118419740425e-03) (-6.90460016972063023e-05)
  ,mkB 2.85885980666130812e-04
       8.34336671824457987e+00    4.12479856412430479e+00  (-4.03523417114321381e-01)
     (-2.76742510726862411e-03) ( 4.99852801234917238e-03) ( 2.30417297573763929e-05)
  ,mkB 4.36624404335156298e-05
       1.28943695621391310e+01  (-1.51111514016986312e+01) (-2.23307578892655734e-01)
     ( 2.96460137564761618e-03) ( 2.37847173959480950e-03) (-2.96589568540237556e-05)
  ,mkB 5.15138902046611451e-05
       1.53796971148509165e+01  (-2.59193146099879641e+01)   1.79258772950371181e-01
     ( 2.68067772490389322e-03) ( 1.62824170038242295e-03) (-9.51592254519715870e-05) ]

solar_mass    = 4 * pi * pi
days_per_year = 365.24
nbodies = length bodiesData

offsetMomentum :: Bodies -> IO ()
offsetMomentum arr = sequence_ [ addScaled sv (-m/sm) v | (B m _ v) <- tail (elems arr) ]
  where (B sm sp sv) = (arr ! 0)

energy :: Int -> Bodies -> IO Double
energy nbodies arr = loop 0 0
    where loop i e | i >= nbodies = return e
                   | otherwise    = do let (B mb bp bv) = arr ! i
                                       ke <- kineticE mb bv
                                       (loop' (i+1) bp mb $! (e+ke)) >>= loop (i+1) 
          loop' j bp mb e | j >= nbodies = return e
                          | otherwise    = do let (B mb2 bp2 _) = arr ! j
                                              pe <- potentialE mb bp mb2 bp2
                                              loop' (j+1) bp mb (e + pe)

main = do
    (n::Int) <- getArgs >>= readIO . head
    bodies <- mkBodies
    dxyz <- newV 0 0 0
    let dt = 0.01
        advance = do
          let updateVel i = when (i < nbodies) $ do 
                updateVel' (bodies ! i) (i+1) 
                updateVel (i+1)
              updateVel' b1@(B mb1 bp1 bv1) j = if j >= nbodies then return () else do
                let (B mb2 bp2 bv2)  = bodies ! j
                setDiff dxyz bp1 bp2
                distance <- magV dxyz
                let mag = dt / (distance * distance * distance)
                addScaled bv2 (mb1*mag) dxyz
                addScaled bv1 (-mb2*mag) dxyz
                updateVel' b1 (j+1) 
              updatePos i = when (i < nbodies) $ do 
                let (B _ bp bv) = bodies ! i
                addScaled bp dt bv
                updatePos (i+1)
          updateVel 0
          updatePos 0
    offsetMomentum bodies
    energy nbodies bodies >>= printf "%.9f\n"
    replicateM_ n advance
    energy nbodies bodies >>= printf "%.9f\n"

Joel Koerwer's nth attempt

I suppose the comments say it all.

-- Joel Koerwer's attempt at the Computer Shootout nbody problem.
-- Not as fast as Einar Kartunnen's Open Mutable Records implementation,
-- but I like it!
-- I get about double the runtime of Einar's code.

import System (getArgs)
import Text.Printf (printf)

data Vector3 = V3 !Double !Double !Double deriving (Show, Eq)

instance Num Vector3 where
    (V3 x y z) + (V3 u v w) = V3 (x+u) (y+v) (z+w)
    (V3 x y z) - (V3 u v w) = V3 (x-u) (y-v) (z-w)
    (V3 x y z) * (V3 u v w) = V3 (x*u) (y*v) (z*w)
    fromInteger n = V3 (fromInteger n) (fromInteger n) (fromInteger n)
    abs (V3 x y z) = V3 (abs x) (abs y) (abs z)
    signum (V3 x y z) = V3 (signum x) (signum y) (signum z)

-- Mnemonically read "scalar times vector".
(.*|) :: Double -> Vector3 -> Vector3
a .*| (V3 x y z) = V3 (a*x) (a*y) (a*z)

-- Similarly for "vector divided by scalar".
(|/.) :: Vector3 -> Double -> Vector3
(V3 x y z) |/. a = V3 (x/a) (y/a) (z/a)

dot :: Vector3 -> Vector3 -> Double
(V3 x y z) `dot` (V3 u v w) = x*u + y*v + z*w

norm2 v = v `dot` v
norm = sqrt . norm2

type PhaseSpace = ([Vector3],[Vector3])

advance :: PhaseSpace -> PhaseSpace
advance sys@(rs,vs) = let
    vs' = zipWith (+) vs $ getDeltaVs sys
    rs' = zipWith ((+) . (dt .*| )) vs' rs
    in
    (rs', vs')

getDeltaVs :: PhaseSpace -> [Vector3]
getDeltaVs sys@(rs,_) = let
    timesMdT = zipWith (.*|) dtms
    mags = getMags rs
    in
    map (sum . timesMdT) mags


-- getMags does the "real work". Given a list of position vectors [r0,r1,..,rN]
-- return the "magnitude matrix"
--
-- [[  0   m01  m02 .. m0N],
--  [-m01   0   m12 .. m1N],
--  [-m02 -m12   0  .. m2N],
--  [  :    :      .    : ],
--  [  :    :        .  : ],
--  [-m0N -m1N -m2N ..  0 ]]
--
-- where mij = -mji = (rj - ri) / |rj - ri|^3.
--
-- This method is faster than calulating all differences
-- seperately, but not by a whole lot.
getMags :: [Vector3] -> [[Vector3]]
getMags [] = []
getMags (r:rs) = let
    mkMag r' = let d = r' - r in d |/. (norm d * norm2 d)
    newMags = map mkMag rs
    minusMags = map negate newMags
    in
    (0 : newMags) : zipWith (:) minusMags (getMags rs)

energy :: PhaseSpace -> Double
energy sys = kineticEnergy sys + potentialEnergy sys

-- T = 0.5 * sum_i mi * vi^2
kineticEnergy :: PhaseSpace -> Double
kineticEnergy (_,vs) = 0.5 * (sum $ zipWith (*) ms (map norm2 vs))

-- U = sum_{i/=j} -0.5*mi*mj/|ri-rj|
potentialEnergy :: PhaseSpace -> Double
potentialEnergy (rs,_) = let a = -0.5 :: Double in
    (a *) . sum $ zipWith ((*) . sum . zipWith (*) ms) (invDists rs) ms

-- Calculate the inverse distances between positions [r0,..rN].
-- Similar to getMags, but here, mij = |mij| = 1 / |ri-rj|.
invDists :: [Vector3] -> [[Double]]
invDists [] = []
invDists (r:rs) = let
    newDs = map ((1/) . norm . (r -)) rs
    in
    (0 : newDs) : zipWith (:) newDs (invDists rs)

-- offset gives the Sun an initial velocity such that the entire system
-- has zero overall momentum.
offset :: PhaseSpace -> PhaseSpace
offset (rs, vs@(vSun:vPlanets)) = let
    pTotal = sum $ zipWith (.*|) ms vs
    mSun = head ms
    dv = pTotal |/. mSun
    in
    (rs, vSun-dv : vPlanets)

---------- A strict version of iterate for our loop -----------
seqList [] y = y
seqList (x:xs) y = x `seq` seqList xs y

seqTuple (x,y) z = seqList x $ seqList y z

iterate' f a = let b = f a in seqTuple b $ a : iterate' f b
---------------------------------------------------------------

main = let
    sys = offset initialPhaseSpace
    print9 = printf "%.9f\n"
    in
    do
        [n'] <- getArgs
        let n = read n' :: Int
        print9 (energy sys)
        print9 . energy $ iterate' advance sys !! n

-- I've stuck all the ugly constants and initial conditions at the end.
dt = 0.01
solarMass = 4*pi*pi
daysPerYear = 365.24
posns =[V3 0 0 0,
        V3 4.8414314424647209 (-1.16032004402742839) (-1.03622044471123109e-01),
        V3 8.34336671824457987 4.12479856412430479 (-4.03523417114321381e-01),
        V3 12.8943695621391310 (-15.1111514016986312) (-2.23307578892655734e-1),
        V3 15.3796971148509165 (-25.9193146099879641) (1.79258772950371181e-1)
        ]
vels' =[
    V3 0 0 0,
    V3 1.66007664274403694e-3 7.69901118419740425e-3 (-6.90460016972063023e-5),
    V3 (-2.76742510726862411e-3) 4.99852801234917238e-3 2.30417297573763929e-5,
    V3 2.96460137564761618e-3 2.37847173959480950e-3 (-2.96589568540237556e-5),
    V3 2.68067772490389322e-3 1.62824170038242295e-3 (-9.51592254519715870e-5)
    ]
vels = map (daysPerYear .*|) vels'
initialPhaseSpace = (posns, vels)

dtms = map (dt *) ms -- The masses are only used by themselves in energy, so go
                     -- ahead and precalculate m*dt.
ms = map (solarMass *) [1, 9.54791938424326609e-04, 2.85885980666130812e-04,
                        4.36624404335156298e-05, 5.15138902046611451e-05]

An imperative entry

Based on the C version. Too many slow updates on the array still.

{-# OPTIONS -O2 -optc-O3 -funbox-strict-fields -fglasgow-exts #-}
--
-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Contributed by Christoph Bauer
-- Rewritten in Haskell from C by Don Stewart
--
-- Advance is still too slow.
--
import System
import Control.Monad
import Data.Array.IO
import Data.Array.Base
import Text.Printf

data Planet = P { x, y, z, vx, vy, vz :: !Double } deriving Show

mass = [solar_mass
       ,9.54791938424326609e-04 * solar_mass
       ,2.85885980666130812e-04 * solar_mass
       ,4.36624404335156298e-05 * solar_mass
       ,5.15138902046611451e-05 * solar_mass]

bodies =[P 0 0 0 0 0 0                                  -- sun

        -- jupiter
        ,P  4.84143144246472090e+00                     -- jupiter
          (-1.16032004402742839e+00)
          (-1.03622044471123109e-01)
          ( 1.66007664274403694e-03 * days_per_year)
          ( 7.69901118419740425e-03 * days_per_year)
          (-6.90460016972063023e-05 * days_per_year)

        ,P  8.34336671824457987e+00                     -- saturn
            4.12479856412430479e+00
          (-4.03523417114321381e-01)
          (-2.76742510726862411e-03 * days_per_year)
          ( 4.99852801234917238e-03 * days_per_year)
          ( 2.30417297573763929e-05 * days_per_year)

        ,P  1.28943695621391310e+01                     -- uranus
          (-1.51111514016986312e+01)
          (-2.23307578892655734e-01)
          ( 2.96460137564761618e-03 * days_per_year)
          ( 2.37847173959480950e-03 * days_per_year)
          (-2.96589568540237556e-05 * days_per_year)

        ,P  1.53796971148509165e+01                     -- neptune
          (-2.59193146099879641e+01)
            1.79258772950371181e-01
          ( 2.68067772490389322e-03 * days_per_year)
          ( 1.62824170038242295e-03 * days_per_year)
          (-9.51592254519715870e-05 * days_per_year) ]

solar_mass    = 4 * pi * pi :: Double
days_per_year = 365.24
nbodies = 5

offsetMomentum :: Int -> IOArray Int Planet -> IOUArray Int Double -> IO ()
offsetMomentum nbodies arr mass = do
    let loop i px py pz 
            | i >= nbodies = return (px,py,pz)
            | otherwise    = do
                p  <- unsafeRead arr i
                mp <- unsafeRead mass i
                let px' = px + vx p * mp
                    py' = py + vy p * mp
                    pz' = pz + vz p * mp
                loop (i+1) px' py' pz'

    (px,py,pz) <- loop 0 0.0 0.0 0.0

    p <- unsafeRead arr 0
    unsafeWrite arr 0 $! p { vx = - px / solar_mass
                           , vy = - py / solar_mass
                           , vz = - pz / solar_mass }

energy :: Int -> IOArray Int Planet -> IOUArray Int Double -> IO Double
energy nbodies arr mass = loop 0 (0.0 :: Double)
    where loop i e 
            | i >= nbodies = return e
            | otherwise    = do
                b <- unsafeRead arr i
                mb<- unsafeRead mass i
                let (x,y,z) = (vx b, vy b, vz b)
                    e' = e + 0.5 * mb * (x*x + y*y + z*z)
                e'' <- loop' (i+1) e' b mb
                loop (i+1) e''

          loop' j e b mb
            | j >= nbodies = return e
            | otherwise    = do
                b2 <- unsafeRead arr j
                mb2<- unsafeRead mass j
                let dx = x b - x b2
                    dy = y b - y b2
                    dz = z b - z b2
                    distance = sqrt $! dx * dx + dy * dy + dz * dz
                loop' (j+1) (e - ((mb * mb2) / distance)) b mb

advance :: Int -> IOArray Int Planet -> IOUArray Int Double -> Double -> IO ()
advance nbodies arr mass dt = do
    for0 0
    for3 0
    where
        for0 i = when (i < nbodies) $ do
            b  <- unsafeRead arr i
            mb <- unsafeRead mass i
            for1 (i+1) mb b >>= unsafeWrite arr i
            for0 (i+1)

        for1 j mb b = if j >= nbodies then return b else do
            b2 <- unsafeRead arr j 
            mb2<- unsafeRead mass j
            let dx = x b - x b2
                dy = y b - y b2
                dz = z b - z b2
                distance = sqrt $! dx * dx + dy * dy + dz * dz
                mag = dt / (distance * distance * distance)
                (m1,m2) = (mb2 * mag, mb * mag)

            unsafeWrite arr j $! b2 { vx = vx b2 + dx * m2
                                    , vy = vy b2 + dy * m2
                                    , vz = vz b2 + dz * m2 }

            for1 (j+1) mb $! b { vx = vx b  - dx * m1
                               , vy = vy b  - dy * m1
                               , vz = vz b  - dz * m1 }

        for3 i = when (i < nbodies) $ do
            b <- unsafeRead arr i
            unsafeWrite arr i $! b { x = x b + dt * vx b 
                                   , y = y b + dt * vy b 
                                   , z = z b + dt * vz b } 
            for3 (i+1)

main = do
    (n::Int) <- getArgs >>= readIO . head
    arr      <- newListArray (0,nbodies-1) bodies :: IO (IOArray Int Planet)
    massarr  <- newListArray (0,nbodies-1) mass   :: IO (IOUArray Int Double)
    offsetMomentum nbodies arr massarr
    energy nbodies arr massarr >>= printf "%.9f\n" 
    replicateM_ n $ advance nbodies arr massarr 0.01
    energy nbodies arr massarr >>= printf "%.9f\n"


Original entry

This is the "Haskell GHC #3" entry, the fastest of the three.

-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
-- NBody based on Open Mutable Records
-- Contributed by Einar Karttunen

import Control.Monad
import Control.Monad.Reader
import Data.IORef
import System
import Text.Printf

-- This is a monad for open mutable records programming
newtype OO t r = OO (ReaderT t IO r) deriving(Monad, MonadReader t, MonadIO)

-- run a computation with a record (object)
with :: s -> OO s a -> OO b a
with this (OO c) = liftIO (runReaderT c this)

-- run an OO computation producing an IO value.
ooToIO :: OO s a -> IO a
ooToIO (OO c) = runReaderT c undefined

-- the plain record types
data a :.: r = RC !a !r
infixr :.:
data END = END

-- Next we define a field access method.
class Select r f t | r f -> t                 where (!) :: r -> f -> Ref t
instance Select (Field f t :.: r) f t        where (!) (RC (F x) _) _ = x
instance Select r f t => Select (a :.: r) f t where (!) (RC _ t) = (!) t

-- And finally the type of mutable fields.
type Ref a = IORef a
newtype Field name rtype = F (Ref rtype)

-- Next we define a way to construct record values.
infixr ##
(##) :: v -> OO s r -> OO s ((Field f v) :.: r)
(##) v r = do { h <- liftIO (newIORef v); t <- r; return (RC (F h) t) }
end = return END :: OO s END

-- Get the value of a field.
value :: Select s f t => f -> OO s t
value a  = do x <- asks (\s -> s!a)
              liftIO (readIORef x)

-- Or set the value of a field.
(<-:) :: Select s f t => f -> t -> OO s ()
a <-: b  = do x <- asks (\s -> s!a)
              liftIO (writeIORef x b)

-- And as a convenience add value to an double field.
(+=) :: Select s f V3 => f -> V3 -> OO s V3
a += b   = do x <- asks (\s -> s!a)
              val <- liftIO (readIORef x)
              let z = val+b
              z `seq` liftIO (writeIORef x z)
              return z

-- V3 vector like things.
data V3 = V3 !Double !Double !Double deriving(Show,Eq)
instance Num V3 where
  (V3 x1 y1 z1) + (V3 x2 y2 z2) = V3 (x1+x2) (y1+y2) (z1+z2)
  (V3 x1 y1 z1) - (V3 x2 y2 z2) = V3 (x1-x2) (y1-y2) (z1-z2)
  (V3 x1 y1 z1) * (V3 x2 y2 z2) = V3 (x1*x2) (y1*y2) (z1*z2)
  fromInteger o = let v = fromInteger o in V3 v v v
instance Fractional V3 where
  (V3 x1 y1 z1) / (V3 x2 y2 z2) = V3 (x1/x2) (y1/y2) (z1/z2)

from field obj = with obj (value field)
dv3 v = V3 v v v
v3len (V3 x y z) = sqrt (x*x + y*y + z*z)

{-# RULE "dv3/mult" forall a b. dv3 a * (V3 x y z) = V3 (a*x) (a*y) (a*z) #-}

-- Bodies
data Base = Base; data Diff = Diff; data Mass = Mass
type Body = Field Base V3 :.: Field Diff V3 :.: Field Mass Double :.: END

-- Create a new body
newBody x1 y1 z1 x2 y2 z2 mass =
  V3 x1 y1 z1 ## (dv3 year * V3 x2 y2 z2) ## (mass * solarMass) ## end :: OO s Body

offsetMomemtum bodies@(b:_) = foldM comp 0 bodies >>= (\d -> with b (Diff <-: (-d / dv3 solarMass)))
    where comp old body = do diff <- Diff `from` body
                             mass <- Mass `from` body
                             return (old + (dv3 mass * diff))

advance dt bodies = advanceLoop dt bodies >> mapM_ comp bodies
    where comp body = with body ((\diff -> Base += (dv3 dt * diff)) =<< value Diff)

advanceLoop :: Double -> [Body] -> OO s ()
advanceLoop dt []          = return ()
advanceLoop dt (b:bs)      = do
  bmass <- Mass `from` b
  bbase <- Base `from` b
  let inner []     = return ()
      inner (p:ps) = do diff <- (Base `from` p >>= \pbase -> return (bbase - pbase))
                        let dist = v3len diff
                        let magdiff = dv3 (dt / (dist * dist * dist)) * diff
                        pmass <- Mass `from` p
                        with b (Diff += (dv3 pmass * (-magdiff)))
                        with p (Diff += (dv3 bmass * magdiff))
                        inner ps
  inner bs >> advanceLoop dt bs

energy :: [Body] -> Double -> OO s Double
energy []     e = return e
energy (p:ps) e = do pdiff@(V3 x y z) <- Diff `from` p
                     pmass <- Mass `from` p
                     pbase <- Base `from` p
                     let e' = e + 0.5 * pmass * (x*x + y*y + z*z)
                     let inner []     e = return e
                         inner (j:js) e = do jmass <- Mass `from` j
                                             jbase <- Base `from` j
                                             inner js $! (e - ((pmass*jmass)/v3len(pbase - jbase)))
                     inner ps e' >>= energy ps

solarMass = 4*pi*pi :: Double
year = 365.24 :: Double

createBodies = do
  b0 <- newBody 0 0 0 0 0 0 1
  b1 <- newBody 4.84143144246472090e+00 (-1.16032004402742839e+00) (-1.03622044471123109e-01)
               (1.66007664274403694e-03) (7.69901118419740425e-03) (-6.90460016972063023e-05)
               9.54791938424326609e-04
  b2 <- newBody 8.34336671824457987e+00 4.12479856412430479e+00 (-4.03523417114321381e-01)
               (-2.76742510726862411e-03) (4.99852801234917238e-03) (2.30417297573763929e-05)
               (2.85885980666130812e-04)
  b3 <- newBody 1.28943695621391310e+01 (-1.51111514016986312e+01) (-2.23307578892655734e-01)
               2.96460137564761618e-03 2.37847173959480950e-03 (-2.96589568540237556e-05)
               4.36624404335156298e-05
  b4 <- newBody 1.53796971148509165e+01 (-2.59193146099879641e+01) 1.79258772950371181e-01
               2.68067772490389322e-03 1.62824170038242295e-03 (-9.51592254519715870e-05)
               5.15138902046611451e-05
  return [b0,b1,b2,b3,b4]

main = do ~[n] <- getArgs
          (e1,e2) <- ooToIO (do bodies <- createBodies
                                offsetMomemtum bodies
                                e1 <- energy bodies 0
                                replicateM_ (read n) $ advance 0.01 bodies
                                e2 <- energy bodies 0
                                return (e1,e2))
          printf "%.9f\n%.9f\n" e1 e2


Smaller code

Anyone interested in seeing how small we can make the code? Here's my original attempt, which unfortunately suffers from a massive space-leak. I'd be interested in seeing if it is possible to have a succinct version without leaks.

import System(getArgs)
import Numeric
import Data.List

data Vec = V !Double !Double !Double deriving Show
instance Num Vec where
    (V a b c) + (V x y z) = (V (a+x) (b+y) (c+z))
    (V a b c) - (V x y z) = (V (a-x) (b-y) (c-z))
    fromInteger 0 = V 0.0 0.0 0.0 -- for sum function
instance Eq Vec where (V a b c) == (V x y z) = (a==x) && (b==y) && (c==z)

dot (V a b c) (V x y z) = a*x + b*y + c*z
scale (V a b c) n = V (n*a) (n*b) (n*c)
mag (V x y z) =  sqrt (x*x + y*y + z*z)

data Planet = Planet Vec Vec Double deriving Show --Position Velocity Mass
dist (Planet p1 _ _) (Planet p2 _ _) = mag $ p1 - p2
mass (Planet _ _ m) = m
vel  (Planet _ v _) = v
pos  (Planet p _ _) = p

main = do 
        [arg] <- getArgs
        let iter = read arg
        let n = 5::Int
        let bodies = offset_momentum n [sun, jupiter, saturn, neptune, uranus]
        let begin = energy n bodies
        putStrLn $ showFFloat (Just 9) begin ""
        let final = (iterate (advance n 0.01) bodies)
        let end = energy n (final !! iter)
        putStrLn $ showFFloat (Just 9) end ""

days_per_year = 365.24 
solar_mass = 4.0 * pi * pi

advance :: Int -> Double -> [Planet] -> [Planet]
advance n dt (p:ps) = take n ((Planet (pos p + delta_x)
                              (new_v) (mass p) ) : advance n dt (ps++[p]))
 where
   delta_v = sum (map (\q -> 
                  (pos p - pos q) `scale` ((mass q)*dt/(dist p q)^3)) ps)
   new_v   = (vel p) - delta_v
   delta_x = new_v `scale` dt 

energy:: Int -> [Planet] -> Double
energy n ps = kinetic - potential
  where
    kinetic   = 0.5 * (sum (map (\q->(mass q)*((vel q) `dot` (vel q))) ps))
    potential = sum $ zipWith grav_pot ps (tail (init (tails ps)))
    grav_pot x xs = sum $ map (\y -> (mass x)*(mass y)/(dist x y)) xs

offset_momentum n ((Planet p v m):ps) = (Planet p new_v m):ps
  where new_v = (sum (map (\n->(vel n) `scale` (mass n)) ps)) 
                `scale` ((-1.0)/solar_mass)

jupiter = (Planet 
 (V 4.84143144246472090e+00 (-1.16032004402742839e+00) (-1.03622044471123109e-01))
 (V ( 1.66007664274403694e-03 * days_per_year)
    ( 7.69901118419740425e-03 * days_per_year)
    ((-6.90460016972063023e-05) * days_per_year)) 
 (9.54791938424326609e-04 * solar_mass))

saturn = (Planet
 (V 8.34336671824457987e+00 4.12479856412430479e+00 (-4.03523417114321381e-01))
 (V (-2.76742510726862411e-03 * days_per_year) 
    (4.99852801234917238e-03 * days_per_year)
    (2.30417297573763929e-05 * days_per_year))
 (2.85885980666130812e-04 * solar_mass))

uranus = (Planet
 (V 1.28943695621391310e+01 (-1.51111514016986312e+01) (-2.23307578892655734e-01))
 (V (2.96460137564761618e-03 * days_per_year)
    (2.37847173959480950e-03 * days_per_year)
    (-2.96589568540237556e-05 * days_per_year))
 (4.36624404335156298e-05 * solar_mass))

neptune = (Planet
 (V 1.53796971148509165e+01 (-2.59193146099879641e+01) 1.79258772950371181e-01)
 (V (2.68067772490389322e-03 * days_per_year)
    (1.62824170038242295e-03 * days_per_year)
    (-9.51592254519715870e-05 * days_per_year))
 (5.15138902046611451e-05 * solar_mass))

sun = (Planet (V 0.0 0.0 0.0) (V 0.0 0.0 0.0) solar_mass)