Benchmarks Game/Parallel/SpectralNorm
Parallel versions of spectral-norm
Slightly nicer
{-# LANGUAGE BangPatterns, MagicHash #-}
{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
-- 2) changed -optc-O to -optc-O3
-- 3) added -optc-ffast-math
-- Parallelised by Don Stewart
--
-- Should be compiled with:
-- -O2 -threaded --make
-- -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--
import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Internal
import GHC.Base
import GHC.Float
import GHC.Int
import Control.Concurrent
import Control.Exception
type Reals = Ptr Double
main = do
n <- getArgs >>= readIO . head
u <- mallocArray n :: IO Reals
forM_ [0..n-1] $ \i -> pokeElemOff u i 1
v <- mallocArray n :: IO Reals
forM_ [0..n-1] $ \i -> pokeElemOff v i 0
powerMethod 10 n u v
printf "%.9f\n" (eigenvalue n u v 0 0 0)
------------------------------------------------------------------------
eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
| i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
| otherwise = sqrt $! vBv / vv
where
ui = inlinePerformIO (peekElemOff u i)
vi = inlinePerformIO (peekElemOff v i)
------------------------------------------------------------------------
powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod i n u v =
mapM_ takeMVar =<< replicateM i ( do me <- newEmptyMVar;
forkIO (child `finally` putMVar me ());
return me )
where
child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
{-
powerMethod i n u v = do
-- roll our own synchronisation
children <- newMVar []
replicateM_ i $ do
me <- newEmptyMVar
cs <- takeMVar children
putMVar children (me : cs)
forkIO (child `finally` putMVar me ())
wait children
where
child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
-}
{-
-- wait on children
wait box = do
cs <- takeMVar box
case cs of
[] -> return ()
m:ms -> putMVar box ms >> takeMVar m >> wait box
-}
-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
timesAv n u t
timesAtv n t atau
timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
where
go :: Int -> IO ()
go !i = when (i < n) $ do
pokeElemOff au i (avsum i 0 0)
go (i+1)
avsum :: Int -> Int -> Double -> Double
avsum !i !j !acc
| j < n = avsum i (j+1) (acc + ((aij i j) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)
timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
where
go :: Int -> IO ()
go !i = when (i < n) $ do
pokeElemOff a i (atvsum i 0 0)
go (i+1)
atvsum :: Int -> Int -> Double -> Double
atvsum !i !j !acc
| j < n = atvsum i (j+1) (acc + ((aij j i) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)
--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
case i +# j of
n -> case n *# (n+#1#) of
t -> case t `uncheckedIShiftRA#` 1# of
u -> case u +# (i +# 1#) of
r -> 1.0## /## (int2Double# r))
Easy parallelisation
Takes the existing single core spectral-norm Haskell entry, and parallelises the main loop using explicit concurrency based on forkIO and MVars.
Should give reasonable cpu utilisation, and overall speedup.
Compiling:
$ ghc C.hs -O2 -optc-O3 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 -optc-ffast-math -threaded --make
(That is, pretty much the same flags, but with -threaded) Running:
$ time ./C 5500 +RTS -N5 1.274224153 ./C 5500 +RTS -N5 12.57s user 0.01s system 339% cpu 3.708 total
(Just add +RTS -N5 -RTS) Using 5 capabilities (to hide latency). Compared to current sequential entry,
$ time ./A 5500 1.274224153 ./A 5500 12.56s user 0.01s system 99% cpu 12.586 total
The code:
{-# LANGUAGE BangPatterns, MagicHash #-}
{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
-- 2) changed -optc-O to -optc-O3
-- 3) added -optc-ffast-math
-- Parallelised by Don Stewart
--
-- Should be compiled with:
-- -O2 -threaded --make
-- -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--
import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Internal
import GHC.Base
import GHC.Float
import GHC.Int
import Control.Concurrent
import Control.Exception
type Reals = Ptr Double
main = do
n <- getArgs >>= readIO . head
u <- mallocArray n :: IO Reals
forM_ [0..n-1] $ \i -> pokeElemOff u i 1
v <- mallocArray n :: IO Reals
forM_ [0..n-1] $ \i -> pokeElemOff v i 0
powerMethod 10 n u v
printf "%.9f\n" (eigenvalue n u v 0 0 0)
------------------------------------------------------------------------
eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
| i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
| otherwise = sqrt $! vBv / vv
where
ui = inlinePerformIO (peekElemOff u i)
vi = inlinePerformIO (peekElemOff v i)
------------------------------------------------------------------------
powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod i n u v = do
-- roll our own synchronisation
children <- newMVar []
replicateM_ i $ do
me <- newEmptyMVar
cs <- takeMVar children
putMVar children (me : cs)
forkIO (child `finally` putMVar me ())
wait children
where
child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
-- wait on children
wait box = do
cs <- takeMVar box
case cs of
[] -> return ()
m:ms -> putMVar box ms >> takeMVar m >> wait box
-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
timesAv n u t
timesAtv n t atau
timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
where
go :: Int -> IO ()
go !i = when (i < n) $ do
pokeElemOff au i (avsum i 0 0)
go (i+1)
avsum :: Int -> Int -> Double -> Double
avsum !i !j !acc
| j < n = avsum i (j+1) (acc + ((aij i j) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)
timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
where
go :: Int -> IO ()
go !i = when (i < n) $ do
pokeElemOff a i (atvsum i 0 0)
go (i+1)
atvsum :: Int -> Int -> Double -> Double
atvsum !i !j !acc
| j < n = atvsum i (j+1) (acc + ((aij j i) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)
--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
case i +# j of
n -> case n *# (n+#1#) of
t -> case t `uncheckedIShiftRA#` 1# of
u -> case u +# (i +# 1#) of
r -> 1.0## /## (int2Double# r))
ST/IOUArray-based solution
Based on the STUArray solution from Shootout/Spectral, converted to IOUArray for use of forkIO. (For use with ST it would require a forkAndWaitST implementation...)
Note that the benchmark requires synchronization between each iteration. This solution slices the output array into n parts and spawns one child per slice, waiting for every child to complete before continuing. (In theory, it's inefficient to repeatedly fork and synchronize like this, but it doesn't seem to cost that much.)
{-# OPTIONS_GHC -fglasgow-exts #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Contributed by Simon Brenner
--
-- Compile with:
--
-- C-options:
-- -fvia-c -optc-O3 -optc-march=native -optc-mfpmath=sse -optc-msse2
-- -optc-ffast-math
-- GHC options:
-- -O2 -fexcess-precision -threaded
import Control.Concurrent
import Control.Concurrent.MVar
import Control.Exception
import Control.Monad.ST
import Control.Monad
import Data.Array.IO
import Data.Array.ST
import Data.Ix
import Data.Bits
import System.Environment
import Text.Printf
-- Various magic used for the unboxed version of the 'a' function
import GHC.Base
import GHC.Float
main = do
n <- fmap (read . head) getArgs
--print (spectralNorm n)
printf "%.9f" =<< spectralNormM forkAndWaitIO allocDoubleIO n
-- STUArray version not parallelised, so we're IO-only for now...
--spectralNorm :: Int -> Double
--spectralNorm n = runST (spectralNormM forkAndWaitST allocDouble n)
--------------------------------------------------------------------------
-- Benchmark code
caps = 4
spectralNormM :: (forall x. (x -> m ()) -> [x] -> m ()) -> Allocator m Int Double -> Int -> m Double
spectralNormM forkAndWait allocator n = case allocator of
(Allocator alloc) -> do
let ubound = n-1
u <- alloc (0,ubound)
v <- alloc (0,ubound)
tmp <- alloc (0,ubound)
fill u 1.0
replicateM_ 10 $ do
multiplyAtAv forkAndWait 0 ubound u tmp v
multiplyAtAv forkAndWait 0 ubound v tmp u
vBv <- u .| v
vv <- v .| v
return $ sqrt (vBv / vv)
forkChild child = do
m <- newEmptyMVar
forkIO $ (child `finally` putMVar m ())
return m
waitForChildren = mapM_ takeMVar
forkAndWaitIO f xs = waitForChildren =<< mapM (forkChild . f) xs
-- For a serialized version that doesn't fork (but works in any monad)
--forkAndWait f xs = mapM_ f xs
multiplyAtAv forkAndWait lb ub v u atav = do
let forkChildren f = forkAndWait f (ranges lb caps ub)
forkChildren (multiplyAv v u)
forkChildren (multiplyAtv u atav)
multiplyAv v av (l,u) = forM_ [l..u] $ \i -> do
writeArray av i 0.0
iterateArray v $ \j -> do
v' <- readArray v j
modArray av i (+ ((a i j) * v'))
multiplyAtv v atv (l,u) = forM_ [l..u] $ \i -> do
writeArray atv i 0.0
iterateArray v $ \j -> do
v' <- readArray v j
modArray atv i (+ ((a j i) * v'))
a :: Int -> Int -> Double
--a i j = 1.0 / fromIntegral (((i + j) * (i + j + 1)) `shiftR` 1 + i + 1)
--a i j = 1.0 / ((i' + j') * ((i' + j' + 1) / 2) + i' + 1)
-- where i' = fromIntegral i; j' = fromIntegral j
a (I# i) (I# j) = D# (
case i +# j of
n -> case n *# (n+#1#) of
t -> case t `uncheckedIShiftRA#` 1# of
u -> case u +# (i +# 1#) of
r -> 1.0## /## (int2Double# r))
--------------------------------------------------------------------------
-- General utility code
fill u x = getBounds u >>= \(lb,ub) -> forM_ [lb..ub] (\i -> writeArray u i x)
u .| v = getBounds u >>= \(lb,ub) -> go lb ub u v 0
where
go lb ub u v acc
| lb <= ub = do
u' <- readArray u lb
v' <- readArray v lb
go (lb+1) ub u v (acc + u'*v')
| otherwise = return acc
-- Inline pragma: ~103x speedup :)
{-# INLINE modArray #-}
modArray a i f = writeArray a i . f =<< readArray a i
-- Inline pragma: ~18x speedup
{-# INLINE iterateArray #-}
iterateArray arr f = getBounds arr >>= \(lb,ub) -> forM_ [lb..ub] f
ranges lb n ub = go lb ((ub - lb) `div` n) ub
where
go x n y | x < y = (x, min (x+n) y) : go (x+n+1) n y
go _ _ _ = []
-- Ugly hack to allow polymorphically creating arrays in both ST and IO (required by ST's phantom 's' type)
data Allocator m i e = forall a. MArray a e m =>
Allocator ((i, i) -> m (a i e))
allocDouble :: Ix i => Allocator (ST s) i Double
allocDouble = Allocator $
(newArray_ :: Ix i => (i,i) -> ST s (STUArray s i Double))
allocDoubleIO :: Ix i => Allocator IO i Double
allocDoubleIO = Allocator $
(newArray_ :: Ix i => (i,i) -> IO (IOUArray i Double))