Difference between revisions of "Benchmarks Game/Parallel/SpectralNorm"
< Benchmarks Game | Parallel
Jump to navigation
Jump to search
DonStewart (talk | contribs) |
DonStewart (talk | contribs) |
||
Line 1: | Line 1: | ||
Parallel version of [http://shootout.alioth.debian.org/u64q/benchmark.php?test=spectralnorm&lang=all spectral-norm] |
Parallel version of [http://shootout.alioth.debian.org/u64q/benchmark.php?test=spectralnorm&lang=all spectral-norm] |
||
+ | == Slightly nicer == |
||
− | Easy parallelisation. |
||
+ | |||
+ | <haskell> |
||
+ | {-# LANGUAGE BangPatterns, MagicHash #-} |
||
+ | {-# OPTIONS -fvia-C -fexcess-precision #-} |
||
+ | -- |
||
+ | -- The Computer Language Benchmarks Game |
||
+ | -- http://shootout.alioth.debian.org/ |
||
+ | -- |
||
+ | -- Translation from Clean by Don Stewart |
||
+ | -- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA# |
||
+ | -- 2) changed -optc-O to -optc-O3 |
||
+ | -- 3) added -optc-ffast-math |
||
+ | -- Parallelised by Don Stewart |
||
+ | -- |
||
+ | -- Should be compiled with: |
||
+ | -- -O2 -threaded --make |
||
+ | -- -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math |
||
+ | -- |
||
+ | |||
+ | import System |
||
+ | import Foreign.Marshal.Array |
||
+ | import Foreign |
||
+ | import Text.Printf |
||
+ | import Control.Monad |
||
+ | import Data.ByteString.Internal |
||
+ | |||
+ | import GHC.Base |
||
+ | import GHC.Float |
||
+ | import GHC.Int |
||
+ | |||
+ | import Control.Concurrent |
||
+ | import Control.Exception |
||
+ | |||
+ | type Reals = Ptr Double |
||
+ | |||
+ | main = do |
||
+ | n <- getArgs >>= readIO . head |
||
+ | |||
+ | u <- mallocArray n :: IO Reals |
||
+ | forM_ [0..n-1] $ \i -> pokeElemOff u i 1 |
||
+ | v <- mallocArray n :: IO Reals |
||
+ | forM_ [0..n-1] $ \i -> pokeElemOff v i 0 |
||
+ | |||
+ | powerMethod 10 n u v |
||
+ | printf "%.9f\n" (eigenvalue n u v 0 0 0) |
||
+ | |||
+ | ------------------------------------------------------------------------ |
||
+ | |||
+ | eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double |
||
+ | eigenvalue !n !u !v !i !vBv !vv |
||
+ | | i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi) |
||
+ | | otherwise = sqrt $! vBv / vv |
||
+ | where |
||
+ | ui = inlinePerformIO (peekElemOff u i) |
||
+ | vi = inlinePerformIO (peekElemOff v i) |
||
+ | |||
+ | ------------------------------------------------------------------------ |
||
+ | |||
+ | powerMethod :: Int -> Int -> Reals -> Reals -> IO () |
||
+ | powerMethod i n u v = |
||
+ | mapM_ takeMVar =<< replicateM i ( do me <- newEmptyMVar; |
||
+ | forkIO (child `finally` putMVar me ()); |
||
+ | return me ) |
||
+ | where |
||
+ | child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u |
||
+ | |||
+ | {- |
||
+ | powerMethod i n u v = do |
||
+ | -- roll our own synchronisation |
||
+ | children <- newMVar [] |
||
+ | replicateM_ i $ do |
||
+ | me <- newEmptyMVar |
||
+ | cs <- takeMVar children |
||
+ | putMVar children (me : cs) |
||
+ | forkIO (child `finally` putMVar me ()) |
||
+ | wait children |
||
+ | where |
||
+ | child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u |
||
+ | -} |
||
+ | |||
+ | {- |
||
+ | -- wait on children |
||
+ | wait box = do |
||
+ | cs <- takeMVar box |
||
+ | case cs of |
||
+ | [] -> return () |
||
+ | m:ms -> putMVar box ms >> takeMVar m >> wait box |
||
+ | -} |
||
+ | |||
+ | -- multiply vector v by matrix A and then by matrix A transposed |
||
+ | timesAtAv :: Reals -> Int -> Reals -> Reals -> IO () |
||
+ | timesAtAv !t !n !u !atau = do |
||
+ | timesAv n u t |
||
+ | timesAtv n t atau |
||
+ | |||
+ | timesAv :: Int -> Reals -> Reals -> IO () |
||
+ | timesAv !n !u !au = go 0 |
||
+ | where |
||
+ | go :: Int -> IO () |
||
+ | go !i = when (i < n) $ do |
||
+ | pokeElemOff au i (avsum i 0 0) |
||
+ | go (i+1) |
||
+ | |||
+ | avsum :: Int -> Int -> Double -> Double |
||
+ | avsum !i !j !acc |
||
+ | | j < n = avsum i (j+1) (acc + ((aij i j) * uj)) |
||
+ | | otherwise = acc |
||
+ | where uj = inlinePerformIO (peekElemOff u j) |
||
+ | |||
+ | timesAtv :: Int -> Reals -> Reals -> IO () |
||
+ | timesAtv !n !u !a = go 0 |
||
+ | where |
||
+ | go :: Int -> IO () |
||
+ | go !i = when (i < n) $ do |
||
+ | pokeElemOff a i (atvsum i 0 0) |
||
+ | go (i+1) |
||
+ | |||
+ | atvsum :: Int -> Int -> Double -> Double |
||
+ | atvsum !i !j !acc |
||
+ | | j < n = atvsum i (j+1) (acc + ((aij j i) * uj)) |
||
+ | | otherwise = acc |
||
+ | where uj = inlinePerformIO (peekElemOff u j) |
||
+ | |||
+ | -- |
||
+ | -- manually unbox the inner loop: |
||
+ | -- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1) |
||
+ | -- |
||
+ | aij (I# i) (I# j) = D# ( |
||
+ | case i +# j of |
||
+ | n -> case n *# (n+#1#) of |
||
+ | t -> case t `uncheckedIShiftRA#` 1# of |
||
+ | u -> case u +# (i +# 1#) of |
||
+ | r -> 1.0## /## (int2Double# r)) |
||
+ | </haskell> |
||
+ | |||
+ | == Easy parallelisation == |
||
Takes the existing single core spectral-norm Haskell entry, and |
Takes the existing single core spectral-norm Haskell entry, and |
||
Line 28: | Line 164: | ||
./A 5500 12.56s user 0.01s system 99% cpu |
./A 5500 12.56s user 0.01s system 99% cpu |
||
12.586 total |
12.586 total |
||
+ | |||
+ | The code: |
||
<haskell> |
<haskell> |
Revision as of 20:33, 20 September 2008
Parallel version of spectral-norm
Slightly nicer
{-# LANGUAGE BangPatterns, MagicHash #-}
{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
-- 2) changed -optc-O to -optc-O3
-- 3) added -optc-ffast-math
-- Parallelised by Don Stewart
--
-- Should be compiled with:
-- -O2 -threaded --make
-- -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--
import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Internal
import GHC.Base
import GHC.Float
import GHC.Int
import Control.Concurrent
import Control.Exception
type Reals = Ptr Double
main = do
n <- getArgs >>= readIO . head
u <- mallocArray n :: IO Reals
forM_ [0..n-1] $ \i -> pokeElemOff u i 1
v <- mallocArray n :: IO Reals
forM_ [0..n-1] $ \i -> pokeElemOff v i 0
powerMethod 10 n u v
printf "%.9f\n" (eigenvalue n u v 0 0 0)
------------------------------------------------------------------------
eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
| i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
| otherwise = sqrt $! vBv / vv
where
ui = inlinePerformIO (peekElemOff u i)
vi = inlinePerformIO (peekElemOff v i)
------------------------------------------------------------------------
powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod i n u v =
mapM_ takeMVar =<< replicateM i ( do me <- newEmptyMVar;
forkIO (child `finally` putMVar me ());
return me )
where
child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
{-
powerMethod i n u v = do
-- roll our own synchronisation
children <- newMVar []
replicateM_ i $ do
me <- newEmptyMVar
cs <- takeMVar children
putMVar children (me : cs)
forkIO (child `finally` putMVar me ())
wait children
where
child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
-}
{-
-- wait on children
wait box = do
cs <- takeMVar box
case cs of
[] -> return ()
m:ms -> putMVar box ms >> takeMVar m >> wait box
-}
-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
timesAv n u t
timesAtv n t atau
timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
where
go :: Int -> IO ()
go !i = when (i < n) $ do
pokeElemOff au i (avsum i 0 0)
go (i+1)
avsum :: Int -> Int -> Double -> Double
avsum !i !j !acc
| j < n = avsum i (j+1) (acc + ((aij i j) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)
timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
where
go :: Int -> IO ()
go !i = when (i < n) $ do
pokeElemOff a i (atvsum i 0 0)
go (i+1)
atvsum :: Int -> Int -> Double -> Double
atvsum !i !j !acc
| j < n = atvsum i (j+1) (acc + ((aij j i) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)
--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
case i +# j of
n -> case n *# (n+#1#) of
t -> case t `uncheckedIShiftRA#` 1# of
u -> case u +# (i +# 1#) of
r -> 1.0## /## (int2Double# r))
Easy parallelisation
Takes the existing single core spectral-norm Haskell entry, and parallelises the main loop using explicit concurrency based on forkIO and MVars.
Should give reasonable cpu utilisation, and overall speedup.
Compiling:
$ ghc C.hs -O2 -optc-O3 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 -optc-ffast-math -threaded --make
(That is, pretty much the same flags, but with -threaded) Running:
$ time ./C 5500 +RTS -N5 1.274224153 ./C 5500 +RTS -N5 12.57s user 0.01s system 339% cpu 3.708 total
(Just add +RTS -N5 -RTS) Using 5 capabilities (to hide latency). Compared to current sequential entry,
$ time ./A 5500 1.274224153 ./A 5500 12.56s user 0.01s system 99% cpu 12.586 total
The code:
{-# LANGUAGE BangPatterns, MagicHash #-}
{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
-- 2) changed -optc-O to -optc-O3
-- 3) added -optc-ffast-math
-- Parallelised by Don Stewart
--
-- Should be compiled with:
-- -O2 -threaded --make
-- -optc-O2 -optc-march=native -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--
import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Internal
import GHC.Base
import GHC.Float
import GHC.Int
import Control.Concurrent
import Control.Exception
type Reals = Ptr Double
main = do
n <- getArgs >>= readIO . head
u <- mallocArray n :: IO Reals
forM_ [0..n-1] $ \i -> pokeElemOff u i 1
v <- mallocArray n :: IO Reals
forM_ [0..n-1] $ \i -> pokeElemOff v i 0
powerMethod 10 n u v
printf "%.9f\n" (eigenvalue n u v 0 0 0)
------------------------------------------------------------------------
eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
| i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
| otherwise = sqrt $! vBv / vv
where
ui = inlinePerformIO (peekElemOff u i)
vi = inlinePerformIO (peekElemOff v i)
------------------------------------------------------------------------
powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod i n u v = do
-- roll our own synchronisation
children <- newMVar []
replicateM_ i $ do
me <- newEmptyMVar
cs <- takeMVar children
putMVar children (me : cs)
forkIO (child `finally` putMVar me ())
wait children
where
child = allocaArray n $ \t -> timesAtAv t n u v >> timesAtAv t n v u
-- wait on children
wait box = do
cs <- takeMVar box
case cs of
[] -> return ()
m:ms -> putMVar box ms >> takeMVar m >> wait box
-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
timesAv n u t
timesAtv n t atau
timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
where
go :: Int -> IO ()
go !i = when (i < n) $ do
pokeElemOff au i (avsum i 0 0)
go (i+1)
avsum :: Int -> Int -> Double -> Double
avsum !i !j !acc
| j < n = avsum i (j+1) (acc + ((aij i j) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)
timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
where
go :: Int -> IO ()
go !i = when (i < n) $ do
pokeElemOff a i (atvsum i 0 0)
go (i+1)
atvsum :: Int -> Int -> Double -> Double
atvsum !i !j !acc
| j < n = atvsum i (j+1) (acc + ((aij j i) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)
--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
case i +# j of
n -> case n *# (n+#1#) of
t -> case t `uncheckedIShiftRA#` 1# of
u -> case u +# (i +# 1#) of
r -> 1.0## /## (int2Double# r))