Shootout/Spectral
A Shootout Entry for the spectral norm test
Timings
Debian Linux x86, n=2,500
||Entry || Time|| ||dons || 9.407|| ||old || 10.564||
Current entry
Damn fast.
{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
-- 2) changed -optc-O to -optc-O3
-- 3) added -optc-ffast-math
-- Translation from Clean by Don Stewart
--
-- Should be compiled with:
-- -O -fglasgow-exts -fbang-patterns
-- -optc-O3 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--
-- Note: The following flags appear to INCREASE running time:
-- -O2 -optc-funroll-loops
import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Internal
import GHC.Base
import GHC.Float
import GHC.Int
type Reals = Ptr Double
main = do
n <- getArgs >>= readIO . head
u <- mallocArray n :: IO Reals
forM_ [0..n-1] $ \i -> pokeElemOff u i 1
v <- mallocArray n :: IO Reals
forM_ [0..n-1] $ \i -> pokeElemOff v i 0
powerMethod 10 n u v
printf "%.9f\n" (eigenvalue n u v 0 0 0)
------------------------------------------------------------------------
eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
| i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
| otherwise = sqrt $! vBv / vv
where
ui = inlinePerformIO (peekElemOff u i)
vi = inlinePerformIO (peekElemOff v i)
------------------------------------------------------------------------
powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod !i !n !u !v = allocaArray n $ \t ->
replicateM_ i $ timesAtAv t n u v >> timesAtAv t n v u
-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
timesAv n u t
timesAtv n t atau
timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
where
go :: Int -> IO ()
go !i = when (i < n) $ do
pokeElemOff au i (avsum i 0 0)
go (i+1)
avsum :: Int -> Int -> Double -> Double
avsum !i !j !acc
| j < n = avsum i (j+1) (acc + ((aij i j) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)
timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
where
go :: Int -> IO ()
go !i = when (i < n) $ do
pokeElemOff a i (atvsum i 0 0)
go (i+1)
atvsum :: Int -> Int -> Double -> Double
atvsum !i !j !acc
| j < n = atvsum i (j+1) (acc + ((aij j i) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)
--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
case i +# j of
n -> case n *# (n+#1#) of
t -> case t `uncheckedIShiftRA#` 1# of
u -> case u +# (i +# 1#) of
r -> 1.0## /## (int2Double# r))
Proposed entry
Uses Ptr Double, and carefully unboxes the aij inner computation (where 70% of the time is spent). Runs 2.3x slower than C on my p4.
{-# OPTIONS -fexcess-precision #-}
--
-- The Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
--
-- Should be compiled with:
-- -O -fglasgow-exts -fbang-patterns
-- -optc-O -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2
--
import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Base
import GHC.Base
import GHC.Float
import GHC.Int
type Reals = Ptr Double
main = do
n <- getArgs >>= readIO . head
u <- mallocArray n :: IO Reals
forM_ [0..n-1] $ \i -> pokeElemOff u i 1
v <- mallocArray n :: IO Reals
forM_ [0..n-1] $ \i -> pokeElemOff v i 0
powerMethod 10 n u v
printf "%.9f\n" (eigenvalue n u v 0 0 0)
------------------------------------------------------------------------
eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
| i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
| otherwise = sqrt $! vBv / vv
where
ui = inlinePerformIO (peekElemOff u i)
vi = inlinePerformIO (peekElemOff v i)
------------------------------------------------------------------------
powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod !i !n !u !v = allocaArray n $ \t ->
replicateM_ i $ timesAtAv t n u v >> timesAtAv t n v u
-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
timesAv n u t
timesAtv n t atau
{-# INLINE timesAtAv #-}
timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
where
go :: Int -> IO ()
go !i = when (i < n) $ do
pokeElemOff au i (avsum i 0 0)
go (i+1)
avsum :: Int -> Int -> Double -> Double
avsum !i !j !acc
| j < n = avsum i (j+1) (acc + ((aij i j) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)
{-# INLINE timesAv #-}
timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
where
go :: Int -> IO ()
go !i = when (i < n) $ do
pokeElemOff a i (atvsum i 0 0)
go (i+1)
atvsum :: Int -> Int -> Double -> Double
atvsum !i !j !acc
| j < n = atvsum i (j+1) (acc + ((aij j i) * uj))
| otherwise = acc
where uj = inlinePerformIO (peekElemOff u j)
{-# INLINE timesAtv #-}
--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
case i +# j of
n -> case n *# (n+#1#) of
t -> case divInt# t 2# of
u -> case u +# (i +# 1#) of
r -> 1.0## /## (int2Double# r))
Current entry
Better gcc flags. Bang patterns. Submitted
Didn't do so well in practice though :/
{-# OPTIONS -O -fglasgow-exts -fbang-patterns -funbox-strict-fields -fexcess-precision -optc-O2 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 #-}
--
-- The Great Computer Language Shootout
-- http:--shootout.alioth.debian.org/
--
-- Contributed by Don Stewart
--
import Control.Monad
import System
import Text.Printf
import Data.Array.IO
import Data.Array.Base
main = getArgs >>= approximate . read . head >>= printf "%.9f\n"
approximate n = do
u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double)
v <- newArray (0,n-1) 0 :: IO (IOUArray Int Double)
replicateM_ 10 $ multiplyAtAv n u v >> multiplyAtAv n v u
let loop !vbv !vv !i
| i >= n = return (vbv,vv)
| otherwise = do ui <- unsafeRead u i
vi <- unsafeRead v i
loop (vbv + ui * vi) (vv + vi * vi) (i+1)
(vbv,vv) <- loop 0 0 0
return $! sqrt (vbv/vv)
-- return element i,j of infinite matrix A
a i j = 1 / fromIntegral (x*(x+1) `div` 2 + i + 1) where x = i+j
-- multiply vector v by matrix A */
multiplyAv !n !v !av = loop 0
where loop i = when (i < n) $ do avi <- loop' i 0 0
unsafeWrite av i avi >> loop (i+1)
loop' !i !j !av | j >= n = return av
| otherwise = do vj <- unsafeRead v j
loop' i (j+1) (av + a i j * vj)
-- multiply vector v by matrix A transposed
multiplyAtv !n !v !atv = loop 0
where loop i = when (i < n) $ do atvi <- loop' i 0 0
unsafeWrite atv i atvi >> loop (i+1)
loop' !i !j !atvi
| j >= n = return atvi
| otherwise = do vj <- unsafeRead v j
loop' i (j+1) (atvi + a j i * vj)
-- multiply vector v by matrix A and then by matrix A transposed */
multiplyAtAv !n !v !atav = do
u <- newArray (0,n-1) 0 :: IO (IOUArray Int Double)
multiplyAv n v u >> multiplyAtv n u atav
Old entry
GHC unboxed this better. Careful attention payed to the unboxing. -O2 -optc-O -optc-ffast-math -fexcess-precision
{-# OPTIONS -optc-O #-}
--
-- The Great Computer Language Shootout
-- http:--shootout.alioth.debian.org/
--
-- Contributed by Don Stewart
--
-- gcc miscompiles this program with -O3
--
import Control.Monad
import System
import Numeric
import Data.Array.IO
import Data.Array.Base
main = getArgs >>= approximate . read . head >>= putStrLn . \s -> showFFloat (Just 9) s []
approximate n = do
u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double)
v <- newArray_ (0,n-1) :: IO (IOUArray Int Double)
replicateM_ 10 $ multiplyAtAv n u v >> multiplyAtAv n v u
loop 0 0 0 n u v
loop vbv vv i n u v | vbv `seq` vv `seq` i `seq` n `seq` u `seq` v `seq` False = undefined
loop vbv vv i n u v | i >= n = return $! sqrt (vbv/vv)
| otherwise = do ui <- unsafeRead u i
vi <- unsafeRead v i
loop (vbv + ui * vi) (vv + vi * vi) (i+1) n u v
-- return element i,j of infinite matrix A
a i j | i `seq` j `seq` False = undefined
a i j = 1 / fromIntegral (x*(x+1) `div` 2 + i + 1) where x = i+j
-- multiply vector v by matrix A */
multiplyAv n v av | n `seq` v `seq` av `seq` False = undefined
multiplyAv n v av = loop 0
where loop i = when (i < n) $ loop' i 0 0 >>= unsafeWrite av i >> loop (i+1)
loop' i j av | i `seq` j `seq` av `seq` False = undefined
loop' i j av | j >= n = return av
| otherwise = do vj <- v `unsafeRead` j
loop' i (j+1) (av + a i j * vj)
-- multiply vector v by matrix A transposed
multiplyAtv n v atv | n `seq` v `seq` atv `seq` False = undefined
multiplyAtv n v atv = loop 0
where loop i = when (i < n) $ loop' i 0 0 >>= unsafeWrite atv i >> loop (i+1)
loop' i j atvi | j `seq` atvi `seq` False = undefined
loop' i j atvi | j >= n = return atvi
| otherwise = do vj <- v `unsafeRead` j
loop' i (j+1) (atvi + a j i * vj)
-- multiply vector v by matrix A and then by matrix A transposed */
multiplyAtAv n v atav | n `seq` v `seq` atav `seq` False = undefined
multiplyAtAv n v atav = do u <- newArray_ (0,n-1) :: IO (IOUArray Int Double)
multiplyAv n v u >> multiplyAtv n u atav
Current entry
-- The Great Computer Language Shootout
-- http:--shootout.alioth.debian.org/
--
-- Original C contributed by Sebastien Loisel
-- Conversion to C++ by Jon Harrop
-- Conversion to Haskell by Einar Karttunen
import Control.Monad
import Control.Monad.ST
import Data.Array.Base
import Data.Array.ST
import Numeric
import System
eval_A :: Int -> Int -> Double
eval_A i j = 1 / fromIntegral ((i+j)*(i+j+1) `div` 2 + i + 1)
plusAt :: STUArray s Int Double -> Int -> Double -> ST s ()
plusAt a i v = do o <- unsafeRead a i
unsafeWrite a i (v+o)
eval_A_Times_u :: STUArray s Int Double -> STUArray s Int Double -> ST s ()
eval_A_Times_u u au = outer (snd $ bounds u)
where outer 0 = unsafeWrite au 0 0 >> inner 0 (snd $ bounds u)
outer i = unsafeWrite au i 0 >> inner i (snd $ bounds u) >> outer (i-1)
inner i 0 = unsafeRead u 0 >>= \uj -> plusAt au i (eval_A i 0 * uj)
inner i j = unsafeRead u j >>= \uj -> plusAt au i (eval_A i j * uj) >> inner i (j-1)
eval_At_Times_u :: STUArray s Int Double -> STUArray s Int Double -> ST s ()
eval_At_Times_u u au = outer (snd $ bounds u)
where outer 0 = unsafeWrite au 0 0 >> inner 0 (snd $ bounds u)
outer i = unsafeWrite au i 0 >> inner i (snd $ bounds u) >> outer (i-1)
inner i 0 = unsafeRead u 0 >>= \uj -> plusAt au i (eval_A 0 i * uj)
inner i j = unsafeRead u j >>= \uj -> plusAt au i (eval_A j i * uj) >> inner i (j-1)
eval_AtA_Times_u u v = do w <- newArray (bounds u) 0
eval_A_Times_u u w >> eval_At_Times_u w v
main = do
n <- getArgs >>= return.read.head
let (vBv,vv) = runST (do u <- newArray (0,n-1) 1
v <- newArray (0,n-1) 0
replicateM_ 10 (eval_AtA_Times_u u v >> eval_AtA_Times_u v u)
vLoop u v n (0, 0))
putStrLn $ showFFloat (Just 9) (sqrt (vBv/vv)) ""
vLoop :: STUArray s Int Double -> STUArray s Int Double -> Int -> (Double,Double) -> ST s (Double,Double)
vLoop u v 0 a = return a
vLoop u v (i+1) (vBv,vv) = vLoop u v i =<< op
where op = do ui <- unsafeRead u i
vi <- unsafeRead v i
return (vBv+(ui*vi),vv+(vi*vi))
STUArray-based solution
One-line-change convertible to IOUArray, which seems to be exactly as fast as STUArray. The important part seems to be the unboxed version of the 'a' function and the iterateM_ hack (which I don't quite know how it works at all).
Otherwise, it seems to be just about exactly as fast as the unsafe Ptr Double versions.
{-# OPTIONS_GHC -fglasgow-exts #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Contributed by Simon Brenner
--
-- Compile with:
--
-- C-options:
-- -fvia-c -optc-O3 -optc-march=native -optc-mfpmath=sse -optc-msse2
-- -optc-ffast-math
-- GHC options:
-- -O2 -fexcess-precision
import Control.Monad.ST
import Control.Monad
import Data.Array.IO
import Data.Array.ST
import Data.Ix
import Data.Bits
import System.Environment
-- Various magic used for the unboxed version of the 'a' function
import GHC.Base
import GHC.Float
main = do
n <- fmap (read . head) getArgs
print (spectralNorm n)
-- Use this variant for IOUArray rather than STUArray
--print =<< spectralNormM allocDoubleIO n
spectralNorm :: Int -> Double
spectralNorm n = runST (spectralNormM allocDouble n)
-----------------
-- Benchmark code
spectralNormM allocator n = case allocator of
(Allocator alloc) -> do
let ubound = n-1
u <- alloc (0,ubound)
v <- alloc (0,ubound)
tmp <- alloc (0,ubound)
fill u 1.0
replicateM_ 10 $ do
multiplyAtAv ubound u tmp v
multiplyAtAv ubound v tmp u
vBv <- u .| v
vv <- v .| v
return $ sqrt (vBv / vv)
multiplyAtAv n v u atav = do
multiplyAv n v u
multiplyAtv n u atav
multiplyAv n v av = iterateM_ 0 n $ \i -> do
writeArray av i 0.0
iterateM_ 0 n $ \j -> do
v' <- readArray v j
modArray av i (+ ((a i j) * v'))
multiplyAtv n v atv = iterateM_ 0 n $ \i -> do
writeArray atv i 0.0
iterateM_ 0 n $ \j -> do
v' <- readArray v j
modArray atv i (+ ((a j i) * v'))
a :: Int -> Int -> Double
--a i j = 1.0 / fromIntegral (((i + j) * (i + j + 1)) `shiftR` 1 + i + 1)
--a i j = 1.0 / (((i' + j') * (i' + j' + 1) / 2) + i' + 1)
-- where i' = fromIntegral i; j' = fromIntegral j
a (I# i) (I# j) = D# (
case i +# j of
n -> case n *# (n+#1#) of
t -> case t `uncheckedIShiftRA#` 1# of
u -> case u +# (i +# 1#) of
r -> 1.0## /## (int2Double# r))
-----------------------
-- General utility code
fill u x = getBounds u >>= \(lb,ub) -> forM_ [lb..ub] (\i -> writeArray u i x)
u .| v = getBounds u >>= \(lb,ub) -> go lb ub u v 0
where
go lb ub u v acc
| lb <= ub = do
u' <- readArray u lb
v' <- readArray v lb
go (lb+1) ub u v (acc + u'*v')
| otherwise = return acc
-- Explicitly inlining this seems to stand for about a 24x speedup :)
{-# INLINE modArray #-}
modArray a i f = writeArray a i . f =<< readArray a i
-- This is really weird! Replacing iterateM_ x y with forM_ [x..y] in the loops
-- above makes the program go twice as slowly. (Really!)
{-# INLINE iterateM_ #-}
{-# RULES "inline-iterateM_" iterateM_ = \lb ub f -> forM_ [lb..ub] f #-}
iterateM_ lb ub = forM_ [lb..ub]
data Allocator m i e = forall a. MArray a e m =>
Allocator ((i, i) -> m (a i e))
allocDouble :: Ix i => Allocator (ST s) i Double
allocDouble = Allocator $
(newArray_ :: Ix i => (i,i) -> ST s (STUArray s i Double))
allocDoubleIO :: Ix i => Allocator IO i Double
allocDoubleIO = Allocator $
(newArray_ :: Ix i => (i,i) -> IO (IOUArray i Double))