Shootout/Spectral

From HaskellWiki
Jump to navigation Jump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

A Shootout Entry for the spectral norm test

Timings

Debian Linux x86, n=2,500

||Entry || Time|| ||dons || 9.407|| ||old || 10.564||

Current entry

Damn fast.

{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
--                           2) changed -optc-O to -optc-O3
--                           3) added -optc-ffast-math
-- Translation from Clean by Don Stewart
--
-- Should be compiled with:
--      -O -fglasgow-exts -fbang-patterns
--      -optc-O3 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--
-- Note: The following flags appear to INCREASE running time:
--      -O2 -optc-funroll-loops

import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Internal

import GHC.Base
import GHC.Float
import GHC.Int

type Reals = Ptr Double

main = do
    n <- getArgs >>= readIO . head

    u <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff u i 1
    v <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff v i 0

    powerMethod 10 n u v
    printf "%.9f\n" (eigenvalue n u v 0 0 0)

------------------------------------------------------------------------

eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
    | i < n     = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
    | otherwise = sqrt $! vBv / vv
    where
       ui = inlinePerformIO (peekElemOff u i)
       vi = inlinePerformIO (peekElemOff v i)

------------------------------------------------------------------------

powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod !i !n !u !v = allocaArray n $ \t ->
    replicateM_ i $ timesAtAv t n u v >> timesAtAv t n v u

-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
    timesAv  n u t
    timesAtv n t atau

timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff au i (avsum i 0 0)
        go (i+1)

    avsum :: Int -> Int -> Double -> Double
    avsum !i !j !acc
        | j < n = avsum i (j+1) (acc + ((aij i j) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)

timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff a i (atvsum i 0 0)
        go (i+1)

    atvsum :: Int -> Int -> Double -> Double
    atvsum !i !j !acc
        | j < n     = atvsum i (j+1) (acc + ((aij j i) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)

--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
    case i +# j of
        n -> case n *# (n+#1#) of
                t -> case t `uncheckedIShiftRA#` 1# of
                        u -> case u +# (i +# 1#) of
                                r -> 1.0## /## (int2Double# r))

Proposed entry

Uses Ptr Double, and carefully unboxes the aij inner computation (where 70% of the time is spent). Runs 2.3x slower than C on my p4.

Submitted

{-# OPTIONS -fexcess-precision #-}
--
-- The Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
--
-- Should be compiled with:
--      -O -fglasgow-exts -fbang-patterns 
--      -optc-O -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2
--

import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Base

import GHC.Base
import GHC.Float
import GHC.Int

type Reals = Ptr Double

main = do
    n <- getArgs >>= readIO . head

    u <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff u i 1
    v <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff v i 0

    powerMethod 10 n u v
    printf "%.9f\n" (eigenvalue n u v 0 0 0)

------------------------------------------------------------------------

eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
    | i < n     = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
    | otherwise = sqrt $! vBv / vv
    where
       ui = inlinePerformIO (peekElemOff u i)
       vi = inlinePerformIO (peekElemOff v i)

------------------------------------------------------------------------

powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod !i !n !u !v = allocaArray n $ \t ->
    replicateM_ i $ timesAtAv t n u v >> timesAtAv t n v u

-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
    timesAv  n u t
    timesAtv n t atau
{-# INLINE timesAtAv #-}

timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff au i (avsum i 0 0)
        go (i+1)

    avsum :: Int -> Int -> Double -> Double
    avsum !i !j !acc
        | j < n = avsum i (j+1) (acc + ((aij i j) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)
{-# INLINE timesAv #-}

timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff a i (atvsum i 0 0)
        go (i+1)

    atvsum :: Int -> Int -> Double -> Double
    atvsum !i !j !acc
        | j < n     = atvsum i (j+1) (acc + ((aij j i) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)
{-# INLINE timesAtv #-}

--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
    case i +# j of
        n -> case n *# (n+#1#) of
                t -> case divInt# t 2# of
                        u -> case u +# (i +# 1#) of
                                r -> 1.0## /## (int2Double# r))

Current entry

Better gcc flags. Bang patterns. Submitted

Didn't do so well in practice though :/

{-# OPTIONS -O -fglasgow-exts -fbang-patterns -funbox-strict-fields -fexcess-precision -optc-O2 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 #-}
--
-- The Great Computer Language Shootout
-- http:--shootout.alioth.debian.org/
--
-- Contributed by Don Stewart
--

import Control.Monad
import System
import Text.Printf
import Data.Array.IO
import Data.Array.Base

main = getArgs >>= approximate . read . head >>= printf "%.9f\n"

approximate n = do
    u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double)
    v <- newArray (0,n-1) 0 :: IO (IOUArray Int Double)
    replicateM_ 10 $ multiplyAtAv n u v >> multiplyAtAv n v u

    let loop !vbv !vv !i
            | i >= n    = return (vbv,vv)
            | otherwise = do ui <- unsafeRead u i
                             vi <- unsafeRead v i
                             loop (vbv + ui * vi) (vv + vi * vi) (i+1)
    (vbv,vv) <- loop 0 0 0
    return $! sqrt (vbv/vv)

-- return element i,j of infinite matrix A
a i j = 1 / fromIntegral (x*(x+1) `div` 2 + i + 1) where x = i+j

-- multiply vector v by matrix A */
multiplyAv !n !v !av = loop 0
    where loop i  = when (i < n) $ do avi <- loop' i 0 0
                                      unsafeWrite av i avi >> loop (i+1)
          loop' !i !j !av | j >= n    = return av
                          | otherwise = do vj  <- unsafeRead v j
                                           loop' i (j+1) (av + a i j * vj)

-- multiply vector v by matrix A transposed
multiplyAtv !n !v !atv = loop 0
    where loop i = when (i < n) $ do atvi <- loop' i 0 0
                                     unsafeWrite atv i atvi >> loop (i+1)
          loop' !i !j !atvi
                | j >= n    = return atvi
                | otherwise = do vj <- unsafeRead v j
                                 loop' i (j+1) (atvi + a j i * vj)

-- multiply vector v by matrix A and then by matrix A transposed */
multiplyAtAv !n !v !atav = do
    u  <- newArray (0,n-1) 0 :: IO (IOUArray Int Double)
    multiplyAv n v u >> multiplyAtv n u atav

Old entry

GHC unboxed this better. Careful attention payed to the unboxing. -O2 -optc-O -optc-ffast-math -fexcess-precision

{-# OPTIONS -optc-O #-}
--
-- The Great Computer Language Shootout
-- http:--shootout.alioth.debian.org/
--
-- Contributed by Don Stewart
--
-- gcc miscompiles this program with -O3
-- 

import Control.Monad
import System
import Numeric
import Data.Array.IO
import Data.Array.Base

main = getArgs >>= approximate . read . head >>= putStrLn . \s -> showFFloat (Just 9) s []

approximate n = do
    u <- newArray  (0,n-1) 1 :: IO (IOUArray Int Double)
    v <- newArray_ (0,n-1)   :: IO (IOUArray Int Double)
    replicateM_ 10 $ multiplyAtAv n u v >> multiplyAtAv n v u
    loop 0 0 0 n u v

loop vbv vv i n u v | vbv `seq` vv `seq` i `seq` n `seq` u `seq` v `seq` False = undefined
loop vbv vv i n u v | i >= n    = return $! sqrt (vbv/vv)
                    | otherwise = do ui <- unsafeRead u i
                                     vi <- unsafeRead v i
                                     loop (vbv + ui * vi) (vv + vi * vi) (i+1) n u v

-- return element i,j of infinite matrix A 
a i j | i `seq` j `seq` False = undefined
a i j = 1 / fromIntegral (x*(x+1) `div` 2 + i + 1) where x = i+j

-- multiply vector v by matrix A */
multiplyAv n v av | n `seq` v `seq` av `seq` False = undefined
multiplyAv n v av = loop 0
    where loop i  = when (i < n) $ loop' i 0 0 >>= unsafeWrite av i >> loop (i+1)
          loop' i j av | i `seq` j `seq` av `seq` False = undefined
          loop' i j av | j >= n    = return av 
                       | otherwise = do vj  <- v `unsafeRead` j
                                        loop' i (j+1) (av + a i j * vj)

-- multiply vector v by matrix A transposed
multiplyAtv n v atv | n `seq` v `seq` atv `seq` False = undefined
multiplyAtv n v atv = loop 0
    where loop i = when (i < n) $ loop' i 0 0 >>= unsafeWrite atv i >> loop (i+1)
          loop' i j atvi | j `seq` atvi `seq` False = undefined
          loop' i j atvi | j >= n    = return atvi 
                         | otherwise = do vj <- v `unsafeRead` j 
                                          loop' i (j+1) (atvi + a j i * vj)

-- multiply vector v by matrix A and then by matrix A transposed */
multiplyAtAv n v atav | n `seq` v `seq` atav `seq` False = undefined
multiplyAtAv n v atav = do u <- newArray_ (0,n-1) :: IO (IOUArray Int Double)
                           multiplyAv n v u >> multiplyAtv n u atav

Current entry

-- The Great Computer Language Shootout
-- http:--shootout.alioth.debian.org/
--
-- Original C contributed by Sebastien Loisel
-- Conversion to C++ by Jon Harrop
-- Conversion to Haskell by Einar Karttunen

import Control.Monad
import Control.Monad.ST
import Data.Array.Base
import Data.Array.ST
import Numeric
import System

eval_A :: Int -> Int -> Double
eval_A i j = 1 / fromIntegral ((i+j)*(i+j+1) `div` 2 + i + 1)

plusAt :: STUArray s Int Double -> Int -> Double -> ST s ()
plusAt a i v = do o <- unsafeRead a i
                  unsafeWrite a i (v+o)

eval_A_Times_u :: STUArray s Int Double -> STUArray s Int Double -> ST s ()
eval_A_Times_u u au = outer (snd $ bounds u)
    where outer 0 = unsafeWrite au 0 0 >> inner 0 (snd $ bounds u)
          outer i = unsafeWrite au i 0 >> inner i (snd $ bounds u) >> outer (i-1)
          inner i 0 = unsafeRead u 0 >>= \uj -> plusAt au i (eval_A i 0 * uj)
          inner i j = unsafeRead u j >>= \uj -> plusAt au i (eval_A i j * uj) >> inner i (j-1)

eval_At_Times_u :: STUArray s Int Double -> STUArray s Int Double -> ST s ()
eval_At_Times_u u au = outer (snd $ bounds u)
    where outer 0 = unsafeWrite au 0 0 >> inner 0 (snd $ bounds u)
          outer i = unsafeWrite au i 0 >> inner i (snd $ bounds u) >> outer (i-1)
          inner i 0 = unsafeRead u 0 >>= \uj -> plusAt au i (eval_A 0 i * uj)
          inner i j = unsafeRead u j >>= \uj -> plusAt au i (eval_A j i * uj) >> inner i (j-1)

eval_AtA_Times_u u v = do w <- newArray (bounds u) 0
                          eval_A_Times_u u w >> eval_At_Times_u w v

main = do
  n <- getArgs >>= return.read.head
  let (vBv,vv) = runST (do u <- newArray (0,n-1) 1
                           v <- newArray (0,n-1) 0
                           replicateM_ 10 (eval_AtA_Times_u u v >> eval_AtA_Times_u v u)
                           vLoop u v n (0, 0))
  putStrLn $ showFFloat (Just 9) (sqrt (vBv/vv)) ""

vLoop :: STUArray s Int Double -> STUArray s Int Double -> Int -> (Double,Double) -> ST s (Double,Double)
vLoop u v 0 a            = return a
vLoop u v (i+1) (vBv,vv) = vLoop u v i =<< op
    where op = do ui <- unsafeRead u i
                  vi <- unsafeRead v i
                  return (vBv+(ui*vi),vv+(vi*vi))

STUArray-based solution

One-line-change convertible to IOUArray, which seems to be exactly as fast as STUArray. The important part seems to be the unboxed version of the 'a' function and the iterateM_ hack (which I don't quite know how it works at all).

Otherwise, it seems to be just about exactly as fast as the unsafe Ptr Double versions.

{-# OPTIONS_GHC -fglasgow-exts #-}

--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Contributed by Simon Brenner
--
-- Compile with:
--
-- C-options:
--    -fvia-c -optc-O3 -optc-march=native -optc-mfpmath=sse -optc-msse2
--    -optc-ffast-math
-- GHC options:
--    -O2 -fexcess-precision

import Control.Monad.ST
import Control.Monad

import Data.Array.IO
import Data.Array.ST
import Data.Ix
import Data.Bits

import System.Environment

-- Various magic used for the unboxed version of the 'a' function
import GHC.Base
import GHC.Float

main = do
	n <- fmap (read . head) getArgs
	print (spectralNorm n)
	-- Use this variant for IOUArray rather than STUArray
	--print =<< spectralNormM allocDoubleIO n

spectralNorm :: Int -> Double
spectralNorm n = runST (spectralNormM allocDouble n)

-----------------
-- Benchmark code

spectralNormM allocator n = case allocator of
  (Allocator alloc) -> do
	let ubound = n-1
	u <- alloc (0,ubound)
	v <- alloc (0,ubound)
	tmp <- alloc (0,ubound)

	fill u 1.0

	replicateM_ 10 $ do
		multiplyAtAv ubound u tmp v
		multiplyAtAv ubound v tmp u
	
	vBv <- u .| v
	vv <- v .| v
	
	return $ sqrt (vBv / vv)

multiplyAtAv n v u atav = do
	multiplyAv n v u
	multiplyAtv n u atav

multiplyAv n v av = iterateM_ 0 n $ \i -> do
	writeArray av i 0.0
	iterateM_ 0 n $ \j -> do
		v' <- readArray v j
		modArray av i (+ ((a i j) * v'))

multiplyAtv n v atv = iterateM_ 0 n $ \i -> do
	writeArray atv i 0.0
	iterateM_ 0 n $ \j -> do
		v' <- readArray v j
		modArray atv i (+ ((a j i) * v'))

a :: Int -> Int -> Double
--a i j = 1.0 / fromIntegral (((i + j) * (i + j + 1)) `shiftR` 1 + i + 1)
--a i j = 1.0 / (((i' + j') * (i' + j' + 1) / 2) + i' + 1)
--	where i' = fromIntegral i; j' = fromIntegral j
a (I# i) (I# j) = D# (
	case i +# j of
		n -> case n *# (n+#1#) of
			t -> case t `uncheckedIShiftRA#` 1# of
				u -> case u +# (i +# 1#) of
					r -> 1.0## /## (int2Double# r))

-----------------------
-- General utility code

fill u x = getBounds u >>= \(lb,ub) -> forM_ [lb..ub] (\i -> writeArray u i x)

u .| v = getBounds u >>= \(lb,ub) -> go lb ub u v 0
	where
		go lb ub u v acc
			| lb <= ub = do
				u' <- readArray u lb
				v' <- readArray v lb
				go (lb+1) ub u v (acc + u'*v')
			| otherwise = return acc

-- Explicitly inlining this seems to stand for about a 24x speedup :)
{-# INLINE modArray #-}
modArray a i f = writeArray a i . f =<< readArray a i

-- This is really weird! Replacing iterateM_ x y with forM_ [x..y] in the loops
-- above makes the program go twice as slowly. (Really!)

{-# INLINE iterateM_ #-}
{-# RULES "inline-iterateM_" iterateM_ = \lb ub f -> forM_ [lb..ub] f #-}
iterateM_ lb ub = forM_ [lb..ub]

data Allocator m i e = forall a. MArray a e m => 
                       Allocator ((i, i) -> m (a i e))

allocDouble :: Ix i => Allocator (ST s) i Double
allocDouble = Allocator $
	(newArray_ :: Ix i => (i,i) -> ST s (STUArray s i Double))

allocDoubleIO :: Ix i => Allocator IO i Double
allocDoubleIO = Allocator $
	(newArray_ :: Ix i => (i,i) -> IO (IOUArray i Double))