Shootout/Spectral

From HaskellWiki
Jump to navigation Jump to search

A Shootout Entry for the spectral norm test

Timings

Debian Linux x86, n=2,500

||Entry || Time|| ||dons || 9.407|| ||old || 10.564||

Current entry

Damn fast.

{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
--                           2) changed -optc-O to -optc-O3
--                           3) added -optc-ffast-math
-- Translation from Clean by Don Stewart
--
-- Should be compiled with:
--      -O -fglasgow-exts -fbang-patterns
--      -optc-O3 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--
-- Note: The following flags appear to INCREASE running time:
--      -O2 -optc-funroll-loops

import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Internal

import GHC.Base
import GHC.Float
import GHC.Int

type Reals = Ptr Double

main = do
    n <- getArgs >>= readIO . head

    u <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff u i 1
    v <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff v i 0

    powerMethod 10 n u v
    printf "%.9f\n" (eigenvalue n u v 0 0 0)

------------------------------------------------------------------------

eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
    | i < n     = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
    | otherwise = sqrt $! vBv / vv
    where
       ui = inlinePerformIO (peekElemOff u i)
       vi = inlinePerformIO (peekElemOff v i)

------------------------------------------------------------------------

powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod !i !n !u !v = allocaArray n $ \t ->
    replicateM_ i $ timesAtAv t n u v >> timesAtAv t n v u

-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
    timesAv  n u t
    timesAtv n t atau

timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff au i (avsum i 0 0)
        go (i+1)

    avsum :: Int -> Int -> Double -> Double
    avsum !i !j !acc
        | j < n = avsum i (j+1) (acc + ((aij i j) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)

timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff a i (atvsum i 0 0)
        go (i+1)

    atvsum :: Int -> Int -> Double -> Double
    atvsum !i !j !acc
        | j < n     = atvsum i (j+1) (acc + ((aij j i) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)

--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
    case i +# j of
        n -> case n *# (n+#1#) of
                t -> case t `uncheckedIShiftRA#` 1# of
                        u -> case u +# (i +# 1#) of
                                r -> 1.0## /## (int2Double# r))

Proposed entry

Uses Ptr Double, and carefully unboxes the aij inner computation (where 70% of the time is spent). Runs 2.3x slower than C on my p4.

Submitted

{-# OPTIONS -fexcess-precision #-}
--
-- The Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
--
-- Should be compiled with:
--      -O -fglasgow-exts -fbang-patterns 
--      -optc-O -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2
--

import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Base

import GHC.Base
import GHC.Float
import GHC.Int

type Reals = Ptr Double

main = do
    n <- getArgs >>= readIO . head

    u <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff u i 1
    v <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff v i 0

    powerMethod 10 n u v
    printf "%.9f\n" (eigenvalue n u v 0 0 0)

------------------------------------------------------------------------

eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
    | i < n     = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
    | otherwise = sqrt $! vBv / vv
    where
       ui = inlinePerformIO (peekElemOff u i)
       vi = inlinePerformIO (peekElemOff v i)

------------------------------------------------------------------------

powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod !i !n !u !v = allocaArray n $ \t ->
    replicateM_ i $ timesAtAv t n u v >> timesAtAv t n v u

-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
    timesAv  n u t
    timesAtv n t atau
{-# INLINE timesAtAv #-}

timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff au i (avsum i 0 0)
        go (i+1)

    avsum :: Int -> Int -> Double -> Double
    avsum !i !j !acc
        | j < n = avsum i (j+1) (acc + ((aij i j) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)
{-# INLINE timesAv #-}

timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff a i (atvsum i 0 0)
        go (i+1)

    atvsum :: Int -> Int -> Double -> Double
    atvsum !i !j !acc
        | j < n     = atvsum i (j+1) (acc + ((aij j i) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)
{-# INLINE timesAtv #-}

--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
    case i +# j of
        n -> case n *# (n+#1#) of
                t -> case divInt# t 2# of
                        u -> case u +# (i +# 1#) of
                                r -> 1.0## /## (int2Double# r))

Current entry

Better gcc flags. Bang patterns. Submitted

Didn't do so well in practice though :/

{-# OPTIONS -O -fglasgow-exts -fbang-patterns -funbox-strict-fields -fexcess-precision -optc-O2 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 #-}
--
-- The Great Computer Language Shootout
-- http:--shootout.alioth.debian.org/
--
-- Contributed by Don Stewart
--

import Control.Monad
import System
import Text.Printf
import Data.Array.IO
import Data.Array.Base

main = getArgs >>= approximate . read . head >>= printf "%.9f\n"

approximate n = do
    u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double)
    v <- newArray (0,n-1) 0 :: IO (IOUArray Int Double)
    replicateM_ 10 $ multiplyAtAv n u v >> multiplyAtAv n v u

    let loop !vbv !vv !i
            | i >= n    = return (vbv,vv)
            | otherwise = do ui <- unsafeRead u i
                             vi <- unsafeRead v i
                             loop (vbv + ui * vi) (vv + vi * vi) (i+1)
    (vbv,vv) <- loop 0 0 0
    return $! sqrt (vbv/vv)

-- return element i,j of infinite matrix A
a i j = 1 / fromIntegral (x*(x+1) `div` 2 + i + 1) where x = i+j

-- multiply vector v by matrix A */
multiplyAv !n !v !av = loop 0
    where loop i  = when (i < n) $ do avi <- loop' i 0 0
                                      unsafeWrite av i avi >> loop (i+1)
          loop' !i !j !av | j >= n    = return av
                          | otherwise = do vj  <- unsafeRead v j
                                           loop' i (j+1) (av + a i j * vj)

-- multiply vector v by matrix A transposed
multiplyAtv !n !v !atv = loop 0
    where loop i = when (i < n) $ do atvi <- loop' i 0 0
                                     unsafeWrite atv i atvi >> loop (i+1)
          loop' !i !j !atvi
                | j >= n    = return atvi
                | otherwise = do vj <- unsafeRead v j
                                 loop' i (j+1) (atvi + a j i * vj)

-- multiply vector v by matrix A and then by matrix A transposed */
multiplyAtAv !n !v !atav = do
    u  <- newArray (0,n-1) 0 :: IO (IOUArray Int Double)
    multiplyAv n v u >> multiplyAtv n u atav

Old entry

GHC unboxed this better. Careful attention payed to the unboxing. -O2 -optc-O -optc-ffast-math -fexcess-precision

{-# OPTIONS -optc-O #-}
--
-- The Great Computer Language Shootout
-- http:--shootout.alioth.debian.org/
--
-- Contributed by Don Stewart
--
-- gcc miscompiles this program with -O3
-- 

import Control.Monad
import System
import Numeric
import Data.Array.IO
import Data.Array.Base

main = getArgs >>= approximate . read . head >>= putStrLn . \s -> showFFloat (Just 9) s []

approximate n = do
    u <- newArray  (0,n-1) 1 :: IO (IOUArray Int Double)
    v <- newArray_ (0,n-1)   :: IO (IOUArray Int Double)
    replicateM_ 10 $ multiplyAtAv n u v >> multiplyAtAv n v u
    loop 0 0 0 n u v

loop vbv vv i n u v | vbv `seq` vv `seq` i `seq` n `seq` u `seq` v `seq` False = undefined
loop vbv vv i n u v | i >= n    = return $! sqrt (vbv/vv)
                    | otherwise = do ui <- unsafeRead u i
                                     vi <- unsafeRead v i
                                     loop (vbv + ui * vi) (vv + vi * vi) (i+1) n u v

-- return element i,j of infinite matrix A 
a i j | i `seq` j `seq` False = undefined
a i j = 1 / fromIntegral (x*(x+1) `div` 2 + i + 1) where x = i+j

-- multiply vector v by matrix A */
multiplyAv n v av | n `seq` v `seq` av `seq` False = undefined
multiplyAv n v av = loop 0
    where loop i  = when (i < n) $ loop' i 0 0 >>= unsafeWrite av i >> loop (i+1)
          loop' i j av | i `seq` j `seq` av `seq` False = undefined
          loop' i j av | j >= n    = return av 
                       | otherwise = do vj  <- v `unsafeRead` j
                                        loop' i (j+1) (av + a i j * vj)

-- multiply vector v by matrix A transposed
multiplyAtv n v atv | n `seq` v `seq` atv `seq` False = undefined
multiplyAtv n v atv = loop 0
    where loop i = when (i < n) $ loop' i 0 0 >>= unsafeWrite atv i >> loop (i+1)
          loop' i j atvi | j `seq` atvi `seq` False = undefined
          loop' i j atvi | j >= n    = return atvi 
                         | otherwise = do vj <- v `unsafeRead` j 
                                          loop' i (j+1) (atvi + a j i * vj)

-- multiply vector v by matrix A and then by matrix A transposed */
multiplyAtAv n v atav | n `seq` v `seq` atav `seq` False = undefined
multiplyAtAv n v atav = do u <- newArray_ (0,n-1) :: IO (IOUArray Int Double)
                           multiplyAv n v u >> multiplyAtv n u atav

Current entry

-- The Great Computer Language Shootout
-- http:--shootout.alioth.debian.org/
--
-- Original C contributed by Sebastien Loisel
-- Conversion to C++ by Jon Harrop
-- Conversion to Haskell by Einar Karttunen

import Control.Monad
import Control.Monad.ST
import Data.Array.Base
import Data.Array.ST
import Numeric
import System

eval_A :: Int -> Int -> Double
eval_A i j = 1 / fromIntegral ((i+j)*(i+j+1) `div` 2 + i + 1)

plusAt :: STUArray s Int Double -> Int -> Double -> ST s ()
plusAt a i v = do o <- unsafeRead a i
                  unsafeWrite a i (v+o)

eval_A_Times_u :: STUArray s Int Double -> STUArray s Int Double -> ST s ()
eval_A_Times_u u au = outer (snd $ bounds u)
    where outer 0 = unsafeWrite au 0 0 >> inner 0 (snd $ bounds u)
          outer i = unsafeWrite au i 0 >> inner i (snd $ bounds u) >> outer (i-1)
          inner i 0 = unsafeRead u 0 >>= \uj -> plusAt au i (eval_A i 0 * uj)
          inner i j = unsafeRead u j >>= \uj -> plusAt au i (eval_A i j * uj) >> inner i (j-1)

eval_At_Times_u :: STUArray s Int Double -> STUArray s Int Double -> ST s ()
eval_At_Times_u u au = outer (snd $ bounds u)
    where outer 0 = unsafeWrite au 0 0 >> inner 0 (snd $ bounds u)
          outer i = unsafeWrite au i 0 >> inner i (snd $ bounds u) >> outer (i-1)
          inner i 0 = unsafeRead u 0 >>= \uj -> plusAt au i (eval_A 0 i * uj)
          inner i j = unsafeRead u j >>= \uj -> plusAt au i (eval_A j i * uj) >> inner i (j-1)

eval_AtA_Times_u u v = do w <- newArray (bounds u) 0
                          eval_A_Times_u u w >> eval_At_Times_u w v

main = do
  n <- getArgs >>= return.read.head
  let (vBv,vv) = runST (do u <- newArray (0,n-1) 1
                           v <- newArray (0,n-1) 0
                           replicateM_ 10 (eval_AtA_Times_u u v >> eval_AtA_Times_u v u)
                           vLoop u v n (0, 0))
  putStrLn $ showFFloat (Just 9) (sqrt (vBv/vv)) ""

vLoop :: STUArray s Int Double -> STUArray s Int Double -> Int -> (Double,Double) -> ST s (Double,Double)
vLoop u v 0 a            = return a
vLoop u v (i+1) (vBv,vv) = vLoop u v i =<< op
    where op = do ui <- unsafeRead u i
                  vi <- unsafeRead v i
                  return (vBv+(ui*vi),vv+(vi*vi))

STUArray-based solution

One-line-change convertible to IOUArray, which seems to be exactly as fast as STUArray. The important part seems to be the unboxed version of the 'a' function and the iterateM_ hack (which I don't quite know how it works at all).

Otherwise, it seems to be just about exactly as fast as the unsafe Ptr Double versions.

{-# OPTIONS_GHC -fglasgow-exts #-}

--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Contributed by Simon Brenner
--
-- Compile with:
--
-- C-options:
--    -fvia-c -optc-O3 -optc-march=native -optc-mfpmath=sse -optc-msse2
--    -optc-ffast-math
-- GHC options:
--    -O2 -fexcess-precision

import Control.Monad.ST
import Control.Monad

import Data.Array.IO
import Data.Array.ST
import Data.Ix
import Data.Bits

import System.Environment

-- Various magic used for the unboxed version of the 'a' function
import GHC.Base
import GHC.Float

main = do
	n <- fmap (read . head) getArgs
	print (spectralNorm n)
	-- Use this variant for IOUArray rather than STUArray
	--print =<< spectralNormM allocDoubleIO n

spectralNorm :: Int -> Double
spectralNorm n = runST (spectralNormM allocDouble n)

-----------------
-- Benchmark code

spectralNormM allocator n = case allocator of
  (Allocator alloc) -> do
	let ubound = n-1
	u <- alloc (0,ubound)
	v <- alloc (0,ubound)
	tmp <- alloc (0,ubound)

	fill u 1.0

	replicateM_ 10 $ do
		multiplyAtAv ubound u tmp v
		multiplyAtAv ubound v tmp u
	
	vBv <- u .| v
	vv <- v .| v
	
	return $ sqrt (vBv / vv)

multiplyAtAv n v u atav = do
	multiplyAv n v u
	multiplyAtv n u atav

multiplyAv n v av = iterateM_ 0 n $ \i -> do
	writeArray av i 0.0
	iterateM_ 0 n $ \j -> do
		v' <- readArray v j
		modArray av i (+ ((a i j) * v'))

multiplyAtv n v atv = iterateM_ 0 n $ \i -> do
	writeArray atv i 0.0
	iterateM_ 0 n $ \j -> do
		v' <- readArray v j
		modArray atv i (+ ((a j i) * v'))

a :: Int -> Int -> Double
--a i j = 1.0 / fromIntegral (((i + j) * (i + j + 1)) `shiftR` 1 + i + 1)
--a i j = 1.0 / (((i' + j') * (i' + j' + 1) / 2) + i' + 1)
--	where i' = fromIntegral i; j' = fromIntegral j
a (I# i) (I# j) = D# (
	case i +# j of
		n -> case n *# (n+#1#) of
			t -> case t `uncheckedIShiftRA#` 1# of
				u -> case u +# (i +# 1#) of
					r -> 1.0## /## (int2Double# r))

-----------------------
-- General utility code

fill u x = getBounds u >>= \(lb,ub) -> forM_ [lb..ub] (\i -> writeArray u i x)

u .| v = getBounds u >>= \(lb,ub) -> go lb ub u v 0
	where
		go lb ub u v acc
			| lb <= ub = do
				u' <- readArray u lb
				v' <- readArray v lb
				go (lb+1) ub u v (acc + u'*v')
			| otherwise = return acc

-- Explicitly inlining this seems to stand for about a 24x speedup :)
{-# INLINE modArray #-}
modArray a i f = writeArray a i . f =<< readArray a i

-- This is really weird! Replacing iterateM_ x y with forM_ [x..y] in the loops
-- above makes the program go twice as slowly. (Really!)

{-# INLINE iterateM_ #-}
{-# RULES "inline-iterateM_" iterateM_ = \lb ub f -> forM_ [lb..ub] f #-}
iterateM_ lb ub = forM_ [lb..ub]

data Allocator m i e = forall a. MArray a e m => 
                       Allocator ((i, i) -> m (a i e))

allocDouble :: Ix i => Allocator (ST s) i Double
allocDouble = Allocator $
	(newArray_ :: Ix i => (i,i) -> ST s (STUArray s i Double))

allocDoubleIO :: Ix i => Allocator IO i Double
allocDoubleIO = Allocator $
	(newArray_ :: Ix i => (i,i) -> IO (IOUArray i Double))