Difference between revisions of "Shootout/Spectral"

From HaskellWiki
Jump to navigation Jump to search
m
 
(3 intermediate revisions by 2 users not shown)
Line 1: Line 1:
 
 
A Shootout Entry for the [http://shootout.alioth.debian.org/gp4/benchmark.php?test=spectralnorm&lang=all spectral norm test]
 
A Shootout Entry for the [http://shootout.alioth.debian.org/gp4/benchmark.php?test=spectralnorm&lang=all spectral norm test]
   
Line 9: Line 8:
 
||dons || 9.407||
 
||dons || 9.407||
 
||old || 10.564||
 
||old || 10.564||
  +
  +
== Current entry ==
  +
  +
Damn fast.
  +
  +
<haskell>
  +
{-# OPTIONS -fvia-C -fexcess-precision #-}
  +
--
  +
-- The Computer Language Benchmarks Game
  +
-- http://shootout.alioth.debian.org/
  +
--
  +
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
  +
-- 2) changed -optc-O to -optc-O3
  +
-- 3) added -optc-ffast-math
  +
-- Translation from Clean by Don Stewart
  +
--
  +
-- Should be compiled with:
  +
-- -O -fglasgow-exts -fbang-patterns
  +
-- -optc-O3 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
  +
--
  +
-- Note: The following flags appear to INCREASE running time:
  +
-- -O2 -optc-funroll-loops
  +
  +
import System
  +
import Foreign.Marshal.Array
  +
import Foreign
  +
import Text.Printf
  +
import Control.Monad
  +
import Data.ByteString.Internal
  +
  +
import GHC.Base
  +
import GHC.Float
  +
import GHC.Int
  +
  +
type Reals = Ptr Double
  +
  +
main = do
  +
n <- getArgs >>= readIO . head
  +
  +
u <- mallocArray n :: IO Reals
  +
forM_ [0..n-1] $ \i -> pokeElemOff u i 1
  +
v <- mallocArray n :: IO Reals
  +
forM_ [0..n-1] $ \i -> pokeElemOff v i 0
  +
  +
powerMethod 10 n u v
  +
printf "%.9f\n" (eigenvalue n u v 0 0 0)
  +
  +
------------------------------------------------------------------------
  +
  +
eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
  +
eigenvalue !n !u !v !i !vBv !vv
  +
| i < n = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
  +
| otherwise = sqrt $! vBv / vv
  +
where
  +
ui = inlinePerformIO (peekElemOff u i)
  +
vi = inlinePerformIO (peekElemOff v i)
  +
  +
------------------------------------------------------------------------
  +
  +
powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
  +
powerMethod !i !n !u !v = allocaArray n $ \t ->
  +
replicateM_ i $ timesAtAv t n u v >> timesAtAv t n v u
  +
  +
-- multiply vector v by matrix A and then by matrix A transposed
  +
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
  +
timesAtAv !t !n !u !atau = do
  +
timesAv n u t
  +
timesAtv n t atau
  +
  +
timesAv :: Int -> Reals -> Reals -> IO ()
  +
timesAv !n !u !au = go 0
  +
where
  +
go :: Int -> IO ()
  +
go !i = when (i < n) $ do
  +
pokeElemOff au i (avsum i 0 0)
  +
go (i+1)
  +
  +
avsum :: Int -> Int -> Double -> Double
  +
avsum !i !j !acc
  +
| j < n = avsum i (j+1) (acc + ((aij i j) * uj))
  +
| otherwise = acc
  +
where uj = inlinePerformIO (peekElemOff u j)
  +
  +
timesAtv :: Int -> Reals -> Reals -> IO ()
  +
timesAtv !n !u !a = go 0
  +
where
  +
go :: Int -> IO ()
  +
go !i = when (i < n) $ do
  +
pokeElemOff a i (atvsum i 0 0)
  +
go (i+1)
  +
  +
atvsum :: Int -> Int -> Double -> Double
  +
atvsum !i !j !acc
  +
| j < n = atvsum i (j+1) (acc + ((aij j i) * uj))
  +
| otherwise = acc
  +
where uj = inlinePerformIO (peekElemOff u j)
  +
  +
--
  +
-- manually unbox the inner loop:
  +
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
  +
--
  +
aij (I# i) (I# j) = D# (
  +
case i +# j of
  +
n -> case n *# (n+#1#) of
  +
t -> case t `uncheckedIShiftRA#` 1# of
  +
u -> case u +# (i +# 1#) of
  +
r -> 1.0## /## (int2Double# r))
  +
</haskell>
   
 
== Proposed entry ==
 
== Proposed entry ==
Line 17: Line 124:
   
 
<haskell>
 
<haskell>
{-# OPTIONS -O2 -fglasgow-exts -fbang-patterns -fexcess-precision -optc-O -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 #-}
+
{-# OPTIONS -fexcess-precision #-}
 
--
 
--
 
-- The Computer Language Shootout
 
-- The Computer Language Shootout
Line 23: Line 130:
 
--
 
--
 
-- Translation from Clean by Don Stewart
 
-- Translation from Clean by Don Stewart
  +
--
  +
-- Should be compiled with:
  +
-- -O -fglasgow-exts -fbang-patterns
  +
-- -optc-O -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2
 
--
 
--
   
Line 130: Line 241:
 
--
 
--
   
import Monad
+
import Control.Monad
 
import System
 
import System
 
import Text.Printf
 
import Text.Printf
Line 141: Line 252:
 
u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double)
 
u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double)
 
v <- newArray (0,n-1) 0 :: IO (IOUArray Int Double)
 
v <- newArray (0,n-1) 0 :: IO (IOUArray Int Double)
sequence_ $ replicate 10 $ multiplyAtAv n u v >> multiplyAtAv n v u
+
replicateM_ 10 $ multiplyAtAv n u v >> multiplyAtAv n v u
   
 
let loop !vbv !vv !i
 
let loop !vbv !vv !i
Line 193: Line 304:
 
--
 
--
   
import Monad
+
import Control.Monad
 
import System
 
import System
 
import Numeric
 
import Numeric
Line 204: Line 315:
 
u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double)
 
u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double)
 
v <- newArray_ (0,n-1) :: IO (IOUArray Int Double)
 
v <- newArray_ (0,n-1) :: IO (IOUArray Int Double)
sequence_ $ replicate 10 $ multiplyAtAv n u v >> multiplyAtAv n v u
+
replicateM_ 10 $ multiplyAtAv n u v >> multiplyAtAv n v u
 
loop 0 0 0 n u v
 
loop 0 0 0 n u v
   
Line 251: Line 362:
 
-- Conversion to Haskell by Einar Karttunen
 
-- Conversion to Haskell by Einar Karttunen
   
  +
import Control.Monad
 
import Control.Monad.ST
 
import Control.Monad.ST
 
import Data.Array.Base
 
import Data.Array.Base
Line 285: Line 397:
 
let (vBv,vv) = runST (do u <- newArray (0,n-1) 1
 
let (vBv,vv) = runST (do u <- newArray (0,n-1) 1
 
v <- newArray (0,n-1) 0
 
v <- newArray (0,n-1) 0
sequence_ $ replicate 10 (eval_AtA_Times_u u v >> eval_AtA_Times_u v u)
+
replicateM_ 10 (eval_AtA_Times_u u v >> eval_AtA_Times_u v u)
 
vLoop u v n (0, 0))
 
vLoop u v n (0, 0))
 
putStrLn $ showFFloat (Just 9) (sqrt (vBv/vv)) ""
 
putStrLn $ showFFloat (Just 9) (sqrt (vBv/vv)) ""
Line 295: Line 407:
 
vi <- unsafeRead v i
 
vi <- unsafeRead v i
 
return (vBv+(ui*vi),vv+(vi*vi))
 
return (vBv+(ui*vi),vv+(vi*vi))
  +
</haskell>
  +
  +
== STUArray-based solution ==
  +
  +
One-line-change convertible to IOUArray, which seems to be exactly as fast as STUArray. The important part seems to be the unboxed version of the 'a' function and the iterateM_ hack (which I don't quite know how it works at all).
  +
  +
Otherwise, it seems to be just about exactly as fast as the unsafe Ptr Double versions.
  +
  +
<haskell>
  +
{-# OPTIONS_GHC -fglasgow-exts #-}
  +
  +
--
  +
-- The Computer Language Benchmarks Game
  +
-- http://shootout.alioth.debian.org/
  +
--
  +
-- Contributed by Simon Brenner
  +
--
  +
-- Compile with:
  +
--
  +
-- C-options:
  +
-- -fvia-c -optc-O3 -optc-march=native -optc-mfpmath=sse -optc-msse2
  +
-- -optc-ffast-math
  +
-- GHC options:
  +
-- -O2 -fexcess-precision
  +
  +
import Control.Monad.ST
  +
import Control.Monad
  +
  +
import Data.Array.IO
  +
import Data.Array.ST
  +
import Data.Ix
  +
import Data.Bits
  +
  +
import System.Environment
  +
  +
-- Various magic used for the unboxed version of the 'a' function
  +
import GHC.Base
  +
import GHC.Float
  +
  +
main = do
  +
n <- fmap (read . head) getArgs
  +
print (spectralNorm n)
  +
-- Use this variant for IOUArray rather than STUArray
  +
--print =<< spectralNormM allocDoubleIO n
  +
  +
spectralNorm :: Int -> Double
  +
spectralNorm n = runST (spectralNormM allocDouble n)
  +
  +
-----------------
  +
-- Benchmark code
  +
  +
spectralNormM allocator n = case allocator of
  +
(Allocator alloc) -> do
  +
let ubound = n-1
  +
u <- alloc (0,ubound)
  +
v <- alloc (0,ubound)
  +
tmp <- alloc (0,ubound)
  +
  +
fill u 1.0
  +
  +
replicateM_ 10 $ do
  +
multiplyAtAv ubound u tmp v
  +
multiplyAtAv ubound v tmp u
  +
  +
vBv <- u .| v
  +
vv <- v .| v
  +
  +
return $ sqrt (vBv / vv)
  +
  +
multiplyAtAv n v u atav = do
  +
multiplyAv n v u
  +
multiplyAtv n u atav
  +
  +
multiplyAv n v av = iterateM_ 0 n $ \i -> do
  +
writeArray av i 0.0
  +
iterateM_ 0 n $ \j -> do
  +
v' <- readArray v j
  +
modArray av i (+ ((a i j) * v'))
  +
  +
multiplyAtv n v atv = iterateM_ 0 n $ \i -> do
  +
writeArray atv i 0.0
  +
iterateM_ 0 n $ \j -> do
  +
v' <- readArray v j
  +
modArray atv i (+ ((a j i) * v'))
  +
  +
a :: Int -> Int -> Double
  +
--a i j = 1.0 / fromIntegral (((i + j) * (i + j + 1)) `shiftR` 1 + i + 1)
  +
--a i j = 1.0 / (((i' + j') * (i' + j' + 1) / 2) + i' + 1)
  +
-- where i' = fromIntegral i; j' = fromIntegral j
  +
a (I# i) (I# j) = D# (
  +
case i +# j of
  +
n -> case n *# (n+#1#) of
  +
t -> case t `uncheckedIShiftRA#` 1# of
  +
u -> case u +# (i +# 1#) of
  +
r -> 1.0## /## (int2Double# r))
  +
  +
-----------------------
  +
-- General utility code
  +
  +
fill u x = getBounds u >>= \(lb,ub) -> forM_ [lb..ub] (\i -> writeArray u i x)
  +
  +
u .| v = getBounds u >>= \(lb,ub) -> go lb ub u v 0
  +
where
  +
go lb ub u v acc
  +
| lb <= ub = do
  +
u' <- readArray u lb
  +
v' <- readArray v lb
  +
go (lb+1) ub u v (acc + u'*v')
  +
| otherwise = return acc
  +
  +
-- Explicitly inlining this seems to stand for about a 24x speedup :)
  +
{-# INLINE modArray #-}
  +
modArray a i f = writeArray a i . f =<< readArray a i
  +
  +
-- This is really weird! Replacing iterateM_ x y with forM_ [x..y] in the loops
  +
-- above makes the program go twice as slowly. (Really!)
  +
  +
{-# INLINE iterateM_ #-}
  +
{-# RULES "inline-iterateM_" iterateM_ = \lb ub f -> forM_ [lb..ub] f #-}
  +
iterateM_ lb ub = forM_ [lb..ub]
  +
  +
data Allocator m i e = forall a. MArray a e m =>
  +
Allocator ((i, i) -> m (a i e))
  +
  +
allocDouble :: Ix i => Allocator (ST s) i Double
  +
allocDouble = Allocator $
  +
(newArray_ :: Ix i => (i,i) -> ST s (STUArray s i Double))
  +
  +
allocDoubleIO :: Ix i => Allocator IO i Double
  +
allocDoubleIO = Allocator $
  +
(newArray_ :: Ix i => (i,i) -> IO (IOUArray i Double))
 
</haskell>
 
</haskell>
   

Latest revision as of 06:09, 21 February 2010

A Shootout Entry for the spectral norm test

Timings

Debian Linux x86, n=2,500

||Entry || Time|| ||dons || 9.407|| ||old || 10.564||

Current entry

Damn fast.

{-# OPTIONS -fvia-C -fexcess-precision #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Modified by Ryan Trinkle: 1) change from divInt# to uncheckedIShiftRA#
--                           2) changed -optc-O to -optc-O3
--                           3) added -optc-ffast-math
-- Translation from Clean by Don Stewart
--
-- Should be compiled with:
--      -O -fglasgow-exts -fbang-patterns
--      -optc-O3 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 -optc-ffast-math
--
-- Note: The following flags appear to INCREASE running time:
--      -O2 -optc-funroll-loops

import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Internal

import GHC.Base
import GHC.Float
import GHC.Int

type Reals = Ptr Double

main = do
    n <- getArgs >>= readIO . head

    u <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff u i 1
    v <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff v i 0

    powerMethod 10 n u v
    printf "%.9f\n" (eigenvalue n u v 0 0 0)

------------------------------------------------------------------------

eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
    | i < n     = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
    | otherwise = sqrt $! vBv / vv
    where
       ui = inlinePerformIO (peekElemOff u i)
       vi = inlinePerformIO (peekElemOff v i)

------------------------------------------------------------------------

powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod !i !n !u !v = allocaArray n $ \t ->
    replicateM_ i $ timesAtAv t n u v >> timesAtAv t n v u

-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
    timesAv  n u t
    timesAtv n t atau

timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff au i (avsum i 0 0)
        go (i+1)

    avsum :: Int -> Int -> Double -> Double
    avsum !i !j !acc
        | j < n = avsum i (j+1) (acc + ((aij i j) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)

timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff a i (atvsum i 0 0)
        go (i+1)

    atvsum :: Int -> Int -> Double -> Double
    atvsum !i !j !acc
        | j < n     = atvsum i (j+1) (acc + ((aij j i) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)

--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
    case i +# j of
        n -> case n *# (n+#1#) of
                t -> case t `uncheckedIShiftRA#` 1# of
                        u -> case u +# (i +# 1#) of
                                r -> 1.0## /## (int2Double# r))

Proposed entry

Uses Ptr Double, and carefully unboxes the aij inner computation (where 70% of the time is spent). Runs 2.3x slower than C on my p4.

Submitted

{-# OPTIONS -fexcess-precision #-}
--
-- The Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Translation from Clean by Don Stewart
--
-- Should be compiled with:
--      -O -fglasgow-exts -fbang-patterns 
--      -optc-O -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2
--

import System
import Foreign.Marshal.Array
import Foreign
import Text.Printf
import Control.Monad
import Data.ByteString.Base

import GHC.Base
import GHC.Float
import GHC.Int

type Reals = Ptr Double

main = do
    n <- getArgs >>= readIO . head

    u <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff u i 1
    v <- mallocArray n :: IO Reals
    forM_ [0..n-1] $ \i -> pokeElemOff v i 0

    powerMethod 10 n u v
    printf "%.9f\n" (eigenvalue n u v 0 0 0)

------------------------------------------------------------------------

eigenvalue :: Int -> Reals -> Reals -> Int -> Double -> Double -> Double
eigenvalue !n !u !v !i !vBv !vv
    | i < n     = eigenvalue n u v (i+1) (vBv + ui * vi) (vv + vi * vi)
    | otherwise = sqrt $! vBv / vv
    where
       ui = inlinePerformIO (peekElemOff u i)
       vi = inlinePerformIO (peekElemOff v i)

------------------------------------------------------------------------

powerMethod :: Int -> Int -> Reals -> Reals -> IO ()
powerMethod !i !n !u !v = allocaArray n $ \t ->
    replicateM_ i $ timesAtAv t n u v >> timesAtAv t n v u

-- multiply vector v by matrix A and then by matrix A transposed
timesAtAv :: Reals -> Int -> Reals -> Reals -> IO ()
timesAtAv !t !n !u !atau = do
    timesAv  n u t
    timesAtv n t atau
{-# INLINE timesAtAv #-}

timesAv :: Int -> Reals -> Reals -> IO ()
timesAv !n !u !au = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff au i (avsum i 0 0)
        go (i+1)

    avsum :: Int -> Int -> Double -> Double
    avsum !i !j !acc
        | j < n = avsum i (j+1) (acc + ((aij i j) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)
{-# INLINE timesAv #-}

timesAtv :: Int -> Reals -> Reals -> IO ()
timesAtv !n !u !a = go 0
  where
    go :: Int -> IO ()
    go !i = when (i < n) $ do
        pokeElemOff a i (atvsum i 0 0)
        go (i+1)

    atvsum :: Int -> Int -> Double -> Double
    atvsum !i !j !acc
        | j < n     = atvsum i (j+1) (acc + ((aij j i) * uj))
        | otherwise = acc
        where uj = inlinePerformIO (peekElemOff u j)
{-# INLINE timesAtv #-}

--
-- manually unbox the inner loop:
-- aij i j = 1 / fromIntegral ((i+j) * (i+j+1) `div` 2 + i + 1)
--
aij (I# i) (I# j) = D# (
    case i +# j of
        n -> case n *# (n+#1#) of
                t -> case divInt# t 2# of
                        u -> case u +# (i +# 1#) of
                                r -> 1.0## /## (int2Double# r))

Current entry

Better gcc flags. Bang patterns. Submitted

Didn't do so well in practice though :/

{-# OPTIONS -O -fglasgow-exts -fbang-patterns -funbox-strict-fields -fexcess-precision -optc-O2 -optc-march=pentium4 -optc-mfpmath=sse -optc-msse2 #-}
--
-- The Great Computer Language Shootout
-- http:--shootout.alioth.debian.org/
--
-- Contributed by Don Stewart
--

import Control.Monad
import System
import Text.Printf
import Data.Array.IO
import Data.Array.Base

main = getArgs >>= approximate . read . head >>= printf "%.9f\n"

approximate n = do
    u <- newArray (0,n-1) 1 :: IO (IOUArray Int Double)
    v <- newArray (0,n-1) 0 :: IO (IOUArray Int Double)
    replicateM_ 10 $ multiplyAtAv n u v >> multiplyAtAv n v u

    let loop !vbv !vv !i
            | i >= n    = return (vbv,vv)
            | otherwise = do ui <- unsafeRead u i
                             vi <- unsafeRead v i
                             loop (vbv + ui * vi) (vv + vi * vi) (i+1)
    (vbv,vv) <- loop 0 0 0
    return $! sqrt (vbv/vv)

-- return element i,j of infinite matrix A
a i j = 1 / fromIntegral (x*(x+1) `div` 2 + i + 1) where x = i+j

-- multiply vector v by matrix A */
multiplyAv !n !v !av = loop 0
    where loop i  = when (i < n) $ do avi <- loop' i 0 0
                                      unsafeWrite av i avi >> loop (i+1)
          loop' !i !j !av | j >= n    = return av
                          | otherwise = do vj  <- unsafeRead v j
                                           loop' i (j+1) (av + a i j * vj)

-- multiply vector v by matrix A transposed
multiplyAtv !n !v !atv = loop 0
    where loop i = when (i < n) $ do atvi <- loop' i 0 0
                                     unsafeWrite atv i atvi >> loop (i+1)
          loop' !i !j !atvi
                | j >= n    = return atvi
                | otherwise = do vj <- unsafeRead v j
                                 loop' i (j+1) (atvi + a j i * vj)

-- multiply vector v by matrix A and then by matrix A transposed */
multiplyAtAv !n !v !atav = do
    u  <- newArray (0,n-1) 0 :: IO (IOUArray Int Double)
    multiplyAv n v u >> multiplyAtv n u atav

Old entry

GHC unboxed this better. Careful attention payed to the unboxing. -O2 -optc-O -optc-ffast-math -fexcess-precision

{-# OPTIONS -optc-O #-}
--
-- The Great Computer Language Shootout
-- http:--shootout.alioth.debian.org/
--
-- Contributed by Don Stewart
--
-- gcc miscompiles this program with -O3
-- 

import Control.Monad
import System
import Numeric
import Data.Array.IO
import Data.Array.Base

main = getArgs >>= approximate . read . head >>= putStrLn . \s -> showFFloat (Just 9) s []

approximate n = do
    u <- newArray  (0,n-1) 1 :: IO (IOUArray Int Double)
    v <- newArray_ (0,n-1)   :: IO (IOUArray Int Double)
    replicateM_ 10 $ multiplyAtAv n u v >> multiplyAtAv n v u
    loop 0 0 0 n u v

loop vbv vv i n u v | vbv `seq` vv `seq` i `seq` n `seq` u `seq` v `seq` False = undefined
loop vbv vv i n u v | i >= n    = return $! sqrt (vbv/vv)
                    | otherwise = do ui <- unsafeRead u i
                                     vi <- unsafeRead v i
                                     loop (vbv + ui * vi) (vv + vi * vi) (i+1) n u v

-- return element i,j of infinite matrix A 
a i j | i `seq` j `seq` False = undefined
a i j = 1 / fromIntegral (x*(x+1) `div` 2 + i + 1) where x = i+j

-- multiply vector v by matrix A */
multiplyAv n v av | n `seq` v `seq` av `seq` False = undefined
multiplyAv n v av = loop 0
    where loop i  = when (i < n) $ loop' i 0 0 >>= unsafeWrite av i >> loop (i+1)
          loop' i j av | i `seq` j `seq` av `seq` False = undefined
          loop' i j av | j >= n    = return av 
                       | otherwise = do vj  <- v `unsafeRead` j
                                        loop' i (j+1) (av + a i j * vj)

-- multiply vector v by matrix A transposed
multiplyAtv n v atv | n `seq` v `seq` atv `seq` False = undefined
multiplyAtv n v atv = loop 0
    where loop i = when (i < n) $ loop' i 0 0 >>= unsafeWrite atv i >> loop (i+1)
          loop' i j atvi | j `seq` atvi `seq` False = undefined
          loop' i j atvi | j >= n    = return atvi 
                         | otherwise = do vj <- v `unsafeRead` j 
                                          loop' i (j+1) (atvi + a j i * vj)

-- multiply vector v by matrix A and then by matrix A transposed */
multiplyAtAv n v atav | n `seq` v `seq` atav `seq` False = undefined
multiplyAtAv n v atav = do u <- newArray_ (0,n-1) :: IO (IOUArray Int Double)
                           multiplyAv n v u >> multiplyAtv n u atav

Current entry

-- The Great Computer Language Shootout
-- http:--shootout.alioth.debian.org/
--
-- Original C contributed by Sebastien Loisel
-- Conversion to C++ by Jon Harrop
-- Conversion to Haskell by Einar Karttunen

import Control.Monad
import Control.Monad.ST
import Data.Array.Base
import Data.Array.ST
import Numeric
import System

eval_A :: Int -> Int -> Double
eval_A i j = 1 / fromIntegral ((i+j)*(i+j+1) `div` 2 + i + 1)

plusAt :: STUArray s Int Double -> Int -> Double -> ST s ()
plusAt a i v = do o <- unsafeRead a i
                  unsafeWrite a i (v+o)

eval_A_Times_u :: STUArray s Int Double -> STUArray s Int Double -> ST s ()
eval_A_Times_u u au = outer (snd $ bounds u)
    where outer 0 = unsafeWrite au 0 0 >> inner 0 (snd $ bounds u)
          outer i = unsafeWrite au i 0 >> inner i (snd $ bounds u) >> outer (i-1)
          inner i 0 = unsafeRead u 0 >>= \uj -> plusAt au i (eval_A i 0 * uj)
          inner i j = unsafeRead u j >>= \uj -> plusAt au i (eval_A i j * uj) >> inner i (j-1)

eval_At_Times_u :: STUArray s Int Double -> STUArray s Int Double -> ST s ()
eval_At_Times_u u au = outer (snd $ bounds u)
    where outer 0 = unsafeWrite au 0 0 >> inner 0 (snd $ bounds u)
          outer i = unsafeWrite au i 0 >> inner i (snd $ bounds u) >> outer (i-1)
          inner i 0 = unsafeRead u 0 >>= \uj -> plusAt au i (eval_A 0 i * uj)
          inner i j = unsafeRead u j >>= \uj -> plusAt au i (eval_A j i * uj) >> inner i (j-1)

eval_AtA_Times_u u v = do w <- newArray (bounds u) 0
                          eval_A_Times_u u w >> eval_At_Times_u w v

main = do
  n <- getArgs >>= return.read.head
  let (vBv,vv) = runST (do u <- newArray (0,n-1) 1
                           v <- newArray (0,n-1) 0
                           replicateM_ 10 (eval_AtA_Times_u u v >> eval_AtA_Times_u v u)
                           vLoop u v n (0, 0))
  putStrLn $ showFFloat (Just 9) (sqrt (vBv/vv)) ""

vLoop :: STUArray s Int Double -> STUArray s Int Double -> Int -> (Double,Double) -> ST s (Double,Double)
vLoop u v 0 a            = return a
vLoop u v (i+1) (vBv,vv) = vLoop u v i =<< op
    where op = do ui <- unsafeRead u i
                  vi <- unsafeRead v i
                  return (vBv+(ui*vi),vv+(vi*vi))

STUArray-based solution

One-line-change convertible to IOUArray, which seems to be exactly as fast as STUArray. The important part seems to be the unboxed version of the 'a' function and the iterateM_ hack (which I don't quite know how it works at all).

Otherwise, it seems to be just about exactly as fast as the unsafe Ptr Double versions.

{-# OPTIONS_GHC -fglasgow-exts #-}

--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Contributed by Simon Brenner
--
-- Compile with:
--
-- C-options:
--    -fvia-c -optc-O3 -optc-march=native -optc-mfpmath=sse -optc-msse2
--    -optc-ffast-math
-- GHC options:
--    -O2 -fexcess-precision

import Control.Monad.ST
import Control.Monad

import Data.Array.IO
import Data.Array.ST
import Data.Ix
import Data.Bits

import System.Environment

-- Various magic used for the unboxed version of the 'a' function
import GHC.Base
import GHC.Float

main = do
	n <- fmap (read . head) getArgs
	print (spectralNorm n)
	-- Use this variant for IOUArray rather than STUArray
	--print =<< spectralNormM allocDoubleIO n

spectralNorm :: Int -> Double
spectralNorm n = runST (spectralNormM allocDouble n)

-----------------
-- Benchmark code

spectralNormM allocator n = case allocator of
  (Allocator alloc) -> do
	let ubound = n-1
	u <- alloc (0,ubound)
	v <- alloc (0,ubound)
	tmp <- alloc (0,ubound)

	fill u 1.0

	replicateM_ 10 $ do
		multiplyAtAv ubound u tmp v
		multiplyAtAv ubound v tmp u
	
	vBv <- u .| v
	vv <- v .| v
	
	return $ sqrt (vBv / vv)

multiplyAtAv n v u atav = do
	multiplyAv n v u
	multiplyAtv n u atav

multiplyAv n v av = iterateM_ 0 n $ \i -> do
	writeArray av i 0.0
	iterateM_ 0 n $ \j -> do
		v' <- readArray v j
		modArray av i (+ ((a i j) * v'))

multiplyAtv n v atv = iterateM_ 0 n $ \i -> do
	writeArray atv i 0.0
	iterateM_ 0 n $ \j -> do
		v' <- readArray v j
		modArray atv i (+ ((a j i) * v'))

a :: Int -> Int -> Double
--a i j = 1.0 / fromIntegral (((i + j) * (i + j + 1)) `shiftR` 1 + i + 1)
--a i j = 1.0 / (((i' + j') * (i' + j' + 1) / 2) + i' + 1)
--	where i' = fromIntegral i; j' = fromIntegral j
a (I# i) (I# j) = D# (
	case i +# j of
		n -> case n *# (n+#1#) of
			t -> case t `uncheckedIShiftRA#` 1# of
				u -> case u +# (i +# 1#) of
					r -> 1.0## /## (int2Double# r))

-----------------------
-- General utility code

fill u x = getBounds u >>= \(lb,ub) -> forM_ [lb..ub] (\i -> writeArray u i x)

u .| v = getBounds u >>= \(lb,ub) -> go lb ub u v 0
	where
		go lb ub u v acc
			| lb <= ub = do
				u' <- readArray u lb
				v' <- readArray v lb
				go (lb+1) ub u v (acc + u'*v')
			| otherwise = return acc

-- Explicitly inlining this seems to stand for about a 24x speedup :)
{-# INLINE modArray #-}
modArray a i f = writeArray a i . f =<< readArray a i

-- This is really weird! Replacing iterateM_ x y with forM_ [x..y] in the loops
-- above makes the program go twice as slowly. (Really!)

{-# INLINE iterateM_ #-}
{-# RULES "inline-iterateM_" iterateM_ = \lb ub f -> forM_ [lb..ub] f #-}
iterateM_ lb ub = forM_ [lb..ub]

data Allocator m i e = forall a. MArray a e m => 
                       Allocator ((i, i) -> m (a i e))

allocDouble :: Ix i => Allocator (ST s) i Double
allocDouble = Allocator $
	(newArray_ :: Ix i => (i,i) -> ST s (STUArray s i Double))

allocDoubleIO :: Ix i => Allocator IO i Double
allocDoubleIO = Allocator $
	(newArray_ :: Ix i => (i,i) -> IO (IOUArray i Double))