Difference between revisions of "Benchmarks Game/Parallel/BinaryTrees"
DonStewart (talk | contribs) |
|||
(5 intermediate revisions by 2 users not shown) | |||
Line 1: | Line 1: | ||
== Binary Trees == |
== Binary Trees == |
||
+ | Description of the benchmark: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/binarytrees.html#binarytrees |
||
− | * http://shootout.alioth.debian.org/u64q/benchmark.php?test=binarytrees&lang=ghc&id=1 |
||
+ | Haskell entries: https://benchmarksgame-team.pages.debian.net/benchmarksgame/measurements/ghc.html |
||
− | === Parallel Strategies: parMap == |
||
+ | |||
+ | '''*The following text may be outdated*''' |
||
+ | |||
+ | |||
+ | === 2009-03-01: Current Entry === |
||
+ | |||
+ | Submitted: http://alioth.debian.org/tracker/index.php?func=detail&aid=311523&group_id=30402&atid=411646 |
||
+ | |||
+ | Also filed a bug ticket with GHC to find out if the GC growth strategy can be improved (so that -H240M isn't required): http://hackage.haskell.org/trac/ghc/ticket/3061 |
||
+ | |||
+ | <haskell> |
||
+ | {-# OPTIONS -funbox-strict-fields #-} |
||
+ | {-# LANGUAGE BangPatterns #-} |
||
+ | -- |
||
+ | -- The Computer Language Benchmarks Game |
||
+ | -- http://shootout.alioth.debian.org/ |
||
+ | -- |
||
+ | -- Contributed by Don Stewart |
||
+ | -- Modified by Stephen Blackheath to parallelize (a very tiny tweak) |
||
+ | -- |
||
+ | -- Compile with: |
||
+ | -- |
||
+ | -- > ghc -O2 -fasm -threaded --make |
||
+ | -- |
||
+ | -- Run with: |
||
+ | -- |
||
+ | -- > ./A +RTS -N4 -H300M -RTS 20 |
||
+ | -- |
||
+ | -- Where '4' is the number of cores. and "set your -H value high (3 or |
||
+ | -- more times the maximum residency)", as per GHC User's Guide: |
||
+ | -- |
||
+ | -- <http://haskell.org/ghc/docs/6.10.1/html/users_guide/runtime-control.html#rts-options-gc> |
||
+ | -- |
||
+ | -- -H "provides a “suggested heap size” for the garbage collector. The |
||
+ | -- garbage collector will use about this much memory until the program |
||
+ | -- residency grows and the heap size needs to be expanded to retain |
||
+ | -- reasonable performance." |
||
+ | -- |
||
+ | |||
+ | import System |
||
+ | import Data.Bits |
||
+ | import Text.Printf |
||
+ | import Control.Parallel.Strategies |
||
+ | |||
+ | -- |
||
+ | -- an artificially strict tree. |
||
+ | -- |
||
+ | -- normally you would ensure the branches are lazy, but this benchmark |
||
+ | -- requires strict allocation. |
||
+ | -- |
||
+ | data Tree = Nil | Node !Int !Tree !Tree |
||
+ | |||
+ | minN = 4 |
||
+ | |||
+ | io s n t = printf "%s of depth %d\t check: %d\n" s n t |
||
+ | |||
+ | main = do |
||
+ | n <- getArgs >>= readIO . head |
||
+ | let maxN = max (minN + 2) n |
||
+ | stretchN = maxN + 1 |
||
+ | |||
+ | -- stretch memory tree |
||
+ | let c = check (make 0 stretchN) |
||
+ | io "stretch tree" stretchN c |
||
+ | |||
+ | -- allocate a long lived tree |
||
+ | let !long = make 0 maxN |
||
+ | |||
+ | -- allocate, walk, and deallocate many bottom-up binary trees |
||
+ | let vs = parMap rnf id $ depth minN maxN |
||
+ | mapM_ (\((m,d,i)) -> io (show m ++ "\t trees") d i) vs |
||
+ | |||
+ | -- confirm the the long-lived binary tree still exists |
||
+ | io "long lived tree" maxN (check long) |
||
+ | |||
+ | -- generate many trees |
||
+ | depth :: Int -> Int -> [(Int,Int,Int)] |
||
+ | depth d m |
||
+ | | d <= m = (2*n,d,sumT d n 0) : depth (d+2) m |
||
+ | | otherwise = [] |
||
+ | where n = 1 `shiftL` (m - d + minN) |
||
+ | |||
+ | -- allocate and check lots of trees |
||
+ | sumT :: Int -> Int -> Int -> Int |
||
+ | sumT d 0 t = t |
||
+ | sumT d i t = sumT d (i-1) (t + a + b) |
||
+ | where a = check (make i d) |
||
+ | b = check (make (-i) d) |
||
+ | |||
+ | -- traverse the tree, counting up the nodes |
||
+ | check :: Tree -> Int |
||
+ | check Nil = 0 |
||
+ | check (Node i l r) = i + check l - check r |
||
+ | |||
+ | -- build a tree |
||
+ | make :: Int -> Int -> Tree |
||
+ | make i 0 = Node i Nil Nil |
||
+ | make i d = Node i (make (i2-1) d2) (make i2 d2) |
||
+ | where i2 = 2*i; d2 = d-1 |
||
+ | |||
+ | </haskell> |
||
+ | |||
+ | === Parallel Strategies: parMap === |
||
* Status: submitted. |
* Status: submitted. |
||
Line 9: | Line 112: | ||
Flags: |
Flags: |
||
+ | $ ghc -O2 --make -fasm -threaded Parallel.hs |
||
− | {{{ |
||
+ | $ ./Parallel 20 +RTS -N5 -A350M |
||
− | $ ghc -O2 --make -fasm -threaded Parallel.hs |
||
+ | |||
− | $ ./Parallel 20 +RTS -N5 -A350M |
||
+ | This is a version of the Haskell GHC binary-trees benchmark, annotated for parallelism, using parallel strategy combinators. |
||
− | }}} |
||
+ | When compiled with the -threaded flag, and run with +RTS -N5 -RTS, it will exploit all cores on the quad-core machine, |
||
+ | dramatically reducing running times. |
||
+ | |||
+ | On my quad core, running time goes from, |
||
+ | |||
+ | * single core, 26.997s |
||
+ | * quad core, 5.692s |
||
+ | |||
+ | The following flags should be used: |
||
+ | |||
+ | Compile time: |
||
+ | |||
+ | ghc -O2 -fasm --make Parallel2.hs -threaded |
||
+ | |||
+ | Runtime: |
||
+ | |||
+ | ./Parallel2 20 +RTS -N5 -A350M -RTS |
||
+ | |||
+ | The -N5 flag asks the Haskell runtime to use 5 capabilites, which map onto the underlying cores. |
||
+ | |||
+ | Here is the result on my quad core, |
||
+ | |||
+ | $ time ./Parallel2 20 +RTS -N5 -A350M -RTS |
||
+ | stretch tree of depth 21 check: -1 |
||
+ | 2097152 trees of depth 4 check: -2097152 |
||
+ | 524288 trees of depth 6 check: -524288 |
||
+ | 131072 trees of depth 8 check: -131072 |
||
+ | 32768 trees of depth 10 check: -32768 |
||
+ | 8192 trees of depth 12 check: -8192 |
||
+ | 2048 trees of depth 14 check: -2048 |
||
+ | 512 trees of depth 16 check: -512 |
||
+ | 128 trees of depth 18 check: -128 |
||
+ | 32 trees of depth 20 check: -32 |
||
+ | long lived tree of depth 20 check: -1 |
||
+ | ./Parallel2 20 +RTS -N5 -A350M -RTS 15.80s user 1.52s system 304% cpu 5.692 total |
||
+ | |||
+ | Which is a satisfying result, as the parallelisation strategy is super simple. |
||
+ | |||
+ | |||
+ | Code: |
||
<haskell> |
<haskell> |
Latest revision as of 23:27, 18 April 2020
Binary Trees
Description of the benchmark: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/binarytrees.html#binarytrees
Haskell entries: https://benchmarksgame-team.pages.debian.net/benchmarksgame/measurements/ghc.html
*The following text may be outdated*
2009-03-01: Current Entry
Submitted: http://alioth.debian.org/tracker/index.php?func=detail&aid=311523&group_id=30402&atid=411646
Also filed a bug ticket with GHC to find out if the GC growth strategy can be improved (so that -H240M isn't required): http://hackage.haskell.org/trac/ghc/ticket/3061
{-# OPTIONS -funbox-strict-fields #-}
{-# LANGUAGE BangPatterns #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Contributed by Don Stewart
-- Modified by Stephen Blackheath to parallelize (a very tiny tweak)
--
-- Compile with:
--
-- > ghc -O2 -fasm -threaded --make
--
-- Run with:
--
-- > ./A +RTS -N4 -H300M -RTS 20
--
-- Where '4' is the number of cores. and "set your -H value high (3 or
-- more times the maximum residency)", as per GHC User's Guide:
--
-- <http://haskell.org/ghc/docs/6.10.1/html/users_guide/runtime-control.html#rts-options-gc>
--
-- -H "provides a “suggested heap size” for the garbage collector. The
-- garbage collector will use about this much memory until the program
-- residency grows and the heap size needs to be expanded to retain
-- reasonable performance."
--
import System
import Data.Bits
import Text.Printf
import Control.Parallel.Strategies
--
-- an artificially strict tree.
--
-- normally you would ensure the branches are lazy, but this benchmark
-- requires strict allocation.
--
data Tree = Nil | Node !Int !Tree !Tree
minN = 4
io s n t = printf "%s of depth %d\t check: %d\n" s n t
main = do
n <- getArgs >>= readIO . head
let maxN = max (minN + 2) n
stretchN = maxN + 1
-- stretch memory tree
let c = check (make 0 stretchN)
io "stretch tree" stretchN c
-- allocate a long lived tree
let !long = make 0 maxN
-- allocate, walk, and deallocate many bottom-up binary trees
let vs = parMap rnf id $ depth minN maxN
mapM_ (\((m,d,i)) -> io (show m ++ "\t trees") d i) vs
-- confirm the the long-lived binary tree still exists
io "long lived tree" maxN (check long)
-- generate many trees
depth :: Int -> Int -> [(Int,Int,Int)]
depth d m
| d <= m = (2*n,d,sumT d n 0) : depth (d+2) m
| otherwise = []
where n = 1 `shiftL` (m - d + minN)
-- allocate and check lots of trees
sumT :: Int -> Int -> Int -> Int
sumT d 0 t = t
sumT d i t = sumT d (i-1) (t + a + b)
where a = check (make i d)
b = check (make (-i) d)
-- traverse the tree, counting up the nodes
check :: Tree -> Int
check Nil = 0
check (Node i l r) = i + check l - check r
-- build a tree
make :: Int -> Int -> Tree
make i 0 = Node i Nil Nil
make i d = Node i (make (i2-1) d2) (make i2 d2)
where i2 = 2*i; d2 = d-1
Parallel Strategies: parMap
- Status: submitted.
Flags:
$ ghc -O2 --make -fasm -threaded Parallel.hs $ ./Parallel 20 +RTS -N5 -A350M
This is a version of the Haskell GHC binary-trees benchmark, annotated for parallelism, using parallel strategy combinators. When compiled with the -threaded flag, and run with +RTS -N5 -RTS, it will exploit all cores on the quad-core machine, dramatically reducing running times.
On my quad core, running time goes from,
* single core, 26.997s * quad core, 5.692s
The following flags should be used:
Compile time:
ghc -O2 -fasm --make Parallel2.hs -threaded
Runtime:
./Parallel2 20 +RTS -N5 -A350M -RTS
The -N5 flag asks the Haskell runtime to use 5 capabilites, which map onto the underlying cores.
Here is the result on my quad core,
$ time ./Parallel2 20 +RTS -N5 -A350M -RTS stretch tree of depth 21 check: -1 2097152 trees of depth 4 check: -2097152 524288 trees of depth 6 check: -524288 131072 trees of depth 8 check: -131072 32768 trees of depth 10 check: -32768 8192 trees of depth 12 check: -8192 2048 trees of depth 14 check: -2048 512 trees of depth 16 check: -512 128 trees of depth 18 check: -128 32 trees of depth 20 check: -32 long lived tree of depth 20 check: -1 ./Parallel2 20 +RTS -N5 -A350M -RTS 15.80s user 1.52s system 304% cpu 5.692 total
Which is a satisfying result, as the parallelisation strategy is super simple.
Code:
{-# OPTIONS -fbang-patterns -funbox-strict-fields #-}
--
-- The Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Contributed by Don Stewart and Thomas Davie
--
-- This implementation uses a parallel strategy to exploit the quad core machine.
-- For more information about Haskell parallel strategies, see,
--
-- http://www.macs.hw.ac.uk/~dsg/gph/papers/html/Strategies/strategies.html
--
import System
import Data.Bits
import Text.Printf
import Control.Parallel.Strategies
import Control.Parallel
--
-- an artificially strict tree.
--
-- normally you would ensure the branches are lazy, but this benchmark
-- requires strict allocation.
--
data Tree = Nil | Node !Int !Tree !Tree
minN = 4
io s n t = printf "%s of depth %d\t check: %d\n" s n t
main = do
n <- getArgs >>= readIO . head
let maxN = max (minN + 2) n
stretchN = maxN + 1
-- stretch memory tree
let c = check (make 0 stretchN)
io "stretch tree" stretchN c
-- allocate a long lived tree
let !long = make 0 maxN
-- allocate, walk, and deallocate many bottom-up binary trees
let vs = (parMap rnf) (depth' maxN) [minN,minN+2..maxN]
mapM_ (\((m,d,i)) -> io (show m ++ "\t trees") d i) vs
-- confirm the the long-lived binary tree still exists
io "long lived tree" maxN (check long)
-- generate many trees
depth' :: Int -> Int -> (Int,Int,Int)
depth' m d =
(2*n,d,sumT d n 0)
where
n = 1 `shiftL` (m - d + minN)
-- allocate and check lots of trees
sumT :: Int -> Int -> Int -> Int
sumT d 0 t = t
sumT d i t = sumT d (i-1) (t + a + b)
where a = check (make i d)
b = check (make (-i) d)
-- traverse the tree, counting up the nodes
check :: Tree -> Int
check Nil = 0
check (Node i l r) = i + check l - check r
-- build a tree
make :: Int -> Int -> Tree
make i 0 = Node i Nil Nil
make i d = Node i (make (i2-1) d2) (make i2 d2)
where i2 = 2*i; d2 = d-1