Difference between revisions of "Shootout/Fasta"
DonStewart (talk  contribs) (port) 
m (→A new proposal) 

(17 intermediate revisions by 8 users not shown)  
Line 1:  Line 1:  
−  
⚫  
⚫  
The description of the program: 
The description of the program: 

Line 11:  Line 10:  
We'll use the generated FASTA file as input for other benchmarks (reversecomplement, knucleotide). 
We'll use the generated FASTA file as input for other benchmarks (reversecomplement, knucleotide). 

+  
+  = Current entry = 

+  
+  Same as previous, but with fewer strictness annotations thanks to strictify. 

+  
+  <haskell> 

+  
+  {# OPTIONS fviaC O2 optcO2 optcffastmath fbangpatterns fexcessprecision #} 

+   

+   The Computer Language Benchmarks Game 

+   http://shootout.alioth.debian.org/ 

+   

+   Contributed by Don Stewart 

+   A lazy bytestring solution. 

+   Unnecessary strictness annotations removed by Sterling Clover 2/08 

+   

+   Add: 

+   optcmfpmath=sse optcmsse2 

+   

+  
+  import System 

+  import Data.Word 

+  import Control.Arrow 

+  
+  import qualified Data.ByteString.Lazy as L 

+  import qualified Data.ByteString.Lazy.Char8 as C (pack,unfoldr) 

+  import qualified Data.ByteString as S 

+  import Data.ByteString.Internal 

+  import Data.ByteString.Unsafe 

+  
+  main = do 

+  n < getArgs >>= readIO . head 

+  writeFasta "ONE" "Homo sapiens alu" (n*2) (L.cycle alu) 

+  g < unfold "TWO" "IUB ambiguity codes" (n*3) (look iubs) 42 

+  unfold "THREE" "Homo sapiens frequency" (n*5) (look homs) g 

+  
+   

+   

+   lazily unfold the randomised dna sequences 

+   

+  
+  unfold l t n f g = putStrLn (">" ++ l ++ " " ++ t) >> unroll f g n 

+  
+  unroll :: (Int > (Word8, Int)) > Int > Int > IO Int 

+  unroll f = loop 

+  where 

+  loop r 0 = return r 

+  loop !r i = case S.unfoldrN m (Just . f) r of 

+  (!s, Just r') > do 

+  S.putStrLn s 

+  loop r' (im) 

+  where m = min i 60 

+  
+  look ds k = (choose ds d, j) 

+  where (d,j) = rand k 

+  
+  choose :: [(Word8,Float)] > Float > Word8 

+  choose [(b,_)] _ = b 

+  choose ((b,f):xs) p = if p < f then b else choose xs (pf) 

+  
+   

+   

+   only demand as much of the infinite sequence as we require 

+  
+  writeFasta label title n s = do 

+  putStrLn $ ">" ++ label ++ " " ++ title 

+  let (t:ts) = L.toChunks s 

+  go ts t n 

+  where 

+  go ss s n 

+   l60 && n60 = S.putStrLn l >> go ss r (n60) 

+   n60 = S.putStr s >> S.putStrLn a >> go (tail ss) b (n60) 

+   n <= ln = S.putStrLn (S.take n s) 

+   otherwise = S.putStr s >> S.putStrLn (S.take (nln) (head ss)) 

+  where 

+  ln = S.length s 

+  l60 = ln >= 60 

+  n60 = n >= 60 

+  (l,r) = S.splitAt 60 s 

+  (a,b) = S.splitAt (60ln) (head ss) 

+  
+   

+  
+  im = 139968 

+  ia = 3877 

+  ic = 29573 

+  
+  rand :: Int > (Float, Int) 

+  rand seed = (newran,newseed) 

+  where 

+  newseed = (seed * ia + ic) `rem` im 

+  newran = 1.0 * fromIntegral newseed / imd 

+  imd = fromIntegral im 

+  
+   

+  
+  alu = C.pack 

+  "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\ 

+  \GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\ 

+  \CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\ 

+  \ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\ 

+  \GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\ 

+  \AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\ 

+  \AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA" 

+  
+  iubs = map (c2w *** id) 

+  [('a',0.27),('c',0.12),('g',0.12),('t',0.27),('B',0.02) 

+  ,('D',0.02),('H',0.02),('K',0.02),('M',0.02),('N',0.02) 

+  ,('R',0.02),('S',0.02),('V',0.02),('W',0.02),('Y',0.02)] 

+  
+  homs = map (c2w *** id) 

+  [('a',0.3029549426680),('c',0.1979883004921) 

+  ,('g',0.1975473066391),('t',0.3015094502008)] 

+  </haskell> 

+  
+  = Previous entry = 

+  
+  Pretty fast, uses some cute lazy bytestring tricks. 

+  Compile with optcmfpmath=sse optcmsse2 

+  
+  <haskell> 

+  {# OPTIONS O2 optcO2 optcffastmath fbangpatterns fexcessprecision #} 

+   

+   The Computer Language Benchmarks Game 

+   http://shootout.alioth.debian.org/ 

+   

+   Contributed by Don Stewart 

+   A lazy bytestring solution. 

+   

+   Add: 

+   optcmfpmath=sse optcmsse2 

+   

+  
+  import System 

+  import Data.Word 

+  import Control.Arrow 

+  
+  import qualified Data.ByteString.Lazy as L 

+  import qualified Data.ByteString.Lazy.Char8 as C (pack) 

+  import qualified Data.ByteString as S 

+  import Data.ByteString.Internal 

+  
+  main = do 

+  n < getArgs >>= readIO . head 

+  writeFasta "ONE" "Homo sapiens alu" (n*2) (L.cycle alu) 

+  g < unfold "TWO" "IUB ambiguity codes" (n*3) (look iubs) 42 

+  unfold "THREE" "Homo sapiens frequency" (n*5) (look homs) g 

+  
+   

+   

+   lazily unfold the randomised dna sequences 

+   

+  
+  unfold l t n f !g = putStrLn (">" ++ l ++ " " ++ t) >> unroll f g n 

+  
+  unroll :: (Int > (Word8, Int)) > Int > Int > IO Int 

+  unroll f = loop 

+  where 

+  loop r 0 = return r 

+  loop !r !i = case S.unfoldrN m (Just . f) r of 

+  (!s, Just r') > do 

+  S.putStrLn s 

+  loop r' (im) 

+  where m = min i 60 

+  
+  look ds !k = let (d,j) = rand k in (choose ds d, j) 

+  
+  choose :: [(Word8,Float)] > Float > Word8 

+  choose [(b,_)] _ = b 

+  choose ((!b,!f):xs) !p = if p < f then b else choose xs (pf) 

+  
+   

+   

+   only demand as much of the infinite sequence as we require 

+  
+  writeFasta label title n s = do 

+  putStrLn $ ">" ++ label ++ " " ++ title 

+  let (t:ts) = L.toChunks s 

+  go ts t n 

+  where 

+  go ss !t !n 

+   l60 && n60 = S.putStrLn l >> go ss r (n60) 

+   n60 = S.putStr t >> S.putStrLn a >> go (tail ss) b (n60) 

+   n <= ln = S.putStrLn (S.take n t) 

+   otherwise = S.putStr t >> S.putStrLn (S.take (nln) (head ss)) 

+  where 

+  !ln = S.length t 

+  !l60 = ln >= 60 

+  !n60 = n >= 60 

+  (l,r) = S.splitAt 60 t 

+  (a,b) = S.splitAt (60ln) (head ss) 

+  
+   

+  
+  im = 139968 

+  ia = 3877 

+  ic = 29573 

+  
+  rand :: Int > (Float, Int) 

+  rand !seed = (newran,newseed) 

+  where 

+  !newseed = (seed * ia + ic) `rem` im 

+  !newran = 1.0 * fromIntegral newseed / imd 

+  imd = fromIntegral im 

+  
+   

+  
+  alu = C.pack 

+  "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\ 

+  GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\ 

+  CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\ 

+  ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\ 

+  GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\ 

+  AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\ 

+  AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA" 

+  
+  iubs = map (c2w *** id) 

+  [('a',0.27),('c',0.12),('g',0.12),('t',0.27),('B',0.02) 

+  ,('D',0.02),('H',0.02),('K',0.02),('M',0.02),('N',0.02) 

+  ,('R',0.02),('S',0.02),('V',0.02),('W',0.02),('Y',0.02)] 

+  
+  homs = map (c2w *** id) 

+  [('a',0.3029549426680),('c',0.1979883004921) 

+  ,('g',0.1975473066391),('t',0.3015094502008)] 

+  </haskell> 

+  
+  = A new proposal = 

+  
+  Give it some testing if the clean cooseN or the hacked chooseN' works better. Starting with ghc7 fviaC is deprecated, so fllvm is included with (amd) optimized flags. You can also play around with inlining. The posted version worked best for me (amd64, ghc7.0.4, llvm2.7). 

+  
+  <haskell> 

+  {# OPTIONS 

+  fviaC 

+  O3 

+  optcO3 

+  fexcessprecision 

+  optcffastmath 

+  optcmarch=native 

+  #} 

+  {# OPTIONS fllvm O3 fexcessprecision optlcmcpu=amdfam10 optloO3 optlopruneeh optlomem2reg optloloopunswitch optloglobalopt optlcO3 optlcenableunsafefpmath optlomemdep optlomem2reg optloscalarrepl optloloopunroll funboxstrictfields #} 

+  {# LANGUAGE BangPatterns #} 

+  import System.IO 

+  import System.Environment 

+  import Data.Word 

+  import Control.Monad 

+  
+  import qualified Data.ByteString.Lazy as L 

+  import qualified Data.ByteString.Lazy.Char8 as C (pack) 

+  import qualified Data.ByteString as S 

+  import Data.ByteString.Internal 

+  
+  import Foreign.ForeignPtr 

+  import Foreign.Ptr 

+  import Foreign.Storable 

+  
+  updateUnfoldrN :: S.ByteString > Int > (a > Maybe (Word8, a)) > a > IO (Maybe a) 

+  updateUnfoldrN bs i f x0 

+   i' <= 0 = do 

+  withForeignPtr fp (\ p > memset p 0 (fromIntegral len)) 

+  return (Just x0) 

+   otherwise = withForeignPtr fp $ \p > do 

+  (n, res) < go p x0 0 

+  memset (p `plusPtr` n) 0 (fromIntegral (len  n)) 

+  return res 

+  where 

+  (fp, off, len) = toForeignPtr bs 

+  i' = min i len 

+  go !p !x !n 

+   n < i' = case f x of 

+  Nothing > return (n, Nothing) 

+  Just (w,x') > poke p w >> go (p `plusPtr` 1) x' (n+1) 

+   otherwise = return (n, Just x) 

+  {# INLINE updateUnfoldrN #} 

+  
+  main = do 

+  n < getArgs >>= readIO . head 

+  
+  putStrLn ">ONE Homo sapiens alu" 

+  printBlocks takeN (L.cycle alu) (2*n) 

+  
+  let bs = S.replicate 60 (c2w 'x') 

+  putStrLn ">TWO IUB ambiguity codes" 

+  seed < printBlocks (chooseN' bs iubs) 42 (3*n) 

+  
+  putStrLn ">THREE Homo sapiens frequency" 

+  printBlocks (chooseN' bs homs) seed (5*n) 

+  
+   

+  
+  blockSizes :: Int > [Int] 

+  blockSizes 0 = [] 

+  blockSizes n = let m = (min 60 n) in (m : (blockSizes (nm))) 

+  
+  printBlocks :: (a > Int > IO a) > a > Int > IO a 

+  printBlocks f s n = foldM f s $ (blockSizes n) 

+  
+  takeN :: L.ByteString > Int > IO L.ByteString 

+  takeN bs n = L.putStrLn l >> return r 

+  where 

+  (l, r) = L.splitAt (fromIntegral n) bs 

+  {# INLINE takeN #} 

+  
+  
+  chooseN :: [(Word8, Double)] > Int > Int > IO Int 

+  chooseN ds seed n = S.putStrLn bs >> return seed' 

+  where 

+  (bs, Just seed') = S.unfoldrN n (chooseNext ds) seed 

+  {# INLINE chooseN #} 

+  
+  chooseN' :: ByteString > [(Word8, Double)] > Int > Int > IO Int 

+  chooseN' bs ds seed n = do 

+  Just seed' < updateUnfoldrN bs n (chooseNext ds) seed 

+  S.putStrLn bs 

+  return seed' 

+  {# INLINE chooseN' #} 

+  
+  chooseNext :: [(Word8, Double)] > Int > Maybe (Word8, Int) 

+  chooseNext ds seed = Just (choose ds newran, newseed) 

+  where 

+  newseed = (seed * ia + ic) `rem` im 

+  newran = fromIntegral newseed / imd 

+  {# INLINE chooseNext #} 

+  
+  choose :: [(Word8, Double)] > Double > Word8 

+  choose [(b,_)] _ = b 

+  choose ((b,f):xs) p = if p < f then b else choose xs (p  f) 

+  
+   

+  im = 139968 

+  imd = 139968.0 

+  ia = 3877 

+  ic = 29573 

+  
+  (***) :: (a > c) > (b > d) > (a, b) > (c, d) 

+  (***) f g (x, y) = (f x, g y) 

+  {# INLINE (***) #} 

+   

+  
+  alu = C.pack 

+  "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\ 

+  \GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\ 

+  \CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\ 

+  \ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\ 

+  \GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\ 

+  \AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\ 

+  \AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA" 

+  
+  iubs = map (c2w *** id) 

+  [('a',0.27),('c',0.12),('g',0.12),('t',0.27),('B',0.02) 

+  ,('D',0.02),('H',0.02),('K',0.02),('M',0.02),('N',0.02) 

+  ,('R',0.02),('S',0.02),('V',0.02),('W',0.02),('Y',0.02)] 

+  
+  homs = map (c2w *** id) 

+  [('a',0.3029549426680),('c',0.1979883004921) 

+  ,('g',0.1975473066391),('t',0.3015094502008)] 

+  </haskell> 

+  
+  = Old entry = 

+  
+  This entry is similar to the "Old entry" on this page, but has been updated to use Data.ByteString. It appears to be respectably fast, execution time is about 3.5 times the C++ version in initial tests. 

+  
+  It has been [https://alioth.debian.org/tracker/index.php?func=detail&aid=304147&group_id=30402&atid=411646 submitted]. 

+  
+  <haskell> 

+  { The Computer Language Shootout 

+  http://shootout.alioth.debian.org/ 

+  contributed by Jeff Newbern 

+  updated by Spencer Janssen and Don Stewart } 

+  
+  import System 

+  import qualified Data.ByteString.Char8 as B 

+  
+  randomSequence :: Int > [(Char,Double)] > Int > (B.ByteString, Int) 

+  randomSequence n bf seed = (sequence, seed') 

+  where (sequence, Just seed') = B.unfoldrN n f seed 

+  f s = Just (chooseBase bf (normalize s), nextSeed s) 

+  
+  chooseBase :: [(Char,Double)] > Double > Char 

+  chooseBase [(b,_)] _ = b 

+  chooseBase ((b,f):xs) p  p < f = b 

+   otherwise = chooseBase xs (pf) 

+  
+  writeFasta label title sequence = do 

+  putStrLn $ ">" ++ label ++ " " ++ title 

+  mapM_ B.putStrLn $ split 60 sequence 

+  
+  split n xs  B.null xs = [] 

+   otherwise = l : split n r 

+  where (l, r) = B.splitAt n xs 

+  
+  main = do 

+  n < getArgs >>= readIO . head 

+  let aluLen = 1 + 2 * n `div` B.length alu 

+  aluSeq = B.take (2 * n) . B.concat . replicate aluLen $ alu 

+  (iubSeq, seed) = randomSequence (3 * n) iub initSeed 

+  (homSeq, _) = randomSequence (5 * n) homosapiens seed 

+  
+  writeFasta "ONE" "Homo sapiens alu" aluSeq 

+  writeFasta "TWO" "IUB ambiguity codes" iubSeq 

+  writeFasta "THREE" "Homo sapiens frequency" homSeq 

+  
+  alu = B.pack 

+  "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\ 

+  \GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\ 

+  \CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\ 

+  \ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\ 

+  \GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\ 

+  \AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\ 

+  \AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA" 

+  
+  iub = [ ('a', 0.27), ('c', 0.12), ('g', 0.12), ('t', 0.27), ('B', 0.02) 

+  , ('D', 0.02), ('H', 0.02), ('K', 0.02), ('M', 0.02), ('N', 0.02) 

+  , ('R', 0.02), ('S', 0.02), ('V', 0.02), ('W', 0.02), ('Y', 0.02) ] 

+  
+  homosapiens = [ ('a', 0.3029549426680), ('c', 0.1979883004921), 

+  ('g', 0.1975473066391), ('t', 0.3015094502008) ] 

+  
+  im = 139968 

+  ia = 3877 

+  ic = 29573 

+  nextSeed s = (s * ia + ic) `rem` im 

+  normalize n = (fromIntegral n) * (1.0 / fromIntegral im) 

+  initSeed = nextSeed 42 

+  </haskell> 

+  
+  [[User:ArthurVanLeeuwenArthurVanLeeuwen]] 13:38, 26 January 2007 (UTC) Changing chooseBase to not do the (pf), adding 

+  <haskell> 

+  cumulative bfs = zip bases (scanl1 (+) frequencies) 

+  where (bases,frequencies) = unzip bfs 

+  </haskell> 

+  and passing (cumulative iub) rather than iub (etc.) may give another 10 percent speedup. 

= Current best entry (Haskell GHC #3) = 
= Current best entry (Haskell GHC #3) = 

−  This entry is the one that is used on the [http://shootout.alioth.debian.org/benchmark.php?test=fasta&lang=all debian] and [http://shootout.alioth.debian.org/gp4/benchmark.php?test=fasta&lang=all gentoo] shootouts. 

+  This entry has been disqualified. From the Shootout judges: "Preconverting to Int is a really interesting approach  we're looking for the vanilla approach." 

It now has respectable performance and memory usage. 
It now has respectable performance and memory usage. 

Line 94:  Line 524:  
let (full,end) = total `divMod` perLine 
let (full,end) = total `divMod` perLine 

fullLine n = let ptr = advancePtr aluBuf ((n * perLine) `mod` l) 
fullLine n = let ptr = advancePtr aluBuf ((n * perLine) `mod` l) 

−  in hPutBuf stdout ptr perLine >> 
+  in hPutBuf stdout ptr perLine >> putChar '\n' 
lastLine = let ptr = advancePtr aluBuf ((full*perLine) `mod` l) 
lastLine = let ptr = advancePtr aluBuf ((full*perLine) `mod` l) 

−  in hPutBuf stdout ptr end >> 
+  in hPutBuf stdout ptr end >> putChar '\n' 
mapM_ fullLine [0..pred full] 
mapM_ fullLine [0..pred full] 

when (end>0) lastLine 
when (end>0) lastLine 

Line 133:  Line 563:  
writeWrapped (5*n) (chooseBase homosapiens) seed' 
writeWrapped (5*n) (chooseBase homosapiens) seed' 

</haskell> 
</haskell> 

+  
+  An extremely minor gripe, how about using: 

+  <haskell> 

+  alu = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\ 

+  \GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\ 

+  \CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\ 

+  \ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\ 

+  \GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\ 

+  \AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\ 

+  \AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA" 

+  </haskell> 

+  
+  Rather than all those ++... This won't help performance one bit I would wager, but uses the "proper" way of doing multiline strings.[[User:SebastianSylvanSebastianSylvan]] 11:40, 10 November 2006 (UTC) 

+  
+  [[User:ArthurVanLeeuwenArthurVanLeeuwen]] 14:48, 26 January 2007 (UTC) I noticed that changing chooseBase in this program so that it conforms to the arbitrary rules set by the shootout maintainers makes this program only perform about 10% faster than the ByteString version above. 

= Nonleaking Entry = 
= Nonleaking Entry = 

Line 154:  Line 599:  
} 
} 

+  import Control.Monad (replicateM) 

import Control.Monad.Trans 
import Control.Monad.Trans 

import Control.Monad.State 
import Control.Monad.State 

Line 199:  Line 645:  
return (normalize seed') 
return (normalize seed') 

−  prngN count = 
+  prngN count = replicateM count prng 
 write a sequence in Fasta format 
 write a sequence in Fasta format 

Line 206:  Line 652:  
do putStrLn $ ">" ++ (label ++ (" " ++ title)) 
do putStrLn $ ">" ++ (label ++ (" " ++ title)) 

writeWrapped 60 sequence 
writeWrapped 60 sequence 

−  where writeWrapped _ [] = 
+  where writeWrapped _ [] = return () 
writeWrapped len str = do let (s1,s2) = splitAt len str 
writeWrapped len str = do let (s1,s2) = splitAt len str 

putStrLn s1 
putStrLn s1 

Line 215:  Line 661:  
writeWrapped' :: Int > Int > (Double>Base) > Pseudo () 
writeWrapped' :: Int > Int > (Double>Base) > Pseudo () 

−  writeWrapped' wrap total trans = 
+  writeWrapped' wrap total trans = work total 
−  +  where work 0 = return () 

−  +  work n = do let c' = min wrap n 

−  n > do let c' = min wrap n 

nextC = c  c' 
nextC = c  c' 

s < liftM (map trans) (prngN c') 
s < liftM (map trans) (prngN c') 

liftIO $ putStrLn s 
liftIO $ putStrLn s 

work nextC 
work nextC 

−  in work total 

writeWrapped = writeWrapped' 60 
writeWrapped = writeWrapped' 60 
Latest revision as of 13:35, 21 July 2011
This is a Shootout Entry for [1].
The description of the program:
Each program should
 encode the expected cumulative probabilities for 2 alphabets
 generate DNA sequences, by weighted random selection from the alphabets (using the pseudorandom number generator from the random benchmark)
 generate DNA sequences, by copying from a given sequence
 write 3 sequences linebyline in FASTA format
We'll use the generated FASTA file as input for other benchmarks (reversecomplement, knucleotide).
Contents
Current entry
Same as previous, but with fewer strictness annotations thanks to strictify.
{# OPTIONS fviaC O2 optcO2 optcffastmath fbangpatterns fexcessprecision #}

 The Computer Language Benchmarks Game
 http://shootout.alioth.debian.org/

 Contributed by Don Stewart
 A lazy bytestring solution.
 Unnecessary strictness annotations removed by Sterling Clover 2/08

 Add:
 optcmfpmath=sse optcmsse2

import System
import Data.Word
import Control.Arrow
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as C (pack,unfoldr)
import qualified Data.ByteString as S
import Data.ByteString.Internal
import Data.ByteString.Unsafe
main = do
n < getArgs >>= readIO . head
writeFasta "ONE" "Homo sapiens alu" (n*2) (L.cycle alu)
g < unfold "TWO" "IUB ambiguity codes" (n*3) (look iubs) 42
unfold "THREE" "Homo sapiens frequency" (n*5) (look homs) g


 lazily unfold the randomised dna sequences

unfold l t n f g = putStrLn (">" ++ l ++ " " ++ t) >> unroll f g n
unroll :: (Int > (Word8, Int)) > Int > Int > IO Int
unroll f = loop
where
loop r 0 = return r
loop !r i = case S.unfoldrN m (Just . f) r of
(!s, Just r') > do
S.putStrLn s
loop r' (im)
where m = min i 60
look ds k = (choose ds d, j)
where (d,j) = rand k
choose :: [(Word8,Float)] > Float > Word8
choose [(b,_)] _ = b
choose ((b,f):xs) p = if p < f then b else choose xs (pf)


 only demand as much of the infinite sequence as we require
writeFasta label title n s = do
putStrLn $ ">" ++ label ++ " " ++ title
let (t:ts) = L.toChunks s
go ts t n
where
go ss s n
 l60 && n60 = S.putStrLn l >> go ss r (n60)
 n60 = S.putStr s >> S.putStrLn a >> go (tail ss) b (n60)
 n <= ln = S.putStrLn (S.take n s)
 otherwise = S.putStr s >> S.putStrLn (S.take (nln) (head ss))
where
ln = S.length s
l60 = ln >= 60
n60 = n >= 60
(l,r) = S.splitAt 60 s
(a,b) = S.splitAt (60ln) (head ss)

im = 139968
ia = 3877
ic = 29573
rand :: Int > (Float, Int)
rand seed = (newran,newseed)
where
newseed = (seed * ia + ic) `rem` im
newran = 1.0 * fromIntegral newseed / imd
imd = fromIntegral im

alu = C.pack
"GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\
\GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\
\CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\
\ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\
\GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\
\AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\
\AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"
iubs = map (c2w *** id)
[('a',0.27),('c',0.12),('g',0.12),('t',0.27),('B',0.02)
,('D',0.02),('H',0.02),('K',0.02),('M',0.02),('N',0.02)
,('R',0.02),('S',0.02),('V',0.02),('W',0.02),('Y',0.02)]
homs = map (c2w *** id)
[('a',0.3029549426680),('c',0.1979883004921)
,('g',0.1975473066391),('t',0.3015094502008)]
Previous entry
Pretty fast, uses some cute lazy bytestring tricks. Compile with optcmfpmath=sse optcmsse2
{# OPTIONS O2 optcO2 optcffastmath fbangpatterns fexcessprecision #}

 The Computer Language Benchmarks Game
 http://shootout.alioth.debian.org/

 Contributed by Don Stewart
 A lazy bytestring solution.

 Add:
 optcmfpmath=sse optcmsse2

import System
import Data.Word
import Control.Arrow
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as C (pack)
import qualified Data.ByteString as S
import Data.ByteString.Internal
main = do
n < getArgs >>= readIO . head
writeFasta "ONE" "Homo sapiens alu" (n*2) (L.cycle alu)
g < unfold "TWO" "IUB ambiguity codes" (n*3) (look iubs) 42
unfold "THREE" "Homo sapiens frequency" (n*5) (look homs) g


 lazily unfold the randomised dna sequences

unfold l t n f !g = putStrLn (">" ++ l ++ " " ++ t) >> unroll f g n
unroll :: (Int > (Word8, Int)) > Int > Int > IO Int
unroll f = loop
where
loop r 0 = return r
loop !r !i = case S.unfoldrN m (Just . f) r of
(!s, Just r') > do
S.putStrLn s
loop r' (im)
where m = min i 60
look ds !k = let (d,j) = rand k in (choose ds d, j)
choose :: [(Word8,Float)] > Float > Word8
choose [(b,_)] _ = b
choose ((!b,!f):xs) !p = if p < f then b else choose xs (pf)


 only demand as much of the infinite sequence as we require
writeFasta label title n s = do
putStrLn $ ">" ++ label ++ " " ++ title
let (t:ts) = L.toChunks s
go ts t n
where
go ss !t !n
 l60 && n60 = S.putStrLn l >> go ss r (n60)
 n60 = S.putStr t >> S.putStrLn a >> go (tail ss) b (n60)
 n <= ln = S.putStrLn (S.take n t)
 otherwise = S.putStr t >> S.putStrLn (S.take (nln) (head ss))
where
!ln = S.length t
!l60 = ln >= 60
!n60 = n >= 60
(l,r) = S.splitAt 60 t
(a,b) = S.splitAt (60ln) (head ss)

im = 139968
ia = 3877
ic = 29573
rand :: Int > (Float, Int)
rand !seed = (newran,newseed)
where
!newseed = (seed * ia + ic) `rem` im
!newran = 1.0 * fromIntegral newseed / imd
imd = fromIntegral im

alu = C.pack
"GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\
GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\
CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\
ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\
GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\
AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\
AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"
iubs = map (c2w *** id)
[('a',0.27),('c',0.12),('g',0.12),('t',0.27),('B',0.02)
,('D',0.02),('H',0.02),('K',0.02),('M',0.02),('N',0.02)
,('R',0.02),('S',0.02),('V',0.02),('W',0.02),('Y',0.02)]
homs = map (c2w *** id)
[('a',0.3029549426680),('c',0.1979883004921)
,('g',0.1975473066391),('t',0.3015094502008)]
A new proposal
Give it some testing if the clean cooseN or the hacked chooseN' works better. Starting with ghc7 fviaC is deprecated, so fllvm is included with (amd) optimized flags. You can also play around with inlining. The posted version worked best for me (amd64, ghc7.0.4, llvm2.7).
{# OPTIONS
fviaC
O3
optcO3
fexcessprecision
optcffastmath
optcmarch=native
#}
{# OPTIONS fllvm O3 fexcessprecision optlcmcpu=amdfam10 optloO3 optlopruneeh optlomem2reg optloloopunswitch optloglobalopt optlcO3 optlcenableunsafefpmath optlomemdep optlomem2reg optloscalarrepl optloloopunroll funboxstrictfields #}
{# LANGUAGE BangPatterns #}
import System.IO
import System.Environment
import Data.Word
import Control.Monad
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as C (pack)
import qualified Data.ByteString as S
import Data.ByteString.Internal
import Foreign.ForeignPtr
import Foreign.Ptr
import Foreign.Storable
updateUnfoldrN :: S.ByteString > Int > (a > Maybe (Word8, a)) > a > IO (Maybe a)
updateUnfoldrN bs i f x0
 i' <= 0 = do
withForeignPtr fp (\ p > memset p 0 (fromIntegral len))
return (Just x0)
 otherwise = withForeignPtr fp $ \p > do
(n, res) < go p x0 0
memset (p `plusPtr` n) 0 (fromIntegral (len  n))
return res
where
(fp, off, len) = toForeignPtr bs
i' = min i len
go !p !x !n
 n < i' = case f x of
Nothing > return (n, Nothing)
Just (w,x') > poke p w >> go (p `plusPtr` 1) x' (n+1)
 otherwise = return (n, Just x)
{# INLINE updateUnfoldrN #}
main = do
n < getArgs >>= readIO . head
putStrLn ">ONE Homo sapiens alu"
printBlocks takeN (L.cycle alu) (2*n)
let bs = S.replicate 60 (c2w 'x')
putStrLn ">TWO IUB ambiguity codes"
seed < printBlocks (chooseN' bs iubs) 42 (3*n)
putStrLn ">THREE Homo sapiens frequency"
printBlocks (chooseN' bs homs) seed (5*n)

blockSizes :: Int > [Int]
blockSizes 0 = []
blockSizes n = let m = (min 60 n) in (m : (blockSizes (nm)))
printBlocks :: (a > Int > IO a) > a > Int > IO a
printBlocks f s n = foldM f s $ (blockSizes n)
takeN :: L.ByteString > Int > IO L.ByteString
takeN bs n = L.putStrLn l >> return r
where
(l, r) = L.splitAt (fromIntegral n) bs
{# INLINE takeN #}
chooseN :: [(Word8, Double)] > Int > Int > IO Int
chooseN ds seed n = S.putStrLn bs >> return seed'
where
(bs, Just seed') = S.unfoldrN n (chooseNext ds) seed
{# INLINE chooseN #}
chooseN' :: ByteString > [(Word8, Double)] > Int > Int > IO Int
chooseN' bs ds seed n = do
Just seed' < updateUnfoldrN bs n (chooseNext ds) seed
S.putStrLn bs
return seed'
{# INLINE chooseN' #}
chooseNext :: [(Word8, Double)] > Int > Maybe (Word8, Int)
chooseNext ds seed = Just (choose ds newran, newseed)
where
newseed = (seed * ia + ic) `rem` im
newran = fromIntegral newseed / imd
{# INLINE chooseNext #}
choose :: [(Word8, Double)] > Double > Word8
choose [(b,_)] _ = b
choose ((b,f):xs) p = if p < f then b else choose xs (p  f)

im = 139968
imd = 139968.0
ia = 3877
ic = 29573
(***) :: (a > c) > (b > d) > (a, b) > (c, d)
(***) f g (x, y) = (f x, g y)
{# INLINE (***) #}

alu = C.pack
"GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\
\GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\
\CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\
\ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\
\GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\
\AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\
\AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"
iubs = map (c2w *** id)
[('a',0.27),('c',0.12),('g',0.12),('t',0.27),('B',0.02)
,('D',0.02),('H',0.02),('K',0.02),('M',0.02),('N',0.02)
,('R',0.02),('S',0.02),('V',0.02),('W',0.02),('Y',0.02)]
homs = map (c2w *** id)
[('a',0.3029549426680),('c',0.1979883004921)
,('g',0.1975473066391),('t',0.3015094502008)]
Old entry
This entry is similar to the "Old entry" on this page, but has been updated to use Data.ByteString. It appears to be respectably fast, execution time is about 3.5 times the C++ version in initial tests.
It has been submitted.
{ The Computer Language Shootout
http://shootout.alioth.debian.org/
contributed by Jeff Newbern
updated by Spencer Janssen and Don Stewart }
import System
import qualified Data.ByteString.Char8 as B
randomSequence :: Int > [(Char,Double)] > Int > (B.ByteString, Int)
randomSequence n bf seed = (sequence, seed')
where (sequence, Just seed') = B.unfoldrN n f seed
f s = Just (chooseBase bf (normalize s), nextSeed s)
chooseBase :: [(Char,Double)] > Double > Char
chooseBase [(b,_)] _ = b
chooseBase ((b,f):xs) p  p < f = b
 otherwise = chooseBase xs (pf)
writeFasta label title sequence = do
putStrLn $ ">" ++ label ++ " " ++ title
mapM_ B.putStrLn $ split 60 sequence
split n xs  B.null xs = []
 otherwise = l : split n r
where (l, r) = B.splitAt n xs
main = do
n < getArgs >>= readIO . head
let aluLen = 1 + 2 * n `div` B.length alu
aluSeq = B.take (2 * n) . B.concat . replicate aluLen $ alu
(iubSeq, seed) = randomSequence (3 * n) iub initSeed
(homSeq, _) = randomSequence (5 * n) homosapiens seed
writeFasta "ONE" "Homo sapiens alu" aluSeq
writeFasta "TWO" "IUB ambiguity codes" iubSeq
writeFasta "THREE" "Homo sapiens frequency" homSeq
alu = B.pack
"GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\
\GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\
\CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\
\ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\
\GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\
\AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\
\AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"
iub = [ ('a', 0.27), ('c', 0.12), ('g', 0.12), ('t', 0.27), ('B', 0.02)
, ('D', 0.02), ('H', 0.02), ('K', 0.02), ('M', 0.02), ('N', 0.02)
, ('R', 0.02), ('S', 0.02), ('V', 0.02), ('W', 0.02), ('Y', 0.02) ]
homosapiens = [ ('a', 0.3029549426680), ('c', 0.1979883004921),
('g', 0.1975473066391), ('t', 0.3015094502008) ]
im = 139968
ia = 3877
ic = 29573
nextSeed s = (s * ia + ic) `rem` im
normalize n = (fromIntegral n) * (1.0 / fromIntegral im)
initSeed = nextSeed 42
ArthurVanLeeuwen 13:38, 26 January 2007 (UTC) Changing chooseBase to not do the (pf), adding
cumulative bfs = zip bases (scanl1 (+) frequencies)
where (bases,frequencies) = unzip bfs
and passing (cumulative iub) rather than iub (etc.) may give another 10 percent speedup.
Current best entry (Haskell GHC #3)
This entry has been disqualified. From the Shootout judges: "Preconverting to Int is a really interesting approach  we're looking for the vanilla approach."
It now has respectable performance and memory usage.
{# OPTIONS_GHC O2 funboxstrictfields #}
 The Great Computer Language Shootout
 http://shootout.alioth.debian.org/

 contributed by Jeff Newbern
 Modified to fastest.hs by Chris Kuklewicz, 6 Jan 2006
 Modified to fixedfasta.hs by Chris Kuklewicz, 17 Jan 2006

 Uses random generation code derived from Simon Marlow and Einar
 Karttunen's "random" test entry. No longer uses Double during run,
 everything has been preconverted to Int. And preconverted to a
 binary tree for lookup. Ideally this tree could be constructed
 with the probabilities in mind, but it isn't in this version.

 Compile with ghc make resubfasta.hs o resubfasta.ghc_run
 Run with "./rsubfasta.ghc_run %A" where %A is the parameter
import Control.Monad
import Data.Char(chr,ord)
import Data.List(mapAccumL)
import Data.Word(Word8)
import Data.IORef
import Foreign
import System(getArgs)
import System.IO
type Base = Word8
data BaseFrequencyTree = Node !Base
 TreeNodes !Int !Base !Base
 Tree !Int !BaseFrequencyTree !BaseFrequencyTree
data Seed = Seed !Int
b2c :: Word8 > Char
b2c = chr . fromEnum
c2b = toEnum . ord
alu = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG" ++
"GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA" ++
"CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT" ++
"ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA" ++
"GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG" ++
"AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC" ++
"AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"
im = 139968 :: Double
iub = mkTree $ snd . mapAccumL (\rt (c,f) > (f+rt,(c2b c,ceiling $ im*(f+rt)))) 0.0 $
[ ('a', 0.27), ('c', 0.12), ('g', 0.12), ('t', 0.27), ('B', 0.02),
('D', 0.02), ('H', 0.02), ('K', 0.02), ('M', 0.02), ('N', 0.02),
('R', 0.02), ('S', 0.02), ('V', 0.02), ('W', 0.02), ('Y', 0.02) ]
homosapiens = mkTree $ snd . mapAccumL (\rt (c,f) > (f+rt,(c2b c,ceiling $ im*(f+rt)))) 0.0 $
[ ('a', 0.3029549426680), ('c', 0.1979883004921), ('g', 0.1975473066391), ('t', 0.3015094502008) ]
mkTree [(b,_)] = Node b
mkTree [(b,f),(b',_)] = TreeNodes f b b'
mkTree xs = let (h,t) = splitAt (length xs `div` 2) xs
(_,f) = last h
in Tree f (mkTree h) (mkTree t)
chooseBase (Node b) _ = b
chooseBase (TreeNodes f b b') p = if (p<f) then b else b'
chooseBase (Tree f l r) p  p < f = chooseBase l p
 otherwise = chooseBase r p
writeFastaHeader label title = (putStrLn (('>':label) ++ (' ':title)))
perLine = 60
writeAluBuffer total = do
let l = length alu
bufSize = l + perLine  1
aluBuf < mallocArray bufSize
foldM_ (\ptr c > poke ptr (c2b c) >> return (advancePtr ptr 1)) aluBuf (take bufSize (cycle alu))
let (full,end) = total `divMod` perLine
fullLine n = let ptr = advancePtr aluBuf ((n * perLine) `mod` l)
in hPutBuf stdout ptr perLine >> putChar '\n'
lastLine = let ptr = advancePtr aluBuf ((full*perLine) `mod` l)
in hPutBuf stdout ptr end >> putChar '\n'
mapM_ fullLine [0..pred full]
when (end>0) lastLine
writeWrapped total trans initSeed = do
seedRef < newIORef initSeed
let l = succ perLine
(im,ia,ic)=(139968,3877,29573)
nextSeed (Seed s) = Seed ( (s * ia + ic) `mod` im )
prng = do newSeed < return.nextSeed =<< readIORef seedRef
writeIORef seedRef newSeed
return newSeed
buf < mallocArray l
poke (advancePtr buf perLine) (c2b '\n')
let (full,end) = total `divMod` perLine
fill 0 _ = return ()
fill i ptr = do (Seed b) < prng
poke ptr (trans b)
fill (pred i) (advancePtr ptr 1)
fullLine = do fill perLine buf
hPutBuf stdout buf l
lastLine = do fill end buf
poke (advancePtr buf end) (c2b '\n')
hPutBuf stdout buf (succ end)
replicateM_ full fullLine
when (end>0) lastLine
readIORef seedRef
main = do args < getArgs
let n = if null args then 2500000 else read (head args)
writeFastaHeader "ONE" "Homo sapiens alu"
writeAluBuffer (2*n)
writeFastaHeader "TWO" "IUB ambiguity codes"
seed' < writeWrapped (3*n) (chooseBase iub) (Seed 42)
writeFastaHeader "THREE" "Homo sapiens frequency"
writeWrapped (5*n) (chooseBase homosapiens) seed'
An extremely minor gripe, how about using:
alu = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\
\GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\
\CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\
\ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\
\GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\
\AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\
\AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"
Rather than all those ++... This won't help performance one bit I would wager, but uses the "proper" way of doing multiline strings.SebastianSylvan 11:40, 10 November 2006 (UTC)
ArthurVanLeeuwen 14:48, 26 January 2007 (UTC) I noticed that changing chooseBase in this program so that it conforms to the arbitrary rules set by the shootout maintainers makes this program only perform about 10% faster than the ByteString version above.
Nonleaking Entry
This a new entry, which is mostly the old entry below modified to use StateT to allow the random number stream to be finite, so no splitAt or drop functions are needed. This fixes the space leak and improves the time by almost a factor of 2. Serious speed will require changing from String to a Word8 array of some sort.
{
The Great Computer Language Shootout
http://shootout.alioth.debian.org/
contributed by Jeff Newbern
Modified by Chris Kuklewicz, 3 Jan 2006
Uses random generation code derived from Simon Marlow and Einar
Karttunen's "random" test entry.
Modified version note: Use a StateT around IO to manage
the seed. This makes interleaving the generation and output of the
sequences easier.
}
import Control.Monad (replicateM)
import Control.Monad.Trans
import Control.Monad.State
import System(getArgs)
type Base = Char
type Sequence = [Base]
alu :: Sequence  predefined sequence
alu = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG" ++
"GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA" ++
"CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT" ++
"ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA" ++
"GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG" ++
"AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC" ++
"AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"
type BaseFrequency = (Base,Double)
iub :: [BaseFrequency]
iub = [ ('a', 0.27), ('c', 0.12), ('g', 0.12), ('t', 0.27),
('B', 0.02), ('D', 0.02), ('H', 0.02), ('K', 0.02), ('M', 0.02), ('N', 0.02),
('R', 0.02), ('S', 0.02), ('V', 0.02), ('W', 0.02), ('Y', 0.02) ]
homosapiens :: [BaseFrequency]
homosapiens = [ ('a', 0.3029549426680), ('c', 0.1979883004921),
('g', 0.1975473066391), ('t', 0.3015094502008) ]
 select a base whose interval covers the given double
chooseBase :: [BaseFrequency] > Double > Base
chooseBase [(b,_)] _ = b
chooseBase ((b,f):xs) p  p < f = b
 otherwise = chooseBase xs (pf)
type Seed = Int
type Pseudo a = StateT Seed IO a
prng :: Pseudo Double
prng = let nextSeed s = (s * ia + ic) `mod` im
normalize n = (fromIntegral n) * (1.0 / fromIntegral im)
im = 139968; ia = 3877; ic = 29573
in do seed < get
let seed' = nextSeed seed
put seed'
return (normalize seed')
prngN count = replicateM count prng
 write a sequence in Fasta format
writeFasta :: String > String > Sequence > IO ()
writeFasta label title sequence =
do putStrLn $ ">" ++ (label ++ (" " ++ title))
writeWrapped 60 sequence
where writeWrapped _ [] = return ()
writeWrapped len str = do let (s1,s2) = splitAt len str
putStrLn s1
writeWrapped len s2
writeFastaHeader :: (MonadIO m) => String > String > m ()
writeFastaHeader label title = liftIO $ putStrLn $ ">" ++ (label ++ (" " ++ title))
writeWrapped' :: Int > Int > (Double>Base) > Pseudo ()
writeWrapped' wrap total trans = work total
where work 0 = return ()
work n = do let c' = min wrap n
nextC = c  c'
s < liftM (map trans) (prngN c')
liftIO $ putStrLn s
work nextC
writeWrapped = writeWrapped' 60
main = do args < getArgs
let n = if (null args) then 1000 else read (head args)
writeFasta "ONE" "Homo sapiens alu" (take (2*n) (cycle alu))
writeFastaHeader "TWO" "IUB ambiguity codes"
seed' < execStateT (writeWrapped (3*n) (chooseBase iub)) 42
writeFastaHeader "THREE" "Homo sapiens frequency"
execStateT (writeWrapped (5*n) (chooseBase homosapiens)) seed'
Old Entry
This is the old entry. It has a huge memory leak due to laziness.
 The Great Computer Language Shootout
 http://shootout.alioth.debian.org/
 contributed by Jeff Newbern
 Uses random generation code derived from Simon Marlow and Einar Karttunen's
 "random" test entry.
 Note: This code has not been optimized *at all*. It is written to be clear
 and to follow standard Haskell idioms as much as possible (but we have to match
 the stateful PRNG idiom in the test definition, which is oriented toward an
 imperative language). Performance is decent with ghc O2, but if it can be
 improved without sacrificing the clarity of the code, by all means go for it!
import System(getArgs)
type Base = Char
type Sequence = [Base]
alu :: Sequence  predefined sequence
alu = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG" ++
"GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA" ++
"CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT" ++
"ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA" ++
"GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG" ++
"AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC" ++
"AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"
type BaseFrequency = (Base,Double)
iub :: [BaseFrequency]
iub = [ ('a', 0.27), ('c', 0.12), ('g', 0.12), ('t', 0.27),
('B', 0.02), ('D', 0.02), ('H', 0.02), ('K', 0.02), ('M', 0.02), ('N', 0.02),
('R', 0.02), ('S', 0.02), ('V', 0.02), ('W', 0.02), ('Y', 0.02) ]
homosapiens :: [BaseFrequency]
homosapiens = [ ('a', 0.3029549426680), ('c', 0.1979883004921),
('g', 0.1975473066391), ('t', 0.3015094502008) ]
 select a base whose interval covers the given double
chooseBase :: [BaseFrequency] > Double > Base
chooseBase [(b,_)] _ = b
chooseBase ((b,f):xs) p  p < f = b
 otherwise = chooseBase xs (pf)
 write a sequence in Fasta format
writeFasta :: String > String > Sequence > IO ()
writeFasta label title sequence =
do putStrLn $ ">" ++ (label ++ (" " ++ title))
writeWrapped 60 sequence
where writeWrapped _ [] = do return ()
writeWrapped len str = do let (s1,s2) = splitAt len str
putStrLn s1
writeWrapped len s2
 generate an infinite sequence of random doubles using the
 prng from the "random" test
probs :: Int > [Double]
probs seed = tail $ map normalize (iterate nextSeed seed)
where nextSeed s = (s * ia + ic) `mod` im
im = 139968
ia = 3877
ic = 29573
normalize n = (fromIntegral n) * (1.0 / fromIntegral im)
main = do args < getArgs
let n = if (null args) then 1000 else read (head args)
writeFasta "ONE" "Homo sapiens alu" (take (2*n) (cycle alu))
let (seq1,seq2) = splitAt (3*n) (probs 42)  we have to match the imperative version
let seq2' = take (5*n) seq2  instead of using the Haskell idiom
writeFasta "TWO" "IUB ambiguity codes" (map (chooseBase iub) seq1)
writeFasta "THREE" "Homo sapiens frequency" (map (chooseBase homosapiens) seq2')