Shootout/Reverse complement
This is a ShootoutEntry for [1].
about the reverse-complement benchmark
Each program should
- read line-by-line a redirected FASTA format file from stdin
- for each sequence: write the id, description, and the reverse-complement sequence in FASTA format to stdout
We use the FASTA file generated by the fasta benchmark as input for this benchmark. Note: the file may include both lowercase and uppercase codes.
We use these code complements:
code meaning complement A A T C C G G G C T/U T A M A or C K R A or G Y W A or T W S C or G S Y C or T R K G or T M V A or C or G B H A or C or T D D A or G or T H B C or G or T V N G or A or T or C N
"by knowing the sequence of bases of one strand of DNA we immediately know the sequence of the DNA strand which will bind to it, this strand is called the reverse complement"
DNA: Structure and Function
New Code
Performance for various entries. Size of data = 5M.
Code | Time in seconds for a given file |
---|---|
Don Stewart v1 | 30.85 |
Original entry | 24.13 |
Mutable version 1 | 2.44 |
Mutable version 2 | 1.67 |
gcc -Onot | 0.33 |
GHC 6.8
{-# OPTIONS -fvia-C -O2 -optc-O3 -fglasgow-exts #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Contributed by Sterling Clover
-- Heavily inspired by contribution from Don Stewart
-- Suggested flags: -funfolding-use-threshold=32 -O2 -optc-O3
--
import qualified Data.ByteString.Char8 as S
import Data.ByteString.Internal
import Data.ByteString.Unsafe
import Foreign
import Control.Arrow
import GHC.Base
import GHC.Ptr
import GHC.IOBase
import Control.Monad (zipWithM_)
main = uncurry proc =<< clines `fmap` S.getContents
proc = zipWithM_ (\h b -> S.putStrLn h >> revcomp b >> writeFasta b)
writeFasta t
| S.null t =return ()
| otherwise = S.putStrLn l >> writeFasta r
where (l,r) = S.splitAt 60 t
clines :: ByteString -> ([ByteString],[ByteString])
clines ps = clines' ps ([],[])
where
clines' ps accum@(f,s)
| otherwise = case S.elemIndex '\n' ps of
Just n -> clines'' (S.drop (n+1) ps) (f++[S.take n ps],s)
clines'' ps accum@(f,s)
| otherwise = case S.elemIndex '>' ps of
Nothing -> (f,s++[S.filter (/='\n') ps])
Just n -> clines' (S.drop n ps) (f,s++[S.filter (/='\n') . S.take n $ ps])
comps = map (ord *** c2w) [
('A' , 'T'), ( 'a' , 'T'), ( 'C' , 'G'), ( 'c' , 'G'), ( 'G' , 'C'),
('g' , 'C'), ( 'T' , 'A'), ( 't' , 'A'), ( 'U' , 'A'), ( 'u' , 'A'),
('M' , 'K'), ( 'm' , 'K'), ( 'R' , 'Y'), ( 'r' , 'Y'), ( 'Y' , 'R'),
('y' , 'R'), ( 'K' , 'M'), ( 'k' , 'M'), ( 'V' , 'B'), ( 'v' , 'B'),
('H' , 'D'), ( 'h' , 'D'), ( 'D' , 'H'), ( 'd' , 'H'), ( 'B' , 'V'), ( 'b' , 'V')]
{- NOINLINE -}
ca :: Ptr Word8
ca = inlinePerformIO $ do
a <- mallocArray 200
mapM_ (uncurry (pokeByteOff a)) $ zip [0..199::Int] [0..199::Word8]
mapM_ (uncurry (pokeByteOff a)) comps
return a
comp :: Word# -> Word#
comp c = rw8 ca (word2Int# c)
revcomp (PS fp s (I# l)) = withForeignPtr fp $ \p -> rc (p `plusPtr` s) 0# (l -# 1#)
where
rc :: Ptr Word8 -> Int# -> Int# -> IO ()
rc p i j = rc' i j
where
rc' i j
| i <# j = do
let x = rw8 p i
ww8 p i (comp (rw8 p j))
ww8 p j (comp x)
rc' (i +# 1#) (j -# 1#)
| i ==# j = ww8 p i (comp (rw8 p i))
| otherwise = return ()
rw8 :: Ptr Word8 -> Int# -> Word#
rw8 (Ptr a) i = case readWord8OffAddr# a i realWorld# of (# _, x #) -> x
ww8 :: Ptr Word8 -> Int# -> Word# -> IO ()
ww8 (Ptr a) i x = IO $ \s -> case writeWord8OffAddr# a i x s of s2 -> (# s2, () #)
Submitted
Faster still. ByteString version. Submitted
{-# OPTIONS -fbang-patterns #-}
--
-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Contributed by Don Stewart
--
import qualified Data.ByteString.Char8 as S
import Data.ByteString.Base
import Foreign
main = do
(s:ss) <- S.lines `fmap` S.getContents
process s ss []
process !s xs@(~(x:xx)) acc
| S.null s = writeFasta acc
| null xs = revcomp s >> writeFasta (s:acc)
| h == '>' = writeFasta acc >> S.putStrLn s >> process x xx []
| otherwise = revcomp s >> process x xx (s:acc)
where
(h,t) = uncons s
uncons s = (w2c (unsafeHead s), unsafeTail s)
comp c = c2w $ case w2c c of
'A' -> 'T'; 'a' -> 'T'; 'C' -> 'G'; 'c' -> 'G'; 'G' -> 'C'
'g' -> 'C'; 'T' -> 'A'; 't' -> 'A'; 'U' -> 'A'; 'u' -> 'A'
'M' -> 'K'; 'm' -> 'K'; 'R' -> 'Y'; 'r' -> 'Y'; 'Y' -> 'R'
'y' -> 'R'; 'K' -> 'M'; 'k' -> 'M'; 'V' -> 'B'; 'v' -> 'B'
'H' -> 'D'; 'h' -> 'D'; 'D' -> 'H'; 'd' -> 'H'; 'B' -> 'V'; 'b' -> 'V'; x -> x
writeFasta [] = return ()
writeFasta (t:ts) = go ts t
where
go [] !s
| S.null s = return ()
| otherwise = S.putStrLn l >> go [] r
where (l,r) = S.splitAt 60 s
go ss !s
| ln >= 60 = S.putStrLn l >> go ss r
| otherwise = S.putStr s >> S.putStrLn a >> go (tail ss) b
where
ln = S.length s
(l,r) = S.splitAt 60 s
(a,b) = S.splitAt (60-ln) (head ss)
--
-- An inplace reverse. Since we have a uniqueness here, just use the FFI as an ST monad
--
revcomp (PS fp s l) = withForeignPtr fp $ \p -> rc (p `plusPtr` s) 0 (l-1)
where
rc :: Ptr Word8 -> Int -> Int -> IO ()
rc !p !i !j
| i < j = do
x <- peekByteOff p i
pokeByteOff p i . comp =<< peekByteOff p j
pokeByteOff p j (comp x)
rc p (i+1) (j-1)
| i == j = pokeByteOff p i . comp =<< peekByteOff p i
| otherwise = return ()
Data.ByteString
Translation of "Don Stewart v1" to use Data.ByteString. About 2x faster than Mutable version 2, on my OpenBSD x86 box.
Todo. Submit this.
{-# OPTIONS -cpp -fglasgow-exts -O2 -optc-O3 -funbox-strict-fields #-}
--
-- Reverse complement benchmark
--
-- Written by Don Stewart
--
import GHC.Base
import Data.Char
import qualified Data.ByteString as B
main = B.getContents >>= process B.empty []
process h b ps
| h `seq` b `seq` ps `seq` False = undefined
| B.null ps = write h b
| x == '>' = write h b >> process h' [] ps'
| x == '\n' = process h b xs
| otherwise = process h ((complement . toUpper $ x) : b) xs
where (x,xs) = (B.unsafeHead ps, B.unsafeTail ps)
(h',ps') = B.breakOn '\n' ps
write h s
| B.null h = return ()
| otherwise = B.putStrLn h >> write_ (B.pack s)
where
write_ t
| B.null t = return ()
| otherwise = let (a,b) = B.splitAt 60 t in B.putStrLn a >> write_ b
complement (C# i) = C# (indexCharOffAddr# arr (ord# i -# 65#))
where arr = "TVGH CD M KN YSAABW R"#
Mutable version 2
This is the fastest Haskell entry so far. It uses mutable arrays, as well as a handcrafted inner-loop on the reverse, and doesn't waste time coercing the input chars to words (and identity, in this case, anyway).
{-# OPTIONS -fglasgow-exts -O2 -optc-O3 #-}
--
--
-- Translation of C version to Haskell by Don Stewart
--
import Data.Char
import Control.Arrow
import Foreign.Marshal.Array
import Foreign.Storable
import Control.Monad
import qualified Control.Exception as C
import System.IO
import GHC.IOBase
import GHC.Base
import GHC.Ptr
import GHC.Word
pairs = map (c2w *** c2w) [('A','T'),('C','G'),('B','V'),('D','H'),('K','M'),('R','Y'),('\0','\0')]
main = do
inp <- mallocArray 129 :: IO (Ptr Word8)
buf <- mallocArray 1024 :: IO (Ptr Word8)
iubP <- sequence [ newArray [x,y] | (x,y) <- pairs ] >>= newArray
iubC <- newArray (map c2w ['\0'..'\255']++[0])
buildIubComplement iubC iubP (0 :: Int)
(slen,inp) <- go 0 128 inp buf iubC
when (slen > 0) $ process iubC inp slen
go slen mlen inp buffer iubC = do
eof <- C.catch (getLine >>= pokeArray0 0 buffer . c2ws . take 1023 >> return False)
(\_ -> return True)
if eof then return (slen,inp) else do
b0 <- buffer ! (0::Int)
if b0 == c2w '>'
then do when (slen > 0) $ process iubC inp slen
lengthArray0 0 buffer >>= hPutBuf stdout buffer >> putChar '\n'
go 0 mlen inp buffer iubC
else do l <- lengthArray0 0 buffer >>= shrink buffer . (+1)
(inp',mlen') <- tweak slen mlen l inp
copyArray (inp' `plusPtr` slen) buffer l
go (slen + l) mlen' inp' buffer iubC
process iubc strand len = do
inplacereverse iubc strand len
(s,l) <- print60 strand len
hPutBuf stdout s l >> putChar '\n'
print60 s n
| n <= 60 = return (s,n)
| otherwise = do
hPutBuf stdout s 60 >> putChar '\n'
print60 (s `advancePtr` 60) (n - 60)
tweak slen mlen l inp
| slen + l <= mlen = return (inp,mlen)
| otherwise = do
let mlen' = mlen + mlen
inp' <- reallocArray0 inp mlen'
tweak slen mlen' l inp'
shrink b l | l <= 0 = return l
| otherwise = do
bl1 <- b ! (l-1)
if not . isAlpha . w2c $ bl1 then shrink b (l-1) else return l
buildIubComplement iubC iubP i = do
i0 <- index2 iubP i (0::Int)
when (i0 /= 0) $ do
i1 <- index2 iubP i (1::Int)
set iubC i0 i1
set iubC i1 i0
set iubC (tolower i0) i1
set iubC (tolower i1) i0
buildIubComplement iubC iubP (i+1)
inplacereverse iubc@(Ptr r) strand@(Ptr s) len@(I# ln) = do
(i,l) <- IO $ reverseit r s 0# (ln -# 1#)
when (i == l) $ strand ! i >>= (iubc !) >>= set strand i
reverseit iubc strand i l s =
if i >=# l
then (# s, (I# i, I# l) #)
else case readWord8OffAddr# strand i s of { (# s, c #) ->
case readWord8OffAddr# strand l s of { (# s, x #) ->
case readWord8OffAddr# iubc (word2Int# x) s of { (# s, y #) ->
case readWord8OffAddr# iubc (word2Int# c) s of { (# s, z #) ->
case writeWord8OffAddr# strand i y s of { s ->
case writeWord8OffAddr# strand l z s of { s ->
reverseit iubc strand (i +# 1#) (l -# 1#) s
} } } } } }
arr ! i = peekElemOff arr (fromIntegral i)
set arr i n = pokeElemOff arr (fromIntegral i) n
index2 arr i j = arr ! i >>= (! j)
set2 arr i j n = arr ! i >>= \arr' -> set arr' j n
c2w = fromIntegral . ord
w2c = chr . fromIntegral
c2ws = unsafeCoerce#
tolower = fromIntegral . ord . toLower . chr . fromIntegral
Mutable version 1
This is a translation of the fast C entry into Haskell. It uses mutable arrays.
{-# OPTIONS -fglasgow-exts -O2 -optc-O3 #-}
--
-- Translation of C version by Don Stewart
--
import Data.Word
import Data.Char
import Control.Arrow
import Foreign.Marshal.Array
import Foreign.Storable
import Foreign.Ptr
import Control.Monad
import qualified Control.Exception as C
import System.IO
pairs :: [(Word8,Word8)]
pairs = map (c2w *** c2w) [('A','T'),('C','G'),('B','V'),('D','H'),('K','M'),('R','Y'),('\0','\0')]
c2w :: Char -> Word8
c2w = fromIntegral . ord
w2c :: Word8 -> Char
w2c = chr . fromIntegral
tolower :: Word8 -> Word8
tolower = fromIntegral . ord . toLower . chr . fromIntegral
main = do
inp <- mallocArray 129 :: IO (Ptr Word8)
buf <- mallocArray 1024 :: IO (Ptr Word8)
iubP <- sequence [ newArray [x,y] | (x,y) <- pairs ] >>= newArray
iubC <- newArray (map c2w ['\0'..'\255']++[0])
buildIubComplement iubC iubP (0 :: Int)
(slen,inp) <- go 0 128 inp buf iubC
when (slen > 0) $ process iubC inp slen
go slen mlen inp buffer iubC = do
eof <- C.catch (getLine >>= pokeArray0 0 buffer . map c2w . take 1023 >> return False)
(\_ -> return True)
if eof then return (slen,inp) else do
b0 <- buffer ! (0::Int)
if b0 == c2w '>'
then do when (slen > 0) $ process iubC inp slen
lengthArray0 0 buffer >>= hPutBuf stdout buffer >> putChar '\n'
go 0 mlen inp buffer iubC
else do l <- lengthArray0 0 buffer >>= shrink buffer . (+1)
(inp',mlen') <- tweak slen mlen l inp
copyArray (inp' `plusPtr` slen) buffer l
go (slen + l) mlen' inp' buffer iubC
inplacereverse :: Ptr Word8 -> Ptr Word8 -> Int -> IO ()
inplacereverse iubc strand len = do
(i,l) <- reverseit iubc strand 0 (len-1)
when (i == l) $ strand ! i >>= (iubc !) >>= set strand i
reverseit iubc strand i l
= if i >= l then return (i,l)
else do c <- strand ! i
strand ! l >>= (iubc !) >>= set strand i
iubc ! c >>= set strand l
reverseit iubc strand (i+1) (l-1)
process :: Ptr Word8 -> Ptr Word8 -> Int -> IO ()
process iubc strand len = do
inplacereverse iubc strand len
(s,l) <- print60 strand len
hPutBuf stdout s l >> putChar '\n'
print60 s n
| n <= 60 = return (s,n)
| otherwise = do
hPutBuf stdout s 60 >> putChar '\n'
print60 (s `advancePtr` 60) (n - 60)
tweak slen mlen l inp
| slen + l <= mlen = return (inp,mlen)
| otherwise = do
let mlen' = mlen + mlen
inp' <- reallocArray0 inp mlen'
tweak slen mlen' l inp'
shrink b l =
if l <= 0
then return l
else do bl1 <- b ! (l-1)
if not . isAlpha . w2c $ bl1
then shrink b (l-1)
else return l
buildIubComplement iubC iubP i = do
i0 <- index2 iubP i (0::Int)
when (i0 /= 0) $ do
i1 <- index2 iubP i (1::Int)
set iubC i0 i1
set iubC i1 i0
set iubC (tolower i0) i1
set iubC (tolower i1) i0
buildIubComplement iubC iubP (i+1)
arr ! i = peekElemOff arr (fromIntegral i)
set arr i n = pokeElemOff arr (fromIntegral i) n
index2 arr i j = arr ! i >>= (! j)
set2 arr i j n = arr ! i >>= \arr' -> set arr' j n
Don Stewart v1
A combination of the original entry, with an array index idea from the alex lexer. The original verbose version uses marginally less heap space, and they're both roughly the same speed. This version uses half the space of the array index version.
{-# OPTIONS -fglagsow-exts #-}
import GHC.Base
import Data.Char
main = getContents >>= process . (,,) [] []
complement (C# i) = C# (indexCharOffAddr# arr (ord# i -# 65#))
where arr = "TVGH..CD..M.KN...YSAABW.R"#
process (h,b,c@('>':_)) = write h b >> process (h',[],cs')
where (h',cs') = break (=='\n') c
process (h,b,('\n':cs)) = process (h,b,cs)
process (h,b,(c:cs)) = process (h,((complement $ toUpper c):b),cs)
process (h,b,[]) = write h b
write [] _ = return ()
write h s = putStrLn h >> write' s
where write' [] = return ()
write' t = let (a,b) = splitAt 60 t in putStrLn a >> write' b
Old Code
This is the original haskell entry
-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
-- contributed by Jeff Newbern
-- Note: This code has not been optimized *at all*. It is written to be clear
-- and concise, using standard Haskell idioms. Performance is decent with
-- ghc -O2, but if it can be improved without sacrificing the clarity of the
-- code, by all means go for it!
import Data.Char(toLower)
type Base = Char
type Sequence = [Base]
complement :: Base -> Base
complement 'A' = 'T'
complement 'a' = 'T'
complement 'C' = 'G'
complement 'c' = 'G'
complement 'G' = 'C'
complement 'g' = 'C'
complement 'T' = 'A'
complement 't' = 'A'
complement 'U' = 'A'
complement 'u' = 'A'
complement 'M' = 'K'
complement 'm' = 'K'
complement 'R' = 'Y'
complement 'r' = 'Y'
complement 'Y' = 'R'
complement 'y' = 'R'
complement 'K' = 'M'
complement 'k' = 'M'
complement 'V' = 'B'
complement 'v' = 'B'
complement 'H' = 'D'
complement 'h' = 'D'
complement 'D' = 'H'
complement 'd' = 'H'
complement 'B' = 'V'
complement 'b' = 'V'
complement x = x
-- write a sequence in Fasta format
writeFasta :: String -> Sequence -> IO ()
writeFasta [] _ = return ()
writeFasta header sequence =
do putStrLn header
writeWrapped 60 sequence
where writeWrapped _ [] = return ()
writeWrapped len str = do let (s1,s2) = splitAt len str
putStrLn s1
writeWrapped len s2
-- recurse over input stream, accumulating and writing processed sequences
process :: (String,[Base],String) -> IO()
process (header,bases,[]) = writeFasta header bases
process (header,bases,c@('>':cs)) = do writeFasta header bases
let (header',cs') = break (\c->c == '\n') c
process (header',[],cs')
process (header,bases,('\n':cs)) = process (header,bases,cs)
process (header,bases,(c:cs)) = process (header,((complement c):bases),cs)
main = do cs <- getContents
process ([],[],cs)