Shootout/Reverse complement

From HaskellWiki
Jump to navigation Jump to search

This is a ShootoutEntry for [1].

about the reverse-complement benchmark

Each program should

  • read line-by-line a redirected FASTA format file from stdin
  • for each sequence: write the id, description, and the reverse-complement sequence in FASTA format to stdout

We use the FASTA file generated by the fasta benchmark as input for this benchmark. Note: the file may include both lowercase and uppercase codes.

We use these code complements:

   code  meaning   complement
   A    A                   T
   C    C                   G
   G    G                   C
   T/U  T                   A
   M    A or C              K
   R    A or G              Y
   W    A or T              W
   S    C or G              S
   Y    C or T              R
   K    G or T              M
   V    A or C or G         B
   H    A or C or T         D
   D    A or G or T         H
   B    C or G or T         V
   N    G or A or T or C    N


"by knowing the sequence of bases of one strand of DNA we immediately know the sequence of the DNA strand which will bind to it, this strand is called the reverse complement" DNA: Structure and Function

New Code

Performance for various entries. Size of data = 5M.

Code Time in seconds for a given file
Don Stewart v1 30.85
Original entry 24.13
Mutable version 1 2.44
Mutable version 2 1.67
gcc -Onot 0.33

GHC 6.8

{-# OPTIONS -fvia-C -O2 -optc-O3 -fglasgow-exts #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- Contributed by Sterling Clover
-- Heavily inspired by contribution from Don Stewart
-- Suggested flags: -funfolding-use-threshold=32 -O2 -optc-O3
--

import qualified Data.ByteString.Char8 as S
import Data.ByteString.Internal
import Data.ByteString.Unsafe
import Foreign
import Control.Arrow
import GHC.Base
import GHC.Ptr
import GHC.IOBase
import Control.Monad (zipWithM_)

main = uncurry proc =<< clines `fmap` S.getContents

proc = zipWithM_ (\h b -> S.putStrLn h >> revcomp b >> writeFasta b)

writeFasta t
    | S.null t =return ()
    | otherwise = S.putStrLn l >> writeFasta r
    where (l,r) = S.splitAt 60 t

clines :: ByteString -> ([ByteString],[ByteString])
clines ps = clines' ps ([],[])
    where
      clines' ps accum@(f,s)
          | otherwise = case S.elemIndex '\n' ps of
                          Just n  -> clines'' (S.drop (n+1) ps) (f++[S.take n ps],s)
      clines'' ps accum@(f,s)
          | otherwise = case S.elemIndex '>' ps of
                      Nothing -> (f,s++[S.filter (/='\n') ps])
                      Just n  -> clines' (S.drop n ps) (f,s++[S.filter (/='\n') . S.take n $ ps])

comps = map (ord *** c2w) [
    ('A' , 'T'), ( 'a' , 'T'), ( 'C' , 'G'), ( 'c' , 'G'), ( 'G' , 'C'),
    ('g' , 'C'), ( 'T' , 'A'), ( 't' , 'A'), ( 'U' , 'A'), ( 'u' , 'A'),
    ('M' , 'K'), ( 'm' , 'K'), ( 'R' , 'Y'), ( 'r' , 'Y'), ( 'Y' , 'R'),
    ('y' , 'R'), ( 'K' , 'M'), ( 'k' , 'M'), ( 'V' , 'B'), ( 'v' , 'B'),
    ('H' , 'D'), ( 'h' , 'D'), ( 'D' , 'H'), ( 'd' , 'H'), ( 'B' , 'V'), ( 'b' , 'V')]

{- NOINLINE -}
ca :: Ptr Word8
ca = inlinePerformIO $ do
       a <- mallocArray 200
       mapM_ (uncurry (pokeByteOff a)) $ zip [0..199::Int] [0..199::Word8]
       mapM_ (uncurry (pokeByteOff a)) comps
       return a

comp :: Word# -> Word#
comp c = rw8 ca (word2Int# c)

revcomp (PS fp s (I# l)) = withForeignPtr fp $ \p -> rc (p `plusPtr` s) 0# (l -# 1#)
  where
    rc :: Ptr Word8 -> Int# -> Int# -> IO ()
    rc p i j  = rc' i j
        where
          rc' i j
              | i <# j = do
                          let x = rw8 p i
                          ww8 p i (comp (rw8 p j))
                          ww8 p j (comp x)
                          rc' (i +# 1#) (j -# 1#)
              | i ==# j = ww8 p i (comp (rw8 p i))
              | otherwise =  return ()

rw8 :: Ptr Word8 -> Int# -> Word#
rw8 (Ptr a) i = case readWord8OffAddr# a i realWorld#  of (# _, x #) ->  x

ww8 :: Ptr Word8 -> Int# -> Word# -> IO ()
ww8 (Ptr a) i x  = IO $ \s -> case writeWord8OffAddr# a i x s of s2 -> (# s2, () #)


Submitted

Faster still. ByteString version. Submitted

{-# OPTIONS -fbang-patterns #-}
--
-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- Contributed by Don Stewart
--

import qualified Data.ByteString.Char8 as S
import Data.ByteString.Base
import Foreign

main = do
    (s:ss) <- S.lines `fmap` S.getContents
    process s ss []

process !s xs@(~(x:xx)) acc
    | S.null s      = writeFasta acc
    | null xs       = revcomp s >> writeFasta (s:acc)
    | h == '>'      = writeFasta acc >> S.putStrLn s >> process x xx []
    | otherwise     = revcomp s >> process x xx (s:acc)
  where
    (h,t)  = uncons s

uncons s = (w2c (unsafeHead s), unsafeTail s)

comp c = c2w $ case w2c c of
    'A' -> 'T'; 'a' -> 'T'; 'C' -> 'G'; 'c' -> 'G'; 'G' -> 'C'
    'g' -> 'C'; 'T' -> 'A'; 't' -> 'A'; 'U' -> 'A'; 'u' -> 'A'
    'M' -> 'K'; 'm' -> 'K'; 'R' -> 'Y'; 'r' -> 'Y'; 'Y' -> 'R'
    'y' -> 'R'; 'K' -> 'M'; 'k' -> 'M'; 'V' -> 'B'; 'v' -> 'B'
    'H' -> 'D'; 'h' -> 'D'; 'D' -> 'H'; 'd' -> 'H'; 'B' -> 'V'; 'b' -> 'V'; x  -> x

writeFasta []     = return ()
writeFasta (t:ts) = go ts t
  where
    go [] !s
        | S.null s  = return ()
        | otherwise = S.putStrLn l >> go [] r
        where (l,r) = S.splitAt 60 s
    go ss !s
        | ln >= 60  = S.putStrLn l               >> go ss r
        | otherwise = S.putStr s >> S.putStrLn a >> go (tail ss) b
        where
            ln    = S.length s
            (l,r) = S.splitAt 60 s
            (a,b) = S.splitAt (60-ln) (head ss)

--
-- An inplace reverse. Since we have a uniqueness here, just use the FFI as an ST monad
--
revcomp (PS fp s l) = withForeignPtr fp $ \p -> rc (p `plusPtr` s) 0 (l-1)
  where
    rc :: Ptr Word8 -> Int -> Int -> IO ()
    rc !p !i !j
        | i < j = do
            x <- peekByteOff p i
            pokeByteOff p i . comp =<< peekByteOff p j
            pokeByteOff p j (comp x)
            rc p (i+1) (j-1)
        | i == j = pokeByteOff p i . comp =<< peekByteOff p i
        | otherwise = return ()

Data.ByteString

Translation of "Don Stewart v1" to use Data.ByteString. About 2x faster than Mutable version 2, on my OpenBSD x86 box.

Todo. Submit this.

{-# OPTIONS -cpp -fglasgow-exts -O2 -optc-O3 -funbox-strict-fields #-}
--
-- Reverse complement benchmark
--
-- Written by Don Stewart
--
import GHC.Base
import Data.Char
import qualified Data.ByteString as B

main = B.getContents >>= process B.empty []

process h b ps
    | h `seq` b `seq` ps `seq` False = undefined
    | B.null ps = write h b
    | x == '>'  = write h b >> process h' [] ps'
    | x == '\n' = process h b xs
    | otherwise = process h ((complement . toUpper $ x) : b) xs
    where (x,xs)   = (B.unsafeHead ps, B.unsafeTail ps)
          (h',ps') = B.breakOn '\n' ps

write h s
    | B.null h  = return ()
    | otherwise = B.putStrLn h >> write_ (B.pack s)
    where
        write_ t
            | B.null t  = return ()
            | otherwise = let (a,b) = B.splitAt 60 t in B.putStrLn a >> write_ b

complement (C# i) = C# (indexCharOffAddr# arr (ord# i -# 65#))
    where arr = "TVGH  CD  M KN   YSAABW R"#

Mutable version 2

This is the fastest Haskell entry so far. It uses mutable arrays, as well as a handcrafted inner-loop on the reverse, and doesn't waste time coercing the input chars to words (and identity, in this case, anyway).

{-# OPTIONS -fglasgow-exts -O2 -optc-O3 #-}

--
-- 
-- Translation of C version to Haskell by Don Stewart
--

import Data.Char
import Control.Arrow
import Foreign.Marshal.Array
import Foreign.Storable
import Control.Monad
import qualified Control.Exception as C
import System.IO
import GHC.IOBase
import GHC.Base
import GHC.Ptr
import GHC.Word

pairs = map (c2w *** c2w) [('A','T'),('C','G'),('B','V'),('D','H'),('K','M'),('R','Y'),('\0','\0')]

main = do
    inp  <- mallocArray 129  :: IO (Ptr Word8)
    buf  <- mallocArray 1024 :: IO (Ptr Word8)
    iubP <- sequence [ newArray [x,y] | (x,y) <- pairs ] >>= newArray
    iubC <- newArray (map c2w ['\0'..'\255']++[0])
    buildIubComplement iubC iubP (0 :: Int)
    (slen,inp) <- go 0 128 inp buf iubC
    when (slen > 0) $ process iubC inp slen

go slen mlen inp buffer iubC = do
    eof <- C.catch (getLine >>= pokeArray0 0 buffer . c2ws . take 1023 >> return False)
                   (\_ -> return True)
    if eof then return (slen,inp) else do
        b0 <- buffer ! (0::Int)
        if b0 == c2w '>' 
            then do when (slen > 0) $ process iubC inp slen
                    lengthArray0 0 buffer >>= hPutBuf stdout buffer >> putChar '\n'
                    go 0 mlen inp buffer iubC

            else do l <- lengthArray0 0 buffer >>= shrink buffer . (+1)
                    (inp',mlen') <- tweak slen mlen l inp
                    copyArray (inp' `plusPtr` slen) buffer l
                    go (slen + l) mlen' inp' buffer iubC

process iubc strand len = do
    inplacereverse iubc strand len
    (s,l) <- print60 strand len
    hPutBuf stdout s l >> putChar '\n'

print60 s n 
    | n <= 60 = return (s,n)
    | otherwise = do
        hPutBuf stdout s 60 >> putChar '\n'
        print60 (s `advancePtr` 60) (n - 60)

tweak slen mlen l inp 
    | slen + l <= mlen = return (inp,mlen)
    | otherwise        = do 
        let mlen' = mlen + mlen
        inp' <- reallocArray0 inp mlen'
        tweak slen mlen' l inp'

shrink b l | l <= 0  = return l
           | otherwise = do
                bl1 <- b ! (l-1)
                if not . isAlpha . w2c $ bl1 then shrink b (l-1) else return l
    
buildIubComplement iubC iubP i = do
    i0 <- index2 iubP i (0::Int)
    when (i0 /= 0) $ do
        i1 <- index2 iubP i (1::Int) 
        set iubC i0 i1
        set iubC i1 i0
        set iubC (tolower i0) i1
        set iubC (tolower i1) i0
        buildIubComplement iubC iubP (i+1)

inplacereverse iubc@(Ptr r) strand@(Ptr s) len@(I# ln) = do 
    (i,l) <- IO $ reverseit r s 0# (ln -# 1#)
    when (i == l) $ strand ! i >>= (iubc !) >>= set strand i

reverseit iubc strand i l s =
    if i >=# l 
        then (# s, (I# i, I# l) #)
        else case readWord8OffAddr# strand i s  of { (# s, c #) -> 
             case readWord8OffAddr# strand l s  of { (# s, x #) -> 
             case readWord8OffAddr# iubc   (word2Int# x) s  of { (# s, y #) -> 
             case readWord8OffAddr# iubc   (word2Int# c) s  of { (# s, z #) ->
             case writeWord8OffAddr# strand i y s of { s ->
             case writeWord8OffAddr# strand l z s of { s ->
             reverseit iubc strand (i +# 1#) (l -# 1#) s
        } } } } } }
    
arr ! i     = peekElemOff arr (fromIntegral i)
set arr i n = pokeElemOff arr (fromIntegral i) n

index2 arr i j = arr ! i >>= (! j)
set2 arr i j n = arr ! i >>= \arr' -> set arr' j n

c2w     = fromIntegral . ord
w2c     = chr . fromIntegral
c2ws    = unsafeCoerce#
tolower = fromIntegral . ord . toLower . chr . fromIntegral


Mutable version 1

This is a translation of the fast C entry into Haskell. It uses mutable arrays.

{-# OPTIONS -fglasgow-exts -O2 -optc-O3 #-}

--
-- Translation of C version by Don Stewart
--

import Data.Word
import Data.Char
import Control.Arrow
import Foreign.Marshal.Array
import Foreign.Storable
import Foreign.Ptr
import Control.Monad
import qualified Control.Exception as C
import System.IO

pairs :: [(Word8,Word8)]
pairs = map (c2w *** c2w) [('A','T'),('C','G'),('B','V'),('D','H'),('K','M'),('R','Y'),('\0','\0')]

c2w :: Char -> Word8
c2w = fromIntegral . ord

w2c :: Word8 -> Char
w2c = chr . fromIntegral

tolower :: Word8 -> Word8
tolower = fromIntegral . ord . toLower . chr . fromIntegral

main = do
    inp  <- mallocArray 129  :: IO (Ptr Word8)
    buf  <- mallocArray 1024 :: IO (Ptr Word8)
    iubP <- sequence [ newArray [x,y] | (x,y) <- pairs ] >>= newArray
    iubC <- newArray (map c2w ['\0'..'\255']++[0])
    buildIubComplement iubC iubP (0 :: Int)
    (slen,inp) <- go 0 128 inp buf iubC
    when (slen > 0) $ process iubC inp slen

go slen mlen inp buffer iubC = do
    eof <- C.catch (getLine >>= pokeArray0 0 buffer . map c2w . take 1023 >> return False) 
                   (\_ -> return True)
    if eof then return (slen,inp) else do
        b0 <- buffer ! (0::Int)
        if b0 == c2w '>' 
            then do when (slen > 0) $ process iubC inp slen
                    lengthArray0 0 buffer >>= hPutBuf stdout buffer >> putChar '\n'
                    go 0 mlen inp buffer iubC

            else do l <- lengthArray0 0 buffer >>= shrink buffer . (+1)
                    (inp',mlen') <- tweak slen mlen l inp
                    copyArray (inp' `plusPtr` slen) buffer l
                    go (slen + l) mlen' inp' buffer iubC

inplacereverse :: Ptr Word8 -> Ptr Word8 -> Int -> IO ()
inplacereverse iubc strand len = do 
    (i,l) <- reverseit iubc strand 0 (len-1)
    when (i == l) $ strand ! i >>= (iubc !) >>= set strand i

reverseit iubc strand i l 
    = if i >= l then return (i,l)
                else do c <- strand ! i
                        strand ! l >>= (iubc !) >>= set strand i
                        iubc ! c >>= set strand l
                        reverseit iubc strand (i+1) (l-1)

process :: Ptr Word8 -> Ptr Word8 -> Int -> IO ()
process iubc strand len = do
    inplacereverse iubc strand len
    (s,l) <- print60 strand len
    hPutBuf stdout s l >> putChar '\n'

print60 s n 
    | n <= 60 = return (s,n)
    | otherwise = do
        hPutBuf stdout s 60 >> putChar '\n'
        print60 (s `advancePtr` 60) (n - 60)

tweak slen mlen l inp 
    | slen + l <= mlen = return (inp,mlen)
    | otherwise        = do 
        let mlen' = mlen + mlen
        inp' <- reallocArray0 inp mlen'
        tweak slen mlen' l inp'

shrink b l =
    if l <= 0 
        then return l
        else do bl1 <- b ! (l-1)
                if not . isAlpha . w2c $ bl1
                    then shrink b (l-1) 
                    else return l
    
buildIubComplement iubC iubP i = do
    i0 <- index2 iubP i (0::Int)
    when (i0 /= 0) $ do
        i1 <- index2 iubP i (1::Int) 
        set iubC i0 i1
        set iubC i1 i0
        set iubC (tolower i0) i1
        set iubC (tolower i1) i0
        buildIubComplement iubC iubP (i+1)
    
arr ! i     = peekElemOff arr (fromIntegral i)
set arr i n = pokeElemOff arr (fromIntegral i) n

index2 arr i j = arr ! i >>= (! j)

set2 arr i j n = arr ! i >>= \arr' -> set arr' j n

Don Stewart v1

A combination of the original entry, with an array index idea from the alex lexer. The original verbose version uses marginally less heap space, and they're both roughly the same speed. This version uses half the space of the array index version.

{-# OPTIONS -fglagsow-exts #-}
import GHC.Base
import Data.Char

main = getContents >>= process . (,,) [] []

complement (C# i) = C# (indexCharOffAddr# arr (ord# i -# 65#))
    where arr = "TVGH..CD..M.KN...YSAABW.R"#

process (h,b,c@('>':_)) = write h b >> process (h',[],cs')
    where (h',cs') = break (=='\n') c
process (h,b,('\n':cs)) = process (h,b,cs)
process (h,b,(c:cs))    = process (h,((complement $ toUpper c):b),cs)
process (h,b,[])        = write h b

write [] _ = return ()
write h  s = putStrLn h >> write' s
  where write' [] = return ()
        write' t  = let (a,b) = splitAt 60 t in putStrLn a >> write' b


Old Code

This is the original haskell entry

-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
-- contributed by Jeff Newbern

-- Note: This code has not been optimized *at all*.  It is written to be clear
-- and concise, using standard Haskell idioms.  Performance is decent with
-- ghc -O2, but if it can be improved without sacrificing the clarity of the
-- code, by all means go for it!

import Data.Char(toLower)

type Base = Char
type Sequence = [Base]

complement :: Base -> Base
complement 'A' = 'T'
complement 'a' = 'T'
complement 'C' = 'G'
complement 'c' = 'G'
complement 'G' = 'C'
complement 'g' = 'C'
complement 'T' = 'A'
complement 't' = 'A'
complement 'U' = 'A'
complement 'u' = 'A'
complement 'M' = 'K'
complement 'm' = 'K'
complement 'R' = 'Y'
complement 'r' = 'Y'
complement 'Y' = 'R'
complement 'y' = 'R'
complement 'K' = 'M'
complement 'k' = 'M'
complement 'V' = 'B'
complement 'v' = 'B'
complement 'H' = 'D'
complement 'h' = 'D'
complement 'D' = 'H'
complement 'd' = 'H'
complement 'B' = 'V'
complement 'b' = 'V'
complement  x  = x

-- write a sequence in Fasta format
writeFasta :: String -> Sequence -> IO ()
writeFasta []     _        = return ()
writeFasta header sequence =
  do putStrLn header
     writeWrapped 60 sequence
  where writeWrapped _   []  = return ()
        writeWrapped len str = do let (s1,s2) = splitAt len str
                                  putStrLn s1
                                  writeWrapped len s2

-- recurse over input stream, accumulating and writing processed sequences
process :: (String,[Base],String) -> IO()
process (header,bases,[])         = writeFasta header bases
process (header,bases,c@('>':cs)) = do writeFasta header bases
                                       let (header',cs') = break (\c->c == '\n') c
                                       process (header',[],cs')
process (header,bases,('\n':cs))  = process (header,bases,cs)
process (header,bases,(c:cs))     = process (header,((complement c):bases),cs)

main = do cs <- getContents
          process ([],[],cs)