Serialisation and compression with Data Binary

From HaskellWiki
Revision as of 14:16, 29 November 2007 by Manu3000 (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

An example showing how to *efficiently* serialise data, compress it, and pass it to C, using Data.Binary and the zlib binding.

{-# OPTIONS -fglasgow-exts #-}

{-
An example showing how to:

    * Use the FFI
    * Compress streaming data
    * Serialise to and from disk
    * Stream lazy bytestrings efficiently

We will use
    * Foreign.* to generate the data
    * Wrap it as a lazy bytestring
    * Data.Binary to serialise it
    * Code.Compression.Gzip to compress/uncompress
    * Pass it to C and make a simple FFI call on the result
    * Display the result

Running:
    $ ghc -O2 A.hs --make

    $ time ./A                
    Built table
    Compressed      25600000 bytes
    Compressed size  2231545 bytes (91.28%)
    Decompressed    25600000 bytes
    Calling into C ...
    -8.742278e-8
    -0.6865875
    -0.7207948
    -0.1401903
    0.63918984
    0.7437966
    0.27236375
    -0.5763547
    -0.75708854
    -0.39026973
    ./A  2.98s user 0.11s system 94% cpu 3.275 total
-}

-- 
-- Some imports
--
import Foreign
import Foreign.C.Types
import Data.Int

import qualified Data.ByteString.Lazy     as L
import qualified Data.ByteString.Internal as S
import qualified Data.ByteString          as S

import Data.Binary
import Codec.Compression.GZip

import System.IO
import Text.Printf
import Control.Monad

------------------------------------------------------------------------
-- Foreign Ptrs
--
-- A simple wrapper type
--
data Table = Table { floats :: ForeignPtr CFloat
                   , ints   :: ForeignPtr Int    }

-- Statically fixed sizes
floatSize = 4800000
intSize   = 1600000

totalBytes = sizeOf (undefined :: CFloat) * floatSize
           + sizeOf (undefined :: Int)    * intSize

--
-- Build a table populated with some defaults
-- Float table filled with 'pi' , ints numbered consecutively
--
newTable :: IO Table
newTable = do
    fp <- S.mallocByteString (floatSize * sizeOf (undefined :: CFloat))
    ip <- S.mallocByteString (intSize   * sizeOf (undefined :: Int   ))
    withForeignPtr fp $ \p ->
        forM_ [0..floatSize-1] $ \n ->
            pokeElemOff p n pi
    withForeignPtr ip $ \p ->
        forM_ [0..intSize-1]   $ \n ->
            pokeElemOff p n n
    return (Table fp ip)

------------------------------------------------------------------------
-- Lazy ByteStrings
--
-- Convert ForeignPtr a to and from a lazy ByteString
--
toByteString   :: Storable a => ForeignPtr a -> Int -> L.ByteString
toByteString (fp :: ForeignPtr a) n =
    L.fromChunks . (:[]) $ S.fromForeignPtr (castForeignPtr fp) 0 
                                            (n * sizeOf (undefined :: a))

--
-- Flatten a lazy bytestring back to a ForeignPtr.
--
fromByteString :: Storable a => L.ByteString -> ForeignPtr a
fromByteString lbs = castForeignPtr fp
   where (fp,_,n) = S.toForeignPtr . S.concat $ L.toChunks lbs

------------------------------------------------------------------------
-- GZip and Data.Binary
--
-- Serialise a Table, compressing with gzip it as we go:
--
instance Binary Table where
    put (Table f i) = do
        put . compress . toByteString f $ floatSize
        put . compress . toByteString i $ intSize

    get = do
        fs <- liftM decompress get
        is <- liftM decompress get

        -- check we read the correct amount:
        if L.length fs + L.length is == fromIntegral totalBytes
            then return $ Table (fromByteString fs) (fromByteString is)
            else error "Partial read"

------------------------------------------------------------------------
-- FFI
--
-- Example call to process the data using C functions.
--
rounded :: Int -> ForeignPtr CFloat -> IO [CFloat]
rounded l fp = withForeignPtr fp $ \p -> go p
    where
        go p = forM [0..l-1] $ \n -> do
                    v <- peekElemOff p n
                    return $ c_tanhf (c_sinf (v + fromIntegral n))

-- A random C function to use:    
foreign import ccall unsafe "math.h sinf"  c_sinf  :: CFloat -> CFloat
foreign import ccall unsafe "math.h tanhf" c_tanhf :: CFloat -> CFloat


------------------------------------------------------------------------
--
-- Now glue it all together
-- 
main = do
    table <- newTable
    putStrLn "Built table"

    -- write the data to disk, compressed with gzip as we go.
    encodeFile "/tmp/table.gz" table
    printf "Compressed      %d bytes\n" totalBytes

    -- how good was the compression?
    h <- openFile "/tmp/table.gz" ReadMode
    n <- hFileSize h
    hClose h
    printf "Compressed size  %d bytes (%0.2f%%)\n" n
                (100 - (fromIntegral n/fromIntegral totalBytes*100) :: Double)

    -- load it back in, decompressing on the fly
    table' <- decodeFile "/tmp/table.gz"
    printf "Decompressed    %d bytes\n" totalBytes

    -- now process the floats with C
    printf "Calling into C ...\n"
    ps <- rounded 10 (floats table')
    forM_ ps print