A guided tour through the bytestring library

What this talk covers

talk repository: sources and pandoc (what a pleasure to create slides™) config

Purpose of the bytestring library

Provide datastructures for computing efficiently, both in time and space, with finite sequences and infinite streams of bytes.

Why not [Word8]?

See below for the actual memory representation of [1,2] :: [Word8]. Each box represents one machine word.

  +----------+-----+-----+      +----------+-----+-----+      +---------+
  | (:)-Info | Ptr | Ptr | ---> | (:)-Info | Ptr | Ptr | ---> | []-Info |
  +----------+-----+-----+      +----------+-----+-----+      +---------+
               |                            |
               |                            |
               v                            v
         +-------------+---+          +-------------+---+
         | Word8#-Info | 1 |          | Word8#-Info | 2 |
         +-------------+---+          +-------------+---+

For more information on the heap layout of the GHC RTS see the GHC commentary or the paper on pointer-tagging by Simmon Marlow, Alexey Rodriguez Yakushev, and Simon Peyton Jones. Johan Tibell also posted a good overview of the memory overhead of common Haskell types.

Datastructures supported by the bytestring library

Strict bytestrings

-- A typical bytestring is of the form
exampleBS = PS (ForeignPtr addr (PlainPtr array) len off

-- Copied from "bytestring:Data.ByteString.Internal":
data ByteString = PS {-# UNPACK #-} !(ForeignPtr Word8) -- payload
{-# UNPACK #-} !Int -- offset
{-# UNPACK #-} !Int -- length
-- Copied from "base:GHC.ForeignPtr":
data ForeignPtr a = ForeignPtr Addr# ForeignPtrContents

data Finalizers
= NoFinalizers
| CFinalizers
| HaskellFinalizers
deriving Eq

data ForeignPtrContents
= PlainForeignPtr !(IORef (Finalizers, [IO ()]))
| MallocPtr (MutableByteArray# RealWorld) !(IORef (Finalizers, [IO ()]))
| PlainPtr (MutableByteArray# RealWorld)

Slicing support

-- Assume the following qualified import
import qualified Data.ByteString as S

take :: Int -> S.ByteString -> S.ByteString
take n ps@(PS fp off len)
| n <= 0 = empty
| n >= len = ps
| otherwise = PS fp off n
{-# INLINE take #-}

Implementation tricks

dropWhile :: (Word8 -> Bool) -> ByteString -> ByteString
dropWhile f ps = unsafeDrop (findIndexOrEnd (not . f) ps) ps
{-# INLINE dropWhile #-}

-- | 'findIndexOrEnd' is a variant of findIndex, that returns the length
-- of the string if no element is found, rather than Nothing.
findIndexOrEnd :: (Word8 -> Bool) -> ByteString -> Int
findIndexOrEnd k (PS x s l) =
inlinePerformIO $ withForeignPtr x $ \f -> go (f `plusPtr` s) 0
where
STRICT2(go)
go ptr n | n >= l = return l
| otherwise = do w <- peek ptr
if k w
then return n
else go (ptr `plusPtr` 1) (n+1)
{-# INLINE findIndexOrEnd #-}

unsafeDrop :: Int -> ByteString -> ByteString
unsafeDrop n (PS x s l) = assert (0 <= n && n <= l) $ PS x (s+n) (l-n)
{-# INLINE unsafeDrop #-}

inlinePerformIO :: IO a -> a
inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
{-# INLINE inlinePerformIO #-}

Creating strict bytestrings

-- | /O(1)/ The empty 'ByteString'
empty :: ByteString
empty = PS nullForeignPtr 0 0

-- | /O(1)/ Convert a 'Word8' into a 'ByteString'
singleton :: Word8 -> ByteString
singleton c = unsafeCreate 1 $ \p -> poke p c

replicate :: Int -> Word8 -> ByteString
replicate w c
| w <= 0 = empty
| otherwise = unsafeCreate w $ \ptr ->
memset ptr c (fromIntegral w) >> return ()

append :: ByteString -> ByteString -> ByteString
append (PS _ _ 0) b = b
append a (PS _ _ 0) = a
append (PS fp1 off1 len1) (PS fp2 off2 len2) =
unsafeCreate (len1+len2) $ \destptr1 -> do
let destptr2 = destptr1 `plusPtr` len1
withForeignPtr fp1 $ \p1 -> memcpy destptr1 (p1 `plusPtr` off1) len1
withForeignPtr fp2 $ \p2 -> memcpy destptr2 (p2 `plusPtr` off2) len2

Performance stumbling blocks

pack :: [Word8] -> ByteString
pack ws = unsafePackLenBytes (List.length ws) ws

unsafePackLenBytes :: Int -> [Word8] -> ByteString
unsafePackLenBytes len xs0 =
unsafeCreate len $ \p -> go p xs0
where
go !_ [] = return ()
go !p (x:xs) = poke p x >> go (p `plusPtr` 1) xs

Strict bytestrings summary

Lazy bytestrings

Just a lazy list of strict bytestrings.

data ByteString = 
Empty
| Chunk {-# UNPACK #-} !S.ByteString ByteString

Interlude: difference lists

A canonical use case for difference lists

-- strictness annotations
{-# LANGUAGE BangPatterns #-}

-- and a bunch of imports for our later benchmarks
import Data.Monoid (Monoid(..))
import Criterion.Main (defaultMain, nf, bgroup, bench)

A canonical use case for difference lists: in-order traversal of trees

data Tree a = Leaf | Node a (Tree a) (Tree a)
deriving( Eq, Ord, Show )

fullTree :: Int -> Tree Int
fullTree n
| n <= 0 = Leaf
| otherwise = Node n t' t'
where
t' = fullTree (n - 1)

-- This version does not scale linearly with the number of nodes
-- in the tree, as (++) is required to traverse some nodes repeatedly.
inorder :: Tree a -> [a]
inorder Leaf = []
inorder (Node x l r) = inorder l ++ ([x] ++ inorder r)

-- This version scales linearly, but is hard to read.
inorder' :: Tree a -> [a]
inorder' t =
go t []
where
go Leaf rest = rest
go (Node x l r) rest = go l (x : go r rest)

Recall the definition of (++)

(++) :: [a] -> [a] -> [a]
(++) [] ys = ys
(++) (x:xs) ys = x : xs ++ ys

In-order traversal using difference lists

A DList is a function that, given the desired rest of the list, returns the complete list.

newtype DList a = DList { runDList :: [a] -> [a] }

singleton :: a -> DList a
singleton x = DList (x:)

toList :: DList a -> [a]
toList dl = runDList dl []

-- The standard API for the empty `DList` and concatenation
instance Monoid (DList a) where
mempty = DList id
dl1 `mappend` dl2 = DList (runDList dl1 . runDList dl2)

-- This operator should really make it into base.
(<>) :: Monoid m => m -> m -> m
(<>) = mappend

-- much easier to read than `inorder'`
inorderDL :: Tree a -> [a]
inorderDL =
toList . go
where
go Leaf = mempty
go (Node x l r) = go l <> singleton x <> go r

Note that Semigroups are even simpler than Monoids: they provide only the “append” operation. Interestingly, this suffices for many tasks.

Benchmarking using criterion

In GHCi, the inorderDL version is the slowest, but benchmarks are worth nothing without turning optimizations on ;-)

The talk is a literate Haskell file, cabalized, and ready to run the following criterion benchmarks.

main :: IO ()
main = defaultMain $
[ bgroup "inorder" $ concatMap (\mkBench -> map mkBench depths) $
[ \n -> bench ("inorder" ++ show n) $ nf inorder $ (fullTree n)
, \n -> bench ("inorder'" ++ show n) $ nf inorder' $ (fullTree n)
, \n -> bench ("inorderDL" ++ show n) $ nf inorderDL $ (fullTree n)
]
, bgroup "size" $ concatMap (\mkBench -> map mkBench depths) $
[ \n -> bench ("size" ++ show n) $ nf size $ (fullTree n)
, \n -> bench ("size'" ++ show n) $ nf size' $ (fullTree n)
]
]
where
depths = [10..13]

Side note: removing lazyness

Note that a similar transform as for inorder' also speeds up the size computation of Trees. However, here we gain “only” a constant factor 2 speedup. It is due to the removing the unnecessary laziness in the size computation.

size :: Tree a -> Int
size Leaf = 0
size (Node _ l r) = 1 + size l + size r

-- this version is a factor 2x faster than `size`
size' :: Tree a -> Int
size' t0 =
go t0 0
where
go Leaf !s = s
go (Node _ l r) !s = go l (go r (s + 1))

The lazy bytestring builder

The types underlying the lazy bytestring builder

data BufferRange = BufferRange 
{-# UNPACK #-} !(Ptr Word8) -- First byte of range
{-# UNPACK #-} !(Ptr Word8) -- First byte /after/ range

-- a lazy bytestring difference-list annotated with its size
data SizedChunks =
SizedChunks {-# UNPACK #-} !Int64 (L.ByteString -> L.ByteString)

-- a buffer-range-filling function
type BuildStep a = BufferRange -> IO (BuildSignal a)

-- the result of filling a buffer-range
data BuildSignal a =
Done
{-# UNPACK #-} !(Ptr Word8) -- pointer to first byte after data
a -- value computed (for Put monad)

| BufferFull -- signal a full buffer
{-# UNPACK #-} !Int -- minimal size for next buffer
{-# UNPACK #-} !(Ptr Word8) -- pointer to first byte after data
!(BuildStep a) -- next buildstep to call

| InsertChunks -- transfer control over chunk generation
{-# UNPACK #-} !(Ptr Word8) -- pointer to first byte after data
{-# UNPACK #-} !SizedChunks -- produced chunks
!(Maybe Buffer) -- partial buffer to be filled further
!(BuildStep a) -- next buildstep to call

-- the lazy bytestring builder: a difference-list of buffer filling functions
newtype Builder = Builder (forall r. BuildStep r -> BuildStep r)

-- the Put monad for computing a value while filling a buffer
-- (e.g., a checksum over the generated chunks or an error report)
newtype Put a = Put { unPut :: forall r. (a -> BuildStep r) -> BuildStep r }

How to construct primitive lazy bytestring builders

An example of a bounded-size encoding: int8Dec

-- | Decimal encoding of an 'Int8'.
{-# INLINE int8Dec #-}
int8Dec :: BoundedEncoding Int8
int8Dec = boundedEncoding 4 $ c_int_dec . fromIntegral

-- | An encoding that always results in sequence of bytes that is no longer
-- than a pre-determined bound.
data BoundedEncoding a = BE {-# UNPACK #-} !Int (a -> Ptr Word8 -> IO (Ptr Word8))

-- | Encode a value with a 'BoundedEncoding'.
{-# INLINE[1] encodeWithB #-}
encodeWithB :: BoundedEncoding a -> (a -> Builder)
encodeWithB (BE bound io) x =
-- It is important to avoid recursive 'BuildStep's where possible, as
-- their closure allocation is expensive. Using 'ensureFree' allows the
-- 'step' to assume that at least 'sizeBound w' free space is available.
ensureFree (bound) `mappend` builder step
where
step k (BufferRange op ope) = do
op' <- io x op
let !br' = BufferRange op' ope
k br'

-- | Ensure that there are at least 'n' free bytes for the following 'Builder'.
{-# INLINE ensureFree #-}
ensureFree :: Int -> Builder
ensureFree minFree =
builder step
where
step k br@(BufferRange op ope)
| ope `minusPtr` op < minFree = return $ bufferFull minFree op k
| otherwise = k br

-- fast decimal encoding of `CInt`s
foreign import ccall unsafe "static _hs_bytestring_int_dec" c_int_dec
:: CInt -> Ptr Word8 -> IO (Ptr Word8)

HTML escaping fused with UTF–8 encoding

import qualified Data.ByteString.Lazy.Builder.BasicEncoding  as E
import Data.ByteString.Lazy.Builder.BasicEncoding
( ifB, fromF, (>*<), (>$<) )

{-# INLINE charUtf8HtmlEscaped #-}
charUtf8HtmlEscaped :: E.BoundedEncoding Char
charUtf8HtmlEscaped =
ifB (> '>' ) E.charUtf8 $ -- '>' is the largest escaped 'Char'
ifB (== '<' ) (fixed4 ('&',('l',('t',';')))) $ -- &lt;
ifB (== '>' ) (fixed4 ('&',('g',('t',';')))) $ -- &gt;
ifB (== '&' ) (fixed5 ('&',('a',('m',('p',';'))))) $ -- &amp;
ifB (== '"' ) (fixed5 ('&',('#',('3',('4',';'))))) $ -- &#34;
(fromF E.char7) -- fallback for remaining 'Char's
where
{-# INLINE fixed4 #-}
fixed4 x = fromF $ const x >$<
E.char7 >*< E.char7 >*< E.char7 >*< E.char7

{-# INLINE fixed5 #-}
fixed5 x = fromF $ const x >$<
E.char7 >*< E.char7 >*< E.char7 >*< E.char7 >*< E.char7

Some missing pieces in our library infrastructure

Conclusions

Enjoy Haskell: enabling the construction of pure and efficient API’s is just beautiful ☺

Thanks…

…for listening.

Control.Monad.Has.Questions> ?