module Test.QuickCheck.Utf8(
    genValidUtf8
  , shrinkValidUtf8

  , utf8BS
  , shrinkUtf8BS

  , genValidUtf81
  , shrinkValidUtf81

  , utf8BS1
  , shrinkUtf8BS1

    -- * Generators for single characters
  , genChar
  , genUtf8Character
  , oneByte
  , twoByte
  , threeByte
) where

import           Control.Monad

import           Data.Binary.Builder

import           Data.ByteString     (ByteString)
import qualified Data.ByteString     as BS
import qualified Data.ByteString.Lazy     as BL
import           Data.Text           (Text)
import qualified Data.Text           as T
import           Data.Text.Encoding
import           Data.Text.Internal.Encoding.Utf8
import           Data.Word

import           Test.QuickCheck

-- |
-- Generate a possibly-empty valid UTF-8 'Text' value.
genValidUtf8 :: Gen Text
genValidUtf8 :: Gen Text
genValidUtf8 = (ByteString -> Text) -> Gen ByteString -> Gen Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ByteString -> Text
decodeUtf8 Gen ByteString
utf8BS

-- |
-- Shrink a possible-empty valid UTF-8 'Text' value.
shrinkValidUtf8 :: Text -> [Text]
shrinkValidUtf8 :: Text -> [Text]
shrinkValidUtf8 = (String -> Text) -> [String] -> [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap String -> Text
T.pack ([String] -> [Text]) -> (Text -> [String]) -> Text -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> [String]
forall a. Arbitrary a => a -> [a]
shrink (String -> [String]) -> (Text -> String) -> Text -> [String]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> String
T.unpack

-- |
-- Generate a possibly-empty sequence of bytes which represent a valid
-- UTF-8 code point.
utf8BS :: Gen ByteString
utf8BS :: Gen ByteString
utf8BS = ([ByteString] -> ByteString) -> Gen [ByteString] -> Gen ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap [ByteString] -> ByteString
BS.concat (Gen [ByteString] -> Gen ByteString)
-> (Gen ByteString -> Gen [ByteString])
-> Gen ByteString
-> Gen ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Gen ByteString -> Gen [ByteString]
forall a. Gen a -> Gen [a]
listOf (Gen ByteString -> Gen ByteString)
-> Gen ByteString -> Gen ByteString
forall a b. (a -> b) -> a -> b
$ [Gen ByteString] -> Gen ByteString
forall a. [Gen a] -> Gen a
oneof [Gen ByteString]
symbolTypes

-- |
-- Shrink a possible-empty sequence of bytes which represent a valid
-- UTF-8 code point.
shrinkUtf8BS :: ByteString -> [ByteString]
shrinkUtf8BS :: ByteString -> [ByteString]
shrinkUtf8BS = (Text -> ByteString) -> [Text] -> [ByteString]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Text -> ByteString
encodeUtf8 ([Text] -> [ByteString])
-> (ByteString -> [Text]) -> ByteString -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> [Text]
shrinkValidUtf8 (Text -> [Text]) -> (ByteString -> Text) -> ByteString -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Text
decodeUtf8

-- |
-- Like 'genValidUtf8', but does not allow empty 'Text' values.
genValidUtf81 :: Gen Text
genValidUtf81 :: Gen Text
genValidUtf81 = (ByteString -> Text) -> Gen ByteString -> Gen Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ByteString -> Text
decodeUtf8 Gen ByteString
utf8BS1

-- |
-- List 'genValidUtf8', bute does not allow empty 'Text' values.
shrinkValidUtf81 :: Text -> [Text]
shrinkValidUtf81 :: Text -> [Text]
shrinkValidUtf81 = (Text -> Bool) -> [Text] -> [Text]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (Text -> Bool) -> Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Bool
T.null) ([Text] -> [Text]) -> (Text -> [Text]) -> Text -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> [Text]
shrinkValidUtf8

-- |
-- Like 'utf8BS', but does not allow empty 'ByteString's.
utf8BS1 :: Gen ByteString
utf8BS1 :: Gen ByteString
utf8BS1 = ([ByteString] -> ByteString) -> Gen [ByteString] -> Gen ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap [ByteString] -> ByteString
BS.concat (Gen [ByteString] -> Gen ByteString)
-> (Gen ByteString -> Gen [ByteString])
-> Gen ByteString
-> Gen ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Gen ByteString -> Gen [ByteString]
forall a. Gen a -> Gen [a]
listOf1 (Gen ByteString -> Gen ByteString)
-> Gen ByteString -> Gen ByteString
forall a b. (a -> b) -> a -> b
$ [Gen ByteString] -> Gen ByteString
forall a. [Gen a] -> Gen a
oneof [Gen ByteString]
symbolTypes

-- |
-- Like 'shrinkUtf8BS', but does not allow empty 'ByteString's.
shrinkUtf8BS1 :: ByteString -> [ByteString]
shrinkUtf8BS1 :: ByteString -> [ByteString]
shrinkUtf8BS1 = (ByteString -> Bool) -> [ByteString] -> [ByteString]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (ByteString -> Bool) -> ByteString -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Bool
BS.null) ([ByteString] -> [ByteString])
-> (ByteString -> [ByteString]) -> ByteString -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [ByteString]
shrinkUtf8BS

symbolTypes :: [Gen ByteString]
symbolTypes :: [Gen ByteString]
symbolTypes = [ Gen ByteString
oneByte
              , Gen ByteString
twoByte
              , Gen ByteString
threeByte
              ]

inRange :: Int -> Int -> Gen Word8
inRange :: Int -> Int -> Gen Word8
inRange Int
lo Int
hi = (Int -> Word8) -> Gen Int -> Gen Word8
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Gen Int -> Gen Word8) -> Gen Int -> Gen Word8
forall a b. (a -> b) -> a -> b
$ [Int] -> Gen Int
forall a. [a] -> Gen a
elements [Int
lo..Int
hi]

-- | Generate a valid 'Char'. Note that this is UTF-16, not UTF-8, but
-- the intent is the same: the Arbitrary instance for 'Char' in quickcheck
-- makes no attempt to generate valid non-ASCII characters at this time.
genChar :: Gen Char
genChar :: Gen Char
genChar = (ByteString -> Char) -> Gen ByteString -> Gen Char
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Text -> Char
T.head (Text -> Char) -> (ByteString -> Text) -> ByteString -> Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Text
decodeUtf8) Gen ByteString
genUtf8Character

-- | A valid UTF-8 character, one to three bytes long.
genUtf8Character :: Gen ByteString
genUtf8Character :: Gen ByteString
genUtf8Character = [Gen ByteString] -> Gen ByteString
forall a. [Gen a] -> Gen a
oneof [
    Gen ByteString
oneByte
  , Gen ByteString
twoByte
  , Gen ByteString
threeByte
  ]

-- | Single-byte UTF-8 (i.e., a standard ASCII byte with a cleared MSB).
oneByte :: Gen ByteString
oneByte :: Gen ByteString
oneByte = (Word8 -> ByteString) -> Gen Word8 -> Gen ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ([Word8] -> ByteString
BS.pack ([Word8] -> ByteString)
-> (Word8 -> [Word8]) -> Word8 -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> [Word8]
forall (m :: * -> *) a. Monad m => a -> m a
return) (Gen Word8 -> Gen ByteString) -> Gen Word8 -> Gen ByteString
forall a b. (a -> b) -> a -> b
$
  Int -> Int -> Gen Word8
inRange Int
0 Int
127 -- 0bbbbbbb

twoByte :: Gen ByteString
twoByte :: Gen ByteString
twoByte = do
  Word8
b1 <- Int -> Int -> Gen Word8
inRange Int
0xC2 Int
0xDF -- 110bbbbb
  Word8
b2 <- Gen Word8
nonInitial
  ByteString -> Gen ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return (ByteString -> Gen ByteString)
-> (Builder -> ByteString) -> Builder -> Gen ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Builder -> ByteString
buildUtf (Builder -> Gen ByteString) -> Builder -> Gen ByteString
forall a b. (a -> b) -> a -> b
$ Word8 -> Word8 -> Builder
putBytes2 Word8
b1 Word8
b2

threeByte :: Gen ByteString
threeByte :: Gen ByteString
threeByte = do
  (Word8
b1, Word8
b2) <- [Gen (Word8, Word8)] -> Gen (Word8, Word8)
forall a. [Gen a] -> Gen a
oneof [Gen (Word8, Word8)
b3_1, Gen (Word8, Word8)
b3_2, Gen (Word8, Word8)
b3_3, Gen (Word8, Word8)
b3_4]
  Word8
b3 <- Gen Word8
nonInitial
  ByteString -> Gen ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return (ByteString -> Gen ByteString)
-> (Builder -> ByteString) -> Builder -> Gen ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Builder -> ByteString
buildUtf (Builder -> Gen ByteString) -> Builder -> Gen ByteString
forall a b. (a -> b) -> a -> b
$ Word8 -> Word8 -> Word8 -> Builder
putBytes3 Word8
b1 Word8
b2 Word8
b3
 where
  b3_1 :: Gen (Word8, Word8)
b3_1 = (,) (Word8 -> Word8 -> (Word8, Word8))
-> Gen Word8 -> Gen (Word8 -> (Word8, Word8))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` Word8 -> Gen Word8
forall (m :: * -> *) a. Monad m => a -> m a
return Word8
0xE0 Gen (Word8 -> (Word8, Word8)) -> Gen Word8 -> Gen (Word8, Word8)
forall (m :: * -> *) a b. Monad m => m (a -> b) -> m a -> m b
`ap` Int -> Int -> Gen Word8
inRange Int
0xA0 Int
0xBF

  b3_2 :: Gen (Word8, Word8)
b3_2 = (,) (Word8 -> Word8 -> (Word8, Word8))
-> Gen Word8 -> Gen (Word8 -> (Word8, Word8))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` Int -> Int -> Gen Word8
inRange Int
0xE1 Int
0xEC Gen (Word8 -> (Word8, Word8)) -> Gen Word8 -> Gen (Word8, Word8)
forall (m :: * -> *) a b. Monad m => m (a -> b) -> m a -> m b
`ap` Gen Word8
nonInitial

  b3_3 :: Gen (Word8, Word8)
b3_3 = (,) (Word8 -> Word8 -> (Word8, Word8))
-> Gen Word8 -> Gen (Word8 -> (Word8, Word8))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` Word8 -> Gen Word8
forall (m :: * -> *) a. Monad m => a -> m a
return Word8
0xED Gen (Word8 -> (Word8, Word8)) -> Gen Word8 -> Gen (Word8, Word8)
forall (m :: * -> *) a b. Monad m => m (a -> b) -> m a -> m b
`ap` Int -> Int -> Gen Word8
inRange Int
0x80 Int
0x9F

  b3_4 :: Gen (Word8, Word8)
b3_4 = (,) (Word8 -> Word8 -> (Word8, Word8))
-> Gen Word8 -> Gen (Word8 -> (Word8, Word8))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` Int -> Int -> Gen Word8
inRange Int
0xEE Int
0xEF Gen (Word8 -> (Word8, Word8)) -> Gen Word8 -> Gen (Word8, Word8)
forall (m :: * -> *) a b. Monad m => m (a -> b) -> m a -> m b
`ap` Gen Word8
nonInitial

buildUtf :: Builder -> ByteString 
buildUtf :: Builder -> ByteString
buildUtf = [ByteString] -> ByteString
BS.concat ([ByteString] -> ByteString)
-> (Builder -> [ByteString]) -> Builder -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [ByteString]
BL.toChunks (ByteString -> [ByteString])
-> (Builder -> ByteString) -> Builder -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Builder -> ByteString
toLazyByteString

putBytes2 :: Word8 -> Word8 -> Builder
putBytes2 :: Word8 -> Word8 -> Builder
putBytes2 Word8
b1 Word8
b2 =  Char -> Builder
putCharUtf8 (Char -> Builder) -> Char -> Builder
forall a b. (a -> b) -> a -> b
$ Word8 -> Word8 -> Char
chr2 Word8
b1 Word8
b2

putBytes3 :: Word8 -> Word8 -> Word8 -> Builder
putBytes3 :: Word8 -> Word8 -> Word8 -> Builder
putBytes3 Word8
b1 Word8
b2 Word8
b3 =  Char -> Builder
putCharUtf8 (Char -> Builder) -> Char -> Builder
forall a b. (a -> b) -> a -> b
$ Word8 -> Word8 -> Word8 -> Char
chr3 Word8
b1 Word8
b2 Word8
b3

nonInitial :: Gen Word8
nonInitial :: Gen Word8
nonInitial = Int -> Int -> Gen Word8
inRange Int
0x80 Int
0xBF