Skip to content

Commit

Permalink
Statistically verify result (#288)
Browse files Browse the repository at this point in the history
This PR adds ability to be able to statistically verify that a result is correct. It does not _yet_ add the ability to cancel running tests if they can never reach the specified coverage.
  • Loading branch information
felixmulder authored and jacobstanley committed Oct 10, 2019
1 parent 8b02b09 commit 05a99d2
Show file tree
Hide file tree
Showing 9 changed files with 301 additions and 34 deletions.
1 change: 1 addition & 0 deletions hedgehog-example/hedgehog-example.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ library

exposed-modules:
Test.Example.Basic
, Test.Example.Confidence
, Test.Example.Coverage
, Test.Example.Exception
, Test.Example.List
Expand Down
25 changes: 25 additions & 0 deletions hedgehog-example/src/Test/Example/Confidence.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
module Test.Example.Confidence where

import Hedgehog
import qualified Hedgehog.Range as Range
import qualified Hedgehog.Internal.Gen as Gen

------------------------------------------------------------------------
-- Example 0: This test will certify that it is impossible to get 60%
-- coverage for the property label "number == 1"
--
-- Note that it will abort running once it knows its task is
-- impossible - it will not run 1000000 tests
--
prop_without_confidence :: Property
prop_without_confidence =
verifiedTermination . withConfidence (10^9) . withTests 1000000 . property $ do
number <- forAll (Gen.int $ Range.constant 1 2)
cover 60 "number == 1" $ number == 1

------------------------------------------------------------------------
tests :: IO Bool
tests =
checkSequential $$(discover)
20 changes: 11 additions & 9 deletions hedgehog-example/test/test.hs
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import System.IO (BufferMode(..), hSetBuffering, stdout, stderr)

import qualified Test.Example.Basic as Test.Example.Basic
import qualified Test.Example.Coverage as Test.Example.Coverage
import qualified Test.Example.Exception as Test.Example.Exception
import qualified Test.Example.QuickCheck as Test.Example.QuickCheck
import qualified Test.Example.References as Test.Example.References
import qualified Test.Example.Registry as Test.Example.Registry
import qualified Test.Example.Resource as Test.Example.Resource
import qualified Test.Example.Roundtrip as Test.Example.Roundtrip
import qualified Test.Example.STLC as Test.Example.STLC
import qualified Test.Example.Basic
import qualified Test.Example.Confidence
import qualified Test.Example.Coverage
import qualified Test.Example.Exception
import qualified Test.Example.QuickCheck
import qualified Test.Example.References
import qualified Test.Example.Registry
import qualified Test.Example.Resource
import qualified Test.Example.Roundtrip
import qualified Test.Example.STLC

main :: IO ()
main = do
Expand All @@ -17,6 +18,7 @@ main = do

_results <- sequence [
Test.Example.Basic.tests
, Test.Example.Confidence.tests
, Test.Example.Coverage.tests
, Test.Example.Exception.tests
, Test.Example.QuickCheck.tests
Expand Down
2 changes: 2 additions & 0 deletions hedgehog/hedgehog.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ library
, concurrent-output >= 1.7 && < 1.11
, containers >= 0.4 && < 0.7
, directory >= 1.2 && < 1.4
, erf >= 2.0 && < 2.1
, exceptions >= 0.7 && < 0.11
, fail >= 4.9 && < 5
, lifted-async >= 0.7 && < 0.11
Expand Down Expand Up @@ -126,6 +127,7 @@ test-suite test

other-modules:
Test.Hedgehog.Applicative
Test.Hedgehog.Confidence
Test.Hedgehog.Filter
Test.Hedgehog.Seed
Test.Hedgehog.Text
Expand Down
5 changes: 5 additions & 0 deletions hedgehog/src/Hedgehog.hs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ module Hedgehog (
, checkParallel
, checkSequential

, Confidence
, verifiedTermination
, withConfidence

, withTests
, TestLimit

Expand Down Expand Up @@ -167,6 +171,7 @@ import Hedgehog.Internal.Property (forAll, forAllWith)
import Hedgehog.Internal.Property (LabelName, MonadTest(..))
import Hedgehog.Internal.Property (Property, PropertyT, PropertyName)
import Hedgehog.Internal.Property (Group(..), GroupName)
import Hedgehog.Internal.Property (Confidence, verifiedTermination, withConfidence)
import Hedgehog.Internal.Property (ShrinkLimit, withShrinks)
import Hedgehog.Internal.Property (ShrinkRetries, withRetries)
import Hedgehog.Internal.Property (Test, TestT, property, test)
Expand Down
153 changes: 146 additions & 7 deletions hedgehog/src/Hedgehog/Internal/Property.hs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TemplateHaskell #-}
Expand Down Expand Up @@ -41,6 +42,7 @@ module Hedgehog.Internal.Property (
, forAllT
, forAllWith
, forAllWithT
, defaultMinTests
, discard

-- * Group
Expand Down Expand Up @@ -92,6 +94,15 @@ module Hedgehog.Internal.Property (
, CoverPercentage(..)
, toCoverCount

-- * Confidence
, Confidence(..)
, TerminationCriteria(..)
, confidenceSuccess
, confidenceFailure
, withConfidence
, verifiedTermination
, defaultConfidence

-- * Internal
-- $internal
, defaultConfig
Expand All @@ -105,6 +116,8 @@ module Hedgehog.Internal.Property (
, mkTestT
, runTest
, runTestT

, wilsonBounds
) where

import Control.Applicative (Alternative(..))
Expand Down Expand Up @@ -139,11 +152,14 @@ import qualified Control.Monad.Trans.Writer.Strict as Strict

import qualified Data.Char as Char
import Data.Functor.Identity (Identity(..))
import Data.Int (Int64)
import Data.Map (Map)
import qualified Data.Map.Strict as Map
import Data.Number.Erf (invnormcdf)
import qualified Data.List as List
import Data.Semigroup (Semigroup(..))
import Data.String (IsString)
import Data.Ratio ((%))
import Data.Typeable (typeOf)

import Hedgehog.Internal.Distributive
Expand Down Expand Up @@ -221,14 +237,23 @@ newtype PropertyName =
unPropertyName :: String
} deriving (Eq, Ord, Show, IsString, Semigroup, Lift)

-- | The acceptable occurrence of false positives
--
-- Example, @Confidence 10^9@ would mean that you'd accept a false positive
-- for 1 in 10^9 tests.
newtype Confidence =
Confidence {
unConfidence :: Int64
} deriving (Eq, Ord, Show, Num, Lift)

-- | Configuration for a property test.
--
data PropertyConfig =
PropertyConfig {
propertyTestLimit :: !TestLimit
, propertyDiscardLimit :: !DiscardLimit
propertyDiscardLimit :: !DiscardLimit
, propertyShrinkLimit :: !ShrinkLimit
, propertyShrinkRetries :: !ShrinkRetries
, propertyTerminationCriteria :: !TerminationCriteria
} deriving (Eq, Ord, Show, Lift)

-- | The number of successful tests that need to be run before a property test
Expand Down Expand Up @@ -333,6 +358,12 @@ newtype PropertyCount =
PropertyCount Int
deriving (Eq, Ord, Show, Num, Enum, Real, Integral)

data TerminationCriteria =
EarlyTermination Confidence TestLimit
| NoEarlyTermination Confidence TestLimit
| NoConfidenceTermination TestLimit
deriving (Eq, Ord, Show, Lift)

--
-- FIXME This whole Log/Failure thing could be a lot more structured to allow
-- FIXME for richer user controlled error messages, think Doc. Ideally we'd
Expand Down Expand Up @@ -403,7 +434,7 @@ newtype CoverCount =
newtype CoverPercentage =
CoverPercentage {
unCoverPercentage :: Double
} deriving (Eq, Ord, Show, Num)
} deriving (Eq, Ord, Show, Num, Fractional)

-- | The name of a classifier.
--
Expand Down Expand Up @@ -890,22 +921,60 @@ test =
defaultConfig :: PropertyConfig
defaultConfig =
PropertyConfig {
propertyTestLimit =
100
, propertyDiscardLimit =
propertyDiscardLimit =
100
, propertyShrinkLimit =
1000
, propertyShrinkRetries =
0
, propertyTerminationCriteria =
NoConfidenceTermination defaultMinTests
}

-- | The minimum amount of tests to run for a 'Property'
--
defaultMinTests :: TestLimit
defaultMinTests = 100

-- | The default confidence allows one false positive in 10^9 tests
--
defaultConfidence :: Confidence
defaultConfidence = 10 ^ (9 :: Int)

-- | Map a config modification function over a property.
--
mapConfig :: (PropertyConfig -> PropertyConfig) -> Property -> Property
mapConfig f (Property cfg t) =
Property (f cfg) t

-- | Make sure that the result is statistically significant in accordance to
-- the passed 'Confidence'
--
withConfidence :: Confidence -> Property -> Property
withConfidence c =
let
setConfidence = \case
NoEarlyTermination _ tests -> NoEarlyTermination c tests
NoConfidenceTermination tests -> NoEarlyTermination c tests
EarlyTermination _ tests -> EarlyTermination c tests
in
mapConfig $ \config@PropertyConfig{..} ->
config
{ propertyTerminationCriteria =
setConfidence propertyTerminationCriteria
}

verifiedTermination :: Property -> Property
verifiedTermination =
mapConfig $ \config@PropertyConfig{..} ->
let
newTerminationCriteria = case propertyTerminationCriteria of
NoEarlyTermination c tests -> EarlyTermination c tests
NoConfidenceTermination tests -> EarlyTermination defaultConfidence tests
EarlyTermination c tests -> EarlyTermination c tests
in
config { propertyTerminationCriteria = newTerminationCriteria }

-- | Set the number of times a property should be executed before it is considered
-- successful.
--
Expand All @@ -915,7 +984,14 @@ mapConfig f (Property cfg t) =
--
withTests :: TestLimit -> Property -> Property
withTests n =
mapConfig $ \config -> config { propertyTestLimit = n }
let
setTestLimit tests = \case
NoEarlyTermination c _ -> NoEarlyTermination c tests
NoConfidenceTermination _ -> NoConfidenceTermination tests
EarlyTermination c _ -> EarlyTermination c tests
in
mapConfig $ \config@PropertyConfig{..} ->
config { propertyTerminationCriteria = setTestLimit n propertyTerminationCriteria }

-- | Set the number of times a property is allowed to discard before the test
-- runner gives up.
Expand Down Expand Up @@ -1013,6 +1089,7 @@ labelCovered :: TestCount -> Label CoverCount -> Bool
labelCovered tests (MkLabel _ _ minimum_ population) =
coverPercentage tests population >= minimum_

-- | All labels are covered
coverageSuccess :: TestCount -> Coverage CoverCount -> Bool
coverageSuccess tests =
null . coverageFailures tests
Expand All @@ -1021,6 +1098,68 @@ coverageFailures :: TestCount -> Coverage CoverCount -> [Label CoverCount]
coverageFailures tests (Coverage kvs) =
filter (not . labelCovered tests) (Map.elems kvs)

-- | Is true when the test coverage satisfies the specified 'Confidence'
-- contstraint for all 'Coverage CoverCount's
confidenceSuccess :: TestCount -> Confidence -> Coverage CoverCount -> Bool
confidenceSuccess tests confidence =
let
assertLow :: Label CoverCount -> Bool
assertLow coverCount@MkLabel{..} =
fst (boundsForLabel tests confidence coverCount)
>= unCoverPercentage labelMinimum / 100.0
in
and . fmap assertLow . Map.elems . coverageLabels

-- | Is true when there exists a label that is sure to have failed according to
-- the 'Confidence' constraint
confidenceFailure :: TestCount -> Confidence -> Coverage CoverCount -> Bool
confidenceFailure tests confidence =
let
assertHigh :: Label CoverCount -> Bool
assertHigh coverCount@MkLabel{..} =
snd (boundsForLabel tests confidence coverCount)
< (unCoverPercentage labelMinimum / 100.0)
in
or . fmap assertHigh . Map.elems . coverageLabels

boundsForLabel :: TestCount -> Confidence -> Label CoverCount -> (Double, Double)
boundsForLabel tests confidence MkLabel{..} =
wilsonBounds
(fromIntegral $ unCoverCount labelAnnotation)
(fromIntegral tests)
(1 / fromIntegral (unConfidence confidence))

-- In order to get an accurate measurement with small sample sizes, we're
-- using the Wilson score interval
-- (<https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval
-- wikipedia>) instead of a normal approximation interval.
wilsonBounds :: Integer -> Integer -> Double -> (Double, Double)
wilsonBounds positives count acceptance =
let
p =
fromRational $ positives % count
n =
fromIntegral count
z =
invnormcdf $ 1 - acceptance / 2

midpoint =
p + z * z / (2 * n)

offset =
z / (1 + z ** 2 / n) * sqrt (p * (1 - p) / n + z ** 2 / (4 * n ** 2))

denominator =
1 + z * z / n

low =
(midpoint - offset) / denominator

high =
(midpoint + offset) / denominator
in
(low, high)

fromLabel :: Label a -> Coverage a
fromLabel x =
Coverage $
Expand Down
Loading

0 comments on commit 05a99d2

Please sign in to comment.