From 05a99d2bc64ac1ed82d0cc055703e5ceff5cfb07 Mon Sep 17 00:00:00 2001 From: Felix Mulder Date: Thu, 10 Oct 2019 19:38:40 +0200 Subject: [PATCH] Statistically verify result (#288) This PR adds ability to be able to statistically verify that a result is correct. It does not _yet_ add the ability to cancel running tests if they can never reach the specified coverage. --- hedgehog-example/hedgehog-example.cabal | 1 + .../src/Test/Example/Confidence.hs | 25 +++ hedgehog-example/test/test.hs | 20 +-- hedgehog/hedgehog.cabal | 2 + hedgehog/src/Hedgehog.hs | 5 + hedgehog/src/Hedgehog/Internal/Property.hs | 153 +++++++++++++++++- hedgehog/src/Hedgehog/Internal/Runner.hs | 100 +++++++++--- hedgehog/test/Test/Hedgehog/Confidence.hs | 27 ++++ hedgehog/test/test.hs | 2 + 9 files changed, 301 insertions(+), 34 deletions(-) create mode 100644 hedgehog-example/src/Test/Example/Confidence.hs create mode 100644 hedgehog/test/Test/Hedgehog/Confidence.hs diff --git a/hedgehog-example/hedgehog-example.cabal b/hedgehog-example/hedgehog-example.cabal index 68ce3f6c..b8e4580c 100644 --- a/hedgehog-example/hedgehog-example.cabal +++ b/hedgehog-example/hedgehog-example.cabal @@ -43,6 +43,7 @@ library exposed-modules: Test.Example.Basic + , Test.Example.Confidence , Test.Example.Coverage , Test.Example.Exception , Test.Example.List diff --git a/hedgehog-example/src/Test/Example/Confidence.hs b/hedgehog-example/src/Test/Example/Confidence.hs new file mode 100644 index 00000000..cff6e5ba --- /dev/null +++ b/hedgehog-example/src/Test/Example/Confidence.hs @@ -0,0 +1,25 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TemplateHaskell #-} +module Test.Example.Confidence where + +import Hedgehog +import qualified Hedgehog.Range as Range +import qualified Hedgehog.Internal.Gen as Gen + +------------------------------------------------------------------------ +-- Example 0: This test will certify that it is impossible to get 60% +-- coverage for the property label "number == 1" +-- +-- Note that it will abort running once it knows its task is +-- impossible - it will not run 1000000 tests +-- +prop_without_confidence :: Property +prop_without_confidence = + verifiedTermination . withConfidence (10^9) . withTests 1000000 . property $ do + number <- forAll (Gen.int $ Range.constant 1 2) + cover 60 "number == 1" $ number == 1 + +------------------------------------------------------------------------ +tests :: IO Bool +tests = + checkSequential $$(discover) diff --git a/hedgehog-example/test/test.hs b/hedgehog-example/test/test.hs index 31cf7aca..342e8549 100644 --- a/hedgehog-example/test/test.hs +++ b/hedgehog-example/test/test.hs @@ -1,14 +1,15 @@ import System.IO (BufferMode(..), hSetBuffering, stdout, stderr) -import qualified Test.Example.Basic as Test.Example.Basic -import qualified Test.Example.Coverage as Test.Example.Coverage -import qualified Test.Example.Exception as Test.Example.Exception -import qualified Test.Example.QuickCheck as Test.Example.QuickCheck -import qualified Test.Example.References as Test.Example.References -import qualified Test.Example.Registry as Test.Example.Registry -import qualified Test.Example.Resource as Test.Example.Resource -import qualified Test.Example.Roundtrip as Test.Example.Roundtrip -import qualified Test.Example.STLC as Test.Example.STLC +import qualified Test.Example.Basic +import qualified Test.Example.Confidence +import qualified Test.Example.Coverage +import qualified Test.Example.Exception +import qualified Test.Example.QuickCheck +import qualified Test.Example.References +import qualified Test.Example.Registry +import qualified Test.Example.Resource +import qualified Test.Example.Roundtrip +import qualified Test.Example.STLC main :: IO () main = do @@ -17,6 +18,7 @@ main = do _results <- sequence [ Test.Example.Basic.tests + , Test.Example.Confidence.tests , Test.Example.Coverage.tests , Test.Example.Exception.tests , Test.Example.QuickCheck.tests diff --git a/hedgehog/hedgehog.cabal b/hedgehog/hedgehog.cabal index 2db86d60..4d12665c 100644 --- a/hedgehog/hedgehog.cabal +++ b/hedgehog/hedgehog.cabal @@ -55,6 +55,7 @@ library , concurrent-output >= 1.7 && < 1.11 , containers >= 0.4 && < 0.7 , directory >= 1.2 && < 1.4 + , erf >= 2.0 && < 2.1 , exceptions >= 0.7 && < 0.11 , fail >= 4.9 && < 5 , lifted-async >= 0.7 && < 0.11 @@ -126,6 +127,7 @@ test-suite test other-modules: Test.Hedgehog.Applicative + Test.Hedgehog.Confidence Test.Hedgehog.Filter Test.Hedgehog.Seed Test.Hedgehog.Text diff --git a/hedgehog/src/Hedgehog.hs b/hedgehog/src/Hedgehog.hs index 28b459b9..5dd39dfe 100644 --- a/hedgehog/src/Hedgehog.hs +++ b/hedgehog/src/Hedgehog.hs @@ -66,6 +66,10 @@ module Hedgehog ( , checkParallel , checkSequential + , Confidence + , verifiedTermination + , withConfidence + , withTests , TestLimit @@ -167,6 +171,7 @@ import Hedgehog.Internal.Property (forAll, forAllWith) import Hedgehog.Internal.Property (LabelName, MonadTest(..)) import Hedgehog.Internal.Property (Property, PropertyT, PropertyName) import Hedgehog.Internal.Property (Group(..), GroupName) +import Hedgehog.Internal.Property (Confidence, verifiedTermination, withConfidence) import Hedgehog.Internal.Property (ShrinkLimit, withShrinks) import Hedgehog.Internal.Property (ShrinkRetries, withRetries) import Hedgehog.Internal.Property (Test, TestT, property, test) diff --git a/hedgehog/src/Hedgehog/Internal/Property.hs b/hedgehog/src/Hedgehog/Internal/Property.hs index 3fbf44d7..d6aee8c6 100644 --- a/hedgehog/src/Hedgehog/Internal/Property.hs +++ b/hedgehog/src/Hedgehog/Internal/Property.hs @@ -12,6 +12,7 @@ {-# LANGUAGE LambdaCase #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE RankNTypes #-} +{-# LANGUAGE RecordWildCards #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE TemplateHaskell #-} @@ -41,6 +42,7 @@ module Hedgehog.Internal.Property ( , forAllT , forAllWith , forAllWithT + , defaultMinTests , discard -- * Group @@ -92,6 +94,15 @@ module Hedgehog.Internal.Property ( , CoverPercentage(..) , toCoverCount + -- * Confidence + , Confidence(..) + , TerminationCriteria(..) + , confidenceSuccess + , confidenceFailure + , withConfidence + , verifiedTermination + , defaultConfidence + -- * Internal -- $internal , defaultConfig @@ -105,6 +116,8 @@ module Hedgehog.Internal.Property ( , mkTestT , runTest , runTestT + + , wilsonBounds ) where import Control.Applicative (Alternative(..)) @@ -139,11 +152,14 @@ import qualified Control.Monad.Trans.Writer.Strict as Strict import qualified Data.Char as Char import Data.Functor.Identity (Identity(..)) +import Data.Int (Int64) import Data.Map (Map) import qualified Data.Map.Strict as Map +import Data.Number.Erf (invnormcdf) import qualified Data.List as List import Data.Semigroup (Semigroup(..)) import Data.String (IsString) +import Data.Ratio ((%)) import Data.Typeable (typeOf) import Hedgehog.Internal.Distributive @@ -221,14 +237,23 @@ newtype PropertyName = unPropertyName :: String } deriving (Eq, Ord, Show, IsString, Semigroup, Lift) +-- | The acceptable occurrence of false positives +-- +-- Example, @Confidence 10^9@ would mean that you'd accept a false positive +-- for 1 in 10^9 tests. +newtype Confidence = + Confidence { + unConfidence :: Int64 + } deriving (Eq, Ord, Show, Num, Lift) + -- | Configuration for a property test. -- data PropertyConfig = PropertyConfig { - propertyTestLimit :: !TestLimit - , propertyDiscardLimit :: !DiscardLimit + propertyDiscardLimit :: !DiscardLimit , propertyShrinkLimit :: !ShrinkLimit , propertyShrinkRetries :: !ShrinkRetries + , propertyTerminationCriteria :: !TerminationCriteria } deriving (Eq, Ord, Show, Lift) -- | The number of successful tests that need to be run before a property test @@ -333,6 +358,12 @@ newtype PropertyCount = PropertyCount Int deriving (Eq, Ord, Show, Num, Enum, Real, Integral) +data TerminationCriteria = + EarlyTermination Confidence TestLimit + | NoEarlyTermination Confidence TestLimit + | NoConfidenceTermination TestLimit + deriving (Eq, Ord, Show, Lift) + -- -- FIXME This whole Log/Failure thing could be a lot more structured to allow -- FIXME for richer user controlled error messages, think Doc. Ideally we'd @@ -403,7 +434,7 @@ newtype CoverCount = newtype CoverPercentage = CoverPercentage { unCoverPercentage :: Double - } deriving (Eq, Ord, Show, Num) + } deriving (Eq, Ord, Show, Num, Fractional) -- | The name of a classifier. -- @@ -890,22 +921,60 @@ test = defaultConfig :: PropertyConfig defaultConfig = PropertyConfig { - propertyTestLimit = - 100 - , propertyDiscardLimit = + propertyDiscardLimit = 100 , propertyShrinkLimit = 1000 , propertyShrinkRetries = 0 + , propertyTerminationCriteria = + NoConfidenceTermination defaultMinTests } +-- | The minimum amount of tests to run for a 'Property' +-- +defaultMinTests :: TestLimit +defaultMinTests = 100 + +-- | The default confidence allows one false positive in 10^9 tests +-- +defaultConfidence :: Confidence +defaultConfidence = 10 ^ (9 :: Int) + -- | Map a config modification function over a property. -- mapConfig :: (PropertyConfig -> PropertyConfig) -> Property -> Property mapConfig f (Property cfg t) = Property (f cfg) t +-- | Make sure that the result is statistically significant in accordance to +-- the passed 'Confidence' +-- +withConfidence :: Confidence -> Property -> Property +withConfidence c = + let + setConfidence = \case + NoEarlyTermination _ tests -> NoEarlyTermination c tests + NoConfidenceTermination tests -> NoEarlyTermination c tests + EarlyTermination _ tests -> EarlyTermination c tests + in + mapConfig $ \config@PropertyConfig{..} -> + config + { propertyTerminationCriteria = + setConfidence propertyTerminationCriteria + } + +verifiedTermination :: Property -> Property +verifiedTermination = + mapConfig $ \config@PropertyConfig{..} -> + let + newTerminationCriteria = case propertyTerminationCriteria of + NoEarlyTermination c tests -> EarlyTermination c tests + NoConfidenceTermination tests -> EarlyTermination defaultConfidence tests + EarlyTermination c tests -> EarlyTermination c tests + in + config { propertyTerminationCriteria = newTerminationCriteria } + -- | Set the number of times a property should be executed before it is considered -- successful. -- @@ -915,7 +984,14 @@ mapConfig f (Property cfg t) = -- withTests :: TestLimit -> Property -> Property withTests n = - mapConfig $ \config -> config { propertyTestLimit = n } + let + setTestLimit tests = \case + NoEarlyTermination c _ -> NoEarlyTermination c tests + NoConfidenceTermination _ -> NoConfidenceTermination tests + EarlyTermination c _ -> EarlyTermination c tests + in + mapConfig $ \config@PropertyConfig{..} -> + config { propertyTerminationCriteria = setTestLimit n propertyTerminationCriteria } -- | Set the number of times a property is allowed to discard before the test -- runner gives up. @@ -1013,6 +1089,7 @@ labelCovered :: TestCount -> Label CoverCount -> Bool labelCovered tests (MkLabel _ _ minimum_ population) = coverPercentage tests population >= minimum_ +-- | All labels are covered coverageSuccess :: TestCount -> Coverage CoverCount -> Bool coverageSuccess tests = null . coverageFailures tests @@ -1021,6 +1098,68 @@ coverageFailures :: TestCount -> Coverage CoverCount -> [Label CoverCount] coverageFailures tests (Coverage kvs) = filter (not . labelCovered tests) (Map.elems kvs) +-- | Is true when the test coverage satisfies the specified 'Confidence' +-- contstraint for all 'Coverage CoverCount's +confidenceSuccess :: TestCount -> Confidence -> Coverage CoverCount -> Bool +confidenceSuccess tests confidence = + let + assertLow :: Label CoverCount -> Bool + assertLow coverCount@MkLabel{..} = + fst (boundsForLabel tests confidence coverCount) + >= unCoverPercentage labelMinimum / 100.0 + in + and . fmap assertLow . Map.elems . coverageLabels + +-- | Is true when there exists a label that is sure to have failed according to +-- the 'Confidence' constraint +confidenceFailure :: TestCount -> Confidence -> Coverage CoverCount -> Bool +confidenceFailure tests confidence = + let + assertHigh :: Label CoverCount -> Bool + assertHigh coverCount@MkLabel{..} = + snd (boundsForLabel tests confidence coverCount) + < (unCoverPercentage labelMinimum / 100.0) + in + or . fmap assertHigh . Map.elems . coverageLabels + +boundsForLabel :: TestCount -> Confidence -> Label CoverCount -> (Double, Double) +boundsForLabel tests confidence MkLabel{..} = + wilsonBounds + (fromIntegral $ unCoverCount labelAnnotation) + (fromIntegral tests) + (1 / fromIntegral (unConfidence confidence)) + +-- In order to get an accurate measurement with small sample sizes, we're +-- using the Wilson score interval +-- () instead of a normal approximation interval. +wilsonBounds :: Integer -> Integer -> Double -> (Double, Double) +wilsonBounds positives count acceptance = + let + p = + fromRational $ positives % count + n = + fromIntegral count + z = + invnormcdf $ 1 - acceptance / 2 + + midpoint = + p + z * z / (2 * n) + + offset = + z / (1 + z ** 2 / n) * sqrt (p * (1 - p) / n + z ** 2 / (4 * n ** 2)) + + denominator = + 1 + z * z / n + + low = + (midpoint - offset) / denominator + + high = + (midpoint + offset) / denominator + in + (low, high) + fromLabel :: Label a -> Coverage a fromLabel x = Coverage $ diff --git a/hedgehog/src/Hedgehog/Internal/Runner.hs b/hedgehog/src/Hedgehog/Internal/Runner.hs index 63c8f85b..b0701301 100644 --- a/hedgehog/src/Hedgehog/Internal/Runner.hs +++ b/hedgehog/src/Hedgehog/Internal/Runner.hs @@ -40,7 +40,10 @@ import Hedgehog.Internal.Property (Property(..), PropertyConfig(..), P import Hedgehog.Internal.Property (PropertyT(..), Failure(..), runTestT) import Hedgehog.Internal.Property (ShrinkLimit, ShrinkRetries, withTests) import Hedgehog.Internal.Property (TestCount(..), PropertyCount(..)) +import Hedgehog.Internal.Property (TerminationCriteria(..)) import Hedgehog.Internal.Property (coverageSuccess, journalCoverage) +import Hedgehog.Internal.Property (confidenceSuccess, confidenceFailure) +import Hedgehog.Internal.Property (defaultMinTests) import Hedgehog.Internal.Queue import Hedgehog.Internal.Region import Hedgehog.Internal.Report @@ -160,6 +163,30 @@ checkReport cfg size0 seed0 test0 updateUI = test = catchAll test0 (fail . show) + terminationCriteria = + propertyTerminationCriteria cfg + + (confidence, minTests) = + case terminationCriteria of + EarlyTermination c t -> (Just c, t) + NoEarlyTermination c t -> (Just c, t) + NoConfidenceTermination t -> (Nothing, t) + + successVerified count coverage = + count `mod` 100 == 0 && + -- If the user wants a statistically significant result, this function + -- will run a confidence check. Otherwise, it will default to checking + -- the percentage of encountered labels + maybe False (\c -> confidenceSuccess count c coverage) confidence + + failureVerified count coverage = + -- Will be true if we can statistically verify that our coverage was + -- inadequate. + -- Testing only on 100s to minimise repeated measurement statistical + -- errors. + count `mod` 100 == 0 && + maybe False (\c -> confidenceFailure count c coverage) confidence + loop :: TestCount -> DiscardCount @@ -170,28 +197,65 @@ checkReport cfg size0 seed0 test0 updateUI = loop !tests !discards !size !seed !coverage0 = do updateUI $ Report tests discards coverage0 Running + let + coverageReached = + successVerified tests coverage0 + + coverageUnreachable = + failureVerified tests coverage0 + + enoughTestsRun = + case terminationCriteria of + EarlyTermination _ _ -> + tests >= fromIntegral defaultMinTests && + (coverageReached || coverageUnreachable) + NoEarlyTermination _ _ -> + tests >= fromIntegral minTests + NoConfidenceTermination _ -> + tests >= fromIntegral minTests + + labelsCovered = + coverageSuccess tests coverage0 + + successReport = + Report tests discards coverage0 OK + + failureReport message = + Report tests discards coverage0 . Failed $ mkFailure + size + seed + 0 + (Just coverage0) + Nothing + message + Nothing + [] + + confidenceReport = + if coverageReached && labelsCovered then + successReport + else + failureReport $ + "Test coverage cannot be reached after " <> show tests <> " tests" + if size > 99 then -- size has reached limit, reset to 0 loop tests discards 0 seed coverage0 - else if tests >= fromIntegral (propertyTestLimit cfg) then - -- we've hit the test limit - if coverageSuccess tests coverage0 then - -- all classifiers satisfied, test was successful - pure $ Report tests discards coverage0 OK - - else - -- some classifiers unsatisfied, test was successful - pure . Report tests discards coverage0 . Failed $ - mkFailure - size - seed - 0 - (Just coverage0) - Nothing - "Insufficient coverage." - Nothing - [] + else if enoughTestsRun then + -- at this point, we know that enough tests have been run in order to + -- make a decision on if this was a successful run or not + -- + -- If we have early termination, then we need to check coverageReached / coverageUnreachable + pure $ case terminationCriteria of + EarlyTermination _ _ -> confidenceReport + NoEarlyTermination _ _ -> confidenceReport + NoConfidenceTermination _ -> + if labelsCovered then + successReport + else + failureReport $ + "Labels not sufficently covered after " <> show tests <> " tests" else if discards >= fromIntegral (propertyDiscardLimit cfg) then -- we've hit the discard limit, give up diff --git a/hedgehog/test/Test/Hedgehog/Confidence.hs b/hedgehog/test/Test/Hedgehog/Confidence.hs new file mode 100644 index 00000000..936ba7f8 --- /dev/null +++ b/hedgehog/test/Test/Hedgehog/Confidence.hs @@ -0,0 +1,27 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TemplateHaskell #-} +module Test.Hedgehog.Confidence where + +import Hedgehog +import qualified Hedgehog.Range as Range +import qualified Hedgehog.Internal.Gen as Gen + +confidence :: Confidence +confidence = 10 ^ (9 :: Int) + +prop_with_confidence :: Property +prop_with_confidence = + verifiedTermination . withConfidence confidence . property $ do + number <- forAll (Gen.int $ Range.linear 1 10) + cover 20 "number == 1" $ number == 1 + +-- This tests that at least 1000 tests are run for the property +prop_with_confidence_and_min_tests :: Property +prop_with_confidence_and_min_tests = + withConfidence confidence . withTests 1000 . property $ do + number <- forAll (Gen.int $ Range.linear 1 10) + cover 10 "number == 2" $ number == 2 + +tests :: IO Bool +tests = + checkParallel $$(discover) diff --git a/hedgehog/test/test.hs b/hedgehog/test/test.hs index b292a361..d663714f 100644 --- a/hedgehog/test/test.hs +++ b/hedgehog/test/test.hs @@ -1,6 +1,7 @@ import Hedgehog.Main (defaultMain) import qualified Test.Hedgehog.Applicative +import qualified Test.Hedgehog.Confidence import qualified Test.Hedgehog.Filter import qualified Test.Hedgehog.Seed import qualified Test.Hedgehog.Text @@ -11,6 +12,7 @@ main :: IO () main = defaultMain [ Test.Hedgehog.Applicative.tests + , Test.Hedgehog.Confidence.tests , Test.Hedgehog.Filter.tests , Test.Hedgehog.Seed.tests , Test.Hedgehog.Text.tests