Statistically verify result (#288)

This PR adds ability to be able to statistically verify that a result is correct. It does not _yet_ add the ability to cancel running tests if they can never reach the specified coverage.
hedgehogqa · Oct 10, 2019 · 05a99d2 · 05a99d2
1 parent 8b02b09
commit 05a99d2
Show file tree

Hide file tree

Showing 9 changed files with 301 additions and 34 deletions.
diff --git a/hedgehog-example/hedgehog-example.cabal b/hedgehog-example/hedgehog-example.cabal
@@ -43,6 +43,7 @@ library
 
   exposed-modules:
       Test.Example.Basic
+    , Test.Example.Confidence
     , Test.Example.Coverage
     , Test.Example.Exception
     , Test.Example.List

diff --git a/hedgehog-example/src/Test/Example/Confidence.hs b/hedgehog-example/src/Test/Example/Confidence.hs
@@ -0,0 +1,25 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE TemplateHaskell #-}
+module Test.Example.Confidence where
+
+import           Hedgehog
+import qualified Hedgehog.Range as Range
+import qualified Hedgehog.Internal.Gen as Gen
+
+------------------------------------------------------------------------
+-- Example 0: This test will certify that it is impossible to get 60%
+--            coverage for the property label "number == 1"
+--
+--            Note that it will abort running once it knows its task is
+--            impossible - it will not run 1000000 tests
+--
+prop_without_confidence :: Property
+prop_without_confidence =
+  verifiedTermination . withConfidence (10^9) . withTests 1000000 . property $ do
+    number <- forAll (Gen.int $ Range.constant 1 2)
+    cover 60 "number == 1" $ number == 1
+
+------------------------------------------------------------------------
+tests :: IO Bool
+tests =
+  checkSequential $$(discover)
diff --git a/hedgehog-example/test/test.hs b/hedgehog-example/test/test.hs
@@ -1,14 +1,15 @@
 import           System.IO (BufferMode(..), hSetBuffering, stdout, stderr)
 
-import qualified Test.Example.Basic as Test.Example.Basic
-import qualified Test.Example.Coverage as Test.Example.Coverage
-import qualified Test.Example.Exception as Test.Example.Exception
-import qualified Test.Example.QuickCheck as Test.Example.QuickCheck
-import qualified Test.Example.References as Test.Example.References
-import qualified Test.Example.Registry as Test.Example.Registry
-import qualified Test.Example.Resource as Test.Example.Resource
-import qualified Test.Example.Roundtrip as Test.Example.Roundtrip
-import qualified Test.Example.STLC as Test.Example.STLC
+import qualified Test.Example.Basic
+import qualified Test.Example.Confidence
+import qualified Test.Example.Coverage
+import qualified Test.Example.Exception
+import qualified Test.Example.QuickCheck
+import qualified Test.Example.References
+import qualified Test.Example.Registry
+import qualified Test.Example.Resource
+import qualified Test.Example.Roundtrip
+import qualified Test.Example.STLC
 
 main :: IO ()
 main = do
@@ -17,6 +18,7 @@ main = do
 
   _results <- sequence [
       Test.Example.Basic.tests
+    , Test.Example.Confidence.tests
     , Test.Example.Coverage.tests
     , Test.Example.Exception.tests
     , Test.Example.QuickCheck.tests

diff --git a/hedgehog/hedgehog.cabal b/hedgehog/hedgehog.cabal
@@ -55,6 +55,7 @@ library
     , concurrent-output               >= 1.7        && < 1.11
     , containers                      >= 0.4        && < 0.7
     , directory                       >= 1.2        && < 1.4
+    , erf                             >= 2.0        && < 2.1
     , exceptions                      >= 0.7        && < 0.11
     , fail                            >= 4.9        && < 5
     , lifted-async                    >= 0.7        && < 0.11
@@ -126,6 +127,7 @@ test-suite test
 
   other-modules:
     Test.Hedgehog.Applicative
+    Test.Hedgehog.Confidence
     Test.Hedgehog.Filter
     Test.Hedgehog.Seed
     Test.Hedgehog.Text

diff --git a/hedgehog/src/Hedgehog.hs b/hedgehog/src/Hedgehog.hs
@@ -66,6 +66,10 @@ module Hedgehog (
   , checkParallel
   , checkSequential
 
+  , Confidence
+  , verifiedTermination
+  , withConfidence
+
   , withTests
   , TestLimit
 
@@ -167,6 +171,7 @@ import           Hedgehog.Internal.Property (forAll, forAllWith)
 import           Hedgehog.Internal.Property (LabelName, MonadTest(..))
 import           Hedgehog.Internal.Property (Property, PropertyT, PropertyName)
 import           Hedgehog.Internal.Property (Group(..), GroupName)
+import           Hedgehog.Internal.Property (Confidence, verifiedTermination, withConfidence)
 import           Hedgehog.Internal.Property (ShrinkLimit, withShrinks)
 import           Hedgehog.Internal.Property (ShrinkRetries, withRetries)
 import           Hedgehog.Internal.Property (Test, TestT, property, test)

diff --git a/hedgehog/src/Hedgehog/Internal/Property.hs b/hedgehog/src/Hedgehog/Internal/Property.hs
@@ -12,6 +12,7 @@
 {-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE MultiParamTypeClasses #-}
 {-# LANGUAGE RankNTypes #-}
+{-# LANGUAGE RecordWildCards #-}
 {-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE StandaloneDeriving #-}
 {-# LANGUAGE TemplateHaskell #-}
@@ -41,6 +42,7 @@ module Hedgehog.Internal.Property (
   , forAllT
   , forAllWith
   , forAllWithT
+  , defaultMinTests
   , discard
 
   -- * Group
@@ -92,6 +94,15 @@ module Hedgehog.Internal.Property (
   , CoverPercentage(..)
   , toCoverCount
 
+  -- * Confidence
+  , Confidence(..)
+  , TerminationCriteria(..)
+  , confidenceSuccess
+  , confidenceFailure
+  , withConfidence
+  , verifiedTermination
+  , defaultConfidence
+
   -- * Internal
   -- $internal
   , defaultConfig
@@ -105,6 +116,8 @@ module Hedgehog.Internal.Property (
   , mkTestT
   , runTest
   , runTestT
+
+  , wilsonBounds
   ) where
 
 import           Control.Applicative (Alternative(..))
@@ -139,11 +152,14 @@ import qualified Control.Monad.Trans.Writer.Strict as Strict
 
 import qualified Data.Char as Char
 import           Data.Functor.Identity (Identity(..))
+import           Data.Int (Int64)
 import           Data.Map (Map)
 import qualified Data.Map.Strict as Map
+import           Data.Number.Erf (invnormcdf)
 import qualified Data.List as List
 import           Data.Semigroup (Semigroup(..))
 import           Data.String (IsString)
+import           Data.Ratio ((%))
 import           Data.Typeable (typeOf)
 
 import           Hedgehog.Internal.Distributive
@@ -221,14 +237,23 @@ newtype PropertyName =
       unPropertyName :: String
     } deriving (Eq, Ord, Show, IsString, Semigroup, Lift)
 
+-- | The acceptable occurrence of false positives
+--
+--   Example, @Confidence 10^9@ would mean that you'd accept a false positive
+--   for 1 in 10^9 tests.
+newtype Confidence =
+  Confidence {
+    unConfidence :: Int64
+  } deriving (Eq, Ord, Show, Num, Lift)
+
 -- | Configuration for a property test.
 --
 data PropertyConfig =
   PropertyConfig {
-      propertyTestLimit :: !TestLimit
-    , propertyDiscardLimit :: !DiscardLimit
+      propertyDiscardLimit :: !DiscardLimit
     , propertyShrinkLimit :: !ShrinkLimit
     , propertyShrinkRetries :: !ShrinkRetries
+    , propertyTerminationCriteria :: !TerminationCriteria
     } deriving (Eq, Ord, Show, Lift)
 
 -- | The number of successful tests that need to be run before a property test
@@ -333,6 +358,12 @@ newtype PropertyCount =
   PropertyCount Int
   deriving (Eq, Ord, Show, Num, Enum, Real, Integral)
 
+data TerminationCriteria =
+    EarlyTermination Confidence TestLimit
+  | NoEarlyTermination Confidence TestLimit
+  | NoConfidenceTermination TestLimit
+  deriving (Eq, Ord, Show, Lift)
+
 --
 -- FIXME This whole Log/Failure thing could be a lot more structured to allow
 -- FIXME for richer user controlled error messages, think Doc. Ideally we'd
@@ -403,7 +434,7 @@ newtype CoverCount =
 newtype CoverPercentage =
   CoverPercentage {
       unCoverPercentage :: Double
-    } deriving (Eq, Ord, Show, Num)
+    } deriving (Eq, Ord, Show, Num, Fractional)
 
 -- | The name of a classifier.
 --
@@ -890,22 +921,60 @@ test =
 defaultConfig :: PropertyConfig
 defaultConfig =
   PropertyConfig {
-      propertyTestLimit =
-        100
-    , propertyDiscardLimit =
+      propertyDiscardLimit =
         100
     , propertyShrinkLimit =
         1000
     , propertyShrinkRetries =
         0
+    , propertyTerminationCriteria =
+        NoConfidenceTermination defaultMinTests
     }
 
+-- | The minimum amount of tests to run for a 'Property'
+--
+defaultMinTests :: TestLimit
+defaultMinTests = 100
+
+-- | The default confidence allows one false positive in 10^9 tests
+--
+defaultConfidence :: Confidence
+defaultConfidence = 10 ^ (9 :: Int)
+
 -- | Map a config modification function over a property.
 --
 mapConfig :: (PropertyConfig -> PropertyConfig) -> Property -> Property
 mapConfig f (Property cfg t) =
   Property (f cfg) t
 
+-- | Make sure that the result is statistically significant in accordance to
+--   the passed 'Confidence'
+--
+withConfidence :: Confidence -> Property -> Property
+withConfidence c =
+  let
+    setConfidence = \case
+      NoEarlyTermination _ tests -> NoEarlyTermination c tests
+      NoConfidenceTermination tests -> NoEarlyTermination c tests
+      EarlyTermination _ tests -> EarlyTermination c tests
+  in
+    mapConfig $ \config@PropertyConfig{..} ->
+      config
+        { propertyTerminationCriteria =
+            setConfidence propertyTerminationCriteria
+        }
+
+verifiedTermination :: Property -> Property
+verifiedTermination =
+  mapConfig $ \config@PropertyConfig{..} ->
+    let
+      newTerminationCriteria = case propertyTerminationCriteria of
+        NoEarlyTermination c tests -> EarlyTermination c tests
+        NoConfidenceTermination tests -> EarlyTermination defaultConfidence tests
+        EarlyTermination c tests -> EarlyTermination c tests
+    in
+      config { propertyTerminationCriteria = newTerminationCriteria }
+
 -- | Set the number of times a property should be executed before it is considered
 --   successful.
 --
@@ -915,7 +984,14 @@ mapConfig f (Property cfg t) =
 --
 withTests :: TestLimit -> Property -> Property
 withTests n =
-  mapConfig $ \config -> config { propertyTestLimit = n }
+  let
+    setTestLimit tests = \case
+      NoEarlyTermination c _ -> NoEarlyTermination c tests
+      NoConfidenceTermination _ -> NoConfidenceTermination tests
+      EarlyTermination c _ -> EarlyTermination c tests
+  in
+    mapConfig $ \config@PropertyConfig{..} ->
+      config { propertyTerminationCriteria = setTestLimit n propertyTerminationCriteria }
 
 -- | Set the number of times a property is allowed to discard before the test
 --   runner gives up.
@@ -1013,6 +1089,7 @@ labelCovered :: TestCount -> Label CoverCount -> Bool
 labelCovered tests (MkLabel _ _ minimum_ population) =
   coverPercentage tests population >= minimum_
 
+-- | All labels are covered
 coverageSuccess :: TestCount -> Coverage CoverCount -> Bool
 coverageSuccess tests =
   null . coverageFailures tests
@@ -1021,6 +1098,68 @@ coverageFailures :: TestCount -> Coverage CoverCount -> [Label CoverCount]
 coverageFailures tests (Coverage kvs) =
   filter (not . labelCovered tests) (Map.elems kvs)
 
+-- | Is true when the test coverage satisfies the specified 'Confidence'
+--   contstraint for all 'Coverage CoverCount's
+confidenceSuccess :: TestCount -> Confidence -> Coverage CoverCount -> Bool
+confidenceSuccess tests confidence =
+  let
+    assertLow :: Label CoverCount -> Bool
+    assertLow coverCount@MkLabel{..} =
+      fst (boundsForLabel tests confidence coverCount)
+        >= unCoverPercentage labelMinimum / 100.0
+  in
+    and . fmap assertLow . Map.elems . coverageLabels
+
+-- | Is true when there exists a label that is sure to have failed according to
+--   the 'Confidence' constraint
+confidenceFailure :: TestCount -> Confidence -> Coverage CoverCount -> Bool
+confidenceFailure tests confidence =
+  let
+    assertHigh :: Label CoverCount -> Bool
+    assertHigh coverCount@MkLabel{..} =
+      snd (boundsForLabel tests confidence coverCount)
+        < (unCoverPercentage labelMinimum / 100.0)
+  in
+    or . fmap assertHigh . Map.elems . coverageLabels
+
+boundsForLabel :: TestCount -> Confidence -> Label CoverCount -> (Double, Double)
+boundsForLabel tests confidence MkLabel{..} =
+  wilsonBounds
+    (fromIntegral $ unCoverCount labelAnnotation)
+    (fromIntegral tests)
+    (1 / fromIntegral (unConfidence confidence))
+
+-- In order to get an accurate measurement with small sample sizes, we're
+-- using the Wilson score interval
+-- (<https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval
+-- wikipedia>) instead of a normal approximation interval.
+wilsonBounds :: Integer -> Integer -> Double -> (Double, Double)
+wilsonBounds positives count acceptance =
+  let
+    p =
+      fromRational $ positives % count
+    n =
+      fromIntegral count
+    z =
+      invnormcdf $ 1 - acceptance / 2
+
+    midpoint =
+      p + z * z / (2 * n)
+
+    offset =
+      z / (1 + z ** 2 / n) * sqrt (p * (1 - p) / n + z ** 2 / (4 * n ** 2))
+
+    denominator =
+      1 + z * z / n
+
+    low =
+      (midpoint - offset) / denominator
+
+    high =
+      (midpoint + offset) / denominator
+  in
+    (low, high)
+
 fromLabel :: Label a -> Coverage a
 fromLabel x =
   Coverage $