From 0817722a761e97e91825246a78e14b3f18b6b006 Mon Sep 17 00:00:00 2001 From: Pengfei Liu Date: Tue, 27 Dec 2022 16:29:07 -0500 Subject: [PATCH 1/2] add tests for meval to replicate paper results --- .../bart_score_cnn_ref_hypo_predictions.json | 662 ++++++++++++++++++ .../newsroom/rouge1_f_predictions.json | 662 ++++++++++++++++++ integration_tests/meta_eval_nlg_test.py | 90 ++- 3 files changed, 1401 insertions(+), 13 deletions(-) create mode 100644 integration_tests/artifacts/newsroom/bart_score_cnn_ref_hypo_predictions.json create mode 100644 integration_tests/artifacts/newsroom/rouge1_f_predictions.json diff --git a/integration_tests/artifacts/newsroom/bart_score_cnn_ref_hypo_predictions.json b/integration_tests/artifacts/newsroom/bart_score_cnn_ref_hypo_predictions.json new file mode 100644 index 00000000..0b2d9480 --- /dev/null +++ b/integration_tests/artifacts/newsroom/bart_score_cnn_ref_hypo_predictions.json @@ -0,0 +1,662 @@ +[ + { + "auto_scores": [ + -1.2346594333648682, + -3.337571382522583, + -6.383989334106445, + -4.984112739562988, + -3.6778671741485596, + -3.930911064147949, + -3.212421178817749 + ] + }, + { + "auto_scores": [ + -3.6213812828063965, + -0.634912371635437, + -3.670194387435913, + -5.574587821960449, + -6.174976825714111, + -4.187601089477539, + -4.893369197845459 + ] + }, + { + "auto_scores": [ + -4.3441267013549805, + -2.0739285945892334, + -4.994687080383301, + -5.674907207489014, + -4.832435607910156, + -4.986924171447754, + -5.201959133148193 + ] + }, + { + "auto_scores": [ + -5.000977039337158, + -5.222707748413086, + -4.795951843261719, + -3.734975814819336, + -3.2286689281463623, + -3.826882839202881, + -4.8123393058776855 + ] + }, + { + "auto_scores": [ + -4.223236560821533, + -4.719715595245361, + -4.59467077255249, + -5.68583869934082, + -6.5522780418396, + -5.388362407684326, + -5.18690299987793 + ] + }, + { + "auto_scores": [ + -3.7510688304901123, + -2.914438247680664, + -4.286332130432129, + -7.133133411407471, + -5.809284210205078, + -3.3124961853027344, + -5.612478733062744 + ] + }, + { + "auto_scores": [ + -0.6680727005004883, + -0.6739019155502319, + -3.1031758785247803, + -5.8908233642578125, + -4.66937780380249, + -2.0242176055908203, + -4.522027492523193 + ] + }, + { + "auto_scores": [ + -4.329258441925049, + -5.275319576263428, + -5.579972743988037, + -7.09144401550293, + -5.283503532409668, + -5.104097843170166, + -4.941505432128906 + ] + }, + { + "auto_scores": [ + -2.8284058570861816, + -1.1876786947250366, + -3.3185195922851562, + -4.423750400543213, + -4.909323692321777, + -3.3485019207000732, + -3.490218162536621 + ] + }, + { + "auto_scores": [ + -6.907121181488037, + -4.707708358764648, + -4.0600481033325195, + -6.627317905426025, + -4.6124958992004395, + -5.713040828704834, + -4.879390239715576 + ] + }, + { + "auto_scores": [ + -3.482964515686035, + -1.8555914163589478, + -3.4856302738189697, + -4.5765790939331055, + -5.007498264312744, + -5.3058695793151855, + -4.370933532714844 + ] + }, + { + "auto_scores": [ + -3.649778127670288, + -7.154506683349609, + -3.289411783218384, + -5.8764119148254395, + -5.020175933837891, + -4.205811023712158, + -4.3719482421875 + ] + }, + { + "auto_scores": [ + -3.525455951690674, + -0.6714977025985718, + -3.5236685276031494, + -5.575921058654785, + -4.328283309936523, + -4.421144008636475, + -5.753363609313965 + ] + }, + { + "auto_scores": [ + -4.823455810546875, + -3.305582284927368, + -3.7563633918762207, + -7.122200012207031, + -6.407985687255859, + -6.161899566650391, + -6.045509338378906 + ] + }, + { + "auto_scores": [ + -4.271821022033691, + -0.22897133231163025, + -1.7710247039794922, + -5.976954460144043, + -4.617368221282959, + -4.148235321044922, + -5.564839839935303 + ] + }, + { + "auto_scores": [ + -4.172784328460693, + -0.9746262431144714, + -4.971782207489014, + -7.950442790985107, + -4.324660778045654, + -5.159337043762207, + -5.440691947937012 + ] + }, + { + "auto_scores": [ + -3.0683135986328125, + -2.433345079421997, + -2.8803658485412598, + -5.805661678314209, + -4.270867824554443, + -3.3736610412597656, + -5.132525444030762 + ] + }, + { + "auto_scores": [ + -1.7446292638778687, + -0.6325616240501404, + -2.5213329792022705, + -4.796778202056885, + -4.467785835266113, + -3.71679949760437, + -4.044307708740234 + ] + }, + { + "auto_scores": [ + -4.092418670654297, + -3.5181899070739746, + -4.59417200088501, + -4.340042591094971, + -5.503137111663818, + -5.130063533782959, + -4.715507507324219 + ] + }, + { + "auto_scores": [ + -1.9360358715057373, + -1.2985289096832275, + -1.7955436706542969, + -5.797574996948242, + -5.288994789123535, + -2.147458076477051, + -2.362152576446533 + ] + }, + { + "auto_scores": [ + -3.1864800453186035, + -7.8803300857543945, + -3.383310079574585, + -5.561729431152344, + -5.314629554748535, + -3.5953166484832764, + -5.051906108856201 + ] + }, + { + "auto_scores": [ + -3.8382568359375, + -1.2670823335647583, + -4.553617000579834, + -5.867466449737549, + -4.707453727722168, + -5.538605213165283, + -4.538035869598389 + ] + }, + { + "auto_scores": [ + -3.8993802070617676, + -3.60168719291687, + -3.857508659362793, + -4.424567699432373, + -5.072203636169434, + -4.495748519897461, + -4.477273464202881 + ] + }, + { + "auto_scores": [ + -1.950005054473877, + -0.7109898924827576, + -3.2378036975860596, + -5.4950456619262695, + -4.684595108032227, + -1.7371671199798584, + -4.209641933441162 + ] + }, + { + "auto_scores": [ + -4.496479034423828, + -2.556669235229492, + -4.498341083526611, + -5.729393005371094, + -5.359912872314453, + -5.263284206390381, + -6.106546401977539 + ] + }, + { + "auto_scores": [ + -4.054862976074219, + -0.45988163352012634, + -4.6832661628723145, + -4.341195106506348, + -5.400857448577881, + -4.356181621551514, + -5.217522144317627 + ] + }, + { + "auto_scores": [ + -5.333022117614746, + -1.9357249736785889, + -4.021650314331055, + -5.890506267547607, + -4.706431865692139, + -5.232513904571533, + -5.546692371368408 + ] + }, + { + "auto_scores": [ + -3.3256938457489014, + -0.9897561073303223, + -4.210707664489746, + -5.635035037994385, + -5.4662861824035645, + -3.871394157409668, + -5.324164390563965 + ] + }, + { + "auto_scores": [ + -2.717494487762451, + -3.7315330505371094, + -3.955324172973633, + -4.491107940673828, + -4.338680744171143, + -3.963266372680664, + -4.681497573852539 + ] + }, + { + "auto_scores": [ + -0.8609310984611511, + -1.1239956617355347, + -1.3126013278961182, + -4.678332805633545, + -4.1132965087890625, + -2.218703269958496, + -3.150106906890869 + ] + }, + { + "auto_scores": [ + -3.643179416656494, + -0.9851183891296387, + -3.833064317703247, + -4.648499011993408, + -4.927526950836182, + -4.354979038238525, + -5.20051383972168 + ] + }, + { + "auto_scores": [ + -3.9338979721069336, + -3.090111494064331, + -4.448740482330322, + -3.902815818786621, + -4.836552619934082, + -3.8867716789245605, + -5.490387439727783 + ] + }, + { + "auto_scores": [ + -1.1862435340881348, + -1.603387713432312, + -4.425495147705078, + -4.629676342010498, + -6.500186920166016, + -1.749828577041626, + -3.9402852058410645 + ] + }, + { + "auto_scores": [ + -3.261005401611328, + -0.7430241703987122, + -4.377554416656494, + -5.855930805206299, + -5.323549747467041, + -4.947967529296875, + -4.609178066253662 + ] + }, + { + "auto_scores": [ + -2.8447532653808594, + -0.9991836547851562, + -3.883254051208496, + -7.081796646118164, + -5.052463531494141, + -4.451764106750488, + -4.260918140411377 + ] + }, + { + "auto_scores": [ + -2.7264280319213867, + -1.1419219970703125, + -1.1307411193847656, + -5.69485330581665, + -5.184634208679199, + -2.312387228012085, + -2.2400753498077393 + ] + }, + { + "auto_scores": [ + -3.8610730171203613, + -1.3325031995773315, + -3.223046064376831, + -5.786430835723877, + -5.001805305480957, + -4.727075576782227, + -4.931162357330322 + ] + }, + { + "auto_scores": [ + -3.1031854152679443, + -2.283616065979004, + -3.1693084239959717, + -5.209831714630127, + -5.4373955726623535, + -3.6447370052337646, + -5.161511421203613 + ] + }, + { + "auto_scores": [ + -2.831552505493164, + -1.4173871278762817, + -2.1985905170440674, + -4.8720808029174805, + -3.425079345703125, + -2.933311700820923, + -4.249796390533447 + ] + }, + { + "auto_scores": [ + -2.482973575592041, + -0.8178629279136658, + -3.3884518146514893, + -4.180578231811523, + -4.87364387512207, + -1.7275251150131226, + -3.002310037612915 + ] + }, + { + "auto_scores": [ + -3.5216124057769775, + -1.315640926361084, + -4.976970195770264, + -4.588405132293701, + -4.988601207733154, + -4.338250160217285, + -4.258938789367676 + ] + }, + { + "auto_scores": [ + -2.2196450233459473, + -0.6448060274124146, + -4.11163330078125, + -8.273467063903809, + -3.422013521194458, + -2.428804397583008, + -4.367648601531982 + ] + }, + { + "auto_scores": [ + -2.8822271823883057, + -1.338028073310852, + -2.5680463314056396, + -5.671765327453613, + -3.6491050720214844, + -3.923588752746582, + -3.6876883506774902 + ] + }, + { + "auto_scores": [ + -3.0493404865264893, + -1.0289462804794312, + -2.6373345851898193, + -5.77088737487793, + -4.87874174118042, + -5.060049057006836, + -5.282394886016846 + ] + }, + { + "auto_scores": [ + -4.5092267990112305, + -2.1672258377075195, + -5.196324348449707, + -4.173991680145264, + -5.770926475524902, + -4.504154205322266, + -5.454038619995117 + ] + }, + { + "auto_scores": [ + -3.9801697731018066, + -1.4928231239318848, + -4.76878547668457, + -4.193707466125488, + -4.717602252960205, + -4.891543388366699, + -4.786087989807129 + ] + }, + { + "auto_scores": [ + -4.327728271484375, + -5.265812873840332, + -4.09039831161499, + -5.932196140289307, + -5.382634162902832, + -4.686864376068115, + -5.1508002281188965 + ] + }, + { + "auto_scores": [ + -3.269033432006836, + -4.5389723777771, + -4.005921840667725, + -6.995456695556641, + -4.278707981109619, + -3.5476715564727783, + -4.695621490478516 + ] + }, + { + "auto_scores": [ + -3.557901620864868, + -2.4186339378356934, + -4.851772785186768, + -4.371306896209717, + -5.103922367095947, + -4.333691596984863, + -4.298647880554199 + ] + }, + { + "auto_scores": [ + -4.055961608886719, + -4.662075519561768, + -3.473954439163208, + -7.5713982582092285, + -6.1275200843811035, + -5.409901142120361, + -5.5505499839782715 + ] + }, + { + "auto_scores": [ + -3.688018798828125, + -3.572950839996338, + -4.312163352966309, + -4.746547698974609, + -4.910508155822754, + -4.222039222717285, + -4.782878398895264 + ] + }, + { + "auto_scores": [ + -3.908001661300659, + -1.983359932899475, + -4.276918411254883, + -5.791119575500488, + -5.717682361602783, + -4.877152442932129, + -4.80209493637085 + ] + }, + { + "auto_scores": [ + -3.632587194442749, + -3.0536139011383057, + -4.381106376647949, + -5.96414852142334, + -4.910465240478516, + -3.78712797164917, + -5.2252631187438965 + ] + }, + { + "auto_scores": [ + -3.524134635925293, + -1.4702116250991821, + -3.7997453212738037, + -6.277534484863281, + -5.421000003814697, + -4.504569053649902, + -4.773496150970459 + ] + }, + { + "auto_scores": [ + -3.3854877948760986, + -4.794503211975098, + -4.626842498779297, + -4.996484756469727, + -5.472954273223877, + -4.742428302764893, + -6.0292253494262695 + ] + }, + { + "auto_scores": [ + -3.2534337043762207, + -1.3036439418792725, + -3.159641981124878, + -6.059638023376465, + -4.791782379150391, + -4.999390125274658, + -5.129875659942627 + ] + }, + { + "auto_scores": [ + -3.7950146198272705, + -1.0038492679595947, + -3.8807947635650635, + -6.902472496032715, + -5.115923881530762, + -4.9170732498168945, + -4.917073726654053 + ] + }, + { + "auto_scores": [ + -4.213415622711182, + -4.250669002532959, + -3.8901631832122803, + -5.633732318878174, + -5.176709175109863, + -5.110617637634277, + -5.267539024353027 + ] + }, + { + "auto_scores": [ + -4.235969543457031, + -2.604471445083618, + -4.817269802093506, + -6.817267894744873, + -4.734187126159668, + -4.550678253173828, + -4.019895076751709 + ] + }, + { + "auto_scores": [ + -4.271620273590088, + -3.7753279209136963, + -4.285919189453125, + -6.411378860473633, + -4.8667378425598145, + -4.203290939331055, + -4.631669521331787 + ] + } +] \ No newline at end of file diff --git a/integration_tests/artifacts/newsroom/rouge1_f_predictions.json b/integration_tests/artifacts/newsroom/rouge1_f_predictions.json new file mode 100644 index 00000000..482f4989 --- /dev/null +++ b/integration_tests/artifacts/newsroom/rouge1_f_predictions.json @@ -0,0 +1,662 @@ +[ + { + "auto_scores": [ + 0.9434, + 0.21918, + 0.05263, + 0.19781, + 0.22857, + 0.21539, + 0.47059 + ] + }, + { + "auto_scores": [ + 0.31667, + 0.97561, + 0.54545, + 0.07408, + 0.17858, + 0.18519, + 0.38554 + ] + }, + { + "auto_scores": [ + 0.21052, + 0.9, + 0.16439, + 0.05555, + 0.28572, + 0.28571, + 0.12598 + ] + }, + { + "auto_scores": [ + 0.09411, + 0.15686, + 0.12698, + 0.12372, + 0.75, + 0.04167, + 0.0 + ] + }, + { + "auto_scores": [ + 0.12281, + 0.72, + 0.10811, + 0.0, + 0.11628, + 0.12766, + 0.10526 + ] + }, + { + "auto_scores": [ + 0.2029, + 0.85714, + 0.23188, + 0.15, + 0.25715, + 0.17241, + 0.17647 + ] + }, + { + "auto_scores": [ + 0.64285, + 1.0, + 0.37255, + 0.0, + 0.32117, + 0.77894, + 0.6 + ] + }, + { + "auto_scores": [ + 0.12903, + 0.66667, + 0.15385, + 0.0, + 0.10526, + 0.11111, + 0.11111 + ] + }, + { + "auto_scores": [ + 0.52941, + 0.95652, + 0.19672, + 0.0, + 0.16667, + 0.53731, + 0.45569 + ] + }, + { + "auto_scores": [ + 0.09756, + 0.67692, + 0.10126, + 0.04445, + 0.13158, + 0.07895, + 0.12599 + ] + }, + { + "auto_scores": [ + 0.2078, + 0.90909, + 0.21739, + 0.07407, + 0.15001, + 0.20833, + 0.11321 + ] + }, + { + "auto_scores": [ + 0.08, + 0.43479, + 0.15686, + 0.08334, + 0.09412, + 0.11765, + 0.1282 + ] + }, + { + "auto_scores": [ + 0.34951, + 1.0, + 0.64285, + 0.21053, + 0.4, + 0.11594, + 0.1039 + ] + }, + { + "auto_scores": [ + 0.10937, + 0.66667, + 0.07142, + 0.08333, + 0.10126, + 0.0, + 0.03252 + ] + }, + { + "auto_scores": [ + 0.12903, + 1.0, + 0.82051, + 0.0, + 0.16667, + 0.43243, + 0.33614 + ] + }, + { + "auto_scores": [ + 0.09449, + 0.96552, + 0.14492, + 0.0, + 0.07577, + 0.3077, + 0.06061 + ] + }, + { + "auto_scores": [ + 0.26373, + 0.9, + 0.33334, + 0.0, + 0.12612, + 0.17544, + 0.12903 + ] + }, + { + "auto_scores": [ + 0.33708, + 0.98529, + 0.26263, + 0.02381, + 0.34783, + 0.5, + 0.24242 + ] + }, + { + "auto_scores": [ + 0.18182, + 0.66667, + 0.15385, + 0.0, + 0.0, + 0.16666, + 0.11111 + ] + }, + { + "auto_scores": [ + 0.6055, + 0.93507, + 0.73333, + 0.04545, + 0.46729, + 0.85715, + 0.73333 + ] + }, + { + "auto_scores": [ + 0.0, + 0.33333, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "auto_scores": [ + 0.2, + 0.97297, + 0.21875, + 0.0, + 0.47369, + 0.11111, + 0.21818 + ] + }, + { + "auto_scores": [ + 0.25862, + 0.70968, + 0.2, + 0.03636, + 0.21539, + 0.28948, + 0.30769 + ] + }, + { + "auto_scores": [ + 0.48855, + 0.98462, + 0.23684, + 0.09524, + 0.17778, + 0.76923, + 0.40876 + ] + }, + { + "auto_scores": [ + 0.08265, + 0.78572, + 0.04445, + 0.0, + 0.08602, + 0.03637, + 0.075 + ] + }, + { + "auto_scores": [ + 0.22609, + 1.0, + 0.1608, + 0.03509, + 0.125, + 0.17894, + 0.15384 + ] + }, + { + "auto_scores": [ + 0.27118, + 0.93549, + 0.20513, + 0.0, + 0.30107, + 0.27692, + 0.21782 + ] + }, + { + "auto_scores": [ + 0.42222, + 0.97436, + 0.24, + 0.0, + 0.26415, + 0.73077, + 0.29033 + ] + }, + { + "auto_scores": [ + 0.22535, + 0.875, + 0.3077, + 0.0, + 0.18182, + 0.14634, + 0.13333 + ] + }, + { + "auto_scores": [ + 0.49006, + 1.0, + 0.59259, + 0.09091, + 0.51807, + 0.425, + 0.59649 + ] + }, + { + "auto_scores": [ + 0.11764, + 0.91892, + 0.28169, + 0.11428, + 0.26966, + 0.19231, + 0.11429 + ] + }, + { + "auto_scores": [ + 0.07595, + 0.82927, + 0.14706, + 0.0, + 0.11111, + 0.025, + 0.09302 + ] + }, + { + "auto_scores": [ + 0.69822, + 0.98148, + 0.24096, + 0.04838, + 0.22377, + 0.5, + 0.35616 + ] + }, + { + "auto_scores": [ + 0.42857, + 0.98182, + 0.26471, + 0.16666, + 0.20472, + 0.16129, + 0.18182 + ] + }, + { + "auto_scores": [ + 0.58252, + 0.9697, + 0.12821, + 0.05405, + 0.26471, + 0.21621, + 0.25 + ] + }, + { + "auto_scores": [ + 0.61017, + 0.98591, + 1.0, + 0.16394, + 0.34568, + 0.95652, + 1.0 + ] + }, + { + "auto_scores": [ + 0.43678, + 0.95652, + 0.51613, + 0.10526, + 0.14545, + 0.14545, + 0.19048 + ] + }, + { + "auto_scores": [ + 0.28572, + 0.91892, + 0.20896, + 0.11428, + 0.19672, + 0.28571, + 0.19672 + ] + }, + { + "auto_scores": [ + 0.38938, + 0.97675, + 0.83019, + 0.0, + 0.37624, + 0.56604, + 0.35088 + ] + }, + { + "auto_scores": [ + 0.63889, + 0.97872, + 0.15873, + 0.05263, + 0.07843, + 0.74193, + 0.74193 + ] + }, + { + "auto_scores": [ + 0.12195, + 0.96296, + 0.21818, + 0.2069, + 0.17242, + 0.17242, + 0.24 + ] + }, + { + "auto_scores": [ + 0.33463, + 0.99479, + 0.09216, + 0.0, + 0.31372, + 0.28444, + 0.41216 + ] + }, + { + "auto_scores": [ + 0.42718, + 0.95833, + 0.1, + 0.0, + 0.15152, + 0.10345, + 0.4466 + ] + }, + { + "auto_scores": [ + 0.31343, + 0.97675, + 0.47619, + 0.0, + 0.14458, + 0.16667, + 0.10526 + ] + }, + { + "auto_scores": [ + 0.11494, + 0.93333, + 0.2, + 0.0, + 0.11112, + 0.05, + 0.0708 + ] + }, + { + "auto_scores": [ + 0.17204, + 0.95, + 0.18749, + 0.05556, + 0.16667, + 0.06667, + 0.1791 + ] + }, + { + "auto_scores": [ + 0.16901, + 0.73077, + 0.28986, + 0.0, + 0.19355, + 0.09375, + 0.225 + ] + }, + { + "auto_scores": [ + 0.18868, + 0.78788, + 0.19179, + 0.15, + 0.24657, + 0.1695, + 0.17143 + ] + }, + { + "auto_scores": [ + 0.1519, + 0.84615, + 0.09677, + 0.06667, + 0.03884, + 0.07844, + 0.07273 + ] + }, + { + "auto_scores": [ + 0.12307, + 0.7317, + 0.19047, + 0.0, + 0.1282, + 0.09836, + 0.11594 + ] + }, + { + "auto_scores": [ + 0.08772, + 0.78261, + 0.08696, + 0.07143, + 0.04706, + 0.14545, + 0.10666 + ] + }, + { + "auto_scores": [ + 0.20833, + 0.91666, + 0.24, + 0.16666, + 0.13636, + 0.17857, + 0.22222 + ] + }, + { + "auto_scores": [ + 0.08, + 0.78431, + 0.19178, + 0.0, + 0.26087, + 0.28986, + 0.20619 + ] + }, + { + "auto_scores": [ + 0.16868, + 0.875, + 0.17778, + 0.0, + 0.28, + 0.28, + 0.20589 + ] + }, + { + "auto_scores": [ + 0.12371, + 0.69565, + 0.06558, + 0.06667, + 0.05797, + 0.07692, + 0.05633 + ] + }, + { + "auto_scores": [ + 0.20155, + 0.93333, + 0.2623, + 0.0, + 0.14457, + 0.13793, + 0.18666 + ] + }, + { + "auto_scores": [ + 0.15385, + 0.97143, + 0.16326, + 0.25806, + 0.19178, + 0.09091, + 0.09091 + ] + }, + { + "auto_scores": [ + 0.19672, + 0.79166, + 0.21538, + 0.0, + 0.10204, + 0.21538, + 0.18182 + ] + }, + { + "auto_scores": [ + 0.28948, + 0.85714, + 0.11941, + 0.36364, + 0.31579, + 0.3, + 0.2353 + ] + }, + { + "auto_scores": [ + 0.20833, + 0.90323, + 0.21538, + 0.16, + 0.10417, + 0.10526, + 0.13333 + ] + } +] \ No newline at end of file diff --git a/integration_tests/meta_eval_nlg_test.py b/integration_tests/meta_eval_nlg_test.py index e125aa77..6ae36821 100644 --- a/integration_tests/meta_eval_nlg_test.py +++ b/integration_tests/meta_eval_nlg_test.py @@ -1,20 +1,24 @@ from __future__ import annotations +import os import unittest +from integration_tests.utils import test_artifacts_path import numpy as np +from explainaboard import FileType, get_processor_class, Source, TaskType +from explainaboard.loaders.file_loader import DatalabLoaderOption +from explainaboard.loaders.loader_factory import get_loader_class from explainaboard.metrics.meta_evaluation import CorrelationNLG, CorrelationNLGConfig +from explainaboard.metrics.metric import Score from explainaboard.utils.typing_utils import narrow, unwrap class MetaEvalNLGInvalidValueTest(unittest.TestCase): - true_data = [[1, 2, 3, 4, 5], [2, 1, 4, 5, 2], [5, 4, 3, 2, 1]] pred_data = [[2, 1, 3, 4, 5], [2, 4, 5, 5, 2], [5, 3, 4, 2, 1]] def test_illegal_correlation_type_calc_stats_from_data(self) -> None: - nlg_corr_config = CorrelationNLGConfig( group_by="sample", correlation_type="illegal" ) @@ -33,7 +37,6 @@ def test_illegal_correlation_type_aggregate_stats(self) -> None: corr_metric._calc_metric_from_aggregate_single(stats_arr) def test_illegal_group_type_calc_stats_from_data(self) -> None: - nlg_corr_config = CorrelationNLGConfig( group_by="illegal", correlation_type="spearmanr" ) @@ -61,7 +64,6 @@ class MetaEvalNLGTest(unittest.TestCase): pred_data = [[2, 1, 3, 4, 5], [2, 4, 5, 5, 2], [5, 3, 4, 2, 1]] def test_sample_level_spearmanr(self) -> None: - nlg_corr_config = CorrelationNLGConfig( group_by="sample", correlation_type="spearmanr" ) @@ -73,7 +75,6 @@ def test_sample_level_spearmanr(self) -> None: self.assertAlmostEqual(val, 0.8162952, 3) def test_sample_level_kendalltau(self) -> None: - nlg_corr_config = CorrelationNLGConfig( group_by="sample", correlation_type="kendalltau" ) @@ -84,7 +85,6 @@ def test_sample_level_kendalltau(self) -> None: self.assertAlmostEqual(val, 0.69046817, 3) def test_sample_level_pearsonr(self) -> None: - nlg_corr_config = CorrelationNLGConfig( group_by="sample", correlation_type="pearsonr" ) @@ -95,7 +95,6 @@ def test_sample_level_pearsonr(self) -> None: self.assertAlmostEqual(val, 0.820707397, 3) def test_system_level_spearmanr(self) -> None: - nlg_corr_config = CorrelationNLGConfig( group_by="system", correlation_type="spearmanr" ) @@ -106,7 +105,6 @@ def test_system_level_spearmanr(self) -> None: self.assertAlmostEqual(val, 0.815789, 3) def test_system_level_kendalltau(self) -> None: - nlg_corr_config = CorrelationNLGConfig( group_by="system", correlation_type="kendalltau" ) @@ -117,7 +115,6 @@ def test_system_level_kendalltau(self) -> None: self.assertAlmostEqual(val, 0.66666, 3) def test_dataset_level_spearmanr(self) -> None: - true_data = [[1], [2], [3], [4], [5]] pred_data = [[1], [2], [3], [4], [5]] @@ -132,12 +129,10 @@ def test_dataset_level_spearmanr(self) -> None: class MetaEvalNLGCITest(unittest.TestCase): - true_data = [[1, 2, 3, 4, 5], [2, 1, 4, 5, 2], [5, 4, 3, 2, 1]] pred_data = [[2, 1, 3, 4, 5], [2, 4, 5, 5, 2], [5, 3, 4, 2, 1]] def test_sample_level_spearmanr_bootstrap(self) -> None: - nlg_corr_config = CorrelationNLGConfig( group_by="sample", correlation_type="spearmanr" ) @@ -154,7 +149,6 @@ def test_sample_level_spearmanr_bootstrap(self) -> None: self.assertAlmostEqual(ci[1], 0.8999, 2) def test_system_level_spearmanr_bootstrap(self) -> None: - nlg_corr_config = CorrelationNLGConfig( group_by="system", correlation_type="spearmanr" ) @@ -171,7 +165,6 @@ def test_system_level_spearmanr_bootstrap(self) -> None: self.assertAlmostEqual(ci[1], 0.9746, 2) def test_dataset_level_spearmanr_bootstrap(self) -> None: - true_data = [[1], [2], [3], [4], [5]] pred_data = [[1], [2], [3], [4], [5]] @@ -189,3 +182,74 @@ def test_dataset_level_spearmanr_bootstrap(self) -> None: ci = unwrap(corr_metric.calc_confidence_interval(stats, 0.05)) self.assertAlmostEqual(ci[0], 1, 2) self.assertAlmostEqual(ci[1], 1, 2) + + +class MetaEvalNLGNewsroomTest(unittest.TestCase): + """ + Test the NLG metric on newsroom dataset and replicate the reported results from + the paper: https://arxiv.org/pdf/2106.11520.pdf + """ + + artifact_path = os.path.join(test_artifacts_path, "newsroom") + predictions_rouge1 = os.path.join(artifact_path, "rouge1_f_predictions.json") + predictions_bartscore = os.path.join( + artifact_path, "bart_score_cnn_ref_hypo_predictions.json" + ) + + def test_coherence_rouge1_f(self): + loader = get_loader_class(TaskType.meta_evaluation_nlg).from_datalab( + dataset=DatalabLoaderOption("meval_newsroom", "coherence"), + output_data=self.predictions_rouge1, + output_source=Source.local_filesystem, + output_file_type=FileType.json, + ) + data = loader.load().samples + + metadata = { + "task_name": TaskType.meta_evaluation_nlg.value, + "dataset_name": "meval_newsroom", + "sub_dataset_name": "coherence", + "metric_names": ["SpearmanSampleLevelCorr"], + } + processor = get_processor_class(TaskType.meta_evaluation_nlg)() + sys_info = processor.process(metadata, data) + overall_score = ( + sys_info.results.overall["example"]["SpearmanSampleLevelCorr"] + .get_value(Score, "score") + .value + ) + self.assertGreater(len(sys_info.results.analyses), 0) + self.assertAlmostEqual( + overall_score, + 0.0946, + places=3, + ) + + def test_coherence_bartscore(self): + loader = get_loader_class(TaskType.meta_evaluation_nlg).from_datalab( + dataset=DatalabLoaderOption("meval_newsroom", "coherence"), + output_data=self.predictions_bartscore, + output_source=Source.local_filesystem, + output_file_type=FileType.json, + ) + data = loader.load().samples + + metadata = { + "task_name": TaskType.meta_evaluation_nlg.value, + "dataset_name": "meval_newsroom", + "sub_dataset_name": "coherence", + "metric_names": ["SpearmanSampleLevelCorr"], + } + processor = get_processor_class(TaskType.meta_evaluation_nlg)() + sys_info = processor.process(metadata, data) + overall_score = ( + sys_info.results.overall["example"]["SpearmanSampleLevelCorr"] + .get_value(Score, "score") + .value + ) + self.assertGreater(len(sys_info.results.analyses), 0) + self.assertAlmostEqual( + overall_score, + 0.3157, + places=3, + ) From 6fa4dd4d8f498e6d47287795c78c978577b9c485 Mon Sep 17 00:00:00 2001 From: Pengfei Liu Date: Fri, 30 Dec 2022 23:38:16 -0500 Subject: [PATCH 2/2] add comments & update files --- .../bart_score_cnn_ref_hypo_predictions.json | 663 +----------------- .../newsroom/rouge1_f_predictions.json | 663 +----------------- integration_tests/meta_eval_nlg_test.py | 3 + 3 files changed, 5 insertions(+), 1324 deletions(-) diff --git a/integration_tests/artifacts/newsroom/bart_score_cnn_ref_hypo_predictions.json b/integration_tests/artifacts/newsroom/bart_score_cnn_ref_hypo_predictions.json index 0b2d9480..b4b3b036 100644 --- a/integration_tests/artifacts/newsroom/bart_score_cnn_ref_hypo_predictions.json +++ b/integration_tests/artifacts/newsroom/bart_score_cnn_ref_hypo_predictions.json @@ -1,662 +1 @@ -[ - { - "auto_scores": [ - -1.2346594333648682, - -3.337571382522583, - -6.383989334106445, - -4.984112739562988, - -3.6778671741485596, - -3.930911064147949, - -3.212421178817749 - ] - }, - { - "auto_scores": [ - -3.6213812828063965, - -0.634912371635437, - -3.670194387435913, - -5.574587821960449, - -6.174976825714111, - -4.187601089477539, - -4.893369197845459 - ] - }, - { - "auto_scores": [ - -4.3441267013549805, - -2.0739285945892334, - -4.994687080383301, - -5.674907207489014, - -4.832435607910156, - -4.986924171447754, - -5.201959133148193 - ] - }, - { - "auto_scores": [ - -5.000977039337158, - -5.222707748413086, - -4.795951843261719, - -3.734975814819336, - -3.2286689281463623, - -3.826882839202881, - -4.8123393058776855 - ] - }, - { - "auto_scores": [ - -4.223236560821533, - -4.719715595245361, - -4.59467077255249, - -5.68583869934082, - -6.5522780418396, - -5.388362407684326, - -5.18690299987793 - ] - }, - { - "auto_scores": [ - -3.7510688304901123, - -2.914438247680664, - -4.286332130432129, - -7.133133411407471, - -5.809284210205078, - -3.3124961853027344, - -5.612478733062744 - ] - }, - { - "auto_scores": [ - -0.6680727005004883, - -0.6739019155502319, - -3.1031758785247803, - -5.8908233642578125, - -4.66937780380249, - -2.0242176055908203, - -4.522027492523193 - ] - }, - { - "auto_scores": [ - -4.329258441925049, - -5.275319576263428, - -5.579972743988037, - -7.09144401550293, - -5.283503532409668, - -5.104097843170166, - -4.941505432128906 - ] - }, - { - "auto_scores": [ - -2.8284058570861816, - -1.1876786947250366, - -3.3185195922851562, - -4.423750400543213, - -4.909323692321777, - -3.3485019207000732, - -3.490218162536621 - ] - }, - { - "auto_scores": [ - -6.907121181488037, - -4.707708358764648, - -4.0600481033325195, - -6.627317905426025, - -4.6124958992004395, - -5.713040828704834, - -4.879390239715576 - ] - }, - { - "auto_scores": [ - -3.482964515686035, - -1.8555914163589478, - -3.4856302738189697, - -4.5765790939331055, - -5.007498264312744, - -5.3058695793151855, - -4.370933532714844 - ] - }, - { - "auto_scores": [ - -3.649778127670288, - -7.154506683349609, - -3.289411783218384, - -5.8764119148254395, - -5.020175933837891, - -4.205811023712158, - -4.3719482421875 - ] - }, - { - "auto_scores": [ - -3.525455951690674, - -0.6714977025985718, - -3.5236685276031494, - -5.575921058654785, - -4.328283309936523, - -4.421144008636475, - -5.753363609313965 - ] - }, - { - "auto_scores": [ - -4.823455810546875, - -3.305582284927368, - -3.7563633918762207, - -7.122200012207031, - -6.407985687255859, - -6.161899566650391, - -6.045509338378906 - ] - }, - { - "auto_scores": [ - -4.271821022033691, - -0.22897133231163025, - -1.7710247039794922, - -5.976954460144043, - -4.617368221282959, - -4.148235321044922, - -5.564839839935303 - ] - }, - { - "auto_scores": [ - -4.172784328460693, - -0.9746262431144714, - -4.971782207489014, - -7.950442790985107, - -4.324660778045654, - -5.159337043762207, - -5.440691947937012 - ] - }, - { - "auto_scores": [ - -3.0683135986328125, - -2.433345079421997, - -2.8803658485412598, - -5.805661678314209, - -4.270867824554443, - -3.3736610412597656, - -5.132525444030762 - ] - }, - { - "auto_scores": [ - -1.7446292638778687, - -0.6325616240501404, - -2.5213329792022705, - -4.796778202056885, - -4.467785835266113, - -3.71679949760437, - -4.044307708740234 - ] - }, - { - "auto_scores": [ - -4.092418670654297, - -3.5181899070739746, - -4.59417200088501, - -4.340042591094971, - -5.503137111663818, - -5.130063533782959, - -4.715507507324219 - ] - }, - { - "auto_scores": [ - -1.9360358715057373, - -1.2985289096832275, - -1.7955436706542969, - -5.797574996948242, - -5.288994789123535, - -2.147458076477051, - -2.362152576446533 - ] - }, - { - "auto_scores": [ - -3.1864800453186035, - -7.8803300857543945, - -3.383310079574585, - -5.561729431152344, - -5.314629554748535, - -3.5953166484832764, - -5.051906108856201 - ] - }, - { - "auto_scores": [ - -3.8382568359375, - -1.2670823335647583, - -4.553617000579834, - -5.867466449737549, - -4.707453727722168, - -5.538605213165283, - -4.538035869598389 - ] - }, - { - "auto_scores": [ - -3.8993802070617676, - -3.60168719291687, - -3.857508659362793, - -4.424567699432373, - -5.072203636169434, - -4.495748519897461, - -4.477273464202881 - ] - }, - { - "auto_scores": [ - -1.950005054473877, - -0.7109898924827576, - -3.2378036975860596, - -5.4950456619262695, - -4.684595108032227, - -1.7371671199798584, - -4.209641933441162 - ] - }, - { - "auto_scores": [ - -4.496479034423828, - -2.556669235229492, - -4.498341083526611, - -5.729393005371094, - -5.359912872314453, - -5.263284206390381, - -6.106546401977539 - ] - }, - { - "auto_scores": [ - -4.054862976074219, - -0.45988163352012634, - -4.6832661628723145, - -4.341195106506348, - -5.400857448577881, - -4.356181621551514, - -5.217522144317627 - ] - }, - { - "auto_scores": [ - -5.333022117614746, - -1.9357249736785889, - -4.021650314331055, - -5.890506267547607, - -4.706431865692139, - -5.232513904571533, - -5.546692371368408 - ] - }, - { - "auto_scores": [ - -3.3256938457489014, - -0.9897561073303223, - -4.210707664489746, - -5.635035037994385, - -5.4662861824035645, - -3.871394157409668, - -5.324164390563965 - ] - }, - { - "auto_scores": [ - -2.717494487762451, - -3.7315330505371094, - -3.955324172973633, - -4.491107940673828, - -4.338680744171143, - -3.963266372680664, - -4.681497573852539 - ] - }, - { - "auto_scores": [ - -0.8609310984611511, - -1.1239956617355347, - -1.3126013278961182, - -4.678332805633545, - -4.1132965087890625, - -2.218703269958496, - -3.150106906890869 - ] - }, - { - "auto_scores": [ - -3.643179416656494, - -0.9851183891296387, - -3.833064317703247, - -4.648499011993408, - -4.927526950836182, - -4.354979038238525, - -5.20051383972168 - ] - }, - { - "auto_scores": [ - -3.9338979721069336, - -3.090111494064331, - -4.448740482330322, - -3.902815818786621, - -4.836552619934082, - -3.8867716789245605, - -5.490387439727783 - ] - }, - { - "auto_scores": [ - -1.1862435340881348, - -1.603387713432312, - -4.425495147705078, - -4.629676342010498, - -6.500186920166016, - -1.749828577041626, - -3.9402852058410645 - ] - }, - { - "auto_scores": [ - -3.261005401611328, - -0.7430241703987122, - -4.377554416656494, - -5.855930805206299, - -5.323549747467041, - -4.947967529296875, - -4.609178066253662 - ] - }, - { - "auto_scores": [ - -2.8447532653808594, - -0.9991836547851562, - -3.883254051208496, - -7.081796646118164, - -5.052463531494141, - -4.451764106750488, - -4.260918140411377 - ] - }, - { - "auto_scores": [ - -2.7264280319213867, - -1.1419219970703125, - -1.1307411193847656, - -5.69485330581665, - -5.184634208679199, - -2.312387228012085, - -2.2400753498077393 - ] - }, - { - "auto_scores": [ - -3.8610730171203613, - -1.3325031995773315, - -3.223046064376831, - -5.786430835723877, - -5.001805305480957, - -4.727075576782227, - -4.931162357330322 - ] - }, - { - "auto_scores": [ - -3.1031854152679443, - -2.283616065979004, - -3.1693084239959717, - -5.209831714630127, - -5.4373955726623535, - -3.6447370052337646, - -5.161511421203613 - ] - }, - { - "auto_scores": [ - -2.831552505493164, - -1.4173871278762817, - -2.1985905170440674, - -4.8720808029174805, - -3.425079345703125, - -2.933311700820923, - -4.249796390533447 - ] - }, - { - "auto_scores": [ - -2.482973575592041, - -0.8178629279136658, - -3.3884518146514893, - -4.180578231811523, - -4.87364387512207, - -1.7275251150131226, - -3.002310037612915 - ] - }, - { - "auto_scores": [ - -3.5216124057769775, - -1.315640926361084, - -4.976970195770264, - -4.588405132293701, - -4.988601207733154, - -4.338250160217285, - -4.258938789367676 - ] - }, - { - "auto_scores": [ - -2.2196450233459473, - -0.6448060274124146, - -4.11163330078125, - -8.273467063903809, - -3.422013521194458, - -2.428804397583008, - -4.367648601531982 - ] - }, - { - "auto_scores": [ - -2.8822271823883057, - -1.338028073310852, - -2.5680463314056396, - -5.671765327453613, - -3.6491050720214844, - -3.923588752746582, - -3.6876883506774902 - ] - }, - { - "auto_scores": [ - -3.0493404865264893, - -1.0289462804794312, - -2.6373345851898193, - -5.77088737487793, - -4.87874174118042, - -5.060049057006836, - -5.282394886016846 - ] - }, - { - "auto_scores": [ - -4.5092267990112305, - -2.1672258377075195, - -5.196324348449707, - -4.173991680145264, - -5.770926475524902, - -4.504154205322266, - -5.454038619995117 - ] - }, - { - "auto_scores": [ - -3.9801697731018066, - -1.4928231239318848, - -4.76878547668457, - -4.193707466125488, - -4.717602252960205, - -4.891543388366699, - -4.786087989807129 - ] - }, - { - "auto_scores": [ - -4.327728271484375, - -5.265812873840332, - -4.09039831161499, - -5.932196140289307, - -5.382634162902832, - -4.686864376068115, - -5.1508002281188965 - ] - }, - { - "auto_scores": [ - -3.269033432006836, - -4.5389723777771, - -4.005921840667725, - -6.995456695556641, - -4.278707981109619, - -3.5476715564727783, - -4.695621490478516 - ] - }, - { - "auto_scores": [ - -3.557901620864868, - -2.4186339378356934, - -4.851772785186768, - -4.371306896209717, - -5.103922367095947, - -4.333691596984863, - -4.298647880554199 - ] - }, - { - "auto_scores": [ - -4.055961608886719, - -4.662075519561768, - -3.473954439163208, - -7.5713982582092285, - -6.1275200843811035, - -5.409901142120361, - -5.5505499839782715 - ] - }, - { - "auto_scores": [ - -3.688018798828125, - -3.572950839996338, - -4.312163352966309, - -4.746547698974609, - -4.910508155822754, - -4.222039222717285, - -4.782878398895264 - ] - }, - { - "auto_scores": [ - -3.908001661300659, - -1.983359932899475, - -4.276918411254883, - -5.791119575500488, - -5.717682361602783, - -4.877152442932129, - -4.80209493637085 - ] - }, - { - "auto_scores": [ - -3.632587194442749, - -3.0536139011383057, - -4.381106376647949, - -5.96414852142334, - -4.910465240478516, - -3.78712797164917, - -5.2252631187438965 - ] - }, - { - "auto_scores": [ - -3.524134635925293, - -1.4702116250991821, - -3.7997453212738037, - -6.277534484863281, - -5.421000003814697, - -4.504569053649902, - -4.773496150970459 - ] - }, - { - "auto_scores": [ - -3.3854877948760986, - -4.794503211975098, - -4.626842498779297, - -4.996484756469727, - -5.472954273223877, - -4.742428302764893, - -6.0292253494262695 - ] - }, - { - "auto_scores": [ - -3.2534337043762207, - -1.3036439418792725, - -3.159641981124878, - -6.059638023376465, - -4.791782379150391, - -4.999390125274658, - -5.129875659942627 - ] - }, - { - "auto_scores": [ - -3.7950146198272705, - -1.0038492679595947, - -3.8807947635650635, - -6.902472496032715, - -5.115923881530762, - -4.9170732498168945, - -4.917073726654053 - ] - }, - { - "auto_scores": [ - -4.213415622711182, - -4.250669002532959, - -3.8901631832122803, - -5.633732318878174, - -5.176709175109863, - -5.110617637634277, - -5.267539024353027 - ] - }, - { - "auto_scores": [ - -4.235969543457031, - -2.604471445083618, - -4.817269802093506, - -6.817267894744873, - -4.734187126159668, - -4.550678253173828, - -4.019895076751709 - ] - }, - { - "auto_scores": [ - -4.271620273590088, - -3.7753279209136963, - -4.285919189453125, - -6.411378860473633, - -4.8667378425598145, - -4.203290939331055, - -4.631669521331787 - ] - } -] \ No newline at end of file +[{"auto_scores": [-3.212421178817749, -1.2346594333648682, -3.337571382522583, -6.383989334106445, -4.984112739562988, -3.6778671741485596, -3.930911064147949]}, {"auto_scores": [-3.6213812828063965, -0.634912371635437, -3.670194387435913, -5.574587821960449, -6.174976825714111, -4.187601089477539, -4.893369197845459]}, {"auto_scores": [-4.3441267013549805, -2.0739285945892334, -4.994687080383301, -5.674907207489014, -4.832435607910156, -4.986924171447754, -5.201959133148193]}, {"auto_scores": [-3.734975814819336, -3.2286689281463623, -3.826882839202881, -4.8123393058776855, -5.000977039337158, -5.222707748413086, -4.795951843261719]}, {"auto_scores": [-4.223236560821533, -4.719715595245361, -4.59467077255249, -5.68583869934082, -6.5522780418396, -5.388362407684326, -5.18690299987793]}, {"auto_scores": [-3.7510688304901123, -2.914438247680664, -4.286332130432129, -7.133133411407471, -5.809284210205078, -3.3124961853027344, -5.612478733062744]}, {"auto_scores": [-0.6680727005004883, -0.6739019155502319, -3.1031758785247803, -5.8908233642578125, -4.66937780380249, -2.0242176055908203, -4.522027492523193]}, {"auto_scores": [-4.329258441925049, -5.275319576263428, -5.579972743988037, -7.09144401550293, -5.283503532409668, -5.104097843170166, -4.941505432128906]}, {"auto_scores": [-2.8284058570861816, -1.1876786947250366, -3.3185195922851562, -4.423750400543213, -4.909323692321777, -3.3485019207000732, -3.490218162536621]}, {"auto_scores": [-6.907121181488037, -4.707708358764648, -4.0600481033325195, -6.627317905426025, -4.6124958992004395, -5.713040828704834, -4.879390239715576]}, {"auto_scores": [-3.482964515686035, -1.8555914163589478, -3.4856302738189697, -4.5765790939331055, -5.007498264312744, -5.3058695793151855, -4.370933532714844]}, {"auto_scores": [-3.649778127670288, -7.154506683349609, -3.289411783218384, -5.8764119148254395, -5.020175933837891, -4.205811023712158, -4.3719482421875]}, {"auto_scores": [-3.525455951690674, -0.6714977025985718, -3.5236685276031494, -5.575921058654785, -4.328283309936523, -4.421144008636475, -5.753363609313965]}, {"auto_scores": [-4.823455810546875, -3.305582284927368, -3.7563633918762207, -7.122200012207031, -6.407985687255859, -6.161899566650391, -6.045509338378906]}, {"auto_scores": [-4.271821022033691, -0.22897133231163025, -1.7710247039794922, -5.976954460144043, -4.617368221282959, -4.148235321044922, -5.564839839935303]}, {"auto_scores": [-4.172784328460693, -0.9746262431144714, -4.971782207489014, -7.950442790985107, -4.324660778045654, -5.159337043762207, -5.440691947937012]}, {"auto_scores": [-3.0683135986328125, -2.433345079421997, -2.8803658485412598, -5.805661678314209, -4.270867824554443, -3.3736610412597656, -5.132525444030762]}, {"auto_scores": [-1.7446292638778687, -0.6325616240501404, -2.5213329792022705, -4.796778202056885, -4.467785835266113, -3.71679949760437, -4.044307708740234]}, {"auto_scores": [-4.092418670654297, -3.5181899070739746, -4.59417200088501, -4.340042591094971, -5.503137111663818, -5.130063533782959, -4.715507507324219]}, {"auto_scores": [-1.9360358715057373, -1.2985289096832275, -1.7955436706542969, -5.797574996948242, -5.288994789123535, -2.147458076477051, -2.362152576446533]}, {"auto_scores": [-3.1864800453186035, -7.8803300857543945, -3.383310079574585, -5.561729431152344, -5.314629554748535, -3.5953166484832764, -5.051906108856201]}, {"auto_scores": [-3.8382568359375, -1.2670823335647583, -4.553617000579834, -5.867466449737549, -4.707453727722168, -5.538605213165283, -4.538035869598389]}, {"auto_scores": [-3.8993802070617676, -3.60168719291687, -3.857508659362793, -4.424567699432373, -5.072203636169434, -4.495748519897461, -4.477273464202881]}, {"auto_scores": [-1.950005054473877, -0.7109898924827576, -3.2378036975860596, -5.4950456619262695, -4.684595108032227, -1.7371671199798584, -4.209641933441162]}, {"auto_scores": [-4.496479034423828, -2.556669235229492, -4.498341083526611, -5.729393005371094, -5.359912872314453, -5.263284206390381, -6.106546401977539]}, {"auto_scores": [-4.054862976074219, -0.45988163352012634, -4.6832661628723145, -4.341195106506348, -5.400857448577881, -4.356181621551514, -5.217522144317627]}, {"auto_scores": [-5.333022117614746, -1.9357249736785889, -4.021650314331055, -5.890506267547607, -4.706431865692139, -5.232513904571533, -5.546692371368408]}, {"auto_scores": [-3.3256938457489014, -0.9897561073303223, -4.210707664489746, -5.635035037994385, -5.4662861824035645, -3.871394157409668, -5.324164390563965]}, {"auto_scores": [-2.717494487762451, -3.7315330505371094, -3.955324172973633, -4.491107940673828, -4.338680744171143, -3.963266372680664, -4.681497573852539]}, {"auto_scores": [-0.8609310984611511, -1.1239956617355347, -1.3126013278961182, -4.678332805633545, -4.1132965087890625, -2.218703269958496, -3.150106906890869]}, {"auto_scores": [-3.643179416656494, -0.9851183891296387, -3.833064317703247, -4.648499011993408, -4.927526950836182, -4.354979038238525, -5.20051383972168]}, {"auto_scores": [-3.9338979721069336, -3.090111494064331, -4.448740482330322, -3.902815818786621, -4.836552619934082, -3.8867716789245605, -5.490387439727783]}, {"auto_scores": [-1.1862435340881348, -1.603387713432312, -4.425495147705078, -4.629676342010498, -6.500186920166016, -1.749828577041626, -3.9402852058410645]}, {"auto_scores": [-3.261005401611328, -0.7430241703987122, -4.377554416656494, -5.855930805206299, -5.323549747467041, -4.947967529296875, -4.609178066253662]}, {"auto_scores": [-2.8447532653808594, -0.9991836547851562, -3.883254051208496, -7.081796646118164, -5.052463531494141, -4.451764106750488, -4.260918140411377]}, {"auto_scores": [-2.7264280319213867, -1.1419219970703125, -1.1307411193847656, -5.69485330581665, -5.184634208679199, -2.312387228012085, -2.2400753498077393]}, {"auto_scores": [-3.8610730171203613, -1.3325031995773315, -3.223046064376831, -5.786430835723877, -5.001805305480957, -4.727075576782227, -4.931162357330322]}, {"auto_scores": [-3.1031854152679443, -2.283616065979004, -3.1693084239959717, -5.209831714630127, -5.4373955726623535, -3.6447370052337646, -5.161511421203613]}, {"auto_scores": [-2.831552505493164, -1.4173871278762817, -2.1985905170440674, -4.8720808029174805, -3.425079345703125, -2.933311700820923, -4.249796390533447]}, {"auto_scores": [-2.482973575592041, -0.8178629279136658, -3.3884518146514893, -4.180578231811523, -4.87364387512207, -1.7275251150131226, -3.002310037612915]}, {"auto_scores": [-3.5216124057769775, -1.315640926361084, -4.976970195770264, -4.588405132293701, -4.988601207733154, -4.338250160217285, -4.258938789367676]}, {"auto_scores": [-2.2196450233459473, -0.6448060274124146, -4.11163330078125, -8.273467063903809, -3.422013521194458, -2.428804397583008, -4.367648601531982]}, {"auto_scores": [-2.8822271823883057, -1.338028073310852, -2.5680463314056396, -5.671765327453613, -3.6491050720214844, -3.923588752746582, -3.6876883506774902]}, {"auto_scores": [-3.0493404865264893, -1.0289462804794312, -2.6373345851898193, -5.77088737487793, -4.87874174118042, -5.060049057006836, -5.282394886016846]}, {"auto_scores": [-4.5092267990112305, -2.1672258377075195, -5.196324348449707, -4.173991680145264, -5.770926475524902, -4.504154205322266, -5.454038619995117]}, {"auto_scores": [-3.9801697731018066, -1.4928231239318848, -4.76878547668457, -4.193707466125488, -4.717602252960205, -4.891543388366699, -4.786087989807129]}, {"auto_scores": [-4.327728271484375, -5.265812873840332, -4.09039831161499, -5.932196140289307, -5.382634162902832, -4.686864376068115, -5.1508002281188965]}, {"auto_scores": [-3.269033432006836, -4.5389723777771, -4.005921840667725, -6.995456695556641, -4.278707981109619, -3.5476715564727783, -4.695621490478516]}, {"auto_scores": [-3.557901620864868, -2.4186339378356934, -4.851772785186768, -4.371306896209717, -5.103922367095947, -4.333691596984863, -4.298647880554199]}, {"auto_scores": [-4.055961608886719, -4.662075519561768, -3.473954439163208, -7.5713982582092285, -6.1275200843811035, -5.409901142120361, -5.5505499839782715]}, {"auto_scores": [-3.688018798828125, -3.572950839996338, -4.312163352966309, -4.746547698974609, -4.910508155822754, -4.222039222717285, -4.782878398895264]}, {"auto_scores": [-3.908001661300659, -1.983359932899475, -4.276918411254883, -5.791119575500488, -5.717682361602783, -4.877152442932129, -4.80209493637085]}, {"auto_scores": [-3.632587194442749, -3.0536139011383057, -4.381106376647949, -5.96414852142334, -4.910465240478516, -3.78712797164917, -5.2252631187438965]}, {"auto_scores": [-3.524134635925293, -1.4702116250991821, -3.7997453212738037, -6.277534484863281, -5.421000003814697, -4.504569053649902, -4.773496150970459]}, {"auto_scores": [-3.3854877948760986, -4.794503211975098, -4.626842498779297, -4.996484756469727, -5.472954273223877, -4.742428302764893, -6.0292253494262695]}, {"auto_scores": [-3.2534337043762207, -1.3036439418792725, -3.159641981124878, -6.059638023376465, -4.791782379150391, -4.999390125274658, -5.129875659942627]}, {"auto_scores": [-3.7950146198272705, -1.0038492679595947, -3.8807947635650635, -6.902472496032715, -5.115923881530762, -4.9170732498168945, -4.917073726654053]}, {"auto_scores": [-4.213415622711182, -4.250669002532959, -3.8901631832122803, -5.633732318878174, -5.176709175109863, -5.110617637634277, -5.267539024353027]}, {"auto_scores": [-4.235969543457031, -2.604471445083618, -4.817269802093506, -6.817267894744873, -4.734187126159668, -4.550678253173828, -4.019895076751709]}, {"auto_scores": [-4.271620273590088, -3.7753279209136963, -4.285919189453125, -6.411378860473633, -4.8667378425598145, -4.203290939331055, -4.631669521331787]}] \ No newline at end of file diff --git a/integration_tests/artifacts/newsroom/rouge1_f_predictions.json b/integration_tests/artifacts/newsroom/rouge1_f_predictions.json index 482f4989..82778e74 100644 --- a/integration_tests/artifacts/newsroom/rouge1_f_predictions.json +++ b/integration_tests/artifacts/newsroom/rouge1_f_predictions.json @@ -1,662 +1 @@ -[ - { - "auto_scores": [ - 0.9434, - 0.21918, - 0.05263, - 0.19781, - 0.22857, - 0.21539, - 0.47059 - ] - }, - { - "auto_scores": [ - 0.31667, - 0.97561, - 0.54545, - 0.07408, - 0.17858, - 0.18519, - 0.38554 - ] - }, - { - "auto_scores": [ - 0.21052, - 0.9, - 0.16439, - 0.05555, - 0.28572, - 0.28571, - 0.12598 - ] - }, - { - "auto_scores": [ - 0.09411, - 0.15686, - 0.12698, - 0.12372, - 0.75, - 0.04167, - 0.0 - ] - }, - { - "auto_scores": [ - 0.12281, - 0.72, - 0.10811, - 0.0, - 0.11628, - 0.12766, - 0.10526 - ] - }, - { - "auto_scores": [ - 0.2029, - 0.85714, - 0.23188, - 0.15, - 0.25715, - 0.17241, - 0.17647 - ] - }, - { - "auto_scores": [ - 0.64285, - 1.0, - 0.37255, - 0.0, - 0.32117, - 0.77894, - 0.6 - ] - }, - { - "auto_scores": [ - 0.12903, - 0.66667, - 0.15385, - 0.0, - 0.10526, - 0.11111, - 0.11111 - ] - }, - { - "auto_scores": [ - 0.52941, - 0.95652, - 0.19672, - 0.0, - 0.16667, - 0.53731, - 0.45569 - ] - }, - { - "auto_scores": [ - 0.09756, - 0.67692, - 0.10126, - 0.04445, - 0.13158, - 0.07895, - 0.12599 - ] - }, - { - "auto_scores": [ - 0.2078, - 0.90909, - 0.21739, - 0.07407, - 0.15001, - 0.20833, - 0.11321 - ] - }, - { - "auto_scores": [ - 0.08, - 0.43479, - 0.15686, - 0.08334, - 0.09412, - 0.11765, - 0.1282 - ] - }, - { - "auto_scores": [ - 0.34951, - 1.0, - 0.64285, - 0.21053, - 0.4, - 0.11594, - 0.1039 - ] - }, - { - "auto_scores": [ - 0.10937, - 0.66667, - 0.07142, - 0.08333, - 0.10126, - 0.0, - 0.03252 - ] - }, - { - "auto_scores": [ - 0.12903, - 1.0, - 0.82051, - 0.0, - 0.16667, - 0.43243, - 0.33614 - ] - }, - { - "auto_scores": [ - 0.09449, - 0.96552, - 0.14492, - 0.0, - 0.07577, - 0.3077, - 0.06061 - ] - }, - { - "auto_scores": [ - 0.26373, - 0.9, - 0.33334, - 0.0, - 0.12612, - 0.17544, - 0.12903 - ] - }, - { - "auto_scores": [ - 0.33708, - 0.98529, - 0.26263, - 0.02381, - 0.34783, - 0.5, - 0.24242 - ] - }, - { - "auto_scores": [ - 0.18182, - 0.66667, - 0.15385, - 0.0, - 0.0, - 0.16666, - 0.11111 - ] - }, - { - "auto_scores": [ - 0.6055, - 0.93507, - 0.73333, - 0.04545, - 0.46729, - 0.85715, - 0.73333 - ] - }, - { - "auto_scores": [ - 0.0, - 0.33333, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "auto_scores": [ - 0.2, - 0.97297, - 0.21875, - 0.0, - 0.47369, - 0.11111, - 0.21818 - ] - }, - { - "auto_scores": [ - 0.25862, - 0.70968, - 0.2, - 0.03636, - 0.21539, - 0.28948, - 0.30769 - ] - }, - { - "auto_scores": [ - 0.48855, - 0.98462, - 0.23684, - 0.09524, - 0.17778, - 0.76923, - 0.40876 - ] - }, - { - "auto_scores": [ - 0.08265, - 0.78572, - 0.04445, - 0.0, - 0.08602, - 0.03637, - 0.075 - ] - }, - { - "auto_scores": [ - 0.22609, - 1.0, - 0.1608, - 0.03509, - 0.125, - 0.17894, - 0.15384 - ] - }, - { - "auto_scores": [ - 0.27118, - 0.93549, - 0.20513, - 0.0, - 0.30107, - 0.27692, - 0.21782 - ] - }, - { - "auto_scores": [ - 0.42222, - 0.97436, - 0.24, - 0.0, - 0.26415, - 0.73077, - 0.29033 - ] - }, - { - "auto_scores": [ - 0.22535, - 0.875, - 0.3077, - 0.0, - 0.18182, - 0.14634, - 0.13333 - ] - }, - { - "auto_scores": [ - 0.49006, - 1.0, - 0.59259, - 0.09091, - 0.51807, - 0.425, - 0.59649 - ] - }, - { - "auto_scores": [ - 0.11764, - 0.91892, - 0.28169, - 0.11428, - 0.26966, - 0.19231, - 0.11429 - ] - }, - { - "auto_scores": [ - 0.07595, - 0.82927, - 0.14706, - 0.0, - 0.11111, - 0.025, - 0.09302 - ] - }, - { - "auto_scores": [ - 0.69822, - 0.98148, - 0.24096, - 0.04838, - 0.22377, - 0.5, - 0.35616 - ] - }, - { - "auto_scores": [ - 0.42857, - 0.98182, - 0.26471, - 0.16666, - 0.20472, - 0.16129, - 0.18182 - ] - }, - { - "auto_scores": [ - 0.58252, - 0.9697, - 0.12821, - 0.05405, - 0.26471, - 0.21621, - 0.25 - ] - }, - { - "auto_scores": [ - 0.61017, - 0.98591, - 1.0, - 0.16394, - 0.34568, - 0.95652, - 1.0 - ] - }, - { - "auto_scores": [ - 0.43678, - 0.95652, - 0.51613, - 0.10526, - 0.14545, - 0.14545, - 0.19048 - ] - }, - { - "auto_scores": [ - 0.28572, - 0.91892, - 0.20896, - 0.11428, - 0.19672, - 0.28571, - 0.19672 - ] - }, - { - "auto_scores": [ - 0.38938, - 0.97675, - 0.83019, - 0.0, - 0.37624, - 0.56604, - 0.35088 - ] - }, - { - "auto_scores": [ - 0.63889, - 0.97872, - 0.15873, - 0.05263, - 0.07843, - 0.74193, - 0.74193 - ] - }, - { - "auto_scores": [ - 0.12195, - 0.96296, - 0.21818, - 0.2069, - 0.17242, - 0.17242, - 0.24 - ] - }, - { - "auto_scores": [ - 0.33463, - 0.99479, - 0.09216, - 0.0, - 0.31372, - 0.28444, - 0.41216 - ] - }, - { - "auto_scores": [ - 0.42718, - 0.95833, - 0.1, - 0.0, - 0.15152, - 0.10345, - 0.4466 - ] - }, - { - "auto_scores": [ - 0.31343, - 0.97675, - 0.47619, - 0.0, - 0.14458, - 0.16667, - 0.10526 - ] - }, - { - "auto_scores": [ - 0.11494, - 0.93333, - 0.2, - 0.0, - 0.11112, - 0.05, - 0.0708 - ] - }, - { - "auto_scores": [ - 0.17204, - 0.95, - 0.18749, - 0.05556, - 0.16667, - 0.06667, - 0.1791 - ] - }, - { - "auto_scores": [ - 0.16901, - 0.73077, - 0.28986, - 0.0, - 0.19355, - 0.09375, - 0.225 - ] - }, - { - "auto_scores": [ - 0.18868, - 0.78788, - 0.19179, - 0.15, - 0.24657, - 0.1695, - 0.17143 - ] - }, - { - "auto_scores": [ - 0.1519, - 0.84615, - 0.09677, - 0.06667, - 0.03884, - 0.07844, - 0.07273 - ] - }, - { - "auto_scores": [ - 0.12307, - 0.7317, - 0.19047, - 0.0, - 0.1282, - 0.09836, - 0.11594 - ] - }, - { - "auto_scores": [ - 0.08772, - 0.78261, - 0.08696, - 0.07143, - 0.04706, - 0.14545, - 0.10666 - ] - }, - { - "auto_scores": [ - 0.20833, - 0.91666, - 0.24, - 0.16666, - 0.13636, - 0.17857, - 0.22222 - ] - }, - { - "auto_scores": [ - 0.08, - 0.78431, - 0.19178, - 0.0, - 0.26087, - 0.28986, - 0.20619 - ] - }, - { - "auto_scores": [ - 0.16868, - 0.875, - 0.17778, - 0.0, - 0.28, - 0.28, - 0.20589 - ] - }, - { - "auto_scores": [ - 0.12371, - 0.69565, - 0.06558, - 0.06667, - 0.05797, - 0.07692, - 0.05633 - ] - }, - { - "auto_scores": [ - 0.20155, - 0.93333, - 0.2623, - 0.0, - 0.14457, - 0.13793, - 0.18666 - ] - }, - { - "auto_scores": [ - 0.15385, - 0.97143, - 0.16326, - 0.25806, - 0.19178, - 0.09091, - 0.09091 - ] - }, - { - "auto_scores": [ - 0.19672, - 0.79166, - 0.21538, - 0.0, - 0.10204, - 0.21538, - 0.18182 - ] - }, - { - "auto_scores": [ - 0.28948, - 0.85714, - 0.11941, - 0.36364, - 0.31579, - 0.3, - 0.2353 - ] - }, - { - "auto_scores": [ - 0.20833, - 0.90323, - 0.21538, - 0.16, - 0.10417, - 0.10526, - 0.13333 - ] - } -] \ No newline at end of file +[{"auto_scores": [0.47059, 0.9434, 0.21918, 0.05263, 0.19781, 0.22857, 0.21539]}, {"auto_scores": [0.31667, 0.97561, 0.54545, 0.07408, 0.17858, 0.18519, 0.38554]}, {"auto_scores": [0.21052, 0.9, 0.16439, 0.05555, 0.28572, 0.28571, 0.12598]}, {"auto_scores": [0.12372, 0.75, 0.04167, 0.0, 0.09411, 0.15686, 0.12698]}, {"auto_scores": [0.12281, 0.72, 0.10811, 0.0, 0.11628, 0.12766, 0.10526]}, {"auto_scores": [0.2029, 0.85714, 0.23188, 0.15, 0.25715, 0.17241, 0.17647]}, {"auto_scores": [0.64285, 1.0, 0.37255, 0.0, 0.32117, 0.77894, 0.6]}, {"auto_scores": [0.12903, 0.66667, 0.15385, 0.0, 0.10526, 0.11111, 0.11111]}, {"auto_scores": [0.52941, 0.95652, 0.19672, 0.0, 0.16667, 0.53731, 0.45569]}, {"auto_scores": [0.09756, 0.67692, 0.10126, 0.04445, 0.13158, 0.07895, 0.12599]}, {"auto_scores": [0.2078, 0.90909, 0.21739, 0.07407, 0.15001, 0.20833, 0.11321]}, {"auto_scores": [0.08, 0.43479, 0.15686, 0.08334, 0.09412, 0.11765, 0.1282]}, {"auto_scores": [0.34951, 1.0, 0.64285, 0.21053, 0.4, 0.11594, 0.1039]}, {"auto_scores": [0.10937, 0.66667, 0.07142, 0.08333, 0.10126, 0.0, 0.03252]}, {"auto_scores": [0.12903, 1.0, 0.82051, 0.0, 0.16667, 0.43243, 0.33614]}, {"auto_scores": [0.09449, 0.96552, 0.14492, 0.0, 0.07577, 0.3077, 0.06061]}, {"auto_scores": [0.26373, 0.9, 0.33334, 0.0, 0.12612, 0.17544, 0.12903]}, {"auto_scores": [0.33708, 0.98529, 0.26263, 0.02381, 0.34783, 0.5, 0.24242]}, {"auto_scores": [0.18182, 0.66667, 0.15385, 0.0, 0.0, 0.16666, 0.11111]}, {"auto_scores": [0.6055, 0.93507, 0.73333, 0.04545, 0.46729, 0.85715, 0.73333]}, {"auto_scores": [0.0, 0.33333, 0.0, 0.0, 0.0, 0.0, 0.0]}, {"auto_scores": [0.2, 0.97297, 0.21875, 0.0, 0.47369, 0.11111, 0.21818]}, {"auto_scores": [0.25862, 0.70968, 0.2, 0.03636, 0.21539, 0.28948, 0.30769]}, {"auto_scores": [0.48855, 0.98462, 0.23684, 0.09524, 0.17778, 0.76923, 0.40876]}, {"auto_scores": [0.08265, 0.78572, 0.04445, 0.0, 0.08602, 0.03637, 0.075]}, {"auto_scores": [0.22609, 1.0, 0.1608, 0.03509, 0.125, 0.17894, 0.15384]}, {"auto_scores": [0.27118, 0.93549, 0.20513, 0.0, 0.30107, 0.27692, 0.21782]}, {"auto_scores": [0.42222, 0.97436, 0.24, 0.0, 0.26415, 0.73077, 0.29033]}, {"auto_scores": [0.22535, 0.875, 0.3077, 0.0, 0.18182, 0.14634, 0.13333]}, {"auto_scores": [0.49006, 1.0, 0.59259, 0.09091, 0.51807, 0.425, 0.59649]}, {"auto_scores": [0.11764, 0.91892, 0.28169, 0.11428, 0.26966, 0.19231, 0.11429]}, {"auto_scores": [0.07595, 0.82927, 0.14706, 0.0, 0.11111, 0.025, 0.09302]}, {"auto_scores": [0.69822, 0.98148, 0.24096, 0.04838, 0.22377, 0.5, 0.35616]}, {"auto_scores": [0.42857, 0.98182, 0.26471, 0.16666, 0.20472, 0.16129, 0.18182]}, {"auto_scores": [0.58252, 0.9697, 0.12821, 0.05405, 0.26471, 0.21621, 0.25]}, {"auto_scores": [0.61017, 0.98591, 1.0, 0.16394, 0.34568, 0.95652, 1.0]}, {"auto_scores": [0.43678, 0.95652, 0.51613, 0.10526, 0.14545, 0.14545, 0.19048]}, {"auto_scores": [0.28572, 0.91892, 0.20896, 0.11428, 0.19672, 0.28571, 0.19672]}, {"auto_scores": [0.38938, 0.97675, 0.83019, 0.0, 0.37624, 0.56604, 0.35088]}, {"auto_scores": [0.63889, 0.97872, 0.15873, 0.05263, 0.07843, 0.74193, 0.74193]}, {"auto_scores": [0.12195, 0.96296, 0.21818, 0.2069, 0.17242, 0.17242, 0.24]}, {"auto_scores": [0.33463, 0.99479, 0.09216, 0.0, 0.31372, 0.28444, 0.41216]}, {"auto_scores": [0.42718, 0.95833, 0.1, 0.0, 0.15152, 0.10345, 0.4466]}, {"auto_scores": [0.31343, 0.97675, 0.47619, 0.0, 0.14458, 0.16667, 0.10526]}, {"auto_scores": [0.11494, 0.93333, 0.2, 0.0, 0.11112, 0.05, 0.0708]}, {"auto_scores": [0.17204, 0.95, 0.18749, 0.05556, 0.16667, 0.06667, 0.1791]}, {"auto_scores": [0.16901, 0.73077, 0.28986, 0.0, 0.19355, 0.09375, 0.225]}, {"auto_scores": [0.18868, 0.78788, 0.19179, 0.15, 0.24657, 0.1695, 0.17143]}, {"auto_scores": [0.1519, 0.84615, 0.09677, 0.06667, 0.03884, 0.07844, 0.07273]}, {"auto_scores": [0.12307, 0.7317, 0.19047, 0.0, 0.1282, 0.09836, 0.11594]}, {"auto_scores": [0.08772, 0.78261, 0.08696, 0.07143, 0.04706, 0.14545, 0.10666]}, {"auto_scores": [0.20833, 0.91666, 0.24, 0.16666, 0.13636, 0.17857, 0.22222]}, {"auto_scores": [0.08, 0.78431, 0.19178, 0.0, 0.26087, 0.28986, 0.20619]}, {"auto_scores": [0.16868, 0.875, 0.17778, 0.0, 0.28, 0.28, 0.20589]}, {"auto_scores": [0.12371, 0.69565, 0.06558, 0.06667, 0.05797, 0.07692, 0.05633]}, {"auto_scores": [0.20155, 0.93333, 0.2623, 0.0, 0.14457, 0.13793, 0.18666]}, {"auto_scores": [0.15385, 0.97143, 0.16326, 0.25806, 0.19178, 0.09091, 0.09091]}, {"auto_scores": [0.19672, 0.79166, 0.21538, 0.0, 0.10204, 0.21538, 0.18182]}, {"auto_scores": [0.28948, 0.85714, 0.11941, 0.36364, 0.31579, 0.3, 0.2353]}, {"auto_scores": [0.20833, 0.90323, 0.21538, 0.16, 0.10417, 0.10526, 0.13333]}] \ No newline at end of file diff --git a/integration_tests/meta_eval_nlg_test.py b/integration_tests/meta_eval_nlg_test.py index 6ae36821..1f5465be 100644 --- a/integration_tests/meta_eval_nlg_test.py +++ b/integration_tests/meta_eval_nlg_test.py @@ -219,6 +219,7 @@ def test_coherence_rouge1_f(self): .value ) self.assertGreater(len(sys_info.results.analyses), 0) + # Replicate the Table 4 result in paper: https://arxiv.org/pdf/2106.11520.pdf self.assertAlmostEqual( overall_score, 0.0946, @@ -248,6 +249,8 @@ def test_coherence_bartscore(self): .value ) self.assertGreater(len(sys_info.results.analyses), 0) + # Replicate the Table 4 result in paper: + # https://github.com/neulab/BARTScore#reproduce self.assertAlmostEqual( overall_score, 0.3157,