Skip to content

Commit

Permalink
No public description
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 674062722
  • Loading branch information
roark-google authored and copybara-github committed Sep 13, 2024
1 parent 9155350 commit 5ea9261
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 14 deletions.
3 changes: 3 additions & 0 deletions nisaba/translit/tools/calculate_error_rate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,9 @@ void MultiRefErrorRate::CalculateErrorRate(bool pairwise_edits) {
void MultiRefErrorRate::CalculateErrorRate(absl::string_view reffile,
absl::string_view testfile,
bool pairwise_edits) {
if (output_syms_.Find("<epsilon>") < 0) {
output_syms_.AddSymbol("<epsilon>");
}
ReadInputs(reffile, /*is_reference=*/true);
ReadInputs(testfile, /*is_reference=*/false);
CalculateErrorRate(pairwise_edits);
Expand Down
23 changes: 10 additions & 13 deletions nisaba/translit/tools/calculate_error_rate.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ struct EditDistanceDouble {
double insertions = 0.;
double deletions = 0.;

EditDistanceDouble() {}
EditDistanceDouble() = default;

explicit EditDistanceDouble(const EditDistanceInt &x)
: reference_length(x.reference_length),
Expand Down Expand Up @@ -101,25 +101,27 @@ struct EditDistanceDouble {
// Class for calculating error rates possibly from multiple references.
class MultiRefErrorRate {
public:
MultiRefErrorRate() = default;

explicit MultiRefErrorRate(bool is_split_chars)
: is_split_chars_(is_split_chars) {
output_syms_.AddSymbol("<epsilon>");
}
explicit MultiRefErrorRate(bool is_split_chars = true)
: is_split_chars_(is_split_chars) {}

// Calculates multi-reference error rate for the given test file. If
// pairwise_edits is set to true, provides pairwise edits between all reference
// and test outputs for the same input.
void CalculateErrorRate(absl::string_view reffile, absl::string_view testfile,
bool pairwise_edits);
bool pairwise_edits = false);

// Writes results to output file.
void Write(absl::string_view ofile, bool pairwise_edits);

// Calculates error rate and optionally writes to provided file pointer.
double CalcErrorRate();

protected:
// Returns tokenized string associated with idx and k-th item from test_input_
// if is_test_item is true, otherwise references_.
std::vector<std::string> GetTokenizedString(int idx, int k,
bool is_test_item = false) const;

private:
// Reads in TSV input file, either reference or test file.
// Column 1 should be the example index, starting from 0.
Expand All @@ -128,11 +130,6 @@ class MultiRefErrorRate {
// For the reference file this is a count; for test a -log prob.
void ReadInputs(absl::string_view input_file, bool is_reference);

// Returns tokenized string associated with idx and k-th item from test_input_
// if is_test_item is true, otherwise references_.
std::vector<std::string> GetTokenizedString(int idx, int k,
bool is_test_item) const;

// Scans through data set and calculates error rate.
void CalculateErrorRate(bool pairwise_edits);

Expand Down
38 changes: 37 additions & 1 deletion nisaba/translit/tools/calculate_error_rate_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ namespace {

constexpr float kFloatDelta = 0.00001; // Delta for float comparisons.

// Test-only subclass of MultiRefErrorRate. It re-exports the protected
// GetTokenizedString accessor as public so that tests can call it directly.
class MultiRefErrorRateHelper : public MultiRefErrorRate {
 public:
  MultiRefErrorRateHelper() = default;

  // Widens access to the base-class tokenization accessor.
  using MultiRefErrorRate::GetTokenizedString;
};

class MultiRefErrorRateTest : public ::testing::Test {
protected:
// Creates first file for testing error rate.
Expand Down Expand Up @@ -71,7 +77,37 @@ class MultiRefErrorRateTest : public ::testing::Test {
std::string file_two_; // File name for second k-best output.
};

// TODO: create test of GetTokenizedString to test bad indices.
// Checks GetTokenizedString for both in-range and out-of-range example/item
// indices: a lookup that misses is expected to return an empty vector. The
// per-example item counts referred to below come from the fixture files.
TEST_F(MultiRefErrorRateTest, GetTokenizedStringTests) {
  // file_one_ acts as the reference and file_two_ as the test output.
  MultiRefErrorRateHelper calc;
  calc.CalculateErrorRate(/*reffile=*/file_one_, /*testfile=*/file_two_);

  // Shorthand for reference-side and test-side lookups.
  const auto ref_tokens = [&calc](int idx, int k) {
    return calc.GetTokenizedString(idx, k, /*is_test_item=*/false);
  };
  const auto test_tokens = [&calc](int idx, int k) {
    return calc.GetTokenizedString(idx, k, /*is_test_item=*/true);
  };

  // Example 0, item 0 is present in the reference.
  EXPECT_FALSE(ref_tokens(0, 0).empty());

  // Negative example index is out of range on both sides.
  EXPECT_TRUE(ref_tokens(-1, 0).empty());
  EXPECT_TRUE(test_tokens(-1, 0).empty());

  // Example index past the last example is out of range on both sides.
  EXPECT_TRUE(ref_tokens(3, 0).empty());
  EXPECT_TRUE(test_tokens(3, 0).empty());

  // Example 1 has more than one item in both reference and test.
  EXPECT_FALSE(ref_tokens(1, 1).empty());
  EXPECT_FALSE(test_tokens(1, 1).empty());

  // Example 2 has a single item on both sides, so item 1 does not exist.
  EXPECT_TRUE(ref_tokens(2, 1).empty());
  EXPECT_TRUE(test_tokens(2, 1).empty());

  // Example 1 has 2 reference items but 3 test items, so item 2 exists only
  // on the test side.
  EXPECT_TRUE(ref_tokens(1, 2).empty());
  EXPECT_FALSE(test_tokens(1, 2).empty());
}

// Verifies minimum error rate calculation in both directions.
TEST_F(MultiRefErrorRateTest, CorrectMinErrorRates) {
// Initializing calculator for use with file_one_ as reference.
Expand Down

0 comments on commit 5ea9261

Please sign in to comment.