Commit

fix: resolving pylint errors
init-22 committed Dec 22, 2024
1 parent d4aa90a commit 5e348e4
Showing 1 changed file with 71 additions and 61 deletions.
132 changes: 71 additions & 61 deletions algorithmic_efficiency/workloads/wmt/bleu.py
@@ -1,5 +1,6 @@
"""
Removing the dependency on sacrebleu, we reimplement the BLEU score computation in this file.
Removing the dependency on sacrebleu, we reimplement the BLEU score computation
in this file.
Reference:
https://github.com/mjpost/sacrebleu/blob/v1.3.1/sacrebleu.py.
"""
@@ -42,7 +43,8 @@ def my_log(num):

def tokenize_13a(line):
"""
Tokenizes an input line using a relatively minimal tokenization that is however equivalent to mteval-v13a, used by WMT.
Tokenizes an input line using a relatively minimal tokenization that is
however equivalent to mteval-v13a, used by WMT.
:param line: a segment to tokenize
:return: the tokenized line
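
As a point of reference, a simplified sketch of this style of normalization; the helper name and the reduced rule set below are illustrative, not the code in this file:

import re

def tokenize_13a_sketch(line):
  # Pad the line, split periods and commas off unless they touch a digit,
  # split dashes that follow a digit, then collapse whitespace.
  norm = ' {} '.format(line)
  norm = re.sub(r'([^0-9])([\.,])', r'\1 \2 ', norm)
  norm = re.sub(r'([\.,])([^0-9])', r' \1 \2', norm)
  norm = re.sub(r'([0-9])(-)', r'\1 \2 ', norm)
  norm = re.sub(r'\s+', ' ', norm).strip()
  return norm.split()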
@@ -80,6 +82,7 @@ class UnicodeRegex:
without depending on https://pypi.python.org/pypi/regex/."""

@staticmethod
def _property_chars(prefix):
return ''.join(
chr(x)
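
The truncated generator above presumably enumerates every code point whose Unicode category starts with the given prefix; a self-contained sketch of that idea (the name is illustrative):

import sys
import unicodedata

def property_chars_sketch(prefix):
  # All code points whose Unicode category starts with `prefix`,
  # e.g. 'P' for punctuation or 'S' for symbols.
  return ''.join(
      chr(x)
      for x in range(sys.maxunicode)
      if unicodedata.category(chr(x)).startswith(prefix))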
@@ -95,20 +98,23 @@ def _property_chars(prefix):
def tokenize_v14_international(string):
r"""Tokenize a string following the official BLEU implementation.
See https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L954-L983
See
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L954-L983
In our case, the input string is expected to be just one line
and no HTML entities de-escaping is needed.
So we just tokenize on punctuation and symbols,
except when a punctuation is preceded and followed by a digit
(e.g. a comma/dot as a thousand/decimal separator).
Note that a number (e.g., a year) followed by a dot at the end of sentence is NOT tokenized,
Note that a number (e.g., a year) followed by a dot at the end of sentence
is NOT tokenized,
i.e. the dot stays with the number because `s/(\p{P})(\P{N})/ $1 $2/g`
does not match this case (unless we add a space after each sentence).
However, this error is already in the original mteval-v14.pl
and we want to be consistent with it.
The error is not present in the non-international version,
which uses `$norm_text = " $norm_text "` (or `norm = " {} ".format(norm)` in Python).
which uses,
`$norm_text = " $norm_text "` (or `norm = " {} ".format(norm)` in Python).
:param string: the input string
:return: a list of tokens
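
A rough, self-contained sketch of that rule; `punct` and `symbols` stand for the character sets a property-chars helper like the one sketched above would produce, and the function name is illustrative:

import re

def tokenize_v14_sketch(string, punct, symbols):
  # Split punctuation off unless it sits between two digits, split every
  # symbol, then tokenize on whitespace.
  nondigit_punct = re.compile(r'([^\d])([' + re.escape(punct) + r'])')
  punct_nondigit = re.compile(r'([' + re.escape(punct) + r'])([^\d])')
  symbol = re.compile(r'([' + re.escape(symbols) + r'])')
  string = nondigit_punct.sub(r'\1 \2 ', string)
  string = punct_nondigit.sub(r' \1 \2', string)
  string = symbol.sub(r' \1 ', string)
  return string.strip().split()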
@@ -123,26 +129,28 @@ def tokenize_zh(sentence):
"""MIT License
Copyright (c) 2017 - Shujian Huang <[email protected]>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
The tokenization of Chinese text in this script contains two steps: separate each Chinese
characters (by utf-8 encoding); tokenize the non Chinese part (following the mteval script).
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files
(the "Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to the
following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
USE OR OTHER DEALINGS IN THE SOFTWARE.
The tokenization of Chinese text in this script contains two steps:
separate each Chinese characters (by utf-8 encoding);
tokenize the non Chinese part (following the mteval script).
Author: Shujian Huang [email protected]
:param sentence: input sentence
@@ -151,54 +159,53 @@ def tokenize_zh(sentence):

def is_chinese_char(uchar):
"""
:param uchar: input char in unicode
:return: whether the input char is a Chinese character.
"""
if uchar >= u'\u3400' and uchar <= u'\u4db5': # CJK Unified Ideographs Extension A, release 3.0
:param uchar: input char in unicode
:return: whether the input char is a Chinese character.
"""
if "\u3400" <= uchar <= "\u4db5":
return True
elif uchar >= u'\u4e00' and uchar <= u'\u9fa5': # CJK Unified Ideographs, release 1.1
elif "\u4e00" <= uchar <= "\u9fa5":
return True
elif uchar >= u'\u9fa6' and uchar <= u'\u9fbb': # CJK Unified Ideographs, release 4.1
elif "\u9fa6" <= uchar <= "\u9fbb":
return True
elif uchar >= u'\uf900' and uchar <= u'\ufa2d': # CJK Compatibility Ideographs, release 1.1
elif "\uf900" <= uchar <= "\ufa2d":
return True
elif uchar >= u'\ufa30' and uchar <= u'\ufa6a': # CJK Compatibility Ideographs, release 3.2
elif "\ufa30" <= uchar <= "\ufa6a":
return True
elif uchar >= u'\ufa70' and uchar <= u'\ufad9': # CJK Compatibility Ideographs, release 4.1
elif "\ufa70" <= uchar <= "\ufad9":
return True
elif uchar >= u'\u20000' and uchar <= u'\u2a6d6': # CJK Unified Ideographs Extension B, release 3.1
elif "\u20000" <= uchar <= "\u2a6d6":
return True
elif uchar >= u'\u2f800' and uchar <= u'\u2fa1d': # CJK Compatibility Supplement, release 3.1
elif "\u2f800" <= uchar <= "\u2fa1d":
return True
elif uchar >= u'\uff00' and uchar <= u'\uffef': # Full width ASCII, full width of English punctuation, half width Katakana, half wide half width kana, Korean alphabet
elif "\uff00" <= uchar <= "\uffef":
return True
elif uchar >= u'\u2e80' and uchar <= u'\u2eff': # CJK Radicals Supplement
elif "\u2e80" <= uchar <= "\u2eff":
return True
elif uchar >= u'\u3000' and uchar <= u'\u303f': # CJK punctuation mark
elif "\u3000" <= uchar <= "\u303f":
return True
elif uchar >= u'\u31c0' and uchar <= u'\u31ef': # CJK stroke
elif "\u31c0" <= uchar <= "\u31ef":
return True
elif uchar >= u'\u2f00' and uchar <= u'\u2fdf': # Kangxi Radicals
elif "\u2f00" <= uchar <= "\u2fdf":
return True
elif uchar >= u'\u2ff0' and uchar <= u'\u2fff': # Chinese character structure
elif "\u2ff0" <= uchar <= "\u2fff":
return True
elif uchar >= u'\u3100' and uchar <= u'\u312f': # Phonetic symbols
elif "\u3100" <= uchar <= "\u312f":
return True
elif uchar >= u'\u31a0' and uchar <= u'\u31bf': # Phonetic symbols (Taiwanese and Hakka expansion)
elif "\u31a0" <= uchar <= "\u31bf":
return True
elif uchar >= u'\ufe10' and uchar <= u'\ufe1f':
elif "\ufe10" <= uchar <= "\ufe1f":
return True
elif uchar >= u'\ufe30' and uchar <= u'\ufe4f':
elif "\ufe30" <= uchar <= "\ufe4f":
return True
elif uchar >= u'\u2600' and uchar <= u'\u26ff':
elif "\u2600" <= uchar <= "\u26ff":
return True
elif uchar >= u'\u2700' and uchar <= u'\u27bf':
elif "\u2700" <= uchar <= "\u27bf":
return True
elif uchar >= u'\u3200' and uchar <= u'\u32ff':
elif "\u3200" <= uchar <= "\u32ff":
return True
elif uchar >= u'\u3300' and uchar <= u'\u33ff':
elif "\u3300" <= uchar <= "\u33ff":
return True

return False

sentence = sentence.strip()
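
A minimal sketch of the two steps described above; `is_chinese_char` is passed in here only to keep the sketch self-contained (in the file it is the nested helper shown above):

def tokenize_zh_sketch(sentence, is_chinese_char):
  # Step 1: put spaces around every Chinese character so that each one
  # becomes its own token; other characters pass through unchanged.
  sentence = sentence.strip()
  padded = []
  for char in sentence:
    if is_chinese_char(char):
      padded.append(' ' + char + ' ')
    else:
      padded.append(char)
  # Step 2: the non-Chinese spans would then go through the mteval-style
  # punctuation tokenization (see the v14 sketch above).
  return ' '.join(''.join(padded).split())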
@@ -280,13 +287,13 @@ def ref_stats(output, refs):
closest_len = reflen

ngrams_ref = extract_ngrams(ref)
for ngram in ngrams_ref.keys():
for ngram in ngrams_ref:
ngrams[ngram] = max(ngrams[ngram], ngrams_ref[ngram])

return ngrams, closest_diff, closest_len
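
`extract_ngrams` is not shown in this hunk; a sketch of the usual sacrebleu behaviour it is presumed to follow (the signature here is an assumption):

from collections import Counter

def extract_ngrams_sketch(segment, min_order=1, max_order=4):
  # Count every n-gram of the whitespace-tokenized segment for
  # n = min_order .. max_order, keyed by the space-joined n-gram.
  tokens = segment.split()
  ngrams = Counter()
  for n in range(min_order, max_order + 1):
    for i in range(len(tokens) - n + 1):
      ngrams[' '.join(tokens[i:i + n])] += 1
  return ngrams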


BLEU = namedtuple('BLEU',
BLEU = namedtuple('BLEU',
'score, counts, totals, precisions, bp, sys_len, ref_len')


@@ -299,8 +306,9 @@ def compute_bleu(correct: List[int],
use_effective_order=False) -> BLEU:
"""Computes BLEU score from its sufficient statistics. Adds smoothing.
Smoothing methods (citing "A Systematic Comparison of Smoothing Techniques for Sentence-Level BLEU",
Boxing Chen and Colin Cherry, WMT 2014: http://aclweb.org/anthology/W14-3346)
Smoothing methods (citing "A Systematic Comparison of Smoothing Techniques
for Sentence-Level BLEU", Boxing Chen and Colin Cherry,
WMT 2014: http://aclweb.org/anthology/W14-3346)
- exp: NIST smoothing method (Method 3)
- floor: Method 1
@@ -312,7 +320,7 @@ def compute_bleu(correct: List[int],
:param sys_len: The cumulative system length
:param ref_len: The cumulative reference length
:param smooth: The smoothing method to use
:param smooth_value: The smoothing value added, if smooth method 'floor' is used
:param smooth_value: The smoothing value added, if smooth is 'floor'
:param use_effective_order: Use effective order.
:return: A BLEU object with the score (100-based) and other statistics.
"""
@@ -340,10 +348,12 @@ def compute_bleu(correct: List[int],
else:
precisions[n] = 100. * correct[n] / total[n]

# If the system guesses no i-grams, 1 <= i <= NGRAM_ORDER, the BLEU score is 0 (technically undefined).
# This is a problem for sentence-level BLEU or a corpus of short sentences, where systems will get no credit
# if sentence lengths fall under the NGRAM_ORDER threshold. This fix scales NGRAM_ORDER to the observed
# maximum order. It is only available through the API and off by default
# If the system guesses no i-grams, 1 <= i <= NGRAM_ORDER, the BLEU
# score is 0 (technically undefined). This is a problem for sentence-level
# BLEU or a corpus of short sentences, where systems will get no credit
# if sentence lengths fall under the NGRAM_ORDER threshold. This fix scales
# NGRAM_ORDER to the observed maximum order.
# It is only available through the API and off by default

brevity_penalty = 1.0
if sys_len < ref_len:
@@ -374,7 +384,7 @@ def corpus_bleu(sys_stream: Sequence[str],
:param force: Ignore data that looks already tokenized.
:param lowercase: Lowercase the data.
:param tokenize: The tokenizer to use.
:return: A BLEU object containing everything you'd want.
:return: A BLEU object containing everything you'd want.
"""

# Add some robustness to the input arguments.
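
Hypothetical usage; the reference-stream layout follows the docstring above and sacrebleu's convention, and is not verified in this diff:

hypotheses = ['the cat sat on the mat']
references = [['the cat is on the mat']]  # one inner list per reference stream
result = corpus_bleu(hypotheses, references)
print(result.score)  # 100-based BLEU score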