class String

  # Returns an array of the chunks of the string separated by runs of
  # whitespace. Punctuation stays attached to the neighbouring word.
  #
  #   "a few  words!".words  #=> ["a", "few", "words!"]
  #
  # Because the split uses a regular expression, a string that starts with
  # whitespace yields a leading empty string.
  def words
    split(/\s+/)
  end

  # Returns an array of words in the commonly-understood sense, i.e. without
  # the punctuation around them. Besides English punctuation this strips
  # inverted Spanish marks, guillemets, typographic and German quotes, the
  # em dash, the ellipsis character, parentheses, brackets, slashes and CJK
  # marks together with their fullwidth forms.
  #
  # Apostrophes and hyphens inside a word are kept, so contractions and
  # compound names survive intact. A hyphen (or run of hyphens) that is
  # followed by whitespace or by the end of the string is dropped.
  #
  #   'Slowly, grudgingly he said: "This has to stop."'.words_without_punctuation
  #   => ["Slowly", "grudgingly", "he", "said", "This", "has", "to", "stop"]
  #
  #   "J'ai rendezvous avec mademoiselle Dupont-Fleury !".words_without_punctuation
  #   => ["J'ai", "rendezvous", "avec", "mademoiselle", "Dupont-Fleury"]
  #
  # The receiver is not modified. Returns an empty array when the string
  # contains no words.
  def words_without_punctuation
    # Fullwidth forms are written as \u escapes so they cannot be silently
    # folded into their ASCII look-alikes by an editor or transcoder:
    # FF01 !, FF08 (, FF09 ), FF0C comma, FF1A colon, FF1F ?.
    spaced = gsub(/[.?¿¡…!,:;—"()。、‘“”„«»〈〉《》\/\[\]\uFF01\uFF08\uFF09\uFF0C\uFF1A\uFF1F]/, ' ')

    # Punctuation is blanked first so that a hyphen directly before a
    # punctuation mark counts as dangling here.
    spaced = spaced.gsub(/-+(?=\s|\z)/, ' ')

    # Splitting on a single space trims both ends and collapses whitespace
    # runs, so no separate squeeze/strip is needed.
    spaced.split(' ')
  end
end