Skip to content

Commit

Permalink
Merge pull request #271 from Sprachprofi/patch-2
Browse files Browse the repository at this point in the history
Extracting 'words' in the commonly-understood sense of the word
  • Loading branch information
ioquatix authored Aug 28, 2018
2 parents 62f3667 + ce2df14 commit 17138ae
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 1 deletion.
14 changes: 13 additions & 1 deletion lib/core/facets/string/words.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@ class String
def words
  # Split the receiver into whitespace-separated words.
  #
  # A single-space pattern selects String#split's whitespace mode, which
  # ignores leading whitespace as well as runs of whitespace. Splitting on
  # /\s+/ instead would yield a spurious "" as the first element whenever
  # the string starts with whitespace (" a b" => ["", "a", "b"]).
  #
  # Returns an Array of Strings; empty for an empty or all-whitespace string.
  split(' ')
end


# Returns an array of words in the commonly-understood sense (not including punctuation).
# This takes into account international punctuation characters as well as English ones.
#
# Apostrophes and hyphens inside a word are kept ("don't", "Dupont-Fleury");
# a hyphen that is directly followed by a space is dropped.
#
#   'Slowly, grudgingly he said: "This has to stop."'.words_without_punctuation
#   # => ["Slowly", "grudgingly", "he", "said", "This", "has", "to", "stop"]
#
# Returns an Array of Strings. The receiver itself is not modified.
def words_without_punctuation
  # Work on a copy so the receiver is left untouched (and frozen strings work).
  text = dup

  # Every punctuation mark is listed exactly once; repeating a character in a
  # character class makes Ruby emit a "duplicated range" warning under -w.
  # The fullwidth exclamation mark, comma, colon and question mark (U+FF01,
  # U+FF0C, U+FF1A, U+FF1F) are written as \u escapes so they cannot be
  # confused with their ASCII look-alikes.
  text.gsub!(/[.?¿¡…!,:;—"。、‘“”„«»〈〉《》\/\[\]\uFF01\uFF0C\uFF1A\uFF1F]/, ' ')

  # A hyphen followed by a space is a dash, not part of a compound word.
  text.gsub!('- ', ' ')

  # split(' ') already ignores runs of whitespace, so no squeeze is needed.
  text.strip.split(' ')
end
end

41 changes: 41 additions & 0 deletions test/core/string/test_words.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,46 @@
end

end

method :words_without_punctuation do

  # English: repeated terminal punctuation and quotes go, inner apostrophes stay.
  test do
    sentence = "How?? I don't believe you!!!"
    expected = %w[How I don't believe you]
    sentence.words_without_punctuation.assert == expected

    sentence = 'Slowly, grudgingly he said: "This has to stop."'
    expected = %w[Slowly grudgingly he said This has to stop]
    sentence.words_without_punctuation.assert == expected
  end

  # French: guillemets and spaced exclamation marks go, hyphenated names stay whole.
  test do
    sentence = "« Bonjour ! J'ai rendezvous avec mademoiselle Dupont-Fleury ! »"
    expected = %w[Bonjour J'ai rendezvous avec mademoiselle Dupont-Fleury]
    sentence.words_without_punctuation.assert == expected
  end

  # Spanish: inverted marks, guillemets and the ellipsis character are removed.
  test do
    sentence = "«¡María, te amo!», exclamó Juan. … «¿Por qué me sigues mintiendo?"
    expected = %w[María te amo exclamó Juan Por qué me sigues mintiendo]
    sentence.words_without_punctuation.assert == expected
  end

  # Italian: semicolon and colon act as word separators.
  test do
    sentence = 'Alcune persone scrivono al computer; altre con la penna: io con le due.'
    expected = %w[Alcune persone scrivono al computer altre con la penna io con le due]
    sentence.words_without_punctuation.assert == expected
  end

  # German: curly and low quotation marks are removed.
  test do
    sentence = '“chevron,” „französische“ Anführungszeichen'
    expected = %w[chevron französische Anführungszeichen]
    sentence.words_without_punctuation.assert == expected
  end

  # Russian: em dashes and straight quotes around Cyrillic text are removed.
  test do
    sentence = '"А ты прав." — сказал он, — "Я великолепен!".'
    expected = %w[А ты прав сказал он Я великолепен]
    sentence.words_without_punctuation.assert == expected
  end

end

end

0 comments on commit 17138ae

Please sign in to comment.