Skip to content

Commit

Permalink
Stopwords with 't are removed, and words with 's are properly split, …
Browse files Browse the repository at this point in the history
…remove extra spacing from results
  • Loading branch information
bendangelo committed Dec 6, 2023
1 parent 42fc3a8 commit 23d6279
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 9 deletions.
14 changes: 8 additions & 6 deletions lib/keyphrase.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ class Keyphrase

autoload :Stoplist, "keyphrase/stoplist"

CLEAN_REGEX = /([^a-zA-Z0-9'\- \.]|(?<!\w)'|(?<!\w)\.)/
BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-\']+\b/
SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|(?<!\w)'(?!\w)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
CLEAN_REGEX = /([^a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-]+\b|\'/ # remove words with no letters, ie 123.23.12. And last chance to remove '
CLEAN_SPACES_REGEX = /\s+/
SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u

def self.analyse text, options={}
@@keyphrase ||= Keyphrase.new
Expand All @@ -23,10 +24,11 @@ def analyse text, options={}
sort = options[:sort] || true
blacklist = options[:blacklist] || BLACKLIST_REGEX
sentences_regex = options[:sentences_regex] || SENTENCES_REGEX
clean_spaces_regex = options[:clean_spaces_regex] || CLEAN_SPACES_REGEX

pattern = buildStopwordRegExPattern stoplist, lang
sentences = text.split sentences_regex
phrases = generateCandidateKeywords sentences, pattern, clean_regex, blacklist
phrases = generateCandidateKeywords sentences, pattern, clean_regex, blacklist, clean_spaces_regex
wordscores = calculateWordScores phrases
candidates = generateCandidateKeywordScores phrases, wordscores, position_bonus

Expand Down Expand Up @@ -61,14 +63,14 @@ def buildStopwordRegExPattern stopwords, lang

# generate candidate keywords
# 2
def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist
def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist, clean_spaces_regex
phrases = Array.new

filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|") }

filtered_sentences.each do |parts|
parts.split("|").each do |part|
part = part.gsub(blacklist, " ").strip
part = part.gsub(blacklist, " ").gsub(clean_spaces_regex, " ").strip

if !part.empty?
phrases.push part
Expand Down
6 changes: 3 additions & 3 deletions spec/keyphrase_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@
it do
result = Keyphrase.analyse "Secrets Of The Smithsonian: Humanity's Hidden History | Jay Myers"

expect(result.keys).to eq ["Humanity's Hidden History", "The Smithsonian", "Jay Myers", "Secrets"]
expect(result.keys).to eq ["The Smithsonian", "Hidden History", "Jay Myers", "Secrets", "Humanity"]
end

it "should remove non-words" do
result = Keyphrase.analyse "re-move these's: 's & + ! @ # $ % ^ & * ( ) \\ | [ ] { } / ? ~ ` = - _ , . 🎄 £"
result = Keyphrase.analyse "re-move these's: 's & + ! @ # $ % ^ & * ( ) \\ | [ ] { } / ? ~ ` = - _ , . 🎄 £ tommy'humanity fff' 'fff don't"

expect(result.keys).to eq ["re-move these's"]
expect(result.keys).to eq ["tommy humanity fff fff", "re-move"]
end

it do
Expand Down

0 comments on commit 23d6279

Please sign in to comment.