Skip to content

Commit

Permalink
[#160463344] fix repeating segment regex (#147)
Browse files Browse the repository at this point in the history
  • Loading branch information
MothOnMars authored Sep 25, 2018
1 parent 3b26377 commit cc927cc
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 1 deletion.
2 changes: 1 addition & 1 deletion app/models/searchgov_crawler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def skip_extensions_regex

# avoid infinite loops caused by malformed urls
#
# Returns a Regexp that flags URLs in which the same path segment shows up
# three or more times (one captured occurrence plus at least two repeats).
def repeating_segments_regex
# NOTE(review): this excerpt is a rendered diff ("1 addition & 1 deletion"),
# so the first literal appears to be the removed line and the second the
# added one. Read as plain Ruby, only the last literal is returned.
#
# Earlier pattern: captures "/" plus word characters (letters, digits,
# underscore) and requires two or more later repeats of that text. Nothing
# ties the capture to a segment boundary, so it can match a partial segment.
/(\/\w+)(?:.+?\1){2,}/
# Current pattern: captures "/" plus letters only, and the lookahead (?=\/)
# requires each occurrence to be followed by another "/", i.e. to be a whole
# segment. {2,} demands at least two further occurrences after the first.
/(\/[[:alpha:]]+)(?=\/)(.*\1(?=\/)){2,}/
end

def supported_content_type(type)
Expand Down
13 changes: 13 additions & 0 deletions spec/models/searchgov_crawler_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -316,4 +316,17 @@
end
end
end

# Testing this private method directly for test speed
# (`send` is used because it can invoke a private method; `crawler` is
# defined outside this excerpt, presumably by an enclosing `let`; verify.)
describe '.repeating_segments_regex' do
subject(:regex) { crawler.send(:repeating_segments_regex) }

# Same segment three times in a row.
it { is_expected.to match 'http://www.agency.gov/foo/foo/foo/' }
# Same segment three times, with other segments in between.
it { is_expected.to match 'http://www.agency.gov/foo/baz/foo/biz/foo/qux/' }
# All segments distinct.
it { is_expected.not_to match 'http://www.agency.gov/fee/fie/foe/' }
# Only two occurrences of the segment; three are required.
it { is_expected.not_to match 'http://www.agency.gov/foo/foo/' }
# A short segment ('/f') that is merely a prefix of later segments must not
# count as repeating.
it { is_expected.not_to match 'http://www.agency.gov/f/foo/foo' }
# Repetition inside a single segment is not a repeated segment.
it { is_expected.not_to match 'http://www.agency.gov/foofoofoo/' }
# Purely numeric segments are expected not to match (presumably to allow
# date-style paths; confirm intent with the linked ticket).
it { is_expected.not_to match 'http://www.agency.gov/09/09/09/' }
end
end

0 comments on commit cc927cc

Please sign in to comment.