Tighten repeating_segments_regex in app/models/searchgov_crawler.rb and add specs.

The new pattern captures a slash followed by one or more alphabetic characters,
requires that capture to be followed by another slash, and then requires the
same capture (again followed by a slash) at least two more times. The previous
pattern used \w+ and did not require a slash after the repeated text.

diff --git a/app/models/searchgov_crawler.rb b/app/models/searchgov_crawler.rb
index edcaa35097..a3ec540c85 100644
--- a/app/models/searchgov_crawler.rb
+++ b/app/models/searchgov_crawler.rb
@@ -77,7 +77,7 @@ def skip_extensions_regex
 
   # avoid infinite loops caused by malformed urls
   def repeating_segments_regex
-    /(\/\w+)(?:.+?\1){2,}/
+    /(\/[[:alpha:]]+)(?=\/)(.*\1(?=\/)){2,}/
   end
 
   def supported_content_type(type)
diff --git a/spec/models/searchgov_crawler_spec.rb b/spec/models/searchgov_crawler_spec.rb
index 4fb550d19f..4500aa6cf8 100644
--- a/spec/models/searchgov_crawler_spec.rb
+++ b/spec/models/searchgov_crawler_spec.rb
@@ -316,4 +316,17 @@
       end
     end
   end
+
+  # Testing this private method directly for test speed
+  describe '.repeating_segments_regex' do
+    subject(:regex) { crawler.send(:repeating_segments_regex) }
+
+    it { is_expected.to match 'http://www.agency.gov/foo/foo/foo/' }
+    it { is_expected.to match 'http://www.agency.gov/foo/baz/foo/biz/foo/qux/' }
+    it { is_expected.not_to match 'http://www.agency.gov/fee/fie/foe/' }
+    it { is_expected.not_to match 'http://www.agency.gov/foo/foo/' }
+    it { is_expected.not_to match 'http://www.agency.gov/f/foo/foo' }
+    it { is_expected.not_to match 'http://www.agency.gov/foofoofoo/' }
+    it { is_expected.not_to match 'http://www.agency.gov/09/09/09/' }
+  end
 end