From a062e313d1e16e2108d5fafe621da35d6ffd1dae Mon Sep 17 00:00:00 2001
From: michael-lewis
Date: Sun, 25 Jun 2023 21:04:57 +0100
Subject: [PATCH] Implemented #94: Detect if page content has changed

---
 .../indexer/spiders/search_my_site_parser.py | 47 +++++++++++++++++++
 src/indexing/search_my_site_scheduler.py     | 29 ++++++++----
 2 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/src/indexing/indexer/spiders/search_my_site_parser.py b/src/indexing/indexer/spiders/search_my_site_parser.py
index a4637cb..28db8a5 100644
--- a/src/indexing/indexer/spiders/search_my_site_parser.py
+++ b/src/indexing/indexer/spiders/search_my_site_parser.py
@@ -19,6 +19,7 @@
 #
 #
 #
+#
 #
 #
 #
@@ -199,6 +200,8 @@ def customparser(response, domain, is_home, domains_for_indexed_links, site_conf
     item['tags'] = tag_list

     # content
+    # Note that if the logic to generate content_text changes in any way, even just in the way white space is treated,
+    # then that will trigger new values for content_last_modified, even if the actual content hasn't changed
     only_body = SoupStrainer('body')
     body_html = BeautifulSoup(response.text, 'lxml', parse_only=only_body)
     for non_content in body_html(["nav", "header", "footer"]): # Remove nav, header, and footer tags and their contents
@@ -213,6 +216,50 @@ def customparser(response, domain, is_home, domains_for_indexed_links, site_conf
     content_text = get_text(body_html)
     item['content'] = content_text

+    # content_last_modified
+    # Get values already parsed from the current page
+    new_content = content_text # from whole page, with nav, header etc. removed, and the remainder converted to plain text
+    page_last_modified = last_modified_date # from Last-Modified HTTP header
+    # Get values from the previously indexed version of this page
+    previous_content = None
+    previous_content_last_modified = None
+    previous_contents = site_config['contents']
+    if response.url in previous_contents: # If there was something at this URL last time (not necessarily with content and/or content_last_modified set)
+        previous_page = previous_contents[response.url]
+        if 'content' in previous_page:
+            previous_content = previous_page['content']
+        if 'content_last_modified' in previous_page:
+            previous_content_last_modified = previous_page['content_last_modified']
+    # Scenarios:
+    # 1. Page content changed: use indexed_date
+    # 2. Page content unchanged: use previous_content_last_modified or page_last_modified or indexed_date
+    # 3. New page: use page_last_modified or indexed_date
+    # 4. No page content (or something else): no value
+    # Note that page_last_modified is not necessarily when the content was last changed, but it is more likely to be nearer than indexed_date,
+    # plus it saves a lot of content_last_modified values from being set to the time this functionality is first run
+    if previous_content and new_content and previous_content != new_content:
+        content_last_modified = indexed_date
+        message = 'Updated page content: changing content_last_modified to {}'.format(content_last_modified)
+    elif previous_content and new_content and previous_content == new_content:
+        if previous_content_last_modified: # This will normally be set, but won't be the first time this code is run against existing content
+            content_last_modified = previous_content_last_modified
+        elif page_last_modified:
+            content_last_modified = page_last_modified
+        else:
+            content_last_modified = indexed_date
+        message = 'Unchanged page content: using content_last_modified {}'.format(content_last_modified)
+    elif new_content and not previous_content and not previous_content_last_modified:
+        if page_last_modified:
+            content_last_modified = page_last_modified
+        else:
+            content_last_modified = indexed_date
+        message = 'New page: setting content_last_modified to {}'.format(content_last_modified)
+    else:
+        content_last_modified = None
+        message = 'No page content: content_last_modified not being set'
+    logger.debug(message)
+    item['content_last_modified'] = content_last_modified
+
     # published_date
     published_date = response.xpath('//meta[@property="article:published_time"]/@content').get()
     if not published_date:
         published_date = response.xpath('//meta[@name="dc.date.issued"]/@content').get()
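
The four scenarios above reduce to a small pure function. A minimal sketch, not part of this patch (the name resolve_content_last_modified is hypothetical), showing the same decision logic in a form that could be unit tested on its own:

    # Sketch only: mirrors the scenario handling added to customparser above.
    def resolve_content_last_modified(new_content, previous_content,
                                      previous_content_last_modified,
                                      page_last_modified, indexed_date):
        # 1. Page content changed: use indexed_date
        if previous_content and new_content and previous_content != new_content:
            return indexed_date
        # 2. Page content unchanged: fall back through the known dates
        if previous_content and new_content:
            return previous_content_last_modified or page_last_modified or indexed_date
        # 3. New page: use page_last_modified or indexed_date
        if new_content and not previous_content and not previous_content_last_modified:
            return page_last_modified or indexed_date
        # 4. No page content (or something else): no value
        return None

    # e.g. an unchanged page first indexed before this feature existed falls
    # back to the Last-Modified header value:
    assert resolve_content_last_modified('text', 'text', None,
                                         '2023-06-20T00:00:00Z',
                                         '2023-06-25T21:04:57Z') == '2023-06-20T00:00:00Z'
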
diff --git a/src/indexing/search_my_site_scheduler.py b/src/indexing/search_my_site_scheduler.py
index 95140f2..d41afff 100644
--- a/src/indexing/search_my_site_scheduler.py
+++ b/src/indexing/search_my_site_scheduler.py
@@ -7,7 +7,7 @@ import psycopg2
 import psycopg2.extras
 from indexer.spiders.search_my_site_spider import SearchMySiteSpider
-from common.utils import update_indexing_status, get_all_domains, get_domains_allowing_subdomains, get_all_indexed_inlinks_for_domain, get_already_indexed_links, check_for_stuck_jobs, expire_listings
+from common.utils import update_indexing_status, get_all_domains, get_domains_allowing_subdomains, get_all_indexed_inlinks_for_domain, get_already_indexed_links, get_contents, check_for_stuck_jobs, expire_listings

 # As per https://docs.scrapy.org/en/latest/topics/practices.html
@@ -23,12 +23,17 @@
 # Initialise variables
 # - sites_to_crawl and common_config are the two values passed into SearchMySiteSpider
-# - sites_to_crawl is a list of dicts, where each dict corresponds to a site which needs to be crawled,
-#   and the dict is all the information about the site which could be needed at index time,
-#   e.g. site['site_category'], site['web_feed'] , site['exclusions'] (a list of dicts),
-#   site['indexed_inlinks'], and for incremental indexes site['already_indexed_links']
-# - common_config is a dict with settings which apply to all sites, e.g.
-#   common_config['domains_for_indexed_links'] and common_config['domains_allowing_subdomains'].
+# - sites_to_crawl is a list of dicts, where each dict in the list corresponds to a site which needs to be crawled,
+#   and the dict contains all the information about the site which could be needed at index time, e.g.
+#   - site['site_category']
+#   - site['web_feed']
+#   - site['exclusions'] (a list of dicts)
+#   - site['indexed_inlinks'] (from Solr)
+#   - site['contents'] (from Solr)
+#   - site['already_indexed_links'] (from Solr, only set for incremental indexes)
+# - common_config is a dict with settings which apply to all sites, i.e.
+#   - common_config['domains_for_indexed_links']
+#   - common_config['domains_allowing_subdomains']
 sites_to_crawl = []

 # Just lookup domains_for_indexed_links and domains_allowing_subdomains once
@@ -137,13 +142,19 @@
 finally:
     conn.close()

-# Read data from Solr (indexed_inlinks and if necessary already_indexed_links)
+# Read data from Solr (indexed_inlinks, contents and if necessary already_indexed_links)
 for site_to_crawl in sites_to_crawl:
+    # indexed_inlinks, i.e. pages (from other domains within this search index) which link to this domain
     indexed_inlinks = get_all_indexed_inlinks_for_domain(site_to_crawl['domain'])
     logger.debug('indexed_inlinks: {}'.format(indexed_inlinks))
     site_to_crawl['indexed_inlinks'] = indexed_inlinks
-    # Only get the list of already_indexed_links if it is needed, i.e. for an incremental index
+    # contents, i.e. the previously indexed pages for this domain, keyed by URL, as returned by get_contents(domain)
+    contents = get_contents(site_to_crawl['domain'])
+    logger.debug('contents: {}'.format(contents.keys()))
+    site_to_crawl['contents'] = contents
+    # already_indexed_links, i.e. pages on this domain which have already been indexed.
+    # This is only set if it is needed, i.e. for an incremental index.
     if site['full_index'] == False:
         already_indexed_links = get_already_indexed_links(site_to_crawl['domain'])
         no_of_already_indexed_links = len(already_indexed_links)
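
get_contents is imported from common.utils but not defined in this diff. Judging by how the parser consumes site_config['contents'], it returns a dict keyed by page URL, where each value carries the 'content' and 'content_last_modified' fields when they are set. A minimal sketch of a plausible implementation, assuming a Solr select handler at a hypothetical solr_url and documents carrying url, content and content_last_modified fields:

    import requests

    solr_url = 'http://search:8983/solr/content/'  # hypothetical endpoint, not from this patch

    def get_contents(domain):
        # Fetch url, content and content_last_modified for every indexed page on the domain
        solr_request = solr_url + 'select?q=domain:{}&fl=url,content,content_last_modified&rows=10000'
        results = requests.get(solr_request.format(domain)).json()
        contents = {}
        for doc in results['response']['docs']:
            url = doc.pop('url')
            contents[url] = doc  # e.g. {'content': '...', 'content_last_modified': '...'}
        return contents

With a return value of that shape, the parser's "if response.url in previous_contents:" lookup and its optional 'content' / 'content_last_modified' checks line up, and the scheduler simply attaches the whole dict via site_to_crawl['contents'] = contents.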