Implemented #94: Detect if page content has changed
m-i-l committed Jun 25, 2023
1 parent 12d86f2 commit a062e31
Showing 2 changed files with 67 additions and 9 deletions.
47 changes: 47 additions & 0 deletions src/indexing/indexer/spiders/search_my_site_parser.py
@@ -19,6 +19,7 @@
# <field name="description" type="text_general" indexed="true" stored="true" multiValued="false" />
# <field name="tags" type="string" indexed="true" stored="true" multiValued="true" />
# <field name="content" type="text_general" indexed="true" stored="true" multiValued="false" />
# <field name="content_last_modified" type="pdate" indexed="true" stored="true" multiValued="false" />
# <field name="content_type" type="string" indexed="true" stored="true" />
# <field name="page_type" type="string" indexed="true" stored="true" />
# <field name="page_last_modified" type="pdate" indexed="true" stored="true" />
@@ -199,6 +200,8 @@ def customparser(response, domain, is_home, domains_for_indexed_links, site_conf
item['tags'] = tag_list

# content
# Note that if the logic to generate content_text changes in any way, even just in how white space is treated,
# then that will trigger new content_last_modified values even though the underlying page content hasn't changed
only_body = SoupStrainer('body')
body_html = BeautifulSoup(response.text, 'lxml', parse_only=only_body)
for non_content in body_html(["nav", "header", "footer"]): # Remove nav, header, and footer tags and their contents
@@ -213,6 +216,50 @@
content_text = get_text(body_html)
item['content'] = content_text

# content_last_modified
# Get values already parsed from the current page
new_content = content_text # from whole page, with nav, header etc. removed, and the remainder converted to plain text
page_last_modified = last_modified_date # from Last-Modified HTTP header
# Get values from the previously indexed version of this page
previous_content = None
previous_content_last_modified = None
previous_contents = site_config['contents']
if response.url in previous_contents: # If there was something at this URL last time (not necessarily with content and/or content_last_modified set)
    previous_page = previous_contents[response.url]
    if 'content' in previous_page:
        previous_content = previous_page['content']
    if 'content_last_modified' in previous_page:
        previous_content_last_modified = previous_page['content_last_modified']
# Scenarios:
# 1. Page content changed: use indexed_date
# 2. Page content unchanged: use previous_content_last_modified or page_last_modified or indexed_date
# 3. New page: use page_last_modified or indexed_date
# 4. No page content (or something else): no value
# Note that page_last_modified is not necessarily when the content was last changed, but it is likely to be nearer than indexed_date,
# and it avoids a lot of content_last_modified values being set to the time this functionality is first run
if previous_content and new_content and previous_content != new_content:
    content_last_modified = indexed_date
    message = 'Updated page content: changing content_last_modified to {}'.format(content_last_modified)
elif previous_content and new_content and previous_content == new_content:
    if previous_content_last_modified: # This will normally be set, but won't be the first time this code is run against existing content
        content_last_modified = previous_content_last_modified
    elif page_last_modified:
        content_last_modified = page_last_modified
    else:
        content_last_modified = indexed_date
    message = 'Unchanged page content: using content_last_modified {}'.format(content_last_modified)
elif new_content and not previous_content and not previous_content_last_modified:
    if page_last_modified:
        content_last_modified = page_last_modified
    else:
        content_last_modified = indexed_date
    message = 'New page: setting content_last_modified to {}'.format(content_last_modified)
else:
    content_last_modified = None
    message = 'No page content: content_last_modified not being set'
logger.debug(message)
item['content_last_modified'] = content_last_modified

# published_date
published_date = response.xpath('//meta[@property="article:published_time"]/@content').get()
if not published_date: published_date = response.xpath('//meta[@name="dc.date.issued"]/@content').get()
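The scenario handling above reduces to a small precedence rule: indexed_date when the content has changed, the earliest previously recorded date when it is unchanged, and the Last-Modified header (if present) for pages not seen before. The following is a minimal standalone sketch of that rule, for illustration only: resolve_content_last_modified is a hypothetical name, and the commit inlines this logic in customparser rather than factoring it out.

def resolve_content_last_modified(new_content, previous_content,
                                  previous_content_last_modified,
                                  page_last_modified, indexed_date):
    # Scenario 1: page content changed: the change is dated to this index run
    if previous_content and new_content and previous_content != new_content:
        return indexed_date
    # Scenario 2: page content unchanged: keep the earliest known date
    if previous_content and new_content:
        return previous_content_last_modified or page_last_modified or indexed_date
    # Scenario 3: new page: fall back to the Last-Modified header if present
    if new_content and not previous_content and not previous_content_last_modified:
        return page_last_modified or indexed_date
    # Scenario 4: no page content (or something else): leave the field unset
    return None

# For example, an unchanged page re-indexed for the first time since this
# feature was added takes its date from the Last-Modified header:
resolve_content_last_modified('same text', 'same text', None,
                              '2023-06-01T00:00:00Z', '2023-06-25T12:00:00Z')
# -> '2023-06-01T00:00:00Z'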
29 changes: 20 additions & 9 deletions src/indexing/search_my_site_scheduler.py
@@ -7,7 +7,7 @@
import psycopg2
import psycopg2.extras
from indexer.spiders.search_my_site_spider import SearchMySiteSpider
from common.utils import update_indexing_status, get_all_domains, get_domains_allowing_subdomains, get_all_indexed_inlinks_for_domain, get_already_indexed_links, get_contents, check_for_stuck_jobs, expire_listings


# As per https://docs.scrapy.org/en/latest/topics/practices.html
@@ -23,12 +23,17 @@

# Initialise variables
# - sites_to_crawl and common_config are the two values passed into SearchMySiteSpider
# - sites_to_crawl is a list of dicts, where each dict in the list corresponds to a site which needs to be crawled,
# and the dict contains all the information about the site which could be needed at index time, e.g.
# - site['site_category']
# - site['web_feed']
# - site['exclusions'] (a list of dicts)
# - site['indexed_inlinks'] (from Solr)
# - site['contents'] (from Solr)
# - site['already_indexed_links'] (from Solr, only set for incremental indexes)
# - common_config is a dict with settings which apply to all sites, i.e.
# - common_config['domains_for_indexed_links']
# - common_config['domains_allowing_subdomains']

sites_to_crawl = []
# Just lookup domains_for_indexed_links and domains_allowing_subdomains once
@@ -137,13 +142,19 @@
finally:
    conn.close()

# Read data from Solr (indexed_inlinks, contents, and if necessary already_indexed_links)

for site_to_crawl in sites_to_crawl:
    # indexed_inlinks, i.e. pages (from other domains within this search index) which link to this domain.
    indexed_inlinks = get_all_indexed_inlinks_for_domain(site_to_crawl['domain'])
    logger.debug('indexed_inlinks: {}'.format(indexed_inlinks))
    site_to_crawl['indexed_inlinks'] = indexed_inlinks
    # contents, i.e. the content and content_last_modified values previously indexed for this domain, via get_contents(domain)
    contents = get_contents(site_to_crawl['domain'])
    logger.debug('contents: {}'.format(contents.keys()))
    site_to_crawl['contents'] = contents
    # already_indexed_links, i.e. pages on this domain which have already been indexed.
    # This is only set if it is needed, i.e. for an incremental index.
    if site_to_crawl['full_index'] == False:
        already_indexed_links = get_already_indexed_links(site_to_crawl['domain'])
        no_of_already_indexed_links = len(already_indexed_links)
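get_contents() is imported from common.utils, whose implementation is not part of this diff. Based on how the parser consumes site_config['contents'], it presumably returns a dict keyed by URL holding the stored content and content_last_modified for each page on the domain. A minimal sketch, assuming a standard Solr select query (the Solr URL, core name, and rows limit here are illustrative assumptions, not the actual code):

import requests

solr_url = 'http://localhost:8983/solr/content/' # assumed Solr endpoint and core

def get_contents(domain):
    # Return {url: {'content': ..., 'content_last_modified': ...}} for every
    # page already indexed for this domain, so the spider can detect changes
    solr_query = solr_url + 'select?q=domain:{}&fl=url,content,content_last_modified&rows=10000'
    results = requests.get(solr_query.format(domain)).json()['response']['docs']
    contents = {}
    for result in results:
        page = {}
        if 'content' in result: # not every page has content stored
            page['content'] = result['content']
        if 'content_last_modified' in result: # not set before this feature existed
            page['content_last_modified'] = result['content_last_modified']
        contents[result['url']] = page
    return contents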
