Implemented #94: Detect if page content has changed
m-i-l committed Jun 25, 2023
1 parent 12d86f2 commit a062e31
Showing 2 changed files with 67 additions and 9 deletions.
47 changes: 47 additions & 0 deletions src/indexing/indexer/spiders/search_my_site_parser.py
@@ -19,6 +19,7 @@
# <field name="description" type="text_general" indexed="true" stored="true" multiValued="false" />
# <field name="tags" type="string" indexed="true" stored="true" multiValued="true" />
# <field name="content" type="text_general" indexed="true" stored="true" multiValued="false" />
# <field name="content_last_modified" type="pdate" indexed="true" stored="true" multiValued="false" />
# <field name="content_type" type="string" indexed="true" stored="true" />
# <field name="page_type" type="string" indexed="true" stored="true" />
# <field name="page_last_modified" type="pdate" indexed="true" stored="true" />
@@ -199,6 +200,8 @@ def customparser(response, domain, is_home, domains_for_indexed_links, site_conf
item['tags'] = tag_list

# content
# Note that if the logic to generate content_text changes in any way, even just in how white space is treated,
# then that will trigger new content_last_modified values even though the underlying page content hasn't changed
only_body = SoupStrainer('body')
body_html = BeautifulSoup(response.text, 'lxml', parse_only=only_body)
for non_content in body_html(["nav", "header", "footer"]): # Remove nav, header, and footer tags and their contents
@@ -213,6 +216,50 @@
content_text = get_text(body_html)
item['content'] = content_text

# content_last_modified
# Get values already parsed from the current page
new_content = content_text # from whole page, with nav, header etc. removed, and the remainder converted to plain text
page_last_modified = last_modified_date # from Last-Modified HTTP header
# Get values from the previously indexed version of this page
previous_content = None
previous_content_last_modified = None
previous_contents = site_config['contents']
if response.url in previous_contents: # If there was something at this URL last time (not necessarily with content and/or content_last_modified set)
    previous_page = previous_contents[response.url]
    if 'content' in previous_page:
        previous_content = previous_page['content']
    if 'content_last_modified' in previous_page:
        previous_content_last_modified = previous_page['content_last_modified']
# Scenarios:
# 1. Page content changed: use indexed_date
# 2. Page content unchanged: use previous_content_last_modified or page_last_modified or indexed_date
# 3. New page: use page_last_modified or indexed_date
# 4. No page content (or something else): no value
# Note that page_last_modified is not necessarily when the content was last changed, but it is likely to be nearer than indexed_date,
# and it avoids a lot of content_last_modified values being set to the time this functionality is first run
if previous_content and new_content and previous_content != new_content:
    content_last_modified = indexed_date
    message = 'Updated page content: changing content_last_modified to {}'.format(content_last_modified)
elif previous_content and new_content and previous_content == new_content:
    if previous_content_last_modified: # This will normally be set, but won't be the first time this code is run against existing content
        content_last_modified = previous_content_last_modified
    elif page_last_modified:
        content_last_modified = page_last_modified
    else:
        content_last_modified = indexed_date
    message = 'Unchanged page content: using content_last_modified {}'.format(content_last_modified)
elif new_content and not previous_content and not previous_content_last_modified:
    if page_last_modified:
        content_last_modified = page_last_modified
    else:
        content_last_modified = indexed_date
    message = 'New page: setting content_last_modified to {}'.format(content_last_modified)
else:
    content_last_modified = None
    message = 'No page content: content_last_modified not being set'
logger.debug(message)
item['content_last_modified'] = content_last_modified

# published_date
published_date = response.xpath('//meta[@property="article:published_time"]/@content').get()
if not published_date: published_date = response.xpath('//meta[@name="dc.date.issued"]/@content').get()
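The scenario handling above reduces to a small precedence rule: indexed_date when the content has changed, the earliest previously recorded date when it is unchanged, and the Last-Modified header (if present) for pages not seen before. The following is a minimal standalone sketch of that rule, for illustration only: resolve_content_last_modified is a hypothetical name, and the commit inlines this logic in customparser rather than factoring it out.

def resolve_content_last_modified(new_content, previous_content,
                                  previous_content_last_modified,
                                  page_last_modified, indexed_date):
    # Scenario 1: page content changed: the change is dated to this index run
    if previous_content and new_content and previous_content != new_content:
        return indexed_date
    # Scenario 2: page content unchanged: keep the earliest known date
    if previous_content and new_content:
        return previous_content_last_modified or page_last_modified or indexed_date
    # Scenario 3: new page: fall back to the Last-Modified header if present
    if new_content and not previous_content and not previous_content_last_modified:
        return page_last_modified or indexed_date
    # Scenario 4: no page content (or something else): leave the field unset
    return None

# For example, an unchanged page re-indexed for the first time since this
# feature was added takes its date from the Last-Modified header:
resolve_content_last_modified('same text', 'same text', None,
                              '2023-06-01T00:00:00Z', '2023-06-25T12:00:00Z')
# -> '2023-06-01T00:00:00Z'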
29 changes: 20 additions & 9 deletions src/indexing/search_my_site_scheduler.py
@@ -7,7 +7,7 @@
import psycopg2
import psycopg2.extras
from indexer.spiders.search_my_site_spider import SearchMySiteSpider
from common.utils import update_indexing_status, get_all_domains, get_domains_allowing_subdomains, get_all_indexed_inlinks_for_domain, get_already_indexed_links, get_contents, check_for_stuck_jobs, expire_listings


# As per https://docs.scrapy.org/en/latest/topics/practices.html
@@ -23,12 +23,17 @@

# Initialise variables
# - sites_to_crawl and common_config are the two values passed into SearchMySiteSpider
# - sites_to_crawl is a list of dicts, where each dict in the list corresponds to a site which needs to be crawled,
# and the dict contains all the information about the site which could be needed at index time, e.g.
# - site['site_category']
# - site['web_feed']
# - site['exclusions'] (a list of dicts)
# - site['indexed_inlinks'] (from Solr)
# - site['contents'] (from Solr)
# - site['already_indexed_links'] (from Solr, only set for incremental indexes)
# - common_config is a dict with settings which apply to all sites, i.e.
# - common_config['domains_for_indexed_links']
# - common_config['domains_allowing_subdomains']

sites_to_crawl = []
# Just lookup domains_for_indexed_links and domains_allowing_subdomains once
@@ -137,13 +142,19 @@
finally:
    conn.close()

# Read data from Solr (indexed_inlinks, contents, and if necessary already_indexed_links)

for site_to_crawl in sites_to_crawl:
    # indexed_inlinks, i.e. pages (from other domains within this search index) which link to this domain.
    indexed_inlinks = get_all_indexed_inlinks_for_domain(site_to_crawl['domain'])
    logger.debug('indexed_inlinks: {}'.format(indexed_inlinks))
    site_to_crawl['indexed_inlinks'] = indexed_inlinks
    # contents, i.e. the content and content_last_modified values previously indexed for this domain, via get_contents(domain)
    contents = get_contents(site_to_crawl['domain'])
    logger.debug('contents: {}'.format(contents.keys()))
    site_to_crawl['contents'] = contents
    # already_indexed_links, i.e. pages on this domain which have already been indexed.
    # This is only set if it is needed, i.e. for an incremental index.
    if site_to_crawl['full_index'] == False:
        already_indexed_links = get_already_indexed_links(site_to_crawl['domain'])
        no_of_already_indexed_links = len(already_indexed_links)
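get_contents() is imported from common.utils, whose implementation is not part of this diff. Based on how the parser consumes site_config['contents'], it presumably returns a dict keyed by URL holding the stored content and content_last_modified for each page on the domain. A minimal sketch, assuming a standard Solr select query (the Solr URL, core name, and rows limit here are illustrative assumptions, not the actual code):

import requests

solr_url = 'http://localhost:8983/solr/content/' # assumed Solr endpoint and core

def get_contents(domain):
    # Return {url: {'content': ..., 'content_last_modified': ...}} for every
    # page already indexed for this domain, so the spider can detect changes
    solr_query = solr_url + 'select?q=domain:{}&fl=url,content,content_last_modified&rows=10000'
    results = requests.get(solr_query.format(domain)).json()['response']['docs']
    contents = {}
    for result in results:
        page = {}
        if 'content' in result: # not every page has content stored
            page['content'] = result['content']
        if 'content_last_modified' in result: # not set before this feature existed
            page['content_last_modified'] = result['content_last_modified']
        contents[result['url']] = page
    return contents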
