Skip to content

Commit

Permalink
Added get_contents to get previously indexed page contents, for #94 D…
Browse files Browse the repository at this point in the history
…etect if page content has changed
  • Loading branch information
m-i-l committed Jun 25, 2023
1 parent 3e206d0 commit 1f88c7e
Showing 1 changed file with 23 additions and 0 deletions.
23 changes: 23 additions & 0 deletions src/indexing/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@
# Base URL of the Solr core, read from config; get_contents builds its request as solr_url + query string
solr_url = config.SOLR_URL
# Select query over all docs, filtered to those whose indexed_outlinks contain the {} placeholder value
# (wildcard match on both sides); returns the url and indexed_outlinks fields, capped at 10000 rows
solr_query_to_get_indexed_outlinks = "select?q=*%3A*&fq=indexed_outlinks%3A*{}*&fl=url,indexed_outlinks&rows=10000"
# Select query for docs whose domain matches the {} placeholder; returns only the url field, capped at 1000 rows
solr_query_to_get_already_indexed_links = "select?q=domain%3A{}&fl=url&rows=1000"
# Same domain query, but returning url, content and content_last_modified (used by get_contents), capped at 1000 rows
solr_query_to_get_content = "select?q=domain%3A{}&fl=url,content,content_last_modified&rows=1000"
# Update endpoint with an immediate commit, plus XML headers and a delete-by-query body for the {} domain
# NOTE(review): presumably these three are combined into a single POST that removes a domain's docs - confirm at the call site
solr_delete_query = "update?commit=true"
solr_delete_headers = {'Content-Type': 'text/xml'}
solr_delete_data = "<delete><query>domain:{}</query></delete>"
Expand Down Expand Up @@ -321,6 +322,28 @@ def get_already_indexed_links(domain):
already_indexed_links.append(url)
return already_indexed_links

# Get all the content for a domain (used for identifying whether content has changed)
# The return value is a dict of dicts.
# The outer dict is keyed by page URL; each inner dict has the optional keys 'content' and 'content_last_modified'
# Use with e.g.:
# content = contents['https://michael-lewis.com/']
# if content and 'content' in content and content['content']: ...

def get_contents(domain):
    """Return the previously indexed content for the pages of a domain.

    Queries Solr with solr_query_to_get_content (formatted with the domain)
    and builds a dict keyed by each returned doc's 'url'. Each value is a
    dict holding 'content' and/or 'content_last_modified', but only for the
    fields that are present and non-empty in the Solr doc, so a value may
    be an empty dict.

    Only the docs returned by that single query are included; the query
    string itself sets the row limit.
    """
    contents = {}
    solrquery = solr_query_to_get_content.format(domain)
    # Use a context manager so the HTTP connection is closed once the
    # response is parsed, even if json.load raises
    with urlopen(solr_url + solrquery) as connection:
        results = json.load(connection)
    # 'or ()' keeps the original behaviour of doing nothing for an empty or null docs value
    for doc in results['response']['docs'] or ():
        content = {}
        for field in ('content', 'content_last_modified'):
            value = doc.get(field)
            if value:  # skip fields that are missing or empty
                content[field] = value
        contents[doc['url']] = content
    return contents


# Database and Solr utils
# -----------------------
Expand Down

0 comments on commit 1f88c7e

Please sign in to comment.