Commit

Added expire_unverified_sites for #3 Automate site expiry, and moved check_for_stuck_jobs to utils
m-i-l committed Dec 11, 2021
1 parent d1c8a63 commit f7c899d
Showing 2 changed files with 73 additions and 27 deletions.
68 changes: 67 additions & 1 deletion src/indexing/common/utils.py
@@ -1,5 +1,5 @@
import json
from urllib.request import urlopen
from urllib.request import urlopen, Request
import psycopg2
import psycopg2.extras
import tldextract
@@ -25,9 +25,24 @@
indexing_log_sql = "SELECT * FROM tblIndexingLog WHERE domain = (%s) AND status = 'COMPLETE' ORDER BY timestamp DESC LIMIT 1;"
get_last_complete_indexing_log_message_sql = "SELECT message FROM tblIndexingLog WHERE domain = (%s) AND status = 'COMPLETE' ORDER BY timestamp DESC LIMIT 1;"
deactivate_indexing_sql = "UPDATE tblDomains SET indexing_enabled = FALSE, indexing_disabled_date = now(), indexing_disabled_reason = (%s) WHERE domain = (%s);"
# In get_expired_unverified_sites_sql, the moderator_approved = TRUE condition might appear redundant, but it stops the same site being returned again after it has been expired
get_expired_unverified_sites_sql = "SELECT domain FROM tblDomains "\
    "WHERE expire_date < now() "\
    "AND validation_method IN ('QuickAdd', 'SQL') "\
    "AND moderator_approved = TRUE "\
    "AND indexing_type = 'spider/default' "\
    "ORDER BY date_domain_added ASC;"
expire_unverified_site_sql = "UPDATE tblDomains SET moderator_approved = NULL WHERE domain = (%s);"
check_for_stuck_jobs_sql = "SELECT * FROM tblDomains "\
    "WHERE indexing_type = 'spider/default' "\
    "AND indexing_current_status = 'RUNNING' "\
    "AND indexing_status_last_updated + '6 hours' < NOW();"

solr_url = config.SOLR_URL
solr_query_to_get_indexed_outlinks = "select?q=*%3A*&fq=indexed_outlinks%3A*{}*&fl=url,indexed_outlinks&rows=10000"
solr_delete_query = "update?commit=true"
solr_delete_headers = {'Content-Type': 'text/xml'}
solr_delete_data = "<delete><query>domain:{}</query></delete>"


# Database utils
@@ -113,6 +128,48 @@ def deactivate_indexing(domain, reason):
    conn.close()
    return

# Check for stuck jobs
def check_for_stuck_jobs():
    logger = logging.getLogger()
    try:
        conn = psycopg2.connect(host=db_host, dbname=db_name, user=db_user, password=db_password)
        cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cursor.execute(check_for_stuck_jobs_sql)
        results = cursor.fetchall()
        stuck_domains = []
        for result in results:
            stuck_domains.append(result['domain'])
        if stuck_domains:
            logger.warning('The following domains have had indexing RUNNING for over 6 hours, so something is likely to be wrong: {}'.format(stuck_domains))
    except psycopg2.Error as e:
        logger.error(' %s' % e.pgerror)
    finally:
        conn.close()

# Expire unverified ('QuickAdd', 'SQL') sites
def expire_unverified_sites():
    expired_unverified_sites = []
    logger = logging.getLogger()
    try:
        conn = psycopg2.connect(host=db_host, dbname=db_name, user=db_user, password=db_password)
        cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cursor.execute(get_expired_unverified_sites_sql)
        results = cursor.fetchall()
        for result in results:
            expired_unverified_sites.append(result['domain'])
        if expired_unverified_sites:
            for expired_unverified_site in expired_unverified_sites:
                logger.info('Expiring the following unverified domain: {}'.format(expired_unverified_site))
                cursor.execute(expire_unverified_site_sql, (expired_unverified_site,))
                conn.commit()
                solr_delete_domain(expired_unverified_site)
    except psycopg2.Error as e:
        logger.error('expire_unverified_sites: {}'.format(e.pgerror))
    finally:
        conn.close()
    return expired_unverified_sites



# Solr utils
# ----------
@@ -146,6 +203,15 @@ def get_all_indexed_inlinks_for_domain(domain):
                indexed_inlinks[indexed_outlink].append(url)
    return indexed_inlinks

# Remove all pages from a domain from the Solr index
def solr_delete_domain(domain):
    solrurl = config.SOLR_URL
    solrquery = solrurl + solr_delete_query
    data = solr_delete_data.format(domain)
    req = Request(solrquery, data.encode("utf8"), solr_delete_headers)
    response = urlopen(req)
    results = response.read()


# Domain utils
# ------------
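For context, the new expire_unverified_sites() flow ends with solr_delete_domain() issuing a standard Solr delete-by-query request. Below is a minimal verification sketch, not part of this commit, for checking that an expired domain's pages really are gone afterwards. The Solr URL is a placeholder for config.SOLR_URL, 'example.com' is an example domain, and it assumes the same collection and "domain" field queried by solr_delete_data above.

# Minimal verification sketch (not part of this commit).
import json
from urllib.request import urlopen

SOLR_URL = "http://localhost:8983/solr/content/"  # placeholder for config.SOLR_URL

def count_docs_for_domain(domain):
    # Ask the select handler how many documents have this domain, returning
    # only the count (rows=0).
    solrquery = SOLR_URL + "select?q=domain%3A{}&rows=0".format(domain)
    response = urlopen(solrquery)
    results = json.load(response)
    return results['response']['numFound']

# After expire_unverified_sites() has run, an expired domain should report 0,
# e.g. count_docs_for_domain('example.com')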
32 changes: 6 additions & 26 deletions src/indexing/search_my_site_scheduler.py
@@ -6,10 +6,8 @@
import logging
import psycopg2
import psycopg2.extras
from urllib.request import urlopen
import json
from indexer.spiders.search_my_site_script import SearchMySiteScript
from common.utils import update_indexing_log, get_all_domains, get_domains_allowing_subdomains, get_all_indexed_inlinks_for_domain
from common.utils import update_indexing_log, get_all_domains, get_domains_allowing_subdomains, get_all_indexed_inlinks_for_domain, check_for_stuck_jobs, expire_unverified_sites


# As per https://docs.scrapy.org/en/latest/topics/practices.html
@@ -53,32 +51,14 @@
"ORDER BY indexing_current_status DESC, owner_verified DESC "\
"LIMIT 16;"

sql_to_check_for_stuck_jobs = "SELECT * FROM tblDomains "\
    "WHERE indexing_type = 'spider/default' "\
    "AND indexing_current_status = 'RUNNING' "\
    "AND indexing_status_last_updated + '6 hours' < NOW();"

solrurl = settings.get('SOLR_URL')
solr_query_to_get_indexed_outlinks = "select?q=*%3A*&fq=indexed_outlinks%3A*{}*&fl=url,indexed_outlinks&rows=10000"

# Maintenance jobs
# MAINTENANCE JOBS
# This could be in a separately scheduled job, which could be run less frequently, but is just here for now to save having to set up another job
# The code to check for and action expired domains could go here too
try:
conn = psycopg2.connect(dbname=db_name, user=db_user, host=db_host, password=db_password)
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
cursor.execute(sql_to_check_for_stuck_jobs)
results = cursor.fetchall()
stuck_domains = []
for result in results:
stuck_domains.append(result['domain'])
if stuck_domains:
logger.warning('The following domains have had indexing RUNNING for over 6 hours, so something is likely to be wrong: {}'.format(stuck_domains))
except psycopg2.Error as e:
logger.error(' %s' % e.pgerror)
finally:
conn.close()
check_for_stuck_jobs()
expire_unverified_sites()


# MAIN INDEXING JOB
# Read data from database (urls_to_crawl, domains_for_indexed_links, exclusion for each urls_to_crawl)

logger.info('Checking for sites to index')
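As the maintenance-jobs comment above notes, check_for_stuck_jobs() and expire_unverified_sites() could live in a separately scheduled job run less frequently than the indexing scheduler. A minimal sketch of such a standalone script follows; the file name and log wording are hypothetical, and it assumes it runs in the same environment as search_my_site_scheduler.py so that common.utils and its database and Solr config are importable. It could then be triggered by cron or a similar scheduler, e.g. once a day, rather than on every indexing run.

# maintenance.py - hypothetical standalone maintenance job (not part of this commit)
import logging

from common.utils import check_for_stuck_jobs, expire_unverified_sites

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

if __name__ == '__main__':
    logger.info('Running maintenance jobs')
    check_for_stuck_jobs()
    expired = expire_unverified_sites()
    logger.info('Expired {} unverified domains'.format(len(expired)))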
