Commit

Added expire_unverified_sites for #3 Automate site expiry, and moved check_for_stuck_jobs to utils
m-i-l committed Dec 11, 2021
1 parent d1c8a63 commit f7c899d
Showing 2 changed files with 73 additions and 27 deletions.
68 changes: 67 additions & 1 deletion src/indexing/common/utils.py
@@ -1,5 +1,5 @@
import json
from urllib.request import urlopen
from urllib.request import urlopen, Request
import psycopg2
import psycopg2.extras
import tldextract
@@ -25,9 +25,24 @@
indexing_log_sql = "SELECT * FROM tblIndexingLog WHERE domain = (%s) AND status = 'COMPLETE' ORDER BY timestamp DESC LIMIT 1;"
get_last_complete_indexing_log_message_sql = "SELECT message FROM tblIndexingLog WHERE domain = (%s) AND status = 'COMPLETE' ORDER BY timestamp DESC LIMIT 1;"
deactivate_indexing_sql = "UPDATE tblDomains SET indexing_enabled = FALSE, indexing_disabled_date = now(), indexing_disabled_reason = (%s) WHERE domain = (%s);"
# In get_expired_unverified_sites_sql, the moderator_approved = TRUE condition might appear redundant, but it stops the same site being returned again after it has been expired
get_expired_unverified_sites_sql = "SELECT domain FROM tblDomains "\
    "WHERE expire_date < now() "\
    "AND validation_method IN ('QuickAdd', 'SQL') "\
    "AND moderator_approved = TRUE "\
    "AND indexing_type = 'spider/default' "\
    "ORDER BY date_domain_added ASC;"
expire_unverified_site_sql = "UPDATE tblDomains SET moderator_approved = NULL WHERE domain = (%s);"
check_for_stuck_jobs_sql = "SELECT * FROM tblDomains "\
    "WHERE indexing_type = 'spider/default' "\
    "AND indexing_current_status = 'RUNNING' "\
    "AND indexing_status_last_updated + '6 hours' < NOW();"

solr_url = config.SOLR_URL
solr_query_to_get_indexed_outlinks = "select?q=*%3A*&fq=indexed_outlinks%3A*{}*&fl=url,indexed_outlinks&rows=10000"
solr_delete_query = "update?commit=true"
solr_delete_headers = {'Content-Type': 'text/xml'}
solr_delete_data = "<delete><query>domain:{}</query></delete>"


# Database utils
@@ -113,6 +128,48 @@ def deactivate_indexing(domain, reason):
    conn.close()
    return

# Check for stuck jobs
def check_for_stuck_jobs():
    logger = logging.getLogger()
    try:
        conn = psycopg2.connect(host=db_host, dbname=db_name, user=db_user, password=db_password)
        cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cursor.execute(check_for_stuck_jobs_sql)
        results = cursor.fetchall()
        stuck_domains = []
        for result in results:
            stuck_domains.append(result['domain'])
        if stuck_domains:
            logger.warning('The following domains have had indexing RUNNING for over 6 hours, so something is likely to be wrong: {}'.format(stuck_domains))
    except psycopg2.Error as e:
        logger.error(' %s' % e.pgerror)
    finally:
        conn.close()

# Expire unverified ('QuickAdd', 'SQL') sites
def expire_unverified_sites():
    expired_unverified_sites = []
    logger = logging.getLogger()
    try:
        conn = psycopg2.connect(host=db_host, dbname=db_name, user=db_user, password=db_password)
        cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cursor.execute(get_expired_unverified_sites_sql)
        results = cursor.fetchall()
        for result in results:
            expired_unverified_sites.append(result['domain'])
        if expired_unverified_sites:
            for expired_unverified_site in expired_unverified_sites:
                logger.info('Expiring the following unverified domain: {}'.format(expired_unverified_site))
                cursor.execute(expire_unverified_site_sql, (expired_unverified_site,))
                conn.commit()
                solr_delete_domain(expired_unverified_site)
    except psycopg2.Error as e:
        logger.error('expire_unverified_sites: {}'.format(e.pgerror))
    finally:
        conn.close()
    return expired_unverified_sites



# Solr utils
# ----------
@@ -146,6 +203,15 @@ def get_all_indexed_inlinks_for_domain(domain):
                indexed_inlinks[indexed_outlink].append(url)
    return indexed_inlinks

# Remove all pages from a domain from the Solr index
def solr_delete_domain(domain):
    solrurl = config.SOLR_URL
    solrquery = solrurl + solr_delete_query
    data = solr_delete_data.format(domain)
    req = Request(solrquery, data.encode("utf8"), solr_delete_headers)
    response = urlopen(req)
    results = response.read()


# Domain utils
# ------------
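For context, the new expire_unverified_sites() flow ends with solr_delete_domain() issuing a standard Solr delete-by-query request. Below is a minimal verification sketch, not part of this commit, for checking that an expired domain's pages really are gone afterwards. The Solr URL is a placeholder for config.SOLR_URL, 'example.com' is an example domain, and it assumes the same collection and "domain" field queried by solr_delete_data above.

# Minimal verification sketch (not part of this commit).
import json
from urllib.request import urlopen

SOLR_URL = "http://localhost:8983/solr/content/"  # placeholder for config.SOLR_URL

def count_docs_for_domain(domain):
    # Ask the select handler how many documents have this domain, returning
    # only the count (rows=0).
    solrquery = SOLR_URL + "select?q=domain%3A{}&rows=0".format(domain)
    response = urlopen(solrquery)
    results = json.load(response)
    return results['response']['numFound']

# After expire_unverified_sites() has run, an expired domain should report 0,
# e.g. count_docs_for_domain('example.com')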
32 changes: 6 additions & 26 deletions src/indexing/search_my_site_scheduler.py
@@ -6,10 +6,8 @@
import logging
import psycopg2
import psycopg2.extras
from urllib.request import urlopen
import json
from indexer.spiders.search_my_site_script import SearchMySiteScript
from common.utils import update_indexing_log, get_all_domains, get_domains_allowing_subdomains, get_all_indexed_inlinks_for_domain
from common.utils import update_indexing_log, get_all_domains, get_domains_allowing_subdomains, get_all_indexed_inlinks_for_domain, check_for_stuck_jobs, expire_unverified_sites


# As per https://docs.scrapy.org/en/latest/topics/practices.html
@@ -53,32 +51,14 @@
"ORDER BY indexing_current_status DESC, owner_verified DESC "\
"LIMIT 16;"

sql_to_check_for_stuck_jobs = "SELECT * FROM tblDomains "\
    "WHERE indexing_type = 'spider/default' "\
    "AND indexing_current_status = 'RUNNING' "\
    "AND indexing_status_last_updated + '6 hours' < NOW();"

solrurl = settings.get('SOLR_URL')
solr_query_to_get_indexed_outlinks = "select?q=*%3A*&fq=indexed_outlinks%3A*{}*&fl=url,indexed_outlinks&rows=10000"

# Maintenance jobs
# MAINTENANCE JOBS
# This could be in a separately scheduled job, which could be run less frequently, but is just here for now to save having to set up another job
# The code to check for and action expired domains could go here too
try:
conn = psycopg2.connect(dbname=db_name, user=db_user, host=db_host, password=db_password)
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
cursor.execute(sql_to_check_for_stuck_jobs)
results = cursor.fetchall()
stuck_domains = []
for result in results:
stuck_domains.append(result['domain'])
if stuck_domains:
logger.warning('The following domains have had indexing RUNNING for over 6 hours, so something is likely to be wrong: {}'.format(stuck_domains))
except psycopg2.Error as e:
logger.error(' %s' % e.pgerror)
finally:
conn.close()
check_for_stuck_jobs()
expire_unverified_sites()


# MAIN INDEXING JOB
# Read data from database (urls_to_crawl, domains_for_indexed_links, exclusion for each urls_to_crawl)

logger.info('Checking for sites to index')
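As the maintenance-jobs comment above notes, check_for_stuck_jobs() and expire_unverified_sites() could live in a separately scheduled job run less frequently than the indexing scheduler. A minimal sketch of such a standalone script follows; the file name and log wording are hypothetical, and it assumes it runs in the same environment as search_my_site_scheduler.py so that common.utils and its database and Solr config are importable. It could then be triggered by cron or a similar scheduler, e.g. once a day, rather than on every indexing run.

# maintenance.py - hypothetical standalone maintenance job (not part of this commit)
import logging

from common.utils import check_for_stuck_jobs, expire_unverified_sites

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

if __name__ == '__main__':
    logger.info('Running maintenance jobs')
    check_for_stuck_jobs()
    expired = expire_unverified_sites()
    logger.info('Expired {} unverified domains'.format(len(expired)))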
