-
Notifications
You must be signed in to change notification settings - Fork 2
/
check-geoservices-stats.py
101 lines (89 loc) · 3.93 KB
/
check-geoservices-stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# -*- coding: utf-8 -*-
"""
Title: Check geoservices statistics
Author: Ralph Straumann
Date: 2023-05-00
Purpose: Compare current geoservices statistics with recent ones.
Notes:
- Uses Python 3.9
- If significant changes occur
"""
import os
import configuration as config
from datetime import timedelta
import polars as pl
import duckdb
import smtplib
import ssl
if __name__ == "__main__":
# Get current statistics
duckdb.sql('CREATE TABLE current_stats AS ('
'SELECT DATE, OWNER, DATASET_COUNT FROM "%s");' %
config.GEOSERVICES_STATS_CH_CSV)
current_date = duckdb.sql(
'SELECT MAX(DATE) FROM current_stats').fetchone()[0]
from_date = current_date - timedelta(days=30)
# Get historic statistics (everything that is at least 1 and at most
# 30 days old)
duckdb.sql("""CREATE TABLE hist_stats AS (
SELECT OWNER,
AVG(DATASET_COUNT) AS MEAN_DATASET_COUNT
FROM "%s"
WHERE DATE >= '%s' AND
DATE < '%s'
GROUP BY OWNER);""" % (
config.GEOSERVICES_STATS_CH_PATTERN,
from_date.strftime("%Y-%m-%d"),
current_date.strftime("%Y-%m-%d")))
duckdb.sql(
"""CREATE TABLE stats AS (
SELECT current_stats.OWNER AS Owner,
DATE as Date,
DATASET_COUNT as "Current dataset count",
MEAN_DATASET_COUNT as "Mean dataset count over last 30 days",
ROUND(DATASET_COUNT / MEAN_DATASET_COUNT * 100, 1) AS "Dataset ratio (percent)",
ROUND("Dataset ratio (percent)" - 100, 1) as "Dataset change (percent)",
CASE
WHEN "Dataset ratio (percent)" == 100 THEN 'Stable'
WHEN "Dataset ratio (percent)" > 100 THEN 'Improved'
WHEN "Dataset ratio (percent)" < 100 AND
"Dataset ratio (percent)" > 90 THEN 'Decreased'
WHEN "Dataset ratio (percent)" <= 90 AND
"Dataset ratio (percent)" > 80 THEN 'Strongly decreased'
WHEN "Dataset ratio (percent)" <= 80 THEN 'Suspicious. Maybe services endpoints have changed?'
END AS Classification
FROM current_stats
JOIN hist_stats
ON current_stats.OWNER = hist_stats.OWNER);""")
df = duckdb.sql("""SELECT * FROM stats""").pl()
pl.Config.set_tbl_rows(len(df))
print(df)
df.write_csv(config.GEOSERVICES_CHANGESTATS_CH_CSV, separator=',',
quote='"', date_format="%d.%m.%Y")
df = duckdb.sql(
"""SELECT * FROM stats WHERE Classification LIKE '%Suspicious%'""").pl()
if len(df) > 0:
print("\nFound significant changes")
# We have at least one "suspicious" entry
try:
print("\nSending e-mail")
smtp_server = os.environ.get('MAIL_SMTP_SERVER')
user_name = os.environ.get('MAIL_USER_NAME')
password = os.environ.get('MAIL_PASSWORD')
port = 465
# Create a secure SSL context
context = ssl.create_default_context()
with smtplib.SMTP_SSL(smtp_server, port, context=context) as \
server:
server.login(user_name, password)
message = """Subject: Significant changes in geoservices availability
Hi
Geoservice Harvester has found significant changes in geoservices availability in its last run.
This might mean that some data owners have changed endpoints of their geoservices.
Check the geoservice availability change statistics at https://github.com/rastrau/geoservice_harvester_poc/blob/main/data/geoservices_changestats_CH.csv.
"""
server.sendmail(user_name,
config.GEOSERVICES_CHANGESTATS_ALERT_RECIPIENTS,
message)
except Exception as e:
print(str(e))