feeds.py
# Core
import os
import time
import datetime
import logging
from urllib.parse import urlparse

# Third-party
import feedparser
import requests_cache
import prometheus_client
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from requests.exceptions import RequestException

# Prometheus metric exporters
requested_from_cache_counter = prometheus_client.Counter(
    'feed_requested_from_cache',
    'A counter of requests retrieved from the cache',
    ['domain'],
)
failed_requests = prometheus_client.Counter(
    'feed_failed_requests',
    'A counter of failed feed requests',
    ['error_name', 'domain'],
)
request_latency_seconds = prometheus_client.Histogram(
    'feed_request_latency_seconds',
    'Latency of feed requests in seconds',
    ['domain', 'code'],
    buckets=[0.25, 0.5, 0.75, 1, 2],
)

# Cache session settings:
# responses are cached for an hour and stale data is served on errors
cached_session = requests_cache.CachedSession(
    name="hour-cache",
    expire_after=datetime.timedelta(hours=1),
    backend=os.environ.get('REQUEST_CACHE_BACKEND') or 'memory',
    old_data_on_error=True,
)
# Retry transient server errors on HTTPS requests, up to 5 times with backoff
cached_session.mount(
    'https://',
    HTTPAdapter(
        max_retries=Retry(
            total=5,
            backoff_factor=0.1,
            status_forcelist=[500, 502, 503, 504],
        )
    )
)


def get_rss_feed_content(url, offset=0, limit=6, exclude_items_in=None):
    """
    Get the entries from an RSS feed.

    Inspired by https://github.com/canonical-webteam/get-feeds/,
    minus the Django-specific parts.
    """

    logger = logging.getLogger(__name__)
    end = limit + offset if limit is not None else None

    try:
        response = cached_request(url)
    except Exception as request_error:
        logger.warning(
            'Attempt to get feed failed: {}'.format(str(request_error))
        )
        return False

    try:
        feed_data = feedparser.parse(response.text)
        if not feed_data.feed:
            logger.warning('No valid feed data found at {}'.format(url))
            return False
        content = feed_data.entries
    except Exception as parse_error:
        logger.warning(
            'Failed to parse feed from {}: {}'.format(url, str(parse_error))
        )
        return False

    if exclude_items_in:
        exclude_ids = [item['guid'] for item in exclude_items_in]
        content = [item for item in content if item['guid'] not in exclude_ids]

    content = content[offset:end]

    for item in content:
        updated_time = time.mktime(item['updated_parsed'])
        item['updated_datetime'] = datetime.datetime.fromtimestamp(
            updated_time
        )

    return content


def cached_request(url):
    """
    Retrieve the response through the requests cache.

    If the cache has expired, attempt to update it; if that update fails,
    fall back to the cached response, if one exists.
    """

    response = cached_session.get(url, timeout=2)

    try:
        response.raise_for_status()
    except RequestException as request_error:
        failed_requests.labels(
            error_name=type(request_error).__name__,
            domain=urlparse(url).netloc,
        ).inc()
        raise request_error

    if response.from_cache:
        requested_from_cache_counter.labels(
            domain=urlparse(url).netloc,
        ).inc()
    else:
        request_latency_seconds.labels(
            domain=urlparse(url).netloc,
            code=response.status_code,
        ).observe(response.elapsed.total_seconds())

    return response
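

# A minimal usage sketch, not part of the original module: it shows how
# get_rss_feed_content() might be called directly. The feed URL is only an
# example, and it assumes the feed's entries carry 'title' and 'updated'
# fields (feedparser exposes both when the feed provides them).
if __name__ == '__main__':
    entries = get_rss_feed_content('https://ubuntu.com/blog/feed', limit=3)

    if entries is False:
        print('Could not retrieve or parse the feed')
    else:
        for entry in entries:
            # 'updated_datetime' is added by get_rss_feed_content above
            print(entry['title'], entry['updated_datetime'])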