forked from Linuxfabrik/lib
-
Notifications
You must be signed in to change notification settings - Fork 0
/
feedparser3.py
106 lines (86 loc) · 3.19 KB
/
feedparser3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#! /usr/bin/env python3
# -*- coding: utf-8; py-indent-offset: 4 -*-
#
# Author: Linuxfabrik GmbH, Zurich, Switzerland
# Contact: info (at) linuxfabrik (dot) ch
# https://www.linuxfabrik.ch/
# License: The Unlicense, see LICENSE file.
# https://github.com/Linuxfabrik/monitoring-plugins/blob/main/CONTRIBUTING.rst
"""Parse Atom and RSS feeds in Python.
Time zone handling is not implemented.
"""
__author__ = 'Linuxfabrik GmbH, Zurich/Switzerland'
__version__ = '2022021601'
import sys
from .globals3 import STATE_UNKNOWN
try:
from bs4 import BeautifulSoup
except ImportError as e:
print('Python module "BeautifulSoup4" is not installed.')
sys.exit(STATE_UNKNOWN)
from . import time3
from . import url3
def parse_atom(soup):
    """Extract title, update timestamp and entries from a parsed Atom feed.

    Time zone handling is not implemented: the timezone suffix of every
    timestamp is simply cut off before parsing.

    ### Parameters
    * soup (bs4.BeautifulSoup): Feed document parsed with the 'xml' parser;
      must contain a top-level Atom `<feed>` element.

    ### Returns
    * dict: {'title', 'updated', 'updated_parsed', 'entries'}, where
      'entries' is a list of dicts with 'title', 'id', 'updated',
      'updated_parsed' and (if available) 'summary'.
    """
    result = {}
    result['title'] = soup.title.string
    result['updated'] = soup.updated.string
    # cut the timezone part ('2022-02-16T12:00:00+01:00' -> first 19 chars)
    result['updated_parsed'] = time3.timestr2datetime(result['updated'][0:19], pattern='%Y-%m-%dT%H:%M:%S')
    result['entries'] = []
    for entry in soup.find_all('entry'):
        tmp = {}
        tmp['title'] = entry.title.string
        tmp['id'] = entry.id.string
        tmp['updated'] = entry.updated.string
        # cut the timezone part
        tmp['updated_parsed'] = time3.timestr2datetime(tmp['updated'][0:19], pattern='%Y-%m-%dT%H:%M:%S')
        # Prefer <summary>, fall back to <content>; strip embedded HTML.
        # Use a dedicated local instead of shadowing the `soup` parameter,
        # and catch Exception instead of a bare `except:` so SystemExit and
        # KeyboardInterrupt are not swallowed. Missing tags leave 'summary'
        # unset, as before.
        for tag in (entry.summary, entry.content):
            try:
                tmp['summary'] = BeautifulSoup(tag.string, 'lxml').get_text()
                break
            except Exception:
                continue
        result['entries'].append(tmp)
    return result
def parse_rss(soup):
    """Extract title, update timestamp and entries from a parsed RSS feed.

    Time zone handling is not implemented: the timezone suffix of every
    timestamp is simply cut off before parsing.

    ### Parameters
    * soup (bs4.BeautifulSoup): Feed document parsed with the 'xml' parser;
      must contain a top-level `<rss><channel>` structure.

    ### Returns
    * dict: {'title', 'updated', 'updated_parsed', 'entries'}, where
      'entries' is a list of dicts with 'title', 'id', 'updated',
      'updated_parsed' and (if available) 'summary'.
    """
    result = {}
    result['title'] = soup.rss.channel.title.string
    result['updated'] = soup.rss.channel.pubDate.string
    # cut the timezone part ('Wed, 16 Feb 2022 12:00:00 +0100' -> first 25 chars)
    result['updated_parsed'] = time3.timestr2datetime(result['updated'][0:25], pattern='%a, %d %b %Y %H:%M:%S')
    result['entries'] = []
    for entry in soup.find_all('item'):
        tmp = {}
        tmp['title'] = entry.title.string
        tmp['id'] = entry.guid.string
        tmp['updated'] = entry.pubDate.string
        # cut the timezone part
        tmp['updated_parsed'] = time3.timestr2datetime(tmp['updated'][0:25], pattern='%a, %d %b %Y %H:%M:%S')
        # Strip embedded HTML from the description. Use a dedicated local
        # instead of shadowing the `soup` parameter, and catch Exception
        # instead of a bare `except:` so SystemExit/KeyboardInterrupt are
        # not swallowed. A missing <description> leaves 'summary' unset,
        # as before.
        try:
            summary_soup = BeautifulSoup(entry.description.string, 'lxml')
            tmp['summary'] = summary_soup.get_text()
        except Exception:
            pass
        result['entries'].append(tmp)
    return result
def parse(feed_url, insecure=False, no_proxy=False, timeout=5, encoding='urlencode'):
    """Fetch and parse an Atom or RSS feed.

    ### Parameters
    * feed_url (str): URL (or file/stream/string accepted by url3.fetch)
      of the feed to parse.
    * insecure, no_proxy, timeout, encoding: passed through to url3.fetch().

    ### Returns
    * tuple: (True, feed_dict) on success; (False, error) on fetch or
      parse failure, where `error` is a message or the caught exception.
    """
    fetched, payload = url3.fetch(
        feed_url,
        insecure=insecure,
        no_proxy=no_proxy,
        timeout=timeout,
        encoding=encoding,
    )
    if not fetched:
        return (False, payload)
    try:
        soup = BeautifulSoup(payload, 'xml')
    except Exception as e:
        return (False, e)
    # Atom wraps everything in <feed>; RSS uses a top-level <rss> element.
    if soup.feed is not None:
        return (True, parse_atom(soup))
    if soup.rss is not None:
        return (True, parse_rss(soup))
    return (False, '{} does not seem to be an Atom or RSS feed I understand.'.format(feed_url))