-
Notifications
You must be signed in to change notification settings - Fork 2
/
mozilla_calendar.py
119 lines (98 loc) · 3.56 KB
/
mozilla_calendar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Source iCal feed read by this scraper:
# https://calendar.google.com/calendar/ical/mozilla.com_tptb36ac7eijerilfnf6c1onfo%40group.calendar.google.com/public/basic.ics
import re
from datetime import datetime
import dateparser
import requests
import ics
from urlextract import URLExtract
import sessionize
# Regional indicator symbols (the code points flag emoji are built from) form
# one contiguous run that mirrors the letters A-Z.
FLAG_A = ord('🇦')  # U+1F1E6 REGIONAL INDICATOR SYMBOL LETTER A
# Code point of the LAST regional indicator (letter Z, U+1F1FF). Z sits 25
# code points after A; using 26 would pull the unrelated U+1F200 into the
# inclusive range check performed by convert_flags.
FLAG_Z = FLAG_A + 25
# Amount to subtract from a regional indicator to obtain its ASCII letter.
FLAG_OFFSET = FLAG_A - ord('A')
# Single URLExtract instance created at import time and shared by every
# parse_event_url call.
URL_EXTRACTOR = URLExtract()
def fetch_cal(url=None, timeout=30):
    """Download an iCal feed and parse it into an ``ics.Calendar``.

    Args:
        url: Address of the feed. When omitted, the hard-coded public Google
            Calendar feed is used.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the scraper forever.

    Returns:
        The ``ics.Calendar`` built from the response body.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other transport failure (including a
            timeout).
    """
    if url is None:
        url = 'https://calendar.google.com/calendar/ical/mozilla.com_tptb36ac7eijerilfnf6c1onfo%40group.calendar.google.com/public/basic.ics'
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error rather than feeding an error page to the
    # iCal parser.
    response.raise_for_status()
    return ics.Calendar(response.text)
def convert_flags(s):
    """Return *s* with flag-letter code points replaced by shifted characters.

    Every character whose code point lies between FLAG_A and FLAG_Z
    (inclusive) is replaced by the character FLAG_OFFSET code points below
    it; all other characters are copied through unchanged.
    """
    converted = []
    for ch in s:
        code = ord(ch)
        if FLAG_A <= code <= FLAG_Z:
            converted.append(chr(code - FLAG_OFFSET))
        else:
            converted.append(ch)
    return ''.join(converted)
def parse_event_url(evt):
    """Return the first URL found in an event's description, if any.

    Args:
        evt: Object exposing a ``description`` attribute (text or ``None``).

    Returns:
        The first URL reported by the shared URL extractor, or ``None`` when
        the description is missing/empty or contains no URL.
    """
    description = evt.description
    # Events without a description have nothing to scan; skip the extractor
    # instead of handing it None.
    if not description:
        return None
    links = URL_EXTRACTOR.find_urls(description)
    return links[0] if links else None
def parse_date(raw_date, relative_to):
    """Split a "dates + remainder" label into start date, end date and rest.

    Three layouts are recognised, tried from most to least specific:

    * ``<word> <digits> - <word> <digits><rest>`` (each end has its own month)
    * ``<word> <digits> - <digits><rest>`` (end day reuses the start month)
    * ``<word> <digits><rest>`` (single day; start and end are equal)

    Args:
        raw_date: Text to interpret.
        relative_to: Datetime handed to dateparser as ``RELATIVE_BASE`` (its
            tzinfo is dropped first); ``PREFER_DATES_FROM`` is set to
            ``'future'``.

    Returns:
        A ``(start, end, rest)`` tuple where ``start``/``end`` are ``date``
        objects and ``rest`` is the unparsed tail of ``raw_date``. When no
        layout matches, or dateparser cannot interpret the captured text,
        ``(None, None, raw_date)`` is returned.
    """
    settings = {
        'PREFER_DATES_FROM': 'future',
        'RELATIVE_BASE': relative_to.replace(tzinfo=None),
    }

    def to_date(text):
        # dateparser.parse yields None for text it cannot interpret; pass that
        # through instead of crashing on ``None.date()``.
        parsed = dateparser.parse(text, settings=settings)
        return parsed.date() if parsed is not None else None

    # (pattern, groups forming the start, groups forming the end, rest group)
    layouts = (
        (r'^(\w+) (\d+)\s*-\s*(\w+) (\d+)(.*)$', (1, 2), (3, 4), 5),
        (r'^(\w+) (\d+)\s*-\s*(\d+)(.*)$', (1, 2), (1, 3), 4),
        (r'^(\w+) (\d+)(.*)$', (1, 2), (1, 2), 3),
    )
    for pattern, start_groups, end_groups, rest_group in layouts:
        md = re.search(pattern, raw_date)
        if not md:
            continue
        start = to_date(' '.join(md.group(*start_groups)))
        if end_groups == start_groups:
            end = start
        else:
            end = to_date(' '.join(md.group(*end_groups)))
        if start is None or end is None:
            # The most specific matching layout was not a real date; treat
            # the whole label as unparsed rather than guessing further.
            break
        return (start, end, md.group(rest_group))
    return (None, None, raw_date)
def parse_event_name(label, relative_to):
    """Turn a calendar entry title into a dict of conference fields.

    The title is first passed through convert_flags. If it does not end in a
    parenthesised group, only ``'Conference Name'`` (the stripped title) is
    returned. Otherwise the text before the parentheses becomes the name
    (with ``CFP`` and stand-alone colons removed, whitespace collapsed) and
    the text inside them is handed to parse_date, using ``relative_to`` as
    the reference moment.

    Returns:
        A dict that always has ``'Conference Name'`` and, when non-empty,
        ``'Conference Start Date'``, ``'Conference End Date'`` and
        ``'Location'``.
    """
    label = convert_flags(label)
    match = re.search(r'^(.*) \((.*?)\)$', label)
    if match is None:
        return {'Conference Name': label.strip()}

    title = match.group(1)
    details = match.group(2)

    # Drop the literal "CFP" marker and any colon left standing on its own,
    # then squeeze runs of whitespace down to single spaces.
    title = title.replace('CFP', '')
    title = re.sub(r'(^| ):( |$)', ' ', title)
    title = re.sub(r'\s+', ' ', title).strip()

    begins, ends, place = parse_date(details, relative_to)
    # Whatever followed the dates is the location; trim the separating comma.
    place = place.lstrip(',').strip()

    optional_fields = (
        ('Conference Start Date', begins),
        ('Conference End Date', ends),
        ('Location', place),
    )
    result = {'Conference Name': title}
    result.update((key, value) for key, value in optional_fields if value)
    return result
def parse_events(cal):
    """Yield one dict of conference/CFP fields per recent calendar event.

    Events whose start lies more than one year before the current UTC time
    are skipped. For the rest, the title is parsed with parse_event_name, the
    event's own ``location`` (when set) overrides any location parsed from
    the title, the event start is stored as ``'CFP End Date'``, and the first
    URL in the description (if any) is stored as both ``'Conference URL'``
    and ``'CFP URL'``.

    Args:
        cal: Object exposing an iterable ``events`` attribute; each event is
            expected to provide ``begin.datetime``, ``name``, ``location``
            and ``description``.

    Yields:
        dict: The fields collected for one event.
    """
    # Skip anything that closed more than a year ago.
    now = datetime.utcnow()
    try:
        cutoff = now.replace(year=now.year - 1)
    except ValueError:
        # Today is Feb 29 and the previous year has no such day; fall back to
        # Feb 28 instead of crashing.
        cutoff = now.replace(year=now.year - 1, day=28)

    for evt in cal.events:
        begins = evt.begin.datetime
        # NOTE(review): tzinfo is dropped without converting to UTC before the
        # comparison with the naive UTC cutoff - confirm the feed is in UTC.
        begins_naive = begins.replace(tzinfo=None)
        if begins_naive < cutoff:
            continue

        data = parse_event_name(evt.name, begins)
        if evt.location:
            data['Location'] = evt.location
        data['CFP End Date'] = begins_naive

        url = parse_event_url(evt)
        if url:
            data['Conference URL'] = data['CFP URL'] = url

        yield data
def scrape():
    """Yield the parsed calendar events that carry a usable CFP URL.

    Events without a ``'CFP URL'`` and events whose CFP URL mentions
    ``papercall.io`` are dropped. For CFP URLs mentioning ``sessionize.com``,
    any truthy result of ``sessionize.parse_event`` is merged into the event
    dict before it is yielded.
    """
    for record in parse_events(fetch_cal()):
        if record is not None and 'CFP URL' in record:
            cfp_url = record['CFP URL']
            if 'papercall.io' in cfp_url:
                continue
            if 'sessionize.com' in cfp_url:
                extra = sessionize.parse_event(cfp_url)
                if extra:
                    record.update(extra)
            yield record
if __name__ == '__main__':
    # Run as a script: dump every scraped event dict to stdout, one per line.
    for scraped in scrape():
        print(scraped)