forked from iliana/python-simplemediawiki
-
Notifications
You must be signed in to change notification settings - Fork 0
/
simplemediawiki.py
336 lines (292 loc) · 12.8 KB
/
simplemediawiki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
# python-simplemediawiki - Extremely low-level wrapper to the MediaWiki API
# Copyright (C) 2011 Red Hat, Inc.
# Primary maintainer: Ian Weller <[email protected]>
#
# This library is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 2.1 of the License, or (at your option)
# any later version.
#
# This library is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
:py:mod:`simplemediawiki` is an extremely low-level wrapper to the `MediaWiki
API`_. It automatically handles cookies and gzip compression so that you can
make basic calls to the API in the easiest and fastest way possible. It also
provides a few functions to make day-to-day API access easier.
To use this module, initialize a :py:class:`MediaWiki` object, passing it the
URL of api.php for the wiki you want to work with. Calls go through
:py:func:`MediaWiki.call`. A generic login wrapper as well as functions to
determine limits and get a list of namespaces are provided for your
convenience.
>>> from simplemediawiki import MediaWiki
>>> wiki = MediaWiki('http://en.wikipedia.org/w/api.php')
>>> wiki.call({'action': 'query', 'prop': 'revisions', 'titles': 'Main Page'})
{u'query': {u'pages': {...}}}
.. _`MediaWiki API`: http://www.mediawiki.org/wiki/API:Main_page
"""
import datetime
import gzip
from io import BytesIO

try:
    import cookielib as cookiejar
except ImportError:
    from http import cookiejar
try:
    import simplejson as json
except ImportError:
    import json
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
try:
    from urllib import urlencode
    import urllib2 as urequest
    import urllib2 as uparse
    import urllib2 as uerror
except ImportError:
    from urllib.parse import urlencode
    import urllib.request as urequest
    import urllib.parse as uparse
    import urllib.error as uerror
# Package metadata.
__author__ = 'Ian Weller <[email protected]>'
__version__ = '1.1'

# User-Agent header value used when the caller does not supply one; it embeds
# the library version defined just above.
DEFAULT_UA = (
    'python-simplemediawiki/%s '
    '+https://github.com/ianweller/python-simplemediawiki'
) % __version__
class MediaWiki(object):
    """
    Create a new object to access a wiki via *api_url*.

    If you're interested in saving session data across multiple
    :py:class:`MediaWiki` objects, provide a filename *cookie_file* to where
    you want to save the cookies.

    Applications that use simplemediawiki should change the *user_agent*
    argument to something that can help identify the application if it is
    misbehaving. It's recommended to use :py:func:`build_user_agent` to create
    a `User-Agent`_ string that will be most helpful to server administrators.
    Wikimedia sites enforce using a correct User-Agent; you should read
    `Wikimedia's User-Agent policy`_ if you plan to be accessing those wikis.

    .. tip::
       If a user of your application may not know how to get the correct API
       URL for their MediaWiki, you can try getting the right one with
       :py:func:`MediaWiki.normalize_api_url`.

    :param api_url: URL for the path to the API endpoint
    :param cookie_file: path to a :py:class:`cookielib.MozillaCookieJar` file
    :param user_agent: string sent as ``User-Agent`` header to web server

    .. _`User-Agent`: http://en.wikipedia.org/wiki/User_agent
    .. _`Wikimedia's User-Agent policy`:
       http://meta.wikimedia.org/wiki/User-Agent_policy
    """
    # Per-instance caches, filled lazily by limits() and namespaces().
    # ``None`` means "not fetched yet".
    _high_limits = None
    _namespaces = None
    _psuedo_namespaces = None

    def __init__(self, api_url, cookie_file=None, user_agent=DEFAULT_UA):
        self._api_url = api_url
        if cookie_file:
            self._cj = cookiejar.MozillaCookieJar(cookie_file)
            try:
                self._cj.load()
            except IOError:
                # Loading failed (typically the file does not exist yet):
                # write out the empty jar so the file exists, then load it.
                self._cj.save()
                self._cj.load()
        else:
            # No file given: cookies live in memory only.
            self._cj = cookiejar.CookieJar()
        self._opener = urequest.build_opener(
            urequest.HTTPCookieProcessor(self._cj)
        )
        self._opener.addheaders = [('User-Agent', user_agent)]

    def _fetch_http(self, url, params):
        """
        Standard HTTP request handler for this class with gzip and cookie
        support. This was separated out of :py:func:`MediaWiki.call` to make
        :py:func:`MediaWiki.normalize_api_url` useful.

        .. note::
           This function should not be used. Use :py:func:`MediaWiki.call`
           instead.

        The caller's *params* dictionary is left untouched; ``format=json``
        is added to a private copy. Exceptions raised while opening the
        request are not caught here.

        :param url: URL to send POST request to
        :param params: dictionary of query string parameters
        :returns: the response body decoded as UTF-8 text
        """
        # Copy first so that forcing the output format never leaks back into
        # the dictionary owned by the caller.
        query = dict(params)
        query['format'] = 'json'
        # The POST body handed to Request must be bytes, not text.
        request = urequest.Request(url, urlencode(query).encode('utf-8'))
        request.add_header('Accept-encoding', 'gzip')
        response = self._opener.open(request)
        try:
            # Persist any cookies the server just set when a file-backed jar
            # is in use.
            if isinstance(self._cj, cookiejar.MozillaCookieJar):
                self._cj.save()
            data = response.read()
            if response.headers.get('Content-Encoding') == 'gzip':
                # The payload is raw bytes, so it needs a bytes buffer; a text
                # StringIO rejects bytes on Python 3.
                data = gzip.GzipFile(fileobj=BytesIO(data)).read()
        finally:
            response.close()
        return data.decode('utf-8')

    def call(self, params):
        """
        Make an API call to the wiki. *params* is a dictionary of query string
        arguments. For example, to get basic information about the wiki, run:

            >>> wiki.call({'action': 'query', 'meta': 'siteinfo'})

        which would make a call to
        ``http://domain/w/api.php?action=query&meta=siteinfo&format=json``
        (except the query string would be sent in POST).

        :param params: dictionary of query string parameters
        :returns: dictionary containing API response
        """
        return json.loads(self._fetch_http(self._api_url, params))

    def normalize_api_url(self):
        """
        Checks that the API URL used to initialize this object actually returns
        JSON. If it doesn't, make some educated guesses and try to find the
        correct URL.

        When a guessed URL works, it also replaces the URL stored on this
        object, so later calls use it.

        :returns: a valid API URL or ``None``
        """
        def returns_json(api_url):
            """
            Fetch general site information from *api_url* and report whether
            the body parsed as (non-empty) JSON.
            """
            body = self._fetch_http(api_url, {'action': 'query',
                                              'meta': 'siteinfo'})
            try:
                return bool(json.loads(body))
            except ValueError:
                return False

        if returns_json(self._api_url):
            return self._api_url
        # if there's an index.php in the URL, we might find the API next to it
        if 'index.php' in self._api_url:
            candidate = self._api_url.split('index.php')[0] + 'api.php'
            if returns_json(candidate):
                self._api_url = candidate
                return self._api_url
        return None

    def login(self, user, passwd):
        """
        Logs into the wiki with username *user* and password *passwd*. Returns
        ``True`` on successful login.

        Handles CSRF protection (see `MediaWiki bug 23076`_): when the first
        attempt is answered with ``NeedToken``, the request is repeated once
        with the token the server supplied. A successful login discards the
        cached answer used by :py:func:`MediaWiki.limits`.

        :param user: username
        :param passwd: password
        :returns: ``True`` on successful login, otherwise ``False``

        .. _`MediaWiki bug 23076`:
           https://bugzilla.wikimedia.org/show_bug.cgi?id=23076
        """
        data = {'action': 'login',
                'lgname': user,
                'lgpassword': passwd}
        outcome = self.call(data)['login']
        if outcome['result'] == 'NeedToken':
            # Retry exactly once with the token; a second NeedToken is
            # treated as a failed login rather than retried again.
            data['lgtoken'] = outcome['token']
            outcome = self.call(data)['login']
        if outcome['result'] == 'Success':
            # Rights may differ for the logged-in user; force a re-query.
            self._high_limits = None
            return True
        return False

    def logout(self):
        """
        Logs out of the wiki and discards the cached answer used by
        :py:func:`MediaWiki.limits`.

        :returns: ``True``
        """
        self.call({'action': 'logout'})
        self._high_limits = None
        return True

    def limits(self, low, high):
        """
        Convenience function for determining appropriate limits in the API. If
        the (usually logged-in) client has the ``apihighlimits`` right, it will
        return *high*; otherwise it will return *low*.

        It's generally a good idea to use the highest limit possible; this
        reduces the amount of HTTP requests and therefore overhead. Read the
        API documentation for details on the limits for the function you are
        using.

        The rights lookup is performed once and cached until the next
        login or logout.

        :param low: value to return if client does not have ``apihighlimits``
        :param high: value to return if client has ``apihighlimits``
        :returns: *low* or *high*
        """
        if self._high_limits is None:
            result = self.call({'action': 'query',
                                'meta': 'userinfo',
                                'uiprop': 'rights'})
            rights = result['query']['userinfo']['rights']
            self._high_limits = 'apihighlimits' in rights
        return high if self._high_limits else low

    def namespaces(self, psuedo=True):
        """
        Fetches a list of namespaces for this wiki and returns them as a
        dictionary of namespace IDs corresponding to namespace names. If
        *psuedo* is ``True``, the dictionary will also list psuedo-namespaces,
        which are the "Special:" and "Media:" namespaces (special because they
        have no content associated with them and their IDs are negative).

        The list is fetched once and cached; every call returns a fresh
        dictionary, so modifying the result does not affect the cache.

        :param psuedo: boolean to determine inclusion of psuedo-namespaces
        :returns: dictionary of namespace IDs and names
        """
        if self._namespaces is None:
            result = self.call({'action': 'query',
                                'meta': 'siteinfo',
                                'siprop': 'namespaces'})
            regular = {}
            negative = {}
            for raw_id, info in result['query']['namespaces'].items():
                nsid = int(raw_id)
                if nsid >= 0:
                    regular[nsid] = info['*']
                else:
                    negative[nsid] = info['*']
            # Publish the caches only once parsing succeeded, so a malformed
            # response cannot leave a half-filled cache behind.
            self._namespaces = regular
            self._psuedo_namespaces = negative
        retval = dict(self._namespaces)
        if psuedo:
            retval.update(self._psuedo_namespaces)
        return retval

    @staticmethod
    def parse_date(date):
        """
        Converts `ISO 8601`_ dates generated by the MediaWiki API into
        :py:class:`datetime.datetime` objects.

        This will return a time in what your wiki thinks is UTC. Plan
        accordingly for bad server configurations.

        .. _`ISO 8601`: http://en.wikipedia.org/wiki/ISO_8601

        :param date: string ISO 8601 date representation
        :returns: :py:class:`datetime.datetime` object
        :raises ValueError: if *date* does not match ``YYYY-MM-DDTHH:MM:SSZ``
        """
        # MediaWiki API dates are always of the format
        #   YYYY-MM-DDTHH:MM:SSZ
        # (see $formats in wfTimestamp() in includes/GlobalFunctions.php)
        return datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
def build_user_agent(application_name, version, url):
    """
    Build a good User-Agent header string that can help server administrators
    contact you if your application is misbehaving. This string will also
    contain a reference to python-simplemediawiki.

    See the documentation for :py:class:`simplemediawiki.MediaWiki` for good
    reasons why you should use a custom User-Agent string for your application.

    :param application_name: your application's name
    :param version: your application's version
    :param url: a URL where someone can find information about your \
application or your email address
    :returns: User-Agent string
    """
    # "<name>/<version>" token for the calling application ...
    app_token = '%s/%s' % (application_name, version)
    # ... followed by the same kind of token for this library.
    lib_token = '%s/%s' % ('python-simplemediawiki', __version__)
    return '%s %s (+%s)' % (app_token, lib_token, url)