-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtasks.py
116 lines (108 loc) · 4.26 KB
/
tasks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import collections
import datetime
import logging
import html5lib
from google.appengine.api import memcache
from google.appengine.api import urlfetch
from google.appengine.ext import db
from google.appengine.ext import deferred
import models
def ComputeTagCounts(account_key):
    """Recount how often each tag is used across an account's bookmarks.

    Tallies every tag name found on the account's bookmarks, stores one
    ``models.Tag`` entity per non-empty tag name (key name
    ``<account key name>:<tag name>``) and replaces the memcache entry
    ``<account key>:tags`` with the list of stored tag names.

    Args:
        account_key: Datastore key of the ``models.Account`` to process.

    Returns:
        None. Returns early, after logging, when the account does not exist.
    """
    account = models.Account.get(account_key)
    if account is None:
        # Same guard CheckBookmarks uses; without it the call below to
        # account.key() fails with AttributeError on a missing account.
        logging.info("Account not found %s", account_key)
        return

    account_key_name = account.key().name()
    logging.info("Started processing tags of %s", account.nickname)

    # Invalidate tags cache
    tags_cache_key = "%s:tags" % account.key()
    memcache.delete(tags_cache_key)

    # TODO Process all bookmarks at once!?
    # The tag key name is derived from the tag name alone, so counting by
    # name is enough; the key name is built once per distinct tag below.
    tag_counts = collections.defaultdict(int)
    for bookmark in account.bookmarks:
        for tag_name in bookmark.tags:
            tag_counts[tag_name] += 1

    # Empty tag names are skipped: they are never stored as Tag entities.
    tags = [models.Tag(key_name='%s:%s' % (account_key_name, name),
                       name=name,
                       count=count,
                       account=account)
            for name, count in tag_counts.items()
            if name]
    db.put(tags)

    if not memcache.add(tags_cache_key, [tag.name for tag in tags]):
        logging.error("Cannot set account tags in memcache")
    logging.info("Processed tags")
def CheckBookmarks(account_key):
    """Probe a batch of an account's bookmarks and record their HTTP status.

    Fetches up to 50 bookmarks of the account whose ``last_checked`` is more
    than two days old (oldest first), issues a HEAD request for each without
    following redirects, and stores the resulting status code and, for
    redirects, the target location. After saving the batch the query cursor
    is kept in memcache and the task re-enqueues itself with a 120 second
    countdown; it stops rescheduling once a fetch returns no bookmarks.

    Args:
        account_key: Datastore key of the ``models.Account`` to check.

    Returns:
        None.
    """
    account = models.Account.get(account_key)
    if account is None:
        logging.info("Account not found %s", account_key)
        return

    # Every status that points the client at another URI via the Location
    # header, not only 301/302.
    redirect_statuses = (301, 302, 303, 307, 308)

    last_checked_date = (datetime.datetime.utcnow()
                         - datetime.timedelta(days=2))
    query = (models.Bookmark.all()
             .filter('account =', account)
             .filter('last_checked <', last_checked_date)
             .order('last_checked'))

    # Resume from where the previous run stopped, if it left a cursor.
    cursor_key = 'check_bookmarks_cursor:%s' % account_key
    cursor = memcache.get(cursor_key)
    if cursor:
        query.with_cursor(cursor)
        memcache.delete(cursor_key)

    bookmarks = query.fetch(50)
    if not bookmarks:
        return

    for bookmark in bookmarks:
        logging.info("Checking: %s", bookmark.uri)
        try:
            result = urlfetch.fetch(bookmark.uri, follow_redirects=False,
                                    method='HEAD')
            bookmark.last_status_code = result.status_code
            logging.info("Got %s from %s", result.status_code, bookmark.uri)
            # TODO Get the contents and put it to the blobstore (on 200)
            if result.status_code in redirect_statuses:
                # Fall back to the bookmark's own URI when the response
                # carries no Location header.
                bookmark.redirected = result.headers.get('location',
                                                         bookmark.uri)
        except urlfetch.DownloadError:
            # A failed download is recorded as a server error.
            bookmark.last_status_code = 500
        except Exception as e:
            # Best effort: one bad bookmark must not abort the whole batch.
            logging.error("Exception: %s", e)
        # Stamp the bookmark even on failure so it leaves the "stale" query.
        bookmark.last_checked = datetime.datetime.utcnow()

    db.put(bookmarks)
    memcache.set(cursor_key, query.cursor())
    deferred.defer(CheckBookmarks, account_key, _countdown=120)
def ImportBookmarks(import_key):
    """Create bookmarks from an uploaded HTML bookmarks file.

    Parses the blob attached to the ``models.Import`` entity with html5lib,
    builds one ``models.Bookmark`` per ``<a>`` element whose ``href`` is an
    http(s) URI, stores them in one batch, marks the import as done and
    enqueues ``ComputeTagCounts`` for the owning account.

    Attributes read from each link: ``href``, ``private`` ("1" means
    private), ``add_date`` (seconds since the epoch; the current UTC time is
    used when it is missing or unparsable) and ``tags`` (comma separated).

    Args:
        import_key: Datastore key of the ``models.Import`` to process.

    Returns:
        None. Returns early, after logging, when the import does not exist.
    """
    bookmark_import = models.Import.get(import_key)
    if bookmark_import is None:
        # Mirrors the missing-entity guard in CheckBookmarks.
        logging.info("Import not found %s", import_key)
        return

    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder('dom'))
    dom_tree = parser.parse(bookmark_import.blob.open())

    account = bookmark_import.account
    account_key_name = account.key().name()  # loop invariant
    bookmarks = []
    for link in dom_tree.getElementsByTagName('a'):
        uri = link.getAttribute('href')
        # Accept secure links as well; anything else (javascript:, feed:,
        # relative paths, ...) is not importable.
        if not uri.startswith(('http://', 'https://')):
            continue

        # Title is the concatenation of the link's direct text nodes.
        title = ''.join(node.data
                        for node in link.childNodes
                        if node.nodeType == node.TEXT_NODE)
        uri_digest = models.Bookmark.get_digest_for_uri(uri)
        key = '%s:%s' % (account_key_name, uri_digest)
        is_private = link.getAttribute('private') == '1'

        try:
            created = datetime.datetime.utcfromtimestamp(
                float(link.getAttribute('add_date')))
        except (TypeError, ValueError, OverflowError):
            # Missing, non-numeric or out-of-range timestamp.
            created = datetime.datetime.utcnow()

        # Normalise tags and drop empty entries produced by input such as
        # "a,,b" or a trailing comma.
        raw_tags = link.getAttribute('tags')
        tags = [tag
                for tag in (part.strip().lower()
                            for part in raw_tags.split(','))
                if tag]

        bookmarks.append(models.Bookmark(
            key_name=key, account=account, uri_digest=uri_digest,
            title=title, uri=uri, private=is_private, created=created,
            modified=created, tags=tags))

    db.put(bookmarks)

    # Mark this task as completed
    bookmark_import.status = bookmark_import.DONE
    bookmark_import.processed = datetime.datetime.utcnow()
    bookmark_import.put()

    # Remove blob
    # TODO The following line does not seem to be working!?
    # blobstore.delete(bookmark_import.blob)
    deferred.defer(ComputeTagCounts, account.key())