-
Notifications
You must be signed in to change notification settings - Fork 0
/
PickSlide.py
113 lines (96 loc) · 4.03 KB
/
PickSlide.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import ConfigParser
from os.path import expanduser, join
from bulbs.neo4jserver import Graph, Config, NEO4J_URI
import sys
from UserDownloader import UserDownloader
from TabDownloader import TabDownloader
from TabSpider import TabSpider
from TabData import TabData
from User import User
from Comment import Comment
def main(argv=None):
    """Crawl tab pages and store tabs, instruments, comments and tabbers in Neo4j.

    Reads ``~/scrape.conf`` (section ``info``, options ``salt``, ``tab_page``,
    ``domain`` and ``delay``), clears the graph at ``NEO4J_URI``, then
    downloads every URL the ``TabSpider`` hands out.  For each tab it writes
    one vertex for the tab, one per distinct instrument, one per comment and
    one per distinct tabber, linked by ``has_instrument``, ``has_comment``
    and ``tabbed`` edges.

    Args:
        argv: Unused; kept so the function has the conventional
            ``main(argv)`` signature.

    Returns:
        None.
    """
    # Config processing
    conf_file = join(expanduser("~"), "scrape.conf")
    parser = ConfigParser.ConfigParser()
    parser.read(conf_file)
    salt = parser.get('info', 'salt')
    tab_page = parser.get('info', 'tab_page')
    domain = parser.get('info', 'domain')
    delay = parser.getint('info', 'delay')

    # Start graph
    graph_config = Config(NEO4J_URI)
    g = Graph(graph_config)
    # WARNING: this wipes the whole database on every run.  Remove the call
    # when working with a persistent data store.
    g.clear()

    # Local indices so each instrument / tabber is stored as a single vertex
    # and later tabs link to the existing one.
    users = {}
    instruments = {}

    # Page crawlers
    tab_loader = TabDownloader(domain, tab_page, salt, delay)
    user_loader = UserDownloader(domain, delay)

    # Source of tab URLs (not a real Python iterator)
    resource = TabSpider(domain)

    # Start crawling!
    while resource.has_more():
        tab_info = tab_loader.tab_download(resource.next_url())
        # has_more() alone is not a reliable stop condition (original
        # author's note), so a falsy download result also ends the crawl.
        if not tab_info:
            break

        # Store base tab
        tab_node = g.vertices.create(name=tab_info.tab_file)
        tab_node.tab_file = tab_info.tab_file
        tab_node.title = tab_info.title
        tab_node.version = tab_info.version
        tab_node.rating = tab_info.rating
        tab_node.num_ratings = tab_info.num_ratings
        tab_node.num_comments = tab_info.num_comments
        tab_node.label = "tab"
        tab_node.save()

        # Add instruments, creating each instrument vertex only once
        for instrument in tab_info.instruments:
            if instrument not in instruments:
                i_node = g.vertices.create(name=instrument)
                i_node.label = "instrument"
                i_node.save()
                instruments[instrument] = i_node
            g.edges.create(tab_node, "has_instrument", instruments[instrument])

        # Add comments (save_comment recurses into child comments)
        for comment in tab_info.comments or ():
            g.edges.create(tab_node, "has_comment", save_comment(g, comment))

        # Nothing more to do for tabs without a known tabber
        if not tab_info.tabber:
            continue

        # Get info on the tabber if we don't have it yet
        if tab_info.tabber not in users:
            tabber = user_loader.load_user(tab_info.tabber)
            if not tabber:
                # User page could not be loaded: skip the "tabbed" edge.
                continue
            # Fall back to an empty name when the user record has none.
            # (The original computed this fallback but then read the
            # non-local ``tabber.tempname`` instead of using it.)
            u_node = g.vertices.create(name=tabber.name or "")
            u_node.registration_date = tabber.registration_date
            u_node.num_contributions = tabber.num_contributions
            u_node.rank = tabber.rank
            u_node.save()
            users[tab_info.tabber] = u_node

        # Add tab to tabber's transcriptions
        g.edges.create(users[tab_info.tabber], "tabbed", tab_node)

    # Parenthesised single-argument form prints the same text under Python 2
    # and also parses under Python 3.
    print("Finished crawl! Woah!")
def save_comment(g, comment):
    """Store *comment* and all of its replies in the graph; return its vertex.

    A vertex named ``"<author>,<date>"`` is created for the comment, its
    ``author``, ``content``, ``rating`` and ``date`` are copied onto it, and
    it is saved.  Each entry of ``comment.child_comments`` is stored the same
    way and linked from this vertex with a ``has_comment`` edge.

    Args:
        g: Graph object exposing ``vertices.create(name=...)`` and
            ``edges.create(out_vertex, label, in_vertex)``.
        comment: Object with ``author``, ``date``, ``content``, ``rating``
            and ``child_comments`` attributes.

    Returns:
        The vertex created for *comment*.  Callers use it as the target of
        their own ``has_comment`` edge, so it must not be omitted.
    """
    # Create comment node
    c_node = g.vertices.create(name=comment.author + "," + comment.date)
    c_node.author = comment.author  # could create user node for this if wanted
    c_node.content = comment.content
    c_node.rating = comment.rating
    c_node.date = comment.date
    c_node.save()

    # Recursively save child comments; tolerate a missing (None) child list.
    for child in comment.child_comments or ():
        g.edges.create(c_node, "has_comment", save_comment(g, child))

    # The original fell off the end here and returned None, so every
    # "has_comment" edge was created with None as its target.
    return c_node
# Script entry point: run the crawl only when executed directly and hand
# main()'s return value to the interpreter as the exit status.
if "__main__" == __name__:
    exit_status = main()
    sys.exit(exit_status)