chanspider.py
# Spiders a Kusaba-style imageboard and returns a string with each post separated by a newline.
# Has options to return only posts containing certain substrings, and to give a detailed, human-readable post listing. Do not use these for corpus-gathering.
# Also has an option to save all images seen to a folder. Can be useful in conjunction with the post corpus.
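# Requires Python 2 (for urllib2) and the lxml library.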
import urllib2, re, sys, string, random
import lxml.html
def spider(chan, boards, pages=3, detailed=False, search=None, namesearch=None, image_folder=""):
    """By default, simply saves all post content. Detailed mode adds board, name, trip, post number, and link.
    'search' expects a list of search words; 'namesearch' expects a name to search for (both only apply in
    detailed mode). Set 'image_folder' to save all images seen on the chan to that folder; it should end
    with a path separator, e.g. "images/". All name searches and word searches are case insensitive."""
    aggregation = ""
    boardposts = ""
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 5.1; rv:5.0.1) Gecko/20100101 Firefox/5.0.1'),
                         ('Referer', chan)]
    urllib2.install_opener(opener)
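    # Spoofed browser headers: the assumption here is that some boards refuse or
    # throttle non-browser clients, so a desktop Firefox User-agent and a Referer
    # are sent with every request made below. Harmless if the site doesn't check.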
    for board in boards.split():
        pages_spidered = 0
        boardpath = chan + board + "/"
        while pages_spidered < pages:
            if pages_spidered > 0:
                url = boardpath + "%s.html" % pages_spidered
            else:
                url = boardpath  # page 0 has no page number
            try:
                index = urllib2.urlopen(url, timeout=7).read()
            except:
                print "\nError connecting, probably due to a malfunctioning proxy (or the board being down). Try switching to Tor or using no proxy, or try again."
                sys.exit("Exited")
            index_html = lxml.html.fromstring(index.decode("ascii", "ignore"))
            divs = index_html.findall(".//div")
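            # Kusaba wraps each thread on the index page in a div whose id embeds
            # the thread number (e.g. id="thread12345"); all other divs are skipped.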
            for div in divs:
                try:
                    thread_div = div.attrib["id"]
                except KeyError:
                    continue  # not every div has an id
                if "thread" in thread_div:
                    threadID = re.search(r"\d+", thread_div).group()
                    thread = urllib2.urlopen(boardpath + "res/%s.html" % threadID, timeout=7).read()
                    thread = re.sub(r"<br.*?>", "\n", thread)  # <br> tags screw up the text extraction
                    thread_html = lxml.html.fromstring(thread.decode("ascii", "ignore"))
                    posts = thread_html.findall(".//blockquote")
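                    # Post bodies are <blockquote> elements. They appear in the same
                    # document order as the 'postername' spans and post-number inputs
                    # read below, which is what lets the shared index i line them up.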
                    if image_folder:
                        file_spans = thread_html.findall(".//span[@class='filesize']")
                        for span in file_spans:
                            image_link = span.getchildren()[0].attrib['href']
                            image_data = urllib2.urlopen(image_link, timeout=7).read()
                            # the original upload filename appears between a comma and the closing parenthesis of the filesize span
                            image_name = re.search(r", (.*)\n\n\)", span.text_content()).group(1)
                            try:
                                newimage = open(image_folder + image_name, "wb")
                            except IOError:
                                # when the filename contains a character that's invalid on your filesystem, fall back to a random string plus the original file extension
                                newimage = open(image_folder + ''.join(random.sample(string.ascii_lowercase, 7)) + re.search(r"\..*$", image_name).group(0), "wb")
                            newimage.write(image_data)
                            newimage.close()
                    i = 0
                    for post in posts:
                        text = post.text_content().strip()
                        content = ""
                        if detailed == True:
                            names = thread_html.findall(".//span[@class='postername']")
                            name = names[i].text
                            if name is None:  # the name may be wrapped in a child element such as an email link
                                for sub in names[i].iterchildren():
                                    name = sub.text
                            trip = ""
                            tripnode = names[i].getnext()
                            if tripnode is not None and tripnode.text is not None:
                                trip = tripnode.text
                            if name is None:
                                name = ""
                            # the per-post delete checkboxes (<input name='post[]'>) carry the post numbers as their values
                            postnumbers = thread_html.findall(".//input[@name='post[]']")
                            postnumber = postnumbers[i].attrib['value']
content+="\n==================================================\n"
if search or namesearch:
if (any(word.lower() in text.lower() for word in search) if search else None) or ((namesearch.lower() in name.lower()) if namesearch else None):
links=thread_html.findall(".//span[@class='reflink']")
for link in links[i].iterchildren():
content+=chan[:-1]+link.attrib["href"]+"\n\n"
break
else:
i+=1
continue #move to next post if searchwords aren't found
content+=name+trip+" ### No. %s" % postnumber+"\n-----------\n\n"
if len(text) > 2:
if detailed == False:
text=re.sub(r"(\r\n)+|\n+", " ", text)
content+=text
boardposts+=content+"\n"
i+=1
            pages_spidered += 1
        if detailed == True:
            if len(boardposts) < 2:
                error = "No posts found.\n"
                aggregation += "\n\n################\n/" + board + "/\n################\n" + error
            else:
                aggregation += "\n\n################\n/" + board + "/\n################\n" + boardposts
        else:
            aggregation += boardposts
        boardposts = ""  # reset for the next board
    aggregation = aggregation.rstrip()
    if detailed == False:
        aggregation = re.sub(r">>\d+(\s+)?", "", aggregation)  # removes references that just quote other posts
    if detailed == True:
        aggregation += "\n\n**********************************\n\n"
    return aggregation
if __name__ == "__main__":
    chan = "http://www.somechan.org/"  # the chan URL must end with a slash
    boards = "b a"  # boards separated by spaces, example: "b c a ck"
    pages = 2
    searchwords = ["dude", "dumb"]
    name = "Anonymous"
    print spider(chan, boards, pages, detailed=True)  # also try adding search=searchwords and namesearch=name
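    # A minimal corpus-gathering sketch (hypothetical output filename): call
    # spider() without the detailed/search options and write the result to disk,
    # one post per line.
    #corpus = spider(chan, boards, pages)
    #with open("corpus.txt", "w") as f:
    #    f.write(corpus)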