#!/usr/bin/env python3
# coding=UTF-8
"""General download functions."""
import logsetup
log = logsetup.getlog(__name__)
import html.parser
import file
import urls


def getdecoded(url, **kwargs):
    """Return the response body decoded as text, or None on a non-200 status."""
    r = urls.http.request("GET", url, **kwargs)
    if r.status == 200:
        return r.data.decode()


def getbinary(url, **kwargs):
    """Return the raw response bytes, or None on a non-200 status."""
    r = urls.http.request("GET", url, **kwargs)
    if r.status == 200:
        return r.data


def getraw(url):
    """Return the response object itself, regardless of status."""
    return urls.http.request("GET", url)
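
# Hedged usage sketch for the helpers above (kept commented out so nothing runs
# on import; the URL is only an illustration, and urls.http is assumed to be a
# urllib3-style PoolManager):
# text = getdecoded('https://example.com/')         # str, or None on failure
# data = getbinary('https://example.com/logo.png')  # bytes, or None on failure
# r = getraw('https://example.com/')                # full response object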

class TranslationScraper(html.parser.HTMLParser):
    """Pull text out of spans like this:
    <span jsname="FteS1d" class="jzUr5c" lang="fr">texte</span>
    """
    def __init__(self, lang):
        log.info('scraping')
        self.lang = lang
        self.capture = False
        self.capturelang = None
        super().__init__()
        self.text = {lang: [],
                     'en': []
                     }

    def handle_starttag(self, tag, attrs):
        # Start capturing when a <span> carries a lang attribute; remember which
        # language it declares so handle_data can file the text under it.
        langs = [v for k, v in attrs if k == 'lang']
        if tag == 'span' and langs:
            self.capture = True
            self.capturelang = langs[0]

    def handle_data(self, data):
        if self.capture:
            # setdefault guards against spans in languages other than the two
            # keys set up in __init__.
            self.text.setdefault(self.capturelang, []).append(data)

    def handle_endtag(self, tag):
        # Any closing tag ends the current capture, so tags nested inside the
        # span will cut the captured text short.
        self.capture = False
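
# Minimal usage sketch for TranslationScraper, using the sample span from the
# docstring above (hedged example, kept commented out so it is not run on import):
# scraper = TranslationScraper(lang='fr')
# scraper.feed('<span jsname="FteS1d" class="jzUr5c" lang="fr">texte</span>')
# scraper.text  # -> {'fr': ['texte'], 'en': []}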

class ImageScraper(html.parser.HTMLParser):
    """Collect the attributes of every <img> tag, e.g.
    <img src="/image/800px/301653" alt="Athlete's Foot">
    """
    def __init__(self):
        super().__init__()
        self.images = []

    def handle_starttag(self, tag, attrs):
        if tag == 'img':
            # attrs is a list of (name, value) pairs; store each image as a dict.
            self.images.append(dict(attrs))
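
# Minimal usage sketch for ImageScraper, using the sample tag from the
# docstring above (hedged example, kept commented out so it is not run on import):
# scraper = ImageScraper()
# scraper.feed('<img src="/image/800px/301653" alt="Athlete\'s Foot">')
# scraper.images  # -> [{'src': '/image/800px/301653', 'alt': "Athlete's Foot"}]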

def imgurl(x):
    """Turn a scraped image src into a full 400px openclipart.org URL."""
    if '800px' in x:
        x = x.replace('800px', '400px')
    if "./Openclipart - Clipping Culture_files" in x:
        x = x.replace("./Openclipart - Clipping Culture_files", '/image/400px')
    site = 'https://openclipart.org'
    return site + x
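
# Example: imgurl('/image/800px/301653') returns
# 'https://openclipart.org/image/400px/301653', i.e. the 400px rendition
# hosted on openclipart.org.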

if __name__ == '__main__':
    import htmlfns  # not normally used here
    # page = getraw('www.google.com')
    text = 'translation terms that I want to use'
    kwargs = {
        'sl': 'en',  # source language
        'tl': 'fr',  # target language
        'text': text,
        'op': 'translate',  # operation
    }
    terms = urls.urlencode(kwargs)
    # e.g. 'https://translate.google.com/?sl=en&tl=fr&text=translation&op=translate'
    url = 'https://translate.google.com/?' + terms
    log.info("Looking in {}".format(url))
    page = htmlfns.getdecoded(url)  # None on a non-200 response
    with open('googletrans.html', 'w') as f:
        f.write(page)
    scraper = TranslationScraper(lang='fr')
    # page = '<html><t>erg</t></html>'  # <!doctype html>
    scraper.feed(page)
    log.info("Found {} text elements: {}".format(len(scraper.text), scraper.text))
    # log.info("Found {} images: {}".format(len(scraper.images), scraper.images))
    # dir = 'images/openclipart.com/' + '_'.join(glosses)
    # file.makedir(dir)
    # for i in [i for i in scraper.images
    #           if 'openclipart-logo-2019.svg' not in i]:  # [1:5]:
    #     url = imgurl(i['src'])
    #     num = i['src'].split('/')[-1]
    #     filename = '_'.join([num, i['alt']])
    #     log.info("{} ({})".format(url, filename))
    #     response = getbinary(url)
    #     log.info("response data type: {}".format(type(response)))
    #     fqdn = file.getdiredurl(dir, filename)
    #     with open(fqdn, 'wb') as d:
    #         d.write(response)
"""Maybe for later"""
# try:
# from BeautifulSoup import BeautifulSoup
# except ImportError:
# from bs4 import BeautifulSoup
# # html = #the HTML code you've written above
# parsed_html = BeautifulSoup(html)
# # print(parsed_html.body.find('div', attrs={'class':'container'}).text)
# # html=lift.readxmltext(html)
# log.info("response prettyprinted:")
# lift.prettyprint(parsed_html.body)