save_html_to_mysql.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/5 21:32
# @Email : [email protected]
from linkmysql import link_mysql_write, link_mysql_read, show_tables
from mulit_downloads import muliti_down, clear_html, get_html
import pymysql.cursors
import base64
import csv
# Read the table and insert into the HTML field of the database table
table_name = show_tables()
# Encode text to base64
def base64_encode(c):
    return base64.b64encode(c.encode("utf-8")).decode("utf-8")


# Decode base64 back to text
def base64_decode(c):
    return base64.b64decode(c).decode("utf-8")
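# Round-trip example: base64_encode("<p>hi</p>") returns "PHA+aGk8L3A+", and
# base64_decode("PHA+aGk8L3A+") returns "<p>hi</p>"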
# Record the current row index in 2018count.txt so that downloading can
# resume after an out-of-memory error or a network interruption
def save_index(index):
    with open(r'C:\Users\888\PycharmProjects\recall0Information\urlLists\2018count.txt', 'w', encoding='utf-8') as f:
        f.write(str(index))


# Read back the saved index value
def read_index():
    with open(r'C:\Users\888\PycharmProjects\recall0Information\urlLists\2018count.txt', 'r', encoding='utf-8') as f:
        number = f.read()
    return int(number)
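# Note: read_index() expects 2018count.txt to exist already; on a first run it
# would need to be seeded once, e.g. with save_index(0)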
# Read all of the urls from the csv file
def get_all_url(file):
    url_list = []
    with open(file, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            url_list.append(row["url"])
    return url_list
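# Usage note: given a csv with a "url" header column (criterion2018.csv below),
# get_all_url returns the url strings as a plain list in file order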
# Save the downloaded html of each url into the database table
def save_html(urls):
    index_begin = read_index()
    url_list = urls[index_begin:]
    for index1, url_temp in enumerate(url_list):
        print(url_temp)
        response = get_html(url_temp)
        sql = """update criterion2018 set ThirdReport_Raw = "{0}" where ThirdReport_Url="{1}";"""
        # sql_content = """update {0} set htmlContent = "{1}" where url="{2}";"""
        ch_base64 = base64_encode(response[0])
        ch_base64_sql = sql.format(ch_base64, url_temp)
        link_mysql_write(ch_base64_sql)
        # Earlier variant, kept for reference: clean the html first and retry
        # with gbk decoding when the first attempt fails
        '''
        ch = clear_html(response[0])
        try:
            sql_raw = sql.format(url_temp[1], pymysql.escape_string(ch), url_temp[0])
            link_mysql_write(sql_raw)
        except:
            response = get_html(url_temp[0], encode="gbk")
            ch = clear_html(response[0])
            sql_raw = sql.format(url_temp[1], pymysql.escape_string(ch), url_temp[0])
            link_mysql_write(sql_raw)
        '''
        save_index(index_begin + index1 + 1)
        print(index_begin + index1 + 1)
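# A minimal sketch of the same update done as a parameterized query instead of
# string formatting. It assumes a pymysql connection object rather than the
# linkmysql helpers used above, so treat it as illustrative, not a drop-in.
def save_html_param(connection, html_text, url):
    # %s placeholders let pymysql escape both values itself
    sql = "update criterion2018 set ThirdReport_Raw = %s where ThirdReport_Url = %s;"
    with connection.cursor() as cursor:
        cursor.execute(sql, (base64_encode(html_text), url))
    connection.commit()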
if __name__ == '__main__':
    urls = get_all_url("C:\\Users\\888\\PycharmProjects\\recall0Information\\urlLists\\criterion2018.csv")
    save_html(urls=urls)
    ''' # Get the url list
    sql = "select * from {};"
    c_r = link_mysql_read(sql=sql.format(table_name[6]))
    url_list = [data["url"] for data in c_r]
    # Multi-process download
    len_url = len(url_list)
    for i in range(425, len_url):
        url_piece = url_list[i*10:i*10+10]
        print(url_piece, i)
        result = muliti_down(url_piece)
        for html in result:
            sql = """update {0} set html = "{1}" where url="{2}";"""
            sql_content = """update {0} set htmlContent = "{1}" where url="{2}";"""
            ch_base64 = base64.b64encode(html[0].encode('utf-8')).decode('utf-8')
            ch_base64_sql = sql_content.format(table_name[6], ch_base64, html[1])
            link_mysql_write(ch_base64_sql)
            ch = clear_html(html[0])
            sql_raw = sql.format(table_name[6], pymysql.escape_string(ch), html[1])
            link_mysql_write(sql_raw)
    '''