-
Notifications
You must be signed in to change notification settings - Fork 0
/
public_pool.py
128 lines (105 loc) · 3.67 KB
/
public_pool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2017-01-09 02:54:54
# @Author : Tom Hu ([email protected])
# @Link : http://h1994st.com
# @Version : 1.0
import datetime
import hashlib
import json
import md5

import feedparser
import peewee
import pytz
# Module-level MySQL handle: bound to the CHINA_DAILY model through its Meta
# class and closed at the end of ChinaDailyProfiler.profile().
# NOTE(review): the database name and credentials are hard-coded; consider
# loading them from configuration or the environment.
db = peewee.MySQLDatabase(
    'AUTO_PUBLIC_POOL',
    user='root',
    passwd='hsthst',
)
class Profiler(object):
    '''
    Base class for the profilers in this module, which collect news from
    structured websites.

    Hierarchy noted by the original author:
    Website - Column - Page - URL
    '''

    def __init__(self):
        # No state of its own; only continues the cooperative init chain.
        super(Profiler, self).__init__()
class CNNProfiler(Profiler):
    '''
    Profiler for CNN.

    Currently a placeholder: it adds nothing beyond what Profiler provides.
    '''

    def __init__(self):
        # Only delegates to the base class initialiser.
        super(CNNProfiler, self).__init__()
class CHINA_DAILY(peewee.Model):
    '''
    peewee model holding one China Daily news entry per row.

    Rows are created by ChinaDailyProfiler.profile().
    '''

    ID = peewee.IntegerField()
    SECTION = peewee.CharField()        # key of the feed in the feeds JSON
    SUB_SECTION = peewee.CharField()    # term of the entry's first tag
    AUTHOR = peewee.CharField()         # empty string when the feed has none
    TITLE = peewee.CharField()
    DATE = peewee.DateTimeField()       # publication time, converted to UTC
    URL = peewee.CharField()
    PROFILE_DATE = peewee.DateTimeField()   # time the row was collected (UTC)
    URL_MD5 = peewee.CharField()        # hex MD5 digest of URL
    SUMMARY = peewee.CharField()

    class Meta:
        # Bind the model to the module-level database handle.
        database = db
class ChinaDailyProfiler(Profiler):
    '''
    China Daily Profiler

    Feeds: data/china_daily_feeds.json

    The feeds file is loaded as a JSON object whose keys are section names
    and whose values are passed to feedparser.parse(). Every entry of every
    feed is stored as one CHINA_DAILY row.
    '''

    def __init__(self):
        super(ChinaDailyProfiler, self).__init__()

    def profile(self):
        '''
        Parse every configured feed and insert its entries into CHINA_DAILY.

        The running count of stored rows is printed after each successful
        insert, and an empty line is printed after each section. Entries
        that cannot be stored are reported and skipped (see _store_entry).
        Any other exception raised while processing the feeds is printed and
        ends the run; the database connection is closed in every case.

        Raises:
            IOError / ValueError: if the feeds file cannot be opened or is
                not valid JSON (this happens before any feed is processed).
        '''
        with open("data/china_daily_feeds.json") as fp:
            china_daily_feeds = json.load(fp)

        # Loop-invariant: build the source timezone once, not per entry.
        shanghai = pytz.timezone('Asia/Shanghai')
        stored = 0

        try:
            for section, feed_url in china_daily_feeds.items():
                feed = feedparser.parse(feed_url)

                for news in feed.entries:
                    if self._store_entry(section, news, shanghai):
                        stored += 1
                        print(stored)

                print('')
        except Exception as e:
            # Top-level boundary: report the failure instead of crashing.
            print('Error: %r' % e)
        finally:
            db.close()

    def _store_entry(self, section, news, local_tz):
        '''
        Insert a single feed entry as a CHINA_DAILY row.

        Args:
            section: section name the entry's feed is registered under.
            news: one item of the parsed feed's ``entries``.
            local_tz: pytz timezone in which ``news.published`` is expressed.

        Returns:
            True if the row was created, False if the entry was skipped.

        An entry is skipped (after printing the error and what is known
        about the entry) when the insert raises peewee.IntegrityError, when
        a required attribute is missing, when the entry has no tags, or
        when its publication date does not match '%Y-%m-%d %H:%M:%S'.
        '''
        # Collected step by step so the error report below never has to
        # touch the entry again: re-reading a missing attribute inside the
        # handler would raise a second time and abort the whole run.
        sub_section = ''
        link = ''
        try:
            link = news.link
            sub_section = news.tags[0].term
            published = datetime.datetime.strptime(
                news.published, '%Y-%m-%d %H:%M:%S')

            CHINA_DAILY.create(
                SECTION=section,
                SUB_SECTION=sub_section,
                # The author is optional; fall back to an empty string.
                AUTHOR=getattr(news, 'authorname', ''),
                TITLE=news.title,
                DATE=local_tz.localize(published).astimezone(pytz.UTC),
                URL=link,
                # MD5 is used here only as a fingerprint of the URL.
                # Encoding explicitly keeps non-ASCII URLs from failing.
                URL_MD5=hashlib.md5(link.encode('utf-8')).hexdigest(),
                PROFILE_DATE=datetime.datetime.now(pytz.utc),
                SUMMARY=news.summary)
        except (peewee.IntegrityError, AttributeError,
                IndexError, ValueError) as e:
            print('Error: %r' % e)
            print('%s %s %s' % (section, sub_section, link))
            return False

        return True
def main():
    '''Script entry point; currently does nothing.'''
    return None


if __name__ == "__main__":
    main()