From 48ea821e7090508850cea31911e32c660e58274f Mon Sep 17 00:00:00 2001 From: Lucy Park Date: Wed, 23 Jul 2014 12:59:50 +0900 Subject: [PATCH 1/5] Add batch crawler --- meetings/batch.py | 87 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 meetings/batch.py diff --git a/meetings/batch.py b/meetings/batch.py new file mode 100644 index 0000000..cb64f48 --- /dev/null +++ b/meetings/batch.py @@ -0,0 +1,87 @@ +#! /usr/bin/python2.7 +# -*- coding: utf-8 -*- + +import json +import re +import requests +import urllib + +import get + + +baseurl = 'http://likms.assembly.go.kr/record' + + +def get_docs(): + docurl_base = '%s/new/getFileDown.jsp?CONFER_NUM=' % baseurl + + +def get_listurl(assembly_id): + if assembly_id==5.5: + return '%s/content/con_search167.jsp?div=1&DAE_NUM=66&COMM_CODE=ZA'\ + '&i=1' % baseurl + elif assembly_id==10.5: + return '%s/content/con_search7.jsp?div=1&DAE_NUM=77&COMM_CODE=ZA'\ + '&i=1' % baseurl + else: + return 'http://likms.assembly.go.kr/record/content/con_search2.jsp'\ + '?div=1&DAE_NUM=%s' % assembly_id + +def get_sessionurl_base(assembly_id): + if assembly_id==5.5: + n = 767 + elif assembly_id==10.5: + n = 8 + else: + n = 3 + return '%s/content/con_search%d.jsp' % (baseurl, n) + + +def get_docids(listurl, sessionurl_base): + def encode_session_params(k, v): + if assembly_id==5.5: + params['COMM_NAME'] = v + params['COMM_CODE'] = 'ZA' + params['CONF_DATE'] = v.strip(u'년') + elif assembly_id==10.5: + params['CONF_DATE'] = v + else: + params['SES_NUM'] = v + params['SES_BIGO'] = 'null' + return urllib.urlencode({p:v.encode('euc-kr') for p, v in params.items()}) + + def get_sitting_info(anchor): + print anchor + items = anchor[0].split("'") + return {'sitting_name': items[1].strip(), 'docid': items[3]} + + def get_anchors(sessionurl): + root = get.webpage(get.htmltree(sessionurl)) + return filter(None, [i.xpath('./@href') for i in root.xpath('.//a')]) + + root = get.webpage(get.htmltree(listurl)) 
+ params = {i.xpath('./@name')[0]: i.xpath('./@value')[0]\ + for i in root.xpath('.//input')} + + docids = [] + for k, v in params.items(): + if re.match(ur'(SES_NUM|CONF_DATE|COMM_NAME)[0-9]+', k): + print k, v + sessionurl = '%s?%s' % (sessionurl_base, encode_session_params(k, v)) + + anchors = get_anchors(sessionurl) + sittings = [get_sitting_info(a) for a in anchors\ + if re.match(r'javascript:mainsearch[0-9]+', a[0])] + docids.append({ 'session_name': v, 'sittings': sittings }) + return docids + + +#FIXME: broken for 1-19th assembly +for assembly_id in [5.5, 10.5]: + print assembly_id + listurl = get_listurl(assembly_id) + + sessionurl_base = get_sessionurl_base(assembly_id) + docids = get_docids(listurl, sessionurl_base) + with open('meetingdoc_ids_%.1f.json' % assembly_id, 'w') as f: + json.dump(docids, f) From 80b4d9f66aae221f121a8e148a8246fea0adad45 Mon Sep 17 00:00:00 2001 From: Lucy Park Date: Wed, 23 Jul 2014 14:40:16 +0900 Subject: [PATCH 2/5] Add downloader to batch crawler --- meetings/batch.py | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/meetings/batch.py b/meetings/batch.py index cb64f48..e13e28b 100644 --- a/meetings/batch.py +++ b/meetings/batch.py @@ -2,19 +2,31 @@ # -*- coding: utf-8 -*- import json +import os import re import requests import urllib import get - baseurl = 'http://likms.assembly.go.kr/record' +datadir = '/home/e9t/data/popong/meeting-docs/national' +def chkdir(directory): + if not os.path.exists(directory): + os.makedirs(directory) -def get_docs(): +def get_docs(attrs): docurl_base = '%s/new/getFileDown.jsp?CONFER_NUM=' % baseurl - + filedir = u'%s/%s/%s'\ + % (datadir, attrs.get('assembly_id'), attrs.get('date')) + chkdir(filedir) + filename = u'%s-%s-%s-본회의.pdf'\ + % (attrs.get('assembly_id'), attrs.get('session_id'),\ + attrs.get('sitting_id')) + print filename + urllib.urlretrieve('%s%s' % (docurl_base, attrs.get('docid')),\ + '%s/%s' % (filedir, filename)) def 
get_listurl(assembly_id): if assembly_id==5.5: @@ -77,6 +89,8 @@ def get_anchors(sessionurl): #FIXME: broken for 1-19th assembly +#FIXME: duplicate entries for 10.5th assembly +''' for assembly_id in [5.5, 10.5]: print assembly_id listurl = get_listurl(assembly_id) @@ -85,3 +99,23 @@ def get_anchors(sessionurl): docids = get_docids(listurl, sessionurl_base) with open('meetingdoc_ids_%.1f.json' % assembly_id, 'w') as f: json.dump(docids, f) +''' + +assembly_id = 5 +idfile = '%s/meetingdoc_ids_%s.json' % (datadir, assembly_id) + +with open(idfile, 'r') as f: + sessions = json.load(f) + +for session in sessions: + session_id = session.get('session_name').split(u'회')[0] + for sitting in session.get('sittings'): + docid = sitting.get('docid') + tmp = re.match(r'(.*?)\((.*?)\)', sitting.get('sitting_name')) + sitting_id = tmp.group(1).replace(u'제', '').replace(u'차', '').strip() + date = re.sub(ur'(년|월|일)', '-', tmp.group(2)).strip('\s-') + + attrs = { 'assembly_id': assembly_id, 'session_id': session_id,\ + 'date': date, 'sitting_id': sitting_id, 'docid': docid } + get_docs(attrs) + print attrs From bdf845e7dcf965fabdab25b9befa48cf8bf714d6 Mon Sep 17 00:00:00 2001 From: popong-web Date: Sat, 13 Sep 2014 21:20:14 +0900 Subject: [PATCH 3/5] Meeting crawler: replace urllib2 to requests - To handle hangul urls --- meetings/crawl.py | 20 ++++++++++++-------- meetings/get.py | 6 ------ meetings/online.sh | 6 +++--- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/meetings/crawl.py b/meetings/crawl.py index 740f53d..b85b458 100755 --- a/meetings/crawl.py +++ b/meetings/crawl.py @@ -6,9 +6,11 @@ import re import urllib +import requests + import get -basedir = '/var/popong/data' # change me +basedir = '.' 
# change me jsondir = '%s/meetings/national/meta' % basedir pdfdir = '%s/meeting-docs/national' % basedir @@ -20,20 +22,22 @@ def checkdir(directory): def get_html(page_num): url = '%s/new/new_list.jsp?CLASS_CODE=0&currentPage=%d' % (baseurl, page_num) - r = get.htmltree(url) - return r.read().decode('euc-kr').encode('utf-8') + r = requests.get(url) + return r.text.encode('utf-8') def get_hidden_url(url): - f = get.htmltree(url.encode('utf-8')) - root = get.webpage(f) + r = requests.get(url) + html = r.text + root = get.webpage(html) return '%s/%s' % (baseurl, root.xpath('//frame/@src')[1]) def get_issues(url): - f = get.htmltree(url) + r = requests.get(url) + html = r.text if 'new_list2.jsp' in url: - elems = get.webpage(f).xpath('//a/text()') + elems = get.webpage(html).xpath('//a/text()') elif 'new_list3.jsp' in url: - elems = get.webpage(f).xpath('//td/@title') + elems = get.webpage(html).xpath('//td/@title') else: raise Exception('New DOM type.') return elems diff --git a/meetings/get.py b/meetings/get.py index a0cbd6a..a8b868e 100644 --- a/meetings/get.py +++ b/meetings/get.py @@ -15,12 +15,6 @@ def localtree(url): def headtree(root): print etree.tostring(root)[0:200] -def htmltree(url): - r = urllib2.Request(url) - r.add_header("User-Agent", "Mozilla/5.0") - f = urllib2.urlopen(r) - return f - def webpage(f): parser = html5lib.HTMLParser(\ tree=html5lib.treebuilders.getTreeBuilder("lxml"),\ diff --git a/meetings/online.sh b/meetings/online.sh index 47facf0..4b112af 100755 --- a/meetings/online.sh +++ b/meetings/online.sh @@ -1,16 +1,16 @@ #! /bin/bash -e -JSON_DIR="/var/popong/data/meetings" +JSON_DIR="." 
# change me echo date echo "update assembly minutes" # crawl assembly minutes -cd "/var/popong/crawlers/meetings/" +cd "~/crawlers/meetings/" # change me to meetings crawler directory ./crawl.py -# commit to [data-meetings.git](https://bitbucket.org/teampopong/data-meetings) +# commit to [data-meetings.git](https://github.com/teampopong/data-meetings) cd $JSON_DIR git add . git commit -m "Auto update: `date`" From f480699b14f6109fea5b81d7d57e05e113c29dca Mon Sep 17 00:00:00 2001 From: popong-web Date: Sat, 13 Sep 2014 21:23:37 +0900 Subject: [PATCH 4/5] Fix national assembly crawler --- national_assembly/urls | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/national_assembly/urls b/national_assembly/urls index 4840e3d..56a6d7b 100644 --- a/national_assembly/urls +++ b/national_assembly/urls @@ -2,4 +2,4 @@ # first tokens are keys and second tokens are urls base http://www.assembly.go.kr people_list http://www.assembly.go.kr/assm/memact/congressman/memCond/memCondListAjax.do?currentPage=1&rowPerPage=500 -person http://www.assembly.go.kr/assm/memPop/memPopup.do?num= +person http://www.assembly.go.kr/assm/memPop/memPopup.do?dept_cd= From 5fe8d69d7b10babb98d93fd23d43b4a30fdd8a49 Mon Sep 17 00:00:00 2001 From: popong-web Date: Sat, 13 Sep 2014 21:28:07 +0900 Subject: [PATCH 5/5] Update gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 0cf7c97..bbba118 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,7 @@ bills/html/ bills/data/ bills/na-bills-*.csv bills/new_bills +national_assembly/assembly.csv +national_assembly/assembly.json +national_assembly/html/* log