Merge pull request #1 from teampopong/master

update from origin

majorika committed Sep 15, 2014
2 parents 41e0fdd + 36dacd4 · commit b7d7808

Showing 6 changed files with 140 additions and 18 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -8,4 +8,7 @@ bills/html/
 bills/data/
 bills/na-bills-*.csv
 bills/new_bills
+national_assembly/assembly.csv
+national_assembly/assembly.json
+national_assembly/html/*
 log
121 changes: 121 additions & 0 deletions meetings/batch.py
@@ -0,0 +1,121 @@
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-

import json
import os
import re
import requests  # imported but not used in this file
import urllib

import get

baseurl = 'http://likms.assembly.go.kr/record'
datadir = '/home/e9t/data/popong/meeting-docs/national'

def chkdir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

def get_docs(attrs):
    # Download the plenary-meeting PDF for a single sitting.
    docurl_base = '%s/new/getFileDown.jsp?CONFER_NUM=' % baseurl
    filedir = u'%s/%s/%s'\
            % (datadir, attrs.get('assembly_id'), attrs.get('date'))
    chkdir(filedir)
    filename = u'%s-%s-%s-본회의.pdf'\
            % (attrs.get('assembly_id'), attrs.get('session_id'),\
               attrs.get('sitting_id'))
    print filename
    urllib.urlretrieve('%s%s' % (docurl_base, attrs.get('docid')),\
            '%s/%s' % (filedir, filename))

def get_listurl(assembly_id):
    # The 5.5th and 10.5th assemblies live on dedicated search pages.
    if assembly_id==5.5:
        return '%s/content/con_search167.jsp?div=1&DAE_NUM=66&COMM_CODE=ZA'\
               '&i=1' % baseurl
    elif assembly_id==10.5:
        return '%s/content/con_search7.jsp?div=1&DAE_NUM=77&COMM_CODE=ZA'\
               '&i=1' % baseurl
    else:
        return 'http://likms.assembly.go.kr/record/content/con_search2.jsp'\
               '?div=1&DAE_NUM=%s' % assembly_id

def get_sessionurl_base(assembly_id):
    if assembly_id==5.5:
        n = 767
    elif assembly_id==10.5:
        n = 8
    else:
        n = 3
    return '%s/content/con_search%d.jsp' % (baseurl, n)


def get_docids(listurl, sessionurl_base):
    # NB: relies on get.htmltree(), which this same commit removes from
    # get.py; it is only exercised by the commented-out block below.
    def encode_session_params(k, v):
        if assembly_id==5.5:
            params['COMM_NAME'] = v
            params['COMM_CODE'] = 'ZA'
            params['CONF_DATE'] = v.strip(u'년')
        elif assembly_id==10.5:
            params['CONF_DATE'] = v
        else:
            params['SES_NUM'] = v
            params['SES_BIGO'] = 'null'
        return urllib.urlencode({p: v.encode('euc-kr') for p, v in params.items()})

    def get_sitting_info(anchor):
        print anchor
        items = anchor[0].split("'")
        return {'sitting_name': items[1].strip(), 'docid': items[3]}

    def get_anchors(sessionurl):
        root = get.webpage(get.htmltree(sessionurl))
        return filter(None, [i.xpath('./@href') for i in root.xpath('.//a')])

    root = get.webpage(get.htmltree(listurl))
    params = {i.xpath('./@name')[0]: i.xpath('./@value')[0]\
              for i in root.xpath('.//input')}

    docids = []
    for k, v in params.items():
        if re.match(ur'(SES_NUM|CONF_DATE|COMM_NAME)[0-9]+', k):
            print k, v
            sessionurl = '%s?%s' % (sessionurl_base, encode_session_params(k, v))

            anchors = get_anchors(sessionurl)
            sittings = [get_sitting_info(a) for a in anchors\
                        if re.match(r'javascript:mainsearch[0-9]+', a[0])]
            docids.append({'session_name': v, 'sittings': sittings})
    return docids


#FIXME: broken for 1-19th assembly
#FIXME: duplicate entries for 10.5th assembly
'''
for assembly_id in [5.5, 10.5]:
    print assembly_id
    listurl = get_listurl(assembly_id)
    sessionurl_base = get_sessionurl_base(assembly_id)
    docids = get_docids(listurl, sessionurl_base)
    with open('meetingdoc_ids_%.1f.json' % assembly_id, 'w') as f:
        json.dump(docids, f)
'''

assembly_id = 5
idfile = '%s/meetingdoc_ids_%s.json' % (datadir, assembly_id)

with open(idfile, 'r') as f:
    sessions = json.load(f)

for session in sessions:
    session_id = session.get('session_name').split(u'회')[0]
    for sitting in session.get('sittings'):
        docid = sitting.get('docid')
        tmp = re.match(r'(.*?)\((.*?)\)', sitting.get('sitting_name'))
        sitting_id = tmp.group(1).replace(u'제', '').replace(u'차', '').strip()
        date = re.sub(ur'(년|월|일)', '-', tmp.group(2)).strip(u' -')  # was .strip('\s-'); str.strip takes characters, not a regex

        attrs = {'assembly_id': assembly_id, 'session_id': session_id,\
                 'date': date, 'sitting_id': sitting_id, 'docid': docid}
        get_docs(attrs)
        print attrs
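
To make the sitting_name parsing at the end of batch.py concrete, here is a minimal sketch with a hypothetical input value (real strings come from the crawled anchors in get_docids()):

    # -*- coding: utf-8 -*-
    # Hypothetical input; actual strings are scraped from the record site.
    import re

    sitting_name = u'제1차(2014년 9월 15일)'
    tmp = re.match(r'(.*?)\((.*?)\)', sitting_name)
    sitting_id = tmp.group(1).replace(u'제', '').replace(u'차', '').strip()
    date = re.sub(ur'(년|월|일)', '-', tmp.group(2)).strip(u' -')

    print sitting_id  # 1
    print date        # 2014- 9- 15 (the Korean date suffixes become dashes)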
20 changes: 12 additions & 8 deletions meetings/crawl.py
@@ -6,9 +6,11 @@
 import re
 import urllib
 
+import requests
+
 import get
 
-basedir = '/var/popong/data' # change me
+basedir = '.' # change me
 jsondir = '%s/meetings/national/meta' % basedir
 pdfdir = '%s/meeting-docs/national' % basedir
 
@@ -20,20 +22,22 @@ def checkdir(directory):
 
 def get_html(page_num):
     url = '%s/new/new_list.jsp?CLASS_CODE=0&currentPage=%d' % (baseurl, page_num)
-    r = get.htmltree(url)
-    return r.read().decode('euc-kr').encode('utf-8')
+    r = requests.get(url)
+    return r.text.encode('utf-8')
 
 def get_hidden_url(url):
-    f = get.htmltree(url.encode('utf-8'))
-    root = get.webpage(f)
+    r = requests.get(url)
+    html = r.text
+    root = get.webpage(html)
     return '%s/%s' % (baseurl, root.xpath('//frame/@src')[1])
 
 def get_issues(url):
-    f = get.htmltree(url)
+    r = requests.get(url)
+    html = r.text
     if 'new_list2.jsp' in url:
-        elems = get.webpage(f).xpath('//a/text()')
+        elems = get.webpage(html).xpath('//a/text()')
     elif 'new_list3.jsp' in url:
-        elems = get.webpage(f).xpath('//td/@title')
+        elems = get.webpage(html).xpath('//td/@title')
     else:
         raise Exception('New DOM type.')
     return elems
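
One caveat with the switch from get.htmltree() to requests above: the old code decoded the pages as euc-kr explicitly, while r.text relies on whatever encoding requests detects from the response headers. If the server omits the charset, pinning it by hand is a safe workaround; a sketch under that assumption:

    import requests

    url = 'http://likms.assembly.go.kr/record/new/new_list.jsp?CLASS_CODE=0&currentPage=1'
    r = requests.get(url)
    r.encoding = 'euc-kr'  # assumption: the site still serves euc-kr, as the old code implies
    html = r.text.encode('utf-8')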
6 changes: 0 additions & 6 deletions meetings/get.py
@@ -15,12 +15,6 @@ def localtree(url):
 def headtree(root):
     print etree.tostring(root)[0:200]
 
-def htmltree(url):
-    r = urllib2.Request(url)
-    r.add_header("User-Agent", "Mozilla/5.0")
-    f = urllib2.urlopen(r)
-    return f
-
 def webpage(f):
     parser = html5lib.HTMLParser(\
         tree=html5lib.treebuilders.getTreeBuilder("lxml"),\
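
With htmltree() gone, the surviving helper is webpage(), which parses HTML via html5lib into an lxml tree. A sketch of the fetch-then-parse pattern the crawler now follows (URL taken from crawl.py above; the xpath is only illustrative):

    import requests
    import get  # the module edited above

    r = requests.get('http://likms.assembly.go.kr/record/new/new_list.jsp?CLASS_CODE=0&currentPage=1')
    root = get.webpage(r.text)
    print root.xpath('.//a/@href')[:5]  # a few links, to show the tree is queryable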
6 changes: 3 additions & 3 deletions meetings/online.sh
@@ -1,16 +1,16 @@
 #! /bin/bash -e
 
-JSON_DIR="/var/popong/data/meetings"
+JSON_DIR="." # change me
 
 echo
 date
 echo "update assembly minutes"
 
 # crawl assembly minutes
-cd "/var/popong/crawlers/meetings/"
+cd "~/crawlers/meetings/" # change me to meetings crawler directory
 ./crawl.py
 
-# commit to [data-meetings.git](https://bitbucket.org/teampopong/data-meetings)
+# commit to [data-meetings.git](https://github.com/teampopong/data-meetings)
 cd $JSON_DIR
 git add .
 git commit -m "Auto update: `date`"
2 changes: 1 addition & 1 deletion national_assembly/urls
@@ -2,4 +2,4 @@
 # first tokens are keys and second tokens are urls
 base http://www.assembly.go.kr
 people_list http://www.assembly.go.kr/assm/memact/congressman/memCond/memCondListAjax.do?currentPage=1&rowPerPage=500
-person http://www.assembly.go.kr/assm/memPop/memPopup.do?num=
+person http://www.assembly.go.kr/assm/memPop/memPopup.do?dept_cd=
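
For context, this file maps keys to URLs, one whitespace-separated pair per line, as the comment in it says. A minimal sketch of how such a file could be read (the reader itself is hypothetical and not part of this commit):

    # Hypothetical reader for the key/url format described above.
    urls = {}
    with open('national_assembly/urls') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            key, url = line.split(None, 1)
            urls[key] = url

    print urls['person']  # http://www.assembly.go.kr/assm/memPop/memPopup.do?dept_cd=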
