# -*- coding: utf-8 -*-
# Provenance: helpers added to bills/specific/html2json.py by commit
# 94e6c08c3006a8fb2cc00b0292cd661400b36bed (Park Jiin, Fri, 1 Jan 2016
# 01:11:28 +0900), which fixes issue #39.
#
# Commit subject (MIME-decoded): "Parse '부가정보' part correctly
#   * 비고: use br.tail or any.text
#   * 대안, 대안반영폐기 목록: use bill_id extracted from link_id from href"
#
# The same commit changes the call site for the '부가정보' row from zipping
# span texts with text nodes to:
#     status_dict[row_titles[i]] = extract_extra_info(meta, r)
#
# NOTE(review): this copy of the patch lost its line breaks and indentation.
# In html2json.py these helpers appear to be defined inside an enclosing
# function -- confirm the nesting level before splicing them back.
import re

# Category header texts as they appear in <span class="text11"> elements.
# Rough English glosses (TODO confirm): remarks / alternative bill /
# list of bills discarded because an alternative absorbed them.
_REMARK = '비고'
_ALTERNATIVE = '대안'
_DISCARDED_BILLS = '대안반영폐기 의안목록'

# href of a bill-detail link; group 1 is everything after "bill_id=".
# Raw string with escaped dots so "." and "?" are matched literally.
_BILL_DETAIL_HREF = re.compile(r'/bill/jsp/BillDetail\.jsp\?bill_id=(.*)')


def extract_extra_info(meta, c):
    """Group the children of ``c`` by the category header that precedes them.

    A child that is a ``span`` with ``class="text11"`` starts a new category
    named by its stripped text (any text starting with the discarded-bills
    label is normalised to exactly that label).  Every following child is
    converted according to the current category and, when the converted value
    is truthy, appended to that category's list.  Children seen before the
    first header are ignored.

    Args:
        meta: lookup table handed to ``extract_bill_id_from_link``.
        c: iterable of element-like nodes exposing ``tag``, ``text``,
            ``tail`` and ``get(name)`` (presumably an lxml element whose
            iteration yields its children -- verify against the caller).

    Returns:
        dict mapping category name to a list of extracted values.  A category
        that had at least one following child but no truthy value maps to an
        empty list.
    """
    extra_infos = {}
    current_category = None
    # Iterate the argument itself.  The previous version looped over a
    # variable named ``r`` from the caller's scope and ignored ``c``, which
    # only worked because the sole caller happened to pass that same ``r``.
    for node in c:
        if node.tag == 'span' and node.get('class') == 'text11':
            current_category = node.text.strip()
            if current_category.startswith(_DISCARDED_BILLS):
                # The header may carry a suffix; keep one canonical key.
                current_category = _DISCARDED_BILLS
            continue

        if current_category is None:
            continue

        # setdefault replaces dict.has_key(), which no longer exists in
        # Python 3, and still registers the key before any value is found.
        entries = extra_infos.setdefault(current_category, [])
        if current_category == _REMARK:
            content = extract_remark(node)
        elif current_category in (_ALTERNATIVE, _DISCARDED_BILLS):
            content = extract_bill_id_from_link(meta, node)
        else:
            # Unknown category: keep the serialised node as-is.  Imported
            # here so this helper does not depend on where the module-level
            # lxml import lives.
            import lxml.html
            content = lxml.html.tostring(node)

        if content:
            entries.append(content)
    return extra_infos


def extract_remark(c):
    """Return the stripped remark text attached to node ``c``.

    For a ``br`` node the text is its ``tail``; for any other node it is its
    ``text``.  Returns None when that text is missing (or ``c`` lacks the
    expected attributes).
    """
    try:
        raw = c.tail if c.tag == 'br' else c.text
        return raw.strip()
    except AttributeError:
        return None


def extract_bill_id_from_link(meta, c):
    """Translate a bill-detail link node into a ``bill_id`` via ``meta``.

    Args:
        meta: object supporting ``query(expr)[...]`` -- presumably a pandas
            DataFrame with ``link_id`` and ``bill_id`` columns (TODO confirm).
        c: node expected to be an ``<a>`` tag; only ``c.get('href')`` is used.

    Returns:
        The ``bill_id`` of the first ``meta`` row whose ``link_id`` equals the
        id captured from the href, or None when the node has no href or the
        href is not a bill-detail link.

    Raises:
        IndexError: if the href is a bill-detail link but no ``meta`` row
            matches its id (unchanged from the previous behaviour).
    """
    href = c.get('href')
    if not href:
        # Non-link nodes (e.g. <br>) between links used to reach re.match
        # with None and raise TypeError.
        return None
    match = _BILL_DETAIL_HREF.match(href)
    if match is None:
        return None
    # Referenced from the query string below through pandas' "@name" syntax.
    wanted_link_id = match.group(1)
    return meta.query('link_id == @wanted_link_id')['bill_id'].values[0]