From 94e6c08c3006a8fb2cc00b0292cd661400b36bed Mon Sep 17 00:00:00 2001
From: Park Jiin <tisphie@gmail.com>
Date: Fri, 1 Jan 2016 01:11:28 +0900
Subject: [PATCH] =?UTF-8?q?Parse=20'=EB=B6=80=EA=B0=80=EC=A0=95=EB=B3=B4'?=
 =?UTF-8?q?=20part=20correctly=20=20*=20=EB=B9=84=EA=B3=A0:=20use=20br.tai?=
 =?UTF-8?q?l=20or=20any.text=20=20*=20=EB=8C=80=EC=95=88,=20=EB=8C=80?=
 =?UTF-8?q?=EC=95=88=EB=B0=98=EC=98=81=ED=8F=90=EA=B8=B0=20=EB=AA=A9?=
 =?UTF-8?q?=EB=A1=9D:=20use=20bill=5Fid=20extracted=20from=20link=5Fid=20f?=
 =?UTF-8?q?rom=20href?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

this commit fixes issue #39
---
 bills/specific/html2json.py | 48 ++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)
diff --git a/bills/specific/html2json.py b/bills/specific/html2json.py
index 07f2c19..20e64b8 100644
--- a/bills/specific/html2json.py
+++ b/bills/specific/html2json.py
@@ -161,6 +161,50 @@ def status_info(es, et, status_en):
             rows.append(dict(zip(headers, columns)))
         return rows
 
+    def extract_extra_info(meta, c):
+        """Group the children of `c` under the <span class="text11"> label
+        preceding them; return {category: [non-empty parsed entries]}."""
+        alt_list = '대안반영폐기 의안목록'
+        extra_infos = dict()
+        current_category = None
+        for node in c:  # walk the argument rather than an outer-scope name
+            if node.tag == 'span' and node.get('class') == 'text11':
+                current_category = node.text.strip()
+                if current_category.startswith(alt_list):
+                    current_category = alt_list  # ignore text after the label
+                continue
+            if current_category is None:
+                continue  # nodes before the first label belong to no category
+
+            # Register the category even if this node yields nothing.
+            entries = extra_infos.setdefault(current_category, [])
+            if current_category == '비고':
+                content = extract_remark(node)
+            elif current_category in ('대안', alt_list):
+                content = extract_bill_id_from_link(meta, node)
+            else:
+                content = lxml.html.tostring(node)
+            if content:
+                entries.append(content)
+        return extra_infos
+
+    def extract_remark(c):
+        """Return stripped text for node `c`: the tail of a <br>, otherwise
+        the node's own text; None when that text is missing."""
+        try:
+            raw = c.tail if c.tag == 'br' else c.text
+            return raw.strip()
+        except AttributeError:  # e.g. None.strip() when the node has no text
+            return None
+
+    def extract_bill_id_from_link(meta, c):
+        """Return the bill_id in `meta` whose link_id is in c's href, or None."""
+        href = c.get('href') or ''  # children without an href (e.g. <br>) give None
+        match = re.match(r'/bill/jsp/BillDetail\.jsp\?bill_id=(.*)', href)
+        wanted = match.group(1) if match else None
+        found = meta.query('link_id == @wanted')['bill_id'].values if match else []
+        return found[0] if len(found) else None  # None when no meta row matches
+
     fn          = '%s/%d/%s.html' % (DIR['specifics'], assembly_id, bill_id)
     page        = utils.read_webpage(fn)
     table       = utils.get_elems(page, X['spec_table'])[1]
@@ -186,9 +230,7 @@ def status_info(es, et, status_en):
         if row_titles[i]!='부가정보':
             status_dict[row_titles[i]] = extract_row_contents(r)
         else:
-            t = r.xpath('span[@class="text8"]/text()')
-            c = filter(None, (t.strip() for t in r.xpath('text()')))
-            status_dict[row_titles[i]] = dict(zip(t, c))
+            status_dict[row_titles[i]] = extract_extra_info(meta, r)
 
     headers = ['assembly_id', 'bill_id', 'title', 'status_detail', 'statuses', 'status_infos', 'status_dict']
     specifics = [assembly_id, bill_id, title, status_detail, statuses, status_infos, status_dict]