diff --git a/bills/specific/html2json.py b/bills/specific/html2json.py index 07f2c19..20e64b8 100644 --- a/bills/specific/html2json.py +++ b/bills/specific/html2json.py @@ -161,6 +161,50 @@ def status_info(es, et, status_en): rows.append(dict(zip(headers, columns))) return rows + def extract_extra_info(meta, c): + extra_infos = dict() + current_category = None + for node in r: + if node.tag == 'span' and node.get('class') == 'text11': + current_category = node.text.strip() + current_category = '대안반영폐기 의안목록' if current_category.startswith('대안반영폐기 의안목록') else current_category + continue + + if current_category == None: + continue + + extra_infos[current_category] = extra_infos[current_category] if extra_infos.has_key(current_category) else [] + content = None + if current_category == '비고': + content = extract_remark(node) + elif current_category == '대안': + content = extract_bill_id_from_link(meta, node) + elif current_category == '대안반영폐기 의안목록': + content = extract_bill_id_from_link(meta, node) + else: + content = lxml.html.tostring(node) + + if content: + extra_infos[current_category].append(content) + return extra_infos + + def extract_remark(c): + try: + if c.tag == 'br': + return c.tail.strip() + else: + return c.text.strip() + except AttributeError: + return None + + def extract_bill_id_from_link(meta, c): + # Assume this is tag + href = c.get('href') + match = re.match('/bill/jsp/BillDetail.jsp\?bill_id=(.*)', href) + if match: + return meta.query('link_id == @match.group(1)')['bill_id'].values[0] + return None + fn = '%s/%d/%s.html' % (DIR['specifics'], assembly_id, bill_id) page = utils.read_webpage(fn) table = utils.get_elems(page, X['spec_table'])[1] @@ -186,9 +230,7 @@ def status_info(es, et, status_en): if row_titles[i]!='부가정보': status_dict[row_titles[i]] = extract_row_contents(r) else: - t = r.xpath('span[@class="text8"]/text()') - c = filter(None, (t.strip() for t in r.xpath('text()'))) - status_dict[row_titles[i]] = dict(zip(t, c)) + status_dict[row_titles[i]] = extract_extra_info(meta, r) headers = ['assembly_id', 'bill_id', 'title', 'status_detail', 'statuses', 'status_infos', 'status_dict'] specifics = [assembly_id, bill_id, title, status_detail, statuses, status_infos, status_dict]