From 69b1b211acbaa770f132c33ba46288e2646afd73 Mon Sep 17 00:00:00 2001 From: Seong JuWon <32597561+lumyjuwon@users.noreply.github.com> Date: Thu, 16 May 2019 18:00:37 +0900 Subject: [PATCH] Add OverbalanceMonth and Fix Parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 뉴스 본문에서 "flash 우회 오류 ..." 내용 제거 및 왼쪽 공백 제거 start_month 및 end_month 조건 추가 운영체제에 따라 euc-kr 또는 utf-8 사용 --- .gitignore | 12 ++++++++++++ korea_news_crawler/articlecrawler.py | 26 +++++++++++++++++--------- korea_news_crawler/articleparser.py | 6 +++--- korea_news_crawler/exceptions.py | 10 ++++++++++ korea_news_crawler/sample.py | 4 ++-- 5 files changed, 44 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index f8bb5ae..43bd96a 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,15 @@ korea_news_crawler/__pycache__/exceptions.cpython-36.pyc korea_news_crawler/__pycache__/articleparser.cpython-36.pyc korea_news_crawler/__pycache__/articlecrawler.cpython-36.pyc .idea/KoreaNewsCrawler.iml +build/lib/korea_news_crawler/articlecrawler.py +build/lib/korea_news_crawler/articleparser.py +build/lib/korea_news_crawler/exceptions.py +build/lib/korea_news_crawler/sample.py +build/lib/korea_news_crawler/sportcrawler.py +dist/KoreaNewsCrawler-1.0-py3-none-any.whl +KoreaNewsCrawler.egg-info/dependency_links.txt +KoreaNewsCrawler.egg-info/not-zip-safe +KoreaNewsCrawler.egg-info/PKG-INFO +KoreaNewsCrawler.egg-info/requires.txt +KoreaNewsCrawler.egg-info/SOURCES.txt +KoreaNewsCrawler.egg-info/top_level.txt diff --git a/korea_news_crawler/articlecrawler.py b/korea_news_crawler/articlecrawler.py index 579c3d4..2b905ab 100644 --- a/korea_news_crawler/articlecrawler.py +++ b/korea_news_crawler/articlecrawler.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- from time import sleep from bs4 import BeautifulSoup @@ -7,6 +7,7 @@ from korea_news_crawler.exceptions import * from korea_news_crawler.articleparser
import ArticleParser import os +import platform import calendar import requests import csv @@ -20,6 +21,7 @@ def __init__(self): 'politics': 100, 'economy': 101, 'society': 102, 'living_culture': 103, 'world': 104, 'IT_science': 105} self.selected_categories = [] self.date = {'start_year': 0, 'start_month': 0, 'end_year': 0, 'end_month': 0} + self.user_operating_system = str(platform.system()) def set_category(self, *args): for key in args: @@ -35,6 +37,8 @@ def set_date_range(self, start_year, start_month, end_year, end_month): raise InvalidMonth(start_month) if end_month < 1 or end_month > 12: raise InvalidMonth(end_month) + if (start_year, start_month) > (end_year, end_month): + raise OverbalanceMonth(start_month, end_month) for key, date in zip(self.date, args): self.date[key] = date print(self.date) @@ -87,14 +91,18 @@ def crawling(self, category_name): save_endmonth = str(self.date['end_month']) # 각 카테고리 기사 저장 할 CSV - file = open('Article_' + category_name + '_' + str(self.date['start_year']) + save_startmonth\ - + '_' + str(self.date['end_year']) + save_endmonth + '.csv', 'w', encoding='utf-8', newline='') + # Windows use cp949 + if self.user_operating_system == "Windows": + file = open('Article_' + category_name + '_' + str(self.date['start_year']) + save_startmonth + + '_' + str(self.date['end_year']) + save_endmonth + '.csv', 'w', encoding='cp949', newline='') + else: + file = open('Article_' + category_name + '_' + str(self.date['start_year']) + save_startmonth + + '_' + str(self.date['end_year']) + save_endmonth + '.csv', 'w', encoding='utf-8', newline='') wcsv = csv.writer(file) del save_startmonth, save_endmonth # 기사 URL 형식 - url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str( - self.categories.get(category_name)) + "&date=" + url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(self.categories.get(category_name)) + "&date=" # start_year년 start_month월 ~ end_year의 end_month 날짜까지 기사를 수집합니다.
final_urlday = self.make_news_page_url(url, self.date['start_year'], self.date['end_year'], self.date['start_month'], self.date['end_month']) print(category_name + " Urls are generated") @@ -145,7 +153,7 @@ def crawling(self, category_name): tag_company = document_content.find_all('meta', {'property': 'me2:category1'}) text_company = '' # 언론사 초기화 text_company = text_company + str(tag_company[0].get('content')) - if not text_company: # 공백일 경우 기사 제외 처리 굳. + if not text_company: # 공백일 경우 기사 제외 처리 continue # CSV 작성 wcsv.writerow([news_date, category_name, text_company, text_headline, text_sentence, content_url]) @@ -156,7 +164,7 @@ def crawling(self, category_name): del request_content, document_content except Exception as ex: # UnicodeEncodeError .. - wcsv.writerow([ex, content_url]) + # wcsv.writerow([ex, content_url]) del request_content, document_content pass file.close() @@ -170,6 +178,6 @@ def start(self): if __name__ == "__main__": Crawler = ArticleCrawler() - Crawler.set_category("생활문화","IT과학") # 정치, 경제, 생활문화, IT과학, 세계, 사회 카테고리 사용 가능 - Crawler.set_date_range(2017, 1, 2018, 4) # 2017년 1월부터 2018년 4월까지 크롤링 시작 + Crawler.set_category("생활문화", "IT과학") + Crawler.set_date_range(2017, 1, 2017, 1) Crawler.start() diff --git a/korea_news_crawler/articleparser.py b/korea_news_crawler/articleparser.py index 0fa1859..e465ff5 100644 --- a/korea_news_crawler/articleparser.py +++ b/korea_news_crawler/articleparser.py @@ -6,14 +6,14 @@ class ArticleParser(object): def __init__(self): self.special_symbol = re.compile('[\{\}\[\]\/?,;:|\)*~`!^\-_+<>@\#$&▲▶◆◀■【】\\\=\(\'\"]') - self.content_pattern = re.compile('본문 내용|TV플레이어| 동영상 뉴스|flash 오류를 우회하기 위한 함수 추가fuctio flashremoveCallback|tt|앵커 멘트|xa0') + self.content_pattern = re.compile('본문 내용|TV플레이어| 동영상 뉴스|flash 오류를 우회하기 위한 함수 추가function flash removeCallback|tt|앵커 멘트|xa0') def clear_content(self, text): # 기사 본문에서 필요없는 특수문자 및 본문 양식 등을 다 지움 - newline_symbol_removed_text = text.replace('\\n','').replace('\\t','') + 
newline_symbol_removed_text = text.replace('\\n', '').replace('\\t', '') special_symbol_removed_content = re.sub(self.special_symbol, ' ', newline_symbol_removed_text) end_phrase_removed_content = re.sub(self.content_pattern, '', special_symbol_removed_content) - blank_removed_content = re.sub(' +', ' ',end_phrase_removed_content) # 공백 에러 삭제 + blank_removed_content = re.sub(' +', ' ', end_phrase_removed_content).lstrip() # 공백 에러 삭제 reversed_content = ''.join(reversed(blank_removed_content)) # 기사 내용을 reverse 한다. content = '' for i in range(0, len(blank_removed_content)): diff --git a/korea_news_crawler/exceptions.py b/korea_news_crawler/exceptions.py index bb72ea6..a91d729 100644 --- a/korea_news_crawler/exceptions.py +++ b/korea_news_crawler/exceptions.py @@ -51,3 +51,13 @@ def __init__(self, month): def __str__(self): return str(self.message) + + +class OverbalanceMonth(Exception): + def __init__(self, start_month, end_month): + self.start_month = start_month + self.end_month = end_month + self.message = "start_month(" + str(self.start_month) + ") is an overbalance with end_month" + "(" + str(self.end_month) + ")" + + def __str__(self): + return str(self.message) diff --git a/korea_news_crawler/sample.py b/korea_news_crawler/sample.py index d712d52..36a0c0a 100644 --- a/korea_news_crawler/sample.py +++ b/korea_news_crawler/sample.py @@ -2,6 +2,6 @@ if __name__ == "__main__": Crawler = ArticleCrawler() - Crawler.set_category("IT과학", "경제") # 정치, 경제, 생활문화, IT과학, 사회 카테고리 사용 가능 - Crawler.set_date_range(2017, 2018, 3) # 2017년 1월부터 2018년 3월까지 크롤링 시작 + Crawler.set_category("IT과학", "경제") # 정치, 경제, 생활문화, IT과학, 사회, 세계 카테고리 사용 가능 + Crawler.set_date_range(2017, 1, 2018, 3) # 2017년 1월부터 2018년 3월까지 크롤링 시작 Crawler.start()