From 69b1b211acbaa770f132c33ba46288e2646afd73 Mon Sep 17 00:00:00 2001 From: Seong JuWon <32597561+lumyjuwon@users.noreply.github.com> Date: Thu, 16 May 2019 18:00:37 +0900 Subject: [PATCH] Add OverbalanceMonth and Fix Parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 뉴스 본문에서 "flash 우회 오류 ..." 내용 제거 및 왼쪽 공백 제거 start_month 및 end_month 조건 추가 운영체제에 따라 euc-kr 또는 utf-8 사용 --- .gitignore | 12 ++++++++++++ korea_news_crawler/articlecrawler.py | 26 +++++++++++++++++--------- korea_news_crawler/articleparser.py | 6 +++--- korea_news_crawler/exceptions.py | 10 ++++++++++ korea_news_crawler/sample.py | 4 ++-- 5 files changed, 44 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index f8bb5ae..43bd96a 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,15 @@ korea_news_crawler/__pycache__/exceptions.cpython-36.pyc korea_news_crawler/__pycache__/articleparser.cpython-36.pyc korea_news_crawler/__pycache__/articlecrawler.cpython-36.pyc .idea/KoreaNewsCrawler.iml +build/lib/korea_news_crawler/articlecrawler.py +build/lib/korea_news_crawler/articleparser.py +build/lib/korea_news_crawler/exceptions.py +build/lib/korea_news_crawler/sample.py +build/lib/korea_news_crawler/sportcrawler.py +dist/KoreaNewsCrawler-1.0-py3-none-any.whl +KoreaNewsCrawler.egg-info/dependency_links.txt +KoreaNewsCrawler.egg-info/not-zip-safe +KoreaNewsCrawler.egg-info/PKG-INFO +KoreaNewsCrawler.egg-info/requires.txt +KoreaNewsCrawler.egg-info/SOURCES.txt +KoreaNewsCrawler.egg-info/top_level.txt diff --git a/korea_news_crawler/articlecrawler.py b/korea_news_crawler/articlecrawler.py index 579c3d4..2b905ab 100644 --- a/korea_news_crawler/articlecrawler.py +++ b/korea_news_crawler/articlecrawler.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- from time import sleep from bs4 import BeautifulSoup @@ -7,6 +7,7 @@ from korea_news_crawler.exceptions import * from korea_news_crawler.articleparser
import ArticleParser import os +import platform import calendar import requests import csv @@ -20,6 +21,7 @@ def __init__(self): 'politics': 100, 'economy': 101, 'society': 102, 'living_culture': 103, 'world': 104, 'IT_science': 105} self.selected_categories = [] self.date = {'start_year': 0, 'start_month': 0, 'end_year': 0, 'end_month': 0} + self.user_operating_system = str(platform.system()) def set_category(self, *args): for key in args: @@ -35,6 +37,8 @@ def set_date_range(self, start_year, start_month, end_year, end_month): raise InvalidMonth(start_month) if end_month < 1 or end_month > 12: raise InvalidMonth(end_month) + if (start_year, start_month) > (end_year, end_month): + raise OverbalanceMonth(start_month, end_month) for key, date in zip(self.date, args): self.date[key] = date print(self.date) @@ -87,14 +91,18 @@ def crawling(self, category_name): save_endmonth = str(self.date['end_month']) # 각 카테고리 기사 저장 할 CSV - file = open('Article_' + category_name + '_' + str(self.date['start_year']) + save_startmonth\ - + '_' + str(self.date['end_year']) + save_endmonth + '.csv', 'w', encoding='utf-8', newline='') + # Windows use cp949 + if self.user_operating_system == "Windows": + file = open('Article_' + category_name + '_' + str(self.date['start_year']) + save_startmonth + + '_' + str(self.date['end_year']) + save_endmonth + '.csv', 'w', encoding='cp949', newline='') + else: + file = open('Article_' + category_name + '_' + str(self.date['start_year']) + save_startmonth + + '_' + str(self.date['end_year']) + save_endmonth + '.csv', 'w', encoding='utf-8', newline='') wcsv = csv.writer(file) del save_startmonth, save_endmonth # 기사 URL 형식 - url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str( - self.categories.get(category_name)) + "&date=" + url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(self.categories.get(category_name)) + "&date=" # start_year년 start_month월 ~ end_year의 end_month 날짜까지 기사를 수집합니다.
final_urlday = self.make_news_page_url(url, self.date['start_year'], self.date['end_year'], self.date['start_month'], self.date['end_month']) print(category_name + " Urls are generated") @@ -145,7 +153,7 @@ def crawling(self, category_name): tag_company = document_content.find_all('meta', {'property': 'me2:category1'}) text_company = '' # 언론사 초기화 text_company = text_company + str(tag_company[0].get('content')) - if not text_company: # 공백일 경우 기사 제외 처리 굳. + if not text_company: # 공백일 경우 기사 제외 처리 continue # CSV 작성 wcsv.writerow([news_date, category_name, text_company, text_headline, text_sentence, content_url]) @@ -156,7 +164,7 @@ def crawling(self, category_name): del request_content, document_content except Exception as ex: # UnicodeEncodeError .. - wcsv.writerow([ex, content_url]) + # wcsv.writerow([ex, content_url]) del request_content, document_content pass file.close() @@ -170,6 +178,6 @@ def start(self): if __name__ == "__main__": Crawler = ArticleCrawler() - Crawler.set_category("생활문화","IT과학") # 정치, 경제, 생활문화, IT과학, 세계, 사회 카테고리 사용 가능 - Crawler.set_date_range(2017, 1, 2018, 4) # 2017년 1월부터 2018년 4월까지 크롤링 시작 + Crawler.set_category("생활문화", "IT과학") + Crawler.set_date_range(2017, 1, 2017, 1) Crawler.start() diff --git a/korea_news_crawler/articleparser.py b/korea_news_crawler/articleparser.py index 0fa1859..e465ff5 100644 --- a/korea_news_crawler/articleparser.py +++ b/korea_news_crawler/articleparser.py @@ -6,14 +6,14 @@ class ArticleParser(object): def __init__(self): self.special_symbol = re.compile('[\{\}\[\]\/?,;:|\)*~`!^\-_+<>@\#$&▲▶◆◀■【】\\\=\(\'\"]') - self.content_pattern = re.compile('본문 내용|TV플레이어| 동영상 뉴스|flash 오류를 우회하기 위한 함수 추가fuctio flashremoveCallback|tt|앵커 멘트|xa0') + self.content_pattern = re.compile('본문 내용|TV플레이어| 동영상 뉴스|flash 오류를 우회하기 위한 함수 추가function flash removeCallback|tt|앵커 멘트|xa0') def clear_content(self, text): # 기사 본문에서 필요없는 특수문자 및 본문 양식 등을 다 지움 - newline_symbol_removed_text = text.replace('\\n','').replace('\\t','') + 
newline_symbol_removed_text = text.replace('\\n', '').replace('\\t', '') special_symbol_removed_content = re.sub(self.special_symbol, ' ', newline_symbol_removed_text) end_phrase_removed_content = re.sub(self.content_pattern, '', special_symbol_removed_content) - blank_removed_content = re.sub(' +', ' ',end_phrase_removed_content) # 공백 에러 삭제 + blank_removed_content = re.sub(' +', ' ', end_phrase_removed_content).lstrip() # 공백 에러 삭제 reversed_content = ''.join(reversed(blank_removed_content)) # 기사 내용을 reverse 한다. content = '' for i in range(0, len(blank_removed_content)): diff --git a/korea_news_crawler/exceptions.py b/korea_news_crawler/exceptions.py index bb72ea6..a91d729 100644 --- a/korea_news_crawler/exceptions.py +++ b/korea_news_crawler/exceptions.py @@ -51,3 +51,13 @@ def __init__(self, month): def __str__(self): return str(self.message) + + +class OverbalanceMonth(Exception): + def __init__(self, start_month, end_month): + self.start_month = start_month + self.end_month = end_month + self.message = "start_month(" + str(self.start_month) + ") is an overbalance with end_month" + "(" + str(self.end_month) + ")" + + def __str__(self): + return str(self.message) diff --git a/korea_news_crawler/sample.py b/korea_news_crawler/sample.py index d712d52..36a0c0a 100644 --- a/korea_news_crawler/sample.py +++ b/korea_news_crawler/sample.py @@ -2,6 +2,6 @@ if __name__ == "__main__": Crawler = ArticleCrawler() - Crawler.set_category("IT과학", "경제") # 정치, 경제, 생활문화, IT과학, 사회 카테고리 사용 가능 - Crawler.set_date_range(2017, 2018, 3) # 2017년 1월부터 2018년 3월까지 크롤링 시작 + Crawler.set_category("IT과학", "경제") # 정치, 경제, 생활문화, IT과학, 사회, 세계 카테고리 사용 가능 + Crawler.set_date_range(2017, 1, 2018, 3) # 2017년 1월부터 2018년 3월까지 크롤링 시작 Crawler.start()