From b8ced94a61b9f39a891c9580011f64f6e2577df8 Mon Sep 17 00:00:00 2001 From: Paul Schilling Date: Sun, 14 Jan 2024 12:38:38 +0100 Subject: [PATCH] Update headers --- .pylintrc | 438 ------------------------- articles/management/commands/scrape.py | 2 +- articles/scraper/headers.py | 82 ++--- articles/scraper/spider.py | 3 +- fixtures/sources2.json | 316 ++++++++++++++++++ 5 files changed, 348 insertions(+), 493 deletions(-) delete mode 100644 .pylintrc create mode 100644 fixtures/sources2.json diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index 74ff901..0000000 --- a/.pylintrc +++ /dev/null @@ -1,438 +0,0 @@ -# This Pylint rcfile contains a best-effort configuration to uphold the -# best-practices and style described in the Google Python style guide: -# https://google.github.io/styleguide/pyguide.html -# -# Its canonical open-source location is: -# https://google.github.io/styleguide/pylintrc -# -# (slightly modified version) - - -[MASTER] - - -# Files or directories to be skipped. They should be base names, not paths. -ignore=third_party, - headers.py, - migrations, - -# Files or directories matching the regex patterns are skipped. The regex -# matches against base names, not paths. -ignore-patterns= - -# Pickle collected data for later comparisons. -persistent=no - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins= - -# Use multiple processes to speed up Pylint. -jobs=4 - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED -confidence= - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -#enable= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once).You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use"--disable=all --enable=classes -# --disable=W" -disable=abstract-method, - apply-builtin, - arguments-differ, - attribute-defined-outside-init, - backtick, - bad-option-value, - basestring-builtin, - buffer-builtin, - c-extension-no-member, - consider-using-enumerate, - cmp-builtin, - cmp-method, - coerce-builtin, - coerce-method, - delslice-method, - div-method, - duplicate-code, - eq-without-hash, - execfile-builtin, - file-builtin, - filter-builtin-not-iterating, - fixme, - getslice-method, - global-statement, - hex-method, - idiv-method, - implicit-str-concat, - import-error, - import-self, - import-star-module-level, - inconsistent-return-statements, - input-builtin, - intern-builtin, - invalid-str-codec, - locally-disabled, - long-builtin, - long-suffix, - map-builtin-not-iterating, - misplaced-comparison-constant, - missing-function-docstring, - metaclass-assignment, - next-method-called, - next-method-defined, - no-absolute-import, - no-else-break, - no-else-continue, - no-else-raise, - no-else-return, - no-init, # added - no-member, - no-name-in-module, - no-self-use, - nonzero-method, - oct-method, - old-division, - old-ne-operator, - old-octal-literal, - old-raise-syntax, - parameter-unpacking, - print-statement, - raising-string, - range-builtin-not-iterating, - raw_input-builtin, - rdiv-method, - reduce-builtin, - relative-import, - reload-builtin, - round-builtin, - setslice-method, - signature-differs, - standarderror-builtin, - suppressed-message, - sys-max-int, - too-few-public-methods, - too-many-ancestors, - too-many-arguments, - too-many-boolean-expressions, - too-many-branches, - too-many-instance-attributes, - too-many-locals, - too-many-nested-blocks, - too-many-public-methods, - too-many-return-statements, - too-many-statements, - trailing-newlines, - unichr-builtin, - unicode-builtin, - unnecessary-pass, - unpacking-in-except, - useless-else-on-loop, - useless-object-inheritance, - useless-suppression, - using-cmp-argument, - wrong-import-order, - xrange-builtin, - zip-builtin-not-iterating, - bad-indentation, - missing-module-docstring, - - -[REPORTS] - -# Set the output format. Available formats are text, parseable, colorized, msvs -# (visual studio) and html. You can also give a reporter class, eg -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Tells whether to display a full report or only the messages -reports=no - -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables errors warning, statement which -# respectively contain the number of errors / warnings messages and the total -# number of statements analyzed. This is used by the global evaluation report -# (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details -#msg-template= - - -[BASIC] - -# Good variable names which should always be accepted, separated by a comma -good-names=main,e,i,j,k,m,n,_,foo,bar,baz, - -# Bad variable names which should always be refused, separated by a comma -bad-names= - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Include a hint for the correct naming format with invalid-name -include-naming-hint=no - -# List of decorators that produce properties, such as abc.abstractproperty. Add -# to this list to register other decorators that produce valid properties. -property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl - -# Regular expression matching correct function names -function-rgx=^(?:(?PsetUp|tearDown|setUpModule|tearDownModule)|(?P_?[A-Z][a-zA-Z0-9]*)|(?P_?[a-z][a-z0-9_]*))$ - -# Regular expression matching correct variable names -variable-rgx=^[a-z][a-z0-9_]*$ - -# Regular expression matching correct constant names -const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ - -# Regular expression matching correct attribute names -attr-rgx=^_{0,2}[a-z][a-z0-9_]*$ - -# Regular expression matching correct argument names -argument-rgx=^[a-z][a-z0-9_]*$ - -# Regular expression matching correct class attribute names -class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ - -# Regular expression matching correct inline iteration names -inlinevar-rgx=^[a-z][a-z0-9_]*$ - -# Regular expression matching correct class names -class-rgx=^_?[A-Z][a-zA-Z0-9]*$ - -# Regular expression matching correct module names -module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$ - -# Regular expression matching correct method names -method-rgx=(?x)^(?:(?P_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P_{0,2}[a-z][a-z0-9_]*))$ - -# Regular expression which should only match function or class names that do -# not require a docstring. -no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$ - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=10 - - -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. -contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager - -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis. It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members= - - -[FORMAT] - -# Maximum number of characters on a single line. -max-line-length=85 - -# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt -# lines made too long by directives to pytype. - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=(?x)( - ^\s*(\#\ )??$| - ^\s*(from\s+\S+\s+)?import\s+.+$) - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=yes - -# Maximum number of lines in a module -max-module-lines=99999 - -# String used as indentation unit. The internal Google style guide mandates 2 -# spaces. Google's externaly-published style guide says 4, consistent with -# PEP 8. Here, we use 2 spaces, for conformity with many open-sourced Google -# projects (like TensorFlow). -indent-string=' ' - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=TODO - - -[STRING] - -# This flag controls whether inconsistent-quotes generates a warning when the -# character used as a quote delimiter is used inconsistently within a module. -check-quote-consistency=yes - - -[VARIABLES] - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# A regular expression matching the name of dummy variables (i.e. expectedly -# not used). -dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_) - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid to define new builtins when possible. -additional-builtins= - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_,_cb - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools - - -[LOGGING] - -# Logging modules to check that the string format arguments are in logging -# function parameter format -logging-modules=logging,absl.logging,tensorflow.io.logging - - -[SIMILARITIES] - -# Minimum lines number of a similarity. -min-similarity-lines=4 - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - - -[SPELLING] - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package. -spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - - -[IMPORTS] - -# Deprecated modules which should not be used, separated by a comma -deprecated-modules=regsub, - TERMIOS, - Bastion, - rexec, - sets - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled) -import-graph= - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled) -ext-import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled) -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant, absl - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__, - __new__, - setUp - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict, - _fields, - _replace, - _source, - _make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls, - class_ - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=mcs - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "Exception" -overgeneral-exceptions=StandardError, - Exception, - BaseException - diff --git a/articles/management/commands/scrape.py b/articles/management/commands/scrape.py index 4d1152c..f0405e7 100644 --- a/articles/management/commands/scrape.py +++ b/articles/management/commands/scrape.py @@ -17,7 +17,7 @@ logger = logging.getLogger(__name__) -SCRAPING_INTERVAL = 30 # minutes +SCRAPING_INTERVAL = 180 # minutes def scrape(sitemap: dict): diff --git a/articles/scraper/headers.py b/articles/scraper/headers.py index 53f615b..ccbf7cc 100644 --- a/articles/scraper/headers.py +++ b/articles/scraper/headers.py @@ -1,60 +1,38 @@ headers = [ - # { - # "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36", - # "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", - # "accept-language": "en-US,en;q=0.9", - # "authority": "httpbin.org", - # "cache-control": "max-age=0", - # "sec-ch-ua": '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"', - # "sec-ch-ua-mobile": "?0", - # "sec-fetch-site": "none", - # "sec-fetch-mode": "navigate", - # "sec-fetch-user": "?1", - # "sec-fetch-dest": "document", - # "upgrade-insecure-requests": "1", - # }, - # { - # "user-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0", - # "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - # "authority": "httpbin.org", - # "cache-control": "max-age=0", - # "sec-ch-ua": '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"', - # "sec-ch-ua-mobile": "?0", - # "upgrade-insecure-requests": "1", - # "sec-fetch-site": "none", - # "sec-fetch-mode": "navigate", - # "sec-fetch-user": "?1", - # "sec-fetch-dest": "document", - # "accept-language": "en-US,en;q=0.5", - # }, - # { - # "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36", - # "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", - # "accept-language": "en-US,en;q=0.9", - # "accept-encoding": "gzip, deflate, br", - # "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"', - # "sec-ch-ua-mobile": "?0", - # "sec-ch-ua-platform": "Linux", - # "sec-ch-ua-platform-version": "5.13.0", - # "sec-fetch-site": "none", - # "sec-fetch-mode": "navigate", - # "sec-fetch-user": "?1", - # "sec-fetch-dest": "document", - # "sec-fetch-user": "?1", - # "upgrade-insecure-requests": "1", - # }, - # { - # "user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0", - # "cache-control": "max-age=0", - # "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", - # "sec-fetch-mode": "navigate", - # "accept-language": "en-US,en;q=0.9", - # "referer": "https://www.google.com/", - # }, { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0", "Accept": "application/json, text/plain, */*", "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3", "referer": "https://www.google.com/", }, + { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36", + "Accept": "application/json, text/plain, */*", + "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3", + "referer": "https://www.google.com/", + }, + { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36", + "Accept": "application/json, text/plain, */*", + "Accept-Language": "en-US;q=0.9,en", + "referer": "https://www.google.com/", + }, + { + "User-Agent": "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_1_2 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7D11 Safari/528.16", + "Accept": "application/json, text/plain, */*", + "Accept-Language": "en-US;q=0.9,en", + "referer": "https://www.google.com/", + }, + { + "User-Agent": "Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.65 Mobile Safari/537.36", + "Accept": "application/json, text/plain, */*", + "Accept-Language": "en-US;q=0.9,en", + "referer": "https://www.google.com/", + }, + { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36", + "Accept": "application/json, text/plain, */*", + "Accept-Language": "en-US;q=0.9,en", + "referer": "https://www.google.com/", + }, ] diff --git a/articles/scraper/spider.py b/articles/scraper/spider.py index c8a3aba..25ec698 100644 --- a/articles/scraper/spider.py +++ b/articles/scraper/spider.py @@ -1,7 +1,6 @@ import asyncio import logging import random -from http.cookiejar import DefaultCookiePolicy from typing import Optional import pyppeteer @@ -112,7 +111,7 @@ def crawl(sitemap: dict): asyncio.set_event_loop(loop) asession = AsyncHTMLSession() # reject cookies - asession.cookies.set_policy(DefaultCookiePolicy(allowed_domains=[])) + # asession.cookies.set_policy(DefaultCookiePolicy(allowed_domains=[])) try: loop.run_until_complete(spider.collect_links(asession)) loop.run_until_complete(spider.collect_metadata(asession)) diff --git a/fixtures/sources2.json b/fixtures/sources2.json new file mode 100644 index 0000000..1757aab --- /dev/null +++ b/fixtures/sources2.json @@ -0,0 +1,316 @@ +[ +{ + "model": "articles.source", + "pk": 1, + "fields": { + "name": "Al Jazeera", + "slug": "al-jazeera", + "publication_type": "newspaper/journal", + "language": "en", + "link": "https://www.aljazeera.com/", + "paths": [ + "news/" + ], + "regex": "/[0-9]{4}/[0-9]+/[0-9]+/(?!.*terms-and-conditions/|.*community-rules-guidelines/|.*eu-eea-regulatory|.*code-of-ethics)", + "javascript_required": false, + "headline_selectors": { + "tag": "h1", + "attrs": {} + }, + "summary_selectors": { + "tag": "p", + "attrs": { + "class": "article__subhead" + } + } + } +}, +{ + "model": "articles.source", + "pk": 2, + "fields": { + "name": "Associated Press", + "slug": "", + "publication_type": "newspaper/journal", + "language": "en", + "link": "https://apnews.com/", + "paths": [ + "hub/world-news/", + "hub/business/" + ], + "regex": "article/", + "javascript_required": false, + "headline_selectors": { + "tag": "h1", + "attrs": {} + }, + "summary_selectors": { + "tag": "div", + "attrs": { + "class": "Article" + } + } + } +}, +{ + "model": "articles.source", + "pk": 3, + "fields": { + "name": "Christian Science Monitor", + "slug": "", + "publication_type": "newspaper/journal", + "language": "en", + "link": "https://www.csmonitor.com/", + "paths": [ + "world/", + "USA/" + ], + "regex": "USA/.*[0-9]{4}/[0-9]{4}/|World/.*[0-9]{4}/[0-9]{4}/", + "javascript_required": false, + "headline_selectors": { + "tag": "h1", + "attrs": { + "id": "headline" + } + }, + "summary_selectors": { + "tag": "div", + "attrs": { + "id": "summary" + } + } + } +}, +{ + "model": "articles.source", + "pk": 4, + "fields": { + "name": "Consortium News", + "slug": "", + "publication_type": "newspaper/journal", + "language": "en", + "link": "https://consortiumnews.com/", + "paths": [ + "" + ], + "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}/(?!.*contact-us-form/|.*parry-awarded|.*robert-parrys-legacy)[a-z]+(?!.*policy)(?!.*live)(?!.*fund-drive)", + "javascript_required": false, + "headline_selectors": { + "tag": "h1", + "attrs": {} + }, + "summary_selectors": { + "tag": "p", + "attrs": {} + } + } +}, +{ + "model": "articles.source", + "pk": 5, + "fields": { + "name": "Current Affairs", + "slug": "", + "publication_type": "newspaper/journal", + "language": "en", + "link": "https://www.currentaffairs.org/", + "paths": [ + "category/politics/", + "category/economics/", + "category/interviews/", + "category/history/" + ], + "regex": "[0-9]{4}/[0-9]{2}/", + "javascript_required": false, + "headline_selectors": { + "tag": "h1", + "attrs": { + "class": "title" + } + }, + "summary_selectors": { + "tag": "div", + "attrs": { + "class": "tagline" + } + } + } +}, +{ + "model": "articles.source", + "pk": 6, + "fields": { + "name": "New York Times", + "slug": "", + "publication_type": "newspaper/journal", + "language": "en", + "link": "https://www.nytimes.com/", + "paths": [ + "section/world/", + "section/business/" + ], + "regex": "(?