datapusher/dot-env.template

# To specify Datapusher+ settings, modify and copy this file to ".env"
# and put it in the working directory from which DP+ is started.
# e.g. in development mode, in the datapusher-plus/datapusher directory
# in production mode, in the /etc/ckan/datapusher-plus directory
#
# Note that DP+ settings can also be passed using environment variables
# e.g. export PII_SCREENING=True

# ============= DATABASE SETTINGS =============
# The connect string of the CKAN Datastore
WRITE_ENGINE_URL = 'postgresql://datapusher:YOURPASSWORD@localhost/datastore_default'

# The connect string of the Datapusher+ Job database
SQLALCHEMY_DATABASE_URI = 'postgresql://datapusher_jobs:YOURPASSWORD@localhost/datapusher_jobs'

# READ BUFFER SIZE IN BYTES WHEN READING CSV FILE WHEN USING POSTGRES COPY
# default 64k = 65536
COPY_READBUFFER_SIZE = 65536

# =============== DOWNLOAD SETTINGS ==============
# 25mb. This is ignored if PREVIEW_ROWS > 0
MAX_CONTENT_LENGTH = 25600000

# A Datapusher+ job is triggered automatically every time a resource is modified (even just its metadata)
# if its mimetype is one of the supported datapusher.formats.
# To ensure DP+ doesn't push an unchanged resource, it computes and stores the hash of the file.
# If the hash has not changed (i.e. the file has not been modified), it refrains from "re-pushing" it.
IGNORE_FILE_HASH = False

# In bytes. The resource is downloaded on a streaming basis, 16K at a time
CHUNK_SIZE = 16384

# In seconds. How long before DP+ download times out
DOWNLOAD_TIMEOUT = 30

# Whether to verify the SSL certificate when downloading. This is set to False by default
# since externally hosted datasets may sometimes have expired/self-signed SSL certificates
SSL_VERIFY = False

# If this is not zero, the number of preview rows to push into the datastore
# If zero, it pushes the entire file
PREVIEW_ROWS = 0

DOWNLOAD_PROXY = ''

# =========== CKAN SERVICE PROVIDER SETTINGS ==========
HOST = "0.0.0.0"
PORT = 8800

# turns on logger at Debug level
DEBUG = False
# If False, configures the logger for production
# i.e. logs to STDERR and LOG_FILE (autorotates after 68mb, with 5 backups),
# and emails errors to admins.
# If True, only turns on Debug if DEBUG = True
TESTING = False

FROM_EMAIL = 'dpplus-errors@domain.com'
# comma-delimited list of emails to send CKAN Service Provider errors to
ADMINS = ''

# Error logging
LOG_FILE = '/tmp/ckan_service.log'
# Also show log on STDERR
STDERR = True

# These settings are randomly generated by default
# only set these if you need to interface with the CKAN Service Provider API
# see https://ckan-service-provider.readthedocs.io/
# SECRET_KEY = "please replace me"
# USERNAME = "admin"
# PASSWORD = "changeme"

# number of days to keep job history
KEEP_JOBS_AGE = 60

# ============ QSV ANALYSIS SETTINGS ==========

# ---------- BINARY PATHS -------------
# qsv binary to use
# optionally, you can also use qsvdp_nightly.
# qsvdp is already very fast, but if you want even more speed,
# qsvdp_nightly is compiled/linked in such a way that it's even faster/smaller
# see https://github.com/jqnatividad/qsv/blob/master/docs/PERFORMANCE.md#nightly-release-builds
QSV_BIN = '/usr/local/bin/qsvdp'

# file binary to use. `file` is used to get file metadata to display on the log
# if qsv cannot open a spreadsheet file (probably because it's password-protected or corrupt)
FILE_BIN = '/usr/bin/file'

# Dates are parsed with an MDY preference by default
# set PREFER_DMY = True if date-parsing should prefer DMY instead
PREFER_DMY = False

# The zero-based index of the default sheet to export to CSV. 0 is the first sheet.
# Accepts negative numbers. -1 is the last sheet, -2 the 2nd to last sheet, etc.
DEFAULT_EXCEL_SHEET = 0

# Check if a file is sorted and has duplicates
SORT_AND_DUPE_CHECK = True

# Should CSVs be deduped? Note that deduping also
# sorts the CSV.
DEDUP = False

# --------- COLUMN HEADER NAME SAFENAMES SETTINGS --------
# unsafe prefix to use if a column name is found to be "unsafe"
UNSAFE_PREFIX = unsafe_
# Comma-delimited list of additional case-insensitive reserved names
# that should be considered "unsafe". If a header name is found in the
# list, it will be prefixed with "reserved_"
RESERVED_COLNAMES = _id

# -------- SUMMARY STATS SETTINGS -----------
# Create a resource for calculated summary stats?
ADD_SUMMARY_STATS_RESOURCE = False

# additional command line options to pass to qsv stats when creating
# summary stats. Set to `--everything` if you want to include all the stats,
# particularly, when ADD_SUMMARY_STATS_RESOURCE is True
SUMMARY_STATS_OPTIONS = ''

# -------- AUTO INDEX SETTINGS ----------
# if AUTO_INDEX_THRESHOLD > 0 or AUTO_INDEX_DATES is true
# create indices automatically based on a column's cardinality (number of unique values)
#   - if a column's cardinality <= AUTO_INDEX_THRESHOLD, create an index for that column
#   - if AUTO_INDEX_THRESHOLD = -1, index all columns regardless of their cardinality
AUTO_INDEX_THRESHOLD = 3

# for columns w/ cardinality equal to record_count, it's all unique values, create a unique index
AUTO_UNIQUE_INDEX = True

# always index date fields?
AUTO_INDEX_DATES = True

# ------ AUTO ALIAS SETTINGS ----------
# Should an alias be automatically created?
# Aliases are easier to use than resource_ids, and can be used with the CKAN API where
# resource_ids are used. Aliases are also SQL views that are easier to use when querying
# the CKAN Datastore database.
# Aliases are created by concatenating "{resource_name}-{package_name}-{owner_org_name}"
# truncated at 55 characters.
AUTO_ALIAS = False

# Should aliases always be unique? In case of an alias name collision, a three-digit
# sequence number is appended.
AUTO_ALIAS_UNIQUE = False

# -------- PII SETTINGS -----------
PII_SCREENING = False

# Stop scanning on first PII found
PII_QUICK_SCREEN = False

# Abort Datapusher+ job if PII is found
PII_FOUND_ABORT = True

# Create a resource where PII candidates are stored?
PII_SHOW_CANDIDATES = True

# The resource ID/alias of a text file that has the
# regex patterns to use for PII scanning.
# If this is not specified, the default PII scanning rules in
# default_pii_regexes.txt are used.
# Note that Rust regex syntax is used, NOT Python, as we use the
# qsv searchset command which can scan for MULTIPLE regex patterns in
# one pass, and is at least an order of magnitude faster than Python
# https://docs.rs/regex/latest/regex/index.html#syntax
# You can test your regex at https://regex101.com using the Rust Flavor
PII_REGEX_RESOURCE_ID_OR_ALIAS = ''