forked from doolin/patentprocessor
-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathprocess.cfg
executable file
·74 lines (64 loc) · 2.98 KB
/
process.cfg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# This is a sample file that configures the environment for the preprocessing
# steps of parsing, cleaning, and consolidation.
# [process] selects which of the configured steps are executed in the current
# run of the preprocessor. Accepts 4 options:
# parse: defines which parsing configuration will be run
# clean: if True, runs the cleaning step on the output of parse
# consolidate: if True, runs the consolidation step on the output of clean
# outputdir: specifies the final destination of the resulting sqlite3 files
[process]
# Selects the parse configuration to run; 'test' matches the [test] section
# defined later in this file.
parse=test
# When True, the cleaning step runs on the output of parse.
clean=True
# When True, the consolidation step runs on the output of clean.
consolidate=True
# Final destination of the resulting sqlite3 files.
outputdir=.
#[defaultparse]
## 'datadir' specifies the path to the directory containing the XML files that
## we want to parse. This path will be evaluated relative to the main directory
## of preprocessor. Defaults to '/data/patentdata/patents/2013'
#
# datadir=/path/to/patent/data
## 'dataregex' specifies the regular expression that matches the XML files that
## we want to parse. If you are downloading data from the USPTO, then the
## default value should be fine. Defaults to 'ipg\d{6}.xml', the format found
## for most USPTO files since 2005
#
# dataregex=ipg\d{6}.xml
## 'years' specifies the range of years for which you want to download and
## parse. If the current year is specified, the script will download all
## possible files. Specifying the 'years' option will ignore the 'datadir'
## option and just download the relevant files to 'downloaddir' (see below)
## Specify years as:
## year1
## year1-year2
## year1,year2,year3
## year1-year2,year3-year4
## latest (downloads the most recent week's data)
## If this option is NOT specified, the parse will run on the contents of 'datadir'
#
# years=2010-2013
## 'downloaddir' specifies the target base directory into which the weekly
## patent files will be downloaded. Note that the preprocessor will create
## directories named for each year inside 'downloaddir', and if they already
## exist, will look inside for previously downloaded files
## If this option is NOT specified, the parse will run on the contents of 'datadir'
#
# downloaddir=/path/to/base/directory/for/downloads
# example configuration for a parse of 2012 data. Note that the 'dataregex'
# option is not specified because the default value is sufficient
[2012parse]
# Directory containing the XML files to parse. 'dataregex' is not set here,
# so its default value applies.
datadir=/data/patentdata/patents/2012
[test]
# Parse configuration currently selected by 'parse=test' in [process].
# Relative path to the directory containing the XML files to parse.
datadir=test/fixtures/xml
# Regular expression selecting which files in 'datadir' are parsed: four
# digits, an underscore, one digit, then any single character followed by 'xml'.
# NOTE(review): the '.' is unescaped, so it matches any character rather than
# only a literal dot -- confirm whether '\.xml' was intended.
dataregex=\d{4}_\d.xml
# This section specifies which grant_handler is to be used for each date of the
# released patent. This section should only have to be touched when a new parser is
# introduced. In the case where a year cannot be parsed from the filename (the
# format `ipgYYMMDD` is assumed), then the default parser is used.
# The dates in the ranges are either YYYY or YYYYMMDD. If only one date is provided,
# then the corresponding handler is assumed for all subsequent patents
[grant-xml-handlers]
# Date range (YYYY to YYYYMMDD): patents dated 2005 up to 20130108 use the
# v42 grant handler.
2005-20130108=lib.handlers.grant_handler_v42
# Single date: the v44 grant handler applies from 20130115 onward, since a
# lone date covers all subsequent patents.
20130115=lib.handlers.grant_handler_v44
# Fallback used when a year cannot be parsed from the filename.
default=lib.handlers.grant_handler_v42
[application-xml-handlers]
# Only a default handler is configured for application XML; no date-specific
# entries are listed.
# NOTE(review): presumably keys follow the same date-range convention as
# [grant-xml-handlers] -- confirm against the code that reads this section.
default=lib.handlers.application_handler_v43