<?xml version="1.0"?>
<configuration>
<property>
<!-- Run the job verbosely? -->
<name>mongo.job.verbose</name>
<value>false</value>
</property>
<!-- It is *strongly* recommended, for consistent behavior, that you disable speculative execution in Hadoop -->
<property>
<name>mapred.map.tasks.speculative.execution</name>
<value>false</value>
</property>
<property>
<name>mapred.reduce.tasks.speculative.execution</name>
<value>false</value>
</property>
<property>
<!-- Run the job in the foreground and wait for response, or background it? -->
<name>mongo.job.background</name>
<value>false</value>
</property>
<property>
<!-- If the input collection is sharded, read each shard chunk as an InputSplit for Hadoop?
NOTE: Combining this with 'mongo.input.split.read_from_shards' can cause erratic/unexpected
behavior, as chunks will be forcibly read from their last known shard rather than
through mongos, which accounts for/allows for migrations.
You must also enable 'mongo.input.split.create_input_splits' for any chunk reading
to occur.
-->
<name>mongo.input.split.read_shard_chunks</name>
<value>true</value>
</property>
<property>
<!-- Read directly from the shard servers? If enabled, connects directly to each
configured shard host or replica set. If disabled (the default),
sends all reads through mongos.
For most users this shouldn't be enabled.
Left mutable for advanced users and/or the clinically insane.
-->
<name>mongo.input.split.read_from_shards</name>
<value>false</value>
</property>
<property>
<!--
Should mongo-hadoop attempt to calculate
InputSplits for you?
*** If your input collection is sharded:
You'll also need to enable 'read_shard_chunks' to read
individual chunk data into the system. Otherwise, enabling this
will use the "Unsharded" mode described below.
*** If your input collection is unsharded (or you didn't enable chunk reading):
If this setting is true (the default), 'mongo.input.split_size' will
control the approximate number of megabytes assigned to each calculated InputSplit.
You'll probably want to set 'mongo.input.split.split_key_pattern' as well
to control the split point (defaults to {_id: 1}).
If this is false, only one InputSplit is created, which is fairly inefficient
Hadoopery.
-->
<name>mongo.input.split.create_input_splits</name>
<value>true</value>
</property>
<property>
<!-- If working on an unsharded collection with 'create_input_splits' enabled,
this keyPattern will be used to calculate the split points. It needs to be a JSON
keyPattern spec and follows the *exact same rules* as choosing a shard key for an
existing MongoDB collection that already contains data.
That includes the key being indexed: http://www.mongodb.org/display/DOCS/Sharding+Introduction#ShardingIntroduction-ShardKeys
Defaults to _id.
-->
<name>mongo.input.split.split_key_pattern</name>
<value>{ _id: 1 }</value>
</property>
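<!-- A hypothetical compound pattern, for illustration only (field names are made up):
<value>{ customer_id: 1, created_at: 1 }</value>
The same rules as shard key selection apply, so the chosen fields should be indexed. -->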
<property>
<!-- The field to pass as the mapper key. Defaults to _id if blank -->
<name>mongo.input.key</name>
<value></value>
</property>
<property>
<!-- If true, queries for InputSplits don't time out (the QUERY_NOTIMEOUT flag is set). Behavior may be erratic -->
<name>mongo.input.notimeout</name>
<value>false</value>
</property>
<property>
<!-- If you are reading from mongo, the URI -->
<name>mongo.input.uri</name>
<value><!-- mongodb://localhost/db.collection?option=value --></value>
</property>
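<!-- For illustration only (hostnames and namespace are hypothetical), a replica-set input URI with a read preference might look like:
<value>mongodb://host1:27017,host2:27017/mydb.mycollection?readPreference=secondary</value> -->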
<property>
<!-- If you are writing to mongo, the URI -->
<name>mongo.output.uri</name>
<value><!-- mongodb://localhost/db.collection?option=value --></value>
</property>
<property>
<!-- The query, in JSON, to execute [OPTIONAL] -->
<name>mongo.input.query</name>
<value></value>
</property>
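<!-- An illustrative query (field names are hypothetical); standard MongoDB query operators apply:
<value>{ "type": "sensor", "value": { "$gte": 20 } }</value> -->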
<property>
<!-- The fields, in JSON, to read [OPTIONAL] -->
<name>mongo.input.fields</name>
<value></value>
</property>
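<!-- An illustrative projection (field names are hypothetical), using standard MongoDB projection syntax;
_id is included by default unless excluded with "_id": 0:
<value>{ "user_id": 1, "amount": 1 }</value> -->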
<property>
<!-- A JSON sort specification for read [OPTIONAL] -->
<name>mongo.input.sort</name>
<value></value>
</property>
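<!-- An illustrative sort spec (field name is hypothetical); 1 for ascending, -1 for descending:
<value>{ "timestamp": -1 }</value> -->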
<property>
<!-- The maximum number of documents to read (limit) [OPTIONAL] -->
<name>mongo.input.limit</name>
<value>0</value> <!-- 0 == no limit -->
</property>
<property>
<!-- The number of documents to skip when reading [OPTIONAL] -->
<!-- TODO - Are we running limit() or skip() first? -->
<name>mongo.input.skip</name>
<value>0</value> <!-- 0 == no skip -->
</property>
<property>
<!-- If you want to control the split size for input, set it here.
As described under 'create_input_splits' above, this is the approximate
number of megabytes per split. It affects the number of mappers, so set it with care -->
<name>mongo.input.split_size</name>
</property>
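<!-- For illustration, a commented-out example value:
<value>8</value> would target splits of roughly 8 MB each. -->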
<!-- These .job.* class definitions are optional and only needed if you use the MongoTool base class -->
<property>
<!-- Class for the mapper -->
<name>mongo.job.mapper</name>
<value></value>
</property>
<property>
<!-- Reducer class -->
<name>mongo.job.reducer</name>
<value></value>
</property>
<property>
<!-- InputFormat Class -->
<name>mongo.job.input.format</name>
<!-- <value>com.mongodb.hadoop.MongoInputFormat</value> -->
<value></value>
</property>
<property>
<!-- OutputFormat Class -->
<name>mongo.job.output.format</name>
<!-- <value>com.mongodb.hadoop.MongoOutputFormat</value> -->
<value></value>
</property>
<property>
<!-- Output key class for the output format -->
<name>mongo.job.output.key</name>
<value></value>
</property>
<property>
<!-- Output value class for the output format -->
<name>mongo.job.output.value</name>
<value></value>
</property>
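<!-- For illustration, a word-count style job might use standard Hadoop writables here, e.g.
<value>org.apache.hadoop.io.Text</value> for the output key and
<value>org.apache.hadoop.io.IntWritable</value> for the output value. -->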
<property>
<!-- Output key class for the mapper [optional] -->
<name>mongo.job.mapper.output.key</name>
<value></value>
</property>
<property>
<!-- Output value class for the mapper [optional] -->
<name>mongo.job.mapper.output.value</name>
<value></value>
</property>
<property>
<!-- Class for the combiner [optional] -->
<name>mongo.job.combiner</name>
<value></value>
</property>
<property>
<!-- Partitioner class [optional] -->
<name>mongo.job.partitioner</name>
<value></value>
</property>
<property>
<!-- Sort Comparator class [optional] -->
<name>mongo.job.sort_comparator</name>
<value></value>
</property>
</configuration>
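<!--
A minimal programmatic sketch (not part of these defaults) showing how the same keys can be
set from Java instead of XML. The WordCountMapper/WordCountReducer class names and the URIs
are hypothetical; only the property names and the com.mongodb.hadoop.MongoInputFormat /
com.mongodb.hadoop.MongoOutputFormat classes come from this file.

import org.apache.hadoop.conf.Configuration;

Configuration conf = new Configuration();
conf.addResource("mongo-defaults.xml");                          // pick up the defaults above
conf.set("mongo.input.uri",  "mongodb://localhost/test.in");     // hypothetical namespace
conf.set("mongo.output.uri", "mongodb://localhost/test.out");    // hypothetical namespace
conf.set("mongo.job.input.format",  "com.mongodb.hadoop.MongoInputFormat");
conf.set("mongo.job.output.format", "com.mongodb.hadoop.MongoOutputFormat");
conf.set("mongo.job.mapper",  "com.example.WordCountMapper");    // hypothetical class
conf.set("mongo.job.reducer", "com.example.WordCountReducer");   // hypothetical class
conf.setBoolean("mapred.map.tasks.speculative.execution", false);
conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
-->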