<?xml version="1.0"?> 
<configuration>
  <property>
    <!-- Run the job verbosely? -->
    <name>mongo.job.verbose</name>
    <value>false</value>
  </property>
  <!-- It is *strongly* recommended, for consistent behavior, that you disable
       speculative execution in Hadoop. -->
  <property>
      <name>mapred.map.tasks.speculative.execution</name>
      <value>false</value>
  </property>
  <property>
      <name>mapred.reduce.tasks.speculative.execution</name>
      <value>false</value>
  </property>
  <property>
    <!-- Run the job in the foreground and wait for a response, or background it? -->
    <name>mongo.job.background</name>
    <value>false</value>
  </property>
  <property>
    <!-- If the input collection is sharded, read each shard chunk as an InputSplit for Hadoop.
         NOTE: Combining this with 'mongo.input.split.read_from_shards' can cause erratic or
         unexpected behavior, as chunks will be forcibly read from their last known shard
         rather than through mongos, which accounts for and allows migrations.

         You must also enable 'mongo.input.split.create_input_splits' for any chunk reading
         to occur.
      -->
    <name>mongo.input.split.read_shard_chunks</name>
    <value>true</value>
  </property>
  <property>
   <!-- Read directly from the shard servers? If enabled, connects directly to each
        configured shard host or replica set.  If disabled (the default), 
        sends all reads through mongos.

        For most users this shouldn't be enabled.  
        Left mutable for advanced users and/or the clinically insane.
    -->
    <name>mongo.input.split.read_from_shards</name>
    <value>false</value>
  </property>
  <property>
    <!--
        Should mongo-hadoop attempt to calculate
        InputSplits for you?

        *** If your input collection is sharded:

            You'll also need to enable 'read_shard_chunks' to read
            individual chunk data into the system. Otherwise, enabling this
            will use the "unsharded" mode described below.

        *** If your input collection is unsharded (or you didn't enable chunk reading):

        If creating input splits is enabled (the default), 'mongo.input.split_size'
        controls the approximate number of megabytes assigned per calculated InputSplit.

        You'll probably want to set 'mongo.input.split.split_key_pattern' as well,
        to control the split points (defaults to {_id: 1}).

        If this is false, only one InputSplit will be created, which is fairly inefficient
        Hadoopery. A commented example of a typical split configuration follows this property.
    -->
    <name>mongo.input.split.create_input_splits</name>
    <value>true</value>
  </property>
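  <!-- Illustrative split configuration for an unsharded collection (the field name
       'ts' and the 16 MB target are hypothetical values, not defaults): aim for
       roughly 16 MB per InputSplit, splitting on an indexed 'ts' field.

       <property>
         <name>mongo.input.split_size</name>
         <value>16</value>
       </property>
       <property>
         <name>mongo.input.split.split_key_pattern</name>
         <value>{ ts: 1 }</value>
       </property>
  -->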
  <property>
   <!-- If working on an unsharded collection and 'create_input_splits' is enabled,
        this keyPattern will be used to calculate the split points. It needs to be a JSON
        keyPattern spec following the *exact same rules* as choosing a shard key for MongoDB
        on an existing collection which contains data.

        That includes the key being indexed: http://www.mongodb.org/display/DOCS/Sharding+Introduction#ShardingIntroduction-ShardKeys

        Defaults to { _id: 1 }
    -->
    <name>mongo.input.split.split_key_pattern</name>
    <value>{ _id: 1 }</value>
  </property>
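  <!-- A compound pattern also works, provided the collection has a matching index;
       for example (hypothetical field names):
       <value>{ customer_id: 1, ts: 1 }</value> -->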
  <property>
    <!-- The field to pass as the mapper key. Defaults to _id if blank -->
    <name>mongo.input.key</name>
    <value></value>
  </property>
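  <!-- For example, to key the mapper on a (hypothetical) 'user_id' field instead of _id:
       <value>user_id</value> -->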
  <property>
      <!-- If true, cursors for input splits don't time out (the QUERY_NOTIMEOUT flag is set). Behavior may be erratic. -->
      <name>mongo.input.notimeout</name>
      <value>false</value>
  </property>
  <property>
    <!-- If you are reading from MongoDB, the connection URI -->
    <name>mongo.input.uri</name>
    <value><!-- mongodb://localhost/db.collection?option=value --></value>
  </property>
  <property>
    <!-- If you are writing to MongoDB, the connection URI -->
    <name>mongo.output.uri</name>
    <value><!-- mongodb://localhost/db.collection?option=value --></value>
  </property>
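  <!-- Illustrative URIs (hosts and namespaces are hypothetical): both properties take
       standard MongoDB connection strings, e.g. reading from a replica set's
       secondaries and writing results to a separate collection:

       <property>
         <name>mongo.input.uri</name>
         <value>mongodb://host1:27017,host2:27017/analytics.events?readPreference=secondary</value>
       </property>
       <property>
         <name>mongo.output.uri</name>
         <value>mongodb://host1:27017/analytics.results</value>
       </property>
  -->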
  <property>
    <!-- The query, in JSON, to execute [OPTIONAL] -->
    <name>mongo.input.query</name>
    <value></value>
  </property>
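  <!-- The value is parsed as a JSON query document. For example (hypothetical field),
       to process only documents where 'age' exceeds 21:
       <value>{ "age": { "$gt": 21 } }</value> -->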
  <property>
    <!-- The fields, in JSON, to read [OPTIONAL] -->
    <name>mongo.input.fields</name>
    <value></value>
  </property>
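  <!-- A standard MongoDB projection document. For example (hypothetical fields),
       to read only 'name' and 'age' while excluding _id:
       <value>{ "name": 1, "age": 1, "_id": 0 }</value> -->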
  <property>
    <!-- A JSON sort specification for read [OPTIONAL] -->
    <name>mongo.input.sort</name>
    <value></value>
  </property>
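  <!-- For example (hypothetical field), ascending by timestamp; use 1 for ascending
       and -1 for descending:
       <value>{ "ts": 1 }</value> -->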
  <property>
    <!-- The maximum number of documents to read [OPTIONAL] -->
    <name>mongo.input.limit</name>
    <value>0</value> <!-- 0 == no limit -->
  </property>
  <property>
    <!-- The number of documents to skip in read [OPTIONAL] -->
    <!-- NOTE: MongoDB applies skip() before limit(), regardless of the order they are set. -->
    <name>mongo.input.skip</name>
    <value>0</value> <!-- 0 == no skip -->
  </property>
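  <!-- For example (hypothetical numbers), to read documents 101 through 600 of the
       result set, set mongo.input.skip to 100 and mongo.input.limit to 500. -->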
  <property>
    <!-- If you want to control the input split size, set it here.
      Should be a long indicating the approximate number of megabytes per split
      (see 'create_input_splits' above). This affects the number of mappers, so
      adjust with care. -->
    <name>mongo.input.split_size</name>
  </property>
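  <!-- For example, <value>8</value> would target splits of roughly 8 megabytes
       (an illustrative figure, not necessarily the shipped default). -->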
  <!-- These mongo.job.* class definitions are optional and only needed if you use the MongoTool base class; a commented example follows these properties. -->
  <property>
    <!-- Class for the mapper -->
    <name>mongo.job.mapper</name>
    <value></value>
  </property>
  <property>
    <!-- Reducer class -->
    <name>mongo.job.reducer</name>
    <value></value>
  </property>
  <property>
    <!-- InputFormat Class -->
    <name>mongo.job.input.format</name>
    <!-- <value>com.mongodb.hadoop.MongoInputFormat</value> -->
    <value></value>
  </property>
  <property>
    <!-- OutputFormat Class -->
    <name>mongo.job.output.format</name>
    <!-- <value>com.mongodb.hadoop.MongoOutputFormat</value> -->
    <value></value>
  </property>
  <property>
    <!-- Output key class for the output format -->
    <name>mongo.job.output.key</name>
    <value></value>
  </property>
  <property>
    <!-- Output value class for the output format -->
    <name>mongo.job.output.value</name>
    <value></value>
  </property>
  <property>
    <!-- Output key class for the mapper [optional] -->
    <name>mongo.job.mapper.output.key</name>
    <value></value>
  </property>
  <property>
    <!-- Output value class for the mapper [optional] -->
    <name>mongo.job.mapper.output.value</name>
    <value></value>
  </property>
  <property>
    <!-- Class for the combiner [optional] -->
    <name>mongo.job.combiner</name>
    <value></value>
  </property>
  <property>
    <!-- Partitioner class [optional] -->
    <name>mongo.job.partitioner</name>
    <value></value>
  </property>
  <property>
    <!-- Sort Comparator class [optional] -->
    <name>mongo.job.sort_comparator</name>
    <value></value>
  </property>
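  <!-- Illustrative wiring for a MongoTool-based job. The com.example classes are
       hypothetical stand-ins for your own mapper and reducer; the com.mongodb.hadoop
       formats and org.apache.hadoop.io key/value classes are the stock ones:

       <property>
         <name>mongo.job.mapper</name>
         <value>com.example.WordCountMapper</value>
       </property>
       <property>
         <name>mongo.job.reducer</name>
         <value>com.example.WordCountReducer</value>
       </property>
       <property>
         <name>mongo.job.input.format</name>
         <value>com.mongodb.hadoop.MongoInputFormat</value>
       </property>
       <property>
         <name>mongo.job.output.format</name>
         <value>com.mongodb.hadoop.MongoOutputFormat</value>
       </property>
       <property>
         <name>mongo.job.output.key</name>
         <value>org.apache.hadoop.io.Text</value>
       </property>
       <property>
         <name>mongo.job.output.value</name>
         <value>org.apache.hadoop.io.IntWritable</value>
       </property>
  -->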
</configuration>