{"type":"doc","content":[{"type":"heading","attrs":{"level":2},"content":[{"text":"A","type":"text","marks":[{"type":"annotation","attrs":{"annotationType":"inlineComment","id":"d303f42c-7e92-46f2-b4f5-e925d68acd20"}}]},{"text":"ction Plugin","type":"text"}]},{"type":"paragraph","content":[{"text":"An ","type":"text"},{"text":"Action","type":"text","marks":[{"type":"code"}]},{"text":" plugin runs arbitrary logic at the start or end of a batch data pipeline.","type":"text"}]},{"type":"paragraph","content":[{"text":"In order to implement an Action plugin, you extend the ","type":"text"},{"text":"Action","type":"text","marks":[{"type":"code"}]},{"text":" class. Only one method is required to be implemented: ","type":"text"},{"text":"run()","type":"text","marks":[{"type":"code"}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Methods","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"run()","type":"text","marks":[{"type":"code"}]},{"text":": Used to implement the functionality of the plugin.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"configurePipeline()","type":"text","marks":[{"type":"code"}]},{"text":": Used to perform any validation on the application configuration that is required by this plugin or to create any datasets if the ","type":"text"},{"text":"fieldName","type":"text","marks":[{"type":"code"}]},{"text":" for a dataset is not a macro.","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Example:","type":"text"}]},{"type":"codeBlock","content":[{"text":"/**\n * Action that moves files from one fileset into another, optionally filtering files that match a regex.\n */\n@Plugin(type = Action.PLUGIN_TYPE)\n@Name(FilesetMoveAction.NAME)\n@Description(\"Action that moves files from one fileset into another, optionally filtering files that match a regex.\")\npublic class FilesetMoveAction extends Action {\n public static final String NAME = \"FilesetMove\";\n private final Conf config;\n\n /**\n * Config properties for the plugin.\n */\n public static class Conf extends PluginConfig {\n public static final String SOURCE_FILESET = \"sourceFileset\";\n public static final String DEST_FILESET = \"destinationFileset\";\n public static final String FILTER_REGEX = \"filterRegex\";\n\n @Name(SOURCE_FILESET)\n @Description(\"The fileset to move files from.\")\n private String sourceFileset;\n\n @Name(DEST_FILESET)\n @Description(\"The fileset to move files to.\")\n private String destinationFileset;\n\n @Nullable\n @Name(FILTER_REGEX)\n @Description(\"Filter any files whose name matches this regex. Defaults to '^\\\\.', which will filter any files \" +\n \"that begin with a period.\")\n private String filterRegex;\n\n // set defaults for properties in a no-argument constructor.\n public Conf() {\n filterRegex = \"^\\\\.\";\n }\n }\n\n public FilesetMoveAction(Conf config) {\n this.config = config;\n }\n\n @Override\n public void configurePipeline(PipelineConfigurer pipelineConfigurer) {\n Pattern.compile(config.filterRegex);\n }\n\n @Override\n public void run(ActionContext context) throws Exception {\n context.execute(new TxRunnable() {\n @Override\n public void run(DatasetContext context) throws Exception {\n FileSet sourceFileSet = context.getDataset(config.sourceFileset);\n FileSet destinationFileSet = context.getDataset(config.destinationFileset);\n\n Pattern pattern = Pattern.compile(config.filterRegex);\n\n for (Location sourceFile : sourceFileSet.getBaseLocation().list()) {\n if (pattern.matcher(sourceFile.getName()).find()) {\n continue;\n }\n Location destFile = destinationFileSet.getBaseLocation().append(sourceFile.getName());\n sourceFile.renameTo(destFile);\n }\n }\n });\n }\n}\n","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Post-run Action Plugin","type":"text"}]},{"type":"paragraph","content":[{"text":"A ","type":"text"},{"text":"PostAction","type":"text","marks":[{"type":"code"}]},{"text":" plugin runs arbitrary logic after the end of a pipeline run. It can be set to execute based on whether the run completed successfully, if it failed, or in either case.","type":"text"}]},{"type":"paragraph","content":[{"text":"The difference between a ","type":"text"},{"text":"PostAction","type":"text","marks":[{"type":"code"}]},{"text":" and an ","type":"text"},{"text":"Action","type":"text","marks":[{"type":"code"}]},{"text":" that is placed at the end of a pipeline is that a ","type":"text"},{"text":"PostAction","type":"text","marks":[{"type":"code"}]},{"text":" will always be executed even if the pipeline run fails, while an ","type":"text"},{"text":"Action","type":"text","marks":[{"type":"code"}]},{"text":" will only be executed if every stage preceding it successfully runs.","type":"text"}]},{"type":"paragraph","content":[{"text":"In order to implement a Post-run Action plugin, you extend the ","type":"text"},{"text":"PostAction","type":"text","marks":[{"type":"code"}]},{"text":" class. Only one method is required to be implemented: ","type":"text"},{"text":"run()","type":"text","marks":[{"type":"code"}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Methods","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"run()","type":"text","marks":[{"type":"code"}]},{"text":": Used to implement the functionality of the plugin.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"configurePipeline()","type":"text","marks":[{"type":"code"}]},{"text":": Used to perform any validation on the application configuration that is required by this plugin or to create any datasets if the ","type":"text"},{"text":"fieldName","type":"text","marks":[{"type":"code"}]},{"text":" for a dataset is not a macro.","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Example:","type":"text"}]},{"type":"codeBlock","content":[{"text":"/**\n * Post run action that deletes files in a FileSet that match a configurable regex.\n */\n@Plugin(type = PostAction.PLUGIN_TYPE)\n@Name(FilesetDeletePostAction.NAME)\n@Description(\"Post run action that deletes files in a FileSet that match a configurable regex if the run succeeded.\")\npublic class FilesetDeletePostAction extends PostAction {\n public static final String NAME = \"FilesetDelete\";\n private final Conf config;\n\n /**\n * Config properties for the plugin.\n */\n public static class Conf extends PluginConfig {\n public static final String FILESET_NAME = \"filesetName\";\n public static final String DELETE_REGEX = \"deleteRegex\";\n public static final String DIRECTORY = \"directory\";\n\n @Name(FILESET_NAME)\n @Description(\"The fileset to delete files from.\")\n private String filesetName;\n\n @Name(DELETE_REGEX)\n @Description(\"Delete files that match this regex.\")\n private String deleteRegex;\n\n // Macro enabled properties can be set to a placeholder value ${key} when the pipeline is deployed.\n // At runtime, the value for 'key' can be given and substituted in.\n @Macro\n @Name(DIRECTORY)\n @Description(\"The fileset directory to delete files from.\")\n private String directory;\n }\n\n public FilesetDeletePostAction(Conf config) {\n this.config = config;\n }\n\n @Override\n public void configurePipeline(PipelineConfigurer pipelineConfigurer) {\n Pattern.compile(config.deleteRegex);\n }\n\n @Override\n public void run(BatchActionContext context) throws Exception {\n if (!context.isSuccessful()) {\n return;\n }\n\n FileSet fileSet = context.getDataset(config.filesetName);\n Pattern pattern = Pattern.compile(config.deleteRegex);\n for (Location fileLocation : fileSet.getBaseLocation().append(config.directory).list()) {\n if (pattern.matcher(fileLocation.getName()).find()) {\n fileLocation.delete();\n }\n }\n }\n}","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Batch Source Plugin","type":"text"}]},{"type":"paragraph","content":[{"text":"A ","type":"text"},{"text":"BatchSource","type":"text","marks":[{"type":"code"}]},{"text":" plugin is used as a source of a batch data pipeline. It is used to prepare and configure the input of a pipeline run.","type":"text"}]},{"type":"paragraph","content":[{"text":"In order to implement a Batch Source, you extend the ","type":"text"},{"text":"BatchSource","type":"text","marks":[{"type":"code"}]},{"text":" class. You need to define the types of the ","type":"text"},{"text":"KEY","type":"text","marks":[{"type":"code"}]},{"text":" and ","type":"text"},{"text":"VALUE","type":"text","marks":[{"type":"code"}]},{"text":" that the Batch Source will receive and the type of object that the Batch Source will emit to the subsequent stage (which could be either a Transformation or a Batch Sink). After defining the types, only one method is required to be implemented: ","type":"text"},{"text":"prepareRun()","type":"text","marks":[{"type":"code"}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Methods","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"prepareRun()","type":"text","marks":[{"type":"code"}]},{"text":": Used to configure the input for each run of the pipeline. If the ","type":"text"},{"text":"fieldName","type":"text","marks":[{"type":"code"}]},{"text":" for a dataset is a macro, their creation will happen during this stage. This is called by the client that will submit the job for the pipeline run.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"onRunFinish()","type":"text","marks":[{"type":"code"}]},{"text":": Used to run any required logic at the end of a pipeline run. This is called by the client that submitted the job for the pipeline run.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"configurePipeline()","type":"text","marks":[{"type":"code"}]},{"text":": Used to perform any validation on the application configuration that is required by this plugin or to create any datasets if the ","type":"text"},{"text":"fieldName","type":"text","marks":[{"type":"code"}]},{"text":" for a dataset is not a macro.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"initialize()","type":"text","marks":[{"type":"code"}]},{"text":": Initialize the Batch Source. Guaranteed to be executed before any call to the plugin’s ","type":"text"},{"text":"transform","type":"text","marks":[{"type":"code"}]},{"text":" method. This is called by each executor of the job. For example, if the MapReduce engine is being used, each mapper will call this method.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"destroy()","type":"text","marks":[{"type":"code"}]},{"text":": Destroy any resources created by ","type":"text"},{"text":"initialize","type":"text","marks":[{"type":"code"}]},{"text":". Guaranteed to be executed after all calls to the plugin’s ","type":"text"},{"text":"transform","type":"text","marks":[{"type":"code"}]},{"text":" method have been made. This is called by each executor of the job. For example, if the MapReduce engine is being used, each mapper will call this method.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"transform()","type":"text","marks":[{"type":"code"}]},{"text":": This method will be called for every input key-value pair generated by the batch job. By default, the value is emitted to the subsequent stage.","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Example:","type":"text"}]},{"type":"codeBlock","content":[{"text":"/**\n * Batch Source that reads from a FileSet that has its data formatted as text.\n *\n * LongWritable is the first parameter because that is the key used by Hadoop's {@link TextInputFormat}.\n * Similarly, Text is the second parameter because that is the value used by Hadoop's {@link TextInputFormat}.\n * {@link StructuredRecord} is the third parameter because that is what the source will output.\n * All the plugins included with Hydrator operate on StructuredRecord.\n */\n@Plugin(type = BatchSource.PLUGIN_TYPE)\n@Name(TextFileSetSource.NAME)\n@Description(\"Reads from a FileSet that has its data formatted as text.\")\npublic class TextFileSetSource extends BatchSource {\n public static final String NAME = \"TextFileSet\";\n public static final Schema OUTPUT_SCHEMA = Schema.recordOf(\n \"textRecord\",\n Schema.Field.of(\"position\", Schema.of(Schema.Type.LONG)),\n Schema.Field.of(\"text\", Schema.of(Schema.Type.STRING))\n );\n private final Conf config;\n\n /**\n * Config properties for the plugin.\n */\n public static class Conf extends PluginConfig {\n public static final String FILESET_NAME = \"fileSetName\";\n public static final String FILES = \"files\";\n public static final String CREATE_IF_NOT_EXISTS = \"createIfNotExists\";\n public static final String DELETE_INPUT_ON_SUCCESS = \"deleteInputOnSuccess\";\n\n // The name annotation tells CDAP what the property name is. It is optional, and defaults to the variable name.\n // Note: only primitives (including boxed types) and string are the types that are supported\n @Name(FILESET_NAME)\n @Description(\"The name of the FileSet to read from.\")\n private String fileSetName;\n\n // Macro enabled properties can be set to a placeholder value ${key} when the pipeline is deployed.\n // At runtime, the value for 'key' can be given and substituted in.\n @Macro\n @Name(FILES)\n @Description(\"A comma separated list of files in the FileSet to read.\")\n private String files;\n\n // A nullable fields tells CDAP that this is an optional field.\n @Nullable\n @Name(CREATE_IF_NOT_EXISTS)\n @Description(\"Whether to create the FileSet if it doesn't already exist. Defaults to false.\")\n private Boolean createIfNotExists;\n\n @Nullable\n @Name(DELETE_INPUT_ON_SUCCESS)\n @Description(\"Whether to delete the data read by the source after the run succeeds. Defaults to false.\")\n private Boolean deleteInputOnSuccess;\n\n // Use a no-args constructor to set field defaults.\n public Conf() {\n createIfNotExists = false;\n deleteInputOnSuccess = false;\n }\n }\n\n // CDAP will pass in a config with its fields populated based on the configuration given when creating the pipeline.\n public TextFileSetSource(Conf config) {\n this.config = config;\n }\n\n // configurePipeline is called exactly once when the pipeline is being created.\n // Any static configuration should be performed here.\n @Override\n public void configurePipeline(PipelineConfigurer pipelineConfigurer) {\n // if the user has set createIfNotExists to true, create the FileSet here.\n if (config.createIfNotExists) {\n pipelineConfigurer.createDataset(config.fileSetName,\n FileSet.class,\n FileSetProperties.builder()\n .setInputFormat(TextInputFormat.class)\n .setOutputFormat(TextOutputFormat.class)\n .setEnableExploreOnCreate(true)\n .setExploreFormat(\"text\")\n .setExploreSchema(\"text string\")\n .build()\n );\n }\n // set the output schema of this stage so that stages further down the pipeline will know their input schema.\n pipelineConfigurer.getStageConfigurer().setOutputSchema(OUTPUT_SCHEMA);\n }\n\n // prepareRun is called before every pipeline run, and is used to configure what the input should be,\n // as well as any arguments the input should use. It is called by the client that is submitting the batch job.\n @Override\n public void prepareRun(BatchSourceContext context) throws IOException {\n Map arguments = new HashMap<>();\n FileSetArguments.setInputPaths(arguments, config.files);\n context.setInput(Input.ofDataset(config.fileSetName, arguments));\n }\n\n // onRunFinish is called at the end of the pipeline run by the client that submitted the batch job.\n @Override\n public void onRunFinish(boolean succeeded, BatchSourceContext context) {\n // perform any actions that should happen at the end of the run.\n // in our case, we want to delete the data read during this run if the run succeeded.\n if (succeeded && config.deleteInputOnSuccess) {\n Map arguments = new HashMap<>();\n FileSetArguments.setInputPaths(arguments, config.files);\n FileSet fileSet = context.getDataset(config.fileSetName, arguments);\n for (Location inputLocation : fileSet.getInputLocations()) {\n try {\n inputLocation.delete(true);\n } catch (IOException e) {\n throw new RuntimeException(e);\n }\n }\n }\n }\n\n // initialize is called by each job executor before any call to transform is made.\n // This occurs at the start of the batch job run, after the job has been successfully submitted.\n // For example, if mapreduce is the execution engine, each mapper will call initialize at the start of the program.\n @Override\n public void initialize(BatchRuntimeContext context) throws Exception {\n super.initialize(context);\n // create any resources required by transform()\n }\n\n // destroy is called by each job executor at the end of its life.\n // For example, if mapreduce is the execution engine, each mapper will call destroy at the end of the program.\n @Override\n public void destroy() {\n // clean up any resources created by initialize\n }\n\n // transform is used to transform the key-value pair output by the input into objects output by this source.\n // The output should be a StructuredRecord if you want the source to be compatible with the plugins included\n // with Hydrator.\n @Override\n public void transform(KeyValue input, Emitter emitter) throws Exception {\n emitter.emit(StructuredRecord.builder(OUTPUT_SCHEMA)\n .set(\"position\", input.getKey().get())\n .set(\"text\", input.getValue().toString())\n .build()\n );\n }\n}","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Lineage","type":"text"}]},{"type":"paragraph","content":[{"text":"For plugins that fetch data from non-CDAP sources, the lineage is registered using the ","type":"text"},{"text":"inputName","type":"text","marks":[{"type":"code"}]},{"text":" provided when ","type":"text"},{"text":"setInput()","type":"text","marks":[{"type":"code"}]},{"text":" is invoked on ","type":"text"},{"text":"BatchSourceContext","type":"text","marks":[{"type":"code"}]},{"text":" in ","type":"text"},{"text":"prepareRun()","type":"text","marks":[{"type":"code"}]},{"text":". Note that the ","type":"text"},{"text":"inputName","type":"text","marks":[{"type":"code"}]},{"text":" should be a valid ","type":"text"},{"text":"DatasetId","type":"text","marks":[{"type":"code"}]},{"text":". For example:","type":"text"}]},{"type":"codeBlock","content":[{"text":"@Override\npublic void prepareRun(BatchSourceContext context) throws Exception {\n ...\n context.setInput(Input.of(\"myExternalSource\", myInputFormatProvider));\n}","type":"text"}]},{"type":"paragraph","content":[{"text":"Lineage will be tracked using ","type":"text"},{"text":"myExternalSource","type":"text","marks":[{"type":"code"}]},{"text":".","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Batch Sink Plugin","type":"text"}]},{"type":"paragraph","content":[{"text":"A ","type":"text"},{"text":"BatchSink","type":"text","marks":[{"type":"code"}]},{"text":" plugin is used to write data in either batch or real-time data pipelines. It is used to prepare and configure the output of a batch of data from a pipeline run.","type":"text"}]},{"type":"paragraph","content":[{"text":"In order to implement a Batch Sink, you extend the ","type":"text"},{"text":"BatchSink","type":"text","marks":[{"type":"code"}]},{"text":" class. Similar to a Batch Source, you need to define the types of the ","type":"text"},{"text":"KEY","type":"text","marks":[{"type":"code"}]},{"text":" and ","type":"text"},{"text":"VALUE","type":"text","marks":[{"type":"code"}]},{"text":" that the Batch Sink will write in the Batch job and the type of object that it will accept from the previous stage (which could be either a Transformation or a Batch Source).","type":"text"}]},{"type":"paragraph","content":[{"text":"After defining the types, only one method is required to be implemented: ","type":"text"},{"text":"prepareRun()","type":"text","marks":[{"type":"code"}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Methods","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"prepareRun()","type":"text","marks":[{"type":"code"}]},{"text":": Used to configure the output for each run of the pipeline. This is called by the client that will submit the job for the pipeline run.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"onRunFinish()","type":"text","marks":[{"type":"code"}]},{"text":": Used to run any required logic at the end of a pipeline run. This is called by the client that submitted the job for the pipeline run.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"configurePipeline()","type":"text","marks":[{"type":"code"}]},{"text":": Used to perform any validation on the application configuration that is required by this plugin or to create any datasets if the ","type":"text"},{"text":"fieldName","type":"text","marks":[{"type":"code"}]},{"text":" for a dataset is not a macro.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"initialize()","type":"text","marks":[{"type":"code"}]},{"text":": Initialize the Batch Sink. Guaranteed to be executed before any call to the plugin’s ","type":"text"},{"text":"transform","type":"text","marks":[{"type":"code"}]},{"text":" method. This is called by each executor of the job. For example, if the MapReduce engine is being used, each mapper will call this method.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"destroy()","type":"text","marks":[{"type":"code"}]},{"text":": Destroy any resources created by ","type":"text"},{"text":"initialize","type":"text","marks":[{"type":"code"}]},{"text":". Guaranteed to be executed after all calls to the plugin’s ","type":"text"},{"text":"transform","type":"text","marks":[{"type":"code"}]},{"text":" method have been made. This is called by each executor of the job. For example, if the MapReduce engine is being used, each mapper will call this method.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"transform()","type":"text","marks":[{"type":"code"}]},{"text":": This method will be called for every object that is received from the previous stage. The logic inside the method will transform the object to the key-value pair expected by the Batch Sink's output format. If you don't override this method, the incoming object is set as the key and the value is set to null.","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Example:","type":"text"}]},{"type":"codeBlock","content":[{"text":"/**\n * Batch Sink that writes to a FileSet in text format.\n * Each record will be written as a single line, with record fields separated by a configurable separator.\n *\n * StructuredRecord is the first parameter because that is the input to the sink.\n * The second and third parameters are the key and value expected by Hadoop's {@link TextOutputFormat}.\n */\n@Plugin(type = BatchSink.PLUGIN_TYPE)\n@Name(TextFileSetSink.NAME)\n@Description(\"Writes to a FileSet in text format.\")\npublic class TextFileSetSink extends BatchSink {\n public static final String NAME = \"TextFileSet\";\n private final Conf config;\n\n /**\n * Config properties for the plugin.\n */\n public static class Conf extends PluginConfig {\n public static final String FILESET_NAME = \"fileSetName\";\n public static final String OUTPUT_DIR = \"outputDir\";\n public static final String FIELD_SEPARATOR = \"fieldSeparator\";\n\n // The name annotation tells CDAP what the property name is. It is optional, and defaults to the variable name.\n // Note: only primitives (including boxed types) and string are the types that are supported\n @Name(FILESET_NAME)\n @Description(\"The name of the FileSet to write to.\")\n private String fileSetName;\n\n // Macro enabled properties can be set to a placeholder value ${key} when the pipeline is deployed.\n // At runtime, the value for 'key' can be given and substituted in.\n @Macro\n @Name(OUTPUT_DIR)\n @Description(\"The FileSet directory to write to.\")\n private String outputDir;\n\n @Nullable\n @Name(FIELD_SEPARATOR)\n @Description(\"The separator to use to join input record fields together. Defaults to ','.\")\n private String fieldSeparator;\n\n // Use a no-args constructor to set field defaults.\n public Conf() {\n fileSetName = \"\";\n fieldSeparator = \",\";\n }\n }\n\n // CDAP will pass in a config with its fields populated based on the configuration given when creating the pipeline.\n public TextFileSetSink(Conf config) {\n this.config = config;\n }\n\n // configurePipeline is called exactly once when the pipeline is being created.\n // Any static configuration should be performed here.\n @Override\n public void configurePipeline(PipelineConfigurer pipelineConfigurer) {\n // create the FileSet here.\n pipelineConfigurer.createDataset(config.fileSetName,\n FileSet.class,\n FileSetProperties.builder()\n .setInputFormat(TextInputFormat.class)\n .setOutputFormat(TextOutputFormat.class)\n .setEnableExploreOnCreate(true)\n .setExploreFormat(\"text\")\n .setExploreSchema(\"text string\")\n .build()\n );\n }\n\n // prepareRun is called before every pipeline run, and is used to configure what the input should be,\n // as well as any arguments the input should use. It is called by the client that is submitting the batch job.\n @Override\n public void prepareRun(BatchSinkContext context) throws Exception {\n Map arguments = new HashMap<>();\n FileSetArguments.setOutputPath(arguments, config.outputDir);\n context.addOutput(Output.ofDataset(config.fileSetName, arguments));\n }\n\n // onRunFinish is called at the end of the pipeline run by the client that submitted the batch job.\n @Override\n public void onRunFinish(boolean succeeded, BatchSinkContext context) {\n // perform any actions that should happen at the end of the run.\n }\n\n // initialize is called by each job executor before any call to transform is made.\n // This occurs at the start of the batch job run, after the job has been successfully submitted.\n // For example, if mapreduce is the execution engine, each mapper will call initialize at the start of the program.\n @Override\n public void initialize(BatchRuntimeContext context) throws Exception {\n super.initialize(context);\n // create any resources required by transform()\n }\n\n // destroy is called by each job executor at the end of its life.\n // For example, if mapreduce is the execution engine, each mapper will call destroy at the end of the program.\n @Override\n public void destroy() {\n // clean up any resources created by initialize\n }\n\n @Override\n public void transform(StructuredRecord input, Emitter> emitter) throws Exception {\n StringBuilder joinedFields = new StringBuilder();\n Iterator fieldIter = input.getSchema().getFields().iterator();\n if (!fieldIter.hasNext()) {\n // shouldn't happen\n return;\n }\n\n Object val = input.get(fieldIter.next().getName());\n if (val != null) {\n joinedFields.append(val);\n }\n while (fieldIter.hasNext()) {\n String fieldName = fieldIter.next().getName();\n joinedFields.append(config.fieldSeparator);\n val = input.get(fieldName);\n if (val != null) {\n joinedFields.append(val);\n }\n }\n emitter.emit(new KeyValue<>(NullWritable.get(), new Text(joinedFields.toString())));\n }\n\n}","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Lineage","type":"text"}]},{"type":"paragraph","content":[{"text":"For plugins that write data to non-CDAP sinks, the lineage is registered using the ","type":"text"},{"text":"outputName","type":"text","marks":[{"type":"code"}]},{"text":" provided when ","type":"text"},{"text":"addOutput()","type":"text","marks":[{"type":"code"}]},{"text":" is invoked on ","type":"text"},{"text":"BatchSinkContext","type":"text","marks":[{"type":"code"}]},{"text":" in ","type":"text"},{"text":"prepareRun()","type":"text","marks":[{"type":"code"}]},{"text":". Note that the ","type":"text"},{"text":"outputName","type":"text","marks":[{"type":"code"}]},{"text":" should be a valid ","type":"text"},{"text":"DatasetId","type":"text","marks":[{"type":"code"}]},{"text":". For example:","type":"text"}]},{"type":"codeBlock","content":[{"text":"@Override\npublic void prepareRun(BatchSinkContext context) throws Exception {\n ...\n context.addOutput(Output.of(\"myExternalSink\", myOutputFormatProvider));\n}","type":"text"}]},{"type":"paragraph","content":[{"text":"Lineage will be tracked using ","type":"text"},{"text":"myExternalSink","type":"text","marks":[{"type":"code"}]},{"text":".","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Transformation Plugin","type":"text"}]},{"type":"paragraph","content":[{"text":"A ","type":"text"},{"text":"Transform","type":"text","marks":[{"type":"code"}]},{"text":" plugin is used to convert one input record into zero or more output records. It can be used in both batch and real-time data pipelines.","type":"text"}]},{"type":"paragraph","content":[{"text":"The only method that needs to be implemented is: ","type":"text"},{"text":"transform()","type":"text","marks":[{"type":"code"}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Methods","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"initialize()","type":"text","marks":[{"type":"code"}]},{"text":": Used to perform any initialization step that might be required during the runtime of the ","type":"text"},{"text":"Transform","type":"text","marks":[{"type":"code"}]},{"text":". It is guaranteed that this method will be invoked before the ","type":"text"},{"text":"transform","type":"text","marks":[{"type":"code"}]},{"text":" method.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"transform()","type":"text","marks":[{"type":"code"}]},{"text":": This method contains the logic that will be applied on each incoming data object. An emitter can be used to pass the results to the subsequent stage.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"destroy()","type":"text","marks":[{"type":"code"}]},{"text":": Used to perform any cleanup before the plugin shuts down.","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Below is an example of a ","type":"text"},{"text":"StringCase","type":"text","marks":[{"type":"code"}]},{"text":" that transforms specific fields to lowercase or uppercase.","type":"text"}]},{"type":"codeBlock","content":[{"text":"/**\n * Transform that can transforms specific fields to lowercase or uppercase.\n */\n@Plugin(type = Transform.PLUGIN_TYPE)\n@Name(StringCaseTransform.NAME)\n@Description(\"Transforms configured fields to lowercase or uppercase.\")\npublic class StringCaseTransform extends Transform {\n public static final String NAME = \"StringCase\";\n private final Conf config;\n private Set upperFields;\n private Set lowerFields;\n\n /**\n * Config properties for the plugin.\n */\n public static class Conf extends PluginConfig {\n public static final String UPPER_FIELDS = \"upperFields\";\n public static final String LOWER_FIELDS = \"lowerFields\";\n private static final Pattern SPLIT_ON = Pattern.compile(\"\\\\s*,\\\\s*\");\n\n // nullable means this property is optional\n @Nullable\n @Name(UPPER_FIELDS)\n @Description(\"A comma separated list of fields to uppercase. Each field must be of type String.\")\n private String upperFields;\n\n @Nullable\n @Name(LOWER_FIELDS)\n @Description(\"A comma separated list of fields to lowercase. Each field must be of type String.\")\n private String lowerFields;\n\n private Set getUpperFields() {\n return parseToSet(upperFields);\n }\n\n private Set getLowerFields() {\n return parseToSet(lowerFields);\n }\n\n private Set parseToSet(String str) {\n Set set = new HashSet<>();\n if (str == null || str.isEmpty()) {\n return set;\n }\n for (String element : SPLIT_ON.split(str)) {\n set.add(element);\n }\n return set;\n }\n }\n\n public StringCaseTransform(Conf config) {\n this.config = config;\n }\n\n // configurePipeline is called only once, when the pipeline is deployed. Static validation should be done here.\n @Override\n public void configurePipeline(PipelineConfigurer pipelineConfigurer) {\n StageConfigurer stageConfigurer = pipelineConfigurer.getStageConfigurer();\n // the output schema is always the same as the input schema\n Schema inputSchema = stageConfigurer.getInputSchema();\n\n // if schema is null, that means it is either not known until runtime, or it is variable\n if (inputSchema != null) {\n // if the input schema is constant and known at configure time, check that all configured fields are strings\n for (String fieldName : config.getUpperFields()) {\n validateFieldIsString(inputSchema, fieldName);\n }\n for (String fieldName : config.getLowerFields()) {\n validateFieldIsString(inputSchema, fieldName);\n }\n }\n\n stageConfigurer.setOutputSchema(inputSchema);\n }\n\n // initialize is called once at the start of each pipeline run\n @Override\n public void initialize(TransformContext context) throws Exception {\n upperFields = config.getUpperFields();\n lowerFields = config.getLowerFields();\n }\n\n // transform is called once for each record that goes into this stage\n @Override\n public void transform(StructuredRecord record, Emitter emitter) throws Exception {\n StructuredRecord.Builder builder = StructuredRecord.builder(record.getSchema());\n for (Schema.Field field : record.getSchema().getFields()) {\n String fieldName = field.getName();\n if (upperFields.contains(fieldName)) {\n builder.set(fieldName, record.get(fieldName).toString().toUpperCase());\n } else if (lowerFields.contains(fieldName)) {\n builder.set(fieldName, record.get(fieldName).toString().toLowerCase());\n } else {\n builder.set(fieldName, record.get(fieldName));\n }\n }\n emitter.emit(builder.build());\n }\n\n private void validateFieldIsString(Schema schema, String fieldName) {\n Schema.Field inputField = schema.getField(fieldName);\n if (inputField == null) {\n throw new IllegalArgumentException(\n String.format(\"Field '%s' does not exist in input schema %s.\", fieldName, schema));\n }\n Schema fieldSchema = inputField.getSchema();\n Schema.Type fieldType = fieldSchema.isNullable() ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();\n if (fieldType != Schema.Type.STRING) {\n throw new IllegalArgumentException(\n String.format(\"Field '%s' is of illegal type %s. Must be of type %s.\",\n fieldName, fieldType, Schema.Type.STRING));\n }\n }\n}\n","type":"text"}]},{"type":"paragraph","content":[{"text":"If you wanted, you could add to the ","type":"text"},{"text":"transform","type":"text","marks":[{"type":"code"}]},{"text":" method a user metric indicating the number of fields changed. The user metrics can be queried by using the CDAP ","type":"text"},{"text":"Metrics Microservices","type":"text","marks":[{"type":"link","attrs":{"href":"https://cdap.atlassian.net/wiki/spaces/DOCS/pages/477692194/Metrics+HTTP+RESTful+API"}}]},{"text":":","type":"text"}]},{"type":"codeBlock","content":[{"text":"public void transform(StructuredRecord input, Emitter emitter) throws Exception {\n int fieldsChanged = 0;\n . . .\n builder.set(fieldName, record.get(fieldName). . .\n fieldsChanged += 1;\n . . .\n getContext().getMetrics().count(\"fieldsChanged\", fieldsChanged);\n}","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Error Transformation Plugin","type":"text"}]},{"type":"paragraph","content":[{"text":"An ","type":"text"},{"text":"ErrorTransform","type":"text","marks":[{"type":"code"}]},{"text":" plugin is a special type of ","type":"text"},{"text":"Transform","type":"text","marks":[{"type":"code"}]},{"text":" that consumes error records emitted from the previous stages instead of output records. It is used to transform an ","type":"text"},{"text":"ErrorRecord","type":"text","marks":[{"type":"code"}]},{"text":" to zero or more output records. In addition to the actual error object, an ","type":"text"},{"text":"ErrorRecord","type":"text","marks":[{"type":"code"}]},{"text":" exposes the stage the error was emitted from, an error code, and an error message. Errors can be emitted by ","type":"text"},{"text":"BatchSource","type":"text","marks":[{"type":"code"}]},{"text":", ","type":"text"},{"text":"Transform","type":"text","marks":[{"type":"code"}]},{"text":", and ","type":"text"},{"text":"BatchAggregator","type":"text","marks":[{"type":"code"}]},{"text":" plugins using the ","type":"text"},{"text":"Emitter","type":"text","marks":[{"type":"code"}]},{"text":" they receive. An ","type":"text"},{"text":"ErrorTransform","type":"text","marks":[{"type":"code"}]},{"text":" can be used in both batch and real-time data pipelines.","type":"text"}]},{"type":"paragraph","content":[{"text":"The only method that needs to be implemented is: ","type":"text"},{"text":"transform()","type":"text","marks":[{"type":"code"}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Methods","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"initialize()","type":"text","marks":[{"type":"code"}]},{"text":": Used to perform any initialization step that might be required during the runtime of the ","type":"text"},{"text":"ErrorTransform","type":"text","marks":[{"type":"code"}]},{"text":". It is guaranteed that this method will be invoked before the ","type":"text"},{"text":"transform","type":"text","marks":[{"type":"code"}]},{"text":" method.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"transform()","type":"text","marks":[{"type":"code"}]},{"text":": This method contains the logic that will be applied on each incoming ","type":"text"},{"text":"ErrorRecord","type":"text","marks":[{"type":"code"}]},{"text":" object. An emitter can be used to pass the results to the subsequent stage.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"destroy()","type":"text","marks":[{"type":"code"}]},{"text":": Used to perform any cleanup before the plugin shuts down.","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Below is an example of an ","type":"text"},{"text":"ErrorCollector","type":"text","marks":[{"type":"code"}]},{"text":" that adds the error stage, code, and message to each record it receives.","type":"text"}]},{"type":"codeBlock","content":[{"text":"/**\n * Adds the error code and error message to each record, then emits it.\n */\n@Plugin(type = ErrorTransform.PLUGIN_TYPE)\n@Name(\"ErrorCollector\")\npublic class ErrorCollector extends ErrorTransform {\n private final Config config;\n\n public ErrorCollector(Config config) {\n this.config = config;\n }\n\n @Override\n public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws IllegalArgumentException {\n Schema inputSchema = pipelineConfigurer.getStageConfigurer().getInputSchema();\n if (inputSchema != null) {\n if (inputSchema.getField(config.messageField) != null) {\n throw new IllegalArgumentException(String.format(\n \"Input schema already contains message field %s. Please set messageField to a different value.\",\n config.messageField));\n }\n if (inputSchema.getField(config.codeField) != null) {\n throw new IllegalArgumentException(String.format(\n \"Input schema already contains code field %s. Please set codeField to a different value.\",\n config.codeField));\n }\n if (inputSchema.getField(config.stageField) != null) {\n throw new IllegalArgumentException(String.format(\n \"Input schema already contains stage field %s. Please set stageField to a different value.\",\n config.stageField));\n }\n Schema outputSchema = getOutputSchema(config, inputSchema);\n pipelineConfigurer.getStageConfigurer().setOutputSchema(outputSchema);\n }\n }\n\n @Override\n public void transform(ErrorRecord input, Emitter emitter) throws Exception {\n StructuredRecord invalidRecord = input.getRecord();\n StructuredRecord.Builder output = StructuredRecord.builder(getOutputSchema(config, invalidRecord.getSchema()));\n for (Schema.Field field : invalidRecord.getSchema().getFields()) {\n output.set(field.getName(), invalidRecord.get(field.getName()));\n }\n if (config.messageField != null) {\n output.set(config.messageField, input.getErrorMessage());\n }\n if (config.codeField != null) {\n output.set(config.codeField, input.getErrorCode());\n }\n if (config.stageField != null) {\n output.set(config.stageField, input.getStageName());\n }\n emitter.emit(output.build());\n }\n\n private static Schema getOutputSchema(Config config, Schema inputSchema) {\n List fields = new ArrayList<>();\n fields.addAll(inputSchema.getFields());\n if (config.messageField != null) {\n fields.add(Schema.Field.of(config.messageField, Schema.of(Schema.Type.STRING)));\n }\n if (config.codeField != null) {\n fields.add(Schema.Field.of(config.codeField, Schema.of(Schema.Type.INT)));\n }\n if (config.stageField != null) {\n fields.add(Schema.Field.of(config.stageField, Schema.of(Schema.Type.STRING)));\n }\n return Schema.recordOf(\"error\" + inputSchema.getRecordName(), fields);\n }\n\n /**\n * The plugin config\n */\n public static class Config extends PluginConfig {\n @Nullable\n @Description(\"The name of the error message field to use in the output schema. \" +\n \"If this not specified, the error message will be dropped.\")\n private String messageField;\n\n @Nullable\n @Description(\"The name of the error code field to use in the output schema. \" +\n \"If this not specified, the error code will be dropped.\")\n private String codeField;\n\n @Nullable\n @Description(\"The name of the error stage field to use in the output schema. \" +\n \"If this not specified, the error stage will be dropped.\")\n private String stageField;\n\n }\n}","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Alert Publisher Plugin","type":"text"}]},{"type":"paragraph","content":[{"text":"An ","type":"text"},{"text":"AlertPublisher","type":"text","marks":[{"type":"code"}]},{"text":" plugin is a special type of plugin that consumes alerts emitted from previous stages instead of output records. Alerts are meant to be uncommon events that need to be acted on in some other program. Alerts contain a payload, which is just a map of strings containing any relevant data. An alert publisher is responsible for writing the alerts to some system, where it can be read and acted upon by some external program. For example, a plugin may write alerts to Kafka. Alerts may not be published immediately after they are emitted. It is up to the processing engine to decide when to publish alerts.","type":"text"}]},{"type":"paragraph","content":[{"text":"The only method that needs to be implemented is: ","type":"text"},{"text":"publish(Iterator alerts)","type":"text","marks":[{"type":"code"}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Methods","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"initialize()","type":"text","marks":[{"type":"code"}]},{"text":": Used to perform any initialization step that might be required during the runtime of the ","type":"text"},{"text":"AlertPublisher","type":"text","marks":[{"type":"code"}]},{"text":". It is guaranteed that this method will be invoked before the ","type":"text"},{"text":"publish","type":"text","marks":[{"type":"code"}]},{"text":" method.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"publish()","type":"text","marks":[{"type":"code"}]},{"text":": This method contains the logic that will publish each incoming Alert.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"destroy()","type":"text","marks":[{"type":"code"}]},{"text":": Used to perform any cleanup before the plugin shuts down.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Batch Aggregator Plugin","type":"text"}]},{"type":"paragraph","content":[{"text":"A ","type":"text"},{"text":"BatchAggregator","type":"text","marks":[{"type":"code"}]},{"text":" plugin is used to compute aggregates over a batch of data. It is used in both batch and real-time data pipelines. An aggregation takes place in two steps: ","type":"text"},{"text":"groupBy","type":"text","marks":[{"type":"em"}]},{"text":" and then ","type":"text"},{"text":"aggregate","type":"text","marks":[{"type":"em"}]},{"text":".","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"In the ","type":"text"},{"text":"groupBy","type":"text","marks":[{"type":"em"}]},{"text":" step, the aggregator creates zero or more group keys for each input record. Before the ","type":"text"},{"text":"aggregate","type":"text","marks":[{"type":"em"}]},{"text":" step occurs, the CDAP pipeline will take all records that have the same group key, and collect them into a group. If a record does not have any of the group keys, it is filtered out. If a record has multiple group keys, it will belong to multiple groups.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"The ","type":"text"},{"text":"aggregate","type":"text","marks":[{"type":"em"}]},{"text":" step is then called. In this step, the plugin receives group keys and all records that had that group key. It is then left to the plugin to decide what to do with each of the groups.","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"In order to implement a Batch Aggregator, you extend the ","type":"text"},{"text":"BatchAggregator","type":"text","marks":[{"type":"code"}]},{"text":" class. Unlike a ","type":"text"},{"text":"Transform","type":"text","marks":[{"type":"code"}]},{"text":", which operates on a single record at a time, a ","type":"text"},{"text":"BatchAggregator","type":"text","marks":[{"type":"code"}]},{"text":" operates on a collection of records.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Methods","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"configurePipeline()","type":"text","marks":[{"type":"code"}]},{"text":": Used to perform any validation on the application configuration that is required by this plugin or to create any datasets if the ","type":"text"},{"text":"fieldName","type":"text","marks":[{"type":"code"}]},{"text":" for a dataset is not a macro.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"initialize()","type":"text","marks":[{"type":"code"}]},{"text":": Initialize the Batch Aggregator. Guaranteed to be executed before any call to the plugin’s ","type":"text"},{"text":"groupBy","type":"text","marks":[{"type":"code"}]},{"text":" or ","type":"text"},{"text":"aggregate","type":"text","marks":[{"type":"code"}]},{"text":" methods. This is called by each executor of the job. For example, if the MapReduce engine is being used, each mapper will call this method.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"destroy()","type":"text","marks":[{"type":"code"}]},{"text":": Destroy any resources created by ","type":"text"},{"text":"initialize","type":"text","marks":[{"type":"code"}]},{"text":". Guaranteed to be executed after all calls to the plugin’s ","type":"text"},{"text":"groupBy","type":"text","marks":[{"type":"code"}]},{"text":" or ","type":"text"},{"text":"aggregate","type":"text","marks":[{"type":"code"}]},{"text":" methods have been made. This is called by each executor of the job. For example, if the MapReduce engine is being used, each mapper will call this method.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"groupBy()","type":"text","marks":[{"type":"code"}]},{"text":": This method will be called for every object that is received from the previous stage. This method returns zero or more group keys for each object it receives. Objects with the same group key will be grouped together for the ","type":"text"},{"text":"aggregate","type":"text","marks":[{"type":"code"}]},{"text":" method.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"aggregate()","type":"text","marks":[{"type":"code"}]},{"text":": The method is called after every object has been assigned their group keys. This method is called once for each group key emitted by the ","type":"text"},{"text":"groupBy","type":"text","marks":[{"type":"code"}]},{"text":" method. The method receives a group key as well as an iterator over all objects that had that group key. Objects emitted in this method are the output for this stage.","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Example:","type":"text"}]},{"type":"codeBlock","content":[{"text":"/**\n * Aggregator that counts how many times each word appears in records input to the aggregator.\n */\n@Plugin(type = BatchAggregator.PLUGIN_TYPE)\n@Name(WordCountAggregator.NAME)\n@Description(\"Counts how many times each word appears in all records input to the aggregator.\")\npublic class WordCountAggregator extends BatchAggregator {\n public static final String NAME = \"WordCount\";\n public static final Schema OUTPUT_SCHEMA = Schema.recordOf(\n \"wordCount\",\n Schema.Field.of(\"word\", Schema.of(Schema.Type.STRING)),\n Schema.Field.of(\"count\", Schema.of(Schema.Type.LONG))\n );\n private static final Pattern WHITESPACE = Pattern.compile(\"\\\\s\");\n private final Conf config;\n\n /**\n * Config properties for the plugin.\n */\n public static class Conf extends PluginConfig {\n @Description(\"The field from the input records containing the words to count.\")\n private String field;\n }\n\n public WordCountAggregator(Conf config) {\n this.config = config;\n }\n\n @Override\n public void configurePipeline(PipelineConfigurer pipelineConfigurer) {\n // any static configuration validation should happen here.\n // We will check that the field is in the input schema and is of type string.\n Schema inputSchema = pipelineConfigurer.getStageConfigurer().getInputSchema();\n // a null input schema means its unknown until runtime, or its not constant\n if (inputSchema != null) {\n // if the input schema is constant and known at configure time, check that the input field exists and is a string.\n Schema.Field inputField = inputSchema.getField(config.field);\n if (inputField == null) {\n throw new IllegalArgumentException(\n String.format(\"Field '%s' does not exist in input schema %s.\", config.field, inputSchema));\n }\n Schema fieldSchema = inputField.getSchema();\n Schema.Type fieldType = fieldSchema.isNullable() ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();\n if (fieldType != Schema.Type.STRING) {\n throw new IllegalArgumentException(\n String.format(\"Field '%s' is of illegal type %s. Must be of type %s.\",\n config.field, fieldType, Schema.Type.STRING));\n }\n }\n // set the output schema so downstream stages will know their input schema.\n pipelineConfigurer.getStageConfigurer().setOutputSchema(OUTPUT_SCHEMA);\n }\n\n @Override\n public void groupBy(StructuredRecord input, Emitter groupKeyEmitter) throws Exception {\n String val = input.get(config.field);\n if (val == null) {\n return;\n }\n\n for (String word : WHITESPACE.split(val)) {\n groupKeyEmitter.emit(word);\n }\n }\n\n @Override\n public void aggregate(String groupKey, Iterator groupValues,\n Emitter emitter) throws Exception {\n long count = 0;\n while (groupValues.hasNext()) {\n groupValues.next();\n count++;\n }\n emitter.emit(StructuredRecord.builder(OUTPUT_SCHEMA).set(\"word\", groupKey).set(\"count\", count).build());\n }\n}","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Batch Joiner Plugin","type":"text"}]},{"type":"paragraph","content":[{"text":"A ","type":"text"},{"text":"BatchJoiner","type":"text","marks":[{"type":"code"}]},{"text":" plugin is used to join records over a batch of data. It can be used in both batch and real-time data pipelines. A join takes place in two steps: a ","type":"text"},{"text":"joinOn","type":"text","marks":[{"type":"em"}]},{"text":" step followed by a ","type":"text"},{"text":"merge","type":"text","marks":[{"type":"em"}]},{"text":" step.","type":"text"}]},{"type":"orderedList","attrs":{"order":1},"content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"In the ","type":"text"},{"text":"joinOn","type":"text","marks":[{"type":"em"}]},{"text":" step, the joiner creates a join key for each input record. The CDAP pipeline will then take all records that have the same join key and collect them into a group.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"The ","type":"text"},{"text":"merge","type":"text","marks":[{"type":"em"}]},{"text":" step is then called. In this step, the plugin receives a list of all the records with same join key based on the type of join (either an ","type":"text"},{"text":"inner","type":"text","marks":[{"type":"em"}]},{"text":" or ","type":"text"},{"text":"outer","type":"text","marks":[{"type":"em"}]},{"text":" join). It is then up to the plugin to decide what to emit, in what becomes the final output of the stage.","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"To implement a Batch Joiner, you extend the ","type":"text"},{"text":"BatchJoiner","type":"text","marks":[{"type":"code"}]},{"text":" class. Unlike a ","type":"text"},{"text":"Transform","type":"text","marks":[{"type":"code"}]},{"text":", which operates on a single record at a time, a ","type":"text"},{"text":"BatchJoiner","type":"text","marks":[{"type":"code"}]},{"text":" operates on a collection of records.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Methods","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"configurePipeline()","type":"text","marks":[{"type":"code"}]},{"text":": Used to create any datasets, or perform any validation on the application configuration that is required by this plugin.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"initialize()","type":"text","marks":[{"type":"code"}]},{"text":": Initialize the Batch Joiner. Guaranteed to be executed before any call to the plugin’s ","type":"text"},{"text":"joinOn","type":"text","marks":[{"type":"code"}]},{"text":" or ","type":"text"},{"text":"merge","type":"text","marks":[{"type":"code"}]},{"text":" methods. This is called by each executor of the job. For example, if the MapReduce engine is being used, each mapper will call this method.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"prepareRun()","type":"text","marks":[{"type":"code"}]},{"text":": Prepares a pipeline run. This is run every time before a pipeline runs to help set up the run. Here you can set properties such as the number of partitions to use when joining and the join key class, if it is not known at compile time.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"destroy()","type":"text","marks":[{"type":"code"}]},{"text":": Destroy any resources created by the ","type":"text"},{"text":"initialize","type":"text","marks":[{"type":"code"}]},{"text":" method. Guaranteed to be executed after all calls to the plugin’s ","type":"text"},{"text":"joinOn","type":"text","marks":[{"type":"code"}]},{"text":" or ","type":"text"},{"text":"merge","type":"text","marks":[{"type":"code"}]},{"text":" methods have been made. This is called by each executor of the job. For example, if the MapReduce engine is being used, each mapper will call this method.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"joinOn()","type":"text","marks":[{"type":"code"}]},{"text":": This method will be called for every object that is received from the previous stage. This method returns a join key for each object it receives. Objects with the same join key will be grouped together for the ","type":"text"},{"text":"merge","type":"text","marks":[{"type":"code"}]},{"text":" method.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"getJoinConfig()","type":"text","marks":[{"type":"code"}]},{"text":": This method will be called by the CDAP Pipeline to find out the type of join to be performed. The config specifies which input stages are ","type":"text"},{"text":"requiredInputs","type":"text","marks":[{"type":"code"}]},{"text":". Records from a required input will always be present in the ","type":"text"},{"text":"merge()","type":"text","marks":[{"type":"code"}]},{"text":" method. Records from a non-required input will only be present in the ","type":"text"},{"text":"merge()","type":"text","marks":[{"type":"code"}]},{"text":" method if they meet the join criteria. In other words, if there are no required inputs, a full outer join is performed. If all inputs are required inputs, an inner join is performed.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"merge()","type":"text","marks":[{"type":"code"}]},{"text":": This method is called after each object has been assigned a join key. The method receives a join key, an iterator over all objects with that join key, and the stage that emitted the object. Objects emitted by this method are the output for this stage.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Spark Compute Plugin","type":"text"}]},{"type":"paragraph","content":[{"text":"A ","type":"text"},{"text":"SparkCompute","type":"text","marks":[{"type":"code"}]},{"text":" plugin is used to transform a collection of input records into a collection of output records. It can be used in both batch and real-time data pipelines. It is similar to a ","type":"text"},{"text":"Transform","type":"text","marks":[{"type":"code"}]},{"text":", except instead of transforming its input record by record, it transforms an entire collection. In a ","type":"text"},{"text":"SparkCompute","type":"text","marks":[{"type":"code"}]},{"text":" plugin, you are given access to anything you would be able to do in a Spark program.","type":"text"}]},{"type":"paragraph","content":[{"text":"In order to implement a Spark Compute Plugin, you extend the ","type":"text"},{"text":"SparkCompute","type":"text","marks":[{"type":"code"}]},{"text":" class.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Methods","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"configurePipeline()","type":"text","marks":[{"type":"code"}]},{"text":": Used to perform any validation on the application configuration that is required by this plugin or to create any datasets if the ","type":"text"},{"text":"fieldName","type":"text","marks":[{"type":"code"}]},{"text":" for a dataset is not a macro.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"transform()","type":"text","marks":[{"type":"code"}]},{"text":": This method is given a Spark RDD (Resilient Distributed Dataset) containing every object that is received from the previous stage. This method then performs Spark operations on the input to transform it into an output RDD that will be sent to the next stage.","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Example:","type":"text"}]},{"type":"codeBlock","content":[{"text":"/**\n * SparkCompute plugin that counts how many times each word appears in records input to the compute stage.\n */\n@Plugin(type = SparkCompute.PLUGIN_TYPE)\n@Name(WordCountCompute.NAME)\n@Description(\"Counts how many times each word appears in all records input to the aggregator.\")\npublic class WordCountCompute extends SparkCompute {\n public static final String NAME = \"WordCount\";\n public static final Schema OUTPUT_SCHEMA = Schema.recordOf(\n \"wordCount\",\n Schema.Field.of(\"word\", Schema.of(Schema.Type.STRING)),\n Schema.Field.of(\"count\", Schema.of(Schema.Type.LONG))\n );\n private final Conf config;\n\n /**\n * Config properties for the plugin.\n */\n public static class Conf extends PluginConfig {\n @Description(\"The field from the input records containing the words to count.\")\n private String field;\n }\n\n public WordCountCompute(Conf config) {\n this.config = config;\n }\n\n @Override\n public void configurePipeline(PipelineConfigurer pipelineConfigurer) {\n // any static configuration validation should happen here.\n // We will check that the field is in the input schema and is of type string.\n Schema inputSchema = pipelineConfigurer.getStageConfigurer().getInputSchema();\n if (inputSchema != null) {\n WordCount wordCount = new WordCount(config.field);\n wordCount.validateSchema(inputSchema);\n }\n // set the output schema so downstream stages will know their input schema.\n pipelineConfigurer.getStageConfigurer().setOutputSchema(OUTPUT_SCHEMA);\n }\n\n @Override\n public JavaRDD transform(SparkExecutionPluginContext sparkExecutionPluginContext,\n JavaRDD javaRDD) throws Exception {\n WordCount wordCount = new WordCount(config.field);\n return wordCount.countWords(javaRDD)\n .flatMap(new FlatMapFunction, StructuredRecord>() {\n @Override\n public Iterable call(Tuple2 stringLongTuple2) throws Exception {\n List output = new ArrayList<>();\n output.add(StructuredRecord.builder(OUTPUT_SCHEMA)\n .set(\"word\", stringLongTuple2._1())\n .set(\"count\", stringLongTuple2._2())\n .build());\n return output;\n }\n });\n }\n}","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Spark Sink Plugin","type":"text"}]},{"type":"paragraph","content":[{"text":"A ","type":"text"},{"text":"SparkSink","type":"text","marks":[{"type":"code"}]},{"text":" plugin is used to perform computations on a collection of input records and optionally write output data. It can only be used in batch data pipelines. A ","type":"text"},{"text":"SparkSink","type":"text","marks":[{"type":"code"}]},{"text":" is similar to a ","type":"text"},{"text":"SparkCompute","type":"text","marks":[{"type":"code"}]},{"text":" plugin except that it has no output. In a ","type":"text"},{"text":"SparkSink","type":"text","marks":[{"type":"code"}]},{"text":", you are given access to anything you would be able to do in a Spark program. For example, one common use case is to train a machine-learning model in this plugin.","type":"text"}]},{"type":"paragraph","content":[{"text":"In order to implement a Spark Sink Plugin, you extend the ","type":"text"},{"text":"SparkSink","type":"text","marks":[{"type":"code"}]},{"text":" class.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Methods","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"configurePipeline()","type":"text","marks":[{"type":"code"}]},{"text":": Used to perform any validation on the application configuration that is required by this plugin or to create any datasets if the ","type":"text"},{"text":"fieldName","type":"text","marks":[{"type":"code"}]},{"text":" for a dataset is not a macro.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"run()","type":"text","marks":[{"type":"code"}]},{"text":": This method is given a Spark RDD (Resilient Distributed Dataset) containing every object that is received from the previous stage. Then this method performs Spark operations on the input, and usually saves the result to a dataset.","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Example:","type":"text"}]},{"type":"codeBlock","content":[{"text":"/**\n * SparkSink plugin that counts how many times each word appears in records input to it and stores the result in\n * a KeyValueTable.\n */\n@Plugin(type = SparkSink.PLUGIN_TYPE)\n@Name(WordCountSink.NAME)\n@Description(\"Counts how many times each word appears in all records input to the aggregator.\")\npublic class WordCountSink extends SparkSink {\n public static final String NAME = \"WordCount\";\n private final Conf config;\n\n /**\n * Config properties for the plugin.\n */\n public static class Conf extends PluginConfig {\n @Description(\"The field from the input records containing the words to count.\")\n private String field;\n\n @Description(\"The name of the KeyValueTable to write to.\")\n private String tableName;\n }\n\n public WordCountSink(Conf config) {\n this.config = config;\n }\n\n @Override\n public void configurePipeline(PipelineConfigurer pipelineConfigurer) {\n // any static configuration validation should happen here.\n // We will check that the field is in the input schema and is of type string.\n Schema inputSchema = pipelineConfigurer.getStageConfigurer().getInputSchema();\n if (inputSchema != null) {\n WordCount wordCount = new WordCount(config.field);\n wordCount.validateSchema(inputSchema);\n }\n pipelineConfigurer.createDataset(config.tableName, KeyValueTable.class, DatasetProperties.EMPTY);\n }\n\n @Override\n public void prepareRun(SparkPluginContext sparkPluginContext) throws Exception {\n // no-op\n }\n\n @Override\n public void run(SparkExecutionPluginContext sparkExecutionPluginContext,\n JavaRDD javaRDD) throws Exception {\n WordCount wordCount = new WordCount(config.field);\n JavaPairRDD outputRDD = wordCount.countWords(javaRDD)\n .mapToPair(new PairFunction, byte[], byte[]>() {\n @Override\n public Tuple2 call(Tuple2 stringLongTuple2) throws Exception {\n return new Tuple2<>(Bytes.toBytes(stringLongTuple2._1()), Bytes.toBytes(stringLongTuple2._2()));\n }\n });\n sparkExecutionPluginContext.saveAsDataset(outputRDD, config.tableName);\n }\n}","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Streaming Source Plugin","type":"text"}]},{"type":"paragraph","content":[{"text":"A Streaming Source plugin is used as a source in real-time data pipelines. It is used to fetch a Spark DStream, which is an object that represents a collection of Spark RDDs and that produces a new RDD every batch interval of the pipeline.","type":"text"}]},{"type":"paragraph","content":[{"text":"In order to implement a Streaming Source Plugin, you extend the ","type":"text"},{"text":"StreamingSource","type":"text","marks":[{"type":"code"}]},{"text":" class.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Methods","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"configurePipeline()","type":"text","marks":[{"type":"code"}]},{"text":": Used to perform any validation on the application configuration that is required by this plugin or to create any datasets if the ","type":"text"},{"text":"fieldName","type":"text","marks":[{"type":"code"}]},{"text":" for a dataset is not a macro.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"getStream()","type":"text","marks":[{"type":"code"}]},{"text":": Returns the ","type":"text"},{"text":"JavaDStream","type":"text","marks":[{"type":"code"}]},{"text":" that will be used as a source in the pipeline.","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Example:","type":"text"}]},{"type":"codeBlock","content":[{"text":"@Plugin(type = StreamingSource.PLUGIN_TYPE)\n@Name(\"Twitter\")\n@Description(\"Twitter streaming source.\")\npublic class TwitterStreamingSource extends StreamingSource {\n private final TwitterStreamingConfig config;\n\n /**\n * Config class for TwitterStreamingSource.\n */\n public static class TwitterStreamingConfig extends PluginConfig implements Serializable {\n private static final long serialVersionUID = 4218063781909515444L;\n\n private String consumerKey;\n\n private String consumerSecret;\n\n private String accessToken;\n\n private String accessTokenSecret;\n\n private String referenceName;\n }\n\n public TwitterStreamingSource(TwitterStreamingConfig config) {\n this.config = config;\n }\n\n @Override\n public void configurePipeline(PipelineConfigurer pipelineConfigurer) {\n pipelineConfigurer.getStageConfigurer().setOutputSchema(TwitterConstants.SCHEMA);\n }\n\n @Override\n public JavaDStream getStream(StreamingContext context) throws Exception {\n JavaStreamingContext javaStreamingContext = context.getSparkStreamingContext();\n // lineage for this source will be tracked with this reference name\n context.registerLineage(config.referenceName);\n\n // Create authorization from user-provided properties\n ConfigurationBuilder configurationBuilder = new ConfigurationBuilder();\n configurationBuilder.setDebugEnabled(false)\n .setOAuthConsumerKey(config.consumerKey)\n .setOAuthConsumerSecret(config.consumerSecret)\n .setOAuthAccessToken(config.accessToken)\n .setOAuthAccessTokenSecret(config.accessTokenSecret);\n Authorization authorization = new OAuthAuthorization(configurationBuilder.build());\n\n return TwitterUtils.createStream(javaStreamingContext, authorization).map(\n new Function() {\n public StructuredRecord call(Status status) {\n return convertTweet(status);\n }\n }\n );\n }\n\n private StructuredRecord convertTweet(Status tweet) {\n // logic to convert a Twitter Status into a CDAP StructuredRecord\n }\n\n}","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Lineage","type":"text"}]},{"type":"paragraph","content":[{"text":"The lineage is registered using the ","type":"text"},{"text":"referenceName","type":"text","marks":[{"type":"code"}]},{"text":" provided when invoking ","type":"text"},{"text":"registerLineage()","type":"text","marks":[{"type":"code"}]},{"text":" on ","type":"text"},{"text":"StreamingContext","type":"text","marks":[{"type":"code"}]},{"text":" in ","type":"text"},{"text":"getStream()","type":"text","marks":[{"type":"code"}]},{"text":". Note that the ","type":"text"},{"text":"referenceName","type":"text","marks":[{"type":"code"}]},{"text":" should be a valid ","type":"text"},{"text":"DatasetId","type":"text","marks":[{"type":"code"}]},{"text":".","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Windower Plugin","type":"text"}]},{"type":"paragraph","content":[{"text":"A Windower plugin is used in real-time data pipelines to create sliding windows over the data. It does this by combining multiple micro batches into larger batches.","type":"text"}]},{"type":"paragraph","content":[{"text":"A window is defined by its ","type":"text"},{"text":"size","type":"text","marks":[{"type":"em"}]},{"text":" and its ","type":"text"},{"text":"slide interval","type":"text","marks":[{"type":"em"}]},{"text":". Both are defined in seconds and must be multiples of the ","type":"text"},{"text":"batchInterval","type":"text","marks":[{"type":"code"}]},{"text":" of the pipeline. The ","type":"text"},{"text":"size","type":"text","marks":[{"type":"em"}]},{"text":" defines how much data is contained in the window. The ","type":"text"},{"text":"slide interval","type":"text","marks":[{"type":"em"}]},{"text":" defines have often a window is created.","type":"text"}]},{"type":"paragraph","content":[{"text":"For example, consider a pipeline with a ","type":"text"},{"text":"batchInterval","type":"text","marks":[{"type":"code"}]},{"text":" of 10 seconds. The pipeline uses a ","type":"text"},{"text":"windower","type":"text","marks":[{"type":"code"}]},{"text":" that has a size of 60 and a slide interval of 30. The input into the ","type":"text"},{"text":"windower","type":"text","marks":[{"type":"code"}]},{"text":" will be micro batches containing 10 seconds of data. Every 30 seconds, the ","type":"text"},{"text":"windower","type":"text","marks":[{"type":"code"}]},{"text":" will output a batch of data containing the past 60 seconds of data, meaning the previous six micro batches that it received as input.","type":"text"}]},{"type":"paragraph","content":[{"text":"This also means that each window output will overlap (repeat) some of the data from the previous window. This is useful in calculating aggregates, such as how many \"404\" responses did a website send out in the past ten seconds, past minute, past five minutes.","type":"text"}]},{"type":"paragraph","content":[{"text":"In order to implement a Windower Plugin, you extend the ","type":"text"},{"text":"Windower","type":"text","marks":[{"type":"code"}]},{"text":" class.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Methods","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"configurePipeline()","type":"text","marks":[{"type":"code"}]},{"text":": Used to perform any validation on the application configuration that is required by this plugin or to create any datasets if the ","type":"text"},{"text":"fieldName","type":"text","marks":[{"type":"code"}]},{"text":" for a ","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"dataset is not a macro.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"getWidth()","type":"text","marks":[{"type":"code"}]},{"text":": Return the width in seconds of windows created by this plugin. Must be a multiple of the ","type":"text"},{"text":"batchInterval","type":"text","marks":[{"type":"code"}]},{"text":" of the pipeline.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"getSlideInterval()","type":"text","marks":[{"type":"code"}]},{"text":": Get the slide interval in seconds of windows created by this plugin. Must be a multiple of the ","type":"text"},{"text":"batchInterval","type":"text","marks":[{"type":"code"}]},{"text":" of the pipeline.","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Example:","type":"text"}]},{"type":"codeBlock","content":[{"text":"@Plugin(type = Windower.PLUGIN_TYPE)\n@Name(\"Window\")\n@Description(\"Creates a sliding window over the data\")\npublic class Window extends Windower {\n private final Conf conf;\n\n /**\n * Config for window plugin.\n */\n public static class Conf extends PluginConfig {\n long width;\n long slideInterval;\n }\n\n public Window(Conf conf) {\n this.conf = conf;\n }\n\n @Override\n public long getWidth() {\n return conf.width;\n }\n\n @Override\n public long getSlideInterval() {\n return conf.slideInterval;\n }\n}","type":"text"}]},{"type":"paragraph"}],"version":1}

Browser not supported