Oberon's Legacy

{"type":"doc","content":[{"type":"paragraph","content":[{"text":"Plugin version: 2.11.0","type":"text"}]},{"type":"paragraph","content":[{"text":"The XML Reader plugin is a source plugin that allows users to read XML files stored on HDFS.","type":"text"}]},{"type":"paragraph","content":[{"text":"A user would like to read XML files that have been dropped into HDFS. These can range in size from small to very large XML files. The XMLReader will read and parse the files, and when used in conjunction with the XMLParser plugin, fields can be extracted. This reader emits one XML event, specified by the node path property, for each file read.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Configuration","type":"text"}]},{"type":"table","attrs":{"layout":"default","localId":"d46d4bc8-2cf9-4509-801b-004ab530d674"},"content":[{"type":"tableRow","content":[{"type":"tableHeader","attrs":{"colspan":1,"rowspan":1,"colwidth":[253.0]},"content":[{"type":"paragraph","content":[{"text":"Property","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"tableHeader","attrs":{"colspan":1,"rowspan":1,"colwidth":[106.0]},"content":[{"type":"paragraph","content":[{"text":"Macro Enabled?","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"tableHeader","attrs":{"colspan":1,"rowspan":1,"colwidth":[400.0]},"content":[{"type":"paragraph","content":[{"text":"Description","type":"text","marks":[{"type":"strong"}]}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[253.0]},"content":[{"type":"paragraph","content":[{"text":"Reference Name","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[106.0]},"content":[{"type":"paragraph","content":[{"text":"No","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[400.0]},"content":[{"type":"paragraph","content":[{"text":"Required. 
This will be used to uniquely identify this source for lineage, annotating metadata, etc.","type":"text"}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[253.0]},"content":[{"type":"paragraph","content":[{"text":"Path","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[106.0]},"content":[{"type":"paragraph","content":[{"text":"Yes","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[400.0]},"content":[{"type":"paragraph","content":[{"text":"Required. Path to file(s) to be read. If a directory is specified, terminate the path name with a ‘/’. This leverages glob syntax as described in the ","type":"text"},{"text":"Java Documentation","type":"text","marks":[{"type":"link","attrs":{"href":"https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob"}}]},{"text":".","type":"text"}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[253.0]},"content":[{"type":"paragraph","content":[{"text":"Node Path","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[106.0]},"content":[{"type":"paragraph","content":[{"text":"Yes","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[400.0]},"content":[{"type":"paragraph","content":[{"text":"Required. Node path (XPath) to emit as an individual event from the XML schema. Example: '/book/price' to read only the price from under the book node. For more information about XPaths, see the ","type":"text"},{"text":"Java Documentation","type":"text","marks":[{"type":"link","attrs":{"href":"https://docs.oracle.com/javase/tutorial/jaxp/xslt/xpath.html"}}]},{"text":". 
","type":"text"}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[253.0]},"content":[{"type":"paragraph","content":[{"text":"Action After Processing File","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[106.0]},"content":[{"type":"paragraph","content":[{"text":"No","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[400.0]},"content":[{"type":"paragraph","content":[{"text":"Required. Action to be taken after processing of the XML file. Possible actions are: (DELETE) delete from HDFS; (ARCHIVE) archive to the target location; and (MOVE) move to the target location.","type":"text"}]},{"type":"paragraph","content":[{"text":"Default is None.","type":"text"}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[253.0]},"content":[{"type":"paragraph","content":[{"text":"Reprocessing Required","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[106.0]},"content":[{"type":"paragraph","content":[{"text":"No","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[400.0]},"content":[{"type":"paragraph","content":[{"text":"Required. Specifies whether the files should be reprocessed. 
If set to ","type":"text"},{"text":"No","type":"text","marks":[{"type":"code"}]},{"text":", the files are tracked and will not be processed again on future runs of the pipeline.","type":"text"}]},{"type":"paragraph","content":[{"text":"Default is Yes.","type":"text"}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[253.0]},"content":[{"type":"paragraph","content":[{"text":"Temporary Folder","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[106.0]},"content":[{"type":"paragraph","content":[{"text":"Yes","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[400.0]},"content":[{"type":"paragraph","content":[{"text":"Required. An existing folder path with read and write access for the current user. This is required for storing temporary files containing paths of the processed XML files. These temporary files will be read at the end of the job to update the file track table. ","type":"text"}]},{"type":"paragraph","content":[{"text":"Default is ","type":"text"},{"text":"/tmp","type":"text","marks":[{"type":"code"}]},{"text":". ","type":"text"}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[253.0]},"content":[{"type":"paragraph","content":[{"text":"File Pattern","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[106.0]},"content":[{"type":"paragraph","content":[{"text":"Yes","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[400.0]},"content":[{"type":"paragraph","content":[{"text":"Optional. The regular expression pattern used to select specific files. This should be used in cases when the glob syntax in the ","type":"text"},{"text":"Path","type":"text","marks":[{"type":"code"}]},{"text":" is not precise enough. 
See examples in the “Usage Notes” below.","type":"text"}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[253.0]},"content":[{"type":"paragraph","content":[{"text":"Target Folder","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[106.0]},"content":[{"type":"paragraph","content":[{"text":"Yes","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[400.0]},"content":[{"type":"paragraph","content":[{"text":"Optional. Target folder path, used when the user selects either ARCHIVE or MOVE as the action after processing. Target folder must be an existing directory.","type":"text"}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[253.0]},"content":[{"type":"paragraph","content":[{"text":"Enable processing external entities","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[106.0]},"content":[{"type":"paragraph","content":[{"text":"Yes","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[400.0]},"content":[{"type":"paragraph","content":[{"text":"Optional. This enables processing external entities while reading an XML file. Defaults to ","type":"text"},{"text":"false","type":"text","marks":[{"type":"code"}]},{"text":". ","type":"text"},{"text":"Note","type":"text","marks":[{"type":"strong"}]},{"text":": External entities should be enabled only if necessary. Enabling them poses a security risk of malicious code execution. 
Please read more about the ","type":"text"},{"text":"XXE XML vulnerability here","type":"text","marks":[{"type":"link","attrs":{"href":"https://owasp.org/www-community/vulnerabilities/XML_External_Entity_(XXE)_Processing"}}]},{"text":".","type":"text"}]},{"type":"paragraph","content":[{"text":"Default is Off.","type":"text"}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[253.0]},"content":[{"type":"paragraph","content":[{"text":"Enable XML parser to support DTDs","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[106.0]},"content":[{"type":"paragraph","content":[{"text":"No","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[400.0]},"content":[{"type":"paragraph","content":[{"text":"Optional. This enables DTD support while processing an XML file. This property needs to be set ","type":"text"},{"text":"false","type":"text","marks":[{"type":"code"}]},{"text":" if external entities need to be evaluated. ","type":"text"}]},{"type":"paragraph","content":[{"text":"Default is Off.","type":"text"}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[253.0]},"content":[{"type":"paragraph","content":[{"text":"Output Schema","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[106.0]},"content":[{"type":"paragraph","content":[{"text":"No","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[400.0]},"content":[{"type":"paragraph","content":[{"text":"Required. The output schema for the data.","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Usage Notes","type":"text"}]},{"type":"paragraph","content":[{"text":"When specifying a regular expression for filtering files, you must use glob syntax in the folder path. 
This usually means ending the path with '/*'.","type":"text"}]},{"type":"paragraph","content":[{"text":"Here are some regular expression pattern examples:","type":"text"}]},{"type":"orderedList","attrs":{"order":1},"content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Use '^' to select files with names starting with 'catalog', such as '^catalog'.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Use '$' to select files with names ending with 'catalog.xml', such as 'catalog.xml$'.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Use '.*' to select files with a name that contains 'catalogBook', such as 'catalogBook.*'.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Example","type":"text"}]},{"type":"paragraph","content":[{"text":"This example reads data from the folder ","type":"text"},{"text":"hdfs:/cdap/source/xmls/","type":"text","marks":[{"type":"code"}]},{"text":" and emits XML records on the basis of the node path ","type":"text"},{"text":"/catalog/book/title","type":"text","marks":[{"type":"code"}]},{"text":". It will generate structured records with the fields ","type":"text"},{"text":"offset","type":"text","marks":[{"type":"code"}]},{"text":", ","type":"text"},{"text":"fileName","type":"text","marks":[{"type":"code"}]},{"text":", and ","type":"text"},{"text":"record","type":"text","marks":[{"type":"code"}]},{"text":". 
It will move the XML files to the target folder ","type":"text"},{"text":"hdfs:/cdap/target/xmls/","type":"text","marks":[{"type":"code"}]},{"text":" and update the processed file information in the ","type":"text"},{"text":"table named","type":"text","marks":[{"type":"annotation","attrs":{"annotationType":"inlineComment","id":"08169af9-3fa6-4414-830d-317ae1ecbeaa"}}]},{"text":" ","type":"text"},{"text":"trackingTable","type":"text","marks":[{"type":"code"}]},{"text":".","type":"text"}]},{"type":"table","attrs":{"layout":"default","localId":"f94e1ccf-7bee-4a3c-8277-8a901e046a36"},"content":[{"type":"tableRow","content":[{"type":"tableHeader","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"Property","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"tableHeader","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"Value","type":"text","marks":[{"type":"strong"}]}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"Reference Name","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"referenceName","type":"text","marks":[{"type":"code"}]}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"Path","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"hdfs:/cdap/source/xmls/*","type":"text","marks":[{"type":"code"}]}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"Node 
Path","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"/catalog/book/title","type":"text","marks":[{"type":"code"}]}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"Action After Processing File","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"Move","type":"text","marks":[{"type":"code"}]}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"Reprocessing Required","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"No","type":"text","marks":[{"type":"code"}]}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"Temporary Folder","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"hdfs:/cdap/target/xmls/","type":"text","marks":[{"type":"code"}]}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"File Pattern","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"^catalog.*","type":"text","marks":[{"type":"code"}]}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"Target 
Folder","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[340.0]},"content":[{"type":"paragraph","content":[{"text":"hdfs:/cdap/target/xmls/","type":"text","marks":[{"type":"code"}]}]}]}]}]},{"type":"paragraph","content":[{"text":"For this XML as an input:","type":"text"}]},{"type":"codeBlock","content":[{"text":"<catalog>\n  <book>\n    <author>Corets, Eva</author>\n    <title>Oberon's Legacy</title>\n    <genre>Fantasy</genre>\n    <price><base>5.95</base><tax><surcharge>13.00</surcharge><excise>13.00</excise></tax></price>\n    <publish_date>2001-03-10</publish_date>\n    <description>In post-apocalypse England, the mysterious\n    agent known only as Oberon helps to create a new life\n    for the inhabitants of London. Sequel to Maeve\n    Ascendant.</description>\n  </book>\n  <book>\n    <author>Corets, Eva</author>\n    <title>The Sundered Grail</title>\n    <genre>Fantasy</genre>\n    <price><base>5.95</base><tax><surcharge>14.00</surcharge><excise>14.00</excise></tax></price>\n    <publish_date>2001-09-10</publish_date>\n    <description>The two daughters of Maeve, half-sisters,\n    battle one another for control of England. Sequel to\n    Oberon's Legacy.</description>\n  </book>\n</catalog>","type":"text"}]},{"type":"paragraph","content":[{"text":"The output records will be:","type":"text"}]},{"type":"table","attrs":{"layout":"default","localId":"1b49d20b-8ec4-49af-9d5c-8d416480f9d8"},"content":[{"type":"tableRow","content":[{"type":"tableHeader","attrs":{"colspan":1,"rowspan":1,"colwidth":[124.0]},"content":[{"type":"paragraph","content":[{"text":"offset","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"tableHeader","attrs":{"colspan":1,"rowspan":1,"colwidth":[348.0]},"content":[{"type":"paragraph","content":[{"text":"fileName","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"tableHeader","attrs":{"colspan":1,"rowspan":1,"colwidth":[287.0]},"content":[{"type":"paragraph","content":[{"text":"record","type":"text","marks":[{"type":"strong"}]}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[124.0]},"content":[{"type":"paragraph","content":[{"text":"2","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[348.0]},"content":[{"type":"paragraph","content":[{"text":"hdfs:/cdap/source/xmls/catalog.xml","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[28
7.0]},"content":[{"type":"paragraph","content":[{"text":"Oberon’s Legacy","type":"text"}]}]}]},{"type":"tableRow","content":[{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[124.0]},"content":[{"type":"paragraph","content":[{"text":"13","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[348.0]},"content":[{"type":"paragraph","content":[{"text":"hdfs:/cdap/source/xmls/catalog.xml","type":"text"}]}]},{"type":"tableCell","attrs":{"colspan":1,"rowspan":1,"colwidth":[287.0]},"content":[{"type":"paragraph","content":[{"text":"The Sundered Grail","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"type":"hardBreak"}]}],"version":1}

Browser not supported