Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Solr server location, defined once as the SOLR_LOCATOR variable and
- # referenced further down through ${SOLR_LOCATOR} substitution:
- SOLR_LOCATOR : {
- # Name of the Solr collection this locator points at
- collection : IDEAL_tweets
- # ZooKeeper ensemble address (host:port followed by the /solr path)
- zkHost : "127.0.0.1:2181/solr"
- }
- # List of morphline definitions; this file declares a single one.
- morphlines : [
- {
- # Identifier of this morphline.
- id : morphline1
- # Package patterns from which morphline command implementations are imported.
- importCommands : ["org.kitesdk.morphline.**", "com.ngdata.**", "com.cloudera.cdk.morphline.**", "org.apache.solr.**"]
- # Commands are listed in the order they are applied to each record.
- commands : [
- # Map HBase cells to record fields: original:tweet_id -> id and
- # original:text_clean -> text, both taken from the cell value as strings.
- {
- extractHBaseCells {
- mappings : [
- {
- inputColumn : "original:tweet_id"
- outputField : "id"
- type : string
- source : value
- }
- {
- inputColumn : "original:text_clean"
- outputField : "text"
- type : string
- source : value
- }
- ]
- }
- }
- # Convert the created_at field to the native Solr timestamp format,
- # e.g. 2012-09-06T07:14:34Z becomes 2012-09-06T07:14:34.000Z.
- # NOTE(review): the extractHBaseCells mappings above do not produce a
- # created_at field - confirm where it is expected to come from.
- {
- convertTimestamp {
- field : created_at
- inputFormats : ["yyyy-MM-dd'T'HH:mm:ss'Z'", "yyyy-MM-dd"]
- inputTimezone : UTC
- outputFormat : "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"
- outputTimezone : UTC
- }
- }
- # Split the description field on "|" and store the pieces in title.
- # NOTE(review): the extractHBaseCells mappings above do not produce a
- # description field - confirm where it is expected to come from.
- {
- split {
- inputField : "description"
- outputField : "title"
- separator : "|"
- }
- }
- # This command deletes record fields that are unknown to Solr
- # schema.xml. Solr throws an exception on any attempt to load a
- # document that contains a field that is not specified in schema.xml.
- # It is placed after every command that creates or rewrites fields so
- # that fields added above (such as title) are checked as well.
- {
- sanitizeUnknownSolrFields {
- # Location from which to fetch Solr schema
- solrLocator : ${SOLR_LOCATOR}
- }
- }
- # Log the final record at TRACE level for debugging.
- {
- logTrace {
- format : "output record: {}", args : ["@{}"]
- }
- }
- ]
- }
- # Close the morphlines array opened at the top of this block.
- ]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement