Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Spark application setup. The Arrow flag is set on the conf before the
# context is created so it is in effect for the whole application.
conf = (
    SparkConf()
    .setAppName("sample_app")
    .set("spark.sql.execution.arrow.enabled", "true")
)
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# The training file is read through the CSV reader with a U+0001 field
# delimiter and schema inference enabled.
load_data = "hdfs:///user/pknees/RSC20/training.tsv"
df = (
    sqlContext.read.format("com.databricks.spark.csv")
    .option("inferSchema", "true")
    .option("delimiter", "\u0001")
    .load(load_data)
)

# Intended column names for the loaded data, in file order.
# NOTE(review): nothing visible in this section applies these names to `df`
# (e.g. via `df.toDF(*column_names)`) -- confirm that happens elsewhere.
column_names = [
    "text_tokens",
    "hashtags",
    "tweet_id",
    "present_media",
    "present_links",
    "present_domains",
    "tweet_type",
    "language",
    "tweet_timestamp",
    "engaged_with_user_id",
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    "engaging_user_id",
    "engaging_user_follower_count",
    "engaging_user_following_count",
    "engaging_user_is_verified",
    "engaging_user_account_creation",
    "engaged_follows_engaging",
    "reply_timestamp",
    "retweet_timestamp",
    "retweet_with_comment_timestamp",
    "like_timestamp",
]
def classifyLabels(inputCol: object) -> int:
    """Return 1 when *inputCol* holds a value and 0 when it is ``None``.

    Only ``None`` maps to 0; falsy values such as ``0`` or ``""`` still
    count as present and map to 1.
    """
    return 0 if inputCol is None else 1
# UDF wrapper around classifyLabels, kept for any other code that uses it.
# No return type is given, so it produces string-typed columns.
labelClassify = udf(classifyLabels)

# Derive one 0/1 integer label per engagement type: 1 when the matching
# timestamp column is non-null, 0 otherwise. Native column expressions give
# the same result as applying the UDF and casting to IntegerType afterwards,
# without the Python UDF round-trip.
_label_sources = (
    ("is_reply", "reply_timestamp"),
    ("is_retweet", "retweet_timestamp"),
    ("is_retweet_with_comment", "retweet_with_comment_timestamp"),
    ("is_like", "like_timestamp"),
)
for _label_col, _timestamp_col in _label_sources:
    dfWithWeekdays = dfWithWeekdays.withColumn(
        _label_col,
        f.col(_timestamp_col).isNotNull().cast(IntegerType()),
    )

# Keep only the rows that received a like.
dfWithWeekdaysLikes = dfWithWeekdays.filter(dfWithWeekdays["is_like"] == 1)

# Count liked rows per "tweet_timestamp_weekdays" value.
# NOTE(review): that column is not created in the visible code -- presumably
# it is added where dfWithWeekdays is built; confirm.
# The resulting DataFrame is not assigned or shown here, so outside an
# interactive session this line has no visible effect.
dfWithWeekdaysLikes.groupBy("tweet_timestamp_weekdays").count()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement