# Untitled

# Build the Spark configuration step by step: application name first, then
# the Arrow execution flag (passed as the string "true").
conf = SparkConf()
conf.setAppName("sample_app")
conf.set("spark.sql.execution.arrow.enabled", "true")

# One SparkContext for the application, plus the SQLContext used for reads.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# HDFS path of the training file.
load_data = "hdfs:///user/pknees/RSC20/training.tsv"

# Read the file through the CSV data source. Fields are separated by the
# U+0001 control character and column types are inferred from the data.
# No header option is set, so the columns carry Spark's default names here.
df = (
    sqlContext.read.format("com.databricks.spark.csv")
    .option("inferSchema", "true")
    .option("delimiter", "\u0001")
    .load(load_data)
)

# Names for the 24 fields of the training file, in file order.
# NOTE(review): nothing in the visible code applies these names to `df`;
# presumably that happens elsewhere (e.g. via `toDF`) — confirm.
column_names = [
    # Tweet features
    "text_tokens",
    "hashtags",
    "tweet_id",
    "present_media",
    "present_links",
    "present_domains",
    "tweet_type",
    "language",
    "tweet_timestamp",
    # Engaged-with user features
    "engaged_with_user_id",
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    # Engaging user features
    "engaging_user_id",
    "engaging_user_follower_count",
    "engaging_user_following_count",
    "engaging_user_is_verified",
    "engaging_user_account_creation",
    "engaged_follows_engaging",
    # Engagement timestamps
    "reply_timestamp",
    "retweet_timestamp",
    "retweet_with_comment_timestamp",
    "like_timestamp",
]


def classifyLabels(inputCol):
    """Return 1 when *inputCol* holds a value and 0 when it is None.

    Only ``None`` maps to 0; falsy values such as ``0`` or ``""`` still
    count as present and map to 1.
    """
    return int(inputCol is not None)

# Spark UDF form of classifyLabels. The return type is declared explicitly:
# without it `udf` defaults to a string return type, so the 0/1 flags would
# come back as strings and need a separate cast to integer afterwards.
labelClassify = udf(classifyLabels, IntegerType())


# Derive one integer 0/1 flag column per engagement type: is_<type> is 1 when
# the matching <type>_timestamp column is non-null and 0 otherwise.
# The null check uses native column expressions rather than a Python UDF, so
# rows are not shipped to Python workers and no string-to-int cast is needed.
# NOTE(review): dfWithWeekdays is not defined in the visible part of this
# file; it is assumed to be created earlier — confirm.
for _engagement in ("reply", "retweet", "retweet_with_comment", "like"):
    dfWithWeekdays = dfWithWeekdays.withColumn(
        "is_" + _engagement,
        f.col(_engagement + "_timestamp").isNotNull().cast(IntegerType()),
    )


# Keep only the rows whose is_like flag equals 1.
dfWithWeekdaysLikes = dfWithWeekdays.where(dfWithWeekdays.is_like == 1)

# Count the liked rows per value of tweet_timestamp_weekdays.
# NOTE(review): the resulting DataFrame is neither assigned nor shown here;
# outside an interactive session this line has no visible effect — confirm
# whether a .show() or an assignment was intended.
dfWithWeekdaysLikes.groupby("tweet_timestamp_weekdays").count()