Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import random

try:
    from urllib.request import urlopen  # Python 3 location
except ImportError:
    from urllib import urlopen  # Python 2 location (the original import)

from pyspark.sql import SparkSession
# Plain-text list of first names, one name per line, used for the third RDD.
NAMES_URL = (
    "https://raw.githubusercontent.com/philipperemy/name-dataset/"
    "master/names_dataset/first_names.all.txt"
)


def random_numbers(count, low, high):
    """Return a list of ``count`` random integers in ``low``..``high`` inclusive."""
    return [random.randint(low, high) for _ in range(count)]


def decode_lines(raw):
    """Decode downloaded bytes as UTF-8 and return them as a list of lines.

    Undecodable bytes are replaced instead of raising, so one bad byte in the
    remote file does not abort the whole demo.
    """
    return raw.decode("utf-8", "replace").splitlines()


def main():
    """Build three small RDDs (numbers, strings, downloaded text) and filter each.

    The Spark session is always stopped, even when one of the steps fails.
    """
    print("Pyspark 101 tutorial Datamaking")
    print("Part 3 - How to create rdd with numbers , string and creating from reading text file in PySpark")

    # Parenthesised chain instead of backslash continuations: a stray trailing
    # backslash after getOrCreate() used to splice the next statement onto it.
    spark = (
        SparkSession.builder
        .appName("Part 3 - How to create rdd with numbers , string and creating from reading text file in Pyspark using Pycharm IDE")
        .master("local[*]")
        .enableHiveSupport()
        .getOrCreate()
    )

    try:
        # --- RDD from a Python list of numbers -------------------------------
        py_rand_list = random_numbers(100, 1, 100)
        print(py_rand_list)
        print("PYTHON NUMBER LIST")
        print(py_rand_list)
        print(type(py_rand_list))

        print("CREATING first RDD from python number list")
        number_rdd = spark.sparkContext.parallelize(py_rand_list, 3)
        number_even_rdd = number_rdd.filter(lambda n: n % 2 == 0)
        print(number_even_rdd.collect())

        # --- RDD from a Python list of strings -------------------------------
        py_str_list = ["Arun", "Aravib", "Arjun", "Ramamy"]
        print(py_str_list)
        str_rdd = spark.sparkContext.parallelize(py_str_list, 2)
        str_rdd_result = str_rdd.filter(lambda name: 'r' in name).collect()
        print(str_rdd_result)

        # --- RDD from a downloaded text file ---------------------------------
        # textFile() takes a path/URI string, not an open HTTP response, so
        # the body is downloaded here and its lines are parallelized instead.
        # try/finally rather than `with`: the Python 2 response object is not
        # a context manager.
        response = urlopen(NAMES_URL)
        try:
            raw = response.read()
        finally:
            response.close()

        # Preview the first 10 bytes without consuming them from the data.
        print(raw[:10])

        textpage_rdd = spark.sparkContext.parallelize(decode_lines(raw))
        textpage_rdd_a = textpage_rdd.filter(lambda name: "a" in name)
        result_a = textpage_rdd_a.collect()
        for name in result_a:
            print(name)
    finally:
        print("Stopping the spark session object")
        spark.stop()


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement