Advertisement
Guest User

Untitled

a guest
Jan 29th, 2020
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.71 KB | None | 0 0
  1. from pyspark.sql import SparkSession
  2. from urllib import urlopen
  3. import random
  4.  
  5.  
  6. if __name__ == "__main__":
  7. print("Pyspark 101 tutorial Datamaking")
  8. print("Part 3 - How to create rdd with numbers , string and creating from reading text file in PySpark")
  9.  
  10. spark = SparkSession \
  11. .builder \
  12. .appName("Part 3 - How to create rdd with numbers , string and creating from reading text file in Pyspark using Pycharm IDE") \
  13. .master("local[*]") \
  14. .enableHiveSupport() \
  15. .getOrCreate() \
  16.  
  17.  
  18. randomlist = []
  19. for i in range(0, 100):
  20. n = random.randint(1, 100)
  21. randomlist.append(n)
  22. print(randomlist)
  23.  
  24. py_rand_list = randomlist
  25. print("PYTHON NUMBER LIST")
  26. print(py_rand_list)
  27. print(type(py_rand_list))
  28.  
  29. print("CREATING first RDD from python number list")
  30. number_rdd = spark.sparkContext.parallelize(py_rand_list, 3)
  31. number_even_rdd = number_rdd.filter(lambda n:n % 2 == 0)
  32. print(number_even_rdd.collect())
  33.  
  34. py_str_list =["Arun","Aravib","Arjun","Ramamy"]
  35. print(py_str_list)
  36. str_rdd = spark.sparkContext.parallelize(py_str_list, 2)
  37. str_rdd_result = str_rdd.filter(lambda name: 'r' in name).collect()
  38. print(str_rdd_result)
  39.  
  40. textpage = urlopen("https://raw.githubusercontent.com/philipperemy/name-dataset/master/names_dataset/first_names.all.txt")
  41. print(textpage.read(10))
  42. textpage_rdd = spark.sparkContext.textFile(textpage)
  43. textpage_rdd_a =textpage_rdd.filter(lambda name: "a" in name)
  44. result_a = textpage_rdd_a.collect()
  45. for names in result_a:
  46. print names
  47.  
  48.  
  49. print("Stopping the spark session object")
  50. spark.stop()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement