from pyspark.context import SparkContext
from pyspark.conf import SparkConf
import json
from datetime import datetime

import crawl


def char(c):
    # Debugging helper left over from development; unused below.
    print(c)
    return True


if __name__ == "__main__":
    conf = SparkConf()
    conf.setMaster("spark://192.168.1.104:7077")
    conf.setAppName("Crawl 3000 Urls")
    conf.set("spark.executor.memory", "6g")
    sc = SparkContext(conf=conf)

    # Ship the crawl module to the executors. addPyFile (rather than addFile)
    # also puts it on the executors' sys.path, so `import crawl` resolves there.
    sc.addPyFile("/home/louis/Desktop/Spark/crawl.py")

    with open("/home/louis/Downloads/urls.json", "r") as f:
        arr = json.load(f)
    print(len(arr))

    # Split the 3000 URLs into 10 batches of 300, one batch per partition.
    size = 300
    params = [arr[i * size:(i + 1) * size] for i in range(10)]
    print(len(params))

    urlData = sc.parallelize(params, numSlices=10)
    print("Number of partitions: {}".format(urlData.getNumPartitions()))
    print("Partitioner: {}".format(urlData.partitioner))
    print("Partitions structure: {}".format(urlData.glom().collect()))

    # Crawl each batch on its own partition and time the whole job.
    # Note: each element of urlData is a *batch* of URLs, not a single URL.
    print(datetime.now())
    ret = urlData.map(lambda urls: crawl.run(urls)).collect()
    print(ret)
    print(datetime.now())

    sc.stop()
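
# ---------------------------------------------------------------------------
# For reference: the script above depends on a sibling module crawl.py whose
# contents are not shown in this document. A minimal sketch of the interface
# it appears to assume -- run(urls) taking one batch of URLs and returning a
# picklable result per URL -- might look like the following. This is a
# hypothetical stand-in, not the author's actual crawler.
#
#   # crawl.py (hypothetical sketch)
#   import urllib.request
#
#   def run(urls):
#       """Fetch each URL in the batch; return (url, status-or-error) pairs."""
#       results = []
#       for url in urls:
#           try:
#               with urllib.request.urlopen(url, timeout=10) as resp:
#                   results.append((url, resp.getcode()))
#           except Exception as exc:
#               results.append((url, str(exc)))
#       return results
#
# Whatever run() returns must be picklable, since collect() serializes the
# results from the executors back to the driver.
# ---------------------------------------------------------------------------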