Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from pyspark import SparkContext
- from pyspark.sql import SQLContext
- from pyspark.sql.types import *
def doWork(rec, sql_ctx=None):
    """Query and print every zip_data row for one (STATEFP, COUNTYFP) pair.

    Parameters
    ----------
    rec : Row or tuple
        ``rec[0]`` is the STATEFP value, ``rec[1]`` the COUNTYFP value.
    sql_ctx : SQLContext, optional
        Context used to run the query.  Defaults to the module-level
        ``SQLContext`` name, which ``__main__`` binds to the live instance
        (kept for backward compatibility with the original call sites).

    Returns
    -------
    tuple
        ``(rec[0], rec[1])`` echoed back to the caller.

    NOTE(review): this function must run on the DRIVER.  A SQLContext
    cannot be serialized to executors, so passing ``doWork`` to
    ``rdd.map()`` fails at runtime; call it driver-side over the rows of
    ``df.collect()`` instead.
    """
    ctx = sql_ctx if sql_ctx is not None else SQLContext
    # Spark's SQLContext.sql() has no bind parameters, so the values are
    # interpolated into the SQL text.  Escape single quotes so a value
    # cannot break out of (or corrupt) the string literal.
    sfp = str(rec[0]).replace("'", "''")
    cfp = str(rec[1]).replace("'", "''")
    data = ctx.sql(
        "select * from zip_data where STATEFP ='{sfp}' "
        "and COUNTYFP = '{cfp}' ".format(sfp=sfp, cfp=cfp)
    )
    for item in data.collect():
        print(item)
        # do something
    return (rec[0], rec[1])
if __name__ == "__main__":
    # Driver entry point: enumerate distinct (state, county) pairs and run
    # a per-pair zip_data query for each one.
    sc = SparkContext(appName="Zip Disaggregation")
    print("Starting Zip Disaggregation")
    try:
        # NOTE: rebinding the class name to the instance is deliberate —
        # doWork() resolves the module-level name ``SQLContext`` at call
        # time, so the instance must live under that exact name for the
        # existing function to keep working.
        SQLContext = SQLContext(sc)
        parquetFile = SQLContext.read.parquet("/path/to/data/")
        # Expose the parquet data under the table name doWork() queries.
        parquetFile.registerTempTable("zip_data")
        df = SQLContext.sql(
            "select distinct STATEFP,COUNTYFP from zip_data "
            "where STATEFP IN ('12') "
        )
        # BUG FIX: the original ``df.map(doWork)`` shipped doWork to the
        # executors, where the SQLContext it uses is unavailable (contexts
        # cannot be serialized to workers).  The distinct pair set is
        # small, so collect it and run the queries on the driver instead.
        rslts = [doWork(row) for row in df.collect()]
        for rslt in rslts:
            print(rslt)
    finally:
        # Always release cluster resources, even if a query fails.
        sc.stop()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement