Guest User

Untitled

a guest
Oct 19th, 2018
94
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.17 KB | None | 0 0
  1. import pyspark
  2. from pyspark.sql import SparkSession
  3. from pyspark.sql.functions import monotonically_increasing_id
  4.  
  5. ### xml file from https://wit3.fbk.eu/
  # Load the TED XML dump, one row per <transcription> element. Per the printed
  # schema, each row carries `seekvideo`: an array of structs with a string
  # `_VALUE` and a bigint `_id`.
  sc = SparkSession.builder.getOrCreate()
  df = (
      sc.read.format("com.databricks.spark.xml")
      .option("rowTag", "transcription")
      .load('ted_en-20160408.xml')
  )

  # Project both nested fields in a single select so each row's ids and texts
  # stay paired by construction. Tagging two separate projections with
  # monotonically_increasing_id() and outer-joining them back costs a shuffle
  # and relies on both sides generating identical ids.
  result = df.select("seekvideo._VALUE", "seekvideo._id")
  answer = result.toPandas()

  # Map row position -> list of (_id, _VALUE) pairs for that row, skipping rows
  # that have no seekvideo entries (null or empty array).
  transcription = dict()
  for talk, (ids, values) in enumerate(zip(answer["_id"], answer["_VALUE"])):
      # Explicit None/length check: `not ids` is ambiguous if the cell comes
      # back as a NumPy array rather than a plain list.
      if ids is None or len(ids) == 0:
          continue
      # Materialize the pairs; a bare zip() is a one-shot iterator on Python 3.
      transcription[talk] = list(zip(ids, values))
  20.  
  21. DataFrame[_corrupt_record: string, seekvideo: array<struct<_VALUE:string,_id:bigint>>]
  22.  
  23. [(800, u'When I moved to Harare in 1985,'),
  24. (4120,
  25. u"social justice was at the core of Zimbabwe's national health policy."),
  26. (8920, u'The new government emerged from a long war of independence'),
  27. (12640, u'and immediately proclaimed a socialist agenda:'),
  28. (15480, u'health care services, primary education'),
  29. ...
  30. ]
Add Comment
Please, Sign In to add comment