Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Build a per-talk transcript lookup from the TED talks XML dump.
#
# Each <transcription> element is read as one row whose `seekvideo` column is
# an array of structs with the fields `_id` and `_VALUE` (the segment text).
# The script ends with `transcription`, a dict mapping the row position of
# every talk that has segments to a list of (_id, _VALUE) tuples.
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id

### xml file from https://wit3.fbk.eu/
sc = SparkSession.builder.getOrCreate()
df = (
    sc.read.format("com.databricks.spark.xml")
    .option("rowTag", "transcription")
    .load('ted_en-20160408.xml')
)

# Pull both struct fields in a single select so the ids and texts of one talk
# stay on the same row. Splitting them into two DataFrames and re-joining on
# monotonically_increasing_id() is unnecessary, relies on both frames being
# numbered identically, and loses row order in the join.
result = df.select("seekvideo._id", "seekvideo._VALUE")
answer = result.toPandas()

transcription = dict()
for talk, (ids, texts) in enumerate(zip(answer["_id"], answer["_VALUE"])):
    # Skip talks without any segments. The explicit None/length test also
    # works when pandas hands back NumPy arrays, for which `not arr` raises
    # ValueError on multi-element arrays.
    if ids is None or len(ids) == 0:
        continue
    # Materialise the pairs: on Python 3 a bare zip() is a one-shot iterator.
    transcription[talk] = list(zip(ids, texts))
- DataFrame[_corrupt_record: string, seekvideo: array<struct<_VALUE:string,_id:bigint>>]
- [(800, u'When I moved to Harare in 1985,'),
- (4120,
- u"social justice was at the core of Zimbabwe's national health policy."),
- (8920, u'The new government emerged from a long war of independence'),
- (12640, u'and immediately proclaimed a socialist agenda:'),
- (15480, u'health care services, primary education'),
- ...
- ]
Add Comment
Please, Sign In to add comment