Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import regex as re
- from pyspark.sql import Row
- from pyspark.sql.functions import udf, when
- from pyspark.sql.types import BooleanType
# Accepted US-style phone number layouts. Each pattern is anchored and matches
# strings of exactly one length, shown by the example next to it.
formats = [
    r'^(?:\(\d{3}\)-)\d{3}-\d{4}$',  # (617)-283-3811  (14 chars)
    r'^(?:\d{3}-)\d{3}-\d{4}$',      # 617-283-3811    (12 chars)
    r'^(?:\d{3}-)\d{7}$',            # 617-2833811     (11 chars)
    r'^\d{10}$',                     # 6172833811      (10 chars)
]


def validate_format(number):
    """Check that *number* is written in one of the accepted phone formats.

    Args:
        number: The candidate phone number. Anything that is not a ``str``
            (including ``None``, which is how Spark hands SQL NULL to a UDF)
            is treated as invalid.

    Returns:
        ``True`` when *number* matches one of the patterns in ``formats``.

    Raises:
        ValueError: If *number* is not a string or matches none of the
            patterns in ``formats``.
    """
    # fullmatch (rather than match + "$") guarantees the whole string is
    # consumed, so a trailing newline can never slip through and no separate
    # length dispatch is needed.
    if isinstance(number, str) and any(
        re.fullmatch(pattern, number) for pattern in formats
    ):
        return True
    raise ValueError(
        f"not a recognised phone number format: {number!r}"
    )
def process_df(spark):
    """Build a three-row sample DataFrame and validate ``Column2`` with a UDF.

    The UDF wraps ``validate_format``. If validation succeeds for every row,
    ``Column2`` is shown next to the boolean result; if validation raises,
    ``Column2`` is dropped and the remaining columns are shown instead.

    Args:
        spark: The session object used to create the DataFrame (only
            ``spark.createDataFrame`` is called on it).

    Returns:
        None. The function only prints a table via ``DataFrame.show``.
    """
    # Function-scope import keeps this fix self-contained. An exception raised
    # inside a Python UDF happens in a worker process and reaches the driver
    # wrapped in PythonException, so catching ValueError alone never fires.
    from pyspark.sql.utils import PythonException

    my_cols = Row("Column1", "Column2", "Column3", "Column4")
    row_1 = my_cols('(617)-283-3811', 'Salah', 'Messi', None)
    row_2 = my_cols('617-2833811', 'Messi', 'Virgil', 'Messi')
    row_3 = my_cols('617-283-3811', 'Ronaldo', 'Messi', 'Ronaldo')
    row_seq = [row_1, row_2, row_3]
    df = spark.createDataFrame(row_seq)
    validate_format_udf = udf(validate_format, BooleanType())
    try:
        # NOTE(review): the phone-like strings live in Column1, while Column2
        # holds names, so this path always takes the fallback below with the
        # sample rows -- confirm Column2 is the intended target.
        df2 = df.select(
            "Column2",
            validate_format_udf("Column2").alias("something"),
        )
        # select() is lazy; the UDF (and any failure) only runs on show().
        df2.show()
    except (ValueError, PythonException):
        # DataFrame has no drop_column(); drop() is the column-removal API.
        df = df.drop("Column2")
        # show() prints the table itself and returns None, so it must not be
        # wrapped in print().
        df.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement