Advertisement
avisrivastava254084

Untitled

Sep 30th, 2019
138
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.42 KB | None | 0 0
  1. import regex as re
  2. from pyspark.sql import Row
  3. from pyspark.sql.functions import udf, when
  4. from pyspark.sql.types import BooleanType
  5.  
  6. formats = [r'^(?:\(\d{3}\)-)\d{3}-\d{4}$',
  7.            r'^(?:\d{3}-)\d{3}-\d{4}$', r'^(?:\d{3}-)\d{7}$', r'^\d{10}$']
  8.  
  9.  
  10. def validate_format(number):
  11.     length = len(number)
  12.     if length == 14:
  13.         if (re.match(formats[0], number)):
  14.             return True
  15.         raise ValueError
  16.     if length == 12:
  17.         if (re.match(formats[1], number)):
  18.             return True
  19.         raise ValueError
  20.     if length == 11:
  21.         if (re.match(formats[2], number)):
  22.             return True
  23.         raise ValueError
  24.     if length == 10:
  25.         if (re.match(formats[3], number)):
  26.             return True
  27.         raise ValueError
  28.     raise ValueError
  29.  
  30.  
  31. def process_df(spark):
  32.     my_cols = Row("Column1", "Column2", "Column3", "Column4")
  33.     row_1 = my_cols('(617)-283-3811', 'Salah', 'Messi', None)
  34.     row_2 = my_cols('617-2833811', 'Messi', 'Virgil', 'Messi')
  35.     row_3 = my_cols('617-283-3811', 'Ronaldo', 'Messi', 'Ronaldo')
  36.     row_seq = [row_1, row_2, row_3]
  37.     df = spark.createDataFrame(row_seq)
  38.     validate_format_udf = udf(validate_format, BooleanType())
  39.     try:
  40.         df2 = df.select("Column2", validate_format_udf(
  41.             "Column2").alias("something"))
  42.         df2.show()
  43.     except ValueError:
  44.         df = df.drop_column("Column2")
  45.     print(df.show())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement