Advertisement
chojuaib

Untitled

Jan 31st, 2020
188
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.72 KB | None | 0 0
  1. def find_groups(action_type, action_desc):
  2. """
  3. Finds cancellations, pickups, initial blocks and allotted blocks within
  4. UPDATE BLOCK GRID, UPDATE BLOCK HEADER and UPDATE BLOCK PICKUP.
  5.  
  6. Parameters
  7. ----------
  8. action_type : str
  9. Action type of UPDATE BLOCK GRID, UPDATE BLOCK HEADER and UPDATE BLOCK PICKUP.
  10. action_desc : str
  11. Action description from block journal.
  12.  
  13. Returns
  14. -------
  15. list
  16. List with matched groups from predefined regexes.
  17. """
  18.  
  19. subtypes = ['Initial Block', 'Allotted Block']
  20.  
  21. update_grid_regex = r"({})\s*-\s*(?:.+\s+)?(\w+)\s*-\s*(\d+-\d+-\d+)\s*(?:->\s*)?(\d+-\d+-\d+)?\s*(\d+)?\s*->\s*(\d+)?".format('|'.join(subtypes))
  22. update_grid_subtype_regex = r"({})".format('|'.join(subtypes))
  23. update_pickup_regex = r"\w+\s+\w+\s+(\w+)\s+-\s+(\d+-\d+-\d+)\s*(\d+)?\s*->\s*(\d+)?"
  24. update_grid_cancellation_regex = r"Cancellation\s+code"
  25.  
  26. matches = []
  27.  
  28. if action_type == 'UPDATE BLOCK GRID' and re.findall(update_grid_subtype_regex, action_desc):
  29. matches = re.findall(update_grid_regex, action_desc)
  30.  
  31. elif action_type == 'UPDATE BLOCK HEADER' and re.findall(update_grid_cancellation_regex, action_desc):
  32. matches = [("CANCELLED")] # leave it in redundant parentheses to match re.findall return structure
  33.  
  34. elif action_type == 'UPDATE BLOCK PICKUP':
  35. matches = re.findall(update_pickup_regex, action_desc)
  36. else:
  37. return []
  38.  
  39. if not matches:
  40. print("PARSE ERROR: [{}] --- [{}]".format(action_type, action_desc))
  41. return []
  42.  
  43. return matches[0]
  44.  
  45.  
  46. def prep_block_journal(df):
  47. """
  48. Parses and standardizes block journal.
  49.  
  50. Parameters
  51. ----------
  52. df : DataFrame
  53. Data frame with block journal logs.
  54.  
  55. Returns
  56. -------
  57. DataFrame
  58. Parsed and standardized block journal.
  59.  
  60. """
  61. df['INSERT_DATE'] = pd.to_datetime(df['INSERT_DATE'], format='%d.%m.%y', errors='coerce')
  62. df['CHANGE_DATE'] = pd.to_datetime(df['CHANGE_DATE'], format='%d.%m.%y', errors='coerce')
  63.  
  64. clause = (df['INSERT_DATE'].isna()) | (df['CHANGE_DATE'].isna())
  65. nulls_count = df[clause].shape[0]
  66.  
  67. if nulls_count > 0:
  68. print('Removed {} rows from dataset [Incorrect INSERT_DATE or CHANGE_DATE]'.format(nulls_count))
  69.  
  70. df = df[~clause].sort_values(['ALLOTMENT_HEADER_ID', 'INSERT_DATE', 'ACTION_ID'], ascending=True)
  71.  
  72. result = pd.DataFrame(
  73. columns=['RESORT', 'ALLOTMENT_HEADER_ID', 'ACTION_ID', 'ACTION_TYPE', 'INSERT_DATE', 'CHANGE_DATE', 'OLD_BOOK',
  74. 'NEW_BOOK', 'ACTION_SUBTYPE', 'ROOM', 'BUSINESS_DATE', 'CANCELLED'])
  75. column_names = result.columns
  76. unique_blocks = df.ALLOTMENT_HEADER_ID.unique()
  77.  
  78. for block_id in tqdm(unique_blocks):
  79. block_df = df[df['ALLOTMENT_HEADER_ID'] == block_id]
  80.  
  81. for row in block_df.iterrows():
  82. row = row[1]
  83.  
  84. groups = find_groups(row['ACTION_TYPE'], row['ACTION_DESC'])
  85.  
  86. if groups:
  87. resort = row['RESORT']
  88. block_id = row['ALLOTMENT_HEADER_ID']
  89. action_id = row['ACTION_ID']
  90. action_type = row['ACTION_TYPE']
  91. insert_date = row['INSERT_DATE']
  92. change_date = row['CHANGE_DATE']
  93.  
  94. old_book = '0' if groups[-2] == '' else groups[-2]
  95. new_book = '0' if groups[-1] == '' else groups[-1]
  96. cancellation = 0
  97.  
  98. if action_type == 'UPDATE BLOCK PICKUP':
  99. action_subtype = ''
  100. room = groups[0]
  101. business_date = groups[1]
  102.  
  103.  
  104. elif action_type == 'UPDATE BLOCK GRID':
  105. action_subtype = groups[0]
  106. room = groups[1]
  107. business_date = [x for x in [groups[2], groups[3]] if x]
  108.  
  109. if len(business_date) == 1:
  110. business_date = business_date[0]
  111. if len(business_date) == 2:
  112. business_date = [d.strftime('%Y-%m-%d') for d in
  113. pd.date_range(start=business_date[0], end=business_date[1])]
  114.  
  115. elif 'CANCELLED' in groups:
  116. old_book = ''
  117. new_book = ''
  118. action_subtype = ''
  119. room = ''
  120. business_date = ''
  121. cancellation = 1
  122.  
  123. data = [resort, block_id, action_id, action_type, insert_date, change_date, old_book, new_book,
  124. action_subtype, room, business_date, cancellation]
  125. temp_df = pd.DataFrame([data], columns=column_names)
  126. temp_df = temp_df.explode('BUSINESS_DATE')
  127.  
  128. result = result.append(temp_df, sort=False)
  129.  
  130. return result
  131. RAW Paste Data
  132. def find_groups(action_type, action_desc):
  133. """
  134. Finds cancellations, pickups, initial blocks and allotted blocks within
  135. UPDATE BLOCK GRID, UPDATE BLOCK HEADER and UPDATE BLOCK PICKUP.
  136.  
  137. Parameters
  138. ----------
  139. action_type : str
  140. Action type of UPDATE BLOCK GRID, UPDATE BLOCK HEADER and UPDATE BLOCK PICKUP.
  141. action_desc : str
  142. Action description from block journal.
  143.  
  144. Returns
  145. -------
  146. list
  147. List with matched groups from predefined regexes.
  148. """
  149.  
  150. subtypes = ['Initial Block', 'Allotted Block']
  151.  
  152. update_grid_regex = r"({})\s*-\s*(?:.+\s+)?(\w+)\s*-\s*(\d+-\d+-\d+)\s*(?:->\s*)?(\d+-\d+-\d+)?\s*(\d+)?\s*->\s*(\d+)?".format('|'.join(subtypes))
  153. update_grid_subtype_regex = r"({})".format('|'.join(subtypes))
  154. update_pickup_regex = r"\w+\s+\w+\s+(\w+)\s+-\s+(\d+-\d+-\d+)\s*(\d+)?\s*->\s*(\d+)?"
  155. update_grid_cancellation_regex = r"Cancellation\s+code"
  156.  
  157. matches = []
  158.  
  159. if action_type == 'UPDATE BLOCK GRID' and re.findall(update_grid_subtype_regex, action_desc):
  160. matches = re.findall(update_grid_regex, action_desc)
  161.  
  162. elif action_type == 'UPDATE BLOCK HEADER' and re.findall(update_grid_cancellation_regex, action_desc):
  163. matches = [("CANCELLED")] # leave it in redundant parentheses to match re.findall return structure
  164.  
  165. elif action_type == 'UPDATE BLOCK PICKUP':
  166. matches = re.findall(update_pickup_regex, action_desc)
  167. else:
  168. return []
  169.  
  170. if not matches:
  171. print("PARSE ERROR: [{}] --- [{}]".format(action_type, action_desc))
  172. return []
  173.  
  174. return matches[0]
  175.  
  176.  
  177. def prep_block_journal(df):
  178. """
  179. Parses and standardizes block journal.
  180.  
  181. Parameters
  182. ----------
  183. df : DataFrame
  184. Data frame with block journal logs.
  185.  
  186. Returns
  187. -------
  188. DataFrame
  189. Parsed and standardized block journal.
  190.  
  191. """
  192. df['INSERT_DATE'] = pd.to_datetime(df['INSERT_DATE'], format='%d.%m.%y', errors='coerce')
  193. df['CHANGE_DATE'] = pd.to_datetime(df['CHANGE_DATE'], format='%d.%m.%y', errors='coerce')
  194.  
  195. clause = (df['INSERT_DATE'].isna()) | (df['CHANGE_DATE'].isna())
  196. nulls_count = df[clause].shape[0]
  197.  
  198. if nulls_count > 0:
  199. print('Removed {} rows from dataset [Incorrect INSERT_DATE or CHANGE_DATE]'.format(nulls_count))
  200.  
  201. df = df[~clause].sort_values(['ALLOTMENT_HEADER_ID', 'INSERT_DATE', 'ACTION_ID'], ascending=True)
  202.  
  203. result = pd.DataFrame(
  204. columns=['RESORT', 'ALLOTMENT_HEADER_ID', 'ACTION_ID', 'ACTION_TYPE', 'INSERT_DATE', 'CHANGE_DATE', 'OLD_BOOK',
  205. 'NEW_BOOK', 'ACTION_SUBTYPE', 'ROOM', 'BUSINESS_DATE', 'CANCELLED'])
  206. column_names = result.columns
  207. unique_blocks = df.ALLOTMENT_HEADER_ID.unique()
  208.  
  209. for block_id in tqdm(unique_blocks):
  210. block_df = df[df['ALLOTMENT_HEADER_ID'] == block_id]
  211.  
  212. for row in block_df.iterrows():
  213. row = row[1]
  214.  
  215. groups = find_groups(row['ACTION_TYPE'], row['ACTION_DESC'])
  216.  
  217. if groups:
  218. resort = row['RESORT']
  219. block_id = row['ALLOTMENT_HEADER_ID']
  220. action_id = row['ACTION_ID']
  221. action_type = row['ACTION_TYPE']
  222. insert_date = row['INSERT_DATE']
  223. change_date = row['CHANGE_DATE']
  224.  
  225. old_book = '0' if groups[-2] == '' else groups[-2]
  226. new_book = '0' if groups[-1] == '' else groups[-1]
  227. cancellation = 0
  228.  
  229. if action_type == 'UPDATE BLOCK PICKUP':
  230. action_subtype = ''
  231. room = groups[0]
  232. business_date = groups[1]
  233.  
  234.  
  235. elif action_type == 'UPDATE BLOCK GRID':
  236. action_subtype = groups[0]
  237. room = groups[1]
  238. business_date = [x for x in [groups[2], groups[3]] if x]
  239.  
  240. if len(business_date) == 1:
  241. business_date = business_date[0]
  242. if len(business_date) == 2:
  243. business_date = [d.strftime('%Y-%m-%d') for d in
  244. pd.date_range(start=business_date[0], end=business_date[1])]
  245.  
  246. elif 'CANCELLED' in groups:
  247. old_book = ''
  248. new_book = ''
  249. action_subtype = ''
  250. room = ''
  251. business_date = ''
  252. cancellation = 1
  253.  
  254. data = [resort, block_id, action_id, action_type, insert_date, change_date, old_book, new_book,
  255. action_subtype, room, business_date, cancellation]
  256. temp_df = pd.DataFrame([data], columns=column_names)
  257. temp_df = temp_df.explode('BUSINESS_DATE')
  258.  
  259. result = result.append(temp_df, sort=False)
  260.  
  261. return result
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement