Advertisement
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
def find_groups(action_type, action_desc):
    """
    Extract the regex groups describing a block journal action.

    Only three kinds of action are inspected:

    * UPDATE BLOCK GRID whose description names an 'Initial Block' or
      'Allotted Block' subtype,
    * UPDATE BLOCK HEADER whose description mentions a cancellation code,
    * UPDATE BLOCK PICKUP.

    Every other combination yields an empty list without any message.

    Parameters
    ----------
    action_type : str
        Action type from block journal.
    action_desc : str
        Action description from block journal. A non-string value (for
        example None or a float NaN) is treated as an empty description
        instead of raising TypeError inside ``re``.

    Returns
    -------
    tuple, str or list
        * UPDATE BLOCK GRID match: 6-tuple of strings
          ``(subtype, word, date, second date, digits before '->',
          digits after '->')``.
        * UPDATE BLOCK PICKUP match: 4-tuple of strings
          ``(word, date, digits before '->', digits after '->')``.
        * UPDATE BLOCK HEADER cancellation: the bare string 'CANCELLED'.
        * Empty list when nothing applies or the description cannot be
          parsed (a PARSE ERROR line is printed in the latter case).

        Optional groups that did not participate in the match are ''.
    """
    subtypes = ['Initial Block', 'Allotted Block']
    subtype_alternation = '|'.join(subtypes)
    update_grid_regex = r"({})\s*-\s*(?:.+\s+)?(\w+)\s*-\s*(\d+-\d+-\d+)\s*(?:->\s*)?(\d+-\d+-\d+)?\s*(\d+)?\s*->\s*(\d+)?".format(subtype_alternation)
    update_grid_subtype_regex = r"({})".format(subtype_alternation)
    update_pickup_regex = r"\w+\s+\w+\s+(\w+)\s+-\s+(\d+-\d+-\d+)\s*(\d+)?\s*->\s*(\d+)?"
    update_grid_cancellation_regex = r"Cancellation\s+code"

    # Missing descriptions arrive as None/NaN; searching '' keeps the
    # per-type fallbacks below working instead of crashing the whole parse.
    text = action_desc if isinstance(action_desc, str) else ''

    if action_type == 'UPDATE BLOCK GRID' and re.search(update_grid_subtype_regex, text):
        match = re.search(update_grid_regex, text)
    elif action_type == 'UPDATE BLOCK HEADER' and re.search(update_grid_cancellation_regex, text):
        # Deliberately a plain string, not a 1-tuple: prep_block_journal
        # indexes groups[-2] before it checks for a cancellation, which
        # would fail on a single-element tuple.
        return 'CANCELLED'
    elif action_type == 'UPDATE BLOCK PICKUP':
        match = re.search(update_pickup_regex, text)
    else:
        return []

    if match is None:
        print("PARSE ERROR: [{}] --- [{}]".format(action_type, action_desc))
        return []

    # Same tuple re.findall(...)[0] would give: first match, '' for
    # optional groups that did not take part.
    return match.groups(default='')
def prep_block_journal(df):
    """
    Parse and standardize the block journal.

    INSERT_DATE and CHANGE_DATE are parsed with the ``%d.%m.%y`` format;
    rows where either fails to parse are dropped (the count is printed).
    The remaining rows are ordered by ALLOTMENT_HEADER_ID, INSERT_DATE and
    ACTION_ID and each one is passed to ``find_groups``. Rows for which
    ``find_groups`` returns something produce output rows; an
    UPDATE BLOCK GRID row that carries two dates produces one output row
    per day of ``pd.date_range`` between them.

    Parameters
    ----------
    df : DataFrame
        Data frame with block journal logs. Columns read: RESORT,
        ALLOTMENT_HEADER_ID, ACTION_ID, ACTION_TYPE, ACTION_DESC,
        INSERT_DATE, CHANGE_DATE. The frame passed in is not modified.

    Returns
    -------
    DataFrame
        Parsed and standardized block journal with columns RESORT,
        ALLOTMENT_HEADER_ID, ACTION_ID, ACTION_TYPE, INSERT_DATE,
        CHANGE_DATE, OLD_BOOK, NEW_BOOK, ACTION_SUBTYPE, ROOM,
        BUSINESS_DATE, CANCELLED. The index is not reset: each row keeps
        the label 0 of the one-row frame it was built from. An empty
        frame with these columns is returned when no row could be parsed.
    """
    column_names = ['RESORT', 'ALLOTMENT_HEADER_ID', 'ACTION_ID', 'ACTION_TYPE', 'INSERT_DATE', 'CHANGE_DATE',
                    'OLD_BOOK', 'NEW_BOOK', 'ACTION_SUBTYPE', 'ROOM', 'BUSINESS_DATE', 'CANCELLED']

    # assign() builds a new frame, so the caller's DataFrame keeps its
    # original (unparsed) date columns.
    df = df.assign(
        INSERT_DATE=pd.to_datetime(df['INSERT_DATE'], format='%d.%m.%y', errors='coerce'),
        CHANGE_DATE=pd.to_datetime(df['CHANGE_DATE'], format='%d.%m.%y', errors='coerce'),
    )

    clause = df['INSERT_DATE'].isna() | df['CHANGE_DATE'].isna()
    nulls_count = int(clause.sum())
    if nulls_count > 0:
        print('Removed {} rows from dataset [Incorrect INSERT_DATE or CHANGE_DATE]'.format(nulls_count))
    df = df[~clause].sort_values(['ALLOTMENT_HEADER_ID', 'INSERT_DATE', 'ACTION_ID'], ascending=True)

    frames = []
    # sort=False keeps the blocks in the order established by sort_values;
    # grouping replaces a full-frame boolean filter per block id. Rows with
    # a missing ALLOTMENT_HEADER_ID are skipped, as the equality filter did.
    for _, block_df in tqdm(df.groupby('ALLOTMENT_HEADER_ID', sort=False)):
        for _, row in block_df.iterrows():
            groups = find_groups(row['ACTION_TYPE'], row['ACTION_DESC'])
            if not groups:
                continue

            action_type = row['ACTION_TYPE']
            cancellation = 0
            if action_type == 'UPDATE BLOCK PICKUP':
                old_book = '0' if groups[-2] == '' else groups[-2]
                new_book = '0' if groups[-1] == '' else groups[-1]
                action_subtype = ''
                room = groups[0]
                business_date = groups[1]
            elif action_type == 'UPDATE BLOCK GRID':
                old_book = '0' if groups[-2] == '' else groups[-2]
                new_book = '0' if groups[-1] == '' else groups[-1]
                action_subtype = groups[0]
                room = groups[1]
                business_date = [x for x in [groups[2], groups[3]] if x]
                if len(business_date) == 1:
                    business_date = business_date[0]
                elif len(business_date) == 2:
                    # Two dates: expand into every day of the range so
                    # explode() below yields one row per business date.
                    business_date = [d.strftime('%Y-%m-%d') for d in
                                     pd.date_range(start=business_date[0], end=business_date[1])]
            elif 'CANCELLED' in groups:
                old_book = ''
                new_book = ''
                action_subtype = ''
                room = ''
                business_date = ''
                cancellation = 1
            else:
                # Nothing recognisable; skip instead of reusing values
                # left over from the previous row.
                continue

            data = [row['RESORT'], row['ALLOTMENT_HEADER_ID'], row['ACTION_ID'], action_type,
                    row['INSERT_DATE'], row['CHANGE_DATE'], old_book, new_book,
                    action_subtype, room, business_date, cancellation]
            temp_df = pd.DataFrame([data], columns=column_names)
            frames.append(temp_df.explode('BUSINESS_DATE'))

    if not frames:
        # pd.concat rejects an empty list.
        return pd.DataFrame(columns=column_names)

    # One concat at the end: DataFrame.append no longer exists in
    # pandas >= 2.0 and re-copied the whole result on every row.
    return pd.concat(frames, sort=False)
- RAW Paste Data
def find_groups(action_type, action_desc):
    """
    Extract the regex groups describing a block journal action.

    Only three kinds of action are inspected:

    * UPDATE BLOCK GRID whose description names an 'Initial Block' or
      'Allotted Block' subtype,
    * UPDATE BLOCK HEADER whose description mentions a cancellation code,
    * UPDATE BLOCK PICKUP.

    Every other combination yields an empty list without any message.

    Parameters
    ----------
    action_type : str
        Action type from block journal.
    action_desc : str
        Action description from block journal. A non-string value (for
        example None or a float NaN) is treated as an empty description
        instead of raising TypeError inside ``re``.

    Returns
    -------
    tuple, str or list
        * UPDATE BLOCK GRID match: 6-tuple of strings
          ``(subtype, word, date, second date, digits before '->',
          digits after '->')``.
        * UPDATE BLOCK PICKUP match: 4-tuple of strings
          ``(word, date, digits before '->', digits after '->')``.
        * UPDATE BLOCK HEADER cancellation: the bare string 'CANCELLED'.
        * Empty list when nothing applies or the description cannot be
          parsed (a PARSE ERROR line is printed in the latter case).

        Optional groups that did not participate in the match are ''.
    """
    subtypes = ['Initial Block', 'Allotted Block']
    subtype_alternation = '|'.join(subtypes)
    update_grid_regex = r"({})\s*-\s*(?:.+\s+)?(\w+)\s*-\s*(\d+-\d+-\d+)\s*(?:->\s*)?(\d+-\d+-\d+)?\s*(\d+)?\s*->\s*(\d+)?".format(subtype_alternation)
    update_grid_subtype_regex = r"({})".format(subtype_alternation)
    update_pickup_regex = r"\w+\s+\w+\s+(\w+)\s+-\s+(\d+-\d+-\d+)\s*(\d+)?\s*->\s*(\d+)?"
    update_grid_cancellation_regex = r"Cancellation\s+code"

    # Missing descriptions arrive as None/NaN; searching '' keeps the
    # per-type fallbacks below working instead of crashing the whole parse.
    text = action_desc if isinstance(action_desc, str) else ''

    if action_type == 'UPDATE BLOCK GRID' and re.search(update_grid_subtype_regex, text):
        match = re.search(update_grid_regex, text)
    elif action_type == 'UPDATE BLOCK HEADER' and re.search(update_grid_cancellation_regex, text):
        # Deliberately a plain string, not a 1-tuple: prep_block_journal
        # indexes groups[-2] before it checks for a cancellation, which
        # would fail on a single-element tuple.
        return 'CANCELLED'
    elif action_type == 'UPDATE BLOCK PICKUP':
        match = re.search(update_pickup_regex, text)
    else:
        return []

    if match is None:
        print("PARSE ERROR: [{}] --- [{}]".format(action_type, action_desc))
        return []

    # Same tuple re.findall(...)[0] would give: first match, '' for
    # optional groups that did not take part.
    return match.groups(default='')
def prep_block_journal(df):
    """
    Parse and standardize the block journal.

    INSERT_DATE and CHANGE_DATE are parsed with the ``%d.%m.%y`` format;
    rows where either fails to parse are dropped (the count is printed).
    The remaining rows are ordered by ALLOTMENT_HEADER_ID, INSERT_DATE and
    ACTION_ID and each one is passed to ``find_groups``. Rows for which
    ``find_groups`` returns something produce output rows; an
    UPDATE BLOCK GRID row that carries two dates produces one output row
    per day of ``pd.date_range`` between them.

    Parameters
    ----------
    df : DataFrame
        Data frame with block journal logs. Columns read: RESORT,
        ALLOTMENT_HEADER_ID, ACTION_ID, ACTION_TYPE, ACTION_DESC,
        INSERT_DATE, CHANGE_DATE. The frame passed in is not modified.

    Returns
    -------
    DataFrame
        Parsed and standardized block journal with columns RESORT,
        ALLOTMENT_HEADER_ID, ACTION_ID, ACTION_TYPE, INSERT_DATE,
        CHANGE_DATE, OLD_BOOK, NEW_BOOK, ACTION_SUBTYPE, ROOM,
        BUSINESS_DATE, CANCELLED. The index is not reset: each row keeps
        the label 0 of the one-row frame it was built from. An empty
        frame with these columns is returned when no row could be parsed.
    """
    column_names = ['RESORT', 'ALLOTMENT_HEADER_ID', 'ACTION_ID', 'ACTION_TYPE', 'INSERT_DATE', 'CHANGE_DATE',
                    'OLD_BOOK', 'NEW_BOOK', 'ACTION_SUBTYPE', 'ROOM', 'BUSINESS_DATE', 'CANCELLED']

    # assign() builds a new frame, so the caller's DataFrame keeps its
    # original (unparsed) date columns.
    df = df.assign(
        INSERT_DATE=pd.to_datetime(df['INSERT_DATE'], format='%d.%m.%y', errors='coerce'),
        CHANGE_DATE=pd.to_datetime(df['CHANGE_DATE'], format='%d.%m.%y', errors='coerce'),
    )

    clause = df['INSERT_DATE'].isna() | df['CHANGE_DATE'].isna()
    nulls_count = int(clause.sum())
    if nulls_count > 0:
        print('Removed {} rows from dataset [Incorrect INSERT_DATE or CHANGE_DATE]'.format(nulls_count))
    df = df[~clause].sort_values(['ALLOTMENT_HEADER_ID', 'INSERT_DATE', 'ACTION_ID'], ascending=True)

    frames = []
    # sort=False keeps the blocks in the order established by sort_values;
    # grouping replaces a full-frame boolean filter per block id. Rows with
    # a missing ALLOTMENT_HEADER_ID are skipped, as the equality filter did.
    for _, block_df in tqdm(df.groupby('ALLOTMENT_HEADER_ID', sort=False)):
        for _, row in block_df.iterrows():
            groups = find_groups(row['ACTION_TYPE'], row['ACTION_DESC'])
            if not groups:
                continue

            action_type = row['ACTION_TYPE']
            cancellation = 0
            if action_type == 'UPDATE BLOCK PICKUP':
                old_book = '0' if groups[-2] == '' else groups[-2]
                new_book = '0' if groups[-1] == '' else groups[-1]
                action_subtype = ''
                room = groups[0]
                business_date = groups[1]
            elif action_type == 'UPDATE BLOCK GRID':
                old_book = '0' if groups[-2] == '' else groups[-2]
                new_book = '0' if groups[-1] == '' else groups[-1]
                action_subtype = groups[0]
                room = groups[1]
                business_date = [x for x in [groups[2], groups[3]] if x]
                if len(business_date) == 1:
                    business_date = business_date[0]
                elif len(business_date) == 2:
                    # Two dates: expand into every day of the range so
                    # explode() below yields one row per business date.
                    business_date = [d.strftime('%Y-%m-%d') for d in
                                     pd.date_range(start=business_date[0], end=business_date[1])]
            elif 'CANCELLED' in groups:
                old_book = ''
                new_book = ''
                action_subtype = ''
                room = ''
                business_date = ''
                cancellation = 1
            else:
                # Nothing recognisable; skip instead of reusing values
                # left over from the previous row.
                continue

            data = [row['RESORT'], row['ALLOTMENT_HEADER_ID'], row['ACTION_ID'], action_type,
                    row['INSERT_DATE'], row['CHANGE_DATE'], old_book, new_book,
                    action_subtype, room, business_date, cancellation]
            temp_df = pd.DataFrame([data], columns=column_names)
            frames.append(temp_df.explode('BUSINESS_DATE'))

    if not frames:
        # pd.concat rejects an empty list.
        return pd.DataFrame(columns=column_names)

    # One concat at the end: DataFrame.append no longer exists in
    # pandas >= 2.0 and re-copied the whole result on every row.
    return pd.concat(frames, sort=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement