Untitled

def find_groups(action_type, action_desc):
    """
    Extracts the relevant pieces of a block journal action description.

    Three kinds of entries are parsed: UPDATE BLOCK GRID entries mentioning
    'Initial Block' or 'Allotted Block', UPDATE BLOCK HEADER entries
    mentioning a cancellation code, and every UPDATE BLOCK PICKUP entry.
    Any other entry is ignored without a message.

    Parameters
    ----------
    action_type : str
        Action type of the journal entry.
    action_desc : str
        Action description from block journal.

    Returns
    -------
    tuple, str or list
        * UPDATE BLOCK GRID: groups of the first match, i.e.
          ``(subtype, word before the date, first date, second date,
          number before '->', number after '->')``.
        * UPDATE BLOCK PICKUP: groups of the first match, i.e.
          ``(word before the date, date, number before '->',
          number after '->')``.
        * UPDATE BLOCK HEADER with a cancellation code: the plain string
          ``"CANCELLED"`` (a string, not a tuple).
        * ``[]`` when the entry is ignored, or when it should have been
          parsable but the pattern did not match (a PARSE ERROR line is
          printed in that case).

        Optional groups that did not take part in the match are returned as
        empty strings.
    """
    subtype_alternation = '|'.join(['Initial Block', 'Allotted Block'])

    # Decide which pattern applies; bail out early for entries we do not parse.
    if action_type == 'UPDATE BLOCK GRID' and re.search(r"({})".format(subtype_alternation), action_desc):
        pattern = r"({})\s*-\s*(?:.+\s+)?(\w+)\s*-\s*(\d+-\d+-\d+)\s*(?:->\s*)?(\d+-\d+-\d+)?\s*(\d+)?\s*->\s*(\d+)?".format(subtype_alternation)
    elif action_type == 'UPDATE BLOCK HEADER' and re.search(r"Cancellation\s+code", action_desc):
        # A cancellation carries no groups; callers receive this marker string.
        return "CANCELLED"
    elif action_type == 'UPDATE BLOCK PICKUP':
        pattern = r"\w+\s+\w+\s+(\w+)\s+-\s+(\d+-\d+-\d+)\s*(\d+)?\s*->\s*(\d+)?"
    else:
        return []

    found = re.findall(pattern, action_desc)
    if found:
        # Only the first occurrence within the description is used.
        return found[0]

    print("PARSE ERROR: [{}] --- [{}]".format(action_type, action_desc))
    return []


def prep_block_journal(df):
    """
    Parses and standardizes block journal.

    Every journal entry recognised by ``find_groups`` becomes one output row.
    An UPDATE BLOCK GRID entry that names two dates becomes one row per
    calendar day between them (both ends included).

    Parameters
    ----------
    df : DataFrame
        Data frame with block journal logs. The columns RESORT,
        ALLOTMENT_HEADER_ID, ACTION_ID, ACTION_TYPE, ACTION_DESC, INSERT_DATE
        and CHANGE_DATE are read. INSERT_DATE and CHANGE_DATE are converted
        to datetimes in place (format ``%d.%m.%y``), so the caller's frame is
        modified.

    Returns
    -------
    DataFrame
        Parsed and standardized block journal with the columns RESORT,
        ALLOTMENT_HEADER_ID, ACTION_ID, ACTION_TYPE, INSERT_DATE, CHANGE_DATE,
        OLD_BOOK, NEW_BOOK, ACTION_SUBTYPE, ROOM, BUSINESS_DATE and CANCELLED.
        Rows are ordered by ALLOTMENT_HEADER_ID, INSERT_DATE and ACTION_ID and
        carry a consecutive integer index.

    Notes
    -----
    Rows whose INSERT_DATE or CHANGE_DATE cannot be parsed are dropped and the
    number of dropped rows is printed. BUSINESS_DATE holds the date text
    exactly as found in the description for single-date entries, and
    ``%Y-%m-%d`` strings for expanded date ranges.
    """
    column_names = ['RESORT', 'ALLOTMENT_HEADER_ID', 'ACTION_ID', 'ACTION_TYPE', 'INSERT_DATE', 'CHANGE_DATE',
                    'OLD_BOOK', 'NEW_BOOK', 'ACTION_SUBTYPE', 'ROOM', 'BUSINESS_DATE', 'CANCELLED']

    # NOTE: kept as an in-place conversion for backward compatibility with
    # callers that rely on the input frame holding datetimes afterwards.
    df['INSERT_DATE'] = pd.to_datetime(df['INSERT_DATE'], format='%d.%m.%y', errors='coerce')
    df['CHANGE_DATE'] = pd.to_datetime(df['CHANGE_DATE'], format='%d.%m.%y', errors='coerce')

    invalid_dates = df['INSERT_DATE'].isna() | df['CHANGE_DATE'].isna()
    nulls_count = int(invalid_dates.sum())

    if nulls_count > 0:
        print('Removed {} rows from dataset [Incorrect INSERT_DATE or CHANGE_DATE]'.format(nulls_count))

    df = df[~invalid_dates].sort_values(['ALLOTMENT_HEADER_ID', 'INSERT_DATE', 'ACTION_ID'], ascending=True)

    # Collect plain rows and build the frame once at the end: appending to a
    # DataFrame inside the loop is quadratic and DataFrame.append no longer
    # exists in pandas 2.x.
    rows = []

    # sort=False keeps the blocks in the order established by sort_values;
    # the progress bar still advances once per block.
    for _, block_df in tqdm(df.groupby('ALLOTMENT_HEADER_ID', sort=False)):
        for _, row in block_df.iterrows():
            action_type = row['ACTION_TYPE']
            groups = find_groups(action_type, row['ACTION_DESC'])

            if not groups:
                continue

            if action_type == 'UPDATE BLOCK PICKUP':
                action_subtype = ''
                room = groups[0]
                business_dates = [groups[1]]
                # A missing count in the description is treated as zero.
                old_book = groups[-2] or '0'
                new_book = groups[-1] or '0'
                cancellation = 0

            elif action_type == 'UPDATE BLOCK GRID':
                action_subtype = groups[0]
                room = groups[1]
                old_book = groups[-2] or '0'
                new_book = groups[-1] or '0'
                cancellation = 0

                bounds = [d for d in (groups[2], groups[3]) if d]
                if len(bounds) == 2:
                    business_dates = [d.strftime('%Y-%m-%d') for d in
                                      pd.date_range(start=bounds[0], end=bounds[1])]
                    if not business_dates:
                        # Reversed range: keep a single row with a missing
                        # date, as exploding an empty list used to produce.
                        business_dates = [float('nan')]
                else:
                    business_dates = bounds

            elif 'CANCELLED' in groups:
                # Cancellations carry no booking details. The check works for
                # both a marker string and a sequence containing the marker.
                old_book = ''
                new_book = ''
                action_subtype = ''
                room = ''
                business_dates = ['']
                cancellation = 1

            else:
                # Unknown result shape: skip rather than emit values left over
                # from a previous row.
                continue

            for business_date in business_dates:
                rows.append([row['RESORT'], row['ALLOTMENT_HEADER_ID'], row['ACTION_ID'], action_type,
                             row['INSERT_DATE'], row['CHANGE_DATE'], old_book, new_book,
                             action_subtype, room, business_date, cancellation])

    return pd.DataFrame(rows, columns=column_names)
RAW Paste Data
def find_groups(action_type, action_desc):
    """
    Extracts the relevant pieces of a block journal action description.

    Three kinds of entries are parsed: UPDATE BLOCK GRID entries mentioning
    'Initial Block' or 'Allotted Block', UPDATE BLOCK HEADER entries
    mentioning a cancellation code, and every UPDATE BLOCK PICKUP entry.
    Any other entry is ignored without a message.

    Parameters
    ----------
    action_type : str
        Action type of the journal entry.
    action_desc : str
        Action description from block journal.

    Returns
    -------
    tuple, str or list
        * UPDATE BLOCK GRID: groups of the first match, i.e.
          ``(subtype, word before the date, first date, second date,
          number before '->', number after '->')``.
        * UPDATE BLOCK PICKUP: groups of the first match, i.e.
          ``(word before the date, date, number before '->',
          number after '->')``.
        * UPDATE BLOCK HEADER with a cancellation code: the plain string
          ``"CANCELLED"`` (a string, not a tuple).
        * ``[]`` when the entry is ignored, or when it should have been
          parsable but the pattern did not match (a PARSE ERROR line is
          printed in that case).

        Optional groups that did not take part in the match are returned as
        empty strings.
    """
    subtype_alternation = '|'.join(['Initial Block', 'Allotted Block'])

    # Decide which pattern applies; bail out early for entries we do not parse.
    if action_type == 'UPDATE BLOCK GRID' and re.search(r"({})".format(subtype_alternation), action_desc):
        pattern = r"({})\s*-\s*(?:.+\s+)?(\w+)\s*-\s*(\d+-\d+-\d+)\s*(?:->\s*)?(\d+-\d+-\d+)?\s*(\d+)?\s*->\s*(\d+)?".format(subtype_alternation)
    elif action_type == 'UPDATE BLOCK HEADER' and re.search(r"Cancellation\s+code", action_desc):
        # A cancellation carries no groups; callers receive this marker string.
        return "CANCELLED"
    elif action_type == 'UPDATE BLOCK PICKUP':
        pattern = r"\w+\s+\w+\s+(\w+)\s+-\s+(\d+-\d+-\d+)\s*(\d+)?\s*->\s*(\d+)?"
    else:
        return []

    found = re.findall(pattern, action_desc)
    if found:
        # Only the first occurrence within the description is used.
        return found[0]

    print("PARSE ERROR: [{}] --- [{}]".format(action_type, action_desc))
    return []


def prep_block_journal(df):
    """
    Parses and standardizes block journal.

    Every journal entry recognised by ``find_groups`` becomes one output row.
    An UPDATE BLOCK GRID entry that names two dates becomes one row per
    calendar day between them (both ends included).

    Parameters
    ----------
    df : DataFrame
        Data frame with block journal logs. The columns RESORT,
        ALLOTMENT_HEADER_ID, ACTION_ID, ACTION_TYPE, ACTION_DESC, INSERT_DATE
        and CHANGE_DATE are read. INSERT_DATE and CHANGE_DATE are converted
        to datetimes in place (format ``%d.%m.%y``), so the caller's frame is
        modified.

    Returns
    -------
    DataFrame
        Parsed and standardized block journal with the columns RESORT,
        ALLOTMENT_HEADER_ID, ACTION_ID, ACTION_TYPE, INSERT_DATE, CHANGE_DATE,
        OLD_BOOK, NEW_BOOK, ACTION_SUBTYPE, ROOM, BUSINESS_DATE and CANCELLED.
        Rows are ordered by ALLOTMENT_HEADER_ID, INSERT_DATE and ACTION_ID and
        carry a consecutive integer index.

    Notes
    -----
    Rows whose INSERT_DATE or CHANGE_DATE cannot be parsed are dropped and the
    number of dropped rows is printed. BUSINESS_DATE holds the date text
    exactly as found in the description for single-date entries, and
    ``%Y-%m-%d`` strings for expanded date ranges.
    """
    column_names = ['RESORT', 'ALLOTMENT_HEADER_ID', 'ACTION_ID', 'ACTION_TYPE', 'INSERT_DATE', 'CHANGE_DATE',
                    'OLD_BOOK', 'NEW_BOOK', 'ACTION_SUBTYPE', 'ROOM', 'BUSINESS_DATE', 'CANCELLED']

    # NOTE: kept as an in-place conversion for backward compatibility with
    # callers that rely on the input frame holding datetimes afterwards.
    df['INSERT_DATE'] = pd.to_datetime(df['INSERT_DATE'], format='%d.%m.%y', errors='coerce')
    df['CHANGE_DATE'] = pd.to_datetime(df['CHANGE_DATE'], format='%d.%m.%y', errors='coerce')

    invalid_dates = df['INSERT_DATE'].isna() | df['CHANGE_DATE'].isna()
    nulls_count = int(invalid_dates.sum())

    if nulls_count > 0:
        print('Removed {} rows from dataset [Incorrect INSERT_DATE or CHANGE_DATE]'.format(nulls_count))

    df = df[~invalid_dates].sort_values(['ALLOTMENT_HEADER_ID', 'INSERT_DATE', 'ACTION_ID'], ascending=True)

    # Collect plain rows and build the frame once at the end: appending to a
    # DataFrame inside the loop is quadratic and DataFrame.append no longer
    # exists in pandas 2.x.
    rows = []

    # sort=False keeps the blocks in the order established by sort_values;
    # the progress bar still advances once per block.
    for _, block_df in tqdm(df.groupby('ALLOTMENT_HEADER_ID', sort=False)):
        for _, row in block_df.iterrows():
            action_type = row['ACTION_TYPE']
            groups = find_groups(action_type, row['ACTION_DESC'])

            if not groups:
                continue

            if action_type == 'UPDATE BLOCK PICKUP':
                action_subtype = ''
                room = groups[0]
                business_dates = [groups[1]]
                # A missing count in the description is treated as zero.
                old_book = groups[-2] or '0'
                new_book = groups[-1] or '0'
                cancellation = 0

            elif action_type == 'UPDATE BLOCK GRID':
                action_subtype = groups[0]
                room = groups[1]
                old_book = groups[-2] or '0'
                new_book = groups[-1] or '0'
                cancellation = 0

                bounds = [d for d in (groups[2], groups[3]) if d]
                if len(bounds) == 2:
                    business_dates = [d.strftime('%Y-%m-%d') for d in
                                      pd.date_range(start=bounds[0], end=bounds[1])]
                    if not business_dates:
                        # Reversed range: keep a single row with a missing
                        # date, as exploding an empty list used to produce.
                        business_dates = [float('nan')]
                else:
                    business_dates = bounds

            elif 'CANCELLED' in groups:
                # Cancellations carry no booking details. The check works for
                # both a marker string and a sequence containing the marker.
                old_book = ''
                new_book = ''
                action_subtype = ''
                room = ''
                business_dates = ['']
                cancellation = 1

            else:
                # Unknown result shape: skip rather than emit values left over
                # from a previous row.
                continue

            for business_date in business_dates:
                rows.append([row['RESORT'], row['ALLOTMENT_HEADER_ID'], row['ACTION_ID'], action_type,
                             row['INSERT_DATE'], row['CHANGE_DATE'], old_book, new_book,
                             action_subtype, room, business_date, cancellation])

    return pd.DataFrame(rows, columns=column_names)