# rec_popular kamil

import math
import pandas as pd
import numpy as np

# Columns used by calc_recommendation as the selection, sort and grouping
# keys when building one recommendation list per group.
GR_COLS = ["user_id", "session_id", "timestamp", "step"]


def get_submission_target(df):
    """Return the clickout rows of df whose reference is missing.

    :param df: Data frame with "reference" and "action_type" columns
    :return: Rows where action_type is "clickout item" and reference is null
    """

    reference_missing = df["reference"].isnull()
    is_clickout = df["action_type"] == "clickout item"

    return df[reference_missing & is_clickout]


def _empty_popularity():
    """Build an empty popularity frame with the columns get_popularity returns."""

    schema = [
        ("reference", "int64"),
        ("n_apperarances", "int64"),
        ("n_clicks", "float64"),
        ("CTR", "int64"),
        ("Price", "int64"),
    ]
    return pd.DataFrame(
        {name: pd.Series(dtype=dtype) for name, dtype in schema},
        columns=[name for name, _ in schema],
    )


def get_popularity(df):
    """Select frequently clicked, moderately priced items from clickout rows.

    Only rows with action_type "clickout item" are used. For every item that
    appears in their impression lists the function computes the number of
    appearances, the number of clicks (rows whose reference is that item) and
    the click-through rate CTR (clicks per appearance in percent, truncated
    to an integer). An item is kept when it was clicked at least once, its
    CTR is above 50 and its price does not exceed the cutoff, which is the
    value at the 75% position of the sorted distinct prices.

    The selected rows are also printed, ordered by reference.

    :param df: Data frame with columns action_type, reference, impressions
        and prices (impressions and prices as pipe-separated strings)
    :return: Data frame with columns reference, n_apperarances, n_clicks,
        CTR and Price, sorted by n_clicks in descending order; an empty frame
        with the same columns when there are no clickouts or no prices
    """

    # Keep only the clickout rows.
    df_clickout = df[df["action_type"] == "clickout item"]

    # Nothing to count without clickouts; the steps below cannot handle an
    # empty frame, so return an empty result up front.
    if df_clickout.empty:
        return _empty_popularity()

    # Count the clicks each item received.
    df_item_clicks = (
        df_clickout
        .groupby("reference")
        .size()
        .reset_index(name="n_clicks")
        .astype(int)
    )

    # Count how many times each item was displayed.
    hotels = explode(df_clickout, "impressions")
    hotels_grouped = (
        hotels
        .groupby("impressions")
        .size()
        .reset_index(name="n_apperarances")
        .astype(int)
        .rename(columns={"impressions": "reference"})
    )

    # Split the price column and collect all distinct prices in ascending order.
    prices = explode(df_clickout, "prices")
    unique_prices_list = (
        prices["prices"].drop_duplicates().sort_values().tolist()
    )
    if not unique_prices_list:
        # No price information at all, so no cutoff can be derived.
        return _empty_popularity()

    # Pair each displayed item with its price. The two exploded frames are
    # combined by row label, which assumes impressions and prices list the
    # same number of entries in every row -- TODO confirm against the data.
    hotel_prices_df = pd.DataFrame(
        {"reference": hotels["impressions"], "Price": prices["prices"]}
    ).drop_duplicates(["reference"])

    # Attach click counts to the displayed items; never-clicked items get 0.
    hotels_info = pd.merge(hotels_grouped, df_item_clicks, on="reference", how="left")
    hotels_info["n_clicks"] = hotels_info["n_clicks"].fillna(0)

    # Click-through rate in percent, truncated to an integer.
    hotels_info["CTR"] = (
        hotels_info["n_clicks"] / hotels_info["n_apperarances"] * 100
    ).astype(int)

    # Attach the item prices.
    hotels_info = pd.merge(hotels_info, hotel_prices_df, on="reference")

    # Upper price threshold: the distinct price at the 75% position.
    price_cutoff = unique_prices_list[int(len(unique_prices_list) * 0.75)]

    # Keep clicked items with CTR above 50 and a price within the threshold.
    keep = (
        (hotels_info["n_clicks"] != 0)
        & (hotels_info["CTR"] > 50)
        & (hotels_info["Price"] <= price_cutoff)
    )
    hotels_info = hotels_info[keep].sort_values(["n_clicks"], ascending=False)

    # Diagnostic dump of the selected items, ordered by item id.
    print(hotels_info.sort_values(["reference"]))
    return hotels_info


def string_to_array(s):
    """Convert a pipe-separated string to a list of its parts.

    :param s: A string, or a missing value (None or a NaN number)
    :return: List of substrings split on "|"; an empty list for a missing value
    :raises ValueError: If s is neither a string nor a missing value
    """

    if isinstance(s, str):
        return s.split("|")
    if s is None:
        return []

    # math.isnan raises TypeError for non-numeric input; treat that as
    # "not missing" so such values end in the ValueError below.
    try:
        is_missing = math.isnan(s)
    except TypeError:
        is_missing = False

    if is_missing:
        return []
    raise ValueError("Value must be either string or nan")


def explode(df_in, col_expl):
    """Explode the pipe-separated column col_expl into one row per entry.

    Every value of col_expl is split with string_to_array; the other columns
    are repeated once per resulting entry and the entries are converted to
    integers. Rows whose col_expl value is missing produce no output rows.
    The input frame is not modified.

    :param df_in: Data frame containing col_expl
    :param col_expl: Name of the column holding pipe-separated integer strings
    :return: New data frame with a fresh 0..n-1 index, the other columns in
        their original order and col_expl as the last column (int64)
    """

    item_lists = [string_to_array(value) for value in df_in[col_expl]]
    counts = np.array([len(items) for items in item_lists], dtype=np.intp)

    df_out = pd.DataFrame(
        {col: np.repeat(df_in[col].values, counts)
         for col in df_in.columns.drop(col_expl)}
    )

    # Flatten in Python rather than with np.concatenate, which raises on an
    # empty sequence (i.e. when df_in has no rows).
    flat_items = [int(item) for items in item_lists for item in items]
    df_out[col_expl] = np.array(flat_items, dtype=np.int64)

    return df_out


def group_concat(df, gr_cols, col_concat):
    """Collapse each group of rows into one row with a space-joined string.

    :param df: Data frame to aggregate
    :param gr_cols: Column name(s) to group by
    :param col_concat: Column of strings to join within each group
    :return: Data frame with the group columns and the joined col_concat column
    """

    def _join_with_spaces(values):
        return ' '.join(values)

    joined = df.groupby(gr_cols)[col_concat].apply(_join_with_spaces)

    return joined.to_frame().reset_index()


def calc_recommendation(df_expl, df_pop):
    """Calculate recommendations based on popularity of items.

    The exploded impressions are left-joined with df_pop on the item id,
    ordered by the GR_COLS keys and, within each group, by n_clicks in
    descending order, and then concatenated into one space-separated string
    per group.

    :param df_expl: Data frame with exploded impression list; must contain
        the GR_COLS columns and "impressions"
    :param df_pop: Data frame with items ("reference") and their "n_clicks"
    :return: Data frame with the GR_COLS columns and "item_recommendations",
        the impression list sorted according to popularity in df_pop
    """

    df_expl_clicks = df_expl[GR_COLS + ["impressions"]].merge(
        df_pop,
        left_on="impressions",
        right_on="reference",
        how="left",
    )

    # Group keys ascending, click count descending. The flags are derived
    # from GR_COLS so they always match the number of sort keys.
    sort_ascending = [True] * len(GR_COLS) + [False]

    df_sorted = (
        df_expl_clicks
        .assign(impressions=lambda x: x["impressions"].apply(str))
        .sort_values(GR_COLS + ["n_clicks"], ascending=sort_ascending)
    )

    df_out = group_concat(df_sorted, GR_COLS, "impressions")

    return df_out.rename(columns={'impressions': 'item_recommendations'})

def main(train_csv='../../data/newsmall5mln/train.csv',
         test_csv='../../data/newsmall5mln/test.csv',
         subm_csv='../../data/newsmall5mln/submission_popular.csv'):
    """Build a popularity-based submission file from train and test data.

    Reads both CSV files, derives the popular items from the training data,
    picks the target rows from the test data, orders their impressions by
    popularity and writes the result to subm_csv without the index.

    :param train_csv: Path of the training CSV file
    :param test_csv: Path of the test CSV file
    :param subm_csv: Path the submission CSV file is written to
    """

    df_train = pd.read_csv(train_csv)
    df_test = pd.read_csv(test_csv)

    print("Get popular items...")
    df_popular = get_popularity(df_train)

    print("Identify target rows...")
    df_target = get_submission_target(df_test)

    print("Get recommendations...")
    df_expl = explode(df_target, "impressions")
    df_out = calc_recommendation(df_expl, df_popular)

    df_out.to_csv(subm_csv, index=False)

    print("Finished calculating recommendations.")

# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()