caparol6991

Untitled

Dec 16th, 2019
127
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import math
  2. import pandas as pd
  3. import numpy as np
  4.  
# Key columns shared by the recommendation step: calc_recommendation selects
# them, sorts by them (ascending) and groups on them.
GR_COLS = ["user_id", "session_id", "timestamp", "step"]
  6.  
  7.  
  8. def get_submission_target(df):
  9.     """Identify target rows with missing click outs."""
  10.  
  11.     mask = df["reference"].isnull() & (df["action_type"] == "clickout item")
  12.     df_out = df[mask]
  13.  
  14.     return df_out
  15.  
  16.  
  17. def get_popularity(df):
  18.     """Get number of clicks that each item received in the df."""
  19.  
  20.     df_clickout = df[(df.action_type == "clickout item")]
  21.     #df_clickout = df_clickout[df_clickout.step <= int(df_clickout.step.sum()/df_clickout.shape[0])]
  22.  
  23.     df_item_clicks = (
  24.         df_clickout
  25.             .groupby("reference")
  26.             .size()
  27.             .reset_index(name="n_clicks")
  28.             .transform(lambda x: x.astype(int))
  29.     )
  30.  
  31.     hotels = explode(df_clickout, "impressions")
  32.     hotels_grouped = hotels.groupby("impressions").size().reset_index(name="n_apperarances").transform(
  33.             lambda x: x.astype(int))
  34.  
  35.     prices = explode(df_clickout, "prices")
  36.     unique_prices = prices.drop_duplicates(["prices"]).sort_values(["prices"]).reset_index(drop=True)
  37.     unique_prices_list = unique_prices['prices'].tolist()
  38.  
  39.     hotel_prices = {'reference': hotels.impressions, 'Price': prices.prices}
  40.     hotel_prices_df = pd.DataFrame(hotel_prices)
  41.  
  42.     hotel_prices_df = hotel_prices_df.drop_duplicates(["reference"])
  43.  
  44.     hotels_grouped.rename(columns={"impressions": "reference"}, inplace=True)
  45.     hotels_info = pd.merge(hotels_grouped, df_item_clicks, on='reference', how='left')
  46.     hotels_info['n_clicks'] = hotels_info['n_clicks'].fillna(0)
  47.  
  48.     hotels_info['CTR'] = hotels_info.n_clicks / hotels_info.n_apperarances * 100
  49.     hotels_info['CTR'] = hotels_info['CTR'].transform(lambda x: x.astype(int))
  50.     hotels_info = pd.merge(hotels_info, hotel_prices_df, on='reference')
  51.  
  52.     price_cuttoff = unique_prices_list[int(unique_prices.shape[0]*0.75)]
  53.  
  54.     hotels_info = hotels_info[(hotels_info.n_clicks != 0) & (hotels_info.CTR > 50) & (hotels_info.Price <= price_cuttoff)]
  55.     hotels_info = hotels_info.sort_values(["n_clicks"], ascending=False)
  56.  
  57.     print(hotels_info.sort_values(["reference"]))
  58.     return hotels_info
  59.  
  60.  
  61. def string_to_array(s):
  62.     """Convert pipe separated string to array."""
  63.  
  64.     if isinstance(s, str):
  65.         out = s.split("|")
  66.     elif math.isnan(s):
  67.         out = []
  68.     else:
  69.         raise ValueError("Value must be either string of nan")
  70.     return out
  71.  
  72.  
  73. def explode(df_in, col_expl):
  74.     """Explode column col_expl of array type into multiple rows."""
  75.  
  76.     df = df_in.copy()
  77.     df.loc[:, col_expl] = df[col_expl].apply(string_to_array)
  78.  
  79.     df_out = pd.DataFrame(
  80.         {col: np.repeat(df[col].values,
  81.                         df[col_expl].str.len())
  82.          for col in df.columns.drop(col_expl)}
  83.     )
  84.  
  85.     df_out.loc[:, col_expl] = np.concatenate(df[col_expl].values)
  86.     df_out.loc[:, col_expl] = df_out[col_expl].apply(int)
  87.  
  88.     return df_out
  89.  
  90.  
  91. def group_concat(df, gr_cols, col_concat):
  92.     """Concatenate multiple rows into one."""
  93.  
  94.     df_out = (
  95.         df
  96.         .groupby(gr_cols)[col_concat]
  97.         .apply(lambda x: ' '.join(x))
  98.         .to_frame()
  99.         .reset_index()
  100.     )
  101.  
  102.     return df_out
  103.  
  104.  
  105. def calc_recommendation(df_expl, df_pop):
  106.     """Calculate recommendations based on popularity of items.
  107.  
  108.    The final data frame will have an impression list sorted according to the number of clicks per item in a reference data frame.
  109.  
  110.    :param df_expl: Data frame with exploded impression list
  111.    :param df_pop: Data frame with items and number of clicks
  112.    :return: Data frame with sorted impression list according to popularity in df_pop
  113.    """
  114.  
  115.     df_expl_clicks = (
  116.         df_expl[GR_COLS + ["impressions"]]
  117.         .merge(df_pop,
  118.                left_on="impressions",
  119.                right_on="reference",
  120.                how="left")
  121.     )
  122.  
  123.     df_out = (
  124.         df_expl_clicks
  125.         .assign(impressions=lambda x: x["impressions"].apply(str))
  126.         .sort_values(GR_COLS + ["n_clicks"],
  127.                      ascending=[True, True, True, True, False])
  128.     )
  129.  
  130.     df_out = group_concat(df_out, GR_COLS, "impressions")
  131.     df_out.rename(columns={'impressions': 'item_recommendations'}, inplace=True)
  132.  
  133.     return df_out
  134.  
  135. def main():
  136.  
  137.     train_csv = '../../data/newsmall5mln/train.csv'
  138.     test_csv = '../../data/newsmall5mln/test.csv'
  139.     subm_csv = '../../data/newsmall5mln/submission_popular.csv'
  140.  
  141.    # print(f"Reading {train_csv} ...")
  142.     df_train = pd.read_csv(train_csv)
  143.   #  print(f"Reading {test_csv} ...")
  144.     df_test = pd.read_csv(test_csv)
  145.  
  146.     print("Get popular items...")
  147.     df_popular = get_popularity(df_train)
  148.  
  149.     print("Identify target rows...")
  150.     df_target = get_submission_target(df_test)
  151.  
  152.     print("Get recommendations...")
  153.     df_expl = explode(df_target, "impressions")
  154.     df_out = calc_recommendation(df_expl, df_popular)
  155.  
  156.    # print(f"Writing {subm_csv}...")
  157.     df_out.to_csv(subm_csv, index=False)
  158.  
  159.     print("Finished calculating recommendations.")
  160.  
  161.  
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
RAW Paste Data