caparol6991

Kamil

Dec 18th, 2019
96
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import math
  2. import pandas as pd
  3. import numpy as np
  4.  
# Columns used by calc_recommendation as the sort prefix and as the group key
# when impressions are concatenated back into one row per group.
GR_COLS = ["user_id", "session_id", "timestamp", "step"]
  6.  
  7.  
  8. def get_submission_target(df):
  9.     """Identify target rows with missing click outs."""
  10.  
  11.     mask = df["reference"].isnull() & (df["action_type"] == "clickout item")
  12.     df_out = df[mask]
  13.  
  14.     return df_out
  15.  
  16.  
def get_popularity(df):
    """Build a table of clicked, high-CTR, not-too-expensive items.

    Only "clickout item" rows of df are used. For every item that appears in
    an impression list the function computes how often it was shown, how often
    it was clicked and the resulting click-through rate, attaches one price,
    and keeps only items with at least one click, a CTR above 50 and a price
    at or below a cutoff taken from the sorted list of unique prices.

    :param df: Data frame with "action_type", "reference", "impressions" and
        "prices" columns (impressions/prices are pipe separated strings)
    :return: Data frame with columns reference, n_apperarances, n_clicks, CTR
        and Price, sorted by n_clicks in descending order
    """

    # Keep only the clickout rows.
    df_clickout = df[(df.action_type == "clickout item")]
    #df_clickout = df_clickout[df_clickout.step <= int(df_clickout.step.sum()/df_clickout.shape[0])]

    # Count the clicks each hotel received; every column is cast to int,
    # so "reference" becomes an integer item id here.
    df_item_clicks = (
        df_clickout
            .groupby("reference")
            .size()
            .reset_index(name="n_clicks")
            .transform(lambda x: x.astype(int))
    )

    # Count how many times each hotel was displayed (one row per impression).
    hotels = explode(df_clickout, "impressions")
    hotels_grouped = hotels.groupby("impressions").size().reset_index(name="n_apperarances").transform(
            lambda x: x.astype(int))

    # Split the prices column and collect all unique hotel prices in ascending order.
    prices = explode(df_clickout, "prices")
    unique_prices = prices.drop_duplicates(["prices"]).sort_values(["prices"]).reset_index(drop=True)
    unique_prices_list = unique_prices['prices'].tolist()

    # Pair hotels with prices by row position of the two exploded frames.
    # NOTE(review): assumes "impressions" and "prices" hold the same number of
    # entries in every row so that both frames line up - TODO confirm.
    hotel_prices = {'reference': hotels.impressions, 'Price': prices.prices}
    hotel_prices_df = pd.DataFrame(hotel_prices)
    # Only one price row is kept per hotel.
    hotel_prices_df = hotel_prices_df.drop_duplicates(["reference"])

    # Join appearance counts with click counts; hotels never clicked get 0.
    hotels_grouped.rename(columns={"impressions": "reference"}, inplace=True)
    hotels_info = pd.merge(hotels_grouped, df_item_clicks, on='reference', how='left')
    hotels_info['n_clicks'] = hotels_info['n_clicks'].fillna(0)

    # Add the CTR as a percentage, truncated to an integer.
    hotels_info['CTR'] = hotels_info.n_clicks / hotels_info.n_apperarances * 100
    hotels_info['CTR'] = hotels_info['CTR'].transform(lambda x: x.astype(int))

    # Add the hotel prices.
    hotels_info = pd.merge(hotels_info, hotel_prices_df, on='reference')

    # Upper price threshold: the unique price located 75% of the way through
    # the ascending list of unique prices.
    price_cuttoff = unique_prices_list[int(unique_prices.shape[0]*0.75)]

    # Keep only rows with at least one click, a CTR above 50 and a price at or
    # below the price threshold.
    hotels_info = hotels_info[(hotels_info.n_clicks != 0) & (hotels_info.CTR > 50) & (hotels_info.Price <= price_cuttoff)]
    hotels_info = hotels_info.sort_values(["n_clicks"], ascending=False)

    # Prints the filtered table (ordered by reference) to stdout on every call.
    print(hotels_info.sort_values(["reference"]))
    return hotels_info
  69.  
  70.  
  71. def string_to_array(s):
  72.     """Convert pipe separated string to array."""
  73.  
  74.     if isinstance(s, str):
  75.         out = s.split("|")
  76.     elif math.isnan(s):
  77.         out = []
  78.     else:
  79.         raise ValueError("Value must be either string of nan")
  80.     return out
  81.  
  82.  
  83. def explode(df_in, col_expl):
  84.     """Explode column col_expl of array type into multiple rows."""
  85.  
  86.     df = df_in.copy()
  87.     df.loc[:, col_expl] = df[col_expl].apply(string_to_array)
  88.  
  89.     df_out = pd.DataFrame(
  90.         {col: np.repeat(df[col].values,
  91.                         df[col_expl].str.len())
  92.          for col in df.columns.drop(col_expl)}
  93.     )
  94.  
  95.     df_out.loc[:, col_expl] = np.concatenate(df[col_expl].values)
  96.     df_out.loc[:, col_expl] = df_out[col_expl].apply(int)
  97.  
  98.     return df_out
  99.  
  100.  
  101. def group_concat(df, gr_cols, col_concat):
  102.     """Concatenate multiple rows into one."""
  103.  
  104.     df_out = (
  105.         df
  106.         .groupby(gr_cols)[col_concat]
  107.         .apply(lambda x: ' '.join(x))
  108.         .to_frame()
  109.         .reset_index()
  110.     )
  111.  
  112.     return df_out
  113.  
  114.  
  115. def calc_recommendation(df_expl, df_pop):
  116.     """Calculate recommendations based on popularity of items.
  117.  
  118.    The final data frame will have an impression list sorted according to the number of clicks per item in a reference data frame.
  119.  
  120.    :param df_expl: Data frame with exploded impression list
  121.    :param df_pop: Data frame with items and number of clicks
  122.    :return: Data frame with sorted impression list according to popularity in df_pop
  123.    """
  124.  
  125.     df_expl_clicks = (
  126.         df_expl[GR_COLS + ["impressions"]]
  127.         .merge(df_pop,
  128.                left_on="impressions",
  129.                right_on="reference",
  130.                how="left")
  131.     )
  132.  
  133.     df_out = (
  134.         df_expl_clicks
  135.         .assign(impressions=lambda x: x["impressions"].apply(str))
  136.         .sort_values(GR_COLS + ["n_clicks"],
  137.                      ascending=[True, True, True, True, False])
  138.     )
  139.  
  140.     df_out = group_concat(df_out, GR_COLS, "impressions")
  141.     df_out.rename(columns={'impressions': 'item_recommendations'}, inplace=True)
  142.  
  143.     return df_out
  144.  
  145. def main():
  146.  
  147.     train_csv = '../../data/newsmall5mln/train.csv'
  148.     test_csv = '../../data/newsmall5mln/test.csv'
  149.     subm_csv = '../../data/newsmall5mln/submission_popular.csv'
  150.  
  151.    # print(f"Reading {train_csv} ...")
  152.     df_train = pd.read_csv(train_csv)
  153.   #  print(f"Reading {test_csv} ...")
  154.     df_test = pd.read_csv(test_csv)
  155.  
  156.     print("Get popular items...")
  157.     df_popular = get_popularity(df_train)
  158.  
  159.     print("Identify target rows...")
  160.     df_target = get_submission_target(df_test)
  161.  
  162.     print("Get recommendations...")
  163.     df_expl = explode(df_target, "impressions")
  164.     df_out = calc_recommendation(df_expl, df_popular)
  165.  
  166.    # print(f"Writing {subm_csv}...")
  167.     df_out.to_csv(subm_csv, index=False)
  168.  
  169.     print("Finished calculating recommendations.")
  170.  
  171.  
# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
RAW Paste Data