caparol6991

System rekomendacyjne part 2

Dec 15th, 2019
135
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import math
  2. import pandas as pd
  3. import numpy as np
  4. import time
  5.  
  6. GR_COLS = ["user_id", "session_id", "timestamp", "step"]
  7.  
  8.  
  9. def get_submission_target(df):
  10.     """Identify target rows with missing click outs."""
  11.  
  12.     mask = df["reference"].isnull() & (df["action_type"] == "clickout item")
  13.     df_out = df[mask]
  14.  
  15.     return df_out
  16.  
  17.  
  18. def get_popularity(df):
  19.  
  20.     start_time = time.time()
  21.  
  22.     #rozdziel wyswietlone hotele i ceny
  23.     impressions_explode = explode(df, "impressions")
  24.     prices_explode = explode(df, "prices")
  25.  
  26.     action_type = df['action_type'] == "clickout item"
  27.     clicks = df.loc[action_type]
  28.  
  29.     action_type = impressions_explode['action_type'] == "clickout item"
  30.     views = impressions_explode.loc[action_type]
  31.  
  32.     #zlicz klikniecia w kazdy hotel
  33.     clicks_count = clicks['reference'].value_counts()
  34.     clicks_count.index = clicks_count.index.astype(int)
  35.  
  36.     #zlicz ile razy hotel byl wyswietlony
  37.     views_count = views['impressions'].value_counts()
  38.     ctr = clicks_count.divide(views_count)
  39.  
  40.     #polacz hotele z ich cenami
  41.     prices_temp = {'reference': impressions_explode.impressions, 'Price': prices_explode.prices}
  42.     prices = pd.DataFrame(prices_temp)
  43.  
  44.     #prices.set_index('reference')
  45.  
  46.     #wyrzuc wszystkie duplikujace sie wiersze z cenami hotelow
  47.     prices = prices.drop_duplicates('reference')
  48.  
  49.     #polacz klikniecia, wyswietlenia i CTR hotelu w jedno
  50.     df_combined = {'Clicks': clicks_count, 'Views': views_count, "CTR": ctr}
  51.     result = pd.DataFrame(df_combined)
  52.     result = result.rename_axis('reference').reset_index()
  53.     result = result.drop_duplicates('reference')
  54.     result = result.dropna()
  55.  
  56.     #polacz poprzedni dataframe z cenami
  57.     result2 = pd.merge(result, prices, on='reference')
  58.     result2 = result2.sort_values('Clicks', ascending=False)
  59.     avg = result2["Price"].mean()
  60.     print(avg)
  61.     print(result2.shape)
  62.    
  63.     #weź wyniki gdzie CTR jest większe od 0.5 i cena jest poniżej średniej
  64.     result2 = result2.loc[(result2.CTR > 0.50) & (result2.Price <= avg)]
  65.     print(result2.shape)
  66.     print(result2)
  67.     print(str(time.time() - start_time))
  68.  
  69.     return result2
  70.  
  71. def string_to_array(s):
  72.     """Convert pipe separated string to array."""
  73.  
  74.     if isinstance(s, str):
  75.         out = s.split("|")
  76.     elif math.isnan(s):
  77.         out = []
  78.     else:
  79.         raise ValueError("Value must be either string of nan")
  80.     return out
  81.  
  82.  
  83. def explode(df_in, col_expl):
  84.     """Explode column col_expl of array type into multiple rows."""
  85.  
  86.     df = df_in.copy()
  87.     df.loc[:, col_expl] = df[col_expl].apply(string_to_array)
  88.  
  89.     df_out = pd.DataFrame(
  90.         {col: np.repeat(df[col].values,
  91.                         df[col_expl].str.len())
  92.          for col in df.columns.drop(col_expl)}
  93.     )
  94.  
  95.     df_out.loc[:, col_expl] = np.concatenate(df[col_expl].values)
  96.     df_out.loc[:, col_expl] = df_out[col_expl].apply(int)
  97.  
  98.     return df_out
  99.  
  100.  
  101. def group_concat(df, gr_cols, col_concat):
  102.     """Concatenate multiple rows into one."""
  103.  
  104.     df_out = (
  105.         df
  106.         .groupby(gr_cols)[col_concat]
  107.         .apply(lambda x: ' '.join(x))
  108.         .to_frame()
  109.         .reset_index()
  110.     )
  111.  
  112.     return df_out
  113.  
  114.  
  115. def calc_recommendation(df_expl, df_pop):
  116.     """Calculate recommendations based on popularity of items.
  117.  
  118.    The final data frame will have an impression list sorted according to the number of clicks per item in a reference data frame.
  119.  
  120.    :param df_expl: Data frame with exploded impression list
  121.    :param df_pop: Data frame with items and number of clicks
  122.    :return: Data frame with sorted impression list according to popularity in df_pop
  123.    """
  124.  
  125.     df_expl_clicks = (
  126.         df_expl[GR_COLS + ["impressions"]]
  127.         .merge(df_pop,
  128.                left_on="impressions",
  129.                right_on="reference",
  130.                how="left")
  131.     )
  132.  
  133.     df_out = (
  134.         df_expl_clicks
  135.         .assign(impressions=lambda x: x["impressions"].apply(str))
  136.         .sort_values(GR_COLS + ["Clicks"],
  137.                      ascending=[True, True, True, True, False])
  138.     )
  139.  
  140.     df_out = group_concat(df_out, GR_COLS, "impressions")
  141.     df_out.rename(columns={'impressions': 'item_recommendations'}, inplace=True)
  142.  
  143.     return df_out
  144.  
  145.  
  146. def main():
  147.  
  148.     train_csv = '../../data/newsmall/train.csv'
  149.     test_csv = '../../data/newsmall/test.csv'
  150.     subm_csv = '../../data/newsmall/submission_popular.csv'
  151.  
  152.    # print(f"Reading {train_csv} ...")
  153.     df_train = pd.read_csv(train_csv)
  154.   #  print(f"Reading {test_csv} ...")
  155.     df_test = pd.read_csv(test_csv)
  156.  
  157.     print("Get popular items...")
  158.     df_popular = get_popularity(df_train)
  159.  
  160.     print("Identify target rows...")
  161.     df_target = get_submission_target(df_test)
  162.  
  163.     print("Get recommendations...")
  164.     df_expl = explode(df_target, "impressions")
  165.     df_out = calc_recommendation(df_expl, df_popular)
  166.  
  167.    # print(f"Writing {subm_csv}...")
  168.     df_out.to_csv(subm_csv, index=False)
  169.  
  170.     print("Finished calculating recommendations.")
  171.  
  172.  
  173. if __name__ == '__main__':
  174.     main()
RAW Paste Data