caparol6991

rec_popular kamil

Jan 9th, 2020
94
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import math
  2. import pandas as pd
  3. import numpy as np
  4.  
  5. GR_COLS = ["user_id", "session_id", "timestamp", "step"]
  6.  
  7.  
  8. def get_submission_target(df):
  9. """Identify target rows with missing click outs."""
  10.  
  11. mask = df["reference"].isnull() & (df["action_type"] == "clickout item")
  12. df_out = df[mask]
  13.  
  14. return df_out
  15.  
  16.  
  17. def get_popularity(df):
  18. """Get number of clicks that each item received in the df."""
  19.  
  20. #wybierz tylko clikouty
  21. df_clickout = df[(df.action_type == "clickout item")]
  22. #df_clickout = df_clickout[df_clickout.step <= int(df_clickout.step.sum()/df_clickout.shape[0])]
  23.  
  24. #policz klikniecia w dany hotel
  25. df_item_clicks = (
  26. df_clickout
  27. .groupby("reference")
  28. .size()
  29. .reset_index(name="n_clicks")
  30. .transform(lambda x: x.astype(int))
  31. )
  32.  
  33. #policz ile razy kazdy hotel sie wyswietlil
  34. hotels = explode(df_clickout, "impressions")
  35. hotels_grouped = hotels.groupby("impressions").size().reset_index(name="n_apperarances").transform(
  36. lambda x: x.astype(int))
  37.  
  38. #rozdziel kolumne z cenami oraz znajdz wszystkie unikalne ceny hotelow
  39. prices = explode(df_clickout, "prices")
  40. unique_prices = prices.drop_duplicates(["prices"]).sort_values(["prices"]).reset_index(drop=True)
  41. unique_prices_list = unique_prices['prices'].tolist()
  42.  
  43. #polacz hotele z cenami
  44. hotel_prices = {'reference': hotels.impressions, 'Price': prices.prices}
  45. hotel_prices_df = pd.DataFrame(hotel_prices)
  46. hotel_prices_df = hotel_prices_df.drop_duplicates(["reference"])
  47.  
  48. #polacz hotele z kliknieciami
  49. hotels_grouped.rename(columns={"impressions": "reference"}, inplace=True)
  50. hotels_info = pd.merge(hotels_grouped, df_item_clicks, on='reference', how='left')
  51. hotels_info['n_clicks'] = hotels_info['n_clicks'].fillna(0)
  52.  
  53. #dodaj CTR do poprzedniego DF
  54. hotels_info['CTR'] = hotels_info.n_clicks / hotels_info.n_apperarances * 100
  55. hotels_info['CTR'] = hotels_info['CTR'].transform(lambda x: x.astype(int))
  56.  
  57. #dodaj ceny hotelów
  58. hotels_info = pd.merge(hotels_info, hotel_prices_df, on='reference')
  59.  
  60. #oblicz górny próg ceny
  61. price_cuttoff = unique_prices_list[int(unique_prices.shape[0]*0.75)]
  62.  
  63. #wybierz tylko wiersza ktore mają CTR większy od 50 i cene mniejszą lub równą progu cenowego
  64. hotels_info = hotels_info[(hotels_info.n_clicks != 0) & (hotels_info.CTR > 50) & (hotels_info.Price <= price_cuttoff)]
  65. hotels_info = hotels_info.sort_values(["n_clicks"], ascending=False)
  66.  
  67. print(hotels_info.sort_values(["reference"]))
  68. return hotels_info
  69.  
  70.  
  71. def string_to_array(s):
  72. """Convert pipe separated string to array."""
  73.  
  74. if isinstance(s, str):
  75. out = s.split("|")
  76. elif math.isnan(s):
  77. out = []
  78. else:
  79. raise ValueError("Value must be either string of nan")
  80. return out
  81.  
  82.  
  83. def explode(df_in, col_expl):
  84. """Explode column col_expl of array type into multiple rows."""
  85.  
  86. df = df_in.copy()
  87. df.loc[:, col_expl] = df[col_expl].apply(string_to_array)
  88.  
  89. df_out = pd.DataFrame(
  90. {col: np.repeat(df[col].values,
  91. df[col_expl].str.len())
  92. for col in df.columns.drop(col_expl)}
  93. )
  94.  
  95. df_out.loc[:, col_expl] = np.concatenate(df[col_expl].values)
  96. df_out.loc[:, col_expl] = df_out[col_expl].apply(int)
  97.  
  98. return df_out
  99.  
  100.  
  101. def group_concat(df, gr_cols, col_concat):
  102. """Concatenate multiple rows into one."""
  103.  
  104. df_out = (
  105. df
  106. .groupby(gr_cols)[col_concat]
  107. .apply(lambda x: ' '.join(x))
  108. .to_frame()
  109. .reset_index()
  110. )
  111.  
  112. return df_out
  113.  
  114.  
  115. def calc_recommendation(df_expl, df_pop):
  116. """Calculate recommendations based on popularity of items.
  117.  
  118. The final data frame will have an impression list sorted according to the number of clicks per item in a reference data frame.
  119.  
  120. :param df_expl: Data frame with exploded impression list
  121. :param df_pop: Data frame with items and number of clicks
  122. :return: Data frame with sorted impression list according to popularity in df_pop
  123. """
  124.  
  125. df_expl_clicks = (
  126. df_expl[GR_COLS + ["impressions"]]
  127. .merge(df_pop,
  128. left_on="impressions",
  129. right_on="reference",
  130. how="left")
  131. )
  132.  
  133. df_out = (
  134. df_expl_clicks
  135. .assign(impressions=lambda x: x["impressions"].apply(str))
  136. .sort_values(GR_COLS + ["n_clicks"],
  137. ascending=[True, True, True, True, False])
  138. )
  139.  
  140. df_out = group_concat(df_out, GR_COLS, "impressions")
  141. df_out.rename(columns={'impressions': 'item_recommendations'}, inplace=True)
  142.  
  143. return df_out
  144.  
  145. def main():
  146.  
  147. train_csv = '../../data/newsmall5mln/train.csv'
  148. test_csv = '../../data/newsmall5mln/test.csv'
  149. subm_csv = '../../data/newsmall5mln/submission_popular.csv'
  150.  
  151. # print(f"Reading {train_csv} ...")
  152. df_train = pd.read_csv(train_csv)
  153. # print(f"Reading {test_csv} ...")
  154. df_test = pd.read_csv(test_csv)
  155.  
  156. print("Get popular items...")
  157. df_popular = get_popularity(df_train)
  158.  
  159. print("Identify target rows...")
  160. df_target = get_submission_target(df_test)
  161.  
  162. print("Get recommendations...")
  163. df_expl = explode(df_target, "impressions")
  164. df_out = calc_recommendation(df_expl, df_popular)
  165.  
  166. # print(f"Writing {subm_csv}...")
  167. df_out.to_csv(subm_csv, index=False)
  168.  
  169. print("Finished calculating recommendations.")
  170.  
  171.  
  172. if __name__ == '__main__':
  173. main()
RAW Paste Data