caparol6991

rec_popular karol

Jan 9th, 2020
146
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import math
  2. import pandas as pd
  3. import numpy as np
  4. import time
  5.  
  6. GR_COLS = ["user_id", "session_id", "timestamp", "step"]
  7.  
  8.  
  9. def get_submission_target(df):
  10. """Identify target rows with missing click outs."""
  11.  
  12. mask = df["reference"].isnull() & (df["action_type"] == "clickout item")
  13. df_out = df[mask]
  14.  
  15. return df_out
  16.  
  17.  
  18. def get_popularity(df):
  19. start_time = time.time()
  20.  
  21. # rozdziel wyswietlone hotele i ceny
  22. action_type = df['action_type'] == "clickout item"
  23. clicks = df.loc[action_type]
  24.  
  25. impressions_explode = explode(clicks, "impressions")
  26. prices_explode = explode(clicks, "prices")
  27.  
  28. # zlicz klikniecia w kazdy hotel
  29. clicks_count = clicks['reference'].value_counts()
  30. clicks_count.index = clicks_count.index.astype(int)
  31.  
  32. # zlicz ile razy hotel byl wyswietlony
  33. views_count = impressions_explode['impressions'].value_counts()
  34. ctr = clicks_count.divide(views_count)
  35.  
  36. # polacz hotele z ich cenami
  37. prices_temp = {'reference': impressions_explode.impressions, 'Price': prices_explode.prices}
  38. prices = pd.DataFrame(prices_temp)
  39.  
  40. # prices.set_index('reference')
  41.  
  42. # wyrzuc wszystkie duplikujace sie wiersze z cenami hotelow
  43. prices = prices.drop_duplicates('reference')
  44.  
  45. # polacz klikniecia, wyswietlenia i CTR hotelu w jedno
  46. df_combined = {'Clicks': clicks_count, 'Views': views_count, "CTR": ctr}
  47. result = pd.DataFrame(df_combined)
  48. result = result.rename_axis('reference').reset_index()
  49. result = result.drop_duplicates('reference')
  50. result = result.dropna()
  51.  
  52. # polacz poprzedni dataframe z cenami
  53. result2 = pd.merge(result, prices, on='reference')
  54. result2 = result2.sort_values('Clicks', ascending=False)
  55. avg = result2["Price"].mean()
  56. # print(avg)
  57. # print(result2.shape)
  58.  
  59. # wez wyniki gdzie CTR jest wieksze od 0.5 i cena jest ponizej sredniej
  60. result2 = result2.loc[(result2.CTR > 0.50) & (result2.Price <= avg)]
  61. # print(result2.shape)
  62. print(result2)
  63. print(str(time.time() - start_time))
  64.  
  65. return result2
  66.  
  67.  
  68. def string_to_array(s):
  69. """Convert pipe separated string to array."""
  70.  
  71. if isinstance(s, str):
  72. out = s.split("|")
  73. elif math.isnan(s):
  74. out = []
  75. else:
  76. raise ValueError("Value must be either string of nan")
  77. return out
  78.  
  79.  
  80. def explode(df_in, col_expl):
  81. """Explode column col_expl of array type into multiple rows."""
  82.  
  83. df = df_in.copy()
  84. df.loc[:, col_expl] = df[col_expl].apply(string_to_array)
  85.  
  86. df_out = pd.DataFrame(
  87. {col: np.repeat(df[col].values,
  88. df[col_expl].str.len())
  89. for col in df.columns.drop(col_expl)}
  90. )
  91.  
  92. df_out.loc[:, col_expl] = np.concatenate(df[col_expl].values)
  93. df_out.loc[:, col_expl] = df_out[col_expl].apply(int)
  94.  
  95. return df_out
  96.  
  97.  
  98. def group_concat(df, gr_cols, col_concat):
  99. """Concatenate multiple rows into one."""
  100.  
  101. df_out = (
  102. df
  103. .groupby(gr_cols)[col_concat]
  104. .apply(lambda x: ' '.join(x))
  105. .to_frame()
  106. .reset_index()
  107. )
  108.  
  109. return df_out
  110.  
  111.  
  112. def calc_recommendation(df_expl, df_pop):
  113. """Calculate recommendations based on popularity of items.
  114.  
  115. The final data frame will have an impression list sorted according to the number of clicks per item in a reference data frame.
  116.  
  117. :param df_expl: Data frame with exploded impression list
  118. :param df_pop: Data frame with items and number of clicks
  119. :return: Data frame with sorted impression list according to popularity in df_pop
  120. """
  121.  
  122. df_expl_clicks = (
  123. df_expl[GR_COLS + ["impressions"]]
  124. .merge(df_pop,
  125. left_on="impressions",
  126. right_on="reference",
  127. how="left")
  128. )
  129.  
  130. df_out = (
  131. df_expl_clicks
  132. .assign(impressions=lambda x: x["impressions"].apply(str))
  133. .sort_values(GR_COLS + ["Clicks"],
  134. ascending=[True, True, True, True, False])
  135. )
  136.  
  137. df_out = group_concat(df_out, GR_COLS, "impressions")
  138. df_out.rename(columns={'impressions': 'item_recommendations'}, inplace=True)
  139.  
  140. return df_out
  141.  
  142.  
  143. def main():
  144. train_csv = '../../data/newsmall/train.csv'
  145. test_csv = '../../data/newsmall/test.csv'
  146. subm_csv = '../../data/newsmall/submission_popular.csv'
  147.  
  148. # print(f"Reading {train_csv} ...")
  149. df_train = pd.read_csv(train_csv)
  150. # print(f"Reading {test_csv} ...")
  151. df_test = pd.read_csv(test_csv)
  152.  
  153. print("Get popular items...")
  154. df_popular = get_popularity(df_train)
  155.  
  156. print("Identify target rows...")
  157. df_target = get_submission_target(df_test)
  158.  
  159. print("Get recommendations...")
  160. df_expl = explode(df_target, "impressions")
  161. df_out = calc_recommendation(df_expl, df_popular)
  162.  
  163. # print(f"Writing {subm_csv}...")
  164. df_out.to_csv(subm_csv, index=False)
  165.  
  166. print("Finished calculating recommendations.")
  167.  
  168.  
  169. if __name__ == '__main__':
  170. main()
RAW Paste Data