# rec_popular karol

# Pastebin page residue (not part of the script):
#   Jan 9th, 2020 | 146 | Never
#   Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. import math
2. import pandas as pd
3. import numpy as np
4. import time
5.
# Columns that calc_recommendation uses to select, sort and group the
# exploded impression rows.
GR_COLS = [
    "user_id",
    "session_id",
    "timestamp",
    "step",
]
7.
8.
def get_submission_target(df):
    """Identify target rows with missing click outs.

    :param df: Data frame with "reference" and "action_type" columns
    :return: The rows of df whose action type is "clickout item" and whose
        reference is null
    """

    mask = df["reference"].isnull() & (df["action_type"] == "clickout item")
    # Keep only the rows flagged by the mask.
    df_out = df[mask]

    return df_out
16.
17.
def get_popularity(df):
    """Build a table of frequently clicked, below-average-price items.

    Only "clickout item" rows of df are used. For every item the number of
    clicks, the number of times it was displayed and the resulting
    click-through rate (clicks / views) are computed and joined with one
    price per item. The result keeps the items whose CTR is above 0.5 and
    whose price does not exceed the mean price of the joined table.

    The selected rows and the elapsed time in seconds are printed.

    :param df: Data frame with "action_type", "reference", "impressions"
        and "prices" columns
    :return: Data frame with columns reference, Clicks, Views, CTR and
        Price, sorted by Clicks in descending order
    """
    started_at = time.time()

    # Separate the displayed hotels and their prices (click-out rows only).
    clickouts = df.loc[df['action_type'] == "clickout item"]

    shown_items = explode(clickouts, "impressions")
    shown_prices = explode(clickouts, "prices")

    # Count the clicks on every hotel.
    n_clicks = clickouts['reference'].value_counts()
    n_clicks.index = n_clicks.index.astype(int)

    # Count how many times every hotel was displayed.
    n_views = shown_items['impressions'].value_counts()
    click_rate = n_clicks.divide(n_views)

    # Pair the hotels with their prices and keep a single row per hotel.
    price_table = pd.DataFrame(
        {'reference': shown_items.impressions, 'Price': shown_prices.prices}
    )
    price_table = price_table.drop_duplicates('reference')

    # Combine clicks, views and CTR of every hotel into one table.
    stats = pd.DataFrame(
        {'Clicks': n_clicks, 'Views': n_views, "CTR": click_rate}
    )
    stats = stats.rename_axis('reference').reset_index()
    stats = stats.drop_duplicates('reference').dropna()

    # Attach the prices to the click statistics.
    ranked = pd.merge(stats, price_table, on='reference')
    ranked = ranked.sort_values('Clicks', ascending=False)
    mean_price = ranked["Price"].mean()

    # Keep the rows where CTR is above 0.5 and the price is at most average.
    is_selected = (ranked.CTR > 0.50) & (ranked.Price <= mean_price)
    selected = ranked.loc[is_selected]

    print(selected)
    print(str(time.time() - started_at))

    return selected
66.
67.
def string_to_array(s):
    """Convert pipe separated string to array.

    :param s: Pipe separated string, or NaN for a missing value
    :return: List of the substrings between the pipes; empty list for NaN
    :raises ValueError: If s is neither a string nor NaN
    """

    if isinstance(s, str):
        return s.split("|")

    try:
        is_missing = math.isnan(s)
    except TypeError:
        # math.isnan rejects non-numeric input (None, list, ...); report it
        # with the same ValueError as every other unsupported value.
        is_missing = False

    if is_missing:
        return []

    raise ValueError("Value must be either string or nan")
78.
79.
def explode(df_in, col_expl):
    """Explode column col_expl of array type into multiple rows.

    Every value of col_expl is split with string_to_array; the row is then
    repeated once per element, and the element itself is converted to int.
    Rows whose value is NaN contribute no output rows. The input frame is
    not modified.

    :param df_in: Data frame containing the column col_expl
    :param col_expl: Name of the column holding pipe separated integers
    :return: New data frame with a fresh 0..n-1 index, the remaining columns
        in their original order, and col_expl as the last column (int64)
    """

    arrays = [string_to_array(value) for value in df_in[col_expl]]
    # Explicit int dtype so that an empty frame still yields valid repeats.
    lengths = np.array([len(items) for items in arrays], dtype=int)
    # A plain comprehension also works when there is nothing to flatten,
    # where np.concatenate would raise.
    flat = [int(item) for items in arrays for item in items]

    df_out = pd.DataFrame(
        {col: np.repeat(df_in[col].values, lengths)
         for col in df_in.columns.drop(col_expl)},
        index=pd.RangeIndex(len(flat)),
    )

    # Assign the column directly (not through .loc) so it gets a real
    # integer dtype instead of being written into an object column in place.
    df_out[col_expl] = pd.Series(flat, index=df_out.index, dtype="int64")

    return df_out
96.
97.
def group_concat(df, gr_cols, col_concat):
    """Concatenate multiple rows into one.

    :param df: Data frame to aggregate
    :param gr_cols: Columns to group by
    :param col_concat: String column whose values are joined per group
    :return: Data frame with one row per group, holding the group columns
        and col_concat as a single space separated string
    """

    per_group = df.groupby(gr_cols)[col_concat]
    joined = per_group.apply(" ".join)

    return joined.to_frame().reset_index()
110.
111.
def calc_recommendation(df_expl, df_pop):
    """Calculate recommendations based on popularity of items.

    The final data frame will have an impression list sorted according to
    the number of clicks per item in a reference data frame.

    :param df_expl: Data frame with exploded impression list; must contain
        the GR_COLS columns and "impressions"
    :param df_pop: Data frame with items ("reference") and number of clicks
        ("Clicks")
    :return: Data frame with the GR_COLS columns and "item_recommendations",
        the space separated impression list sorted according to popularity
        in df_pop
    """

    # Left join keeps impressions that have no entry in df_pop; their
    # Clicks value is NaN.
    df_expl_clicks = df_expl[GR_COLS + ["impressions"]].merge(
        df_pop,
        left_on="impressions",
        right_on="reference",
        how="left",
    )

    # Ascending on every grouping column, descending on clicks. Derived
    # from GR_COLS so the flag list always matches the sort key length.
    sort_ascending = [True] * len(GR_COLS) + [False]

    df_sorted = (
        df_expl_clicks
        .assign(impressions=lambda x: x["impressions"].apply(str))
        .sort_values(GR_COLS + ["Clicks"], ascending=sort_ascending)
    )

    df_out = group_concat(df_sorted, GR_COLS, "impressions")

    return df_out.rename(columns={'impressions': 'item_recommendations'})
141.
142.
def main():
    """Run the popularity baseline and write the submission file.

    Reads the train and test CSV files, derives the popular items from the
    train set, finds the test click-outs with a missing reference, sorts
    their impression lists by popularity and writes the result as CSV.
    """
    train_csv = '../../data/newsmall/train.csv'
    test_csv = '../../data/newsmall/test.csv'
    subm_csv = '../../data/newsmall/submission_popular.csv'

    # Load both data sets; every later step works on these frames.
    df_train = pd.read_csv(train_csv)
    df_test = pd.read_csv(test_csv)

    print("Get popular items...")
    df_popular = get_popularity(df_train)

    print("Identify target rows...")
    df_target = get_submission_target(df_test)

    print("Get recommendations...")
    df_expl = explode(df_target, "impressions")
    df_out = calc_recommendation(df_expl, df_popular)

    # print(f"Writing {subm_csv}...")
    df_out.to_csv(subm_csv, index=False)

    print("Finished calculating recommendations.")
167.
168.
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
# RAW Paste Data (Pastebin page residue, not part of the script)