# pajton

Dec 17th, 2019
187
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. import math
2. import pandas as pd
3. import numpy as np
4.
# Key columns used by calc_recommendation to select, sort and group rows.
GR_COLS = ["user_id", "session_id", "timestamp", "step"]
6.
7.
def get_submission_target(df):
    """Identify target rows with missing click outs.

    A target row is a ``clickout item`` action whose ``reference`` is null.

    :param df: Data frame with at least ``reference`` and ``action_type`` columns
    :return: Data frame holding only the target rows of ``df``
    """

    mask = df["reference"].isnull() & (df["action_type"] == "clickout item")
    # The result must be bound before it is returned; returning an unassigned
    # ``df_out`` raises NameError.
    df_out = df[mask]

    return df_out
15.
16.
def get_popularity(df):
    """Get click statistics for the items shown in the clickouts of df.

    Only ``clickout item`` rows are used. For every item found in their
    impression lists the function counts appearances and clicks, derives a
    click-through rate (CTR, in whole percent), attaches a price, and keeps
    only items that were clicked, have a CTR above 50 and a price at or below
    the price cutoff. The cutoff is the unique price found at index
    ``int(0.75 * number_of_unique_prices)`` of the sorted unique prices.

    The filtered frame is also printed (sorted by ``reference``) as a
    diagnostic.

    :param df: Data frame with ``action_type``, ``reference``, ``impressions``
        and ``prices`` columns
    :return: Data frame with columns ``reference``, ``n_apperarances``,
        ``n_clicks``, ``CTR`` and ``Price``, sorted by ``n_clicks`` descending;
        empty (same columns) when ``df`` has no usable clickouts
    """

    result_columns = ["reference", "n_apperarances", "n_clicks", "CTR", "Price"]

    # Keep only the clickouts.
    df_clickout = df[df.action_type == "clickout item"]
    if df_clickout.empty:
        # Nothing to explode: return an empty result instead of crashing.
        return pd.DataFrame(columns=result_columns)

    # Count the clicks each item received.
    df_item_clicks = (
        df_clickout
        .groupby("reference")
        .size()
        .reset_index(name="n_clicks")
        .astype(int)
    )

    # Count how many times each item was displayed.
    hotels = explode(df_clickout, "impressions")
    hotels_grouped = (
        hotels
        .groupby("impressions")
        .size()
        .reset_index(name="n_apperarances")
        .astype(int)
        .rename(columns={"impressions": "reference"})
    )

    # Split the price column and collect every distinct price, ascending.
    prices = explode(df_clickout, "prices")
    unique_prices_list = sorted(prices["prices"].drop_duplicates().tolist())
    if not unique_prices_list:
        # No prices at all, so no cutoff can be computed.
        return pd.DataFrame(columns=result_columns)

    # Pair items with prices by position in the two exploded frames.
    # NOTE(review): assumes each row lists as many prices as impressions,
    # in the same order -- confirm against the data.
    hotel_prices_df = pd.DataFrame(
        {"reference": hotels.impressions, "Price": prices.prices}
    ).drop_duplicates(["reference"])

    # Join appearances with clicks; items never clicked get 0 clicks.
    hotels_info = pd.merge(hotels_grouped, df_item_clicks,
                           on="reference", how="left")
    hotels_info["n_clicks"] = hotels_info["n_clicks"].fillna(0)

    # Add the CTR, truncated to a whole percent.
    hotels_info["CTR"] = (
        hotels_info.n_clicks / hotels_info.n_apperarances * 100
    ).astype(int)

    # Add the item prices.
    hotels_info = pd.merge(hotels_info, hotel_prices_df, on="reference")

    # Compute the upper price threshold.
    price_cuttoff = unique_prices_list[int(len(unique_prices_list) * 0.75)]

    # Keep only rows with a CTR above 50 and a price at or below the threshold.
    keep = (
        (hotels_info.n_clicks != 0)
        & (hotels_info.CTR > 50)
        & (hotels_info.Price <= price_cuttoff)
    )
    hotels_info = hotels_info[keep].sort_values(["n_clicks"], ascending=False)

    print(hotels_info.sort_values(["reference"]))
    return hotels_info
69.
70.
def string_to_array(s):
    """Convert pipe separated string to array.

    :param s: Pipe separated string, or a NaN marking a missing value
    :return: List of the substrings between pipes; empty list for NaN
    :raises ValueError: If ``s`` is neither a string nor NaN
    """

    if isinstance(s, str):
        return s.split("|")

    # math.isnan raises TypeError for non-numeric input such as None or a
    # list; treat that as "not NaN" so the documented ValueError is raised.
    try:
        is_missing = math.isnan(s)
    except TypeError:
        is_missing = False

    if is_missing:
        return []
    raise ValueError("Value must be either string or nan")
81.
82.
def explode(df_in, col_expl):
    """Explode column col_expl of pipe separated integers into multiple rows.

    Every row of ``df_in`` is repeated once per element of its ``col_expl``
    value; rows whose value is NaN produce no output rows. ``df_in`` itself
    is not modified.

    :param df_in: Input data frame
    :param col_expl: Name of the column holding pipe separated integer
        strings (or NaN)
    :return: New data frame with a default index, the remaining columns
        repeated and ``col_expl`` (placed last) holding one int per row
    """

    lists = [string_to_array(value) for value in df_in[col_expl]]
    # Explicit integer dtype keeps np.repeat working for an empty frame.
    lengths = np.fromiter((len(items) for items in lists),
                          dtype=np.intp, count=len(lists))

    df_out = pd.DataFrame(
        {col: np.repeat(df_in[col].values, lengths)
         for col in df_in.columns.drop(col_expl)}
    )

    # Flatten by hand rather than with np.concatenate, which raises when
    # there is nothing to concatenate. Plain column assignment (instead of
    # ``.loc[:, col] = ...``) guarantees the column ends up with an integer
    # dtype on every pandas version.
    df_out[col_expl] = np.array(
        [int(item) for items in lists for item in items], dtype=np.int64
    )

    return df_out
99.
100.
def group_concat(df, gr_cols, col_concat):
    """Concatenate multiple rows into one.

    Rows of ``df`` are grouped by ``gr_cols``; within each group the values
    of ``col_concat`` are joined, in row order, into one space separated
    string.

    :param df: Input data frame
    :param gr_cols: Column name or list of column names to group by
    :param col_concat: Name of the column whose values are joined
    :return: Data frame with the group columns and ``col_concat`` holding the
        joined string, one row per group
    """

    df_out = (
        df
        .groupby(gr_cols)[col_concat]
        # map(str, ...) lets non-string values (e.g. integer item ids) be
        # joined as well; strings pass through unchanged.
        .apply(lambda values: ' '.join(map(str, values)))
        .to_frame()
        .reset_index()
    )

    return df_out
113.
114.
def calc_recommendation(df_expl, df_pop):
    """Calculate recommendations based on popularity of items.

    The final data frame will have an impression list sorted according to
    the number of clicks per item in a reference data frame. Impressions
    with no match in ``df_pop`` get a missing click count and are therefore
    placed after the matched ones.

    :param df_expl: Data frame with exploded impression list; must contain
        the ``GR_COLS`` columns and ``impressions``
    :param df_pop: Data frame with items (``reference``) and number of clicks
        (``n_clicks``)
    :return: Data frame with the ``GR_COLS`` columns and
        ``item_recommendations``, the space separated impression list sorted
        according to popularity in df_pop
    """

    df_expl_clicks = (
        df_expl[GR_COLS + ["impressions"]]
        .merge(df_pop,
               left_on="impressions",
               right_on="reference",
               how="left")
    )

    # Group keys ascending, click count descending. Derived from GR_COLS so
    # the direction list cannot fall out of sync with the key list.
    sort_directions = [True] * len(GR_COLS) + [False]

    df_sorted = (
        df_expl_clicks
        .assign(impressions=lambda x: x["impressions"].apply(str))
        .sort_values(GR_COLS + ["n_clicks"], ascending=sort_directions)
    )

    df_out = group_concat(df_sorted, GR_COLS, "impressions")

    return df_out.rename(columns={'impressions': 'item_recommendations'})
144.
def main():
    """Build a popularity based submission file from the train and test CSVs.

    Reads the train and test sets, computes item popularity on the train
    set, sorts the impressions of every target row of the test set by that
    popularity and writes the result to the submission CSV.
    """

    train_csv = '../../data/newsmall5mln/train.csv'
    test_csv = '../../data/newsmall5mln/test.csv'
    subm_csv = '../../data/newsmall5mln/submission_popular.csv'

    # df_train and df_test are used below, so they have to be loaded here;
    # without these reads the function fails with NameError.
    df_train = pd.read_csv(train_csv)
    df_test = pd.read_csv(test_csv)

    print("Get popular items...")
    df_popular = get_popularity(df_train)

    print("Identify target rows...")
    df_target = get_submission_target(df_test)

    print("Get recommendations...")
    df_expl = explode(df_target, "impressions")
    df_out = calc_recommendation(df_expl, df_popular)

    df_out.to_csv(subm_csv, index=False)

    print("Finished calculating recommendations.")
170.
171.
# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
RAW Paste Data