Advertisement
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
- import math
- import pandas as pd
- import numpy as np
# Columns used as the grouping / sort key when building recommendations.
GR_COLS = [
    "user_id",
    "session_id",
    "timestamp",
    "step",
]
def get_submission_target(df):
    """Return the clickout rows whose reference is missing.

    A row is kept when its ``action_type`` equals ``"clickout item"`` and
    its ``reference`` value is null.

    :param df: Data frame with ``reference`` and ``action_type`` columns
    :return: The matching rows of ``df``, with all columns and the original index
    """
    is_clickout = df["action_type"] == "clickout item"
    lacks_reference = df["reference"].isnull()
    return df[lacks_reference & is_clickout]
def get_popularity(df):
    """Build a table of frequently clicked, not-too-expensive items.

    Only rows with ``action_type == "clickout item"`` are used. For every
    item that appears in an impression list the function computes:

    * ``n_apperarances`` - how many times the item was shown,
    * ``n_clicks`` - how many clickouts reference it (0 when none),
    * ``CTR`` - ``n_clicks / n_apperarances * 100`` truncated to an integer,
    * ``Price`` - the first price paired with the item in exploded order.

    The result keeps only items with at least one click, ``CTR > 50`` and a
    price not above the cutoff, where the cutoff is the unique price at
    position ``int(0.75 * number_of_unique_prices)`` in ascending order.
    The filtered table is printed (ordered by ``reference``) as a side
    effect and returned ordered by ``n_clicks``, highest first.

    NOTE(review): items and prices are paired row by row, which assumes
    every ``impressions`` list has a matching ``prices`` list of the same
    length - confirm against the data.

    :param df: Data frame with ``action_type``, ``reference``,
        ``impressions`` and ``prices`` columns
    :return: Data frame with columns ``reference``, ``n_apperarances``,
        ``n_clicks``, ``CTR`` and ``Price``
    """
    # Keep clickouts only.
    clickouts = df[df.action_type == "clickout item"]

    # Count the clicks each item received.
    clicks_per_item = clickouts.groupby("reference").size()
    clicks_per_item = clicks_per_item.reset_index(name="n_clicks")
    clicks_per_item = clicks_per_item.transform(lambda column: column.astype(int))

    # Count how many times each item was displayed.
    shown = explode(clickouts, "impressions")
    shown_per_item = shown.groupby("impressions").size()
    shown_per_item = shown_per_item.reset_index(name="n_apperarances")
    shown_per_item = shown_per_item.transform(lambda column: column.astype(int))

    # Split the price column and collect all unique prices in ascending order.
    priced = explode(clickouts, "prices")
    distinct_prices = (
        priced
        .drop_duplicates(["prices"])
        .sort_values(["prices"])
        .reset_index(drop=True)
    )
    sorted_prices = distinct_prices['prices'].tolist()

    # Pair items with prices; the first price seen for an item wins.
    item_prices = pd.DataFrame(
        {'reference': shown.impressions, 'Price': priced.prices}
    )
    item_prices = item_prices.drop_duplicates(["reference"])

    # Attach click counts to the displayed items; never-clicked items get 0.
    shown_per_item = shown_per_item.rename(columns={"impressions": "reference"})
    stats = pd.merge(shown_per_item, clicks_per_item, on='reference', how='left')
    stats['n_clicks'] = stats['n_clicks'].fillna(0)

    # Click-through rate in percent, truncated to an integer.
    ctr_percent = stats.n_clicks / stats.n_apperarances * 100
    stats['CTR'] = ctr_percent
    stats['CTR'] = stats['CTR'].transform(lambda values: values.astype(int))

    # Attach prices.
    stats = pd.merge(stats, item_prices, on='reference')

    # Upper price threshold.
    price_cap = sorted_prices[int(distinct_prices.shape[0] * 0.75)]

    # Keep clicked items with CTR above 50 and a price within the threshold.
    was_clicked = stats.n_clicks != 0
    high_ctr = stats.CTR > 50
    affordable = stats.Price <= price_cap
    stats = stats[was_clicked & high_ctr & affordable]

    stats = stats.sort_values(["n_clicks"], ascending=False)
    print(stats.sort_values(["reference"]))
    return stats
def string_to_array(s):
    """Convert a pipe separated string to a list of its parts.

    Missing values (``None`` or a NaN number) are converted to an empty
    list so that they contribute no items.

    :param s: Pipe separated string, ``None`` or NaN
    :return: List of substrings, or ``[]`` for a missing value
    :raises ValueError: If ``s`` is neither a string nor a missing value
    """
    if isinstance(s, str):
        return s.split("|")

    try:
        is_missing = s is None or math.isnan(s)
    except TypeError:
        # math.isnan rejects non-numeric objects (lists, dicts, ...); report
        # them with the same ValueError as any other unsupported value.
        is_missing = False

    if is_missing:
        return []
    raise ValueError("Value must be either string or nan")
def explode(df_in, col_expl):
    """Explode the pipe separated column ``col_expl`` into one row per item.

    Each value of ``col_expl`` is split with ``string_to_array``; every
    other column is repeated once per resulting item, and the items
    themselves are converted with ``int`` and stored in ``col_expl`` as the
    last column. Rows whose value is missing produce no output rows. The
    input frame is not modified and the result has a fresh default index.

    :param df_in: Data frame containing ``col_expl``
    :param col_expl: Name of the column holding pipe separated integers
    :return: New data frame with one row per item of ``col_expl``
    :raises ValueError: If a value is not a string/missing, or an item
        cannot be converted to ``int``
    """
    arrays = [string_to_array(value) for value in df_in[col_expl]]
    # Explicit integer dtype so that an empty frame still yields valid
    # repeat counts.
    lengths = np.array([len(items) for items in arrays], dtype=np.intp)

    df_out = pd.DataFrame(
        {
            col: np.repeat(df_in[col].values, lengths)
            for col in df_in.columns.drop(col_expl)
        }
    )

    # Flatten in plain Python: np.concatenate raises on an empty sequence,
    # which made the function fail for frames without rows.
    flat_items = [int(item) for items in arrays for item in items]
    # Assign the column directly so it gets a real integer dtype; writing
    # through ``.loc[:, col]`` may keep the existing object dtype.
    df_out[col_expl] = np.array(flat_items, dtype=np.int64)
    return df_out
def group_concat(df, gr_cols, col_concat):
    """Join the string values of ``col_concat`` within each group.

    Rows are grouped by ``gr_cols`` and the values of ``col_concat`` in
    each group are joined with a single space, in row order.

    :param df: Data frame to aggregate
    :param gr_cols: Column name(s) to group by
    :param col_concat: Name of the string column to concatenate
    :return: Data frame with the group columns and the joined ``col_concat``
    """
    grouped_values = df.groupby(gr_cols)[col_concat]
    joined = grouped_values.apply(' '.join)
    return joined.to_frame().reset_index()
def calc_recommendation(df_expl, df_pop):
    """Calculate recommendations based on popularity of items.

    Every exploded impression is matched against ``df_pop`` and the
    impressions of each ``GR_COLS`` group are ordered by ``n_clicks``,
    highest first, then joined into one space separated string.

    :param df_expl: Data frame with exploded impression list
    :param df_pop: Data frame with items (``reference``) and their ``n_clicks``
    :return: Data frame with the ``GR_COLS`` columns and an
        ``item_recommendations`` column holding the sorted impression list
    """
    # Left join so impressions without an entry in df_pop are kept.
    impressions = df_expl[GR_COLS + ["impressions"]]
    with_clicks = impressions.merge(
        df_pop,
        left_on="impressions",
        right_on="reference",
        how="left",
    )

    # Items are joined as text later on, so turn them into strings first.
    with_clicks["impressions"] = with_clicks["impressions"].apply(str)

    # Group keys ascending, click counts descending within each group.
    ranked = with_clicks.sort_values(
        GR_COLS + ["n_clicks"],
        ascending=[True, True, True, True, False],
    )

    recommendations = group_concat(ranked, GR_COLS, "impressions")
    return recommendations.rename(columns={'impressions': 'item_recommendations'})
def main():
    """Run the popularity baseline end to end.

    Reads the train and test CSV files, derives the popular items from the
    train data, builds a recommendation list for every target row of the
    test data and writes the result to the submission CSV.
    """
    subm_csv = '../../data/newsmall5mln/submission_popular.csv'

    df_train = pd.read_csv('../../data/newsmall5mln/train.csv')
    df_test = pd.read_csv('../../data/newsmall5mln/test.csv')

    print("Get popular items...")
    popular_items = get_popularity(df_train)

    print("Identify target rows...")
    targets = get_submission_target(df_test)

    print("Get recommendations...")
    exploded_targets = explode(targets, "impressions")
    recommendations = calc_recommendation(exploded_targets, popular_items)

    recommendations.to_csv(subm_csv, index=False)
    print("Finished calculating recommendations.")


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement