Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import datetime
import re
import string

import datrie
import numpy as np
import pandas as pd
from matplotlib import pylab
# Load the raw search log, normalise it, and collect the row labels that
# should be dropped for users whose whole activity fits in a 10-second window.
#
# NOTE(review): the pasted original had lost all indentation; the nesting of
# the loop body below is reconstructed from the logic -- confirm against the
# author's intent (in particular where the two print calls belong).

# Malformed rows are skipped with a warning.  `on_bad_lines="warn"` replaces
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in
# pandas 2.0 (the old flag also warned by default, so behaviour is unchanged).
data = pd.read_csv("raw_search_data.csv", sep=";", on_bad_lines="warn")
data = data.dropna()

# Lower-case every query and keep only queries shorter than 50 characters.
data["query"] = data["query"].astype(str).str.lower()
# reset_index(drop=True) renumbers the rows 0..n-1 without leaving behind an
# extra "index" column that would have to be dropped afterwards.
data = data[data["query"].str.len() < 50].reset_index(drop=True)

# Parse timestamps strictly in "YYYY-MM-DD HH:MM:SS" form.
data["datetime"] = pd.to_datetime(data["datetime"], format="%Y-%m-%d %H:%M:%S")

uid_set = set(data["uid"])
idx_for_delete = []
burst_window = datetime.timedelta(seconds=10)

# One pass over the per-user groups instead of re-filtering the whole frame
# for every uid.  sort=False visits users in order of first appearance.
for i, (_, cur_data) in enumerate(data.groupby("uid", sort=False)):
    timestamps = cur_data["datetime"]
    if timestamps.max() - timestamps.min() <= burst_window:
        # Keep only this user's longest query (first one on ties) and mark
        # every other row of the user for deletion.
        longest_pos = int(np.argmax(cur_data["query"].str.len().to_numpy()))
        idxs = cur_data.index.delete(longest_pos)
        idx_for_delete.extend(idxs)
        print(idxs)
    # Progress counter: number of users processed so far (0-based).
    print(i)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement