Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Build train, test and ground-truth CSV files from a session log.

The script reads ``podstawa.csv``, splits the distinct ``session_id`` values
(in order of first appearance) into a train pool (first 3/4) and a test pool
(remaining 1/4), randomly keeps one fifth of each pool, and writes:

* ``train.csv``       - all rows of the sampled train sessions,
* ``GroundTruth.csv`` - all rows of the sampled test sessions, unmodified,
* ``test.csv``        - the same test rows with ``reference`` blanked on the
  row whose ``step`` equals the number of rows in its session.

Progress and debug information is printed to stdout along the way.
"""
import random
import time

import numpy as np
import pandas as pd

SOURCE_CSV = "podstawa.csv"
TRAIN_CSV = 'C:\\Users\\Nico\\Desktop\\SR\\1\\1\\data\\train.csv'
GROUND_TRUTH_CSV = 'C:\\Users\\Nico\\Desktop\\SR\\1\\1\\data\\GroundTruth.csv'
TEST_CSV = 'C:\\Users\\Nico\\Desktop\\SR\\1\\1\\data\\test.csv'

# Marker printed in front of every elapsed-time report.
BANNER = "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"


def split_sessions(session_ids: list, train_fraction: float = 0.75) -> tuple:
    """Split *session_ids* in order into ``(train_pool, test_pool)``.

    The first ``int(len(session_ids) * train_fraction)`` ids form the train
    pool and the rest form the test pool. The input list is not modified.
    """
    cut = int(len(session_ids) * train_fraction)
    return session_ids[:cut], session_ids[cut:]


def sample_sessions(session_ids: list, divisor: int = 5, rng=None) -> list:
    """Return ``int(len(session_ids) / divisor)`` ids drawn without replacement.

    Args:
        session_ids: Pool of ids to draw from; it is left untouched.
        divisor: The pool size is divided by this to get the sample size.
        rng: Optional ``random.Random`` instance for reproducible draws; the
            module-level generator is used when omitted.
    """
    chooser = random if rng is None else rng
    # random.sample draws distinct positions in one call, replacing the
    # quadratic "choice + list.remove" loop.
    return chooser.sample(session_ids, int(len(session_ids) / divisor))


def hide_last_reference(sessions: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *sessions* with the target ``reference`` blanked.

    In every session the row whose ``step`` equals the number of rows of that
    session gets ``reference`` set to ``''``.
    NOTE(review): this selects the final step only if steps are numbered
    1..n without gaps inside a session - confirm against the data.

    The input frame is not modified, which also avoids pandas'
    chained-assignment ambiguity when writing into a filtered frame.
    """
    hidden = sessions.copy()
    session_sizes = hidden['session_id'].value_counts()
    # Broadcast each session's row count onto its rows so that a single
    # vectorized comparison replaces one full-frame mask per session. Using
    # the Series directly also avoids depending on the column name that
    # value_counts().to_frame() produces, which changed in pandas 2.0.
    rows_in_session = hidden['session_id'].map(session_sizes)
    hidden.loc[hidden['step'] == rows_in_session, 'reference'] = ''
    return hidden


def print_sixth_column(frame: pd.DataFrame) -> None:
    """Print the value at column position 5 for every row of *frame*.

    NOTE(review): position 5 is presumably the 'reference' column - confirm.
    """
    # Read the column once instead of rebuilding frame.values for every row.
    for value in frame.iloc[:, 5]:
        print(value)


def print_elapsed(start_time: float) -> None:
    """Print the banner followed by the seconds elapsed since *start_time*."""
    print(BANNER + str(time.time() - start_time))


def main() -> None:
    """Run the whole pipeline: load, split, sample, mask and write the CSVs."""
    start_time = time.time()
    data = pd.read_csv(SOURCE_CSV, sep=',', engine='python')
    print_elapsed(start_time)
    # info() prints its report itself and returns None, so this also prints
    # a "None" line; kept so the script's output stays unchanged.
    print(data.info())
    rows = data.values  # materialize once for both debug prints
    print(rows[0])
    print(rows[15])

    uniquelist = data['session_id'].unique().tolist()
    trainlist, testlist = split_sessions(uniquelist)
    print(len(data['session_id']))
    print(len(uniquelist))
    print(len(trainlist))
    print(len(testlist))
    print(testlist[0])
    print(testlist[1])

    # --- train set -------------------------------------------------------
    print("!!!!!!!!!!")
    print(len(trainlist) / 5)
    train_ids = sample_sessions(trainlist)
    print(len(testlist))
    print(len(train_ids))
    # Boolean indexing already yields a new frame; no need to copy `data`.
    trainset = data[data['session_id'].isin(train_ids)]
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@")
    print(trainset.info())
    print_elapsed(start_time)

    # --- test set and ground truth ---------------------------------------
    print("!!!!!!!!!!")
    print(len(testlist) / 5)
    test_ids = sample_sessions(testlist)
    gtset = data[data['session_id'].isin(test_ids)]
    print(gtset.info())
    print(len(gtset['session_id']))
    print(gtset['session_id'])
    session_sizes = gtset['session_id'].value_counts()
    print(session_sizes)
    print("010")
    print(session_sizes.to_frame())
    print_sixth_column(gtset)
    print("!!!!")
    testset = hide_last_reference(gtset)
    print_sixth_column(testset)

    trainset.to_csv(TRAIN_CSV)
    gtset.to_csv(GROUND_TRUTH_CSV)
    testset.to_csv(TEST_CSV)
    print_elapsed(start_time)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement