Advertisement
Guest User

Untitled

a guest
Nov 11th, 2019
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.16 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. import random
  4. import time
  5.  
  6. start_time = time.time()
  7. data = pd.read_csv("podstawa.csv", sep=',', engine='python')
  8. print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"+str(time.time() - start_time))
  9. print(data.info())
  10. print(data.values[0])
  11. print(data.values[15])
  12. uniquelist = data['session_id'].unique().tolist()
  13. d = int(len(uniquelist)*3/4)
  14. trainlist = uniquelist[:d]
  15. testlist = uniquelist[d:]
  16. #df = data
  17. print(len(data['session_id']))
  18. print(len(uniquelist))
  19. print(len(trainlist))
  20. print(len(testlist))
  21. print(testlist[0])
  22. print(testlist[1])
  23. ll = []
  24. #data = data[data['session_id'].isin(ll)]
  25. #len(uniquelist)*0,25
  26. ii = len(trainlist)/5
  27. print("!!!!!!!!!!")
  28. print(ii)
  29. for i in range(int(ii)):
  30. x = random.choice(trainlist)
  31. ll.append(x)
  32. trainlist.remove(x)
  33. print(len(testlist))
  34. print(len(ll))
  35. dt = data.copy()
  36. trainset = dt[dt['session_id'].isin(ll)]
  37. print("@@@@@@@@@@@@@@@@@@@@@@@@@@@")
  38. print(trainset.info())
  39. #tainset ^^^
  40. print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"+str(time.time() - start_time))
  41. ll = []
  42. ii = len(testlist)/5
  43. print("!!!!!!!!!!")
  44. print(ii)
  45. for i in range(int(ii)):
  46. x = random.choice(testlist)
  47. ll.append(x)
  48. testlist.remove(x)
  49. testset = data[data['session_id'].isin(ll)]
  50. gtset = testset.copy()
  51. print(testset.info())
  52. print(len(testset['session_id']))
  53. print(testset['session_id'])
  54. print(testset['session_id'].value_counts())
  55. print("010")
  56. w = testset['session_id'].value_counts().to_frame()
  57. print(w)
  58. for i in range(len(testset['session_id'])):
  59. print(testset.values[i][5])
  60. print("!!!!")
  61. for i in range(len(w)):
  62. testset.loc[((testset['session_id'] == w.index.values[i]) & (testset['step'] == w['session_id'].values[i])), 'reference'] = ''
  63. for i in range(len(testset['session_id'])):
  64. print(testset.values[i][5])
  65.  
  66. trainset.to_csv('C:\\Users\\Nico\\Desktop\\SR\\1\\1\\data\\train.csv')
  67. gtset.to_csv('C:\\Users\\Nico\\Desktop\\SR\\1\\1\\data\\GroundTruth.csv')
  68. testset.to_csv('C:\\Users\\Nico\\Desktop\\SR\\1\\1\\data\\test.csv')
  69. print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"+str(time.time() - start_time))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement