Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy as np
import pandas as pd
# Load the raw dataset. Column names are supplied explicitly, so the first
# line of the file is read as data: a user name followed by Q1..Q10.
dataset_array = pd.read_csv(
    # Raw string: '\p' and '\d' are not valid escape sequences; newer Python
    # versions warn about them and will eventually reject them.
    r'C:\python hz\dataset.csv',
    delimiter=':',
    names=['NameOfUser', 'Q1', 'Q2',
           'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8',
           'Q9', 'Q10'],
)
# ----------------------------------------------------------------------------------------------
# DATA RESTORATION
# Replace every user name with an anonymous label 'User<N>'. N is handed out
# in order of first appearance, starting at 1; ID 0 is reserved for the empty
# name. Every occurrence of the same name receives the same N.
user_ids = {'': 0}
id_column = []
for raw_name in dataset_array['NameOfUser']:
    # NOTE(review): a missing name (NaN) is treated like the empty name and
    # therefore becomes 'User0' -- confirm this is the intended handling.
    name = '' if pd.isna(raw_name) else raw_name
    # setdefault returns the existing ID for a known name and registers the
    # next free ID for a new one, so the lookup is a single dict access.
    id_column.append(user_ids.setdefault(name, len(user_ids)))
# Whole-column assignment instead of per-cell chained indexing, which does
# not reliably write back into the frame.
dataset_array['NameOfUser'] = id_column
# Sort on the numeric ID *before* adding the text prefix so that User2 comes
# before User10. A stable sort keeps the rows of one user in file order.
dataset_array = dataset_array.sort_values('NameOfUser', kind='stable')
dataset_array['NameOfUser'] = 'User' + dataset_array['NameOfUser'].astype(str)
# ----------------------------------------------------------------------------------------------
# DATA NORMALIZATION
# Force every answer into the allowed range 1..10: anything above 10 becomes
# 10 and anything below 1 becomes 1.
# NOTE(review): assumes the Q columns were parsed as numbers -- confirm
# against the input file.
answer_columns = ['Q' + str(number) for number in range(1, 11)]
# One vectorised, whole-column assignment; per-cell chained indexing
# (frame[col][row] = value) does not reliably write back into the frame.
dataset_array[answer_columns] = dataset_array[answer_columns].clip(lower=1, upper=10)
# ----------------------------------------------------------------------------------------------
# CLEANING DIRTY DATA
# REMOVING ROWS WHERE ALL COLUMNS ARE IDENTICAL
# A row whose ten answers are all the same value is discarded.
question_columns = ['Q' + str(number) for number in range(1, 11)]
# Answers are compared as integers.
answers = dataset_array[question_columns].astype(int)
# True for a row when every answer equals that row's Q1.
is_uniform_row = answers.eq(answers['Q1'], axis=0).all(axis=1)
# Filter once with a boolean mask (dropping row by row would copy the whole
# frame for every removed row), then renumber the remaining rows 0..n-1.
dataset_array = dataset_array[~is_uniform_row].reset_index(drop=True)
# ----------------------------------------------------------------------------------------------
# REMOVING DUPLICATE ROWS
# Two rows count as duplicates when all ten answers match; the user name is
# not part of the comparison. The first occurrence is kept.
compared_columns = ['Q' + str(number) for number in range(1, 11)]
# Comparing the values column by column avoids building separator-less text
# keys (where adjacent answers can run together) and scanning a list per row.
dataset_array = dataset_array.drop_duplicates(subset=compared_columns, keep='first')
# Renumber the remaining rows 0..n-1.
dataset_array = dataset_array.reset_index(drop=True)
# ----------------------------------------------------------------------------------------------
# Min-max scale every answer column to the range 0..1, print the result and
# write it to a new CSV file.
for column in ['Q' + str(number) for number in range(1, 11)]:
    answers = dataset_array[column]
    lowest = answers.min()
    spread = float(answers.max() - lowest)
    if spread == 0:
        # A constant column has no range to scale by; emit 0.0 for every row
        # instead of dividing by zero.
        scaled = answers * 0.0
    else:
        scaled = (answers - lowest) / spread
    # Store the first 8 characters of each value's decimal text, e.g.
    # '0.666666' -- that is, the value truncated (not rounded) to 6 decimals.
    # The column is replaced as a whole rather than cell by cell, because
    # chained indexing does not reliably write back into the frame.
    dataset_array[column] = scaled.map(lambda number: str(number)[:8])
print(dataset_array)
# Raw string: '\p' and '\d' are not valid escape sequences.
dataset_array.to_csv(r'C:\python hz\dataset2.csv', index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement