Advertisement
Fenny_Theo

csc1001 group project part1

May 14th, 2020
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.02 KB | None | 0 0
  1. # Part 1: load the input data
  2. # `data` is a list that holds each of the 2039 training rows as a dictionary
  3. import os
  # Directory prefix for data files; left empty and not referenced by the code visible here.
  4. dataDir=""
  # Absolute, machine-specific path of the training CSV -- presumably the argument given to parse_file(); confirm against the caller.
  5. dataFile="D:\\A-Good-Place\\Python\\.vscode\\CSC1001\\src\\train.csv"
  def parse_file(datafile):
      """Read a comma-separated file into a list of per-row dictionaries.

      The first line of the file is the header. Every following non-blank row
      becomes a dict that maps the stripped header name of each column to the
      stripped text of the corresponding field. All values stay strings.

      Args:
          datafile: Path of the CSV file; it is opened as UTF-8 text.

      Returns:
          A list with one dict per data row, in file order. A row that is
          shorter than the header only gets keys for the columns it contains.
          An empty file yields an empty list.

      Raises:
          IndexError: If a data row has more fields than the header.
      """
      import csv  # local import so this function does not rely on file-level imports

      data = []
      # newline="" lets the csv reader deal with line endings itself, which is
      # required for quoted fields that span lines.
      with open(datafile, "r", encoding="UTF-8", newline="") as f:
          reader = csv.reader(f)
          # Strip the header names once instead of on every field of every row.
          header = [name.strip() for name in next(reader, [])]
          for fields in reader:
              if not fields:
                  # A completely blank line carries no data; skip it rather
                  # than emitting a bogus one-key row.
                  continue
              data.append(
                  {header[i]: value.strip() for i, value in enumerate(fields)}
              )
      return data
  21.  
  def splitDataSet(data, axis, value):
      """Select the rows whose entry at ``axis`` is at least ``value`` and drop that column.

      Args:
          data: Iterable of rows; each row is a list-like sequence that supports
              indexing and slicing.
          axis: Index of the column to test and remove. Negative indices count
              from the end of the row, as in ordinary Python indexing.
          value: Threshold; a row is kept when ``row[axis] >= value``.

      Returns:
          A new list containing, for every kept row, a new list with the
          ``axis`` column removed. The input rows are not modified.
      """
      # NOTE(review): if the entries are strings (e.g. straight from a CSV
      # parser) the comparison is lexicographic, so '10' < '6'. Convert to
      # numbers before calling if numeric ordering is intended.
      retDataSet = []
      for featVec in data:
          if featVec[axis] >= value:
              # Normalise a negative axis before slicing: with axis == -1 the
              # slice featVec[axis + 1:] is featVec[0:], i.e. the whole row,
              # which would duplicate the row instead of dropping the column.
              idx = axis if axis >= 0 else len(featVec) + axis
              retDataSet.append(list(featVec[:idx]) + list(featVec[idx + 1:]))
      return retDataSet
  30.  
  31.  
  32.  
  33. # Part 2: build the tree
  def cart_chooseTheBestFeatureToSplit(data, labelThreshold='6'):
      """Return the index of the feature with the lowest summed weighted Gini score.

      Every column except the last is treated as a feature; the last column is
      the label. For each distinct value ``v`` of a feature, the rows with
      ``row[feature] >= v`` form a subset. The subset's weight is its share of
      all rows, and its impurity is the binary Gini index
      ``1 - q**2 - (1 - q)**2``, where ``q`` is the fraction of subset rows
      whose label is ``>= labelThreshold``. The feature score is the sum of
      weight * impurity over its distinct values.

      Args:
          data: Sequence of rows (list-like), all with the label in the last
              position.
          labelThreshold: Label value at or above which a row counts as
              positive. Defaults to ``'6'``, the value previously hard-coded.

      Returns:
          The index of the best feature (the first one wins on ties), or -1
          when ``data`` is empty or the rows have no feature columns.
      """
      if not data:
          # Nothing to split on; -1 is the "no feature" sentinel used below.
          return -1

      total = float(len(data))
      numFeature = len(data[0]) - 1  # the last column is the label, not a feature
      bestGini = 9999
      bestFeature = -1
      for i in range(numFeature):
          gini = 0.0
          # Distinct values taken by feature i.
          for value in {example[i] for example in data}:
              # Labels of the rows selected by this value (row[i] >= value).
              # Counting here is equivalent to taking the length of the
              # filtered row lists, without copying any rows.
              labels = [example[-1] for example in data if example[i] >= value]
              if not labels:
                  # Only possible for values that do not compare equal to
                  # themselves (e.g. NaN); avoid dividing by zero.
                  continue
              p = len(labels) / total
              positives = sum(1 for label in labels if label >= labelThreshold)
              subp = positives / float(len(labels))
              # ** is exponentiation; ^ is bitwise XOR and fails on floats.
              gini += p * (1.0 - subp ** 2 - (1 - subp) ** 2)
          if gini < bestGini:
              bestGini = gini
              bestFeature = i
      return bestFeature
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement