Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Part 1: load the data
- # `data` is a list holding all 2039 training rows, each stored as a dictionary
- import os
- dataDir=""  # not referenced anywhere in the visible code — TODO confirm whether it is still needed
- dataFile="D:\\A-Good-Place\\Python\\.vscode\\CSC1001\\src\\train.csv"  # absolute, machine-specific path to train.csv; presumably the argument for parse_file — verify against caller
def parse_file(datafile):
    """Read a comma-separated file into a list of row dictionaries.

    The first line is treated as the header. Every following non-empty
    line becomes one dictionary mapping each whitespace-stripped header
    name to the whitespace-stripped field found at the same position.

    Args:
        datafile: Path of the CSV file; it is opened as UTF-8 text.

    Returns:
        A list with one dict per data row, in file order. A row with
        fewer fields than the header yields a dict with fewer keys;
        fields beyond the header width are ignored. An empty file
        yields an empty list.
    """
    # Function-scope import so the module's top-level imports stay untouched.
    import csv

    # newline="" lets the csv module do its own line-ending handling.
    with open(datafile, "r", encoding="UTF-8", newline="") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None:
            return []
        columns = [name.strip() for name in header]
        # csv.reader yields an empty list for blank lines; skipping them
        # avoids emitting a bogus entry such as {first_column: ""}.
        # zip() stops at the shorter sequence, so a row that is wider than
        # the header can no longer raise IndexError.
        return [
            {column: value.strip() for column, value in zip(columns, row)}
            for row in reader
            if row
        ]
def splitDataSet(data, axis, value):
    """Keep rows whose feature at ``axis`` is >= ``value`` and drop that feature.

    Args:
        data: Iterable of rows; each row is an indexable, sliceable
            sequence (e.g. a list).
        axis: Index of the feature to test and then remove. Negative
            indices count from the end of the row, as in normal Python
            indexing.
        value: Threshold. A row is kept when ``row[axis] >= value``,
            using whatever ordering the stored values define (strings
            compare lexicographically).

    Returns:
        A new list of new lists: every kept row without its ``axis``
        element. Its length is the number of rows that passed the test.
    """
    retDataSet = []
    for featVec in data:
        if featVec[axis] >= value:
            # Convert a negative axis to its positive position. Without
            # this, axis == -1 would make the tail slice start at 0 and
            # append the entire row instead of nothing.
            idx = axis if axis >= 0 else len(featVec) + axis
            reducedFeatVec = list(featVec[:idx])  # everything before the feature
            reducedFeatVec.extend(featVec[idx + 1:])  # everything after it
            retDataSet.append(reducedFeatVec)
    return retDataSet
- #part2 build the tree
def cart_chooseTheBestFeatureToSplit(data, label_threshold='6'):
    """Return the index of the feature with the lowest accumulated Gini score.

    Each row of ``data`` holds feature values followed by the label in the
    last position. For every feature, and for every distinct value ``v`` of
    that feature, the rows with ``feature >= v`` form a subset; the subset
    contributes ``p * (1 - q**2 - (1 - q)**2)`` where ``p`` is the subset's
    share of all rows and ``q`` is the share of the subset whose label is
    ``>= label_threshold``. The contributions are summed per feature.

    NOTE(review): the ``>=`` subsets of one feature overlap, so this sum is
    not the textbook weighted Gini of a partition — confirm this is the
    intended scoring.

    Args:
        data: Sequence of rows (indexable sequences); the last element of
            each row is the label, all others are features.
        label_threshold: Labels comparing ``>=`` this value count as the
            positive class. Defaults to ``'6'``; comparison uses the
            ordering of the stored type (strings compare lexicographically).

    Returns:
        The index of the feature with the smallest score (the first one on
        ties), or -1 when ``data`` is empty or its rows have no features.
    """
    if not data:
        return -1

    numFeature = len(data[0]) - 1  # the last column is the label, not a feature
    total = float(len(data))
    bestGini = 9999
    bestFeature = -1
    for i in range(numFeature):
        gini = 0
        # Distinct values that occur in this feature column.
        for value in {example[i] for example in data}:
            # Labels of the rows at or above the candidate threshold. This
            # is never empty: the row that supplied `value` always matches.
            labels = [example[-1] for example in data if example[i] >= value]
            p = len(labels) / total
            positives = sum(1 for label in labels if label >= label_threshold)
            subp = positives / float(len(labels))
            # `**` is exponentiation; `^` is bitwise XOR and fails on floats.
            gini += p * (1.0 - subp ** 2 - (1 - subp) ** 2)
        if gini < bestGini:
            bestGini = gini
            bestFeature = i
    return bestFeature
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement