Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # coding:utf-8
- '''
- step1:将描述中的机构名提取出来
- step2:切分好后转化为向量
- step3:计算相似度
- '''
- import codecs
- import re
- import jieba as jb
- import numpy as np
def getInfo(name):
    """Read ``<name>.txt`` and return its lines.

    The file is opened in binary mode, so every returned line is an
    undecoded byte string that still carries its line terminator.

    :param name: path of the file without the ``.txt`` extension.
    :return: list of raw lines in file order.
    :raises IOError: if the file cannot be opened (``OSError`` on Python 3).
    """
    # The context manager closes the handle even if reading fails.
    with open(name + '.txt', 'rb') as handle:
        return handle.readlines()
def extractOrg(data):
    """Extract organisation/title phrases from each description line.

    A phrase is a run of CJK characters (U+4E00..U+9FA5) that ends with one
    of the title or organisation suffixes listed in the pattern below.

    :param data: iterable of lines; byte strings are decoded as UTF-8,
        already-decoded text is used as is.
    :return: list with one entry per input line, each entry being the list
        of phrases matched in that line (possibly empty).
    :raises UnicodeDecodeError: if a byte-string line is not valid UTF-8.
    """
    # Compiled once here instead of once per line.
    extractRule = re.compile(u"[\u4e00-\u9fa5]+(?:研究员|副会长|副所长|副教授|会长|所长|教授|讲师|副院长|院长|大学|[部所院会室系科])")
    orgList = []
    for line in data:
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        orgList.append(extractRule.findall(line))
    return orgList
def splitOrg(orgList):
    """Tokenise organisation names and encode each description as a 0/1 row.

    Every name is segmented with ``jb.cut``. The vocabulary is the
    concatenation of all token lists, so repeated tokens keep their
    duplicate columns, exactly as in the earlier ``reduce``-based version.
    Row ``r`` has a 1 in every column whose token occurs in description
    ``r``.

    :param orgList: list of lists of organisation names, one inner list per
        description (the shape produced by ``extractOrg``).
    :return: ``numpy.matrix`` of shape (number of descriptions, vocabulary
        length) with float entries; shape (0, 0) for an empty ``orgList``.
    """
    wordList = []
    for line in orgList:
        words = []
        for item in line:
            words.extend(jb.cut(item))
        wordList.append(words)

    # Flatten without reduce(): no crash on empty input, no quadratic
    # list concatenation, and no reliance on the Python 2 builtin.
    vocabulary = [word for words in wordList for word in words]

    wordMat = np.zeros((len(wordList), len(vocabulary)))
    for row, words in enumerate(wordList):
        present = set(words)  # O(1) membership tests for this row
        for col, word in enumerate(vocabulary):
            if word in present:
                wordMat[row, col] = 1
    # np.mat was removed in NumPy 2.0; asmatrix returns the same matrix type.
    return np.asmatrix(wordMat)
def calcSim(dataMat):
    """Compute all pairwise Euclidean distances between the rows of dataMat.

    :param dataMat: 2-D array-like (``numpy.matrix``, ``ndarray`` or nested
        list) with one vector per row.
    :return: list with one 1-D float array per row; entry ``j`` of array
        ``i`` is the Euclidean distance between row ``i`` and row ``j``.
        An input without rows yields an empty list.
    """
    points = np.asarray(dataMat, dtype=float)
    sim = []
    for i in range(len(points)):
        # One vectorised call gives the distance from row i to every row.
        sim.append(np.linalg.norm(points - points[i], axis=1))
    return sim
def getSimItem(sim):
    """For every row, list the positions that hold the row's minimum value.

    The row's own index is deliberately not excluded: keeping it makes the
    later grouping step, which assumes similarity is transitive, behave
    sensibly. The original author noted that in practice the minimum is
    almost always a Euclidean distance of 0.

    :param sim: sequence of 1-D distance rows, as returned by ``calcSim``.
    :return: list of index lists, one per row; an empty row gives ``[]``.
    """
    loc = []
    for row in sim:
        row = np.asarray(row)
        if row.size == 0:
            loc.append([])
            continue
        smallest = row.min()  # named so the builtin min() is not shadowed
        loc.append([idx for idx, dist in enumerate(row) if dist == smallest])
    return loc
def dividObj(loc, descriptions=None):
    """Group descriptions that are mutually closest and print each group.

    Similarity is assumed to be transitive, so every description index is
    assigned to the first group that claims it and is never reassigned.

    :param loc: list of index lists as returned by ``getSimItem``; entry
        ``i`` holds the indices closest to description ``i``.
    :param descriptions: sequence indexed by description number whose items
        are printed for each group. When omitted, the module-level
        ``content`` is used.
    :return: list of groups, each a non-empty list of description indices.
    """
    if descriptions is None:
        descriptions = content

    # A set gives O(1) membership/removal and, unlike range(), supports
    # removal on Python 3 as well.
    remaining = set(range(len(loc)))
    dividList = []
    for neighbours in loc:
        if not remaining:
            break  # every description has been assigned to a group
        if not neighbours:
            continue
        print(neighbours)
        divid = []
        for n in neighbours:
            if n in remaining:
                remaining.remove(n)
                divid.append(n)
        # Skip rows whose members were all claimed earlier, so empty groups
        # do not inflate the reported count.
        if divid:
            dividList.append(divid)

    print('总共有 %d 个同名学者,对于他们的描述分别是: ' % len(dividList))
    # The header shows the group's 1-based ordinal, not its member list.
    for ordinal, group in enumerate(dividList, 1):
        print('第 %d 个是----->' % ordinal)
        for member in group:
            print(descriptions[member])
    return dividList
if __name__ == '__main__':
    # `content` must stay a module-level name: dividObj() reads it when it
    # prints the descriptions belonging to each group.
    content = getInfo(u'王德胜')
    data = extractOrg(content)
    simMat = calcSim(splitOrg(data))
    loc = getSimItem(simMat)
    # Report the groups; without this call the computed result is discarded
    # and the script prints nothing.
    dividObj(loc)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement