Guest User

Untitled

a guest
May 31st, 2016
55
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.12 KB | None | 0 0
  1. # coding:utf-8
  2. '''
  3. step1:将描述中的机构名提取出来
  4. step2:切分好后转化为向量
  5. step3:计算相似度
  6. '''
  7.  
  8. import codecs
  9. import re
  10. import jieba as jb
  11. import numpy as np
  12.  
  13. #获取数据
  14. def getInfo(name):
  15. f = codecs.open(name+'.txt','rb')
  16. return f.readlines()
  17.  
  18. #提取机构名
  19. def extractOrg(data):
  20. orgList = []
  21. for line in data:
  22. extractRule = re.compile(u"[\u4e00-\u9fa5]+(?:研究员|副会长|副所长|副教授|会长|所长|教授|讲师|副院长|院长|大学|[部所院会室系科])")
  23. orgList.append(extractRule.findall(line.decode('utf-8')))
  24. return orgList
  25.  
  26. #切分机构名并向量化
  27. def splitOrg(orgList):
  28. wordList = []
  29. vocabulary = []
  30. for line in orgList:
  31. words = []
  32. for item in line:
  33. words.extend(list(jb.cut(item)))
  34. wordList.append(words)
  35.  
  36. vocabulary = reduce(lambda x,y:x+y,wordList)
  37. size = len(vocabulary)
  38.  
  39. wordMatList = []
  40. #和单词表比照得到向量表示
  41. for w in wordList:
  42. wordArr= np.zeros(size)
  43. for index,word in enumerate(vocabulary):
  44. if word in w:
  45. wordArr[index] = 1
  46. wordMatList.append(wordArr)
  47.  
  48. return np.mat(wordMatList)
  49.  
  50.  
  51. #计算相似度
  52. def calcSim(dataMat):
  53. sim = []
  54. length = len(dataMat)
  55. for i in range(length):
  56. irow = np.zeros(length)
  57. for l in range(length):
  58. irow[l]=np.linalg.norm(dataMat[i]-dataMat[l])
  59. sim.append(irow)
  60. return sim
  61.  
  62. def getSimItem(sim):
  63. loc = []
  64. for i in range(len(sim)):
  65. min=sim[i].min()
  66. curloc =[]
  67. for t,v in enumerate(sim[i]):# 经试验发现,这里的相似基本上都是欧氏距离为0
  68. if v==min:#and t!=i这里去掉是为了下面分组的时候的相似具有传递性假设更合理
  69. curloc.append(t)
  70. loc.append(curloc)
  71. return loc
  72.  
  73. #给相似度高的聚合,假设相似具有传递性
  74. def dividObj(loc):
  75. isize = len(loc)
  76. tempList = range(isize)
  77. dividList = []
  78. for i in range(isize):
  79. if len(tempList) > 0:
  80. divid = []
  81. if len(loc[i]) > 0:
  82. print loc[i]
  83. for n in loc[i]:
  84. if
  85. tempList.remove(n)
  86. divid.append(n)
  87. dividList.append(divid)
  88. else:
  89. continue
  90. else:
  91. break # 说明所有描述都分好类了,跳出循环
  92.  
  93. print '总共有', len(dividList), '个同名学者,对于他们的描述分别是: '
  94. for i in dividList:
  95. print '第', i, '个是----->'
  96. for f in i:
  97. print content[f]
  98.  
  99.  
  100.  
  101. if __name__ =='__main__':
  102. content = getInfo(u'王德胜')
  103. data = extractOrg(content)
  104. simMat = calcSim(splitOrg(data))
  105. loc = getSimItem(simMat)
  106.  
  107.  
  108.  
  109.  
  110. # for a in range(len(loc)):
  111. # print '第',a,'条描述是:',content[a],'与其最接近的有:'
  112. # if len(loc[a])>0:
  113. # for b in loc[a]:
  114. # print '--->第',b,'条描述。分别为: '
  115. # print content[b]
  116. # else:
  117. # print '没有与其相似的描述'
Add Comment
Please, Sign In to add comment