##############################################################
# V2 = use bounding box information to weight words in OCR
## add imagehash to dedup
##############################################
## common functions
##############################################
import logging, os, re
import pandas as pd
import collections, struct, pickle, json
import mmap
import numpy as np
from ast import literal_eval
from tqdm import tqdm
from os.path import join
from multiprocessing import Pool
from math import sqrt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import normalize
from scipy.sparse import vstack
import argparse


def ocrCleanup(OCRstring, minWordLen=3):
    """Remove non-alphanumeric characters and short words, lowercase the rest."""
    clean = re.sub('[^a-zA-Z0-9]+', ' ', str(OCRstring))
    clean = [w for w in clean.split() if len(w) >= minWordLen]
    return ' '.join(clean).lower()

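# Example: ocrCleanup('SALE!! 50% OFF') -> 'sale off'
# ('50' survives the character filter but is dropped by minWordLen=3).

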
def extractWordROIs(OCR, WordROIs):
    OCR = OCR.split('#N#')
    Words = [w for OCRline in OCR for w in OCRline.split(' ')]
    WordROIs = list(map(float, WordROIs.split(',')))
    OCRjson = []

    for wordIdx in range(len(Words)):
        WordROI = WordROIs[wordIdx*8 : (wordIdx+1)*8]
        WordBB = {"Words":
                  [{"Text": Words[wordIdx],
                    "BoundingBox": {
                        "TopLeft":     {"X": WordROI[0], "Y": WordROI[1]},
                        "TopRight":    {"X": WordROI[2], "Y": WordROI[3]},
                        "BottomRight": {"X": WordROI[4], "Y": WordROI[5]},
                        "BottomLeft":  {"X": WordROI[6], "Y": WordROI[7]}
                    }
                   }]
                  }
        OCRjson.append(WordBB)
    return OCRjson

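# WordROIs packs 8 comma-separated floats per word: (x, y) for TopLeft,
# TopRight, BottomRight, BottomLeft, in that order, so word i occupies
# slots [i*8, (i+1)*8). Example for one word spanning the unit square:
#   extractWordROIs('hello', '0,0,1,0,1,1,0,1')
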
def calculateWidthHeight(w):
    bb = w['BoundingBox']
    edges = [
        sqrt((bb['BottomLeft']['X'] - bb['BottomRight']['X']) ** 2 + (bb['BottomLeft']['Y'] - bb['BottomRight']['Y']) ** 2),
        sqrt((bb['TopRight']['X'] - bb['BottomRight']['X']) ** 2 + (bb['TopRight']['Y'] - bb['BottomRight']['Y']) ** 2),
        sqrt((bb['TopRight']['X'] - bb['TopLeft']['X']) ** 2 + (bb['TopRight']['Y'] - bb['TopLeft']['Y']) ** 2),
        sqrt((bb['TopLeft']['X'] - bb['BottomLeft']['X']) ** 2 + (bb['TopLeft']['Y'] - bb['BottomLeft']['Y']) ** 2)
    ]
    width = max(edges)
    height = min(edges)
    return width, height

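# For an axis-aligned 10x2 box the edge lengths are [10, 2, 10, 2], giving
# width = max(edges) = 10 and height = min(edges) = 2; taking max/min keeps
# the result stable for rotated (near-)rectangular quads.
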
def parseOcrRecord(ocrJson):
    words = [y for x in ocrJson for y in x['Words']]
    words = [{'text': w['Text'],
              'wh': calculateWidthHeight(w)} for w in words]
    words = [{'text': w['text'], 'w': w['wh'][0], 'h': w['wh'][1],
              'area': w['wh'][0] * w['wh'][1]} for w in words]
    return words


def getNormalizedWeights(words):
    # weight each word by sqrt(height) of its box, normalized to sum to 1
    sumWeight = sum([sqrt(w['h']) for w in words])
    weights = [sqrt(w['h']) / sumWeight for w in words]
    texts = [w['text'] for w in words]
    return list(zip(texts, weights))

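# Example: words with heights 4 and 16 get sqrt-weights 2 and 4, which
# normalize to 1/3 and 2/3 -- the weights always sum to 1.
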
def getNumLines(file_path):
    """Count lines quickly using a read-only mmap."""
    with open(file_path, 'rb') as fp:
        buf = mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)
        lines = 0
        while buf.readline():
            lines += 1
        return lines
#getNumLines(OCR_WEIGHTS_FN)

idxSources = ['OCR', 'ProductTitle', 'ProductTitle_and_OCR']

### READ Weights
def getWeightedTfIdfV3(words_w_weights, normMethod=None):
    try:
        if len(words_w_weights) == 0:
            return tfidf_transformer.transform(count_vect.transform(['']))
        wordsTf = count_vect.transform([w[0] for w in words_w_weights])
        weights = [w[1] for w in words_w_weights]
        weightedTf = sum([f * w for f, w in zip(wordsTf, weights)])
        # raw term frequencies over all words of this record
        tfs = sum(wordsTf)
        # sublinear tf = 1 + log(tf), as in sklearn's sublinear_tf
        sublinearTfs = 1.0 + np.log(tfs.data.astype(float))
        # rescale the weighted tf by the (sublinear tf / tf) factor
        weightedTf.data *= sublinearTfs
        weightedTf.data /= tfs.data
        textFeature = tfidf_transformer.transform(weightedTf)
        if normMethod:
            textFeature = normalize(textFeature, norm=normMethod, axis=1)
        return textFeature
    except Exception as e:
        # failures are swallowed and yield None; callers must filter these out
        print(e)

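# Worked example (hypothetical numbers): a term appearing in two words with
# box weights 0.3 and 0.1 gets weightedTf = 0.4 and raw tf = 2; the entry is
# rescaled by (1 + log(2)) / 2 ~= 0.847 before the IDF transform, i.e.
# 0.4 * 0.847 ~= 0.339.
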
def runPipeline(line, normMethod=None):
    """Parse one TSV record and return its bounding-box-weighted tf-idf row."""
    MurlKey, MD5String, ProductTitle, OCR, LineROIs, WordROIs = line.strip('\n').split('\t')
    ocrJson = extractWordROIs(OCR, WordROIs)
    words = parseOcrRecord(ocrJson)
    words_w_weights = getNormalizedWeights(words)
    words_w_weights = [(ocrCleanup(w[0]), w[1]) for w in words_w_weights if ocrCleanup(w[0])]
    return getWeightedTfIdfV3(words_w_weights, normMethod)

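# Each input line is expected to be a 6-column TSV record:
#   MurlKey \t MD5String \t ProductTitle \t OCR \t LineROIs \t WordROIs
# with OCR lines joined by '#N#' and WordROIs holding 8 comma-separated
# floats per word; only OCR and WordROIs are used here.
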
def getTextFeaturesMultiprocessor(lines):
    res = []
    for line in tqdm(lines, total=len(lines)):
        res.append(runPipeline(line))
    # equivalent single-process one-liner: list(map(runPipeline, lines))
    return res

def linspace(lower, upper, length):
    return [int(lower + x*(upper-lower)/length) for x in range(length+1)]

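# Example: linspace(0, 100, 4) -> [0, 25, 50, 75, 100], i.e. length+1
# integer boundaries, handy for chunking `lines` into batches.

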
###########################################################################
# load processed counter vector
###########################################################################
idxSource = idxSources[2]
TFIDF_FN = '{}_tfidf_3gram.pickle'.format(idxSource)

numProcessor = 64
normMethod = 'l1'
DAT_DIR = "F:\\sechangc\\shoppingProducts\\dat\\"
#DAT_DIR = '\\\\ccpiu02\\shoppingProducts\\dat\\'
os.chdir(DAT_DIR)

# TEST SMALL DATASET
#OCR_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_50k_test2.tsv'
#TFIDF_WEIGHTED_FN = 'tfidf_3gram_weighted_norm_test.pickle'
#OCR_WEIGHTS_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_50k_test2_precomputedWeights.tsv'

# LARGE DATASET
OCR_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_V2_20180927.tsv'
TFIDF_WEIGHTED_FN = join(DAT_DIR, '{}_weighted_tfidf_3gram.pickle'.format(idxSource))
#OCR_WEIGHTS_FN = 'FashionIndex_TriggeredList_Title_OCR_bb_V2_20180927_precomputedWeights.tsv'

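# TFIDF_FN is assumed to hold a pickle from an earlier fitting step:
# {'count': fitted CountVectorizer, 'tfidf': fitted TfidfTransformer}.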
# load count_vect
with open(TFIDF_FN, 'rb') as fp:
    tfidf = pickle.load(fp)
count_vect = tfidf['count']
tfidf_transformer = tfidf['tfidf']


if __name__ == '__main__':
    # prepare to train new tfidf
    print('start reading the file')
    #lines = [ x for x in open(OCR_FN, encoding='utf-8') ]
    lines = []
    with open(OCR_FN, encoding='utf-8') as file:
        for line in tqdm(file, total=getNumLines(OCR_FN)):
            lines.append(line)
    print('file read, num of lines', len(lines))

    # import pdb; pdb.set_trace()  # uncomment to debug interactively
    with Pool(processes=numProcessor) as p:
        textFeatures = list(tqdm(p.imap(runPipeline, lines), total=len(lines)))
    #textFeatures = list(tqdm(pool.map(runPipeline, lines), total=len(lines)))
    # drop records where runPipeline returned None (see getWeightedTfIdfV3);
    # note this loses row alignment with `lines` for the failed records
    textFeatures = [f for f in textFeatures if f is not None]
    trainTfidf = vstack(textFeatures)
    print('Weighted tf-idf matrix shape: {}'.format(trainTfidf.shape))

    # note: 'tfidf' here holds the weighted feature matrix, not the transformer
    tfidf = {'count': count_vect, 'tfidf': trainTfidf}
    with open(TFIDF_WEIGHTED_FN, 'wb') as fp:
        pickle.dump(tfidf, fp)
    print('output saved to', TFIDF_WEIGHTED_FN)
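
# To reload the saved output later (a minimal sketch):
#   with open(TFIDF_WEIGHTED_FN, 'rb') as fp:
#       saved = pickle.load(fp)
#   count_vect, weightedFeatures = saved['count'], saved['tfidf']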