SHARE
TWEET

Untitled

a guest Jul 24th, 2019 76 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. address = """
  2. CHRIS NISWANDEE
  3. SMALLSYS INC
  4. 795 E DRAGRAM ST
  5. TUCSON AZ 85705
  6. USA
  7. """
  8.  
  9. import spacy
  10. from spacy.lang.en import English
  11.  
# spaCy pipeline, loaded once at import time; get_named_entities() runs it
# on each address line.
nlp = spacy.load('en_core_web_lg')
  13.  
  14. STRUCTURE = {
  15.     'country':'',
  16.     'state': '',
  17.     'city': '',
  18.     'pincode': '',
  19.     'street': '',
  20.     'orgranisation': '',
  21.     'person': ''
  22. }
  23.  
  24. results = {}
  25.  
  26. def conditions(index, names, entities, line):
  27.     checks = {0:'country', 1:'state-city', 2:'street', 3:'organisation', 4:'person'}
  28.     if checks[index] == 'country':
  29.         if 'CARDINAL' in entities:
  30.             i = entities.index('CARDINAL')
  31.             results['pincode'] = names[i]
  32.         if 'GPE' in entities:
  33.             i = entities.index('GPE')
  34.             results['country'] = names[i]
  35.     if checks[index] == 'state-city':
  36.         if 'CARDINAL' in entities:
  37.             i = entities.index('CARDINAL')
  38.             results['pincode'] = names[i]
  39.         if 'GPE' in entities:
  40.             i = entities.index('GPE')
  41.             results['city'] = names[i]
  42.         for word in line.split(' '):
  43.             if len(word) == 2:
  44.                 results['state'] = word
  45.             if word not in names:
  46.                 results['city'] = word
  47.             if word.isnumeric():
  48.                 results['pincode'] = word
  49.     if checks[index] == 'street':
  50.         if 'CARDINAL' in entities:
  51.             i = entities.index('CARDINAL')
  52.             results['unit-number'] = names[i]
  53.         print("words are", words)
  54.         for j in range(0, len(words)-1):
  55.             if words[j+1].lower() in ['street', 'st', 'strt']:
  56.                 street_name = words[j] + ' ' + words[j+1]
  57.                 results['street'] = street_name
  58.            
  59.        
  60.  
  61. def get_named_entities(address):
  62.     doc = nlp(address)
  63.     names = []
  64.     entities = []
  65.     for ent in doc.ents:
  66.         names.append(ent.text)
  67.         entities.append(ent.label_)
  68.     return (names, entities)
  69.  
  70. def parseAddress(address):
  71.     address = address.split('\n')
  72.     address_list = [item.split(',') for item in address]
  73.     cleaned_address =  []
  74.     for address in address_list:
  75.         for word in address:
  76.             if len(word) != 0:
  77.                 cleaned_address.append(word)
  78.     return cleaned_address
  79.  
  80. def some(address):
  81.     lines = parseAddress(address)
  82.     for i in range(len(lines)-1, -1, -1):
  83.         names, entities = get_named_entities(lines[i])
  84.         print(names, entities, lines[i])
  85.         conditions(len(lines)-1-i, names, entities, lines[i])
  86.  
  87.        
  88.        
  89.  
# Script entry: parse the sample address, then print the collected fields.
some(address)
print(results)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top