Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
# Sample postal address, one component per line. The value starts and ends
# with a newline, exactly as a triple-quoted block would.
address = (
    "\n"
    "CHRIS NISWANDEE\n"
    "SMALLSYS INC\n"
    "795 E DRAGRAM ST\n"
    "TUCSON AZ 85705\n"
    "USA\n"
)
- import spacy
- from spacy.lang.en import English
# Load the 'en_core_web_lg' spaCy pipeline once, at module level;
# get_named_entities() calls this object for every address line it is given.
nlp = spacy.load('en_core_web_lg')
# Names of the fields an address can be broken into, each mapped to an empty
# string. The organisation key is spelled 'organisation' so that it matches
# the spelling used by the positional table inside conditions().
STRUCTURE = {
    'country': '',
    'state': '',
    'city': '',
    'pincode': '',
    'street': '',
    'organisation': '',
    'person': '',
}
# Fields extracted so far. conditions() writes into this dict as a side effect.
results = {}

# Kind of address line expected at each position, counted from the LAST line.
_LINE_KINDS = {0: 'country', 1: 'state-city', 2: 'street', 3: 'organisation', 4: 'person'}

# Words (compared lower-cased, trailing '.' ignored) that end a street name.
_STREET_SUFFIXES = ('street', 'st', 'strt')


def _first_entity(names, entities, label, skip=None):
    """Return the text of the first entity tagged ``label`` (other than ``skip``), or None."""
    for name, tag in zip(names, entities):
        if tag == label and name != skip:
            return name
    return None


def _store(field, value):
    """Write ``value`` into ``results[field]`` unless it is None."""
    if value is not None:
        results[field] = value


def _record_state_city(names, entities, line):
    """Fill 'pincode', 'state' and 'city' from a "CITY ST 12345" style line."""
    pincode = _first_entity(names, entities, 'CARDINAL')
    leftovers = []
    for word in line.split():
        if word.isnumeric():
            # A bare number on this line is the postal code; it takes
            # precedence over whatever the CARDINAL entity contained.
            pincode = word
        else:
            leftovers.append(word)

    # The state is the LAST two-letter alphabetic word, so a two-letter word
    # that is part of the city name (e.g. "ST LOUIS MO") stays with the city.
    state = None
    for position in range(len(leftovers) - 1, -1, -1):
        candidate = leftovers[position]
        if len(candidate) == 2 and candidate.isalpha():
            state = leftovers.pop(position)
            break

    # Whatever is neither pincode nor state is the city. Each word is
    # classified exactly once, so state and pincode can no longer overwrite
    # the city. The GPE entity is only a fallback when no words remain.
    if leftovers:
        city = ' '.join(leftovers)
    else:
        city = _first_entity(names, entities, 'GPE', skip=state)

    _store('pincode', pincode)
    _store('state', state)
    _store('city', city)


def _record_street(names, entities, line):
    """Fill 'unit-number' and 'street' from a "795 E DRAGRAM ST" style line."""
    _store('unit-number', _first_entity(names, entities, 'CARDINAL'))
    words = line.split()
    # The street name is the word immediately before a street-type word,
    # joined with that word. If several pairs match, the last one wins.
    for previous, word in zip(words, words[1:]):
        if word.lower().rstrip('.') in _STREET_SUFFIXES:
            results['street'] = previous + ' ' + word


def conditions(index, names, entities, line):
    """Record the fields found on one address line into the module-level ``results``.

    The line is classified purely by its position counted from the end of
    the address: 0 country, 1 state/city/pincode, 2 street, 3 organisation,
    4 person. Positions beyond 4 are ignored instead of raising KeyError.

    Args:
        index: Zero-based position of the line, counted from the last line.
        names: Entity texts detected on the line.
        entities: Entity labels, parallel to ``names``.
        line: Raw text of the address line.

    Returns:
        None. ``results`` is updated in place with any of the keys
        'country', 'pincode', 'state', 'city', 'unit-number', 'street',
        'organisation' and 'person'.
    """
    kind = _LINE_KINDS.get(index)
    if kind == 'country':
        _store('pincode', _first_entity(names, entities, 'CARDINAL'))
        _store('country', _first_entity(names, entities, 'GPE'))
    elif kind == 'state-city':
        _record_state_city(names, entities, line)
    elif kind == 'street':
        _record_street(names, entities, line)
    elif kind in ('organisation', 'person'):
        # These positions hold a single value, so the whole line is the field.
        text = line.strip()
        if text:
            results[kind] = text
def get_named_entities(address):
    """Run the module-level ``nlp`` pipeline over ``address`` and list its entities.

    Args:
        address: Text to analyse.

    Returns:
        A ``(names, entities)`` tuple of two parallel lists: the text of each
        entity in ``doc.ents`` and the corresponding ``label_`` string.
    """
    spans = nlp(address).ents
    names = [span.text for span in spans]
    entities = [span.label_ for span in spans]
    return (names, entities)
def parseAddress(address):
    """Split a free-form address into its non-empty, trimmed fragments.

    The text is split into lines and each line is split on commas. Every
    fragment has its surrounding whitespace removed, which also drops the
    '\\r' of Windows line endings. Fragments that end up empty are discarded,
    so blank or whitespace-only lines never appear in the result.

    Args:
        address: Address text, with components separated by newlines and/or
            commas.

    Returns:
        A list of fragment strings in their original order; empty when the
        input holds no text.
    """
    fragments = []
    for raw_line in address.splitlines():
        for piece in raw_line.split(','):
            piece = piece.strip()
            if piece:
                fragments.append(piece)
    return fragments
def some(address):
    """Parse ``address`` line by line, starting from its last line.

    The text is split with parseAddress(). Each resulting line, walked from
    last to first, is run through get_named_entities(), echoed to stdout with
    its entities, and handed to conditions() together with its position
    counted from the end (0 for the last line).

    Args:
        address: Address text to parse.

    Returns:
        None. Output is whatever conditions() records.
    """
    lines = parseAddress(address)
    for position, line in enumerate(reversed(lines)):
        names, entities = get_named_entities(line)
        print(names, entities, line)
        conditions(position, names, entities, line)
- some(address)
- print(results)
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement