Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def named_entity_recognition(text, lower=True):
    """Collect the named entities found in ``text``, grouped by category.

    The text is split into sentences; each sentence is word-tokenized,
    POS-tagged and chunked with ``ne_chunk``. The chunk tree is handed to
    ``traverse_ner``, whose items are read as ``(label, entity_text)``
    pairs (index 0 is the label, index 1 the text).

    Categories:
        per: Person
        org: Organization
        loc: Location

    Args:
        text: Raw text to analyse.
        lower: When true (the default) every entity string is lower-cased
            before it is stored; when false the original casing is kept.
            Previously this flag was accepted but ignored.

    Returns:
        A ``(per, loc, org)`` tuple of sets. Note the order: persons,
        locations, organizations.
    """
    tokenizer = PunktSentenceTokenizer()
    per, loc, org = set(), set(), set()

    for sentence in tokenizer.tokenize(text):
        tagged = pos_tag(word_tokenize(sentence))
        for entity in traverse_ner(ne_chunk(tagged)):
            label = entity[0]
            name = entity[1].strip()
            if lower:
                name = name.lower()

            if label == 'PERSON':
                per.add(name)
            elif label == 'ORGANIZATION':
                org.add(name)
            else:
                # Any other label (the original author names GPE and
                # LOCATION) is filed under locations.
                loc.add(name)

    return (per, loc, org)
def _parse_itemscope(scope_tag):
    """Gather the microdata properties declared under ``scope_tag``.

    Nested ``itemscope`` tags are parsed recursively first and stored under
    their own ``itemprop`` name (when they have one). The scope's
    ``itemtype`` attribute is then parsed with ``urlparse`` and stored under
    ``'type'``. Finally every ``itemprop`` tag is visited: ``a``/``link``
    tags contribute their ``href``, ``meta`` tags their ``content``, and an
    ``articleBody`` property its stripped text.

    Side effect: every visited ``itemprop`` tag that is not itself an
    ``itemscope`` is removed from the tree with ``decompose()``.

    NOTE(review): the original paste lost its indentation, so the nesting
    used here is reconstructed from the statement order -- confirm against
    the real source.

    Args:
        scope_tag: Tag carrying the ``itemscope`` attribute. It is indexed
            with ``['itemtype']``, so that attribute must be present.

    Returns:
        A ``(itemprop_name, properties)`` tuple, where ``itemprop_name`` is
        ``scope_tag``'s own ``itemprop`` attribute or ``None``.
    """
    # _tag_has_attr's results are used both as find_all filters and as
    # plain predicates called on a tag.
    is_scope = Parser._tag_has_attr('itemscope')
    is_prop = Parser._tag_has_attr('itemprop')
    properties = {}

    for nested in scope_tag.find_all(is_scope):
        nested_name, nested_props = Parser._parse_itemscope(nested)
        if nested_name:
            properties[nested_name] = nested_props

    properties['type'] = urlparse(scope_tag['itemtype'])

    for prop_tag in scope_tag.find_all(is_prop):
        if prop_tag and prop_tag.attrs:
            key = prop_tag['itemprop']
            tag_name = prop_tag.name
            if tag_name == 'a' or tag_name == 'link':
                properties[key] = prop_tag['href']
            elif tag_name == 'meta':
                properties[key] = prop_tag['content']
            elif key == 'articleBody':
                properties[key] = prop_tag.text.strip()

            # Nested scopes are kept in the tree; plain property tags are
            # dropped once they have been read.
            if not is_scope(prop_tag):
                prop_tag.decompose()

    return (scope_tag.attrs.get('itemprop'), properties)
def establish_connection(self, host='192.168.1.27', port=10035):
    """Connect to the AllegroGraph server and open the working repository.

    Opens the server's catalog via ``openCatalog(None)`` and then the
    repository named by ``self.repo``. If opening it raises
    ``ServerException``, the user is prompted for a repository name
    (answering ``y`` selects ``DailyNewsEngine``), that repository is
    created, and ``self.repo`` is updated to the new name.

    Args:
        host: Server host name or address. Defaults to the address that
            was previously hard-coded.
        port: Server port. Defaults to the port that was previously
            hard-coded.

    Returns:
        A dict with the keys ``'server'``, ``'catalog'``, ``'repository'``
        and ``'connection'``. ``'connection'`` is ``None`` when no
        repository object was obtained.

    Raises:
        SystemExit: If the server or catalog request fails with
            ``RequestError``.

    Side effects:
        Sets ``self.connection`` when a connection is opened; may set
        ``self.repo``; reads from stdin and writes to stdout.
    """
    try:
        server = AllegroGraphServer(host=host,
                                    port=port,
                                    user=self.username,
                                    password=self.password)
        catalog = server.openCatalog(None)
    except RequestError:
        print("Please run the script again.")
        # Same effect as the site-provided exit(), without depending on the
        # site module having installed that helper.
        raise SystemExit

    repo = None
    try:
        repo = catalog.getRepository(self.repo, Repository.OPEN)
    except ServerException:
        print("No repository found. Creating repository...")
        repo_name = input("Enter the name of Repository.")
        if repo_name == 'y':
            repo_name = 'DailyNewsEngine'
        self.repo = repo_name
        catalog.createRepository(repo_name)
        repo = catalog.getRepository(self.repo, Repository.OPEN)

    # Bound up front so the return below cannot hit an unbound local when
    # no repository object was obtained.
    connection = None
    if repo is not None:
        repo = repo.initialize()
        connection = repo.getConnection()
        self.connection = connection

    return {
        'server': server,
        'catalog': catalog,
        'repository': repo,
        'connection': connection,
    }
Add Comment
Please, Sign In to add comment