
nlp_tokenization_spacy

Mar 8th, 2024 (edited)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Filename: nlp_tokenization_spacy.py
# Author: Jeoi Reqi

"""
This script performs tokenization on a given text using spaCy.

Requirements:
- Python 3
- spaCy library with the 'en_core_web_sm' model installed

Usage:
- Run the script, and it will print the tokenized words and sentences of the provided text.

Example:
python nlp_tokenization_spacy.py

Output:
Tokenized Words: ['Natural', 'Language', 'Processing', 'is', 'a', 'fascinating', 'field', '.', 'It', 'involves', 'the', 'use', 'of', 'computers', 'to', 'understand', 'and', 'process', 'human', 'language', '.']
Tokenized Sentences: ['Natural Language Processing is a fascinating field.', 'It involves the use of computers to understand and process human language.']
"""

import spacy

# Load the small English spaCy pipeline (tokenizer, tagger, parser, etc.)
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "Natural Language Processing is a fascinating field. It involves the use of computers to understand and process human language."

# Process the text with the pipeline to get a Doc object
doc = nlp(text)

# Word-level tokens come from iterating the Doc;
# sentence boundaries come from doc.sents
tokens = [token.text for token in doc]
sentences = [sent.text for sent in doc.sents]

print("Tokenized Words:", tokens)
print("Tokenized Sentences:", sentences)