Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import re\n",
- "\n",
- "\n",
- "# Matches one HTML tag. [^>] also matches newlines (unlike .), so tags\n",
- "# that are broken across several lines are stripped as well.\n",
- "_TAG_RE = re.compile(r'<[^>]*>')\n",
- "\n",
- "# Characters deleted outright, leaving no space behind: \\ ' \"\n",
- "_DELETE_TABLE = str.maketrans('', '', \"\\\\'\\\"\")\n",
- "\n",
- "# Punctuation, tab and newline: each is replaced by one space. The three\n",
- "# characters deleted above are omitted because they can no longer occur.\n",
- "_PUNCTUATION = '!#$%&()*+,-./:;<=>?@[]^_`{|}~\\t\\n'\n",
- "_SPACE_TABLE = str.maketrans(_PUNCTUATION, ' ' * len(_PUNCTUATION))\n",
- "\n",
- "\n",
- "def clean_text(text):\n",
- "    \"\"\"\n",
- "    Applies some pre-processing on the given text.\n",
- "\n",
- "    Steps, in order :\n",
- "      1. Removing HTML tags (including tags spanning several lines)\n",
- "      2. Deleting backslashes, single quotes and double quotes\n",
- "      3. Stripping surrounding whitespace and lowering text\n",
- "      4. Replacing remaining punctuation, tabs and newlines with spaces\n",
- "\n",
- "    Punctuation at the very end of the input ends up as trailing spaces,\n",
- "    because stripping happens before step 4; call .split() or .strip()\n",
- "    on the result if that matters.\n",
- "\n",
- "    Parameters : text (str), the raw text to clean.\n",
- "    Returns    : str, the cleaned text.\n",
- "    \"\"\"\n",
- "    # remove HTML tags\n",
- "    text = _TAG_RE.sub('', text)\n",
- "\n",
- "    # delete the characters [\\], ['] and [\"] in a single pass\n",
- "    text = text.translate(_DELETE_TABLE)\n",
- "\n",
- "    # trim, then convert text to lowercase\n",
- "    text = text.strip().lower()\n",
- "\n",
- "    # replace punctuation characters with spaces\n",
- "    return text.translate(_SPACE_TABLE)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['this', 'is', 'not', 'a', 'sentence']"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Smoke test: the tags and the trailing period are removed and the text is lowercased.\n",
- "clean_text(\"<div>This is not a sentence.</div>\").split()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment