SHARE
TWEET

Untitled

a guest Sep 17th, 2019 97 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. {
  2.  "cells": [
  3.   {
  4.    "cell_type": "code",
  5.    "execution_count": 1,
  6.    "metadata": {},
  7.    "outputs": [],
  8.    "source": [
  9.     "def csv_to_dataframe(file_path):\n",
  10.     "\n",
  11.     "    import pandas as pd\n",
  12.     "    import os\n",
  13.     "    from io import StringIO\n",
  14.     "    \n",
  15.     "    # open the file \n",
  16.     "    \n",
  17.     "    file_reader = open(file_path)\n",
  18.     "\n",
  19.     "    # create the empty dataframe\n",
  20.     "    \n",
  21.     "    headers = file_reader.readline()\n",
  22.     "    headers_ls = headers.split(',')\n",
  23.     "    df_global = pd.DataFrame(columns=headers_ls)\n",
  24.     "\n",
  25.     "    # prepare the error file\n",
  26.     "    \n",
  27.     "    dir_path = os.path.dirname(file_path)\n",
  28.     "    file_name = os.path.basename(file_path)\n",
  29.     "    file_base, file_ext = os.path.splitext(file_name)\n",
  30.     "    \n",
  31.     "    error_file_noext = os.path.join(dir_path, file_base)\n",
  32.     "    error_file_path = f'{error_file_noext}_error.txt'\n",
  33.     "    #print(error_file_path) \n",
  34.     "    \n",
  35.     "    line_no = 2\n",
  36.     "    error_file = open(error_file_path, 'w+')\n",
  37.     "    error_lines = []\n",
  38.     "    \n",
  39.     "    # read one line at a time\n",
  40.     "    \n",
  41.     "    for line in file_reader:\n",
  42.     "        \n",
  43.     "        # if there are no errors, append it to dataframe\n",
  44.     "\n",
  45.     "        try:\n",
  46.     "            #print(line)\n",
  47.     "            df_line = pd.read_csv(StringIO(line), header=None)  # header=None makes pandas parse this single line as a data row instead of a header row\n",
  48.     "\n",
  49.     "            # the column assignment below raises if the number of columns in df_line doesn't match the number in df_global\n",
  50.     "            df_line.columns = df_global.columns\n",
  51.     "            #df_line.info()    \n",
  52.     "            df_global = df_global.append(df_line)\n",
  53.     "\n",
  54.     "        # if there is an error, store it and output it\n",
  55.     "        \n",
  56.     "        except Exception as ex:\n",
  57.     "            #print(f'{line_no} : {ex}')\n",
  58.     "            \n",
  59.     "            error_lines.append(line)\n",
  60.     "            error_file.write(line)\n",
  61.     "\n",
  62.     "        line_no += 1\n",
  63.     "        #if line_no > 0:\n",
  64.     "        #    break\n",
  65.     "    \n",
  66.     "    return df_global, error_lines"
  67.    ]
  68.   },
  69.   {
  70.    "cell_type": "code",
  71.    "execution_count": 2,
  72.    "metadata": {
  73.     "scrolled": true
  74.    },
  75.    "outputs": [
  76.     {
  77.      "name": "stdout",
  78.      "output_type": "stream",
  79.      "text": [
  80.       "<class 'pandas.core.frame.DataFrame'>\n",
  81.       "Int64Index: 1525 entries, 0 to 0\n",
  82.       "Data columns (total 10 columns):\n",
  83.       "               1525 non-null object\n",
  84.       "business_id    1525 non-null object\n",
  85.       "cool           1525 non-null object\n",
  86.       "date           1525 non-null object\n",
  87.       "funny          1525 non-null object\n",
  88.       "review_id      1525 non-null object\n",
  89.       "stars          1525 non-null object\n",
  90.       "text           1525 non-null object\n",
  91.       "useful         1525 non-null object\n",
  92.       "user_id\n",
  93.       "       1525 non-null object\n",
  94.       "dtypes: object(10)\n",
  95.       "memory usage: 131.1+ KB\n"
  96.      ]
  97.     }
  98.    ],
  99.    "source": [
  100.     "# call the function and inspect the DataFrame\n",
  101.     "\n",
  102.     "df_final, errors = csv_to_dataframe('Data/Yelp_Reviews_corrupt.csv')\n",
  103.     "df_final.info()"
  104.    ]
  105.   },
  106.   {
  107.    "cell_type": "code",
  108.    "execution_count": 3,
  109.    "metadata": {},
  110.    "outputs": [
  111.     {
  112.      "name": "stdout",
  113.      "output_type": "stream",
  114.      "text": [
  115.       "number of errors found: 4583\n"
  116.      ]
  117.     }
  118.    ],
  119.    "source": [
  120.     "# check if there were errors in the file\n",
  121.     "\n",
  122.     "msg = f'number of errors found: {len(errors)}'\n",
  123.     "print(msg)"
  124.    ]
  125.   }
  126.  ],
  127.  "metadata": {
  128.   "kernelspec": {
  129.    "display_name": "Python 3",
  130.    "language": "python",
  131.    "name": "python3"
  132.   },
  133.   "language_info": {
  134.    "codemirror_mode": {
  135.     "name": "ipython",
  136.     "version": 3
  137.    },
  138.    "file_extension": ".py",
  139.    "mimetype": "text/x-python",
  140.    "name": "python",
  141.    "nbconvert_exporter": "python",
  142.    "pygments_lexer": "ipython3",
  143.    "version": "3.7.3"
  144.   }
  145.  },
  146.  "nbformat": 4,
  147.  "nbformat_minor": 2
  148. }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top