Advertisement
Guest User

Untitled

a guest
Sep 17th, 2019
121
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.00 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "def csv_to_dataframe(file_path):\n",
  10. "\n",
  11. " import pandas as pd\n",
  12. " import os\n",
  13. " from io import StringIO\n",
  14. " \n",
  15. " # open the file \n",
  16. " \n",
  17. " file_reader = open(file_path)\n",
  18. "\n",
  19. " # create the empty dataframe\n",
  20. " \n",
  21. " headers = file_reader.readline()\n",
  22. " headers_ls = headers.split(',')\n",
  23. " df_global = pd.DataFrame(columns=headers_ls)\n",
  24. "\n",
  25. " # prepare the error file\n",
  26. " \n",
  27. " dir_path = os.path.dirname(file_path)\n",
  28. " file_name = os.path.basename(file_path)\n",
  29. " file_base, file_ext = os.path.splitext(file_name)\n",
  30. " \n",
  31. " error_file_noext = os.path.join(dir_path, file_base)\n",
  32. " error_file_path = f'{error_file_noext}_error.txt'\n",
  33. " #print(error_file_path) \n",
  34. " \n",
  35. " line_no = 2\n",
  36. " error_file = open(error_file_path, 'w+')\n",
  37. " error_lines = []\n",
  38. " \n",
  39. " # read one line at a time\n",
  40. " \n",
  41. " for line in file_reader:\n",
  42. " \n",
  43. " # if there are no errors, append it to dataframe\n",
  44. "\n",
  45. " try:\n",
  46. " #print(line)\n",
  47. " df_line = pd.read_csv(StringIO(line), header=None) # header none makes it treat 1 line like data\n",
  48. "\n",
  49. " # will error here if number of columns in df_line doesn't match number in df_global\n",
  50. " df_line.columns = df_global.columns\n",
  51. " #df_line.info() \n",
  52. " df_global = df_global.append(df_line)\n",
  53. "\n",
  54. " # if there is an error, store it and output it\n",
  55. " \n",
  56. " except Exception as ex:\n",
  57. " #print(f'{line_no} : {ex}')\n",
  58. " \n",
  59. " error_lines.append(line)\n",
  60. " error_file.write(line)\n",
  61. "\n",
  62. " line_no += 1\n",
  63. " #if line_no > 0:\n",
  64. " # break\n",
  65. " \n",
  66. " return df_global, error_lines"
  67. ]
  68. },
  69. {
  70. "cell_type": "code",
  71. "execution_count": 2,
  72. "metadata": {
  73. "scrolled": true
  74. },
  75. "outputs": [
  76. {
  77. "name": "stdout",
  78. "output_type": "stream",
  79. "text": [
  80. "<class 'pandas.core.frame.DataFrame'>\n",
  81. "Int64Index: 1525 entries, 0 to 0\n",
  82. "Data columns (total 10 columns):\n",
  83. " 1525 non-null object\n",
  84. "business_id 1525 non-null object\n",
  85. "cool 1525 non-null object\n",
  86. "date 1525 non-null object\n",
  87. "funny 1525 non-null object\n",
  88. "review_id 1525 non-null object\n",
  89. "stars 1525 non-null object\n",
  90. "text 1525 non-null object\n",
  91. "useful 1525 non-null object\n",
  92. "user_id\n",
  93. " 1525 non-null object\n",
  94. "dtypes: object(10)\n",
  95. "memory usage: 131.1+ KB\n"
  96. ]
  97. }
  98. ],
  99. "source": [
  100. "# call the function and inspect the DataFrame\n",
  101. "\n",
  102. "df_final, errors = csv_to_dataframe('Data/Yelp_Reviews_corrupt.csv')\n",
  103. "df_final.info()"
  104. ]
  105. },
  106. {
  107. "cell_type": "code",
  108. "execution_count": 3,
  109. "metadata": {},
  110. "outputs": [
  111. {
  112. "name": "stdout",
  113. "output_type": "stream",
  114. "text": [
  115. "number of errors found: 4583\n"
  116. ]
  117. }
  118. ],
  119. "source": [
  120. "# check if there were errors in the file\n",
  121. "\n",
  122. "msg = f'number of errors found: {len(errors)}'\n",
  123. "print(msg)"
  124. ]
  125. }
  126. ],
  127. "metadata": {
  128. "kernelspec": {
  129. "display_name": "Python 3",
  130. "language": "python",
  131. "name": "python3"
  132. },
  133. "language_info": {
  134. "codemirror_mode": {
  135. "name": "ipython",
  136. "version": 3
  137. },
  138. "file_extension": ".py",
  139. "mimetype": "text/x-python",
  140. "name": "python",
  141. "nbconvert_exporter": "python",
  142. "pygments_lexer": "ipython3",
  143. "version": "3.7.3"
  144. }
  145. },
  146. "nbformat": 4,
  147. "nbformat_minor": 2
  148. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement