Guest User

Untitled

a guest
Jun 20th, 2018
84
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.64 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "metadata": {
  5. "toc": true
  6. },
  7. "cell_type": "markdown",
  8. "source": "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n<div class=\"toc\"><ul class=\"toc-item\"></ul></div>"
  9. },
  10. {
  11. "metadata": {
  12. "ExecuteTime": {
  13. "end_time": "2018-06-12T17:21:56.354446Z",
  14. "start_time": "2018-06-12T17:21:56.333079Z"
  15. },
  16. "extensions": {
  17. "jupyter_dashboards": {
  18. "version": 1,
  19. "views": {
  20. "grid_default": {
  21. "hidden": true
  22. },
  23. "report_default": {}
  24. }
  25. }
  26. },
  27. "hide_input": false,
  28. "trusted": true
  29. },
  30. "cell_type": "code",
  31. "source": "import xlrd\nimport os\nimport sys\nimport csv\nimport hashlib\nmd5 = lambda s: hashlib.md5(s).hexdigest()",
  32. "execution_count": 2,
  33. "outputs": []
  34. },
  35. {
  36. "metadata": {
  37. "ExecuteTime": {
  38. "end_time": "2018-06-12T17:22:02.924670Z",
  39. "start_time": "2018-06-12T17:22:02.762354Z"
  40. },
  41. "extensions": {
  42. "jupyter_dashboards": {
  43. "version": 1,
  44. "views": {
  45. "grid_default": {
  46. "hidden": true
  47. },
  48. "report_default": {}
  49. }
  50. }
  51. },
  52. "hide_input": false,
  53. "trusted": true
  54. },
  55. "cell_type": "code",
  56. "source": "def clean_idfa(idfa):\n \"\"\"\n 清洗 idfa 数据格式\n \"\"\"\n try:\n if isinstance(idfa, float): # 检查是否 float 型数据\n return None\n idfa = idfa.upper()\n if len(idfa) == 32:\n idfa = '-'.join([idfa[:8], idfa[8:12], idfa[12:16], idfa[16:20], idfa[20:]])\n except: # 有异常直接 pass 掉\n return None \n return idfa\n\ndef clean_imei(imei):\n \"\"\"\n 清洗 imei 的数据格式\n \"\"\"\n try:\n if len(imei) != 32: # 如果不是 32 位则是异常数据\n return None\n return imei.lower() # 统一转为小写\n except:\n return None\n \ndef clean_excel_file(file_name):\n \"\"\"\n 清洗 Excel 文件的数据\n \"\"\"\n \n idfa_set = set()\n imei_md5_set = set()\n \n wb = xlrd.open_workbook(file_name) # 使用 xlrd 库读取 excel 文件\n sheet = wb.sheets()[0]\n nrows = sheet.nrows #获取第一张表格的行数赋值给nrows\n for i in xrange (1, nrows): #用一个for循环遍历所有的行数\n row = sheet.row_values(i) #打印所有遍历到的行数的内容\n idfa = row[0]\n idfa_ = row[1]\n imei_md5 = row[4]\n\n if idfa != u'':\n idfa = clean_idfa(idfa)\n if idfa: # 有数据则为正常数据\n idfa_set.add(idfa)\n if idfa_ != u'':\n idfa_ = clean_idfa(idfa_)\n if idfa:\n idfa_set.add(idfa_) \n \n if imei_md5 != u'':\n imei_md5 = clean_imei(imei_md5)\n if imei_md5: # 有数据则为正常数据\n imei_md5_set.add(imei_md5)\n \n return idfa_set, imei_md5_set\n\ndef clean_csv_file(file_name):\n \"\"\"\n 清洗 csv 格式的数据\n \"\"\"\n idfa_set = set()\n imei_md5_set = set()\n \n with open(file_name, 'rb') as f:\n reader = csv.reader(f) \n # rows = list(reader) # todo: 可能存在内存问题\n # 直接迭代速度相对会比全部加进内存满\n for row in reader: #用一个for循环遍历所有的行数\n idfa = row[0]\n idfa_ = row[1]\n imei_md5 = row[4]\n\n if idfa != u'':\n idfa = clean_idfa(idfa)\n if idfa:\n idfa_set.add(idfa)\n if idfa_ != u'':\n idfa_ = clean_idfa(idfa_)\n if idfa:\n idfa_set.add(idfa_) \n\n if imei_md5 != u'':\n imei_md5 = clean_imei(imei_md5)\n if imei_md5:\n imei_md5_set.add(imei_md5)\n return idfa_set, imei_md5_set\n \n\ndef clean_file(file_name, output_dir):\n \"\"\"\n 清洗文件\n 目前支持两种格式的特定数据(csv|xlsx)\n \"\"\"\n _, file_extension = os.path.splitext(file_name) # 获取文件后缀\n base_name = os.path.splitext(os.path.basename(file_name))[0] # 获取文件名\n des_dir = os.path.join(output_dir, base_name) # 输出目录\n if not os.path.exists(des_dir):\n os.makedirs(des_dir)\n if file_extension == '.xlsx': # 如果是 xlsx 后缀的文件\n idfa_set, imei_md5_set = clean_excel_file(file_name)\n else:\n idfa_set, imei_md5_set = clean_csv_file(file_name)\n\n idfa_file_path = os.path.join(des_dir, 'idfa.txt')\n imei_file_path = os.path.join(des_dir, 'imei.txt')\n idfa_file = open(idfa_file_path, 'w')\n imei_file = open(imei_file_path, 'w')\n \n for imei in imei_md5_set:\n imei_file.write(imei)\n imei_file.write('\\n')\n imei_file.close() # 养成 close 文件的好习惯\n \n for idfa in idfa_set:\n try:\n idfa_file.write(idfa)\n idfa_file.write('\\n')\n except:\n print idfa\n idfa_file.close()\n \ndef clean_dir(source_dir):\n \"\"\"\n 清洗目录下的所有文件,目前支持一级目录\n 生成路径为: ./source_dir+cleaned/{file_name}/imei.txt 和 ./source_dir+cleaned/{file_name}/idfa.txt\n \"\"\"\n output_dir = os.path.abspath(os.path.join(os.path.join(source_dir, os.pardir), os.path.basename(os.path.normpath(source_dir)) + '-cleaned'))\n if not os.path.exists(output_dir):\n os.mkdir(output_dir)\n file_list = os.listdir(source_dir)\n full_file_path_list = [os.path.join(source_dir, file_path) for file_path in file_list]\n for file_path in full_file_path_list:\n clean_file(file_path, output_dir)",
  57. "execution_count": 3,
  58. "outputs": []
  59. },
  60. {
  61. "metadata": {
  62. "ExecuteTime": {
  63. "end_time": "2018-06-12T17:22:25.640025Z",
  64. "start_time": "2018-06-12T17:22:14.803565Z"
  65. },
  66. "trusted": true
  67. },
  68. "cell_type": "code",
  69. "source": "data_dir = '../data/'\n\ndir_ = os.path.join(data_dir, u'东风日产官网-人群包原文件-导出excel')\n\n# 清洗指定目录文件\nclean_dir(dir_)",
  70. "execution_count": 4,
  71. "outputs": []
  72. },
  73. {
  74. "metadata": {
  75. "ExecuteTime": {
  76. "end_time": "2018-06-12T17:22:33.010756Z",
  77. "start_time": "2018-06-12T17:22:27.475955Z"
  78. },
  79. "extensions": {
  80. "jupyter_dashboards": {
  81. "version": 1,
  82. "views": {
  83. "grid_default": {
  84. "hidden": true
  85. },
  86. "report_default": {}
  87. }
  88. }
  89. },
  90. "trusted": true
  91. },
  92. "cell_type": "code",
  93. "source": "\nfile_name = os.path.join(data_dir, u'奇骏、逍客、劲客、楼兰、天籁外的其他车型0308-0607.csv')\nout_put_dir = '../data/clean/'\n# 清洗指定文件\nclean_file(file_name, out_put_dir)",
  94. "execution_count": 5,
  95. "outputs": []
  96. },
  97. {
  98. "metadata": {
  99. "extensions": {
  100. "jupyter_dashboards": {
  101. "version": 1,
  102. "views": {
  103. "grid_default": {
  104. "hidden": true
  105. },
  106. "report_default": {}
  107. }
  108. }
  109. },
  110. "trusted": true
  111. },
  112. "cell_type": "code",
  113. "source": "",
  114. "execution_count": null,
  115. "outputs": []
  116. }
  117. ],
  118. "metadata": {
  119. "extensions": {
  120. "jupyter_dashboards": {
  121. "activeView": "grid_default",
  122. "version": 1,
  123. "views": {
  124. "grid_default": {
  125. "cellMargin": 10,
  126. "defaultCellHeight": 20,
  127. "maxColumns": 12,
  128. "name": "grid",
  129. "type": "grid"
  130. },
  131. "report_default": {
  132. "name": "report",
  133. "type": "report"
  134. }
  135. }
  136. }
  137. },
  138. "kernelspec": {
  139. "name": "python2",
  140. "display_name": "Python2",
  141. "language": "python"
  142. },
  143. "language_info": {
  144. "mimetype": "text/x-python",
  145. "nbconvert_exporter": "python",
  146. "name": "python",
  147. "pygments_lexer": "ipython2",
  148. "version": "2.7.15",
  149. "file_extension": ".py",
  150. "codemirror_mode": {
  151. "version": 2,
  152. "name": "ipython"
  153. }
  154. },
  155. "toc": {
  156. "nav_menu": {},
  157. "number_sections": true,
  158. "sideBar": true,
  159. "skip_h1_title": true,
  160. "base_numbering": 1,
  161. "title_cell": "Table of Contents",
  162. "title_sidebar": "Contents",
  163. "toc_cell": true,
  164. "toc_position": {
  165. "height": "calc(100% - 180px)",
  166. "left": "10px",
  167. "top": "150px",
  168. "width": "251px"
  169. },
  170. "toc_section_display": true,
  171. "toc_window_display": true
  172. },
  173. "gist": {
  174. "id": "",
  175. "data": {
  176. "description": "notebook/device_clean.ipynb",
  177. "public": true
  178. }
  179. }
  180. },
  181. "nbformat": 4,
  182. "nbformat_minor": 2
  183. }
Add Comment
Please, Sign In to add comment